From 19d2bf9e2c869ca85c5e0eb2ea9e1b7668409c21 Mon Sep 17 00:00:00 2001 From: Scott Breyer Date: Wed, 17 Jul 2024 11:43:25 -0400 Subject: [PATCH] [v1.22.x] prov/psm3: update provider to sync with IEFS 11.7.0.0.110 Updates: - Improved auto-tuning features for PSM3, including dynamic Credit Flows and detecting the presence of the rv kernel module. - Improved PSM3 intra-node performance for large message sizes. Signed-off-by: Scott Breyer (cherry picked from commit 386f5744fd343b30f769eb91cdbc4badf7fe37fc) --- prov/psm3/Makefile.include | 4 - prov/psm3/VERSION | 2 +- prov/psm3/autogen.sh | 8 - prov/psm3/configure.ac | 6 +- prov/psm3/configure.m4 | 2 +- prov/psm3/debian/changelog | 2 +- prov/psm3/psm3/Makefile.include | 4 - prov/psm3/psm3/hal_sockets/sockets_ep.c | 4 +- prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c | 1 - prov/psm3/psm3/hal_sockets/sockets_hal.c | 18 +- .../psm3/hal_sockets/sockets_hal_inline_i.h | 4 +- prov/psm3/psm3/hal_sockets/sockets_proto.c | 3 +- prov/psm3/psm3/hal_verbs/verbs_ep.c | 11 +- prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c | 1 - prov/psm3/psm3/hal_verbs/verbs_hal.c | 16 +- prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h | 4 +- prov/psm3/psm3/hal_verbs/verbs_service.h | 2 - prov/psm3/psm3/include/utils_env.h | 2 + prov/psm3/psm3/psm.c | 39 +- prov/psm3/psm3/psm2.h | 34 +- prov/psm3/psm3/psm2_hal.c | 41 -- prov/psm3/psm3/psm2_hal.h | 21 +- prov/psm3/psm3/psm2_hal_loopback.c | 14 +- prov/psm3/psm3/psm_config.h | 22 +- prov/psm3/psm3/psm_context.c | 2 - prov/psm3/psm3/psm_ep.c | 30 ++ prov/psm3/psm3/psm_ep.h | 2 +- prov/psm3/psm3/psm_ep_connect.c | 8 +- prov/psm3/psm3/psm_error.c | 2 +- prov/psm3/psm3/psm_lock.h | 6 + prov/psm3/psm3/psm_mq.c | 20 +- prov/psm3/psm3/psm_mq_internal.h | 2 +- prov/psm3/psm3/psm_mq_recv.c | 2 +- prov/psm3/psm3/psm_nic_select.c | 34 +- prov/psm3/psm3/psm_rndv_mod.c | 11 + prov/psm3/psm3/psm_rndv_mod.h | 2 + prov/psm3/psm3/psm_user.h | 101 ++++- prov/psm3/psm3/psm_utils.c | 3 +- prov/psm3/psm3/psm_utils.h | 2 - 
prov/psm3/psm3/psm_verbs_mr.c | 20 +- prov/psm3/psm3/ptl_am/am_config.h | 33 +- prov/psm3/psm3/ptl_am/am_reqrep_shmem.c | 387 +++++++++--------- prov/psm3/psm3/ptl_am/psm_am_internal.h | 31 +- prov/psm3/psm3/ptl_am/ptl.c | 101 +++-- prov/psm3/psm3/ptl_am/ptl_fwd.h | 3 - prov/psm3/psm3/ptl_ips/ips_config.h | 4 + prov/psm3/psm3/ptl_ips/ips_expected_proto.h | 2 - prov/psm3/psm3/ptl_ips/ips_path_rec.h | 12 - prov/psm3/psm3/ptl_ips/ips_proto.c | 178 ++++++-- prov/psm3/psm3/ptl_ips/ips_proto.h | 16 +- prov/psm3/psm3/ptl_ips/ips_proto_connect.c | 8 +- prov/psm3/psm3/ptl_ips/ips_proto_expected.c | 225 ++++------ prov/psm3/psm3/ptl_ips/ips_proto_header.h | 4 +- prov/psm3/psm3/ptl_ips/ips_proto_help.h | 10 +- prov/psm3/psm3/ptl_ips/ips_proto_mq.c | 160 ++++---- prov/psm3/psm3/ptl_ips/ips_proto_params.h | 5 +- prov/psm3/psm3/ptl_ips/ips_proto_recv.c | 53 +-- prov/psm3/psm3/ptl_ips/ips_scb.h | 2 +- prov/psm3/psm3/ptl_ips/ips_tid.c | 55 --- prov/psm3/psm3/ptl_ips/ips_tid.h | 61 --- prov/psm3/psm3/ptl_ips/ips_tidcache.c | 53 --- prov/psm3/psm3/ptl_ips/ips_tidcache.h | 158 ------- prov/psm3/psm3/ptl_ips/ips_tidflow.c | 60 +-- prov/psm3/psm3/ptl_ips/ips_tidflow.h | 1 - prov/psm3/psm3/ptl_ips/ptl_ips.h | 6 +- prov/psm3/psm3/utils/utils_dsa.c | 230 +++++++++-- prov/psm3/psm3/utils/utils_env.c | 2 +- prov/psm3/src/psmx3.h | 4 +- prov/psm3/src/psmx3_atomic.c | 143 ++++++- prov/psm3/src/psmx3_attr.c | 30 +- prov/psm3/src/psmx3_av.c | 4 +- prov/psm3/src/psmx3_init.c | 44 +- 72 files changed, 1300 insertions(+), 1297 deletions(-) delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tid.c delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tid.h delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tidcache.c delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tidcache.h diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 47424fc2caf..9a7ef74370a 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -101,10 +101,6 @@ prov_psm3_psm3_libptl_ips_la_SOURCES = \ 
prov/psm3/psm3/ptl_ips/ips_recvq.h \ prov/psm3/psm3/ptl_ips/ips_scb.c \ prov/psm3/psm3/ptl_ips/ips_scb.h \ - prov/psm3/psm3/ptl_ips/ips_tid.c \ - prov/psm3/psm3/ptl_ips/ips_tid.h \ - prov/psm3/psm3/ptl_ips/ips_tidcache.c \ - prov/psm3/psm3/ptl_ips/ips_tidcache.h \ prov/psm3/psm3/ptl_ips/ips_tidflow.c \ prov/psm3/psm3/ptl_ips/ips_tidflow.h \ prov/psm3/psm3/ptl_ips/ptl.c \ diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index 144229f3d51..8cb63b0114c 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_6_0_1 +3_7_0_0 diff --git a/prov/psm3/autogen.sh b/prov/psm3/autogen.sh index b3894e4712b..2d7687527a6 100755 --- a/prov/psm3/autogen.sh +++ b/prov/psm3/autogen.sh @@ -5,14 +5,6 @@ if test ! -f src/psmx3.h; then exit 1 fi -if [ -f psm3/Makefile.include.base ] -then - make -f - <@]) + [EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=check [check means match --enable-psm3-verbs option]@:>@]) ],[],[enable_psm3_rc=check]) AS_IF([test "x$enable_psm3_udp" = "xyes"], [ @@ -429,6 +429,10 @@ AS_IF([test x"$enable_atomics" != x"no"], ]) unset LIBS_save +dnl Check for 128-bit integer support +AC_CHECK_TYPE([__int128], + [AC_DEFINE(HAVE___INT128, 1, [Set to 1 to use 128-bit ints])]) + dnl Check for gcc cpuid intrinsics AC_MSG_CHECKING(compiler support for cpuid) AC_LINK_IFELSE([AC_LANG_PROGRAM([[ diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 25aea136db6..5c8c083f7dc 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -456,7 +456,7 @@ AC_ARG_ENABLE([psm3-udp], [enable_psm3_udp=no]) AC_ARG_ENABLE([psm3-rc], [AS_HELP_STRING([--enable-psm3-rc], - [EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=[Verbs HAL]@:>@])], + [EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=check [check means match --enable-psm3-verbs option]@:>@])], [], [enable_psm3_rc=check]) dnl ------------- Extra Features diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 
0b1b356686f..52852ac0f5e 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.6.0.0-231) unstable; urgency=medium +libpsm3-fi (11.7.0.0-110) unstable; urgency=medium * Initial release diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index cc52b8f1868..fd253089532 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -66,10 +66,6 @@ psm3_libptl_ips_la_SOURCES = \ psm3/ptl_ips/ips_recvq.h \ psm3/ptl_ips/ips_scb.c \ psm3/ptl_ips/ips_scb.h \ - psm3/ptl_ips/ips_tid.c \ - psm3/ptl_ips/ips_tid.h \ - psm3/ptl_ips/ips_tidcache.c \ - psm3/ptl_ips/ips_tidcache.h \ psm3/ptl_ips/ips_tidflow.c \ psm3/ptl_ips/ips_tidflow.h \ psm3/ptl_ips/ptl.c \ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c index 27b98631508..ce7ddb61bc3 100755 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -1125,7 +1125,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // unfortunately default TCP max_buffering (16K) is too small // so flow_credit_bytes < 16K would prevent getting a good pipeline of // packets/ACKs going - proto->flow_credit_bytes = ep->mtu * proto->flow_credits; + proto->flow_credit_bytes = ep->mtu * proto->max_credits; } else { // sockets buffering needs to place an upper bound on bytes // while flow_credits places an upper bound on pkts @@ -1229,7 +1229,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) } // Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. 
// Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device diff --git a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c index b6235c533e6..645dfd3ebd2 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c +++ b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c @@ -58,7 +58,6 @@ #include #include #include "ips_proto.h" -#include "ptl_ips/ips_tid.h" #include "ptl_ips/ips_expected_proto.h" // flags=0 for send, 1 for recv diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index 8d4527bdd64..dd9ec3735dc 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -73,7 +73,7 @@ static int psm3_hfp_sockets_initialize(psmi_hal_instance_t *phi, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // testing on HED-2629 suggests turning off RNDV can help // latency for messages in size 8-256 KB - gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV; + psm3_gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV; #endif /* we initialize a few HAL software specific capabilities which * are known before context_open can open RV or parse HAL specific @@ -175,11 +175,11 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) * corresponding PSM3_* env variables. * Otherwise these defaults are used. */ - mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->rndv_nic_thresh = PSM3_MQ_RNDV_NIC_THRESH; mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; // Even without RDMA do we want to disable rendezvous? 
// even without RDMA, the receiver controlled pacing helps scalability - mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) @@ -220,16 +220,6 @@ static int psm3_hfp_sockets_get_unit_active(int unit) return psm3_sockets_get_unit_active(unit, SIMS_FILTER); } -static int psm3_hfp_sockets_get_num_contexts(int unit) -{ - return 1024; -} - -static int psm3_hfp_sockets_get_num_free_contexts(int unit) -{ - return 1024; -} - static int psm3_hfp_sockets_get_default_pkey(void) { return 0; /* use slot 0 as default */ @@ -305,8 +295,6 @@ static hfp_sockets_t psm3_sockets_hi = { .hfp_get_num_ports = psm3_hfp_sockets_get_num_ports, .hfp_get_unit_active = psm3_hfp_sockets_get_unit_active, .hfp_get_port_active = psm3_hfp_sockets_get_port_active, - .hfp_get_num_contexts = psm3_hfp_sockets_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_sockets_get_num_free_contexts, .hfp_get_default_pkey = psm3_hfp_sockets_get_default_pkey, .hfp_get_port_subnet = psm3_hfp_sockets_get_port_subnet, .hfp_get_unit_pci_bus = psm3_hfp_sockets_get_unit_pci_bus, diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h index 28b13150466..9b703674147 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h +++ b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h @@ -189,7 +189,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_proto_init( } // Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. 
// Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device @@ -409,7 +409,7 @@ static PSMI_HAL_INLINE void psm3_hfp_sockets_ips_ipsaddr_disconnect( { } -/* Handle HAL specific initialization of ibta path record query, CCA +/* Handle HAL specific initialization of ibta path record query * and dispersive routing */ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_ibta_init( diff --git a/prov/psm3/psm3/hal_sockets/sockets_proto.c b/prov/psm3/psm3/hal_sockets/sockets_proto.c index e7f90bb6982..c694151dcf1 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_proto.c +++ b/prov/psm3/psm3/hal_sockets/sockets_proto.c @@ -74,7 +74,7 @@ psm3_tcp_proto_local_ack(struct ips_proto *proto, struct ips_flow *flow) psmi_seqnum_t last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, - last_seq_num.psn_num, flow->xmit_ack_num.psn_num-1) + last_seq_num.psn_num, (flow->xmit_ack_num.psn_num-1) & proto->psn_mask) ) { STAILQ_REMOVE_HEAD(unackedq, nextq); #ifdef PSM_DEBUG @@ -151,6 +151,7 @@ psm3_tcp_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) // local ack if (scb) { // this check is unnecessary, but can make KW happy flow->xmit_ack_num.psn_num = 1 + (__be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask); + flow->xmit_ack_num.psn_num &= proto->psn_mask; } psm3_tcp_proto_local_ack(proto, flow); } diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index 10a4e845e4b..f4e30d6c5e9 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -397,8 +397,6 @@ psm3_verbs_parse_params(psm2_ep_t ep) // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * // chunk size (psm3_mq_max_window_rv(mq, 0) after // psm3_mq_initialize_params) - // for OPA native, actual window_rv may be smaller, but for UD it - // is not reduced psm3_getenv("PSM3_RV_MR_CACHE_SIZE", 
"kernel space MR cache size" " (MBs, 0 lets rv module decide) [0]", @@ -550,7 +548,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->chunk_max_size = ep->mtu; #ifdef PSM_BYTE_FLOW_CREDITS // let flow_credits be the control - proto->flow_credit_bytes = ep->mtu * proto->flow_credits; + proto->flow_credit_bytes = ep->mtu * proto->max_credits; _HFI_DBG("initial flow_credits %d bytes %d\n", proto->flow_credits, proto->flow_credit_bytes); #else @@ -594,7 +592,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) } // Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. // Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device @@ -2884,8 +2882,11 @@ unsigned psm3_verbs_parse_rdmamode(int reload) // IPS_PROTOEXP_FLAGS_INTERLEAVE are N/A when RDMA not enabled default_value = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) #ifdef RNDV_MOD + if (psm3_rv_available()) { + default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; + } +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) // GPUDIRECT causes default_value of RDMA=1 if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect()) default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; diff --git a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c index 8fc324adedb..ab0942e5497 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c +++ b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c @@ -58,7 +58,6 @@ #include #include #include "ips_proto.h" -#include "ptl_ips/ips_tid.h" #include "ptl_ips/ips_expected_proto.h" // flags=0 for send, 1 for recv diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 9575b316ff2..69d27478b48 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ 
b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -166,12 +166,12 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) * Otherwise these defaults are used. */ unsigned rdmamode = psm3_verbs_parse_rdmamode(1); - mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->rndv_nic_thresh = PSM3_MQ_RNDV_NIC_THRESH; mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { // TBD - when RDMA is disabled do we want to disable rendezvous? // even without RDMA, the receiver controlled pacing helps scalability - mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -213,16 +213,6 @@ static int psm3_hfp_verbs_get_unit_active(int unit) return psm3_verbs_get_unit_active(unit, VIMS_FILTER); } -static int psm3_hfp_verbs_get_num_contexts(int unit) -{ - return 1024; -} - -static int psm3_hfp_verbs_get_num_free_contexts(int unit) -{ - return 1024; -} - static int psm3_hfp_verbs_get_default_pkey(void) { return 0; /* use slot 0 as default */ @@ -293,8 +283,6 @@ static hfp_verbs_t psm3_verbs_hi = { .hfp_get_num_ports = psm3_hfp_verbs_get_num_ports, .hfp_get_unit_active = psm3_hfp_verbs_get_unit_active, .hfp_get_port_active = psm3_hfp_verbs_get_port_active, - .hfp_get_num_contexts = psm3_hfp_verbs_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_verbs_get_num_free_contexts, .hfp_get_default_pkey = psm3_hfp_verbs_get_default_pkey, .hfp_get_port_subnet = psm3_hfp_verbs_get_port_subnet, .hfp_get_unit_pci_bus = psm3_hfp_verbs_get_unit_pci_bus, diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 2ba92503e9f..8ef06d9ae97 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -181,7 +181,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_proto_init( } // 
Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. // Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device @@ -610,7 +610,7 @@ static PSMI_HAL_INLINE void psm3_hfp_verbs_ips_ipsaddr_disconnect( #endif } -/* Handle HAL specific initialization of ibta path record query, CCA +/* Handle HAL specific initialization of ibta path record query * and dispersive routing */ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ibta_init( diff --git a/prov/psm3/psm3/hal_verbs/verbs_service.h b/prov/psm3/psm3/hal_verbs/verbs_service.h index 1767ce33038..dba159c82f8 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_service.h +++ b/prov/psm3/psm3/hal_verbs/verbs_service.h @@ -112,7 +112,5 @@ int psm3_verbs_get_unit_active(int unit, enum verbs_init_max_speed init_max_spee returns <= 0 if no port on any of the units is active. */ int psm3_hfp_verbs_have_active_unit(int num_units); -/* get the number of contexts from the unit id. 
*/ -int psm3_verbs_get_num_contexts(int unit); #endif /* PSM_HAL_VERBS_SERVICE_H */ #endif /* PSM_VERBS */ diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index d95660f6e01..770f04cc44a 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -153,6 +153,8 @@ int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv_range); +int psm3_count_tuples(const char *str); + /* * Parsing int, unsigned int and long parameters * 0 -> ok, *val updated diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index df138dd8a2f..e46f868f054 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -69,6 +69,8 @@ int psm3_allow_routers; // PSM3_ALLOW_ROUTERS char *psm3_allow_subnets[PSMI_MAX_SUBNETS]; // PSM3_SUBNETS int psm3_num_allow_subnets; unsigned int psm3_addr_per_nic = 1; +unsigned int psm3_reg_mr_fail_limit = 100; +unsigned int psm3_reg_mr_warn_cnt = 10; const char *psm3_nic_wildcard = NULL; @@ -108,7 +110,7 @@ uint32_t gdr_copy_limit_recv; int is_gpudirect_enabled = 0; int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect(). 
int is_driver_gpudirect_enabled; -uint32_t gpu_thresh_rndv = GPU_THRESH_RNDV; +uint32_t psm3_gpu_thresh_rndv = PSM3_GPU_THRESH_RNDV; uint64_t psm3_gpu_cache_evict; // in bytes #endif @@ -653,7 +655,7 @@ static void psmi_gpu_init(void) ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", "RNDV protocol is used for GPU send message sizes greater than the threshold", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, + (union psmi_envvar_val)psm3_gpu_thresh_rndv, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, NULL, NULL, &env_gpu_thresh_rndv); if (ret > 0) @@ -665,9 +667,10 @@ static void psmi_gpu_init(void) "[Deprecated, use PSM3_GPU_THRESH_RNDV]" " RNDV protocol is used for GPU send message sizes greater than the threshold", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, &env_gpu_thresh_rndv); + (union psmi_envvar_val)psm3_gpu_thresh_rndv, + &env_gpu_thresh_rndv); - gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; + psm3_gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; union psmi_envvar_val env_gdr_copy_limit_send; @@ -683,8 +686,8 @@ static void psmi_gpu_init(void) (union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send); gdr_copy_limit_send = env_gdr_copy_limit_send.e_int; - if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > gpu_thresh_rndv) - gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, gpu_thresh_rndv); + if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > psm3_gpu_thresh_rndv) + gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, psm3_gpu_thresh_rndv); union psmi_envvar_val env_gdr_copy_limit_recv; psm3_getenv("PSM3_GDRCOPY_LIMIT_RECV", @@ -1344,6 +1347,18 @@ psm2_error_t psm3_init(int *major, int *minor) } psm3_addr_per_nic = env_addr_per_nic.e_uint; } + { + union psmi_envvar_val env_reg_mr_fail_limit; + psm3_getenv("PSM3_REG_MR_FAIL_LIMIT", + "Max number of consecutive reg_mr failures", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union 
psmi_envvar_val)100, &env_reg_mr_fail_limit); + if (env_reg_mr_fail_limit.e_uint >= 1) { + psm3_reg_mr_fail_limit = env_reg_mr_fail_limit.e_uint; + if (psm3_reg_mr_warn_cnt > psm3_reg_mr_fail_limit) + psm3_reg_mr_warn_cnt = psm3_reg_mr_fail_limit; + } + } { union psmi_envvar_val env_allow_routers; psm3_getenv("PSM3_ALLOW_ROUTERS", @@ -1576,8 +1591,8 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, 0, /* PSM2_INFO_QUERY_NUM_PORTS */ 1, /* PSM2_INFO_QUERY_UNIT_STATUS */ 2, /* PSM2_INFO_QUERY_UNIT_PORT_STATUS */ - 1, /* PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ - 1, /* PSM2_INFO_QUERY_NUM_CONTEXTS */ + 0, /* was PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ + 0, /* was PSM2_INFO_QUERY_NUM_CONTEXTS */ 0, /* was PSM2_INFO_QUERY_CONFIG */ 0, /* was PSM2_INFO_QUERY_THRESH */ 0, /* was PSM2_INFO_QUERY_DEVICE_NAME */ @@ -1621,14 +1636,6 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, args[1].port); rv = PSM2_OK; break; - case PSM2_INFO_QUERY_NUM_FREE_CONTEXTS: - *((uint32_t*)out) = psmi_hal_get_num_free_contexts(args[0].unit); - rv = PSM2_OK; - break; - case PSM2_INFO_QUERY_NUM_CONTEXTS: - *((uint32_t*)out) = psmi_hal_get_num_contexts(args[0].unit); - rv = PSM2_OK; - break; case PSM2_INFO_QUERY_FEATURE_MASK: { #ifdef PSM_CUDA diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index b9ff1c598d1..cadb561dbd4 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -66,7 +66,7 @@ extern "C" { * @file psm2.h * @page psm2_main PSM2 API * - * @brief PSM2 OPA Messaging Library + * @brief PSM2 Messaging Library * * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level * user-level communications interface for the OPA family of products. 
@@ -666,11 +666,11 @@ typedef psm2_epid_t psm2_nid_t; */ psm2_nid_t psm3_epid_nid(psm2_epid_t epid); -/** @brief Get Endpoint identifier's OPA context number */ +/** @brief Get Endpoint identifier's context number */ uint64_t psm3_epid_context(psm2_epid_t epid); #endif // 0 -/** @brief Get Endpoint identifier's OPA port (deprecated, use +/** @brief Get Endpoint identifier's network port (deprecated, use * @ref psm3_epid_context instead) */ uint64_t psm3_epid_port(psm2_epid_t epid); @@ -743,10 +743,10 @@ struct psm3_ep_open_opts { int imm_size; /* Immediate data size for endpoint */ }; -/** @brief OPA endpoint creation +/** @brief PSM3 endpoint creation * - * Function used to create a new local communication endpoint on an OPA - * adapter. The returned endpoint handle is required in all PSM2 communication + * Function used to create a new local communication endpoint on an adapter/NIC. + * The returned endpoint handle is required in all PSM2 communication * operations, as PSM2 can manage communication over multiple endpoints. An * opened endpoint has no global context until the user connects the endpoint * to other global endpoints by way of @ref psm3_ep_connect. All local endpoint @@ -1328,7 +1328,7 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); /* PSM2_COMPONENT_IB options */ /* Default service level to use to communicate with remote endpoints */ #define PSM2_IB_OPT_DF_SL 0x201 - /**< [@b uint32_t ] Default OPA SL to use for all remote communication. + /**< [@b uint32_t ] Default OPA/IB SL to use for all remote communication. * If unset defaults to Service Level 0. * * component object: Opened PSM2 endpoint id (@ref psm2_ep_t). @@ -1337,7 +1337,7 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); /* Set IB service level to use for communication to an endpoint */ #define PSM2_IB_OPT_EP_SL 0x202 - /**< [@b uint32_t ] OPA SL to use for communication to specified + /**< [@b uint32_t ] OPA/IB SL to use for communication to specified * remote endpoint. 
* * component object: PSM2 endpoint (@ ref psm2_epaddr_t) address. @@ -1348,7 +1348,7 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); /* MQ options that can be set in psm3_mq_init and psm2_{set,get}_opt */ #define PSM2_MQ_OPT_RNDV_IB_SZ 0x301 /**< [@b uint32_t ] Size at which to start enabling rendezvous - * messaging for OPA messages (if unset, defaults to values + * messaging for PSM3 messages (if unset, defaults to values * between 56000 and 72000 depending on the system configuration) * * component object: PSM2 Matched Queue (@ref psm2_mq_t). @@ -1615,19 +1615,11 @@ typedef enum psm2_info_query_et active. */ PSM2_INFO_QUERY_UNIT_PORT_STATUS, -/*! Required input arguments: 1 - 1. type: uint32_t, description: the unit for which the number of - free contexts is desired (use: psm2_info_query_arg_t.unit). - Output parameter: uint32_t, description: the number of free - contexts.. */ - PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, +/*! removed QUERY_NUM_FREE_CONTEXTS, but kept placeholder to retain values in enum */ + PSM2_WAS_INFO_QUERY_NUM_FREE_CONTEXTS, -/*! Required input arguments: 1 - 1. type: uint32_t, description: the unit for which the number of - contexts is desired (use: psm2_info_query_arg_t.unit). - Output parameter: uint32_t, description: the number of - contexts.. */ - PSM2_INFO_QUERY_NUM_CONTEXTS, +/*! removed QUERY_NUM_CONTEXTS, but kept placeholder to retain values in enum */ + PSM2_WAS_INFO_QUERY_NUM_CONTEXTS, /*! 
removed QUERY_CONFIG, but kept placeholder to retain values in enum */ PSM2_WAS_INFO_QUERY_CONFIG, diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 0c347ce2160..31a1cf67ecf 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -110,8 +110,6 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_get_num_ports); REJECT_IMPROPER_HI(hfp_get_unit_active); REJECT_IMPROPER_HI(hfp_get_port_active); - REJECT_IMPROPER_HI(hfp_get_num_contexts); - REJECT_IMPROPER_HI(hfp_get_num_free_contexts); REJECT_IMPROPER_HI(hfp_get_default_pkey); REJECT_IMPROPER_HI(hfp_get_port_subnet); REJECT_IMPROPER_HI(hfp_get_unit_pci_bus); @@ -293,37 +291,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) rv = -1; } break; - case psmi_hal_pre_init_cache_func_get_num_contexts: - { - int unit = va_arg(ap,int); - if ((unit >= 0) && (unit < p->params.num_units)) - { - if (!p->params.num_contexts_valid[unit]) { - p->params.num_contexts_valid[unit] = 1; - p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit); - } - rv = p->params.num_contexts[unit]; - } - else - rv = -1; - } - break; - case psmi_hal_pre_init_cache_func_get_num_free_contexts: - { - int unit = va_arg(ap,int); - - if ((unit >= 0) && (unit < p->params.num_units)) - { - if (!p->params.num_free_contexts_valid[unit]) { - p->params.num_free_contexts_valid[unit] = 1; - p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit); - } - rv = p->params.num_free_contexts[unit]; - } - else - rv = -1; - } - break; case psmi_hal_pre_init_cache_func_get_default_pkey: rv = p->params.default_pkey; break; @@ -581,10 +548,6 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(port_speed_valid); FREE_HAL_CACHE(port_lid); FREE_HAL_CACHE(port_lid_valid); - FREE_HAL_CACHE(num_contexts); - FREE_HAL_CACHE(num_contexts_valid); - FREE_HAL_CACHE(num_free_contexts); - FREE_HAL_CACHE(num_free_contexts_valid); 
FREE_HAL_CACHE(port_subnet_valid); FREE_HAL_CACHE(port_subnet); FREE_HAL_CACHE(port_subnet_addr); @@ -638,10 +601,6 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(port_speed_valid, int8_t, nunits*(nports+1)); ALLOC_HAL_CACHE(port_lid, int, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_lid_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); - ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits); - ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits); - ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits); - ALLOC_HAL_CACHE(num_free_contexts_valid, uint16_t, nunits); ALLOC_HAL_CACHE(port_subnet_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet, psmi_subnet128_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_addr, psmi_naddr128_t, nunits*(nports+1)*psm3_addr_per_nic); diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index 055261da6c4..91d187dcd56 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -83,10 +83,10 @@ struct psm3_ep_open_opts; */ typedef enum { - PSM_HAL_INDEX_VERBS = 1, - PSM_HAL_INDEX_SOCKETS = 2, - PSM_HAL_INDEX_LOOPBACK = 3, - PSM_HAL_INDEX_MAX = 3, + PSM_HAL_INDEX_VERBS = 0, + PSM_HAL_INDEX_SOCKETS = 1, + PSM_HAL_INDEX_LOOPBACK = 2, + PSM_HAL_INDEX_MAX = 2, } psmi_hal_instance_index_t; /* This string is used as the hal_name for both log messages @@ -232,8 +232,6 @@ typedef struct _psmi_hal_params int8_t *port_speed_valid; int *port_lid; int8_t *port_lid_valid; - uint16_t *num_contexts,*num_contexts_valid; - uint16_t *num_free_contexts,*num_free_contexts_valid; // information from port_get_subnet int8_t *port_subnet_valid; uint8_t *port_subnet_addr_fmt; @@ -340,13 +338,6 @@ struct _psmi_hal_instance int (*hfp_get_unit_active)(int unit); int (*hfp_get_port_active)(int unit,int port); - /* NOTE: hfp_get_num_contexts is a function that must - be callable before the hal instance is initialized. 
*/ - int (*hfp_get_num_contexts)(int unit); - /* NOTE: hfp_get_num_free_contexts is a function that must - be callable before the hal instance is initialized. */ - int (*hfp_get_num_free_contexts)(int unit); - /* Returns the default pkey: NOTE: hfp_get_default_pkey is a function that must be callable before the hal instance is initialized. */ @@ -519,8 +510,6 @@ enum psmi_hal_pre_init_cache_func_krnls psmi_hal_pre_init_cache_func_get_port_active, psmi_hal_pre_init_cache_func_get_port_speed, psmi_hal_pre_init_cache_func_get_port_lid, - psmi_hal_pre_init_cache_func_get_num_contexts, - psmi_hal_pre_init_cache_func_get_num_free_contexts, psmi_hal_pre_init_cache_func_get_default_pkey, psmi_hal_pre_init_cache_func_get_port_subnet, psmi_hal_pre_init_cache_func_get_port_subnet_name, @@ -580,8 +569,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) #define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_PI(get_port_speed,__VA_ARGS__) #define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_PI(get_port_lid,__VA_ARGS__) -#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) -#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) #define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_get_port_subnet(...) PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__) #define psmi_hal_get_port_subnet_name(...) 
PSMI_HAL_DISPATCH_PI(get_port_subnet_name,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index 913a45dec78..6789ad18f59 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -131,16 +131,6 @@ static int psm3_hfp_loopback_get_port_active(int unit, int port) return (unit == 0) && (port == 1); } -static int psm3_hfp_loopback_get_num_contexts(int unit) -{ - return 1024; -} - -static int psm3_hfp_loopback_get_num_free_contexts(int unit) -{ - return 1024; -} - static int psm3_hfp_loopback_get_port_subnet(int unit, int port, int addr_index, psmi_subnet128_t *subnet, psmi_naddr128_t *addr, int *idx, psmi_gid128_t *gid) @@ -213,7 +203,7 @@ static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) mq->ips_gpu_window_rv_str = NULL; // no rendezvous #endif - mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; // RDMA and MR cache N/A, leave ep->rdmamode, ep->mr_cache_mode and // ep->rv_gpu_cache_size as set by caller (0, NONE, 0) @@ -276,8 +266,6 @@ hfp_loopback_t psm3_loopback_hi = { .hfp_get_num_ports = psm3_hfp_loopback_get_num_ports, .hfp_get_unit_active = psm3_hfp_loopback_get_unit_active, .hfp_get_port_active = psm3_hfp_loopback_get_port_active, - .hfp_get_num_contexts = psm3_hfp_loopback_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_loopback_get_num_free_contexts, .hfp_get_default_pkey = psm3_hfp_loopback_get_default_pkey, .hfp_get_port_subnet = psm3_hfp_loopback_get_port_subnet, .hfp_get_unit_pci_bus = psm3_hfp_loopback_get_unit_pci_bus, diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 4ce7de78157..9bd59690005 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -140,9 +140,13 @@ * Mutexlock should be used for experimentation while the more useful * 
mutexlock-debug should be enabled during development to catch potential * errors. + * + * When mutexlock-debug is enabled, mutexlock-debug-log-contention may also + * be enabled to log anytime a lock is contended for */ #ifdef PSM_DEBUG #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG +//#define PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION #else #define PSMI_LOCK_IS_SPINLOCK /* #define PSMI_LOCK_IS_MUTEXLOCK */ @@ -168,7 +172,7 @@ /* All GPU transfers beyond this threshold use * RNDV protocol. It is mostly a send side knob. */ -#define GPU_THRESH_RNDV 8000 +#define PSM3_GPU_THRESH_RNDV 8000 #define GPUDIRECT_THRESH_RV 3 @@ -179,20 +183,26 @@ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ -#define PSM_MQ_NIC_RNDV_THRESH 64000 +#define PSM3_MQ_RNDV_NIC_THRESH 64000 #define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" #ifdef PSM_CUDA #define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" #elif defined(PSM_ONEAPI) #define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" #endif -#define PSM_MQ_NIC_MAX_RNDV_WINDOW (4 * 1024 * 1024) /* max rndv window */ +#define PSM3_MQ_RNDV_NIC_WINDOW_MAX (4 * 1024 * 1024) /* max rndv window */ + +/* + * Rendezvous threshold is same for CMA, scale-up or LONG_DATA mechanisms + */ +#define PSM3_MQ_RNDV_SHM_THRESH 16000 -#define MQ_SHM_THRESH_RNDV 16000 #if defined(PSM_CUDA) -#define MQ_SHM_GPU_THRESH_RNDV 127 +/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ +#define PSM3_MQ_RNDV_SHM_GPU_THRESH 63 #elif defined(PSM_ONEAPI) -#define MQ_SHM_GPU_THRESH_RNDV 127 +/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ +#define PSM3_MQ_RNDV_SHM_GPU_THRESH 127 #endif // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 35477d69f2f..678b394d71e 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -386,8 +386,6 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) int 
cpu_and_count = CPU_COUNT(&andcpuset); if (cpu_and_count > 0 && pthread_setaffinity_np(mythread, sizeof(andcpuset), &andcpuset)) { - // bug on OPA, dev_name not yet initialized - // ok on UD and UDP _HFI_ERROR( "Failed to set %s (unit %d) cpu set: %s\n", ep->dev_name, unit, strerror(errno)); //err = -PSM_HAL_ERROR_GENERAL_ERROR; goto bail; diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 36dbf40abfa..86dfa9a88d0 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -455,6 +455,10 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, opts.outsl = opts_i->outsl; if (opts_i->service_id) opts.service_id = (uint64_t) opts_i->service_id; +#ifdef PSM3_PATH_REC_QUERY + if (opts_i->path_res_type != PSM2_PATH_RES_NONE) + opts.path_res_type = opts_i->path_res_type; +#endif if (opts_i->senddesc_num) opts.senddesc_num = opts_i->senddesc_num; if (opts_i->imm_size) @@ -470,7 +474,33 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, opts.service_id = (uint64_t) envvar_val.e_ulonglong; } +#ifdef PSM3_PATH_REC_QUERY + const char *PSM3_PATH_REC_HELP = + "Mechanism to query NIC path record [opp, umad or none] (default is none)"; + /* Get Path resolution type from environment Possible choices are: + * + * NONE : Default same as previous instances. Utilizes static data. + * OPP : Use OFED Plus Plus library to do path record queries. + * UMAD : Use raw libibumad interface to form and process path records. 
+ */ + if (!psm3_getenv("PSM3_PATH_REC", PSM3_PATH_REC_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"none", &envvar_val)) { + if (!strcasecmp(envvar_val.e_str, "none")) + opts.path_res_type = PSM2_PATH_RES_NONE; + else if (!strcasecmp(envvar_val.e_str, "opp")) + opts.path_res_type = PSM2_PATH_RES_OPP; + else if (!strcasecmp(envvar_val.e_str, "umad")) + opts.path_res_type = PSM2_PATH_RES_UMAD; + else { + _HFI_INFO("Invalid value for PSM3_PATH_REC ('%s') %-40s Using: none\n", + envvar_val.e_str, PSM3_PATH_REC_HELP); + opts.path_res_type = PSM2_PATH_RES_NONE; + } + } +#else opts.path_res_type = PSM2_PATH_RES_NONE; +#endif /* Get user specified port number to use. */ if (!psm3_getenv("PSM3_NIC_PORT", "NIC Port number (0 autodetects)", diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h index c1ec006eff9..f8376331e32 100644 --- a/prov/psm3/psm3/psm_ep.h +++ b/prov/psm3/psm3/psm_ep.h @@ -173,7 +173,7 @@ struct psm2_ep { uint32_t hfi_imm_size; /** Immediate data size */ uint32_t connections; /**> Number of connections */ - /* HAL indicates send segmentation support (OPA Send DMA or UDP GSO) + /* HAL indicates send segmentation support (Send DMA or UDP GSO) * by setting max_segs>1 and max_size > 1 MTU. 
* chunk_size used will be min(chunk_max_segs*frag_size, chunk_max_size) * Can set 1 huge and other reasonable if want only 1 to control diff --git a/prov/psm3/psm3/psm_ep_connect.c b/prov/psm3/psm3/psm_ep_connect.c index 5e36cab14ae..56f66610c45 100644 --- a/prov/psm3/psm3/psm_ep_connect.c +++ b/prov/psm3/psm3/psm_ep_connect.c @@ -280,8 +280,8 @@ psm3_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, } else if (!psm3_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = - "there is no OPA PSM3 device (nic)"; - eperr = " OPA"; + "there is no PSM3 device (nic)"; + eperr = " nic"; } len = snprintf(errbuf, sizeof(errbuf) - 1, @@ -540,8 +540,8 @@ psm2_error_t psm3_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr, } else if (!psm3_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = - "there is no OPA PSM3 device (nic)"; - eperr = " OPA"; + "there is no PSM3 device (nic)"; + eperr = " nic"; } len = snprintf(errbuf, sizeof(errbuf) - 1, diff --git a/prov/psm3/psm3/psm_error.c b/prov/psm3/psm3/psm_error.c index 1958b9cd77c..814139adff6 100644 --- a/prov/psm3/psm3/psm_error.c +++ b/prov/psm3/psm3/psm_error.c @@ -268,7 +268,7 @@ struct psmi_error_item psmi_error_items[] = { {PSMI_NOLOG, "unknown 18"}, {PSMI_NOLOG, "unknown 19"}, {PSMI_NOLOG, "Endpoint was closed"}, /* PSM2_EP_WAS_CLOSED = 20 */ - {LOG_ALERT, "PSM Could not find an OPA Unit"}, /* PSM2_EP_NO_DEVICE = 21 */ + {LOG_ALERT, "PSM Could not find a NIC"}, /* PSM2_EP_NO_DEVICE = 21 */ {PSMI_NOLOG, "User passed a bad unit number"}, /* PSM2_EP_UNIT_NOT_FOUND = 22 */ {LOG_ALERT, "Failure in initializing endpoint"}, /* PSM2_EP_DEVICE_FAILURE = 23 */ {PSMI_NOLOG, "Error closing the endpoing error"}, /* PSM2_EP_CLOSE_TIMEOUT = 24 */ diff --git a/prov/psm3/psm3/psm_lock.h b/prov/psm3/psm3/psm_lock.h index c483dba57e9..0965d26ba26 100644 --- a/prov/psm3/psm3/psm_lock.h +++ b/prov/psm3/psm3/psm_lock.h @@ -88,6 +88,9 @@ typedef struct { #elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) pthread_mutex_t lock; 
pthread_t lock_owner; +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + const char *lock_owner_loc; +#endif #elif defined(PSMI_LOCK_IS_MUTEXLOCK) pthread_mutex_t lock; #endif @@ -154,6 +157,9 @@ PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) pthread_mutex_init(&(lock->lock), &attr); pthread_mutexattr_destroy(&attr); lock->lock_owner = PSMI_LOCK_NO_OWNER; +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + lock->lock_owner_loc = "NONE"; +#endif #endif } diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c index 5203715fff8..4248ff7d28d 100644 --- a/prov/psm3/psm3/psm_mq.c +++ b/prov/psm3/psm3/psm_mq.c @@ -1426,13 +1426,13 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) switch (key) { case PSM2_MQ_RNDV_HFI_SZ: if (get) - *((uint32_t *) value) = mq->hfi_thresh_rv; + *((uint32_t *) value) = mq->rndv_nic_thresh; else { val32 = *((uint32_t *) value); - mq->hfi_thresh_rv = val32; + mq->rndv_nic_thresh = val32; } _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n", - mq->hfi_thresh_rv, get ? "GET" : "SET"); + mq->rndv_nic_thresh, get ? 
"GET" : "SET"); break; case PSM2_MQ_RNDV_SHM_SZ: @@ -1655,7 +1655,7 @@ static int psm3_mq_parse_window_rv(const char *str, if (delim) *delim = '\0'; // parse window - if (psm3_parse_str_uint(s, &win, 1, PSM_MQ_NIC_MAX_RNDV_WINDOW)) { + if (psm3_parse_str_uint(s, &win, 1, PSM3_MQ_RNDV_NIC_WINDOW_MAX)) { if (errstr_size) snprintf(errstr, errstr_size, " Invalid window_rv: %s", s); goto fail; @@ -2576,9 +2576,9 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo) // shm_thresh_rv is N/A to NIC and HAL, so we set this here and let // HAL set the rest of the defaults - mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV; + mq->shm_thresh_rv = PSM3_MQ_RNDV_SHM_THRESH; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - mq->shm_gpu_thresh_rv = MQ_SHM_GPU_THRESH_RNDV; + mq->shm_gpu_thresh_rv = PSM3_MQ_RNDV_SHM_GPU_THRESH; #endif psmi_hal_mq_init_defaults(mq); @@ -2618,8 +2618,8 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) psm3_getenv("PSM3_MQ_RNDV_NIC_THRESH", "NIC eager-to-rendezvous switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv); - mq->hfi_thresh_rv = env_hfirv.e_uint; + (union psmi_envvar_val)mq->rndv_nic_thresh, &env_hfirv); + mq->rndv_nic_thresh = env_hfirv.e_uint; #define WINDOW_SYNTAX "Specified as window_size:limit,window_size:limit, ...\nwhere limit is the largest message size the window_size is applicable to.\nThe last window_size in the list will be used for all remaining message\nsizes (eg. its limit is optional and ignored).\nwindow_size must be <= 4194304 and the limit in each entry must be larger\nthan the prior entry." 
@@ -2682,9 +2682,6 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) #endif /* PSM_CUDA || PSM_ONEAPI */ } - /* Re-evaluate this since it may have changed after initializing the shm - * device */ - mq->shm_thresh_rv = psm3_shm_mq_rv_thresh; psm3_getenv("PSM3_MQ_RNDV_SHM_THRESH", "shm eager-to-rendezvous switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, @@ -2693,7 +2690,6 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) { - mq->shm_gpu_thresh_rv = psm3_shm_mq_gpu_rv_thresh; psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH", "shm eager-to-rendezvous switchover for GPU send", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h index 6c7127b0245..824dc1ad60a 100644 --- a/prov/psm3/psm3/psm_mq_internal.h +++ b/prov/psm3/psm3/psm_mq_internal.h @@ -178,7 +178,7 @@ struct psm2_mq { STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */ uint32_t hfi_thresh_tiny; - uint32_t hfi_thresh_rv; + uint32_t rndv_nic_thresh; uint32_t shm_thresh_rv; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) uint32_t shm_gpu_thresh_rv; diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index 7b481351843..181d4dd5ba7 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -463,7 +463,7 @@ psm3_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, /* We don't know recv_msglen yet but we set it here for * mq_iprobe */ req->req_data.send_msglen = req->req_data.recv_msglen = send_msglen; - PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->hfi_thresh_rv, + PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->rndv_nic_thresh, OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); req->state = MQ_STATE_UNEXP_RV; diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c index 1a451f5eb67..58d3ab72b15 100644 --- 
a/prov/psm3/psm3/psm_nic_select.c +++ b/prov/psm3/psm3/psm_nic_select.c @@ -290,24 +290,22 @@ static void psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, long *unit_end, int nunits) { - { - int found, saved_hfis[nunits]; - - /* else, we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - // none found, caller will fail, start is a don't care - *unit_start = 0; - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - } + int found, saved_hfis[nunits]; + + /* we are going to look at: + (a hash of the job key plus the local rank id) mod nunits. */ + found = hfi_find_active_hfis(nunits, -1, saved_hfis); + if (found) + *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + + psm3_get_uuid_hash(job_key)) % found]; + else + // none found, caller will fail, start is a don't care + *unit_start = 0; + /* just in case, caller will check all other units, with wrap */ + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", *unit_start, *unit_end); } diff --git a/prov/psm3/psm3/psm_rndv_mod.c b/prov/psm3/psm3/psm_rndv_mod.c index f980fe73b49..1daa81f5c2c 100644 --- a/prov/psm3/psm3/psm_rndv_mod.c +++ b/prov/psm3/psm3/psm_rndv_mod.c @@ -284,6 +284,17 @@ static void rv_unmap_event_ring(psm3_rv_t rv, struct rv_event_ring* ring) ring->num = 0; } +// RV is available if RV_FILE_NAME (/dev/rv) exists +int psm3_rv_available() +{ + int fd = open(RV_FILE_NAME, O_RDWR); + if (fd == -1) { + return 0; + } + close(fd); + return 1; +} + // we call this once per ep (eg. 
NIC) so we supply the local address // of our NIC for use in the IB CM bind, especially for ethernet psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) diff --git a/prov/psm3/psm3/psm_rndv_mod.h b/prov/psm3/psm3/psm_rndv_mod.h index a9e246ed563..d6f0001a37c 100644 --- a/prov/psm3/psm3/psm_rndv_mod.h +++ b/prov/psm3/psm3/psm_rndv_mod.h @@ -185,6 +185,8 @@ static inline uint16_t psm3_rv_get_gpu_user_minor_bldtime_version(void) extern uint64_t psm3_min_gpu_bar_size(void); #endif +extern int psm3_rv_available(); + extern psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info); extern int psm3_rv_close(psm3_rv_t rv); diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 18c58d9934d..28a6e9de4dd 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -200,6 +200,9 @@ typedef void *psmi_hal_hw_context; #define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff) #define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff) +extern unsigned int psm3_reg_mr_fail_limit; +extern unsigned int psm3_reg_mr_warn_cnt; + int psm3_verno_client(); int psm3_verno_isinteroperable(uint16_t verno); int MOCKABLE(psm3_isinitialized)(); @@ -213,7 +216,6 @@ int psm3_get_current_proc_location(); int psm3_get_max_cpu_numa(); extern int psm3_allow_routers; -extern uint32_t non_dw_mul_sdma; extern psmi_lock_t psm3_creation_lock; extern psm2_ep_t psm3_opened_endpoint; extern int psm3_opened_endpoint_count; @@ -246,43 +248,96 @@ extern void psm3_wake(psm2_ep_t ep); // wake from psm3_wait PSMI_ALWAYS_INLINE( int _psmi_mutex_trylock_inner(pthread_mutex_t *mutex, - const char *curloc, pthread_t *lock_owner)) + const char *curloc, pthread_t *lock_owner +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + , int check, const char **lock_owner_loc +#endif + )) { psmi_assert_always_loc(*lock_owner != pthread_self(), curloc); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + // this is imperfect as the owner's unlock can race with this 
function + // so we fetch loc1 and loc2 just before and after our trylock. Still + // imperfect, but helps provide insight on frequently contended locks + const char *loc1 = *lock_owner_loc; +#endif int ret = pthread_mutex_trylock(mutex); - if (ret == 0) + if (ret == 0) { *lock_owner = pthread_self(); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + *lock_owner_loc = curloc; + } else { + const char *loc2 = *lock_owner_loc; + if (check) + _HFI_VDBG("%s is trying for lock held by %s %s\n", curloc, loc1, loc2); +#endif + } return ret; } PSMI_ALWAYS_INLINE( int _psmi_mutex_lock_inner(pthread_mutex_t *mutex, - const char *curloc, pthread_t *lock_owner)) + const char *curloc, pthread_t *lock_owner +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + , const char **lock_owner_loc +#endif + )) { psmi_assert_always_loc(*lock_owner != pthread_self(), curloc); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + // this is imperfect as the owner's unlock can race with this function + // so we fetch loc1 and loc2 just before and after our trylock. Still + // imperfect, but helps provide insight on frequently contended locks + const char *loc1 = *lock_owner_loc; + if (! 
_psmi_mutex_trylock_inner(mutex, curloc, lock_owner, 0, lock_owner_loc)) + return 0; + const char *loc2 = *lock_owner_loc; + _HFI_VDBG("%s is waiting for lock held by %s %s\n", curloc, loc1, loc2); +#endif int ret = pthread_mutex_lock(mutex); psmi_assert_always_loc(ret != EDEADLK, curloc); *lock_owner = pthread_self(); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + *lock_owner_loc = curloc; +#endif return ret; } PSMI_ALWAYS_INLINE( void _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, - const char *curloc, pthread_t *lock_owner)) + const char *curloc, pthread_t *lock_owner +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + , const char **lock_owner_loc +#endif + )) { psmi_assert_always_loc(*lock_owner == pthread_self(), curloc); *lock_owner = PSMI_LOCK_NO_OWNER; +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + *lock_owner_loc = "NONE"; +#endif psmi_assert_always_loc(pthread_mutex_unlock(mutex) != EPERM, curloc); return; } #define _PSMI_LOCK_INIT(pl) /* static initialization */ +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION +#define _PSMI_LOCK_TRY(pl) \ + _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner), 1, &((pl).lock_owner_loc)) +#define _PSMI_LOCK(pl) \ + _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner), &((pl).lock_owner_loc)) +#define _PSMI_UNLOCK(pl) \ + _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner), &((pl).lock_owner_loc)) +#else #define _PSMI_LOCK_TRY(pl) \ _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) @@ -292,6 +347,7 @@ _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, #define _PSMI_UNLOCK(pl) \ _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) +#endif #define _PSMI_LOCK_ASSERT(pl) \ psmi_assert_always((pl).lock_owner == pthread_self()); #define _PSMI_UNLOCK_ASSERT(pl) \ @@ -375,13 +431,13 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); extern int is_gdr_copy_enabled; /* This limit dictates 
when the sender turns off * GDR Copy and uses SDMA. The limit needs to be less than equal - * GPU RNDV threshold (gpu_thresh_rndv) + * GPU RNDV threshold (psm3_gpu_thresh_rndv) * set to 0 if GDR Copy disabled */ extern uint32_t gdr_copy_limit_send; /* This limit dictates when the reciever turns off * GDR Copy. The limit needs to be less than equal - * GPU RNDV threshold (gpu_thresh_rndv) + * GPU RNDV threshold (psm3_gpu_thresh_rndv) * set to 0 if GDR Copy disabled */ extern uint32_t gdr_copy_limit_recv; @@ -389,7 +445,7 @@ extern int is_gpudirect_enabled; // only for use during parsing of other params extern int _device_support_gpudirect; extern uint32_t gpudirect_rdma_send_limit; extern uint32_t gpudirect_rdma_recv_limit; -extern uint32_t gpu_thresh_rndv; +extern uint32_t psm3_gpu_thresh_rndv; #define MAX_ZE_DEVICES 8 @@ -920,31 +976,31 @@ int gpu_p2p_supported()) { if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; + _gpu_p2p_supported = 0; + if (unlikely(!is_cuda_enabled)) { - _gpu_p2p_supported=0; + _HFI_DBG("returning 0 (cuda disabled)\n"); return 0; } - int num_devices, dev; - CUcontext c; - /* Check which devices the current device has p2p access to. 
*/ - CUdevice current_device; + CUdevice current_device; + CUcontext current_context; + int num_devices, dev_idx; PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - _gpu_p2p_supported = 0; if (num_devices > 1) { - PSMI_CUDA_CALL(cuCtxGetCurrent, &c); - if (c == NULL) { + PSMI_CUDA_CALL(cuCtxGetCurrent, ¤t_context); + if (current_context == NULL) { _HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n"); return 0; } PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); } - for (dev = 0; dev < num_devices; dev++) { + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + PSMI_CUDA_CALL(cuDeviceGet, &device, dev_idx); if (num_devices > 1 && device != current_device) { int canAccessPeer = 0; @@ -952,16 +1008,17 @@ int gpu_p2p_supported()) current_device, device); if (canAccessPeer != 1) - _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev_idx); else - _gpu_p2p_supported |= (1 << device); + _gpu_p2p_supported |= (1 << dev_idx); } else { /* Always support p2p on the same GPU */ - my_gpu_device = device; - _gpu_p2p_supported |= (1 << device); + my_gpu_device = dev_idx; + _gpu_p2p_supported |= (1 << dev_idx); } } + _HFI_DBG("returning (0x%x), device 0x%x (%d)\n", _gpu_p2p_supported, (1 << my_gpu_device), my_gpu_device); return _gpu_p2p_supported; } diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index c2525fa935c..0f1a3fe1d5d 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -130,7 +130,7 @@ uint32_t psm3_ceil_log2(uint64_t val) // so that psm2_epid_t contents can remain opaque to psm2 API callers // who will not see this more detailed psmi_epid_t but will just see psm2_epid_t // A psm2_nid_t also uses this format, but has 0 in the protocol and process -// specific fields (protocol, context, subcontext, qpn, pri_sock, 
aux_sock). +// specific fields (protocol, context, qpn, pri_sock, aux_sock). typedef union { psm2_epid_t psm2_epid; // to cast to/from psm2_epid_t uint64_t w[3]; // word by word access @@ -884,7 +884,6 @@ uint8_t psm3_epid_prefix_len(psm2_epid_t epid) } // The locally unique identifiers for the HW resources -// OPA Native - Context (also need sub-context) // Verbs - 24b QPN (IB, OPA and RoCE Verbs) // Sockets - 16b primary socket number (sin_port) (UDP/TCP) // This should not be called for psm2_nid_t diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index d39b49e6711..57742fc39ea 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -466,8 +466,6 @@ uint32_t psm3_crc(unsigned char *buf, int len); * CPUID return values */ #define CPUID_FAMILY_XEON 0x00000600 -#define CPUID_MODEL_PHI_GEN2 87 -#define CPUID_MODEL_PHI_GEN2M 133 /* * cpuid function 0, returns "GeniuneIntel" in EBX,ECX,EDX * due to Little Endian and Hex it is not so obvious diff --git a/prov/psm3/psm3/psm_verbs_mr.c b/prov/psm3/psm3/psm_verbs_mr.c index aa145cfdc28..fa8fdf39499 100644 --- a/prov/psm3/psm3/psm_verbs_mr.c +++ b/prov/psm3/psm3/psm_verbs_mr.c @@ -1347,6 +1347,7 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, psm3_verbs_mr_t key) { int save_errno; + static int fail_cnt = 0; /* Number of failed priority reg_mr requests */ ASSERT_MRC_FREE_LOCK(cache, mrc); #ifdef PSM_HAVE_RNDV_MOD @@ -1414,11 +1415,26 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, mrc->alloc_id = key->alloc_id; #endif ADD_STAT(cache, mrc->length, registered_bytes, max_registered_bytes); + /* Reset the fail counter */ + fail_cnt = 0; return mrc; failed_reg_mr: - _HFI_ERROR("reg_mr failed: "MRC_FMT": %s\n", MR_OUT_MRC(key), - strerror(save_errno)); + if (priority) { + /* Print the first failure */ + if (!fail_cnt) + _HFI_ERROR("reg_mr failed: "MRC_FMT": %s\n", + MR_OUT_MRC(key), strerror(save_errno)); + fail_cnt++; + /* Print a warning after consecutive 
failures */ + if (fail_cnt == psm3_reg_mr_warn_cnt) + _HFI_ERROR("reg_mr failed %d times in a row.\n", + psm3_reg_mr_warn_cnt); + /* Bail out if it fails too many times */ + if (fail_cnt >= psm3_reg_mr_fail_limit) + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "reg_mr failed for too many times.\n"); + } cache->failed++; cache->failed_reg_mr++; free_mr(cache, mrc); diff --git a/prov/psm3/psm3/ptl_am/am_config.h b/prov/psm3/psm3/ptl_am/am_config.h index 79600601037..9ff2c3972e4 100644 --- a/prov/psm3/psm3/ptl_am/am_config.h +++ b/prov/psm3/psm3/ptl_am/am_config.h @@ -56,25 +56,9 @@ #include "psm_config.h" -/* - * Can change the rendezvous threshold based on usage of cma (or not) - */ -#define PSMI_MQ_RV_THRESH_CMA 16000 - -/* If no kernel assisted copy is available this is the rendezvous threshold */ -#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000 - #define AMSH_HAVE_CMA 0x1 #define AMSH_HAVE_KASSIST 0x1 -#if defined(PSM_CUDA) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSMI_MQ_GPU_RV_THRESH 127 -#elif defined(PSM_ONEAPI) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSMI_MQ_GPU_RV_THRESH 127 -#endif - /* Each block reserves some space at the beginning to store auxiliary data */ #define AMSH_BLOCK_HEADER_SIZE 4096 @@ -86,6 +70,12 @@ * am_pkt_bulk_t header struct. 
*/ #define AMLONG_SZ_NO_DSA 8192 +// for AI workloads with limited processes and multi-ep, better to have +// large MTU and will default to CMA off for all but 1st EP +#define AMLONG_SZ_MULTIEP 32768 +// This is the range we allow AMLONG_SZ to be configured as +#define AMLONG_SZ_MIN 1024 +#define AMLONG_SZ_MAX (1024*1024) #ifdef PSM_DSA /* DSA benefits from larger bulk packets and hence larger copies */ @@ -94,7 +84,14 @@ #define AMLONG_SZ_DSA (1024*512) #endif -#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET -#define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get" +// GPU only supports GET("cma-get") or OFF("none"), so can't use PUT as default +#define PSM3_KASSIST_MODE_DEFAULT PSM3_KASSIST_CMA_GET +#define PSM3_KASSIST_MODE_DEFAULT_STRING "cma-get" + +#ifdef PSM_FI +#define SHM_FAULTINJ_CMA_ERR 10000 /* 1 every X CMA get/put error */ +#define SHM_FAULTINJ_CMA_NOTAVAIL 4 /* 1 every X CMA available at init */ +#endif /* PSM_FI */ + #endif /* PTL_AM_AM_CONFIG_H */ diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c index 020f3afb349..89dbdd6cd87 100644 --- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c +++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c @@ -87,28 +87,15 @@ #endif #endif -int psm3_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -int psm3_shm_mq_gpu_rv_thresh = PSMI_MQ_GPU_RV_THRESH; -#endif - -// qcounts and qelemsz tunable via amsh_fifo_getconfig(); -static amsh_qinfo_t amsh_qcounts = { - .qreqFifoShort = AMSHORT_Q_NO_DSA, - .qreqFifoLong = AMLONG_Q_NO_DSA, - .qrepFifoShort = AMSHORT_Q_NO_DSA, - .qrepFifoLong = AMLONG_Q_NO_DSA -}; +/* AMLONG_PAYLOAD is number of bytes available in a bulk packet for payload. 
*/ +#define AMLONG_PAYLOAD(FifoLong) ((FifoLong) - sizeof(am_pkt_bulk_t)) -static amsh_qinfo_t amsh_qelemsz = { - .qreqFifoShort = sizeof(am_pkt_short_t), - .qreqFifoLong = AMLONG_SZ_NO_DSA, - .qrepFifoShort = sizeof(am_pkt_short_t), - .qrepFifoLong = AMLONG_SZ_NO_DSA -}; +/* req and rep MTU is the same, so can use either here */ +/* this is our local MTU, use when receiving data */ +#define AMLONG_MTU_LOCAL(ptl) AMLONG_PAYLOAD((ptl)->qelemsz.qreqFifoLong) -/* AMLONG_MTU is the number of bytes available in a bulk packet for payload. */ -#define AMLONG_MTU (amsh_qelemsz.qreqFifoLong-sizeof(am_pkt_bulk_t)) +/* this is the MTU of a peer, use when sending data */ +#define AMLONG_MTU_DEST(ptl, destidx) AMLONG_PAYLOAD((ptl)->am_ep[destidx].qdir.qreqH->longbulkq.elem_sz) ustatic struct { void *addr; @@ -124,9 +111,9 @@ static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, /* Kassist helper functions */ #if _HFI_DEBUGGING -static const char *psmi_kassist_getmode(int mode); +static const char *psm3_kassist_getmode(int mode); #endif -static int psm3_get_kassist_mode(); +static int psm3_get_kassist_mode(int first_ep); int psm3_epaddr_pid(psm2_epaddr_t epaddr); static inline void @@ -152,19 +139,28 @@ am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems) } } -#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \ - PSMI_PAGESIZE) -static inline uintptr_t am_ctl_sizeof_block() +#define AMSH_QSIZE(ptl, type) \ + PSMI_ALIGNUP((ptl)->qelemsz.q ## type * (ptl)->qcounts.q ## type, \ + PSMI_PAGESIZE) + +// compute size for our inbound shm segment +static inline uintptr_t am_ctl_sizeof_block(struct ptl_am *ptl) { - return PSMI_ALIGNUP( - PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + + return PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + /* reqctrl block */ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + - _PA(reqFifoShort) + _PA(reqFifoLong) + + AMSH_QSIZE(ptl, reqFifoShort) + AMSH_QSIZE(ptl, 
reqFifoLong) + /*reqctrl block */ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + - /* align to page size */ - _PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE); + AMSH_QSIZE(ptl, repFifoShort) + AMSH_QSIZE(ptl, repFifoLong); +} + +// compute size for a remote node's shm segment +static inline uintptr_t am_ctl_sizeof_seg(struct am_ctl_nodeinfo *nodeinfo) +{ + return ((uintptr_t) nodeinfo->qdir.qrepFifoLong + + nodeinfo->amsh_qsizes.qrepFifoLong) + - nodeinfo->amsh_shmbase; } #undef _PA @@ -189,7 +185,7 @@ static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu) *gpu = (data & ~pid_mask) >> 22; } -static void am_update_directory(struct am_ctl_nodeinfo *); +static void am_update_directory(struct am_ctl_nodeinfo *, size_t segsz); static void amsh_atexit() @@ -282,15 +278,8 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen) int shmfd = -1; char *amsh_keyname = NULL; int iterator; - /* Get which kassist mode to use. */ - ptl->psmi_kassist_mode = psm3_get_kassist_mode(); - - _HFI_PRDBG("kassist_mode %d %s use_kassist %d\n", - ptl->psmi_kassist_mode, - psmi_kassist_getmode(ptl->psmi_kassist_mode), - (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF)); - segsz = am_ctl_sizeof_block(); + segsz = am_ctl_sizeof_block(ptl); for (iterator = 0; iterator < INT_MAX; iterator++) { snprintf(shmbuf, sizeof(shmbuf), @@ -426,9 +415,10 @@ psm2_error_t psm3_epdir_extend(ptl_t *ptl_gen) } /** - * Unmap shm regions upon proper disconnect with other processes + * Unmap peer's shm region upon proper disconnect with other processes */ -psm2_error_t psm3_do_unmap(uintptr_t shmbase) +psm2_error_t psm3_do_unmap(struct am_ctl_nodeinfo *nodeinfo) + { psm2_error_t err = PSM2_OK; #if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) @@ -437,9 +427,9 @@ psm2_error_t psm3_do_unmap(uintptr_t shmbase) /* ignore other errors as context could be destroyed before this */ CUresult cudaerr; //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, 
(void*)shmbase); + // cuMemHostUnregister, (void*)nodeinfo->amsh_shmbase); psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + cudaerr = psmi_cuMemHostUnregister((void*)nodeinfo->amsh_shmbase); if (cudaerr) { const char *pStr = NULL; psmi_count_cuGetErrorString++; @@ -453,16 +443,16 @@ psm2_error_t psm3_do_unmap(uintptr_t shmbase) if (PSMI_IS_GPU_ENABLED) { ze_result_t result; //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, - // (void *)shmbase); + // (void *)nodeinfo->amsh_shmbase); psmi_count_zexDriverReleaseImportedPointer++; result = psmi_zexDriverReleaseImportedPointer(ze_driver, - (void *)shmbase); + (void *)nodeinfo->amsh_shmbase); if (result != ZE_RESULT_SUCCESS) { _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); } } #endif - if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + if (munmap((void *)nodeinfo->amsh_shmbase, am_ctl_sizeof_seg(nodeinfo))) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error with munmap of shared segment: %s", @@ -484,11 +474,10 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i; - int use_kassist; uint16_t shmidx; char shmbuf[256]; void *dest_mapptr; - size_t segsz; + size_t segsz = 0; psm2_error_t err = PSM2_OK; int dest_shmfd; struct am_ctl_nodeinfo *dest_nodeinfo; @@ -509,9 +498,6 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm } - use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF); - - segsz = am_ctl_sizeof_block(); for (iterator = 0; iterator < INT_MAX; iterator++) { snprintf(shmbuf, sizeof(shmbuf), @@ -521,9 +507,10 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm iterator); dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU); if (dest_shmfd < 0) { - if (errno == EACCES && iterator < INT_MAX) + if (errno == 
EACCES && iterator < INT_MAX) { + err = PSM2_SHMEM_SEGMENT_ERR; continue; - else { + } else { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error opening remote " @@ -544,8 +531,9 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm close(dest_shmfd); goto fail; } - if (getuid() == st.st_uid) { + if (getuid() == st.st_uid && st.st_size) { err = PSM2_OK; + segsz = st.st_size; break; } else { err = PSM2_SHMEM_SEGMENT_ERR; @@ -561,6 +549,7 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm "namespace exhausted."); goto fail; } + psmi_assert(segsz); dest_mapptr = mmap(NULL, segsz, PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0); @@ -613,45 +602,26 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm for (i = 0; i <= ptl->max_ep_idx; i++) { if (!psm3_epid_zero_internal(ptl->am_ep[i].epid)) - am_update_directory(&ptl->am_ep[i]); + am_update_directory(&ptl->am_ep[i], am_ctl_sizeof_seg(&ptl->am_ep[i])); } } for (i = 0; i < ptl->am_ep_size; i++) { psmi_assert(psm3_epid_cmp_internal(ptl->am_ep[i].epid, epid)); if (psm3_epid_zero_internal(ptl->am_ep[i].epid)) { + // populate our local copy of the peer's nodeinfo ptl->am_ep[i].epid = epid; ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno; ptl->am_ep[i].pid = dest_nodeinfo->pid; - if (use_kassist) { - /* If we are able to use CMA assume everyone - * else on the node can also use it. - * Advertise that CMA is active via the - * feature flag. 
- */ - - if (psm3_cma_available()) { - ptl->am_ep[i].amsh_features |= - AMSH_HAVE_CMA; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_CMA; - } else { - ptl->psmi_kassist_mode = - PSMI_KASSIST_OFF; - use_kassist = 0; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; - } - } else - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; - _HFI_CONNDBG("KASSIST MODE: %s\n", - psmi_kassist_getmode(ptl->psmi_kassist_mode)); + ptl->am_ep[i].amsh_features = dest_nodeinfo->amsh_features; + _HFI_CONNDBG("Peer KASSIST: %d\n", + (ptl->am_ep[i].amsh_features & AMSH_HAVE_CMA) != 0); shmidx = *shmidx_o = i; _HFI_CONNDBG("Mapped epid %s into shmidx %d\n", psm3_epid_fmt_internal(epid, 0), shmidx); ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr; ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes; if (i > ptl->max_ep_idx) ptl->max_ep_idx = i; + am_update_directory(&ptl->am_ep[shmidx], segsz); break; } } @@ -671,10 +641,6 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm * Initialize pointer structure and locks for endpoint shared-memory AM. 
*/ -#define AMSH_QSIZE(type) \ - PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \ - PSMI_PAGESIZE) - static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; @@ -689,10 +655,10 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) if ((err = psm3_shm_create(ptl_gen))) goto fail; - ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort); - ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong); - ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort); - ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong); + ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(ptl, reqFifoShort); + ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(ptl, reqFifoLong); + ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(ptl, repFifoShort); + ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(ptl, repFifoLong); /* We core dump right after here if we don't check the mmap */ @@ -710,38 +676,38 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL; ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL; - am_update_directory(ptl->self_nodeinfo); + am_update_directory(ptl->self_nodeinfo, am_ctl_sizeof_block(ptl)); ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort)); ptl->reqH.end = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) + - amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort); + ptl->qcounts.qreqFifoShort * (uintptr_t)ptl->qelemsz.qreqFifoShort); ptl->repH.head = ptl->repH.base = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort)); ptl->repH.end = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) + - amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort); + ptl->qcounts.qrepFifoShort * (uintptr_t)ptl->qelemsz.qrepFifoShort); 
am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq, - amsh_qcounts.qreqFifoShort, - amsh_qelemsz.qreqFifoShort); + ptl->qcounts.qreqFifoShort, + ptl->qelemsz.qreqFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq, - amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong); + ptl->qcounts.qreqFifoLong, ptl->qelemsz.qreqFifoLong); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq, - amsh_qcounts.qrepFifoShort, - amsh_qelemsz.qrepFifoShort); + ptl->qcounts.qrepFifoShort, + ptl->qelemsz.qrepFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq, - amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong); + ptl->qcounts.qrepFifoLong, ptl->qelemsz.qrepFifoLong); /* Set bulkidx in every bulk packet */ am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong, - amsh_qelemsz.qreqFifoLong, - amsh_qcounts.qreqFifoLong); + ptl->qelemsz.qreqFifoLong, + ptl->qcounts.qreqFifoLong); am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong, - amsh_qelemsz.qrepFifoLong, - amsh_qcounts.qrepFifoLong); + ptl->qelemsz.qrepFifoLong, + ptl->qcounts.qrepFifoLong); /* install the old sighandler back */ sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); @@ -751,6 +717,7 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) return err; } +/* unmap our own local shared memory segment (ptl->self_nodeinfo) */ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; @@ -796,7 +763,7 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) } } #endif - if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + if (munmap((void *)shmbase, am_ctl_sizeof_block(ptl))) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error with munmap of shared segment: %s", @@ -815,23 +782,21 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) * updated when a new epaddr is connected to or on every epaddr already * connected to whenever the shared memory segment is relocated via mremap. 
* - * @param epaddr Endpoint address for which to update local directory. + * @param ptl our local endpoint + * @param nodeinfo entry in directory to update + * @param segsz optional expected size of shared memory segment contents + * for sanity check (if 0 check is skipped) */ static -void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) +void am_update_directory(struct am_ctl_nodeinfo *nodeinfo, size_t segsz) { - uintptr_t base_this; - - base_this = nodeinfo->amsh_shmbase + - AMSH_BLOCK_HEADER_SIZE; - /* Request queues */ - nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this; + nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) + (nodeinfo->amsh_shmbase + AMSH_BLOCK_HEADER_SIZE); nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *) ((uintptr_t) nodeinfo->qdir.qreqH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); - nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *) ((uintptr_t) nodeinfo->qdir.qreqFifoShort + nodeinfo->amsh_qsizes.qreqFifoShort); @@ -840,7 +805,6 @@ void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *) ((uintptr_t) nodeinfo->qdir.qreqFifoLong + nodeinfo->amsh_qsizes.qreqFifoLong); - nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *) ((uintptr_t) nodeinfo->qdir.qrepH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); @@ -860,17 +824,11 @@ void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) nodeinfo->qdir.qrepFifoLong); /* Sanity check */ - uintptr_t base_next = - (uintptr_t) nodeinfo->qdir.qrepFifoLong + - nodeinfo->amsh_qsizes.qrepFifoLong; - - // this assert can happen if shm Fifo settings inconsistent - // such as 1 rank enabling DSA and another not enabling DSA - if (base_next - base_this > am_ctl_sizeof_block()) { - _HFI_ERROR("Inconsistent shm, Fifo parameters delta=%lu > block=%lu. 
Aborting\n", - (unsigned long)(base_next - base_this), - (unsigned long)am_ctl_sizeof_block()); - psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block()); + uintptr_t delta = am_ctl_sizeof_seg(nodeinfo); + if (segsz && delta != segsz) { + _HFI_ERROR("Inconsistent shm, Fifo parameters delta=%lu != segsz=%lu. Aborting\n", + (unsigned long)delta, (unsigned long) segsz); + psmi_assert_always(delta == segsz); } } @@ -947,7 +905,7 @@ amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t /* other setup */ ptl->am_ep[shmidx].epaddr = epaddr; - am_update_directory(&ptl->am_ep[shmidx]); + am_update_directory(&ptl->am_ep[shmidx], 0); /* Finally, add to table */ if ((err = psm3_epid_add(ptl->ep, epid, epaddr))) goto fail; @@ -990,7 +948,7 @@ amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno; ptl->am_ep[shmidx].pid = nodeinfo->pid; ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes; - am_update_directory(&ptl->am_ep[shmidx]); + am_update_directory(&ptl->am_ep[shmidx], 0); return; } @@ -1227,7 +1185,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) */ if (((am_epaddr_t *) epaddr)->cstate_incoming == AMSH_CSTATE_INCOMING_DISC_REQUESTED) - err = psm3_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + err = psm3_do_unmap(&ptl->am_ep[shmidx]); req->epid_mask[i] = AMSH_CMASK_POSTREQ; } else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; @@ -1925,9 +1883,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dst, int flags)) { -#ifdef PSM_DEBUG struct ptl_am *ptl = (struct ptl_am *)ptl_gen; -#endif uint16_t type; uint32_t bulkidx; uint16_t hidx = (uint16_t) handler; @@ -1952,7 +1908,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, } else { int i; - psmi_assert(len < 
amsh_qelemsz.qreqFifoLong); + psmi_assert(len <= AMLONG_MTU_DEST(ptl, destidx)); psmi_assert(src != NULL || nargs > NSHORT_ARGS); type = AMFMT_SHORT; @@ -1986,6 +1942,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, uint8_t *src_this = (uint8_t *) src; uint8_t *dst_this = (uint8_t *) dst; uint32_t bytes_this; + uint32_t mtu = AMLONG_MTU_DEST(ptl, destidx); #ifdef PSM_DSA int use_dsa = psm3_use_dsa(len); #endif @@ -1996,7 +1953,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, is_reply ? "rep" : "req", src, dst, (uint32_t) len, hidx); while (bytes_left) { - bytes_this = min(bytes_left, AMLONG_MTU); + bytes_this = min(bytes_left, mtu); AMSH_POLL_UNTIL(ptl_gen, is_reply, (bulkpkt = am_ctl_getslot_long(ptl_gen, @@ -2162,6 +2119,7 @@ psm3_am_reqq_add(int amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, ptl->psmi_am_reqq_fifo.lastp = &nreq->next; } +// process inbound packet on our local shm fifos static void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) { @@ -2206,12 +2164,12 @@ void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) bulkptr = (uintptr_t) ptl->self_nodeinfo->qdir. qreqFifoLong; - bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; + bulkptr += bulkidx * (uintptr_t)ptl->qelemsz.qreqFifoLong; } else { bulkptr = (uintptr_t) ptl->self_nodeinfo->qdir. 
qrepFifoLong; - bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong; + bulkptr += bulkidx * (uintptr_t)ptl->qelemsz.qrepFifoLong; } break; default: @@ -2223,6 +2181,7 @@ void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) } bulkpkt = (am_pkt_bulk_t *) bulkptr; + psmi_assert(bulkpkt->len <= AMLONG_MTU_LOCAL(ptl)); _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d " "from_idx=%d pkt=%p/%p hidx=%d\n", ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag, @@ -2459,6 +2418,8 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, { uint32_t bytes_left = len; uint32_t bytes_this = 0; + ptl_t *ptl = epaddr->ptlctl->ptl; + uint32_t mtu = AMLONG_MTU_DEST((struct ptl_am *)ptl, ((am_epaddr_t *) epaddr)->shmidx); psm2_handler_t handler = mq_handler_hidx; @@ -2468,7 +2429,7 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, args[2].u32w0 = 0; psmi_assert(!(flags_user & PSM2_MQ_FLAG_SENDSYNC));// needs rndv - if (len <= AMLONG_MTU) { + if (len <= mtu) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else @@ -2480,15 +2441,15 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, do { args[2].u32w0 += bytes_this; - bytes_this = min(bytes_left, AMLONG_MTU); + bytes_this = min(bytes_left, mtu); /* Assume that shared-memory active messages are delivered in order */ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psm3_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, + psm3_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, handler, args, 3, (void *)ubuf, bytes_this, NULL, 0); } else { - psm3_amsh_short_request(epaddr->ptlctl->ptl, epaddr, + psm3_amsh_short_request(ptl, epaddr, handler, args, 3, ubuf, bytes_this, 0); } @@ -2657,15 +2618,15 @@ int psm3_epaddr_pid(psm2_epaddr_t epaddr) } #if _HFI_DEBUGGING static -const char *psmi_kassist_getmode(int mode) +const char *psm3_kassist_getmode(int mode) { switch (mode) { - case PSMI_KASSIST_OFF: - return "kassist off"; - case 
PSMI_KASSIST_CMA_GET: - return "cma get"; - case PSMI_KASSIST_CMA_PUT: - return "cma put"; + case PSM3_KASSIST_OFF: + return "none"; + case PSM3_KASSIST_CMA_GET: + return "cma-get"; + case PSM3_KASSIST_CMA_PUT: + return "cma-put"; default: return "unknown"; } @@ -2673,10 +2634,21 @@ const char *psmi_kassist_getmode(int mode) #endif static -int psm3_get_kassist_mode() +int psm3_get_kassist_mode(int first_ep) { - /* Cuda PSM2 supports only KASSIST_CMA_GET */ - int mode = PSMI_KASSIST_CMA_GET; + /* GPU supports only KASSIST_CMA_GET or NONE */ + int mode = (first_ep?PSM3_KASSIST_MODE_DEFAULT:PSM3_KASSIST_OFF); +#ifdef PSM_FI + if_pf(PSM3_FAULTINJ_ENABLED()) { + PSM3_FAULTINJ_STATIC_DECL(fi_cma_notavail, "cma_notavail", + "CMA not available", + 1, SHM_FAULTINJ_CMA_NOTAVAIL); + if (PSM3_FAULTINJ_IS_FAULT(fi_cma_notavail, NULL, "")) + return PSM3_KASSIST_OFF; + } +#endif + if (! psm3_cma_available()) + return PSM3_KASSIST_OFF; #ifdef PSM_DSA // dsa_available is determined during psm3_init(), while kassist is // not checked until a shm ep is being opened. So dsa_available is @@ -2686,7 +2658,7 @@ int psm3_get_kassist_mode() // where kassist applies, so we must turn it off so DSA can // do the copies for all rndv shm messages if (psm3_dsa_available()) - return PSMI_KASSIST_OFF; + return PSM3_KASSIST_OFF; #endif union psmi_envvar_val env_kassist; @@ -2707,22 +2679,23 @@ int psm3_get_kassist_mode() #endif PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) - PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { + (first_ep?PSM3_KASSIST_MODE_DEFAULT_STRING:"none"), + &env_kassist)) { char *s = env_kassist.e_str; if ( #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ! 
PSMI_IS_GPU_ENABLED && #endif strcasecmp(s, "cma-put") == 0) - mode = PSMI_KASSIST_CMA_PUT; + mode = PSM3_KASSIST_CMA_PUT; else if (strcasecmp(s, "cma-get") == 0) - mode = PSMI_KASSIST_CMA_GET; + mode = PSM3_KASSIST_CMA_GET; else if (strcasecmp(s, "none") == 0) - mode = PSMI_KASSIST_OFF; + mode = PSM3_KASSIST_OFF; else { _HFI_INFO("Invalid value for PSM3_KASSIST_MODE ('%s') %-40s Using: cma-get\n", s, PSM3_KASSIST_MODE_HELP); - mode = PSMI_KASSIST_CMA_GET; + mode = PSM3_KASSIST_CMA_GET; } } return mode; @@ -2792,7 +2765,6 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, "Fatal error in " "connecting to shm segment"); } - am_update_directory(&ptl->am_ep[shmidx]); tok->shmidx = shmidx; } @@ -2890,7 +2862,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, */ cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) { - err = psm3_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + err = psm3_do_unmap(&ptl->am_ep[shmidx]); psm3_epid_remove(epaddr->ptlctl->ep, epaddr->epid); } } @@ -2934,54 +2906,91 @@ psm3_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) parameters->max_handlers = PSMI_AM_NUM_HANDLERS; parameters->max_nargs = PSMI_AM_MAX_ARGS; - parameters->max_request_short = AMLONG_MTU; - parameters->max_reply_short = AMLONG_MTU; + // we have not yet connected to our peers. If we are certain multi-ep + // is not going to be used, we can report our local MTU. + // Otherwise, to be safe we must report our smallest valid MTU. 
+ // This value is only used in psmx3 to indicate the max atomic size + // so a modest value is acceptable as most apps (such as intelSHMEM) + // will only do atomics on a single data item of <= 128 bits + if (psm3_multi_ep_enabled) { + parameters->max_request_short = AMLONG_PAYLOAD(AMLONG_SZ_MIN); + parameters->max_reply_short = AMLONG_PAYLOAD(AMLONG_SZ_MIN); + } else { + parameters->max_request_short = + AMLONG_MTU_LOCAL((struct ptl_am *)(ep->ptl_amsh.ptl)); + parameters->max_reply_short = + AMLONG_MTU_LOCAL((struct ptl_am *)(ep->ptl_amsh.ptl)); + } return PSM2_OK; } -static void amsh_fifo_getconfig() +// for multi-ep, we use different defaults for the additional EPs +// to avoid serialization within CMA +static void amsh_fifo_getconfig(struct ptl_am *ptl) { union psmi_envvar_val env_var; + // defaults + ptl->qcounts.qreqFifoShort = AMSHORT_Q_NO_DSA; + ptl->qcounts.qreqFifoLong = AMLONG_Q_NO_DSA; + ptl->qcounts.qrepFifoShort = AMSHORT_Q_NO_DSA; + ptl->qcounts.qrepFifoLong = AMLONG_Q_NO_DSA; + + ptl->qelemsz.qreqFifoShort = sizeof(am_pkt_short_t); + ptl->qelemsz.qreqFifoLong = AMLONG_SZ_NO_DSA; + ptl->qelemsz.qrepFifoShort = sizeof(am_pkt_short_t); + ptl->qelemsz.qrepFifoLong = AMLONG_SZ_NO_DSA; + #ifdef PSM_DSA if (psm3_dsa_available()) { // adjust defaults - amsh_qcounts.qreqFifoShort = AMSHORT_Q_DSA; - amsh_qcounts.qrepFifoShort = AMSHORT_Q_DSA; - amsh_qcounts.qreqFifoLong = AMLONG_Q_DSA; - amsh_qcounts.qrepFifoLong = AMLONG_Q_DSA; - amsh_qelemsz.qreqFifoLong = AMLONG_SZ_DSA; - amsh_qelemsz.qrepFifoLong = AMLONG_SZ_DSA; - } + ptl->qcounts.qreqFifoShort = AMSHORT_Q_DSA; + ptl->qcounts.qrepFifoShort = AMSHORT_Q_DSA; + ptl->qcounts.qreqFifoLong = AMLONG_Q_DSA; + ptl->qcounts.qrepFifoLong = AMLONG_Q_DSA; + + ptl->qelemsz.qreqFifoLong = AMLONG_SZ_DSA; + ptl->qelemsz.qrepFifoLong = AMLONG_SZ_DSA; + } else #endif + if (ptl->kassist_mode == PSM3_KASSIST_OFF + && psm3_get_mylocalrank_count() > 1 + && psm3_get_mylocalrank_count() <= 16) { + // adjust defaults for large 
message AI workloads + ptl->qelemsz.qreqFifoLong = AMLONG_SZ_MULTIEP; + ptl->qelemsz.qrepFifoLong = AMLONG_SZ_MULTIEP; + } psm3_getenv("PSM3_SHM_SHORT_Q_DEPTH", "Number of entries on shm undirectional short msg fifos", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)amsh_qcounts.qreqFifoShort, &env_var); - amsh_qcounts.qreqFifoShort = env_var.e_uint; - amsh_qcounts.qrepFifoShort = env_var.e_uint; + (union psmi_envvar_val)ptl->qcounts.qreqFifoShort, &env_var); + ptl->qcounts.qreqFifoShort = env_var.e_uint; + ptl->qcounts.qrepFifoShort = env_var.e_uint; psm3_getenv("PSM3_SHM_LONG_Q_DEPTH", "Number of entries on shm undirectional long msg fifos", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)amsh_qcounts.qreqFifoLong, &env_var); - amsh_qcounts.qreqFifoLong = env_var.e_uint; - amsh_qcounts.qrepFifoLong = env_var.e_uint; + (union psmi_envvar_val)ptl->qcounts.qreqFifoLong, &env_var); + ptl->qcounts.qreqFifoLong = env_var.e_uint; + ptl->qcounts.qrepFifoLong = env_var.e_uint; // PSM3_SHM_SHORT_MTU - untunable at sizeof(am_pkt_short_t) - psm3_getenv("PSM3_SHM_LONG_MTU", - "Size of buffers on shm undirectional long msg fifos", + psm3_getenv_range("PSM3_SHM_LONG_MTU", + "Size of buffers on shm undirectional long msg fifos", NULL, PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)amsh_qelemsz.qreqFifoLong, &env_var); - amsh_qelemsz.qreqFifoLong = env_var.e_uint; - amsh_qelemsz.qrepFifoLong = env_var.e_uint; + (union psmi_envvar_val)ptl->qelemsz.qreqFifoLong, + (union psmi_envvar_val)AMLONG_SZ_MIN, + (union psmi_envvar_val)AMLONG_SZ_MAX, + NULL, NULL, &env_var); + ptl->qelemsz.qreqFifoLong = env_var.e_uint; + ptl->qelemsz.qrepFifoLong = env_var.e_uint; _HFI_PRDBG("shm Q Short: %u of %u bytes, Long: %u of %u bytes\n", - amsh_qcounts.qreqFifoShort, amsh_qelemsz.qreqFifoShort, - amsh_qcounts.qreqFifoLong, amsh_qelemsz.qrepFifoLong); + ptl->qcounts.qreqFifoShort, ptl->qelemsz.qreqFifoShort, + 
ptl->qcounts.qreqFifoLong, ptl->qelemsz.qrepFifoLong); } /** @@ -2996,6 +3005,7 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; + int first_ep = (psm3_opened_endpoint_count == 0); /* Preconditions */ psmi_assert_always(ep != NULL); @@ -3011,8 +3021,14 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) ptl->connect_phase = 0; ptl->connect_incoming = 0; ptl->connect_outgoing = 0; + /* Get which kassist mode to use. */ + ptl->kassist_mode = psm3_get_kassist_mode(first_ep); + + _HFI_PRDBG("kassist_mode %d %s\n", + ptl->kassist_mode, + psm3_kassist_getmode(ptl->kassist_mode)); - amsh_fifo_getconfig(); + amsh_fifo_getconfig(ptl); #ifdef PSM_ONEAPI #ifndef PSM_HAVE_PIDFD @@ -3046,21 +3062,8 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) goto fail; ptl->self_nodeinfo->psm_verno = PSMI_VERNO; - if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) { - if (psm3_cma_available()) { - ptl->self_nodeinfo->amsh_features |= - AMSH_HAVE_CMA; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_CMA; - } else { - ptl->psmi_kassist_mode = - PSMI_KASSIST_OFF; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; - } - } else { - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; + if (ptl->kassist_mode != PSM3_KASSIST_OFF) { + ptl->self_nodeinfo->amsh_features |= AMSH_HAVE_CMA; } ptl->self_nodeinfo->pid = getpid(); ptl->self_nodeinfo->epid = ep->epid; diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 203b9512c3a..0796dbee9e9 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -132,14 +132,14 @@ typedef struct psmi_handlertab { #define PSMI_AM_DISC_REQ 3 #define PSMI_AM_DISC_REP 4 -#define PSMI_KASSIST_OFF 0x0 -#define PSMI_KASSIST_CMA_GET 0x1 -#define PSMI_KASSIST_CMA_PUT 0x2 +#define PSM3_KASSIST_OFF 0x0 +#define PSM3_KASSIST_CMA_GET 0x1 +#define PSM3_KASSIST_CMA_PUT 0x2 -#define 
PSMI_KASSIST_CMA 0x3 -#define PSMI_KASSIST_GET 0x1 -#define PSMI_KASSIST_PUT 0x2 -#define PSMI_KASSIST_MASK 0x3 +#define PSM3_KASSIST_CMA 0x3 +#define PSM3_KASSIST_GET 0x1 +#define PSM3_KASSIST_PUT 0x2 +#define PSM3_KASSIST_MASK 0x3 int psm3_epaddr_pid(psm2_epaddr_t epaddr); @@ -404,7 +404,7 @@ struct amsh_qdirectory { * Shared fifo element counts and sizes ****************************************** * These values are context-wide, they can only be set early on and can't be * - * modified at runtime. All endpoints are expected to use the same values. + * modified at runtime. Each endpoint could potentially use different values. */ typedef struct amsh_qinfo { @@ -424,6 +424,10 @@ struct amsh_qinfo { * * This structure is carefully arranged to optimize cache locality and * performance. Do not modify without careful and thorough analysis. + * + * In addition to the copies in ptl_am.am_ep and ptl_am.self_nodeinfo + * this is also placed at the beginning of the shared memory segment so + * our peers can get info about our version, epid, qsizes, features, etc */ struct am_ctl_nodeinfo { uint16_t psm_verno; @@ -433,7 +437,7 @@ struct am_ctl_nodeinfo { psm2_epaddr_t epaddr; uintptr_t amsh_shmbase; amsh_qinfo_t amsh_qsizes; - uint32_t amsh_features; + volatile uint32_t amsh_features; struct amsh_qdirectory qdir; } __attribute__((aligned(64))); @@ -450,7 +454,7 @@ struct ptl_am { int zero_polls; int amsh_only_polls; int max_ep_idx, am_ep_size; - int psmi_kassist_mode; + int kassist_mode; char *amsh_keyname; /* These three items carefully picked to fit in one cache line. 
*/ @@ -460,8 +464,8 @@ struct ptl_am { am_pkt_short_t amsh_empty_shortpkt; - struct am_ctl_nodeinfo *self_nodeinfo; - struct am_ctl_nodeinfo *am_ep; + struct am_ctl_nodeinfo *self_nodeinfo; /* our local advertized shm */ + struct am_ctl_nodeinfo *am_ep; /* local array w/copy of each peer's info */ #ifdef PSM_CUDA am_cuda_memhandle_cache_t memhandle_cache; #endif @@ -472,6 +476,9 @@ struct ptl_am { #define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) void *gpu_bounce_buf; // for H to D #endif + // qcounts and qelemsz tunable via amsh_fifo_getconfig() + amsh_qinfo_t qcounts; + amsh_qinfo_t qelemsz; } __attribute__((aligned(64))); #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index 8a38d22ad4d..a6af3c356ac 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -66,6 +66,24 @@ #include "am_oneapi_memhandle_cache.h" #endif +#ifdef PSM_FI +/* + * fault injection for psm3_cma_get() and psm3_cma_put(). + * since the reaction to cma faults is for the given endpoint to stop + * using CMA, this should be set to be quite rare and only 1 fault per + * endpoint can occur, then the endpoint stops using CMA altogether + */ +PSMI_ALWAYS_INLINE(int cma_do_fault(psm2_ep_t ep)) +{ + if_pf(PSM3_FAULTINJ_ENABLED()) { + PSM3_FAULTINJ_STATIC_DECL(fi, "cma_err", "CMA failure", + 0, SHM_FAULTINJ_CMA_ERR); + return PSM3_FAULTINJ_IS_FAULT(fi, ep, ""); + } else + return 0; +} +#endif + /* not reported yet, so just track in a global so can pass a pointer to * psm3_mq_handle_envelope and psm3_mq_handle_rts */ @@ -153,7 +171,8 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, } #endif - if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET) + // since we will do the cma_get, can decide based on local config of ptl + if ((ptl->kassist_mode & PSM3_KASSIST_GET) && req->req_data.recv_msglen > 0 && (pid = psm3_epaddr_pid(epaddr))) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -167,10 +186,18 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int 
was_posted, if (!ptl->gpu_bounce_buf) PSM3_GPU_HOST_ALLOC(&ptl->gpu_bounce_buf, AMSH_GPU_BOUNCE_BUF_SZ); while (cnt < req->req_data.recv_msglen) { + size_t res; size_t nbytes = min(req->req_data.recv_msglen-cnt, AMSH_GPU_BOUNCE_BUF_SZ); - size_t res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), +#ifdef PSM_FI + if_pf(cma_do_fault(ptl->ep)) + res = -1; + else +#endif + res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), ptl->gpu_bounce_buf, nbytes); + if (res == -1) + goto fail_cma; void *buf; psmi_assert_always(nbytes == res); if (PSMI_USE_GDR_COPY_RECV(nbytes) @@ -191,35 +218,42 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, PSM3_GPU_SYNCHRONIZE_MEMCPY(); } else { /* cma can be done in handler context or not. */ - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, + size_t nbytes; +#ifdef PSM_FI + if_pf(cma_do_fault(ptl->ep)) + nbytes = -1; + else +#endif + nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, req->req_data.buf, req->req_data.recv_msglen); + if (nbytes == -1) + goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); } #else /* cma can be done in handler context or not. */ - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, + size_t nbytes; +#ifdef PSM_FI + if_pf(cma_do_fault(ptl->ep)) + nbytes = -1; + else +#endif + nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, req->req_data.buf, req->req_data.recv_msglen); - if (nbytes == -1) { - ptl->psmi_kassist_mode = PSMI_KASSIST_OFF; - _HFI_ERROR("Reading from remote process' memory failed. 
Disabling CMA support\n"); - } - else { - psmi_assert_always(nbytes == req->req_data.recv_msglen); - cma_succeed = 1; - } + if (nbytes == -1) + goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); #endif + cma_succeed = 1; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) send_cts: -#endif args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr; args[1].u64w0 = (uint64_t) (uintptr_t) req; args[2].u64w0 = (uint64_t) (uintptr_t) req->req_data.buf; args[3].u32w0 = req->req_data.recv_msglen; args[3].u32w1 = tok != NULL ? 1 : 0; - args[4].u32w0 = ptl->psmi_kassist_mode; // pass current kassist mode to the peer process + args[4].u32w0 = ptl->kassist_mode; // pass current kassist mode to the peer process if (tok != NULL) { psm3_am_reqq_add(AMREQUEST_SHORT, tok->ptl, @@ -235,12 +269,18 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, req->mq->stats.rx_shm_bytes += req->req_data.recv_msglen; /* 0-byte completion or we used kassist */ - if (pid || cma_succeed || + if (cma_succeed || req->req_data.recv_msglen == 0 || gpu_ipc_send_completion == 1) { psm3_mq_handle_rts_complete(req); } PSM2_LOG_MSG("leaving."); return PSM2_OK; + +fail_cma: + ptl->kassist_mode = PSM3_KASSIST_OFF; + ptl->self_nodeinfo->amsh_features &= ~AMSH_HAVE_CMA; + _HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n"); + goto send_cts; } static @@ -417,22 +457,25 @@ psm3_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, if (msglen > 0) { rarg[0].u64w0 = args[1].u64w0; /* rreq */ - int kassist_mode = ((struct ptl_am *)ptl)->psmi_kassist_mode; + int kassist_mode = ((struct ptl_am *)ptl)->kassist_mode; int kassist_mode_peer = args[4].u32w0; - // In general, peer process(es) shall have the same kassist mode set, - // but due to dynamic CMA failure detection, we must align local and remote state, - // and make protocol to adopt to that potential change. 
- if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) { - ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; - goto no_kassist; - } - if (kassist_mode & PSMI_KASSIST_PUT) { + if (kassist_mode_peer & PSM3_KASSIST_GET) { + // peer did cma_get(), nothing for us to do + } else if (kassist_mode & PSM3_KASSIST_PUT) { + // we can do cma_put() int pid = psm3_epaddr_pid(tok->tok.epaddr_incoming); - size_t nbytes = psm3_cma_put(sreq->req_data.buf, pid, dest, msglen); + size_t nbytes; +#ifdef PSM_FI + if_pf(cma_do_fault(((struct ptl_am *)ptl)->ep)) + nbytes = -1; + else +#endif + nbytes = psm3_cma_put(sreq->req_data.buf, pid, dest, msglen); if (nbytes == -1) { _HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n"); - ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; + ((struct ptl_am *)ptl)->kassist_mode = PSM3_KASSIST_OFF; + ((struct ptl_am *)ptl)->self_nodeinfo->amsh_features &= ~AMSH_HAVE_CMA; goto no_kassist; } @@ -441,8 +484,8 @@ psm3_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, /* Send response that PUT is complete */ psm3_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, NULL, 0, 0); - } else if (!(kassist_mode & PSMI_KASSIST_MASK)) { - /* Only transfer if kassist is off, i.e. neither GET nor PUT. 
*/ + } else { + /* Only transfer if peer didn't do GET and we didn't do PUT */ no_kassist: psm3_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, sreq->req_data.buf, msglen, dest, 0); diff --git a/prov/psm3/psm3/ptl_am/ptl_fwd.h b/prov/psm3/psm3/ptl_am/ptl_fwd.h index 85593aad847..09588cdda03 100644 --- a/prov/psm3/psm3/ptl_am/ptl_fwd.h +++ b/prov/psm3/psm3/ptl_am/ptl_fwd.h @@ -59,7 +59,4 @@ /* Symbol in am ptl */ extern struct ptl_ctl_init psm3_ptl_amsh; -extern int psm3_shm_mq_rv_thresh; -extern int psm3_shm_mq_gpu_rv_thresh; - #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_config.h b/prov/psm3/psm3/ptl_ips/ips_config.h index 6eb9db5ceaf..1a253aa4a23 100644 --- a/prov/psm3/psm3/ptl_ips/ips_config.h +++ b/prov/psm3/psm3/ptl_ips/ips_config.h @@ -65,6 +65,10 @@ #define DF_OPP_LIBRARY "libopasadb.so.1.0.0" #define DATA_VFABRIC_OFFSET 8 +#define IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT 32 +#define IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT 128 +#define IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT 16 + /* Send retransmission */ #define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 2bdd85a309c..221706ade25 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -105,8 +105,6 @@ struct ips_protoexp { psm_transfer_type_t ctrl_xfer_type; struct ips_scbctrl tid_scbc_rv; // pool of SCBs for TID sends - // for OPA this includes: TIDEXP, CTS, - // EXPTID_COMPLETION // For UD: CTS, ERR_CHK_RDMA, // ERR_CHK_RDMA_RESP mpool_t tid_desc_send_pool; diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/prov/psm3/psm3/ptl_ips/ips_path_rec.h index 17fa819a396..6ef9e5820b2 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.h +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.h @@ -67,18 +67,6 @@ /* Default size of path group hash table */ #define DF_PATH_GRP_HASH_SIZE 255 -/* Default size of CCT table. 
Must be multiple of 64 */ -#define DF_CCT_TABLE_SIZE 128 - -/* CCT max IPD delay. */ -#define DF_CCT_MAX_IPD_DELAY_US 21 - -/* CCA divisor shift */ -#define CCA_DIVISOR_SHIFT 14 - -/* CCA ipd mask */ -#define CCA_IPD_MASK 0x3FFF - /* A lot of these are IBTA specific defines that are available in other header * files. To minimize dependencies with PSM build process they are listed * here. Most of this is used to implement IBTA compliance features with PSM diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index f6c9c215bcb..372dd75ea56 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -81,6 +81,11 @@ #define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080 #define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100 +#define CREDITS_INC_THRESH 2048 +// we are using 31 bits psn, and int16_t for psn diff on nak detection +// to play safe we set max credit to 16384 +#define IPS_MAX_CREDIT 16384 + #if defined(PSM_CUDA) || defined(PSM_ONEAPI) uint32_t gpudirect_rdma_send_limit; uint32_t gpudirect_rdma_recv_limit; @@ -106,6 +111,59 @@ void psmi_gpu_hostbuf_alloc_func(int is_alloc, void *context, void *obj) } #endif /* PSM_CUDA || PSM_ONEAPI */ +static int parse_flow_credits(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ntup = psm3_count_tuples(str); + int ret = psm3_parse_str_tuples(str, ntup, tvals); + if (ret < 0) + return ret; + // back compatibility - when only one value specified, set max=min, step=0 + // this also can make value check to be accurate + if (ntup == 1) { + tvals[1] = tvals[0]; + tvals[2] = 0; + } + if (tvals[0] < 0 || tvals[1] < 0 || tvals[2] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] > IPS_MAX_CREDIT || tvals[1] > IPS_MAX_CREDIT || tvals[2] > IPS_MAX_CREDIT) { + if (errstr_size) + snprintf(errstr, errstr_size, " Max allowed is %u", IPS_MAX_CREDIT); + return -2; + 
} + if (tvals[0] == 0 || tvals[1] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed on min, max"); + return -2; + } + if (tvals[1] > tvals[0] && tvals[2] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed on adjust when max > min"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min (%d) must be <= max (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_flow_credits(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2]}; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_flow_credits(val.e_str, errstr_size, errstr, tvals); +} + psm2_error_t psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, @@ -133,16 +191,58 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, { /* Number of credits per flow */ union psmi_envvar_val env_flow_credits; + int tvals[3] = { + min(IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT, num_of_send_desc), + min(IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT, num_of_send_desc), + IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT + }; + char fcredits_def[32]; + snprintf(fcredits_def, sizeof(fcredits_def), "%d:%d:%d", tvals[0], tvals[1], tvals[2]); + + (void)psm3_getenv_range("PSM3_FLOW_CREDITS", + "Number of unacked packets (credits) per flow in ", + "Specified as min:max:adjust where min and max is the range of credits,\n" + "and adjust is the adjustment amount for adjusting credits", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, + (union psmi_envvar_val)fcredits_def, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_flow_credits, tvals, + &env_flow_credits); + if (parse_flow_credits(env_flow_credits.e_str, 0, NULL, tvals) < 0) { + // 
already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + if (tvals[0] > num_of_send_desc) { + tvals[0] = num_of_send_desc; + } + if (tvals[1] > num_of_send_desc) { + tvals[1] = num_of_send_desc; + } + + // set init flow credits. Use PSM2_FLOW_CREDITS when possible int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc); + if (df_flow_credits > tvals[0] && df_flow_credits < tvals[1]) { + proto->flow_credits = df_flow_credits; + } else { + proto->flow_credits = (tvals[0] + tvals[1]) / 2; + } + proto->min_credits = tvals[0]; + proto->max_credits = tvals[1]; + proto->credits_adjust = tvals[2]; + } - psm3_getenv("PSM3_FLOW_CREDITS", - "Number of unacked packets (credits) per flow (default is 64)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)df_flow_credits, - &env_flow_credits); - proto->flow_credits = env_flow_credits.e_uint; + { + union psmi_envvar_val env_thresh; + psm3_getenv_range("PSM3_CREDITS_INC_THRESH", + "Threshold for increasing credits", NULL, + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)CREDITS_INC_THRESH, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT16_MAX, + NULL, NULL, &env_thresh); + proto->credits_inc_thresh = env_thresh.e_uint; } + /* * Checksum packets within PSM. Default is off. * This is heavy weight and done in software so not recommended for @@ -197,7 +297,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->multirail_thresh_load_balance = env_thresh_load_balance.e_uint; } - /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */ + /* Initialize IBTA related stuff (path record, etc.) */ if ((err = psm3_ips_ibta_init(proto))) goto fail; @@ -233,9 +333,6 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; } - /* - * Initialize SDMA, otherwise, turn on all PIO. 
- */ // initialize sdma after PSM3_MR_CACHE_MODE proto->flags |= IPS_PROTO_FLAG_SPIO; @@ -316,13 +413,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, * If we enable tid-based expected rendezvous, the expected protocol code * handles its own rv scb buffers. If not, we have to enable eager-based * rendezvous and we allocate scb buffers for it. - * For UD PSM3_RDMA (ep->rdmamode) controls our use of RDMA for Rendezvous - * For STL100 PSM3_TID controls use of EXPTID for Rendezvous + * For verbs PSM3_RDMA (ep->rdmamode) controls our use of RDMA for Rendezvous */ protoexp_flags = proto->ep->rdmamode; // PSM3_RDMA - // protoexp implements RDMA for UD and TID for STL100 native. N/A to UDP - // when proto->protoexp is NULL, we will not attempt to use TID nor RDMA + // protoexp implements RDMA for verbs. N/A to sockets + // when proto->protoexp is NULL, we will not attempt to use RDMA { (void)protoexp_flags; // for UD, even when RDMA is enabled, we may fall back to LONG_DATA @@ -594,7 +690,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, ((uint64_t)env_mr_cache_size_mb.e_uint * (1024*1024)) / max(psm3_mq_max_window_rv(proto->mq, 0)/2, - proto->mq->hfi_thresh_rv)); + proto->mq->rndv_nic_thresh)); } else { // only send DMA, size based on smaller MRs default_cache_entries = max(default_cache_entries, @@ -692,7 +788,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) _HFI_DBG("GDR Copy: %d limit send=%u recv=%u gpu_rndv=%u GPU RDMA flags=0x%x limit send=%u recv=%u\n", is_gdr_copy_enabled, gdr_copy_limit_send, gdr_copy_limit_recv, - gpu_thresh_rndv, + psm3_gpu_thresh_rndv, proto->flags & (IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV |IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND), gpudirect_rdma_send_limit, gpudirect_rdma_recv_limit); @@ -946,7 +1042,7 @@ proto_sdma_init(struct ips_proto *proto) if (! 
is_gpudirect_enabled || !psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_SDMA)) env_sdma.e_uint = 0; - else + else psm3_getenv("PSM3_GPUDIRECT_SDMA", "UD GPU send dma flags (0 disables send dma, 1 enables), default 1", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, @@ -1449,6 +1545,7 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) // immediately ack the msg struct ips_scb_unackedq *unackedq = &flow->scb_unacked; flow->xmit_ack_num.psn_num = 1 + (__be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask); + flow->xmit_ack_num.psn_num &= proto->psn_mask; psmi_assert(scb == STAILQ_FIRST(unackedq)); STAILQ_REMOVE_HEAD(unackedq, nextq); @@ -1517,6 +1614,22 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) #else proto->stats.pio_no_flow_credits++; #endif + if (flow->credits <= 0) { +// _HFI_VDBG("flow=%p next=%d first_os=%d delta=%d\n", flow, +// flow->xmit_seq_num.psn_num, flow->credits_inc_psn, +// flow->xmit_seq_num.psn_num - flow->credits_inc_psn); + if (flow->max_credits < proto->max_credits && !between(flow->credits_inc_psn, + (flow->credits_inc_psn + proto->credits_inc_thresh) & proto->psn_mask, + flow->xmit_seq_num.psn_num)) { + // adjust with a small "random" number to avoid potential oscillation + uint16_t actual_adjust = min(proto->credits_adjust + (flow->xmit_seq_num.psn_num & 0xF), + proto->max_credits - flow->max_credits); + flow->max_credits += actual_adjust; + flow->credits += actual_adjust; + flow->credits_inc_psn = flow->xmit_seq_num.psn_num; + _HFI_VDBG("Increased flow (%p) credits to %d\n", flow, flow->max_credits); + } + } psmi_timer_request(proto->timerq, flow->timer_send, get_cycles() + proto->timeout_send); } @@ -1620,9 +1733,8 @@ psm3_ips_proto_timer_ack_callback(struct psmi_timer *current_timer, scb->abs_timeout = t_cyc_next + scb->ack_timeout; if (done_local) { _HFI_VDBG - ("sending err_chk flow=%d with first=%d,last=%d\n", - flow->flowid, - STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num, + ("sending 
err_chk flow=%p with first=%d, last=%d\n", + flow, scb->seq_num.psn_num, STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->seq_num.psn_num); #ifdef PSM_BYTE_FLOW_CREDITS @@ -1639,23 +1751,29 @@ psm3_ips_proto_timer_ack_callback(struct psmi_timer *current_timer, flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; - if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { - // for UD we use RC QP instead of STL100's TIDFLOW HW - // UDP has no RDMA - psmi_assert_always(0); // we don't allocate ips_flow for TID - message_type = OPCODE_ERR_CHK; // keep KlockWorks happy - } else { - PSM2_LOG_MSG("sending ERR_CHK message"); - message_type = OPCODE_ERR_CHK; - err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) + PSM2_LOG_MSG("sending ERR_CHK message"); + message_type = OPCODE_ERR_CHK; + err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) & proto->psn_mask; - } ctrlscb.ips_lrh.bth[2] = __cpu_to_be32(err_chk_seq.psn_num); psm3_ips_proto_send_ctrl_message(flow, message_type, &flow->ipsaddr->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); + flow->credits_inc_psn = scb->seq_num.psn_num; + // decrease flow credits + if (flow->max_credits > proto->min_credits) { + uint16_t actual_adjust = min(proto->credits_adjust + (flow->xmit_seq_num.psn_num & 0xF), + flow->max_credits - proto->min_credits); + flow->max_credits -= actual_adjust; + if (flow->credits > actual_adjust) { + flow->credits -= actual_adjust; + } else { + flow->credits = 0; + } + _HFI_VDBG("Decreased flow (%p) credits to %d\n", flow, flow->max_credits); + } } t_cyc_next = get_cycles() + scb->ack_timeout; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index 9c1b920f075..47bf7a50c1d 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -59,13 +59,13 @@ #include "ips_config.h" #include "psm_user.h" -#include "ips_tid.h" #include "ips_recvhdrq.h" #include "ips_epstate.h" #include "ips_proto_am.h" #include "ips_tidflow.h" #include "ips_path_rec.h" +#if 
defined(PSM_SOCKETS) && defined(USE_UDP) // when defined, this enables use of byte based flow credits in addition // to packet based. // It can help UDP to avoid overflowing the sockets kernel buffers. @@ -73,6 +73,7 @@ // memory at scale. // UD/RC, TCP and OPA HALs self configure so this has no effect #define PSM_BYTE_FLOW_CREDITS +#endif typedef enum ips_path_type { IPS_PATH_LOW_PRIORITY, @@ -328,7 +329,6 @@ typedef enum psm_transfer_type { typedef enum psm_protocol_type { PSM_PROTOCOL_GO_BACK_N = 0, - PSM_PROTOCOL_TIDFLOW, PSM_PROTOCOL_LAST /* Keep this the last protocol type */ } psm_protocol_type_t; @@ -369,6 +369,10 @@ struct ips_proto { #ifdef PSM_BYTE_FLOW_CREDITS uint32_t flow_credit_bytes; // credit limit in bytes #endif + uint16_t min_credits; // min credits + uint16_t max_credits; // max credits + uint16_t credits_adjust; // credit adjusting amount + uint16_t credits_inc_thresh; // credit increase threshold mpool_t pend_sends_pool; struct ips_ibta_compliance_fn ibta; struct ips_proto_stats stats; @@ -510,8 +514,6 @@ struct ips_flow { uint16_t protocol:3; /* go-back-n or tidflow */ uint16_t flags:8; /* flow state flags */ - // TBD - cwin only needed for OPA for CCA - uint16_t cwin; /* Size of congestion window in packets */ // to allow for good pipelining of send/ACK need to trigger an ack at // least every ack_interval packets (roughy flow_credits/4) or every // ack_inteval_bytes bytes (roughly flow_credit_bytes/4) whichever @@ -537,12 +539,14 @@ struct ips_flow { // For UDP, sockets has byte oriented buffering so we need to // impose a credit_bytes limit to allow sufficient pkt credits // but avoid sockets buffer overflow and recv side discards/flow control - int16_t credits; /* Current credits available to send on flow */ + int16_t credits; /* Current credits available to send on flow */ + uint16_t max_credits; /* credits limit */ + uint32_t credits_inc_psn; /* the reference pkt psn used for increasing credit. 
We increase */ + /* credit if current psn - credits_inc_psn > credit_inc_thresh */ #ifdef PSM_BYTE_FLOW_CREDITS int32_t credit_bytes; /* Current credit bytes avail to send on flow */ #endif uint32_t ack_index; /* Index of the last ACK message type in pending message queue */ - psmi_seqnum_t xmit_seq_num; /* next psn for xmit */ psmi_seqnum_t xmit_ack_num; /* last xmited psn acked + 1 */ psmi_seqnum_t recv_seq_num; /* next psn expect to recv */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_connect.c b/prov/psm3/psm3/ptl_ips/ips_proto_connect.c index fd729dc7d9c..024ecf13ef9 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_connect.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_connect.c @@ -317,10 +317,11 @@ ips_ipsaddr_set_req_params(struct ips_proto *proto, ipsaddr->connidx_outgoing = req->hdr.connidx; ipsaddr->runid_key = req->runid_key; /* ipsaddr->initpsn = req->initpsn; */ - _HFI_CONNDBG("%s -> %s: connidx_incoming=%u connidx_outgoing=%u\n", + _HFI_CONNDBG("%s -> %s: connidx_incoming=%u connidx_outgoing=%u flow=%p\n", psm3_epid_fmt_internal(proto->ep->epid, 0), psm3_epid_fmt_internal(ipsaddr->epaddr.epid, 1), - ipsaddr->connidx_incoming, ipsaddr->connidx_outgoing); + ipsaddr->connidx_incoming, ipsaddr->connidx_outgoing, + &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]); err = psm3_epid_set_hostname(psm3_epid_nid(((psm2_epaddr_t) ipsaddr)->epid), @@ -611,7 +612,8 @@ MOCKABLE(psm3_ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, flow->recv_seq_num.psn_val = 0; flow->xmit_ack_num.psn_val = 0; flow->flags = 0; - flow->credits = flow->cwin = proto->flow_credits; + flow->credits = proto->flow_credits; + flow->max_credits = proto->flow_credits; flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); flow->ack_counter = 0; #ifdef PSM_BYTE_FLOW_CREDITS diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index c39231b8679..4cc1ebc701b 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ 
b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -53,17 +53,16 @@ /* Copyright (c) 2016 Intel Corporation. All rights reserved. */ -// This file implements the TID protocol for STL100 and the RDMA -// protocol for UD mode. The majority of functons in this file (perhaps all) -// are not used when TID/RDMA is disabled via PSM3_TID o PSM3_RDMA respectively -// RDMA is N/A for UDP, so it will behave as if PSM3_RDMA is disabled +// This file implements the RDMA +// protocol for verbs mode. The majority of functons in this file (perhaps all) +// are not used when RDMA is disabled via PSM3_RDMA +// RDMA is N/A for sockets, so it will behave as if PSM3_RDMA is disabled // and not use functions in this file. #include "psm_user.h" #include "psm2_hal.h" #include "ips_scb.h" -#include "ips_tid.h" #include "ips_tidflow.h" #include "ips_proto.h" #include "ips_expected_proto.h" @@ -113,7 +112,6 @@ static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, struct ips_gpu_hostbuf *chb_prev, uint32_t tsess_srcoff, uint32_t tsess_length, - uint32_t tsess_unaligned_start, psm2_chb_match_type_t type); #endif @@ -175,13 +173,11 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, if (err != PSM2_OK) goto fail; - if ((err = psm3_ips_scbctrl_init(ep, num_of_send_desc, 0, 0, 0, ips_tid_scbavail_callback, protoexp, &protoexp->tid_scbc_rv))) goto fail; - { union psmi_envvar_val env_rts_cts_interleave; @@ -256,75 +252,71 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #endif #endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (PSMI_IS_GPU_ENABLED) { + struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; + uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; + if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - { - if (PSMI_IS_GPU_ENABLED) { - struct psmi_rlimit_mpool rlim = 
GPU_HOSTBUFFER_LIMITS; - uint32_t maxsz, chunksz, max_elements; - uint32_t pool_num_obj_max_total; - uint32_t small_pool_num_obj_max_total; - - if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, - &rlim, &maxsz, &chunksz))) - goto fail; - - /* the maxsz is the amount in MB, not the number of entries, - * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / - psm3_mq_max_window_rv(proto->mq, 1); - /* mpool requires max_elements to be power of 2. round down. */ - max_elements = 1 << (31 - __builtin_clz(max_elements)); - /* need at least 2 buffers */ - max_elements = max(2, max_elements); - protoexp->gpu_hostbuf_recv_cfg.bufsz = - psm3_mq_max_window_rv(proto->mq, 1); - - protoexp->gpu_hostbuf_pool_recv = - psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), - chunksz, max_elements, 0, - UNDEFINED, NULL, NULL, - psmi_gpu_hostbuf_alloc_func, - (void *) - &protoexp->gpu_hostbuf_recv_cfg); - - if (protoexp->gpu_hostbuf_pool_recv == NULL) { - err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, - "Couldn't allocate GPU host receive buffer pool"); - goto fail; - } - psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, - NULL, &pool_num_obj_max_total); - - protoexp->gpu_hostbuf_small_recv_cfg.bufsz = - GPU_SMALLHOSTBUF_SZ; - protoexp->gpu_hostbuf_pool_small_recv = - psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), - chunksz, max_elements, 0, - UNDEFINED, NULL, NULL, - psmi_gpu_hostbuf_alloc_func, - (void *) - &protoexp->gpu_hostbuf_small_recv_cfg); - - if (protoexp->gpu_hostbuf_pool_small_recv == NULL) { - err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, - "Couldn't allocate GPU host small receive buffer pool"); - goto fail; - } - psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, - NULL, &small_pool_num_obj_max_total); - _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", - small_pool_num_obj_max_total, - protoexp->gpu_hostbuf_small_recv_cfg.bufsz, - pool_num_obj_max_total, - 
protoexp->gpu_hostbuf_recv_cfg.bufsz); - PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); - STAILQ_INIT(&protoexp->gpupend_getreqsq); - } else { - protoexp->gpu_hostbuf_pool_recv = NULL; - protoexp->gpu_hostbuf_pool_small_recv = NULL; + /* the maxsz is the amount in MB, not the number of entries, + * since the element size depends on the window size */ + max_elements = (maxsz*1024*1024) / + psm3_mq_max_window_rv(proto->mq, 1); + /* mpool requires max_elements to be power of 2. round down. */ + max_elements = 1 << (31 - __builtin_clz(max_elements)); + /* need at least 2 buffers */ + max_elements = max(2, max_elements); + protoexp->gpu_hostbuf_recv_cfg.bufsz = + psm3_mq_max_window_rv(proto->mq, 1); + + protoexp->gpu_hostbuf_pool_recv = + psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_gpu_hostbuf_alloc_func, + (void *) + &protoexp->gpu_hostbuf_recv_cfg); + + if (protoexp->gpu_hostbuf_pool_recv == NULL) { + err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate GPU host receive buffer pool"); + goto fail; + } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, + NULL, &pool_num_obj_max_total); + + protoexp->gpu_hostbuf_small_recv_cfg.bufsz = + GPU_SMALLHOSTBUF_SZ; + protoexp->gpu_hostbuf_pool_small_recv = + psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_gpu_hostbuf_alloc_func, + (void *) + &protoexp->gpu_hostbuf_small_recv_cfg); + + if (protoexp->gpu_hostbuf_pool_small_recv == NULL) { + err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate GPU host small receive buffer pool"); + goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, + NULL, &small_pool_num_obj_max_total); + _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", + small_pool_num_obj_max_total, + protoexp->gpu_hostbuf_small_recv_cfg.bufsz, + pool_num_obj_max_total, + 
protoexp->gpu_hostbuf_recv_cfg.bufsz); + PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); + STAILQ_INIT(&protoexp->gpupend_getreqsq); + } else { + protoexp->gpu_hostbuf_pool_recv = NULL; + protoexp->gpu_hostbuf_pool_small_recv = NULL; } #endif psmi_assert(err == PSM2_OK); @@ -368,14 +360,11 @@ psm2_error_t psm3_ips_protoexp_fini(struct ips_protoexp *protoexp) if ((err = psm3_ips_scbctrl_fini(&protoexp->tid_scbc_rv))) goto fail; - /* finalize tid flow control. */ if ((err = psm3_ips_tf_fini(&protoexp->tfc))) goto fail; - psmi_free(protoexp); - fail: return err; } @@ -414,19 +403,16 @@ void ips_tid_mravail_callback(struct ips_proto *proto) #endif // PSM_HAVE_RDMA -// On STL100 ips_tf is a user space control for the HW tidflow which +// On STL100 ips_tf was a user space control for the HW tidflow which // would fully process most valid inbound EXPTID packets within an RV Window. -// For UD we maintain the user space control to help manage each active +// For verbs we maintain the user space control to help manage each active // RV window. // There is one CTS per RV window (typically 128K). -// For UD with RV, RDMA is used instread of EXPTID, with 1 RDMA per RV window. +// For verbs with RV, RDMA is used instread of EXPTID, with 1 RDMA per RV window // Typically there are 32 (HFI_TF_NFLOWS) configured. // The 32 is hard coded, could make it tunable. // The tidflow provides a natural pacing mechanism and limits the total amount -// of inflight EXPTID or RDMA incoming to given receiver. -// In addition on STL100 there is an upper bound on TIDs which limited total -// inbound DMA for a receiver to avoid 4MB. For smaller messages tidflow -// count may be the limit, for larger messages TIDs would be the limit. +// of inflight RDMA incoming to given receiver. /* New Tid Flows are available. If there are pending get requests put the * get timer on the timerq so it can be processed. 
*/ @@ -544,12 +530,10 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, tidflows = ips_tf_available(&protoexp->tfc); _HFI_MMDBG("available tidflow %u\n", tidflows); - if ( - tidflows > 0) + if (tidflows > 0) // get the actual TIDs and tidflows and send the CTS ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); - else if ( - tidflows != -1) + else if (tidflows != -1) // out of TIDs, set a timer to try again later psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); @@ -732,7 +716,6 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) // so it cannot issue any sends directly, otherwise we will have a recursive // situation and potentially deeper recursion if more send CQEs found // key notes in this regard: -// OPA100 code which may send acks here is ifdef'ed out since N/A to RC QP RDMA // psm3_mq_handle_rts_complete - sets flags in req and queues it, no callbacks // psm3_mpool_put(tidsendc) - tid_desc_send_pool has no callback configured // ips_tid_mravail_callback - psmi_timer_request call queues timer for future @@ -1171,13 +1154,10 @@ int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev) #endif // defined(PSM_VERBS) #ifdef PSM_HAVE_RDMA -// Intermediate STL100 EXTID packets can be delivered to software when -// acks are requested. -// The final packet in a STL100 EXTID flow is also delivered to software -// to indicate the completion of the flow and can contain unaligned data. -// for RDMA Write we will simply use immediate data in the write -// to indicate the completed receive of the RDMA Write -// if we use RDMA Read, the local SQ Completion will indicate this +// Upon completion of an RDMA Write, a completion is delivered with +// immediate data. The immediate data is used +// to indicate the completed receive of the RDMA Write. +// If we use RDMA Read, the local SQ Completion will indicate this. 
#if defined(PSM_VERBS) // could build and pass a ips_recvhdrq_event or pass struct ips_recvhdrq // but all we really need is proto and len @@ -1270,7 +1250,7 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, /* Do some sanity checking */ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); - // STL100 does this at the end of ips_protoexp_send_tid_completion + // STL100 did this at the end of ips_protoexp_send_tid_completion // TBD - seems like this should be done after ips_tid_recv_free // so we have more likelihood of getting freshly freed resources? if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { @@ -1403,7 +1383,6 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, struct ips_gpu_hostbuf *chb_prev, uint32_t tsess_srcoff, uint32_t tsess_length, - uint32_t tsess_unaligned_start, psm2_chb_match_type_t type) { struct ips_proto *proto = protoexp->proto; @@ -1447,8 +1426,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, tidsendc->userbuf = (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = - (void *)((uintptr_t)tidsendc->userbuf + - tsess_unaligned_start); + (void *)((uintptr_t)tidsendc->userbuf); return; } } else { @@ -1467,8 +1445,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, tidsendc->userbuf = (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = - (void *)((uintptr_t)tidsendc->userbuf + - tsess_unaligned_start); + (void *)((uintptr_t)tidsendc->userbuf); return; } if ((tsess_srcoff > chb->offset) @@ -1489,8 +1466,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, (void *)((uintptr_t) chb->host_buf + tsess_srcoff - chb->offset); tidsendc->buffer = - (void *)((uintptr_t)tidsendc->userbuf + - tsess_unaligned_start ); + (void *)((uintptr_t)tidsendc->userbuf); return; } } @@ -1571,7 +1547,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, #if defined(PSM_SOCKETS) && 
PSMI_HAL_INST_CNT == 1 psmi_assert_always(0); // should not get here #elif defined(PSM_VERBS) - // for UD we do not need a ips_flow since we will use the RC QP and + // for verbs we do not need a ips_flow since we will use the RC QP and // then will use our main flow for the final RV completion control msg // The path record for use by RDMA will be selected when the connection // is established @@ -1646,7 +1622,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, chb, tid_list->tsess_srcoff, tid_list->tsess_length, - 0, rc); } else { // no match, need to prefetch @@ -1655,7 +1630,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, NULL, tid_list->tsess_srcoff, tid_list->tsess_length, - 0, PSMI_GPU_CONTINUE); } protoexp->proto->strat_stats.rndv_rdma_hbuf_send++; @@ -1678,7 +1652,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->rv_conn_count = 0; #endif - _HFI_EXP ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d" "\n", @@ -1686,7 +1659,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tid_list->tsess_srcoff, tid_list->tsess_length ); - // start sending TIDEXP packets + // start sending RDMA packets ips_tid_send_exp(tidsendc); /* Add as a pending op and ring up the timer */ @@ -1726,12 +1699,9 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) struct ips_proto *proto = protoexp->proto; psm2_error_t err = PSM2_OK; - // for STL100 native we would loop on ips_scb_prepare_tid_sendctrl and - // ips_proto_flow_enqueue to prepare EXPTID scbs for the TIDFLOW protocol - // and queue and issue them. Once they were all posted the is_complete - // flag would be set. For larger messages, it might take multiple - // attempts to get resources to queue everything in which case callbacks - // and timers ensure progress + // for STL100 the EXPTID scbs were sent by software and had to loop + // to get and queue scbs for the STL100 TIDFLOW protocol. 
+ // Once they were all posted the is_complete flag would be set. // For verbs we are delegating the RC Write "flow" to the NIC's RC QP // it will manage segmentation, sequence numbers and acks for the flow // so our job is done here after one call. @@ -1865,8 +1835,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) * */ -// we got a CTS and processed it. Now we can start sending EXPTID packets. -// For UD we will use RDMA instead of EXPTID +// we got a CTS and processed it. Now we can start sending RDMA packets. static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) { @@ -1978,7 +1947,7 @@ ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) while (!STAILQ_EMPTY(phead)) { tidsendc = STAILQ_FIRST(phead); - // we have some scb's and can use them to queue some more EXPTID packets + // we have some scb's and can use them to queue some more packets #if defined(PSM_VERBS) #ifdef RNDV_MOD if (tidsendc->rv_need_err_chk_rdma) @@ -2024,15 +1993,6 @@ ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) } #endif // PSM_HAVE_RDMA -/* Right now, in the kernel we are allowing for virtually non-contiguous pages, - in a single call, and we are therefore locking one page at a time, but since - the intended use of this routine is for a single group of - virtually contiguous pages, that should change to improve - performance. That means possibly changing the calling MPI code. - Doing so gets rid of some of the loop stuff here, and in the driver, - and allows for a single call to the core VM code in the kernel, - rather than one per page, definitely improving performance. 
*/ - static psm2_error_t @@ -2261,9 +2221,8 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if ( - 1 /* due to unaligned recv using hostbuf, must always do this */ - ) { + /* due to unaligned recv using hostbuf, must always do this */ + { /* Before processing pending TID requests, first try to free up * any GPU host buffers that are now idle. */ struct ips_tid_get_gpupend *cphead = @@ -2392,8 +2351,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) psmi_assert(nbytes_this >= 4); - // for STL native the tids and tidflows available pace incoming TIDs - // for UD we still use tidflows available to pace incoming RDMA + // for verbs we use tidflows available to pace incoming RDMA if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) { /* We're out of tidflow. If this process used all the resource, * the free callback will reschedule the operation, otherwise, @@ -2576,6 +2534,3 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) return err; } #endif // PSM_HAVE_RDMA - - - diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_header.h b/prov/psm3/psm3/ptl_ips/ips_proto_header.h index aa0e84c17a7..8f2ee039cb6 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_header.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_header.h @@ -148,7 +148,7 @@ struct ips_message_header { }; } PACK_SUFFIX; -/* desc_genc is up to 32 bits, but EXPTID header (and RDMA immediate data) +/* desc_genc is up to 32 bits, RDMA immediate data * only has room for 16 bits */ #define IPS_HDR_RDESCID_GENC_MASK 0xffff @@ -157,7 +157,7 @@ struct ips_message_header { * OpCodes in BTH[0], 24-31 bits. Order is important!!! 
*/ #define OPCODE_RESERVED 0xC0 /* reserved */ -/* TINY to EXPTID_COMPLETION/ERR_CHK_RDMA_RESP are level 2 packets */ +/* TINY to ERR_CHK_RDMA_RESP are level 2 packets */ /* sending queue keeps a copy and resends if timeout waiting for ack */ /* order and reliability maintained */ #define OPCODE_TINY 0xC1 /* 0 <= msglen <= 8 */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_help.h b/prov/psm3/psm3/ptl_ips/ips_proto_help.h index b584f8d7c5b..cdf02155a3e 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_help.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_help.h @@ -261,16 +261,10 @@ ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr, scb->abs_timeout = TIMEOUT_INFINITE; scb->scb_flags |= IPS_SEND_FLAG_PENDING; - if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { - flow->xmit_seq_num.psn_seq += scb->nfrag; - scb->seq_num = flow->xmit_seq_num; - scb->seq_num.psn_seq--; - } else { - flow->xmit_seq_num.psn_num = + flow->xmit_seq_num.psn_num = (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask; - scb->seq_num.psn_num = + scb->seq_num.psn_num = (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask; - } return; } diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index cdcc480e89a..a4f71ab8e5e 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -538,7 +538,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, } #endif - PSM2_LOG_EPM_COND((len > proto->mq->hfi_thresh_rv) && + PSM2_LOG_EPM_COND((len > proto->mq->rndv_nic_thresh) && proto->protoexp, OPCODE_LONG_RTS,PSM2_LOG_TX,proto->ep->epid, req->rts_peer->epid, "scb->ips_lrh.hdr_data.u32w0: %d",scb->ips_lrh.hdr_data.u32w0); @@ -556,8 +556,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, goto fail; #ifdef PSM_HAVE_REG_MR // TBD - we may want to include odd bytes at start - // and end of message in the RTS itself as opposed to being in last - // EXPTID payload packet's header + // and end of 
message in the RTS itself as opposed to using unaligned RDMA // then the RDMA Write can be better aligned and may perform better // Start registering memory for anticipated CTS requesting RDMA // TBD - we could reduce duation of memory pin by doing this only @@ -573,7 +572,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, // registration for zero length sync messages // PSM3_RDMA if disabled causes proto->protoexp == NULL if (! ips_scb_buffer(scb) && len - && len > proto->mq->hfi_thresh_rv + && len > proto->mq->rndv_nic_thresh && proto->protoexp /* expected tid recieve enabled */ && ips_epaddr_rdma_connected(ipsaddr) && !req->mr @@ -618,7 +617,7 @@ int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, { if ( !(flags_user & PSM2_MQ_FLAG_INJECT) && - len > gpu_thresh_rndv){ + len > psm3_gpu_thresh_rndv){ return 1; } @@ -798,7 +797,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user #endif // PSM_HAVE_REG_MR { ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; - // TBD for OPA flow_type could be DMA proto->strat_stats.short_cuCopy_isend++; proto->strat_stats.short_cuCopy_isend_bytes += len; } @@ -823,7 +821,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } else #endif { - // TBD for OPA flow_type could be DMA proto->strat_stats.short_copy_cpu_isend++; proto->strat_stats.short_copy_cpu_isend_bytes += len; } @@ -894,7 +891,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user psm3_epaddr_get_name(mq->ep->epid, 0), psm3_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid, 1), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], req); - } else if (len <= mq->hfi_thresh_rv) { + } else if (len <= mq->rndv_nic_thresh) { req->send_msgoff = 0; req->rts_peer = (psm2_epaddr_t) ipsaddr; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -931,7 +928,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } else #endif { - // 
TBD for OPA flow_type could be DMA proto->strat_stats.eager_copy_cpu_isend++; proto->strat_stats.eager_copy_cpu_isend_bytes += len; } @@ -1130,7 +1126,6 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, #endif // PSM_HAVE_REG_MR { ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; - // TBD for OPA flow_type could be DMA proto->strat_stats.short_cuCopy_send++; proto->strat_stats.short_cuCopy_send_bytes += len; } @@ -1157,7 +1152,6 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else #endif { - // TBD for OPA flow_type could be DMA proto->strat_stats.short_copy_cpu_send++; proto->strat_stats.short_copy_cpu_send_bytes += len; } @@ -1240,7 +1234,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psm3_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid, 1), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); - } else if (len <= mq->hfi_thresh_rv) { + } else if (len <= mq->rndv_nic_thresh) { // for FI_INJECT eager comes from user buffer, needs end to end ack psm2_mq_req_t req; @@ -1289,7 +1283,6 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else #endif { - // TBD for OPA flow_type could be DMA proto->strat_stats.eager_copy_cpu_send++; proto->strat_stats.eager_copy_cpu_send_bytes += len; } @@ -1390,7 +1383,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) /* Cases where we do not use TIDs: * 0) Received full message as payload to RTS, CTS is just an ack * 1) Recv on a host buffer, Send on a gpu buffer and len is <= 3 bytes - * 2) Recv on a host buffer, Send on a host buffer and len <= hfi_thresh_rv + * 2) Recv on a host buffer, Send on a host buffer and len <= rndv_nic_thresh * 3) Recv on gpu buf and len is <= 3 bytes * 4) Expected protocol not initialized. 
*/ @@ -1398,7 +1391,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) || (!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV)|| (!req->is_sendbuf_gpu_mem && - req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv))) || + req->req_data.recv_msglen <= proto->mq->rndv_nic_thresh))) || (req->is_buf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV) || proto->protoexp == NULL /* no expected tid recieve */ #ifdef PSM_HAVE_REG_MR @@ -1411,7 +1404,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #ifdef PSM_HAVE_REG_MR || ! ips_epaddr_rdma_connected((ips_epaddr_t *) epaddr) #endif - || req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv /* less rv theshold */ + || req->req_data.recv_msglen <= proto->mq->rndv_nic_thresh /* less rv theshold */ ) { /* no expected tid recieve */ #endif // PSM_CUDA || PSM_ONEAPI #ifdef PSM_HAVE_REG_MR @@ -1434,7 +1427,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) req->is_buf_gpu_mem, req->is_sendbuf_gpu_mem, #endif - proto->mq->hfi_thresh_rv, + proto->mq->rndv_nic_thresh, #ifdef PSM_HAVE_REG_MR proto->protoexp?ips_epaddr_rdma_connected((ips_epaddr_t *) epaddr):0, #endif @@ -1489,9 +1482,6 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) // buffers which match smaller messages can get MR cache hit for // various sized messages which may arrive in the buffer #ifdef PSM_HAVE_REG_MR - // TBD is this assert valid for OPA also? 
Should be since - // with pick LONG DATA above if recv_msgoff >= recv_msglen - // and send_msglen should == recv_msglen psmi_assert(req->req_data.send_msglen); // 0 len uses LONG_DATA above #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // for GPU receive buffer we need to sort things out at a lower level @@ -1591,73 +1581,66 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) psmi_assert(nbytes_left > 0); PSM2_LOG_MSG("entering."); - { - /* use PIO transfer */ - flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; - frag_size = flow->frag_size; - chunk_size = min(proto->ep->chunk_max_segs*frag_size, + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + frag_size = flow->frag_size; + chunk_size = min(proto->ep->chunk_max_segs*frag_size, proto->ep->chunk_max_size); #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (req->is_buf_gpu_mem) { + if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR - // rare, but when RV connection not available, we - // can select LONG DATA for a GPU send buffer. Normally - // won't happen for GPU send >3 unless RDMA disabled - // or RV not connected - // TBD - no upper bound for send DMA here - // non-priority MR and will fallback if can't register - if (!req->mr && req->req_data.send_msglen > proto->iovec_gpu_thresh_eager) { - req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, - req->req_data.buf, req->req_data.send_msglen, - IBV_ACCESS_IS_GPU_ADDR); - } - if (req->mr) { - proto->strat_stats.rndv_long_gdr_send += dostats; - proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen; - } else -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - // for GPU send buffer <= 3, receiver can select - // LONG DATA and we can use GDRCopy - // must repin per attempt - if (req->req_data.send_msglen <= gdr_copy_limit_send && + // rare, but when RV connection not available, we + // can select LONG DATA for a GPU send buffer. 
Normally + // won't happen for GPU send >3 unless RDMA disabled + // or RV not connected + // TBD - no upper bound for send DMA here + // non-priority MR and will fallback if can't register + if (!req->mr && req->req_data.send_msglen > proto->iovec_gpu_thresh_eager) { + req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, + req->req_data.buf, req->req_data.send_msglen, + IBV_ACCESS_IS_GPU_ADDR); + } + if (req->mr) { + proto->strat_stats.rndv_long_gdr_send += dostats; + proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen; + } else +#endif /* PSM_HAVE_REG_MR */ + // for GPU send buffer <= 3, receiver can select + // LONG DATA and we can use GDRCopy + // must repin per attempt + if (req->req_data.send_msglen <= gdr_copy_limit_send && 0 != (buf = (uintptr_t)psmi_hal_gdr_convert_gpu_to_host_addr( - (unsigned long)req->req_data.buf, - req->req_data.send_msglen, 0, proto->ep))) { - converted = 1; - proto->strat_stats.rndv_long_gdrcopy_send += dostats; - proto->strat_stats.rndv_long_gdrcopy_send_bytes += dostats*req->req_data.send_msglen; - } else { - buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; -#else - { -#endif - proto->strat_stats.rndv_long_cuCopy_send += dostats; - proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen; - } + (unsigned long)req->req_data.buf, + req->req_data.send_msglen, 0, proto->ep))) { + converted = 1; + proto->strat_stats.rndv_long_gdrcopy_send += dostats; + proto->strat_stats.rndv_long_gdrcopy_send_bytes += dostats*req->req_data.send_msglen; } else { -#endif + buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; + proto->strat_stats.rndv_long_cuCopy_send += dostats; + proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen; + } + } else { +#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ #ifdef PSM_HAVE_REG_MR - // TBD - no upper bound for send DMA here - // non-priority MR and will fallback if can't register - if (!req->mr && 
req->req_data.send_msglen > proto->iovec_thresh_eager) { - req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, - req->req_data.buf, - req->req_data.send_msglen, 0); - } - if (req->mr) { - proto->strat_stats.rndv_long_dma_cpu_send += dostats; - proto->strat_stats.rndv_long_dma_cpu_send_bytes += dostats*req->req_data.send_msglen; - } else -#endif - { - proto->strat_stats.rndv_long_copy_cpu_send += dostats; - proto->strat_stats.rndv_long_copy_cpu_send_bytes += dostats*req->req_data.send_msglen; - } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // TBD - no upper bound for send DMA here + // non-priority MR and will fallback if can't register + if (!req->mr && req->req_data.send_msglen > proto->iovec_thresh_eager) { + req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, + req->req_data.buf, + req->req_data.send_msglen, 0); } -#endif + if (req->mr) { + proto->strat_stats.rndv_long_dma_cpu_send += dostats; + proto->strat_stats.rndv_long_dma_cpu_send_bytes += dostats*(uint64_t)req->req_data.send_msglen; + } else +#endif /* PSM_HAVE_REG_MR */ + { + proto->strat_stats.rndv_long_copy_cpu_send += dostats; + proto->strat_stats.rndv_long_copy_cpu_send_bytes += (uint64_t)dostats*req->req_data.send_msglen; + } +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) } +#endif do { /* @@ -1667,8 +1650,8 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) */ /* - * When tid code path is enabled, we don’t allocate scbc_rv - * objects. If the message is less than the hfi_thresh_rv, + * When tid code path is enabled, we don't allocate scbc_rv + * objects. If the message is less than the rndv_nic_thresh, * we normally use eager protocol to do the transfer. * However, if it is sync send, we use the rendezvous * rts/cts/rts-data protocol. 
@@ -1691,9 +1674,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) unaligned_bytes = nbytes_left & 0x3; if (unaligned_bytes) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (!req->is_buf_gpu_mem - || converted - ) + if (!req->is_buf_gpu_mem || converted) mq_copy_tiny_host_mem((uint32_t *)&scb->ips_lrh.mdata, (uint32_t *)buf, unaligned_bytes); else @@ -1721,8 +1702,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // SDMA identifies GPU buffers itself. But PIO path needs flags - if (req->is_buf_gpu_mem - ) { + if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR if (!req->mr && !converted) #else @@ -1820,9 +1800,9 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) proto->epaddr_stats.cts_rdma_recv++; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - psmi_assert(p_hdr->data[1].u32w1 > min(gpu_thresh_rndv, mq->hfi_thresh_rv)); // msglen + psmi_assert(p_hdr->data[1].u32w1 > min(psm3_gpu_thresh_rndv, mq->rndv_nic_thresh)); // msglen #else - psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv); // msglen + psmi_assert(p_hdr->data[1].u32w1 > mq->rndv_nic_thresh); // msglen #endif psmi_assert(proto->protoexp != NULL); @@ -1857,7 +1837,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) proto->psmi_logevent_tid_send_reqs.next_warning = 0; } else { flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; - flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */ + flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & proto->psn_mask; /* Decrement seq number to NAK proper CTS */ ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow); static unsigned int msg_cnt = 0; if (msg_cnt++ == 0) { /* Report the message only once */ @@ -2012,7 +1992,7 @@ psm3_ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) req->rts_peer = (psm2_epaddr_t) ipsaddr; req->rts_reqidx_peer = p_hdr->data[1].u32w0; - 
if (req->req_data.send_msglen > mq->hfi_thresh_rv) + if (req->req_data.send_msglen > mq->rndv_nic_thresh) { PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,req->rts_peer->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_params.h b/prov/psm3/psm3/ptl_ips/ips_proto_params.h index 31148806fed..f288d6c54a1 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_params.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_params.h @@ -110,8 +110,7 @@ #define PSM_CRC_SIZE_IN_BYTES 8 /* - * version of protocol header (known to chip also). - * This value for OPA is defined in spec. + * version of protocol header */ #define IPS_PROTO_VERSION 0x1 @@ -199,7 +198,7 @@ /* Path selection policies: * * (a) Adaptive - Dynamically determine the least loaded paths using various - * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs. + * feedback mechanism - Completion time via ACKs, NAKs, etc. * * (b) Static schemes - * (i) static_src - Use path keyed off source context diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c index 716599c8e05..2fbc0a0773b 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c @@ -224,8 +224,8 @@ pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow, /* NAK post process for any flow where an scb may describe more than 1 packet - * (OPA dma flow or GSO PIO flow). In which case we may need to resume in - * middle of scb. + * (verbs send dma flow or GSO PIO flow). In which case we may need to + * resume in middle of scb. 
*/ void psm3_ips_segmentation_nak_post_process(struct ips_proto *proto, struct ips_flow *flow) @@ -406,7 +406,7 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) SLIST_FIRST(scb_pend) = NULL; psmi_assert(flow->scb_num_pending == 0); /* Reset congestion window - all packets ACK'd */ - flow->credits = flow->cwin = proto->flow_credits; + flow->credits = flow->max_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS flow->credit_bytes = proto->flow_credit_bytes; @@ -445,29 +445,6 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ - { - /* Increase congestion window if flow is not congested */ - if_pf(flow->cwin < proto->flow_credits) { - // this only happens for OPA, so we don't have to - // increase ack_interval_bytes and flow_credit_bytes - // since we never decrease them for congestion - flow->credits += - min(flow->cwin << 1, - proto->flow_credits) - flow->cwin; - flow->cwin = min(flow->cwin << 1, proto->flow_credits); - flow->ack_interval = max((flow->credits >> 2) - 1, 1); -#ifdef PSM_BYTE_FLOW_CREDITS - //flow->credit_bytes += TBD - //flow->ack_interval_bytes = max((flow->credit_bytes >> 2) - 1, 1); - _HFI_VDBG("after grow cwin: flow_credits %d bytes %d\n", - flow->credits, flow->credit_bytes); -#else - _HFI_VDBG("after grow cwin: flow_credits %d\n", - flow->credits); -#endif - } - } - /* Reclaimed some credits - attempt to flush flow */ if (!SLIST_EMPTY(scb_pend)) flow->flush(flow, NULL); @@ -495,7 +472,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) struct ips_scb_unackedq *unackedq; struct ips_scb_pendlist *scb_pend; psmi_seqnum_t ack_seq_num, last_seq_num; - psm_protocol_type_t protocol; + //psm_protocol_type_t protocol; ips_epaddr_flow_t flowid; ips_scb_t *scb; @@ -506,7 +483,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) // we need to resend unacked packets starting with 
ack_seq_num. So check // psn of 1st NAK would like us to retransmit (e.g. don't -1 before check) if ((flowid = ips_proto_flowid(p_hdr)) < EP_NUM_FLOW_ENTRIES) { - protocol = PSM_PROTOCOL_GO_BACK_N; + //protocol = PSM_PROTOCOL_GO_BACK_N; psmi_assert(flowid < EP_NUM_FLOW_ENTRIES); flow = &ipsaddr->flows[flowid]; if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) @@ -589,7 +566,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) SLIST_FIRST(scb_pend) = NULL; psmi_assert(flow->scb_num_pending == 0); /* Reset congestion window if all packets acknowledged */ - flow->credits = flow->cwin = proto->flow_credits; + flow->credits = flow->max_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS flow->credit_bytes = proto->flow_credit_bytes; @@ -628,10 +605,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ - if (protocol == PSM_PROTOCOL_TIDFLOW) - // we don't put TID (aka RDMA) pkts on UD, shouldn't get NAKs about it - _HFI_ERROR("post processing, Got nak for TID flow, not allowed for UD\n"); - else if (scb->nfrag > 1) + if (scb->nfrag > 1) psm3_ips_segmentation_nak_post_process(proto, flow); /* Always cancel ACK timer as we are going to restart the flow */ @@ -665,19 +639,16 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) { int num_resent = 0; - /* Reclaim all credits upto congestion window only */ - flow->credits = flow->cwin; + /* Reclaim all credits */ + flow->credits = flow->max_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS - // TBD cwin not implemented for UD and UDP so can predict - // credit_bytes here - psmi_assert(flow->cwin == proto->flow_credits); flow->credit_bytes = proto->flow_credit_bytes; flow->ack_interval_bytes = max((flow->credit_bytes >> 2) - 1, 1); - _HFI_VDBG("after reclaim cwin: flow_credits %d\n", - flow->credits); + _HFI_VDBG("after reclaim 
credits: flow->credits %d credit_bytes %u\n", + flow->credits, flow->credit_bytes); #else /* PSM_BYTE_FLOW_CREDITS */ - _HFI_VDBG("after reclaim cwin: flow_credits %d\n", + _HFI_VDBG("after reclaim credits: flow_credits %d\n", flow->credits); #endif /* PSM_BYTE_FLOW_CREDITS */ diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.h b/prov/psm3/psm3/ptl_ips/ips_scb.h index f51c9a27b67..97670116fdf 100644 --- a/prov/psm3/psm3/ptl_ips/ips_scb.h +++ b/prov/psm3/psm3/ptl_ips/ips_scb.h @@ -150,7 +150,7 @@ struct ips_scb { uint32_t scb_flags; /* When nfrag==1, frag_size and *remaining are undefined. * An scb can describe a large user buffer (nfrag>1) for segmentation - * (UDP GSO and OPA send DMA). + * (UDP GSO and verbs send DMA). * When such a buffer needs retransmission, the payload and payload_size * will be advanced to reflect what needs to be retransmitted. * *_remaining also are reduced to reflect what remains. diff --git a/prov/psm3/psm3/ptl_ips/ips_tid.c b/prov/psm3/psm3/ptl_ips/ips_tid.c deleted file mode 100644 index e7349dde133..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tid.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - diff --git a/prov/psm3/psm3/ptl_ips/ips_tid.h b/prov/psm3/psm3/ptl_ips/ips_tid.h deleted file mode 100644 index 6d31defc872..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tid.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. 
- - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -/* included header files */ - -#ifndef _IPS_TID_H -#define _IPS_TID_H - -#endif /* _IPS_TID_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_tidcache.c b/prov/psm3/psm3/ptl_ips/ips_tidcache.c deleted file mode 100644 index f7588b83fe0..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tidcache.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - diff --git a/prov/psm3/psm3/ptl_ips/ips_tidcache.h b/prov/psm3/psm3/ptl_ips/ips_tidcache.h deleted file mode 100644 index 6d31284427e..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tidcache.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef _IPS_TIDCACHE_H -#define _IPS_TIDCACHE_H - -#include -#include -#include -#include - -/* - * Design notes. - * - * PSM needs to call into driver to program receiving buffer pages to - * HFI gen1 hardware, each tid can be programmed with physically contiguous - * power-of-two pages from 1 pages to 512 pages. This procedure takes - * time. 
- * - * Lots of applications tend to re-use the same receiving buffer, caching - * such programmed tids in user space process will save time and improve - * application performance. - * - * This PSM tid registration caching design requires cooperation between - * PSM and driver. Here is what happen between PSM and driver. - * - * 1. PSM call into driver with a chunk of buffer with virtual address - * and length. - * 2. driver pins the buffer pages, program hardware with the physical - * pages, get a list of tids. - * 3. driver caches the tids with the corresponding virtual address in - * user space for each tid, and return the list of tids back to PSM. - * 4. PSM also caches the list of tids with the corresponding virtual - * address for each tid, and use the list of tids for transmission. - * 5. when process frees a buffer, kernel VM will catch the event and - * calls the callback in driver to notify that the virtual address - * range is gone in the process. - * 6. driver will search its cache system and find the tids with the - * removed virtual address, put these tid in an invalidation queue - * and notify PSM the event. - * 7. PSM will pick the event and remove the tids from its own cache - * as well. - * 8. PSM must check such invalidation event every time before searching - * its caching system to match tids for a 'new' buffer chunk. - * 9, when the caching system is full, and a new buffer chunk is asked - * to register, PSM picks a victim to remove. 
- */ - -typedef struct -{ - unsigned long start; /* start virtual address */ - uint32_t tidinfo; /* tid encoding */ - uint16_t length; /* length in pages */ - uint16_t invalidate; /* invalidate flag */ - uint16_t refcount; /* usage reference count */ - uint16_t i_prev; /* idle queue previous */ - uint16_t i_next; /* idle queue next */ -} rbtree_tidcache_mapitem_pl_t; - -typedef struct { - uint32_t ntid; /* tids are cached */ - uint32_t nidle; /* tids are idle */ -} rbtree_tidcache_map_pl_t; - -#define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t - -#include "psm3_rbtree.h" - -/* - * Macro definition for easy programming. - */ - -#define NTID p_map->payload.ntid -#define REFCNT(x) p_map->root[x].payload.refcount -#define INVALIDATE(x) p_map->root[x].payload.invalidate - -#define LENGTH(x) p_map->root[x].payload.length -#define START(x) p_map->root[x].payload.start -#define END(x) (START(x) + (LENGTH(x)<<12)) - -/* - * Macro for idle tid queue management. - */ -#define NIDLE p_map->payload.nidle -#define IHEAD 0 -#define INEXT(x) p_map->root[x].payload.i_next -#define IPREV(x) p_map->root[x].payload.i_prev - -#define IDLE_REMOVE(x) do { \ - INEXT(IPREV(x)) = INEXT(x); \ - IPREV(INEXT(x)) = IPREV(x); \ - NIDLE--; \ - } while (0) - -#define IDLE_INSERT(x) do { \ - INEXT(x) = INEXT(IHEAD); \ - IPREV(x) = IHEAD; \ - IPREV(INEXT(IHEAD)) = x; \ - INEXT(IHEAD) = x; \ - NIDLE++; \ - } while (0) - -extern void ips_tidcache_map_init(cl_qmap_t *p_map, - cl_map_item_t* const root, - cl_map_item_t* const nil_item); - -#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_tidflow.c b/prov/psm3/psm3/ptl_ips/ips_tidflow.c index dc7b7754d07..3305aedb865 100644 --- a/prov/psm3/psm3/ptl_ips/ips_tidflow.c +++ b/prov/psm3/psm3/ptl_ips/ips_tidflow.c @@ -59,9 +59,8 @@ #include "ips_expected_proto.h" #include "ips_tidflow.h" -// TBD - this is only needed for OPA or UD w/RNDV -// can reduce to just counting allocations on UD and -// not build for UDP. 
+// TBD - this is only needed for UD w/RNDV +// could omit from build for UDP. // Once that is done, could #ifdef PSMI_STATSTYPE_RDMA declaration // // TBD - move this into HAL and have init, fini, alloc, dealloc @@ -104,47 +103,40 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx; } - { - tfc->tf_ctrl = (struct ips_tf_ctrl *) - psmi_calloc(ep, UNDEFINED, 1, - sizeof(struct ips_tf_ctrl)); - if (tfc->tf_ctrl == NULL) { - return PSM2_NO_MEMORY; - } - } + tfc->tf_ctrl = (struct ips_tf_ctrl *)psmi_calloc(ep, UNDEFINED, 1, + sizeof(struct ips_tf_ctrl)); + if (tfc->tf_ctrl == NULL) + return PSM2_NO_MEMORY; /* * Only the master process can initialize. */ - { - tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; - tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; + tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; + tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; - for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { -// USE_RC TBD this is bizzare. For native mode it works fine -// for UD/UDP mode it crashes at next_free assignment below on some systems + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { +// USE_RC TBD this is bizzare. +// For UD/UDP mode it crashes at next_free assignment below on some systems // but adding this print or moving next_free assignment to separate // loop works fine. Really odd if this is a compiler issue, but // I don't see any other reason. 
We should be single threaded here // enabling the empty call to tidflow_reset doesn't help -// stubbing tidflow_reset on native works fine, can't explain crash -// nor workaround - /* Update flow state */ - tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; - tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; - tfc->tf_ctrl->tf[tf_idx].next_gen = 0; +// Can't explain crash nor workaround + /* Update flow state */ + tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; + tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; + tfc->tf_ctrl->tf[tf_idx].next_gen = 0; #if 0 - tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; #endif - } + } #if 1 - for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { - tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; - } -#endif - tfc->tf_ctrl->tf_head = 0; + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; } +#endif + tfc->tf_ctrl->tf_head = 0; #if TF_ADD /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */ @@ -179,12 +171,9 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; struct ips_tf_entry *entry; - if (!ctrl->tf_num_avail) { psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS); *tidrecvc = NULL; - - return PSM2_EP_NO_RESOURCES; } @@ -192,7 +181,6 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, ctrl->tf_head = entry->next_free; ctrl->tf_num_avail--; - tfc->tf_num_total++; tfc->tf_num_inuse++; @@ -206,7 +194,6 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx); psmi_assert_always(entry->next_gen < tfc->tf_gen_mask); - return PSM2_OK; } @@ -233,12 +220,10 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use tfc->tidrecvc[tf_idx].rdescid.u32w1++; } - entry->next_free = ctrl->tf_head; ctrl->tf_head = tf_idx; ctrl->tf_num_avail++; - tfc->tf_num_inuse--; /* If an available callback is registered 
invoke it */ if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb) @@ -246,4 +231,3 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use return PSM2_OK; } - diff --git a/prov/psm3/psm3/ptl_ips/ips_tidflow.h b/prov/psm3/psm3/ptl_ips/ips_tidflow.h index f3c29351bae..bfa2546c267 100644 --- a/prov/psm3/psm3/ptl_ips/ips_tidflow.h +++ b/prov/psm3/psm3/ptl_ips/ips_tidflow.h @@ -121,5 +121,4 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, /* Deallocate a tidflow */ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int used); - #endif diff --git a/prov/psm3/psm3/ptl_ips/ptl_ips.h b/prov/psm3/psm3/ptl_ips/ptl_ips.h index f5ab06d25ea..d33bd586adc 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_ips.h +++ b/prov/psm3/psm3/ptl_ips/ptl_ips.h @@ -60,10 +60,8 @@ #include "ips_proto.h" -struct gen1_ptl_shared; // OPA-only shared context - /* - * PTL at the ips level (for OPA) + * PTL at the ips level (for NIC) * * This PTL structure glues all the ips components together. 
* @@ -115,8 +113,6 @@ struct ptl_ips { /* context's status check timeout in cycles -- cached */ uint64_t status_cyc_timeout; - /* Shared contexts context - OPA only */ - struct gen1_ptl_shared *recvshc; /* Rcv thread context */ struct ptl_rcvthread *rcvthread; } diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index 2c697b1cf20..a990babb208 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -115,6 +115,7 @@ struct dsa_wq { uint32_t use_count; // how many threads assigned to this WQ uint32_t max_xfer_size; // Maximum supported transfer size uint8_t dedicated; // is this a dedicated (1) or shared (0) WQ + int fd; // Only valid if wq_reg is NULL }; static struct dsa_wq dsa_wqs[DSA_MAX_QUEUES]; static uint32_t dsa_my_num_wqs; @@ -123,9 +124,13 @@ static psmi_spinlock_t dsa_wq_lock; // protects dsa_wq.use_count // Each thread is assigned a DSA WQ on 1st memcpy +// These are only available in the thread, so we can only initialize them on +// 1st IO and we can't clear them since ep close could be called by main thread static __thread void *dsa_wq_reg = NULL; static __thread uint8_t dsa_wq_dedicated; static __thread uint32_t dsa_wq_xfer_limit; +static __thread int dsa_wq_fd = -1; + // we keep completion record in thread local storage instead of stack // this way if a DSA completion times out and arrives late it still has a @@ -156,6 +161,116 @@ static inline void movdir64b(struct dsa_hw_desc *desc, volatile void *reg) : : "a" (reg), "d" (desc)); } +/* + * Submit work to the shared workqueue. 
+ * + * Return: + * 0 == success + * -1 == Failure (timeout) + */ +static int dsa_swq_queue(struct dsa_hw_desc *desc, void *wq_reg, + struct dsa_stats *stats) +{ + uint64_t start_cycles, end_cycles; + int ret = 0; + + if (enqcmd(desc, wq_reg)) { + // must retry, limit attempts + start_cycles = get_cycles(); + end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT)/4; + while (enqcmd(desc, wq_reg)) { + if (get_cycles() > end_cycles) { + _HFI_INFO("DSA SWQ Enqueue Timeout\n"); + ret = -1; + stats->dsa_error++; + break; + } + } + if (!ret) + stats->dsa_swq_wait_ns += + cycles_to_nanosecs(get_cycles() - + start_cycles); + } else { + stats->dsa_swq_no_wait++; + } + + return ret; +} + +/* + * Submit work through the write call. + * + * Return: + * 0 == success + * -1 == Failure (timeout) + */ +static int dsa_write_queue(struct dsa_hw_desc *desc, int wq_fd, + struct dsa_stats *stats) +{ + uint64_t start_cycles, end_cycles; + int ret; + + ret = write(wq_fd, desc, sizeof(*desc)); + if (ret != sizeof(*desc)) { + _HFI_VDBG("DSA write failed: ret %d (%s)\n", + ret, strerror(errno)); + + /* Return if the err code is not "EAGAIN" */ + if (errno != EAGAIN) + return -1; + // must retry, limit attempts + start_cycles = get_cycles(); + end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT)/4; + ret = 0; + while (write(wq_fd, desc, sizeof(*desc)) != sizeof(*desc)) { + if (errno != EAGAIN) { + _HFI_INFO("DSA write failed: (%s)\n", + strerror(errno)); + ret = -1; + break; + } + if (get_cycles() > end_cycles) { + _HFI_INFO("DSA Write Enqueue Timeout\n"); + ret = -1; + stats->dsa_error++; + break; + } + } + if (!ret) + stats->dsa_wait_ns += + cycles_to_nanosecs(get_cycles() - + start_cycles); + } else { + stats->dsa_no_wait++; + ret = 0; + } + + return ret; +} + +/* + * Return: + * 0 -- Success + * -1 -- Failure + */ +static int dsa_submit(struct dsa_hw_desc *desc, void *wq_reg, + uint8_t wq_dedicated, int wq_fd, + struct dsa_stats *stats) +{ + int ret = 0; + + if (wq_reg) { + 
if (wq_dedicated) + /* use MOVDIR64B for DWQ */ + movdir64b(desc, wq_reg); + else + ret = dsa_swq_queue(desc, wq_reg, stats); + } else { + ret = dsa_write_queue(desc, wq_fd, stats); + } + + return ret; +} /* use DSA to copy a block of memory */ /* !rx-> copy from app to shm (sender), rx-> copy from shm to app (receiver) */ @@ -255,27 +370,14 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // make sure completion status zeroing fully written before post to HW //_mm_sfence(); { asm volatile("sfence":::"memory"); } - if (dsa_wq_dedicated) { - /* use MOVDIR64B for DWQ */ - movdir64b(&desc, dsa_wq_reg); - } else { - /* use ENQCMDS for SWQ */ - if (enqcmd(&desc, dsa_wq_reg)) { - // must retry, limit attempts - start_cycles = get_cycles(); - end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT)/4; - while (enqcmd(&desc, dsa_wq_reg)) { - if (get_cycles() > end_cycles) { - _HFI_INFO("Disabling DSA: DSA SWQ Enqueue Timeout\n"); - dsa_available = 0; - stats->dsa_error++; - goto memcpy_exit; - } - } - stats->dsa_swq_wait_ns += cycles_to_nanosecs(get_cycles() - start_cycles); - } else { - stats->dsa_swq_no_wait++; - } + + // Submit the work + if (dsa_submit(&desc, dsa_wq_reg, dsa_wq_dedicated, dsa_wq_fd, + stats)) { + // Fail to submit + _HFI_INFO("Disabling DSA: failed to submit work.\n"); + dsa_available = 0; + goto memcpy_exit; } if (cpu_n) { @@ -348,20 +450,31 @@ static void dsa_free_wqs(void) int proc; int i; + // free dsa_wqs, info relevant to our PROC for (i=0; i= 0) { + close(dsa_wqs[i].fd); + dsa_wqs[i].fd = -1; + } } + // free what we parsed for (proc=0; proc < dsa_num_proc; proc++) { for (i=0; ifd, &desc, sizeof(desc)); + if (ret == sizeof(desc)) { + ret = 0; + + start_cycles = get_cycles(); + end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT); + while (comp.status == 0) { + if (get_cycles() > end_cycles && comp.status == 0) { + _HFI_ERROR("DSA timed out.\n"); + return -1; + } + } + + if (comp.status != DSA_COMP_SUCCESS) + ret 
= -1; + } else { + _HFI_ERROR("write failed: ret %d (%s)\n", + ret, strerror(errno)); + ret = -1; + } + + return ret; +} + /* initialize DSA - call once per process */ /* Some invalid inputs and DSA initialization errors are treated as fatal errors * since if DSA gets initialized on some nodes, but not on others, the * inconsistency in shm FIFO sizes causes an obsure fatal error later in * PSM3 intialization. So make the error more obvious and fail sooner. + * + * Note: if this fails, caller may try again later, so must cleanup any + * globals or resources we allocate before return failure. */ int psm3_dsa_init(void) { @@ -518,6 +675,7 @@ int psm3_dsa_init(void) char *delim; int new_proc = 0; proc = 0; + dsa_num_wqs[proc] = 0; if (! temp) { _HFI_ERROR("Can't alloocate temp string"); @@ -564,8 +722,11 @@ int psm3_dsa_init(void) dsa_wq_mode[proc][dsa_num_wqs[proc]] = mode; dsa_wq_filename[proc][dsa_num_wqs[proc]] = psmi_strdup(PSMI_EP_NONE, s); dsa_num_wqs[proc]++; - if (new_proc) + if (new_proc) { proc++; + if (proc < DSA_MAX_PROC) + dsa_num_wqs[proc] = 0; + } s = delim+1; } while (delim); psmi_free(temp); @@ -680,8 +841,11 @@ int psm3_dsa_init(void) for (i=0; i= 0) return; // typical case, already picked one // rcvthread, pick last and don't count it @@ -761,6 +936,7 @@ static inline void psm3_dsa_pick_wq(void) dsa_wq_reg = dsa_wqs[sel].wq_reg; dsa_wq_dedicated = dsa_wqs[sel].dedicated; dsa_wq_xfer_limit = dsa_wqs[sel].max_xfer_size; + dsa_wq_fd = dsa_wqs[sel].fd; } diff --git a/prov/psm3/psm3/utils/utils_env.c b/prov/psm3/psm3/utils/utils_env.c index 55efb77bc2b..d2b3a68ca64 100644 --- a/prov/psm3/psm3/utils/utils_env.c +++ b/prov/psm3/psm3/utils/utils_env.c @@ -409,7 +409,7 @@ static int psm3_getenv_is_verblevel(int printlevel) // count number of fields in a str_tuple (field:field:....) // The number is number of colons + 1 -static int psm3_count_tuples(const char *str) +int psm3_count_tuples(const char *str) { int ret = 1; if (! 
str) diff --git a/prov/psm3/src/psmx3.h b/prov/psm3/src/psmx3.h index 5209f138d5f..35b12916f55 100644 --- a/prov/psm3/src/psmx3.h +++ b/prov/psm3/src/psmx3.h @@ -857,9 +857,9 @@ struct psmx3_env { }; #define PSMX3_MAX_UNITS PSMI_MAX_RAILS /* from psm_config.h */ +#define PSMX3_MAX_EPS 64 /* no real limit, used to report max_trx_ctxt */ struct psmx3_domain_info { int max_trx_ctxt; - int free_trx_ctxt; int num_units; /* total HW units found by PSM3 */ int num_reported_units; /* num entries in arrays below */ int num_active_units; /* total active found, >= num_reported_units */ @@ -867,8 +867,6 @@ struct psmx3_domain_info { int unit_is_active[PSMX3_MAX_UNITS]; int unit_id[PSMX3_MAX_UNITS]; /* PSM3 unit_id */ int addr_index[PSMX3_MAX_UNITS];/* PSM3 address index within unit_id */ - int unit_nctxts[PSMX3_MAX_UNITS]; - int unit_nfreectxts[PSMX3_MAX_UNITS]; char default_domain_name[PSMX3_MAX_UNITS * NAME_MAX]; /* autoselect:irdma0;irdma1;..... */ char default_fabric_name[PSMX3_MAX_UNITS * NAME_MAX]; /* RoCE 192.168.101.0/24;RoCE 192.168.102.0/24;.... */ }; diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index 87e8fc50bc8..c59de18c24e 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -115,6 +115,19 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } #endif +#ifdef HAVE___INT128 +#define CASE_INT_TYPE(FUNC,...) \ + case FI_INT8: FUNC(__VA_ARGS__,int8_t); break; \ + case FI_UINT8: FUNC(__VA_ARGS__,uint8_t); break; \ + case FI_INT16: FUNC(__VA_ARGS__,int16_t); break; \ + case FI_UINT16: FUNC(__VA_ARGS__,uint16_t); break; \ + case FI_INT32: FUNC(__VA_ARGS__,int32_t); break; \ + case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \ + case FI_INT64: FUNC(__VA_ARGS__,int64_t); break; \ + case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break; \ + case FI_INT128: FUNC(__VA_ARGS__,ofi_int128_t); break; \ + case FI_UINT128: FUNC(__VA_ARGS__,ofi_uint128_t); break; +#else #define CASE_INT_TYPE(FUNC,...) 
\ case FI_INT8: FUNC(__VA_ARGS__,int8_t); break; \ case FI_UINT8: FUNC(__VA_ARGS__,uint8_t); break; \ @@ -124,6 +137,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \ case FI_INT64: FUNC(__VA_ARGS__,int64_t); break; \ case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break; +#endif #define CASE_FP_TYPE(FUNC,...) \ case FI_FLOAT: FUNC(__VA_ARGS__,float); break; \ @@ -168,6 +182,20 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, #define PSMX3_BXOR(dst,src) (dst) ^= (src) #define PSMX3_COPY(dst,src) (dst) = (src) +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +/* res is always CPU address, dst could be GPU address */ +#define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ + do { \ + /*int i;*/ \ + TYPE *d = (dst); \ + TYPE *r = (res); \ + psmx3_lock(&psmx3_atomic_lock, 1); \ + /* for (i=0; i<(cnt); i++) */ \ + /*r[i] = d[i];*/ \ + psm3_memcpy(r, d, sizeof(TYPE)*cnt); \ + psmx3_unlock(&psmx3_atomic_lock, 1); \ + } while (0) +#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ #define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ do { \ int i; \ @@ -178,7 +206,29 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, r[i] = d[i]; \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) +#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +/* src is always CPU address, dst could be GPU address */ +#define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ + do { \ + int i; \ + TYPE *d = (dst); \ + TYPE *s = (src); \ + psmx3_lock(&psmx3_atomic_lock, 1); \ + for (i=0; i temp) @@ -315,7 +315,7 @@ static uint64_t get_max_inject_size(void) { } if (have_shm) { - temp = MQ_SHM_THRESH_RNDV; + temp = PSM3_MQ_RNDV_SHM_THRESH; psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_THRESH"), &temp, 0, UINT_MAX); if (thresh_rv > temp) @@ -327,7 +327,7 @@ static uint64_t get_max_inject_size(void) { if (have_nic) { // GPU ips rendezvous 
threshold // sockets HAL avoids rendezvous, so this may be overly restrictive - temp = GPU_THRESH_RNDV; + temp = PSM3_GPU_THRESH_RNDV; // PSM3_CUDA_THRESH_RNDV depricated, use PSM3_GPU_THRESH_RNDV if set psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, 0, UINT_MAX); @@ -339,7 +339,7 @@ static uint64_t get_max_inject_size(void) { if (have_shm) { // GPU shm rendezvous threshold - temp = MQ_SHM_GPU_THRESH_RNDV; + temp = PSM3_MQ_RNDV_SHM_GPU_THRESH; psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, 0, UINT_MAX); if (thresh_rv > temp) @@ -596,19 +596,11 @@ void psmx3_update_prov_info(struct fi_info *info, ! psmx3_domain_info.default_domain_name[0]) unit = 0; - if (unit == PSMX3_DEFAULT_UNIT || !psmx3_env.multi_ep) { - p->domain_attr->tx_ctx_cnt = psmx3_domain_info.free_trx_ctxt; - p->domain_attr->rx_ctx_cnt = psmx3_domain_info.free_trx_ctxt; - p->domain_attr->max_ep_tx_ctx = psmx3_domain_info.max_trx_ctxt; - p->domain_attr->max_ep_rx_ctx = psmx3_domain_info.max_trx_ctxt; - p->domain_attr->max_ep_stx_ctx = psmx3_domain_info.max_trx_ctxt; - } else { - p->domain_attr->tx_ctx_cnt = psmx3_domain_info.unit_nfreectxts[unit]; - p->domain_attr->rx_ctx_cnt = psmx3_domain_info.unit_nfreectxts[unit]; - p->domain_attr->max_ep_tx_ctx = psmx3_domain_info.unit_nctxts[unit]; - p->domain_attr->max_ep_rx_ctx = psmx3_domain_info.unit_nctxts[unit]; - p->domain_attr->max_ep_stx_ctx = psmx3_domain_info.unit_nctxts[unit]; - } + p->domain_attr->tx_ctx_cnt = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->rx_ctx_cnt = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->max_ep_tx_ctx = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->max_ep_rx_ctx = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->max_ep_stx_ctx = psmx3_domain_info.max_trx_ctxt; free(p->domain_attr->name); if (unit == PSMX3_DEFAULT_UNIT) diff --git a/prov/psm3/src/psmx3_av.c b/prov/psm3/src/psmx3_av.c index ac1d89ae531..53742afc121 100644 --- a/prov/psm3/src/psmx3_av.c +++ 
b/prov/psm3/src/psmx3_av.c @@ -234,11 +234,11 @@ void psmx3_epid_to_epaddr(struct psmx3_trx_ctxt *trx_ctxt, psmx3_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, "psm3_ep_connect returned error %s, remote epid=%s." "Try setting FI_PSM3_CONN_TIMEOUT " - "to a larger value (current: %d seconds).\n", + "to a larger value (current: %d seconds). Aborting\n", psm3_error_get_string(err), psm3_epid_fmt(epid, 0), psmx3_env.conn_timeout); else psmx3_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, - "psm3_ep_connect returned error %s, remote epid=%s.\n", + "psm3_ep_connect returned error %s, remote epid=%s. Aborting\n", psm3_error_get_string(err), psm3_epid_fmt(epid, 0)); abort(); diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index c20035a84de..29359d3ea34 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -391,16 +391,12 @@ static int psmx3_init_lib(void) static int psmx3_update_hfi_info(void) { unsigned short i, j, psmx3_unit; - int nctxts = 0; - int nfreectxts = 0; int multirail = 0; - int counted_unit; char *s = NULL; char unit_name[NAME_MAX]; char fabric_name[NAME_MAX]; uint32_t cnt = 0; uint32_t addr_cnt = 0; - int tmp_nctxts, tmp_nfreectxts; int unit_active; int ret; psm2_info_query_arg_t args[4]; @@ -459,25 +455,6 @@ static int psmx3_update_hfi_info(void) continue; } - if (PSM2_OK != psm3_info_query(PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, - &tmp_nfreectxts, 1, args) || (tmp_nfreectxts < 0)) - { - PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, - "Failed to read number of free contexts from HFI unit_id %d\n", - i); - continue; - } - - if (PSM2_OK != psm3_info_query(PSM2_INFO_QUERY_NUM_CONTEXTS, - &tmp_nctxts, 1, args) || (tmp_nctxts < 0)) - { - PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, - "Failed to read number of contexts from HFI unit_id %d\n", - i); - continue; - } - - counted_unit = 0; for (j=0; j < addr_cnt; j++) { psmx3_unit = i * addr_cnt + j; args[1].port = 1; // VERBS_PORT @@ -506,12 +483,6 @@ static int 
psmx3_update_hfi_info(void) continue; } - if (! counted_unit) { - nctxts += tmp_nctxts; - nfreectxts += tmp_nfreectxts; - counted_unit = 1; - } - psmx3_domain_info.num_active_units++; /* for PSM3_MULTIRAIL only report 1 "autoselect" unit */ @@ -519,8 +490,6 @@ static int psmx3_update_hfi_info(void) psmx3_domain_info.unit_is_active[psmx3_unit] = 1; psmx3_domain_info.unit_id[psmx3_unit] = i; psmx3_domain_info.addr_index[psmx3_unit] = j; - psmx3_domain_info.unit_nctxts[psmx3_unit] = tmp_nctxts; - psmx3_domain_info.unit_nfreectxts[psmx3_unit] = tmp_nfreectxts; psmx3_domain_info.active_units[psmx3_domain_info.num_reported_units++] = psmx3_unit; } if (psmx3_domain_info.num_active_units == 1) { @@ -554,22 +523,19 @@ static int psmx3_update_hfi_info(void) } PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, - "hfi1 units: total %d, reported %d, active %d; " - "hfi1 contexts: total %d, free %d\n", + "psm3 units: total %d, reported %d, active %d\n", psmx3_domain_info.num_units, psmx3_domain_info.num_reported_units, - psmx3_domain_info.num_active_units, nctxts, nfreectxts); + psmx3_domain_info.num_active_units); if (psmx3_env.multi_ep) { - psmx3_domain_info.max_trx_ctxt = nctxts; - psmx3_domain_info.free_trx_ctxt = nfreectxts; + psmx3_domain_info.max_trx_ctxt = PSMX3_MAX_EPS; } else { psmx3_domain_info.max_trx_ctxt = 1; - psmx3_domain_info.free_trx_ctxt = (nfreectxts == 0) ? 0 : 1; } PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, - "Tx/Rx contexts: %d in total, %d available.\n", - psmx3_domain_info.max_trx_ctxt, psmx3_domain_info.free_trx_ctxt); + "Tx/Rx contexts: %d allowed per process.\n", + psmx3_domain_info.max_trx_ctxt); return 0; }