Skip to content

Commit

Permalink
[v1.22.x] prov/psm3: update provider to sync with IEFS 11.7.0.0.110
Browse files Browse the repository at this point in the history
    Updates:
    - Improved auto-tuning features for PSM3, including
      dynamic Credit Flows and detection of the rv kernel
      module.
    - Improved PSM3 intra-node performance for large message
      sizes.

Signed-off-by: Scott Breyer <[email protected]>
(cherry picked from commit 386f574)
  • Loading branch information
sjb017 authored and j-xiong committed Jul 20, 2024
1 parent bdaa18b commit 19d2bf9
Show file tree
Hide file tree
Showing 72 changed files with 1,300 additions and 1,297 deletions.
4 changes: 0 additions & 4 deletions prov/psm3/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,6 @@ prov_psm3_psm3_libptl_ips_la_SOURCES = \
prov/psm3/psm3/ptl_ips/ips_recvq.h \
prov/psm3/psm3/ptl_ips/ips_scb.c \
prov/psm3/psm3/ptl_ips/ips_scb.h \
prov/psm3/psm3/ptl_ips/ips_tid.c \
prov/psm3/psm3/ptl_ips/ips_tid.h \
prov/psm3/psm3/ptl_ips/ips_tidcache.c \
prov/psm3/psm3/ptl_ips/ips_tidcache.h \
prov/psm3/psm3/ptl_ips/ips_tidflow.c \
prov/psm3/psm3/ptl_ips/ips_tidflow.h \
prov/psm3/psm3/ptl_ips/ptl.c \
Expand Down
2 changes: 1 addition & 1 deletion prov/psm3/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3_6_0_1
3_7_0_0
8 changes: 0 additions & 8 deletions prov/psm3/autogen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@ if test ! -f src/psmx3.h; then
exit 1
fi

if [ -f psm3/Makefile.include.base ]
then
make -f - <<EOF
psm3/Makefile.include: psm3/Makefile.include.base
cp psm3/Makefile.include.base psm3/Makefile.include
EOF
fi

set -x

autoreconf -ivf
6 changes: 5 additions & 1 deletion prov/psm3/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ AC_ARG_ENABLE([psm3-udp],
],[],[enable_psm3_udp=no])
AC_ARG_ENABLE([psm3-rc],
[AS_HELP_STRING([--enable-psm3-rc],
[EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=[Verbs HAL]@:>@])
[EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=check [check means match --enable-psm3-verbs option]@:>@])
],[],[enable_psm3_rc=check])
AS_IF([test "x$enable_psm3_udp" = "xyes"],
[
Expand Down Expand Up @@ -429,6 +429,10 @@ AS_IF([test x"$enable_atomics" != x"no"],
])
unset LIBS_save

dnl Check for 128-bit integer support
AC_CHECK_TYPE([__int128],
[AC_DEFINE(HAVE___INT128, 1, [Set to 1 to use 128-bit ints])])

dnl Check for gcc cpuid intrinsics
AC_MSG_CHECKING(compiler support for cpuid)
AC_LINK_IFELSE([AC_LANG_PROGRAM([[
Expand Down
2 changes: 1 addition & 1 deletion prov/psm3/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ AC_ARG_ENABLE([psm3-udp],
[enable_psm3_udp=no])
AC_ARG_ENABLE([psm3-rc],
[AS_HELP_STRING([--enable-psm3-rc],
[EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=[Verbs HAL]@:>@])],
[EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=check [check means match --enable-psm3-verbs option]@:>@])],
[],
[enable_psm3_rc=check])
dnl ------------- Extra Features
Expand Down
2 changes: 1 addition & 1 deletion prov/psm3/debian/changelog
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
libpsm3-fi (11.6.0.0-231) unstable; urgency=medium
libpsm3-fi (11.7.0.0-110) unstable; urgency=medium

* Initial release

Expand Down
4 changes: 0 additions & 4 deletions prov/psm3/psm3/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,6 @@ psm3_libptl_ips_la_SOURCES = \
psm3/ptl_ips/ips_recvq.h \
psm3/ptl_ips/ips_scb.c \
psm3/ptl_ips/ips_scb.h \
psm3/ptl_ips/ips_tid.c \
psm3/ptl_ips/ips_tid.h \
psm3/ptl_ips/ips_tidcache.c \
psm3/ptl_ips/ips_tidcache.h \
psm3/ptl_ips/ips_tidflow.c \
psm3/ptl_ips/ips_tidflow.h \
psm3/ptl_ips/ptl.c \
Expand Down
4 changes: 2 additions & 2 deletions prov/psm3/psm3/hal_sockets/sockets_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -1125,7 +1125,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz)
// unfortunately default TCP max_buffering (16K) is too small
// so flow_credit_bytes < 16K would prevent getting a good pipeline of
// packets/ACKs going
proto->flow_credit_bytes = ep->mtu * proto->flow_credits;
proto->flow_credit_bytes = ep->mtu * proto->max_credits;
} else {
// sockets buffering needs to place an upper bound on bytes
// while flow_credits places an upper bound on pkts
Expand Down Expand Up @@ -1229,7 +1229,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz)
}

// Fetch current link state to update linkinfo fields in ips_proto:
// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables
// ep_base_lid, ep_lmc, ep_link_rate
// These are all fields which can change during a link bounce.
// Note "active" state is not adjusted as on link down PSM will wait for
// the link to become usable again so it's always a viable/active device
Expand Down
1 change: 0 additions & 1 deletion prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
#include <sys/ioctl.h>
#include <sys/types.h>
#include "ips_proto.h"
#include "ptl_ips/ips_tid.h"
#include "ptl_ips/ips_expected_proto.h"

// flags=0 for send, 1 for recv
Expand Down
18 changes: 3 additions & 15 deletions prov/psm3/psm3/hal_sockets/sockets_hal.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ static int psm3_hfp_sockets_initialize(psmi_hal_instance_t *phi,
#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
// testing on HED-2629 suggests turning off RNDV can help
// latency for messages in size 8-256 KB
gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV;
psm3_gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV;
#endif
/* we initialize a few HAL software specific capabilities which
* are known before context_open can open RV or parse HAL specific
Expand Down Expand Up @@ -175,11 +175,11 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq)
* corresponding PSM3_* env variables.
* Otherwise these defaults are used.
*/
mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH;
mq->rndv_nic_thresh = PSM3_MQ_RNDV_NIC_THRESH;
mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR;
// Even without RDMA do we want to disable rendezvous?
// even without RDMA, the receiver controlled pacing helps scalability
mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous
mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous
mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY;
#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
if (PSMI_IS_GPU_ENABLED)
Expand Down Expand Up @@ -220,16 +220,6 @@ static int psm3_hfp_sockets_get_unit_active(int unit)
return psm3_sockets_get_unit_active(unit, SIMS_FILTER);
}

/* Sockets HAL handler installed as .hfp_get_num_contexts.
 * Returns the fixed value 1024 for every unit; the unit argument is
 * accepted only to match the handler signature and is not consulted.
 */
static int psm3_hfp_sockets_get_num_contexts(int unit)
{
	enum { SOCKETS_NUM_CONTEXTS = 1024 };

	(void)unit;	/* result does not depend on the unit */
	return SOCKETS_NUM_CONTEXTS;
}

/* Sockets HAL handler installed as .hfp_get_num_free_contexts.
 * Always reports 1024, whatever unit is passed in; nothing is
 * queried or tracked to produce the value.
 */
static int psm3_hfp_sockets_get_num_free_contexts(int unit)
{
	const int free_contexts = 1024;

	(void)unit;	/* unused: the same count is reported for all units */
	return free_contexts;
}

static int psm3_hfp_sockets_get_default_pkey(void)
{
return 0; /* use slot 0 as default */
Expand Down Expand Up @@ -305,8 +295,6 @@ static hfp_sockets_t psm3_sockets_hi = {
.hfp_get_num_ports = psm3_hfp_sockets_get_num_ports,
.hfp_get_unit_active = psm3_hfp_sockets_get_unit_active,
.hfp_get_port_active = psm3_hfp_sockets_get_port_active,
.hfp_get_num_contexts = psm3_hfp_sockets_get_num_contexts,
.hfp_get_num_free_contexts = psm3_hfp_sockets_get_num_free_contexts,
.hfp_get_default_pkey = psm3_hfp_sockets_get_default_pkey,
.hfp_get_port_subnet = psm3_hfp_sockets_get_port_subnet,
.hfp_get_unit_pci_bus = psm3_hfp_sockets_get_unit_pci_bus,
Expand Down
4 changes: 2 additions & 2 deletions prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_proto_init(
}

// Fetch current link state to update linkinfo fields in ips_proto:
// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables
// ep_base_lid, ep_lmc, ep_link_rate
// These are all fields which can change during a link bounce.
// Note "active" state is not adjusted as on link down PSM will wait for
// the link to become usable again so it's always a viable/active device
Expand Down Expand Up @@ -409,7 +409,7 @@ static PSMI_HAL_INLINE void psm3_hfp_sockets_ips_ipsaddr_disconnect(
{
}

/* Handle HAL specific initialization of ibta path record query, CCA
/* Handle HAL specific initialization of ibta path record query
* and dispersive routing
*/
static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_ibta_init(
Expand Down
3 changes: 2 additions & 1 deletion prov/psm3/psm3/hal_sockets/sockets_proto.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ psm3_tcp_proto_local_ack(struct ips_proto *proto, struct ips_flow *flow)

psmi_seqnum_t last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num;
while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num,
last_seq_num.psn_num, flow->xmit_ack_num.psn_num-1)
last_seq_num.psn_num, (flow->xmit_ack_num.psn_num-1) & proto->psn_mask)
) {
STAILQ_REMOVE_HEAD(unackedq, nextq);
#ifdef PSM_DEBUG
Expand Down Expand Up @@ -151,6 +151,7 @@ psm3_tcp_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed)
// local ack
if (scb) { // this check is unnecessary, but can make KW happy
flow->xmit_ack_num.psn_num = 1 + (__be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask);
flow->xmit_ack_num.psn_num &= proto->psn_mask;
}
psm3_tcp_proto_local_ack(proto, flow);
}
Expand Down
11 changes: 6 additions & 5 deletions prov/psm3/psm3/hal_verbs/verbs_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,6 @@ psm3_verbs_parse_params(psm2_ep_t ep)
// min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) *
// chunk size (psm3_mq_max_window_rv(mq, 0) after
// psm3_mq_initialize_params)
// for OPA native, actual window_rv may be smaller, but for UD it
// is not reduced
psm3_getenv("PSM3_RV_MR_CACHE_SIZE",
"kernel space MR cache size"
" (MBs, 0 lets rv module decide) [0]",
Expand Down Expand Up @@ -550,7 +548,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz)
ep->chunk_max_size = ep->mtu;
#ifdef PSM_BYTE_FLOW_CREDITS
// let flow_credits be the control
proto->flow_credit_bytes = ep->mtu * proto->flow_credits;
proto->flow_credit_bytes = ep->mtu * proto->max_credits;
_HFI_DBG("initial flow_credits %d bytes %d\n",
proto->flow_credits, proto->flow_credit_bytes);
#else
Expand Down Expand Up @@ -594,7 +592,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz)
}

// Fetch current link state to update linkinfo fields in ips_proto:
// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables
// ep_base_lid, ep_lmc, ep_link_rate
// These are all fields which can change during a link bounce.
// Note "active" state is not adjusted as on link down PSM will wait for
// the link to become usable again so it's always a viable/active device
Expand Down Expand Up @@ -2884,8 +2882,11 @@ unsigned psm3_verbs_parse_rdmamode(int reload)
// IPS_PROTOEXP_FLAGS_INTERLEAVE are N/A when RDMA not enabled

default_value = 0;
#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
#ifdef RNDV_MOD
if (psm3_rv_available()) {
default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL;
}
#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
// GPUDIRECT causes default_value of RDMA=1
if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect())
default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL;
Expand Down
1 change: 0 additions & 1 deletion prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@
#include <sys/ioctl.h>
#include <sys/types.h>
#include "ips_proto.h"
#include "ptl_ips/ips_tid.h"
#include "ptl_ips/ips_expected_proto.h"

// flags=0 for send, 1 for recv
Expand Down
16 changes: 2 additions & 14 deletions prov/psm3/psm3/hal_verbs/verbs_hal.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq)
* Otherwise these defaults are used.
*/
unsigned rdmamode = psm3_verbs_parse_rdmamode(1);
mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH;
mq->rndv_nic_thresh = PSM3_MQ_RNDV_NIC_THRESH;
mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR;
if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) {
// TBD - when RDMA is disabled do we want to disable rendezvous?
// even without RDMA, the receiver controlled pacing helps scalability
mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous
mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous
}
mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY;
#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
Expand Down Expand Up @@ -213,16 +213,6 @@ static int psm3_hfp_verbs_get_unit_active(int unit)
return psm3_verbs_get_unit_active(unit, VIMS_FILTER);
}

/* Verbs HAL handler installed as .hfp_get_num_contexts.
 * Returns the fixed value 1024 for every unit; the unit argument is
 * accepted only to match the handler signature and is not consulted.
 */
static int psm3_hfp_verbs_get_num_contexts(int unit)
{
	enum { VERBS_NUM_CONTEXTS = 1024 };

	(void)unit;	/* result does not depend on the unit */
	return VERBS_NUM_CONTEXTS;
}

/* Verbs HAL handler installed as .hfp_get_num_free_contexts.
 * Always reports 1024, whatever unit is passed in; nothing is
 * queried or tracked to produce the value.
 */
static int psm3_hfp_verbs_get_num_free_contexts(int unit)
{
	const int free_contexts = 1024;

	(void)unit;	/* unused: the same count is reported for all units */
	return free_contexts;
}

static int psm3_hfp_verbs_get_default_pkey(void)
{
return 0; /* use slot 0 as default */
Expand Down Expand Up @@ -293,8 +283,6 @@ static hfp_verbs_t psm3_verbs_hi = {
.hfp_get_num_ports = psm3_hfp_verbs_get_num_ports,
.hfp_get_unit_active = psm3_hfp_verbs_get_unit_active,
.hfp_get_port_active = psm3_hfp_verbs_get_port_active,
.hfp_get_num_contexts = psm3_hfp_verbs_get_num_contexts,
.hfp_get_num_free_contexts = psm3_hfp_verbs_get_num_free_contexts,
.hfp_get_default_pkey = psm3_hfp_verbs_get_default_pkey,
.hfp_get_port_subnet = psm3_hfp_verbs_get_port_subnet,
.hfp_get_unit_pci_bus = psm3_hfp_verbs_get_unit_pci_bus,
Expand Down
4 changes: 2 additions & 2 deletions prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_proto_init(
}

// Fetch current link state to update linkinfo fields in ips_proto:
// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables
// ep_base_lid, ep_lmc, ep_link_rate
// These are all fields which can change during a link bounce.
// Note "active" state is not adjusted as on link down PSM will wait for
// the link to become usable again so it's always a viable/active device
Expand Down Expand Up @@ -610,7 +610,7 @@ static PSMI_HAL_INLINE void psm3_hfp_verbs_ips_ipsaddr_disconnect(
#endif
}

/* Handle HAL specific initialization of ibta path record query, CCA
/* Handle HAL specific initialization of ibta path record query
* and dispersive routing
*/
static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ibta_init(
Expand Down
2 changes: 0 additions & 2 deletions prov/psm3/psm3/hal_verbs/verbs_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,5 @@ int psm3_verbs_get_unit_active(int unit, enum verbs_init_max_speed init_max_spee
returns <= 0 if no port on any of the units is active. */
int psm3_hfp_verbs_have_active_unit(int num_units);

/* get the number of contexts from the unit id. */
int psm3_verbs_get_num_contexts(int unit);
#endif /* PSM_HAL_VERBS_SERVICE_H */
#endif /* PSM_VERBS */
2 changes: 2 additions & 0 deletions prov/psm3/psm3/include/utils_env.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr,
union psmi_envvar_val *newval);
MOCK_DCL_EPILOGUE(psm3_getenv_range);

int psm3_count_tuples(const char *str);

/*
* Parsing int, unsigned int and long parameters
* 0 -> ok, *val updated
Expand Down
Loading

0 comments on commit 19d2bf9

Please sign in to comment.