From 19d2bf9e2c869ca85c5e0eb2ea9e1b7668409c21 Mon Sep 17 00:00:00 2001 From: Scott Breyer Date: Wed, 17 Jul 2024 11:43:25 -0400 Subject: [PATCH] [v1.22.x] prov/psm3: update provider to sync with IEFS 11.7.0.0.110 Updates: - Improved auto-tuning features for PSM3, including dynamic Credit Flows and detecting the presence of the rv kernel module. - Improved PSM3 intra-node performance for large message sizes. Signed-off-by: Scott Breyer (cherry picked from commit 386f5744fd343b30f769eb91cdbc4badf7fe37fc) --- prov/psm3/Makefile.include | 4 - prov/psm3/VERSION | 2 +- prov/psm3/autogen.sh | 8 - prov/psm3/configure.ac | 6 +- prov/psm3/configure.m4 | 2 +- prov/psm3/debian/changelog | 2 +- prov/psm3/psm3/Makefile.include | 4 - prov/psm3/psm3/hal_sockets/sockets_ep.c | 4 +- prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c | 1 - prov/psm3/psm3/hal_sockets/sockets_hal.c | 18 +- .../psm3/hal_sockets/sockets_hal_inline_i.h | 4 +- prov/psm3/psm3/hal_sockets/sockets_proto.c | 3 +- prov/psm3/psm3/hal_verbs/verbs_ep.c | 11 +- prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c | 1 - prov/psm3/psm3/hal_verbs/verbs_hal.c | 16 +- prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h | 4 +- prov/psm3/psm3/hal_verbs/verbs_service.h | 2 - prov/psm3/psm3/include/utils_env.h | 2 + prov/psm3/psm3/psm.c | 39 +- prov/psm3/psm3/psm2.h | 34 +- prov/psm3/psm3/psm2_hal.c | 41 -- prov/psm3/psm3/psm2_hal.h | 21 +- prov/psm3/psm3/psm2_hal_loopback.c | 14 +- prov/psm3/psm3/psm_config.h | 22 +- prov/psm3/psm3/psm_context.c | 2 - prov/psm3/psm3/psm_ep.c | 30 ++ prov/psm3/psm3/psm_ep.h | 2 +- prov/psm3/psm3/psm_ep_connect.c | 8 +- prov/psm3/psm3/psm_error.c | 2 +- prov/psm3/psm3/psm_lock.h | 6 + prov/psm3/psm3/psm_mq.c | 20 +- prov/psm3/psm3/psm_mq_internal.h | 2 +- prov/psm3/psm3/psm_mq_recv.c | 2 +- prov/psm3/psm3/psm_nic_select.c | 34 +- prov/psm3/psm3/psm_rndv_mod.c | 11 + prov/psm3/psm3/psm_rndv_mod.h | 2 + prov/psm3/psm3/psm_user.h | 101 ++++- prov/psm3/psm3/psm_utils.c | 3 +- prov/psm3/psm3/psm_utils.h | 2 - 
prov/psm3/psm3/psm_verbs_mr.c | 20 +- prov/psm3/psm3/ptl_am/am_config.h | 33 +- prov/psm3/psm3/ptl_am/am_reqrep_shmem.c | 387 +++++++++--------- prov/psm3/psm3/ptl_am/psm_am_internal.h | 31 +- prov/psm3/psm3/ptl_am/ptl.c | 101 +++-- prov/psm3/psm3/ptl_am/ptl_fwd.h | 3 - prov/psm3/psm3/ptl_ips/ips_config.h | 4 + prov/psm3/psm3/ptl_ips/ips_expected_proto.h | 2 - prov/psm3/psm3/ptl_ips/ips_path_rec.h | 12 - prov/psm3/psm3/ptl_ips/ips_proto.c | 178 ++++++-- prov/psm3/psm3/ptl_ips/ips_proto.h | 16 +- prov/psm3/psm3/ptl_ips/ips_proto_connect.c | 8 +- prov/psm3/psm3/ptl_ips/ips_proto_expected.c | 225 ++++------ prov/psm3/psm3/ptl_ips/ips_proto_header.h | 4 +- prov/psm3/psm3/ptl_ips/ips_proto_help.h | 10 +- prov/psm3/psm3/ptl_ips/ips_proto_mq.c | 160 ++++---- prov/psm3/psm3/ptl_ips/ips_proto_params.h | 5 +- prov/psm3/psm3/ptl_ips/ips_proto_recv.c | 53 +-- prov/psm3/psm3/ptl_ips/ips_scb.h | 2 +- prov/psm3/psm3/ptl_ips/ips_tid.c | 55 --- prov/psm3/psm3/ptl_ips/ips_tid.h | 61 --- prov/psm3/psm3/ptl_ips/ips_tidcache.c | 53 --- prov/psm3/psm3/ptl_ips/ips_tidcache.h | 158 ------- prov/psm3/psm3/ptl_ips/ips_tidflow.c | 60 +-- prov/psm3/psm3/ptl_ips/ips_tidflow.h | 1 - prov/psm3/psm3/ptl_ips/ptl_ips.h | 6 +- prov/psm3/psm3/utils/utils_dsa.c | 230 +++++++++-- prov/psm3/psm3/utils/utils_env.c | 2 +- prov/psm3/src/psmx3.h | 4 +- prov/psm3/src/psmx3_atomic.c | 143 ++++++- prov/psm3/src/psmx3_attr.c | 30 +- prov/psm3/src/psmx3_av.c | 4 +- prov/psm3/src/psmx3_init.c | 44 +- 72 files changed, 1300 insertions(+), 1297 deletions(-) delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tid.c delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tid.h delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tidcache.c delete mode 100644 prov/psm3/psm3/ptl_ips/ips_tidcache.h diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 47424fc2caf..9a7ef74370a 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -101,10 +101,6 @@ prov_psm3_psm3_libptl_ips_la_SOURCES = \ 
prov/psm3/psm3/ptl_ips/ips_recvq.h \ prov/psm3/psm3/ptl_ips/ips_scb.c \ prov/psm3/psm3/ptl_ips/ips_scb.h \ - prov/psm3/psm3/ptl_ips/ips_tid.c \ - prov/psm3/psm3/ptl_ips/ips_tid.h \ - prov/psm3/psm3/ptl_ips/ips_tidcache.c \ - prov/psm3/psm3/ptl_ips/ips_tidcache.h \ prov/psm3/psm3/ptl_ips/ips_tidflow.c \ prov/psm3/psm3/ptl_ips/ips_tidflow.h \ prov/psm3/psm3/ptl_ips/ptl.c \ diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index 144229f3d51..8cb63b0114c 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_6_0_1 +3_7_0_0 diff --git a/prov/psm3/autogen.sh b/prov/psm3/autogen.sh index b3894e4712b..2d7687527a6 100755 --- a/prov/psm3/autogen.sh +++ b/prov/psm3/autogen.sh @@ -5,14 +5,6 @@ if test ! -f src/psmx3.h; then exit 1 fi -if [ -f psm3/Makefile.include.base ] -then - make -f - <@]) + [EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=check [check means match --enable-psm3-verbs option]@:>@]) ],[],[enable_psm3_rc=check]) AS_IF([test "x$enable_psm3_udp" = "xyes"], [ @@ -429,6 +429,10 @@ AS_IF([test x"$enable_atomics" != x"no"], ]) unset LIBS_save +dnl Check for 128-bit integer support +AC_CHECK_TYPE([__int128], + [AC_DEFINE(HAVE___INT128, 1, [Set to 1 to use 128-bit ints])]) + dnl Check for gcc cpuid intrinsics AC_MSG_CHECKING(compiler support for cpuid) AC_LINK_IFELSE([AC_LANG_PROGRAM([[ diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 25aea136db6..5c8c083f7dc 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -456,7 +456,7 @@ AC_ARG_ENABLE([psm3-udp], [enable_psm3_udp=no]) AC_ARG_ENABLE([psm3-rc], [AS_HELP_STRING([--enable-psm3-rc], - [EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=[Verbs HAL]@:>@])], + [EXPERIMENTAL: Enable User Space RC QPs on applicable HALs @<:@default=check [check means match --enable-psm3-verbs option]@:>@])], [], [enable_psm3_rc=check]) dnl ------------- Extra Features diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 
0b1b356686f..52852ac0f5e 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.6.0.0-231) unstable; urgency=medium +libpsm3-fi (11.7.0.0-110) unstable; urgency=medium * Initial release diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index cc52b8f1868..fd253089532 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -66,10 +66,6 @@ psm3_libptl_ips_la_SOURCES = \ psm3/ptl_ips/ips_recvq.h \ psm3/ptl_ips/ips_scb.c \ psm3/ptl_ips/ips_scb.h \ - psm3/ptl_ips/ips_tid.c \ - psm3/ptl_ips/ips_tid.h \ - psm3/ptl_ips/ips_tidcache.c \ - psm3/ptl_ips/ips_tidcache.h \ psm3/ptl_ips/ips_tidflow.c \ psm3/ptl_ips/ips_tidflow.h \ psm3/ptl_ips/ptl.c \ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c index 27b98631508..ce7ddb61bc3 100755 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -1125,7 +1125,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // unfortunately default TCP max_buffering (16K) is too small // so flow_credit_bytes < 16K would prevent getting a good pipeline of // packets/ACKs going - proto->flow_credit_bytes = ep->mtu * proto->flow_credits; + proto->flow_credit_bytes = ep->mtu * proto->max_credits; } else { // sockets buffering needs to place an upper bound on bytes // while flow_credits places an upper bound on pkts @@ -1229,7 +1229,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) } // Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. 
// Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device diff --git a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c index b6235c533e6..645dfd3ebd2 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c +++ b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c @@ -58,7 +58,6 @@ #include #include #include "ips_proto.h" -#include "ptl_ips/ips_tid.h" #include "ptl_ips/ips_expected_proto.h" // flags=0 for send, 1 for recv diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index 8d4527bdd64..dd9ec3735dc 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -73,7 +73,7 @@ static int psm3_hfp_sockets_initialize(psmi_hal_instance_t *phi, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // testing on HED-2629 suggests turning off RNDV can help // latency for messages in size 8-256 KB - gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV; + psm3_gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV; #endif /* we initialize a few HAL software specific capabilities which * are known before context_open can open RV or parse HAL specific @@ -175,11 +175,11 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) * corresponding PSM3_* env variables. * Otherwise these defaults are used. */ - mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->rndv_nic_thresh = PSM3_MQ_RNDV_NIC_THRESH; mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; // Even without RDMA do we want to disable rendezvous? 
// even without RDMA, the receiver controlled pacing helps scalability - mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) @@ -220,16 +220,6 @@ static int psm3_hfp_sockets_get_unit_active(int unit) return psm3_sockets_get_unit_active(unit, SIMS_FILTER); } -static int psm3_hfp_sockets_get_num_contexts(int unit) -{ - return 1024; -} - -static int psm3_hfp_sockets_get_num_free_contexts(int unit) -{ - return 1024; -} - static int psm3_hfp_sockets_get_default_pkey(void) { return 0; /* use slot 0 as default */ @@ -305,8 +295,6 @@ static hfp_sockets_t psm3_sockets_hi = { .hfp_get_num_ports = psm3_hfp_sockets_get_num_ports, .hfp_get_unit_active = psm3_hfp_sockets_get_unit_active, .hfp_get_port_active = psm3_hfp_sockets_get_port_active, - .hfp_get_num_contexts = psm3_hfp_sockets_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_sockets_get_num_free_contexts, .hfp_get_default_pkey = psm3_hfp_sockets_get_default_pkey, .hfp_get_port_subnet = psm3_hfp_sockets_get_port_subnet, .hfp_get_unit_pci_bus = psm3_hfp_sockets_get_unit_pci_bus, diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h index 28b13150466..9b703674147 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h +++ b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h @@ -189,7 +189,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_proto_init( } // Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. 
// Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device @@ -409,7 +409,7 @@ static PSMI_HAL_INLINE void psm3_hfp_sockets_ips_ipsaddr_disconnect( { } -/* Handle HAL specific initialization of ibta path record query, CCA +/* Handle HAL specific initialization of ibta path record query * and dispersive routing */ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_ibta_init( diff --git a/prov/psm3/psm3/hal_sockets/sockets_proto.c b/prov/psm3/psm3/hal_sockets/sockets_proto.c index e7f90bb6982..c694151dcf1 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_proto.c +++ b/prov/psm3/psm3/hal_sockets/sockets_proto.c @@ -74,7 +74,7 @@ psm3_tcp_proto_local_ack(struct ips_proto *proto, struct ips_flow *flow) psmi_seqnum_t last_seq_num = STAILQ_LAST(unackedq, ips_scb, nextq)->seq_num; while (between((scb = STAILQ_FIRST(unackedq))->seq_num.psn_num, - last_seq_num.psn_num, flow->xmit_ack_num.psn_num-1) + last_seq_num.psn_num, (flow->xmit_ack_num.psn_num-1) & proto->psn_mask) ) { STAILQ_REMOVE_HEAD(unackedq, nextq); #ifdef PSM_DEBUG @@ -151,6 +151,7 @@ psm3_tcp_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) // local ack if (scb) { // this check is unnecessary, but can make KW happy flow->xmit_ack_num.psn_num = 1 + (__be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask); + flow->xmit_ack_num.psn_num &= proto->psn_mask; } psm3_tcp_proto_local_ack(proto, flow); } diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index 10a4e845e4b..f4e30d6c5e9 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -397,8 +397,6 @@ psm3_verbs_parse_params(psm2_ep_t ep) // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * // chunk size (psm3_mq_max_window_rv(mq, 0) after // psm3_mq_initialize_params) - // for OPA native, actual window_rv may be smaller, but for UD it - // is not reduced psm3_getenv("PSM3_RV_MR_CACHE_SIZE", 
"kernel space MR cache size" " (MBs, 0 lets rv module decide) [0]", @@ -550,7 +548,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->chunk_max_size = ep->mtu; #ifdef PSM_BYTE_FLOW_CREDITS // let flow_credits be the control - proto->flow_credit_bytes = ep->mtu * proto->flow_credits; + proto->flow_credit_bytes = ep->mtu * proto->max_credits; _HFI_DBG("initial flow_credits %d bytes %d\n", proto->flow_credits, proto->flow_credit_bytes); #else @@ -594,7 +592,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) } // Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. // Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device @@ -2884,8 +2882,11 @@ unsigned psm3_verbs_parse_rdmamode(int reload) // IPS_PROTOEXP_FLAGS_INTERLEAVE are N/A when RDMA not enabled default_value = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) #ifdef RNDV_MOD + if (psm3_rv_available()) { + default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; + } +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) // GPUDIRECT causes default_value of RDMA=1 if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect()) default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; diff --git a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c index 8fc324adedb..ab0942e5497 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c +++ b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c @@ -58,7 +58,6 @@ #include #include #include "ips_proto.h" -#include "ptl_ips/ips_tid.h" #include "ptl_ips/ips_expected_proto.h" // flags=0 for send, 1 for recv diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 9575b316ff2..69d27478b48 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ 
b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -166,12 +166,12 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) * Otherwise these defaults are used. */ unsigned rdmamode = psm3_verbs_parse_rdmamode(1); - mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->rndv_nic_thresh = PSM3_MQ_RNDV_NIC_THRESH; mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { // TBD - when RDMA is disabled do we want to disable rendezvous? // even without RDMA, the receiver controlled pacing helps scalability - mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -213,16 +213,6 @@ static int psm3_hfp_verbs_get_unit_active(int unit) return psm3_verbs_get_unit_active(unit, VIMS_FILTER); } -static int psm3_hfp_verbs_get_num_contexts(int unit) -{ - return 1024; -} - -static int psm3_hfp_verbs_get_num_free_contexts(int unit) -{ - return 1024; -} - static int psm3_hfp_verbs_get_default_pkey(void) { return 0; /* use slot 0 as default */ @@ -293,8 +283,6 @@ static hfp_verbs_t psm3_verbs_hi = { .hfp_get_num_ports = psm3_hfp_verbs_get_num_ports, .hfp_get_unit_active = psm3_hfp_verbs_get_unit_active, .hfp_get_port_active = psm3_hfp_verbs_get_port_active, - .hfp_get_num_contexts = psm3_hfp_verbs_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_verbs_get_num_free_contexts, .hfp_get_default_pkey = psm3_hfp_verbs_get_default_pkey, .hfp_get_port_subnet = psm3_hfp_verbs_get_port_subnet, .hfp_get_unit_pci_bus = psm3_hfp_verbs_get_unit_pci_bus, diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 2ba92503e9f..8ef06d9ae97 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -181,7 +181,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_proto_init( } // 
Fetch current link state to update linkinfo fields in ips_proto: -// ep_base_lid, ep_lmc, ep_link_rate, QoS tables, CCA tables +// ep_base_lid, ep_lmc, ep_link_rate // These are all fields which can change during a link bounce. // Note "active" state is not adjusted as on link down PSM will wait for // the link to become usable again so it's always a viable/active device @@ -610,7 +610,7 @@ static PSMI_HAL_INLINE void psm3_hfp_verbs_ips_ipsaddr_disconnect( #endif } -/* Handle HAL specific initialization of ibta path record query, CCA +/* Handle HAL specific initialization of ibta path record query * and dispersive routing */ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ibta_init( diff --git a/prov/psm3/psm3/hal_verbs/verbs_service.h b/prov/psm3/psm3/hal_verbs/verbs_service.h index 1767ce33038..dba159c82f8 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_service.h +++ b/prov/psm3/psm3/hal_verbs/verbs_service.h @@ -112,7 +112,5 @@ int psm3_verbs_get_unit_active(int unit, enum verbs_init_max_speed init_max_spee returns <= 0 if no port on any of the units is active. */ int psm3_hfp_verbs_have_active_unit(int num_units); -/* get the number of contexts from the unit id. 
*/ -int psm3_verbs_get_num_contexts(int unit); #endif /* PSM_HAL_VERBS_SERVICE_H */ #endif /* PSM_VERBS */ diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index d95660f6e01..770f04cc44a 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -153,6 +153,8 @@ int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv_range); +int psm3_count_tuples(const char *str); + /* * Parsing int, unsigned int and long parameters * 0 -> ok, *val updated diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index df138dd8a2f..e46f868f054 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -69,6 +69,8 @@ int psm3_allow_routers; // PSM3_ALLOW_ROUTERS char *psm3_allow_subnets[PSMI_MAX_SUBNETS]; // PSM3_SUBNETS int psm3_num_allow_subnets; unsigned int psm3_addr_per_nic = 1; +unsigned int psm3_reg_mr_fail_limit = 100; +unsigned int psm3_reg_mr_warn_cnt = 10; const char *psm3_nic_wildcard = NULL; @@ -108,7 +110,7 @@ uint32_t gdr_copy_limit_recv; int is_gpudirect_enabled = 0; int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect(). 
int is_driver_gpudirect_enabled; -uint32_t gpu_thresh_rndv = GPU_THRESH_RNDV; +uint32_t psm3_gpu_thresh_rndv = PSM3_GPU_THRESH_RNDV; uint64_t psm3_gpu_cache_evict; // in bytes #endif @@ -653,7 +655,7 @@ static void psmi_gpu_init(void) ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", "RNDV protocol is used for GPU send message sizes greater than the threshold", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, + (union psmi_envvar_val)psm3_gpu_thresh_rndv, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, NULL, NULL, &env_gpu_thresh_rndv); if (ret > 0) @@ -665,9 +667,10 @@ static void psmi_gpu_init(void) "[Deprecated, use PSM3_GPU_THRESH_RNDV]" " RNDV protocol is used for GPU send message sizes greater than the threshold", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, &env_gpu_thresh_rndv); + (union psmi_envvar_val)psm3_gpu_thresh_rndv, + &env_gpu_thresh_rndv); - gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; + psm3_gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; union psmi_envvar_val env_gdr_copy_limit_send; @@ -683,8 +686,8 @@ static void psmi_gpu_init(void) (union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send); gdr_copy_limit_send = env_gdr_copy_limit_send.e_int; - if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > gpu_thresh_rndv) - gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, gpu_thresh_rndv); + if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > psm3_gpu_thresh_rndv) + gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, psm3_gpu_thresh_rndv); union psmi_envvar_val env_gdr_copy_limit_recv; psm3_getenv("PSM3_GDRCOPY_LIMIT_RECV", @@ -1344,6 +1347,18 @@ psm2_error_t psm3_init(int *major, int *minor) } psm3_addr_per_nic = env_addr_per_nic.e_uint; } + { + union psmi_envvar_val env_reg_mr_fail_limit; + psm3_getenv("PSM3_REG_MR_FAIL_LIMIT", + "Max number of consecutive reg_mr failures", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union 
psmi_envvar_val)100, &env_reg_mr_fail_limit); + if (env_reg_mr_fail_limit.e_uint >= 1) { + psm3_reg_mr_fail_limit = env_reg_mr_fail_limit.e_uint; + if (psm3_reg_mr_warn_cnt > psm3_reg_mr_fail_limit) + psm3_reg_mr_warn_cnt = psm3_reg_mr_fail_limit; + } + } { union psmi_envvar_val env_allow_routers; psm3_getenv("PSM3_ALLOW_ROUTERS", @@ -1576,8 +1591,8 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, 0, /* PSM2_INFO_QUERY_NUM_PORTS */ 1, /* PSM2_INFO_QUERY_UNIT_STATUS */ 2, /* PSM2_INFO_QUERY_UNIT_PORT_STATUS */ - 1, /* PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ - 1, /* PSM2_INFO_QUERY_NUM_CONTEXTS */ + 0, /* was PSM2_INFO_QUERY_NUM_FREE_CONTEXTS */ + 0, /* was PSM2_INFO_QUERY_NUM_CONTEXTS */ 0, /* was PSM2_INFO_QUERY_CONFIG */ 0, /* was PSM2_INFO_QUERY_THRESH */ 0, /* was PSM2_INFO_QUERY_DEVICE_NAME */ @@ -1621,14 +1636,6 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, args[1].port); rv = PSM2_OK; break; - case PSM2_INFO_QUERY_NUM_FREE_CONTEXTS: - *((uint32_t*)out) = psmi_hal_get_num_free_contexts(args[0].unit); - rv = PSM2_OK; - break; - case PSM2_INFO_QUERY_NUM_CONTEXTS: - *((uint32_t*)out) = psmi_hal_get_num_contexts(args[0].unit); - rv = PSM2_OK; - break; case PSM2_INFO_QUERY_FEATURE_MASK: { #ifdef PSM_CUDA diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index b9ff1c598d1..cadb561dbd4 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -66,7 +66,7 @@ extern "C" { * @file psm2.h * @page psm2_main PSM2 API * - * @brief PSM2 OPA Messaging Library + * @brief PSM2 Messaging Library * * The PSM2 OPA Messaging API, or PSM2 API, is Intel's low-level * user-level communications interface for the OPA family of products. 
@@ -666,11 +666,11 @@ typedef psm2_epid_t psm2_nid_t; */ psm2_nid_t psm3_epid_nid(psm2_epid_t epid); -/** @brief Get Endpoint identifier's OPA context number */ +/** @brief Get Endpoint identifier's context number */ uint64_t psm3_epid_context(psm2_epid_t epid); #endif // 0 -/** @brief Get Endpoint identifier's OPA port (deprecated, use +/** @brief Get Endpoint identifier's network port (deprecated, use * @ref psm3_epid_context instead) */ uint64_t psm3_epid_port(psm2_epid_t epid); @@ -743,10 +743,10 @@ struct psm3_ep_open_opts { int imm_size; /* Immediate data size for endpoint */ }; -/** @brief OPA endpoint creation +/** @brief PSM3 endpoint creation * - * Function used to create a new local communication endpoint on an OPA - * adapter. The returned endpoint handle is required in all PSM2 communication + * Function used to create a new local communication endpoint on an adapter/NIC. + * The returned endpoint handle is required in all PSM2 communication * operations, as PSM2 can manage communication over multiple endpoints. An * opened endpoint has no global context until the user connects the endpoint * to other global endpoints by way of @ref psm3_ep_connect. All local endpoint @@ -1328,7 +1328,7 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); /* PSM2_COMPONENT_IB options */ /* Default service level to use to communicate with remote endpoints */ #define PSM2_IB_OPT_DF_SL 0x201 - /**< [@b uint32_t ] Default OPA SL to use for all remote communication. + /**< [@b uint32_t ] Default OPA/IB SL to use for all remote communication. * If unset defaults to Service Level 0. * * component object: Opened PSM2 endpoint id (@ref psm2_ep_t). @@ -1337,7 +1337,7 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); /* Set IB service level to use for communication to an endpoint */ #define PSM2_IB_OPT_EP_SL 0x202 - /**< [@b uint32_t ] OPA SL to use for communication to specified + /**< [@b uint32_t ] OPA/IB SL to use for communication to specified * remote endpoint. 
* * component object: PSM2 endpoint (@ ref psm2_epaddr_t) address. @@ -1348,7 +1348,7 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); /* MQ options that can be set in psm3_mq_init and psm2_{set,get}_opt */ #define PSM2_MQ_OPT_RNDV_IB_SZ 0x301 /**< [@b uint32_t ] Size at which to start enabling rendezvous - * messaging for OPA messages (if unset, defaults to values + * messaging for PSM3 messages (if unset, defaults to values * between 56000 and 72000 depending on the system configuration) * * component object: PSM2 Matched Queue (@ref psm2_mq_t). @@ -1615,19 +1615,11 @@ typedef enum psm2_info_query_et active. */ PSM2_INFO_QUERY_UNIT_PORT_STATUS, -/*! Required input arguments: 1 - 1. type: uint32_t, description: the unit for which the number of - free contexts is desired (use: psm2_info_query_arg_t.unit). - Output parameter: uint32_t, description: the number of free - contexts.. */ - PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, +/*! removed QUERY_NUM_FREE_CONTEXTS, but kept placeholder to retain values in enum */ + PSM2_WAS_INFO_QUERY_NUM_FREE_CONTEXTS, -/*! Required input arguments: 1 - 1. type: uint32_t, description: the unit for which the number of - contexts is desired (use: psm2_info_query_arg_t.unit). - Output parameter: uint32_t, description: the number of - contexts.. */ - PSM2_INFO_QUERY_NUM_CONTEXTS, +/*! removed QUERY_NUM_CONTEXTS, but kept placeholder to retain values in enum */ + PSM2_WAS_INFO_QUERY_NUM_CONTEXTS, /*! 
removed QUERY_CONFIG, but kept placeholder to retain values in enum */ PSM2_WAS_INFO_QUERY_CONFIG, diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 0c347ce2160..31a1cf67ecf 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -110,8 +110,6 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_get_num_ports); REJECT_IMPROPER_HI(hfp_get_unit_active); REJECT_IMPROPER_HI(hfp_get_port_active); - REJECT_IMPROPER_HI(hfp_get_num_contexts); - REJECT_IMPROPER_HI(hfp_get_num_free_contexts); REJECT_IMPROPER_HI(hfp_get_default_pkey); REJECT_IMPROPER_HI(hfp_get_port_subnet); REJECT_IMPROPER_HI(hfp_get_unit_pci_bus); @@ -293,37 +291,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) rv = -1; } break; - case psmi_hal_pre_init_cache_func_get_num_contexts: - { - int unit = va_arg(ap,int); - if ((unit >= 0) && (unit < p->params.num_units)) - { - if (!p->params.num_contexts_valid[unit]) { - p->params.num_contexts_valid[unit] = 1; - p->params.num_contexts[unit] = p->hfp_get_num_contexts(unit); - } - rv = p->params.num_contexts[unit]; - } - else - rv = -1; - } - break; - case psmi_hal_pre_init_cache_func_get_num_free_contexts: - { - int unit = va_arg(ap,int); - - if ((unit >= 0) && (unit < p->params.num_units)) - { - if (!p->params.num_free_contexts_valid[unit]) { - p->params.num_free_contexts_valid[unit] = 1; - p->params.num_free_contexts[unit] = p->hfp_get_num_free_contexts(unit); - } - rv = p->params.num_free_contexts[unit]; - } - else - rv = -1; - } - break; case psmi_hal_pre_init_cache_func_get_default_pkey: rv = p->params.default_pkey; break; @@ -581,10 +548,6 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(port_speed_valid); FREE_HAL_CACHE(port_lid); FREE_HAL_CACHE(port_lid_valid); - FREE_HAL_CACHE(num_contexts); - FREE_HAL_CACHE(num_contexts_valid); - FREE_HAL_CACHE(num_free_contexts); - FREE_HAL_CACHE(num_free_contexts_valid); 
FREE_HAL_CACHE(port_subnet_valid); FREE_HAL_CACHE(port_subnet); FREE_HAL_CACHE(port_subnet_addr); @@ -638,10 +601,6 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(port_speed_valid, int8_t, nunits*(nports+1)); ALLOC_HAL_CACHE(port_lid, int, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_lid_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); - ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits); - ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits); - ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits); - ALLOC_HAL_CACHE(num_free_contexts_valid, uint16_t, nunits); ALLOC_HAL_CACHE(port_subnet_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet, psmi_subnet128_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_addr, psmi_naddr128_t, nunits*(nports+1)*psm3_addr_per_nic); diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index 055261da6c4..91d187dcd56 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -83,10 +83,10 @@ struct psm3_ep_open_opts; */ typedef enum { - PSM_HAL_INDEX_VERBS = 1, - PSM_HAL_INDEX_SOCKETS = 2, - PSM_HAL_INDEX_LOOPBACK = 3, - PSM_HAL_INDEX_MAX = 3, + PSM_HAL_INDEX_VERBS = 0, + PSM_HAL_INDEX_SOCKETS = 1, + PSM_HAL_INDEX_LOOPBACK = 2, + PSM_HAL_INDEX_MAX = 2, } psmi_hal_instance_index_t; /* This string is used as the hal_name for both log messages @@ -232,8 +232,6 @@ typedef struct _psmi_hal_params int8_t *port_speed_valid; int *port_lid; int8_t *port_lid_valid; - uint16_t *num_contexts,*num_contexts_valid; - uint16_t *num_free_contexts,*num_free_contexts_valid; // information from port_get_subnet int8_t *port_subnet_valid; uint8_t *port_subnet_addr_fmt; @@ -340,13 +338,6 @@ struct _psmi_hal_instance int (*hfp_get_unit_active)(int unit); int (*hfp_get_port_active)(int unit,int port); - /* NOTE: hfp_get_num_contexts is a function that must - be callable before the hal instance is initialized. 
*/ - int (*hfp_get_num_contexts)(int unit); - /* NOTE: hfp_get_num_free_contexts is a function that must - be callable before the hal instance is initialized. */ - int (*hfp_get_num_free_contexts)(int unit); - /* Returns the default pkey: NOTE: hfp_get_default_pkey is a function that must be callable before the hal instance is initialized. */ @@ -519,8 +510,6 @@ enum psmi_hal_pre_init_cache_func_krnls psmi_hal_pre_init_cache_func_get_port_active, psmi_hal_pre_init_cache_func_get_port_speed, psmi_hal_pre_init_cache_func_get_port_lid, - psmi_hal_pre_init_cache_func_get_num_contexts, - psmi_hal_pre_init_cache_func_get_num_free_contexts, psmi_hal_pre_init_cache_func_get_default_pkey, psmi_hal_pre_init_cache_func_get_port_subnet, psmi_hal_pre_init_cache_func_get_port_subnet_name, @@ -580,8 +569,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) #define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_PI(get_port_speed,__VA_ARGS__) #define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_PI(get_port_lid,__VA_ARGS__) -#define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) -#define psmi_hal_get_num_free_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) #define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_get_port_subnet(...) PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__) #define psmi_hal_get_port_subnet_name(...) 
PSMI_HAL_DISPATCH_PI(get_port_subnet_name,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index 913a45dec78..6789ad18f59 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -131,16 +131,6 @@ static int psm3_hfp_loopback_get_port_active(int unit, int port) return (unit == 0) && (port == 1); } -static int psm3_hfp_loopback_get_num_contexts(int unit) -{ - return 1024; -} - -static int psm3_hfp_loopback_get_num_free_contexts(int unit) -{ - return 1024; -} - static int psm3_hfp_loopback_get_port_subnet(int unit, int port, int addr_index, psmi_subnet128_t *subnet, psmi_naddr128_t *addr, int *idx, psmi_gid128_t *gid) @@ -213,7 +203,7 @@ static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) mq->ips_gpu_window_rv_str = NULL; // no rendezvous #endif - mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous + mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; // RDMA and MR cache N/A, leave ep->rdmamode, ep->mr_cache_mode and // ep->rv_gpu_cache_size as set by caller (0, NONE, 0) @@ -276,8 +266,6 @@ hfp_loopback_t psm3_loopback_hi = { .hfp_get_num_ports = psm3_hfp_loopback_get_num_ports, .hfp_get_unit_active = psm3_hfp_loopback_get_unit_active, .hfp_get_port_active = psm3_hfp_loopback_get_port_active, - .hfp_get_num_contexts = psm3_hfp_loopback_get_num_contexts, - .hfp_get_num_free_contexts = psm3_hfp_loopback_get_num_free_contexts, .hfp_get_default_pkey = psm3_hfp_loopback_get_default_pkey, .hfp_get_port_subnet = psm3_hfp_loopback_get_port_subnet, .hfp_get_unit_pci_bus = psm3_hfp_loopback_get_unit_pci_bus, diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 4ce7de78157..9bd59690005 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -140,9 +140,13 @@ * Mutexlock should be used for experimentation while the more useful * 
mutexlock-debug should be enabled during development to catch potential * errors. + * + * When mutexlock-debug is enabled, mutexlock-debug-log-contention may also + * be enabled to log anytime a lock is contended for */ #ifdef PSM_DEBUG #define PSMI_LOCK_IS_MUTEXLOCK_DEBUG +//#define PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION #else #define PSMI_LOCK_IS_SPINLOCK /* #define PSMI_LOCK_IS_MUTEXLOCK */ @@ -168,7 +172,7 @@ /* All GPU transfers beyond this threshold use * RNDV protocol. It is mostly a send side knob. */ -#define GPU_THRESH_RNDV 8000 +#define PSM3_GPU_THRESH_RNDV 8000 #define GPUDIRECT_THRESH_RV 3 @@ -179,20 +183,26 @@ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ -#define PSM_MQ_NIC_RNDV_THRESH 64000 +#define PSM3_MQ_RNDV_NIC_THRESH 64000 #define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" #ifdef PSM_CUDA #define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" #elif defined(PSM_ONEAPI) #define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" #endif -#define PSM_MQ_NIC_MAX_RNDV_WINDOW (4 * 1024 * 1024) /* max rndv window */ +#define PSM3_MQ_RNDV_NIC_WINDOW_MAX (4 * 1024 * 1024) /* max rndv window */ + +/* + * Rendezvous threshold is same for CMA, scale-up or LONG_DATA mechanisms + */ +#define PSM3_MQ_RNDV_SHM_THRESH 16000 -#define MQ_SHM_THRESH_RNDV 16000 #if defined(PSM_CUDA) -#define MQ_SHM_GPU_THRESH_RNDV 127 +/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ +#define PSM3_MQ_RNDV_SHM_GPU_THRESH 63 #elif defined(PSM_ONEAPI) -#define MQ_SHM_GPU_THRESH_RNDV 127 +/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ +#define PSM3_MQ_RNDV_SHM_GPU_THRESH 127 #endif // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 35477d69f2f..678b394d71e 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -386,8 +386,6 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) int 
cpu_and_count = CPU_COUNT(&andcpuset); if (cpu_and_count > 0 && pthread_setaffinity_np(mythread, sizeof(andcpuset), &andcpuset)) { - // bug on OPA, dev_name not yet initialized - // ok on UD and UDP _HFI_ERROR( "Failed to set %s (unit %d) cpu set: %s\n", ep->dev_name, unit, strerror(errno)); //err = -PSM_HAL_ERROR_GENERAL_ERROR; goto bail; diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 36dbf40abfa..86dfa9a88d0 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -455,6 +455,10 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, opts.outsl = opts_i->outsl; if (opts_i->service_id) opts.service_id = (uint64_t) opts_i->service_id; +#ifdef PSM3_PATH_REC_QUERY + if (opts_i->path_res_type != PSM2_PATH_RES_NONE) + opts.path_res_type = opts_i->path_res_type; +#endif if (opts_i->senddesc_num) opts.senddesc_num = opts_i->senddesc_num; if (opts_i->imm_size) @@ -470,7 +474,33 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, opts.service_id = (uint64_t) envvar_val.e_ulonglong; } +#ifdef PSM3_PATH_REC_QUERY + const char *PSM3_PATH_REC_HELP = + "Mechanism to query NIC path record [opp, umad or none] (default is none)"; + /* Get Path resolution type from environment Possible choices are: + * + * NONE : Default same as previous instances. Utilizes static data. + * OPP : Use OFED Plus Plus library to do path record queries. + * UMAD : Use raw libibumad interface to form and process path records. 
+ */ + if (!psm3_getenv("PSM3_PATH_REC", PSM3_PATH_REC_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"none", &envvar_val)) { + if (!strcasecmp(envvar_val.e_str, "none")) + opts.path_res_type = PSM2_PATH_RES_NONE; + else if (!strcasecmp(envvar_val.e_str, "opp")) + opts.path_res_type = PSM2_PATH_RES_OPP; + else if (!strcasecmp(envvar_val.e_str, "umad")) + opts.path_res_type = PSM2_PATH_RES_UMAD; + else { + _HFI_INFO("Invalid value for PSM3_PATH_REC ('%s') %-40s Using: none\n", + envvar_val.e_str, PSM3_PATH_REC_HELP); + opts.path_res_type = PSM2_PATH_RES_NONE; + } + } +#else opts.path_res_type = PSM2_PATH_RES_NONE; +#endif /* Get user specified port number to use. */ if (!psm3_getenv("PSM3_NIC_PORT", "NIC Port number (0 autodetects)", diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h index c1ec006eff9..f8376331e32 100644 --- a/prov/psm3/psm3/psm_ep.h +++ b/prov/psm3/psm3/psm_ep.h @@ -173,7 +173,7 @@ struct psm2_ep { uint32_t hfi_imm_size; /** Immediate data size */ uint32_t connections; /**> Number of connections */ - /* HAL indicates send segmentation support (OPA Send DMA or UDP GSO) + /* HAL indicates send segmentation support (Send DMA or UDP GSO) * by setting max_segs>1 and max_size > 1 MTU. 
* chunk_size used will be min(chunk_max_segs*frag_size, chunk_max_size) * Can set 1 huge and other reasonable if want only 1 to control diff --git a/prov/psm3/psm3/psm_ep_connect.c b/prov/psm3/psm3/psm_ep_connect.c index 5e36cab14ae..56f66610c45 100644 --- a/prov/psm3/psm3/psm_ep_connect.c +++ b/prov/psm3/psm3/psm_ep_connect.c @@ -280,8 +280,8 @@ psm3_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, } else if (!psm3_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = - "there is no OPA PSM3 device (nic)"; - eperr = " OPA"; + "there is no PSM3 device (nic)"; + eperr = " nic"; } len = snprintf(errbuf, sizeof(errbuf) - 1, @@ -540,8 +540,8 @@ psm2_error_t psm3_ep_disconnect2(psm2_ep_t ep, int num_of_epaddr, } else if (!psm3_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = - "there is no OPA PSM3 device (nic)"; - eperr = " OPA"; + "there is no PSM3 device (nic)"; + eperr = " nic"; } len = snprintf(errbuf, sizeof(errbuf) - 1, diff --git a/prov/psm3/psm3/psm_error.c b/prov/psm3/psm3/psm_error.c index 1958b9cd77c..814139adff6 100644 --- a/prov/psm3/psm3/psm_error.c +++ b/prov/psm3/psm3/psm_error.c @@ -268,7 +268,7 @@ struct psmi_error_item psmi_error_items[] = { {PSMI_NOLOG, "unknown 18"}, {PSMI_NOLOG, "unknown 19"}, {PSMI_NOLOG, "Endpoint was closed"}, /* PSM2_EP_WAS_CLOSED = 20 */ - {LOG_ALERT, "PSM Could not find an OPA Unit"}, /* PSM2_EP_NO_DEVICE = 21 */ + {LOG_ALERT, "PSM Could not find a NIC"}, /* PSM2_EP_NO_DEVICE = 21 */ {PSMI_NOLOG, "User passed a bad unit number"}, /* PSM2_EP_UNIT_NOT_FOUND = 22 */ {LOG_ALERT, "Failure in initializing endpoint"}, /* PSM2_EP_DEVICE_FAILURE = 23 */ {PSMI_NOLOG, "Error closing the endpoing error"}, /* PSM2_EP_CLOSE_TIMEOUT = 24 */ diff --git a/prov/psm3/psm3/psm_lock.h b/prov/psm3/psm3/psm_lock.h index c483dba57e9..0965d26ba26 100644 --- a/prov/psm3/psm3/psm_lock.h +++ b/prov/psm3/psm3/psm_lock.h @@ -88,6 +88,9 @@ typedef struct { #elif defined(PSMI_LOCK_IS_MUTEXLOCK_DEBUG) pthread_mutex_t lock; 
pthread_t lock_owner; +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + const char *lock_owner_loc; +#endif #elif defined(PSMI_LOCK_IS_MUTEXLOCK) pthread_mutex_t lock; #endif @@ -154,6 +157,9 @@ PSMI_ALWAYS_INLINE(void psmi_init_lock(psmi_lock_t *lock)) pthread_mutex_init(&(lock->lock), &attr); pthread_mutexattr_destroy(&attr); lock->lock_owner = PSMI_LOCK_NO_OWNER; +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + lock->lock_owner_loc = "NONE"; +#endif #endif } diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c index 5203715fff8..4248ff7d28d 100644 --- a/prov/psm3/psm3/psm_mq.c +++ b/prov/psm3/psm3/psm_mq.c @@ -1426,13 +1426,13 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) switch (key) { case PSM2_MQ_RNDV_HFI_SZ: if (get) - *((uint32_t *) value) = mq->hfi_thresh_rv; + *((uint32_t *) value) = mq->rndv_nic_thresh; else { val32 = *((uint32_t *) value); - mq->hfi_thresh_rv = val32; + mq->rndv_nic_thresh = val32; } _HFI_VDBG("RNDV_HFI_SZ = %d (%s)\n", - mq->hfi_thresh_rv, get ? "GET" : "SET"); + mq->rndv_nic_thresh, get ? 
"GET" : "SET"); break; case PSM2_MQ_RNDV_SHM_SZ: @@ -1655,7 +1655,7 @@ static int psm3_mq_parse_window_rv(const char *str, if (delim) *delim = '\0'; // parse window - if (psm3_parse_str_uint(s, &win, 1, PSM_MQ_NIC_MAX_RNDV_WINDOW)) { + if (psm3_parse_str_uint(s, &win, 1, PSM3_MQ_RNDV_NIC_WINDOW_MAX)) { if (errstr_size) snprintf(errstr, errstr_size, " Invalid window_rv: %s", s); goto fail; @@ -2576,9 +2576,9 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo) // shm_thresh_rv is N/A to NIC and HAL, so we set this here and let // HAL set the rest of the defaults - mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV; + mq->shm_thresh_rv = PSM3_MQ_RNDV_SHM_THRESH; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - mq->shm_gpu_thresh_rv = MQ_SHM_GPU_THRESH_RNDV; + mq->shm_gpu_thresh_rv = PSM3_MQ_RNDV_SHM_GPU_THRESH; #endif psmi_hal_mq_init_defaults(mq); @@ -2618,8 +2618,8 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) psm3_getenv("PSM3_MQ_RNDV_NIC_THRESH", "NIC eager-to-rendezvous switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv); - mq->hfi_thresh_rv = env_hfirv.e_uint; + (union psmi_envvar_val)mq->rndv_nic_thresh, &env_hfirv); + mq->rndv_nic_thresh = env_hfirv.e_uint; #define WINDOW_SYNTAX "Specified as window_size:limit,window_size:limit, ...\nwhere limit is the largest message size the window_size is applicable to.\nThe last window_size in the list will be used for all remaining message\nsizes (eg. its limit is optional and ignored).\nwindow_size must be <= 4194304 and the limit in each entry must be larger\nthan the prior entry." 
@@ -2682,9 +2682,6 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) #endif /* PSM_CUDA || PSM_ONEAPI */ } - /* Re-evaluate this since it may have changed after initializing the shm - * device */ - mq->shm_thresh_rv = psm3_shm_mq_rv_thresh; psm3_getenv("PSM3_MQ_RNDV_SHM_THRESH", "shm eager-to-rendezvous switchover", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, @@ -2693,7 +2690,6 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) { - mq->shm_gpu_thresh_rv = psm3_shm_mq_gpu_rv_thresh; psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH", "shm eager-to-rendezvous switchover for GPU send", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h index 6c7127b0245..824dc1ad60a 100644 --- a/prov/psm3/psm3/psm_mq_internal.h +++ b/prov/psm3/psm3/psm_mq_internal.h @@ -178,7 +178,7 @@ struct psm2_mq { STAILQ_HEAD(, psm2_mq_req) eager_q; /**> eager request queue */ uint32_t hfi_thresh_tiny; - uint32_t hfi_thresh_rv; + uint32_t rndv_nic_thresh; uint32_t shm_thresh_rv; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) uint32_t shm_gpu_thresh_rv; diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index 7b481351843..181d4dd5ba7 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -463,7 +463,7 @@ psm3_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, /* We don't know recv_msglen yet but we set it here for * mq_iprobe */ req->req_data.send_msglen = req->req_data.recv_msglen = send_msglen; - PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->hfi_thresh_rv, + PSM2_LOG_EPM_COND(req->req_data.send_msglen > mq->rndv_nic_thresh, OPCODE_LONG_RTS,PSM2_LOG_RX,src->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); req->state = MQ_STATE_UNEXP_RV; diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c index 1a451f5eb67..58d3ab72b15 100644 --- 
a/prov/psm3/psm3/psm_nic_select.c +++ b/prov/psm3/psm3/psm_nic_select.c @@ -290,24 +290,22 @@ static void psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, long *unit_end, int nunits) { - { - int found, saved_hfis[nunits]; - - /* else, we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - // none found, caller will fail, start is a don't care - *unit_start = 0; - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - } + int found, saved_hfis[nunits]; + + /* we are going to look at: + (a hash of the job key plus the local rank id) mod nunits. */ + found = hfi_find_active_hfis(nunits, -1, saved_hfis); + if (found) + *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + + psm3_get_uuid_hash(job_key)) % found]; + else + // none found, caller will fail, start is a don't care + *unit_start = 0; + /* just in case, caller will check all other units, with wrap */ + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", *unit_start, *unit_end); } diff --git a/prov/psm3/psm3/psm_rndv_mod.c b/prov/psm3/psm3/psm_rndv_mod.c index f980fe73b49..1daa81f5c2c 100644 --- a/prov/psm3/psm3/psm_rndv_mod.c +++ b/prov/psm3/psm3/psm_rndv_mod.c @@ -284,6 +284,17 @@ static void rv_unmap_event_ring(psm3_rv_t rv, struct rv_event_ring* ring) ring->num = 0; } +// RV is available if RV_FILE_NAME (/dev/rv) exists +int psm3_rv_available() +{ + int fd = open(RV_FILE_NAME, O_RDWR); + if (fd == -1) { + return 0; + } + close(fd); + return 1; +} + // we call this once per ep (eg. 
NIC) so we supply the local address // of our NIC for use in the IB CM bind, especially for ethernet psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) diff --git a/prov/psm3/psm3/psm_rndv_mod.h b/prov/psm3/psm3/psm_rndv_mod.h index a9e246ed563..d6f0001a37c 100644 --- a/prov/psm3/psm3/psm_rndv_mod.h +++ b/prov/psm3/psm3/psm_rndv_mod.h @@ -185,6 +185,8 @@ static inline uint16_t psm3_rv_get_gpu_user_minor_bldtime_version(void) extern uint64_t psm3_min_gpu_bar_size(void); #endif +extern int psm3_rv_available(); + extern psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info); extern int psm3_rv_close(psm3_rv_t rv); diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 18c58d9934d..28a6e9de4dd 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -200,6 +200,9 @@ typedef void *psmi_hal_hw_context; #define PSMI_VERNO_GET_MAJOR(verno) (((verno)>>8) & 0xff) #define PSMI_VERNO_GET_MINOR(verno) (((verno)>>0) & 0xff) +extern unsigned int psm3_reg_mr_fail_limit; +extern unsigned int psm3_reg_mr_warn_cnt; + int psm3_verno_client(); int psm3_verno_isinteroperable(uint16_t verno); int MOCKABLE(psm3_isinitialized)(); @@ -213,7 +216,6 @@ int psm3_get_current_proc_location(); int psm3_get_max_cpu_numa(); extern int psm3_allow_routers; -extern uint32_t non_dw_mul_sdma; extern psmi_lock_t psm3_creation_lock; extern psm2_ep_t psm3_opened_endpoint; extern int psm3_opened_endpoint_count; @@ -246,43 +248,96 @@ extern void psm3_wake(psm2_ep_t ep); // wake from psm3_wait PSMI_ALWAYS_INLINE( int _psmi_mutex_trylock_inner(pthread_mutex_t *mutex, - const char *curloc, pthread_t *lock_owner)) + const char *curloc, pthread_t *lock_owner +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + , int check, const char **lock_owner_loc +#endif + )) { psmi_assert_always_loc(*lock_owner != pthread_self(), curloc); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + // this is imperfect as the owner's unlock can race with this 
function + // so we fetch loc1 and loc2 just before and after our trylock. Still + // imperfect, but helps provide insight on frequently contended locks + const char *loc1 = *lock_owner_loc; +#endif int ret = pthread_mutex_trylock(mutex); - if (ret == 0) + if (ret == 0) { *lock_owner = pthread_self(); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + *lock_owner_loc = curloc; + } else { + const char *loc2 = *lock_owner_loc; + if (check) + _HFI_VDBG("%s is trying for lock held by %s %s\n", curloc, loc1, loc2); +#endif + } return ret; } PSMI_ALWAYS_INLINE( int _psmi_mutex_lock_inner(pthread_mutex_t *mutex, - const char *curloc, pthread_t *lock_owner)) + const char *curloc, pthread_t *lock_owner +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + , const char **lock_owner_loc +#endif + )) { psmi_assert_always_loc(*lock_owner != pthread_self(), curloc); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + // this is imperfect as the owner's unlock can race with this function + // so we fetch loc1 and loc2 just before and after our trylock. Still + // imperfect, but helps provide insight on frequently contended locks + const char *loc1 = *lock_owner_loc; + if (! 
_psmi_mutex_trylock_inner(mutex, curloc, lock_owner, 0, lock_owner_loc)) + return 0; + const char *loc2 = *lock_owner_loc; + _HFI_VDBG("%s is waiting for lock held by %s %s\n", curloc, loc1, loc2); +#endif int ret = pthread_mutex_lock(mutex); psmi_assert_always_loc(ret != EDEADLK, curloc); *lock_owner = pthread_self(); +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + *lock_owner_loc = curloc; +#endif return ret; } PSMI_ALWAYS_INLINE( void _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, - const char *curloc, pthread_t *lock_owner)) + const char *curloc, pthread_t *lock_owner +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + , const char **lock_owner_loc +#endif + )) { psmi_assert_always_loc(*lock_owner == pthread_self(), curloc); *lock_owner = PSMI_LOCK_NO_OWNER; +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION + *lock_owner_loc = "NONE"; +#endif psmi_assert_always_loc(pthread_mutex_unlock(mutex) != EPERM, curloc); return; } #define _PSMI_LOCK_INIT(pl) /* static initialization */ +#ifdef PSMI_LOCK_MUTEXLOCK_DEBUG_LOG_CONTENTION +#define _PSMI_LOCK_TRY(pl) \ + _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner), 1, &((pl).lock_owner_loc)) +#define _PSMI_LOCK(pl) \ + _psmi_mutex_lock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner), &((pl).lock_owner_loc)) +#define _PSMI_UNLOCK(pl) \ + _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \ + &((pl).lock_owner), &((pl).lock_owner_loc)) +#else #define _PSMI_LOCK_TRY(pl) \ _psmi_mutex_trylock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) @@ -292,6 +347,7 @@ _psmi_mutex_unlock_inner(pthread_mutex_t *mutex, #define _PSMI_UNLOCK(pl) \ _psmi_mutex_unlock_inner(&((pl).lock), PSMI_CURLOC, \ &((pl).lock_owner)) +#endif #define _PSMI_LOCK_ASSERT(pl) \ psmi_assert_always((pl).lock_owner == pthread_self()); #define _PSMI_UNLOCK_ASSERT(pl) \ @@ -375,13 +431,13 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); extern int is_gdr_copy_enabled; /* This limit dictates 
when the sender turns off * GDR Copy and uses SDMA. The limit needs to be less than equal - * GPU RNDV threshold (gpu_thresh_rndv) + * GPU RNDV threshold (psm3_gpu_thresh_rndv) * set to 0 if GDR Copy disabled */ extern uint32_t gdr_copy_limit_send; /* This limit dictates when the reciever turns off * GDR Copy. The limit needs to be less than equal - * GPU RNDV threshold (gpu_thresh_rndv) + * GPU RNDV threshold (psm3_gpu_thresh_rndv) * set to 0 if GDR Copy disabled */ extern uint32_t gdr_copy_limit_recv; @@ -389,7 +445,7 @@ extern int is_gpudirect_enabled; // only for use during parsing of other params extern int _device_support_gpudirect; extern uint32_t gpudirect_rdma_send_limit; extern uint32_t gpudirect_rdma_recv_limit; -extern uint32_t gpu_thresh_rndv; +extern uint32_t psm3_gpu_thresh_rndv; #define MAX_ZE_DEVICES 8 @@ -920,31 +976,31 @@ int gpu_p2p_supported()) { if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; + _gpu_p2p_supported = 0; + if (unlikely(!is_cuda_enabled)) { - _gpu_p2p_supported=0; + _HFI_DBG("returning 0 (cuda disabled)\n"); return 0; } - int num_devices, dev; - CUcontext c; - /* Check which devices the current device has p2p access to. 
*/ - CUdevice current_device; + CUdevice current_device; + CUcontext current_context; + int num_devices, dev_idx; PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - _gpu_p2p_supported = 0; if (num_devices > 1) { - PSMI_CUDA_CALL(cuCtxGetCurrent, &c); - if (c == NULL) { + PSMI_CUDA_CALL(cuCtxGetCurrent, ¤t_context); + if (current_context == NULL) { _HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n"); return 0; } PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); } - for (dev = 0; dev < num_devices; dev++) { + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); + PSMI_CUDA_CALL(cuDeviceGet, &device, dev_idx); if (num_devices > 1 && device != current_device) { int canAccessPeer = 0; @@ -952,16 +1008,17 @@ int gpu_p2p_supported()) current_device, device); if (canAccessPeer != 1) - _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev); + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev_idx); else - _gpu_p2p_supported |= (1 << device); + _gpu_p2p_supported |= (1 << dev_idx); } else { /* Always support p2p on the same GPU */ - my_gpu_device = device; - _gpu_p2p_supported |= (1 << device); + my_gpu_device = dev_idx; + _gpu_p2p_supported |= (1 << dev_idx); } } + _HFI_DBG("returning (0x%x), device 0x%x (%d)\n", _gpu_p2p_supported, (1 << my_gpu_device), my_gpu_device); return _gpu_p2p_supported; } diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index c2525fa935c..0f1a3fe1d5d 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -130,7 +130,7 @@ uint32_t psm3_ceil_log2(uint64_t val) // so that psm2_epid_t contents can remain opaque to psm2 API callers // who will not see this more detailed psmi_epid_t but will just see psm2_epid_t // A psm2_nid_t also uses this format, but has 0 in the protocol and process -// specific fields (protocol, context, subcontext, qpn, pri_sock, 
aux_sock). +// specific fields (protocol, context, qpn, pri_sock, aux_sock). typedef union { psm2_epid_t psm2_epid; // to cast to/from psm2_epid_t uint64_t w[3]; // word by word access @@ -884,7 +884,6 @@ uint8_t psm3_epid_prefix_len(psm2_epid_t epid) } // The locally unique identifiers for the HW resources -// OPA Native - Context (also need sub-context) // Verbs - 24b QPN (IB, OPA and RoCE Verbs) // Sockets - 16b primary socket number (sin_port) (UDP/TCP) // This should not be called for psm2_nid_t diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index d39b49e6711..57742fc39ea 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -466,8 +466,6 @@ uint32_t psm3_crc(unsigned char *buf, int len); * CPUID return values */ #define CPUID_FAMILY_XEON 0x00000600 -#define CPUID_MODEL_PHI_GEN2 87 -#define CPUID_MODEL_PHI_GEN2M 133 /* * cpuid function 0, returns "GeniuneIntel" in EBX,ECX,EDX * due to Little Endian and Hex it is not so obvious diff --git a/prov/psm3/psm3/psm_verbs_mr.c b/prov/psm3/psm3/psm_verbs_mr.c index aa145cfdc28..fa8fdf39499 100644 --- a/prov/psm3/psm3/psm_verbs_mr.c +++ b/prov/psm3/psm3/psm_verbs_mr.c @@ -1347,6 +1347,7 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, psm3_verbs_mr_t key) { int save_errno; + static int fail_cnt = 0; /* Number of failed priority reg_mr requests */ ASSERT_MRC_FREE_LOCK(cache, mrc); #ifdef PSM_HAVE_RNDV_MOD @@ -1414,11 +1415,26 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, mrc->alloc_id = key->alloc_id; #endif ADD_STAT(cache, mrc->length, registered_bytes, max_registered_bytes); + /* Reset the fail counter */ + fail_cnt = 0; return mrc; failed_reg_mr: - _HFI_ERROR("reg_mr failed: "MRC_FMT": %s\n", MR_OUT_MRC(key), - strerror(save_errno)); + if (priority) { + /* Print the first failure */ + if (!fail_cnt) + _HFI_ERROR("reg_mr failed: "MRC_FMT": %s\n", + MR_OUT_MRC(key), strerror(save_errno)); + fail_cnt++; + /* Print a warning after consecutive 
failures */ + if (fail_cnt == psm3_reg_mr_warn_cnt) + _HFI_ERROR("reg_mr failed %d times in a row.\n", + psm3_reg_mr_warn_cnt); + /* Bail out if it fails too many times */ + if (fail_cnt >= psm3_reg_mr_fail_limit) + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "reg_mr failed for too many times.\n"); + } cache->failed++; cache->failed_reg_mr++; free_mr(cache, mrc); diff --git a/prov/psm3/psm3/ptl_am/am_config.h b/prov/psm3/psm3/ptl_am/am_config.h index 79600601037..9ff2c3972e4 100644 --- a/prov/psm3/psm3/ptl_am/am_config.h +++ b/prov/psm3/psm3/ptl_am/am_config.h @@ -56,25 +56,9 @@ #include "psm_config.h" -/* - * Can change the rendezvous threshold based on usage of cma (or not) - */ -#define PSMI_MQ_RV_THRESH_CMA 16000 - -/* If no kernel assisted copy is available this is the rendezvous threshold */ -#define PSMI_MQ_RV_THRESH_NO_KASSIST 16000 - #define AMSH_HAVE_CMA 0x1 #define AMSH_HAVE_KASSIST 0x1 -#if defined(PSM_CUDA) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSMI_MQ_GPU_RV_THRESH 127 -#elif defined(PSM_ONEAPI) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSMI_MQ_GPU_RV_THRESH 127 -#endif - /* Each block reserves some space at the beginning to store auxiliary data */ #define AMSH_BLOCK_HEADER_SIZE 4096 @@ -86,6 +70,12 @@ * am_pkt_bulk_t header struct. 
*/ #define AMLONG_SZ_NO_DSA 8192 +// for AI workloads with limited processes and multi-ep, better to have +// large MTU and will default to CMA off for all but 1st EP +#define AMLONG_SZ_MULTIEP 32768 +// This is the range we allow AMLONG_SZ to be configured as +#define AMLONG_SZ_MIN 1024 +#define AMLONG_SZ_MAX (1024*1024) #ifdef PSM_DSA /* DSA benefits from larger bulk packets and hence larger copies */ @@ -94,7 +84,14 @@ #define AMLONG_SZ_DSA (1024*512) #endif -#define PSMI_KASSIST_MODE_DEFAULT PSMI_KASSIST_CMA_GET -#define PSMI_KASSIST_MODE_DEFAULT_STRING "cma-get" +// GPU only supports GET("cma-get") or OFF("none"), so can't use PUT as default +#define PSM3_KASSIST_MODE_DEFAULT PSM3_KASSIST_CMA_GET +#define PSM3_KASSIST_MODE_DEFAULT_STRING "cma-get" + +#ifdef PSM_FI +#define SHM_FAULTINJ_CMA_ERR 10000 /* 1 every X CMA get/put error */ +#define SHM_FAULTINJ_CMA_NOTAVAIL 4 /* 1 every X CMA available at init */ +#endif /* PSM_FI */ + #endif /* PTL_AM_AM_CONFIG_H */ diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c index 020f3afb349..89dbdd6cd87 100644 --- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c +++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c @@ -87,28 +87,15 @@ #endif #endif -int psm3_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -int psm3_shm_mq_gpu_rv_thresh = PSMI_MQ_GPU_RV_THRESH; -#endif - -// qcounts and qelemsz tunable via amsh_fifo_getconfig(); -static amsh_qinfo_t amsh_qcounts = { - .qreqFifoShort = AMSHORT_Q_NO_DSA, - .qreqFifoLong = AMLONG_Q_NO_DSA, - .qrepFifoShort = AMSHORT_Q_NO_DSA, - .qrepFifoLong = AMLONG_Q_NO_DSA -}; +/* AMLONG_PAYLOAD is number of bytes available in a bulk packet for payload. 
*/ +#define AMLONG_PAYLOAD(FifoLong) ((FifoLong) - sizeof(am_pkt_bulk_t)) -static amsh_qinfo_t amsh_qelemsz = { - .qreqFifoShort = sizeof(am_pkt_short_t), - .qreqFifoLong = AMLONG_SZ_NO_DSA, - .qrepFifoShort = sizeof(am_pkt_short_t), - .qrepFifoLong = AMLONG_SZ_NO_DSA -}; +/* req and rep MTU is the same, so can use either here */ +/* this is our local MTU, use when receiving data */ +#define AMLONG_MTU_LOCAL(ptl) AMLONG_PAYLOAD((ptl)->qelemsz.qreqFifoLong) -/* AMLONG_MTU is the number of bytes available in a bulk packet for payload. */ -#define AMLONG_MTU (amsh_qelemsz.qreqFifoLong-sizeof(am_pkt_bulk_t)) +/* this is the MTU of a peer, use when sending data */ +#define AMLONG_MTU_DEST(ptl, destidx) AMLONG_PAYLOAD((ptl)->am_ep[destidx].qdir.qreqH->longbulkq.elem_sz) ustatic struct { void *addr; @@ -124,9 +111,9 @@ static void amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, /* Kassist helper functions */ #if _HFI_DEBUGGING -static const char *psmi_kassist_getmode(int mode); +static const char *psm3_kassist_getmode(int mode); #endif -static int psm3_get_kassist_mode(); +static int psm3_get_kassist_mode(int first_ep); int psm3_epaddr_pid(psm2_epaddr_t epaddr); static inline void @@ -152,19 +139,28 @@ am_ctl_bulkpkt_init(am_pkt_bulk_t *base_ptr, size_t elemsz, int nelems) } } -#define _PA(type) PSMI_ALIGNUP(amsh_qcounts.q ## type * amsh_qelemsz.q ## type, \ - PSMI_PAGESIZE) -static inline uintptr_t am_ctl_sizeof_block() +#define AMSH_QSIZE(ptl, type) \ + PSMI_ALIGNUP((ptl)->qelemsz.q ## type * (ptl)->qcounts.q ## type, \ + PSMI_PAGESIZE) + +// compute size for our inbound shm segment +static inline uintptr_t am_ctl_sizeof_block(struct ptl_am *ptl) { - return PSMI_ALIGNUP( - PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + + return PSMI_ALIGNUP(AMSH_BLOCK_HEADER_SIZE, PSMI_PAGESIZE) + /* reqctrl block */ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + - _PA(reqFifoShort) + _PA(reqFifoLong) + + AMSH_QSIZE(ptl, reqFifoShort) + AMSH_QSIZE(ptl, 
reqFifoLong) + /*reqctrl block */ PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE) + - /* align to page size */ - _PA(repFifoShort) + _PA(repFifoLong), PSMI_PAGESIZE); + AMSH_QSIZE(ptl, repFifoShort) + AMSH_QSIZE(ptl, repFifoLong); +} + +// compute size for a remote node's shm segment +static inline uintptr_t am_ctl_sizeof_seg(struct am_ctl_nodeinfo *nodeinfo) +{ + return ((uintptr_t) nodeinfo->qdir.qrepFifoLong + + nodeinfo->amsh_qsizes.qrepFifoLong) + - nodeinfo->amsh_shmbase; } #undef _PA @@ -189,7 +185,7 @@ static void read_extra_ep_data(uint32_t data, uint32_t *pid, uint32_t *gpu) *gpu = (data & ~pid_mask) >> 22; } -static void am_update_directory(struct am_ctl_nodeinfo *); +static void am_update_directory(struct am_ctl_nodeinfo *, size_t segsz); static void amsh_atexit() @@ -282,15 +278,8 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen) int shmfd = -1; char *amsh_keyname = NULL; int iterator; - /* Get which kassist mode to use. */ - ptl->psmi_kassist_mode = psm3_get_kassist_mode(); - - _HFI_PRDBG("kassist_mode %d %s use_kassist %d\n", - ptl->psmi_kassist_mode, - psmi_kassist_getmode(ptl->psmi_kassist_mode), - (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF)); - segsz = am_ctl_sizeof_block(); + segsz = am_ctl_sizeof_block(ptl); for (iterator = 0; iterator < INT_MAX; iterator++) { snprintf(shmbuf, sizeof(shmbuf), @@ -426,9 +415,10 @@ psm2_error_t psm3_epdir_extend(ptl_t *ptl_gen) } /** - * Unmap shm regions upon proper disconnect with other processes + * Unmap peer's shm region upon proper disconnect with other processes */ -psm2_error_t psm3_do_unmap(uintptr_t shmbase) +psm2_error_t psm3_do_unmap(struct am_ctl_nodeinfo *nodeinfo) + { psm2_error_t err = PSM2_OK; #if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) @@ -437,9 +427,9 @@ psm2_error_t psm3_do_unmap(uintptr_t shmbase) /* ignore other errors as context could be destroyed before this */ CUresult cudaerr; //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, 
(void*)shmbase); + // cuMemHostUnregister, (void*)nodeinfo->amsh_shmbase); psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + cudaerr = psmi_cuMemHostUnregister((void*)nodeinfo->amsh_shmbase); if (cudaerr) { const char *pStr = NULL; psmi_count_cuGetErrorString++; @@ -453,16 +443,16 @@ psm2_error_t psm3_do_unmap(uintptr_t shmbase) if (PSMI_IS_GPU_ENABLED) { ze_result_t result; //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, - // (void *)shmbase); + // (void *)nodeinfo->amsh_shmbase); psmi_count_zexDriverReleaseImportedPointer++; result = psmi_zexDriverReleaseImportedPointer(ze_driver, - (void *)shmbase); + (void *)nodeinfo->amsh_shmbase); if (result != ZE_RESULT_SUCCESS) { _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); } } #endif - if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + if (munmap((void *)nodeinfo->amsh_shmbase, am_ctl_sizeof_seg(nodeinfo))) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error with munmap of shared segment: %s", @@ -484,11 +474,10 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i; - int use_kassist; uint16_t shmidx; char shmbuf[256]; void *dest_mapptr; - size_t segsz; + size_t segsz = 0; psm2_error_t err = PSM2_OK; int dest_shmfd; struct am_ctl_nodeinfo *dest_nodeinfo; @@ -509,9 +498,6 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm } - use_kassist = (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF); - - segsz = am_ctl_sizeof_block(); for (iterator = 0; iterator < INT_MAX; iterator++) { snprintf(shmbuf, sizeof(shmbuf), @@ -521,9 +507,10 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm iterator); dest_shmfd = shm_open(shmbuf, O_RDWR, S_IRWXU); if (dest_shmfd < 0) { - if (errno == EACCES && iterator < INT_MAX) + if (errno == 
EACCES && iterator < INT_MAX) { + err = PSM2_SHMEM_SEGMENT_ERR; continue; - else { + } else { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error opening remote " @@ -544,8 +531,9 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm close(dest_shmfd); goto fail; } - if (getuid() == st.st_uid) { + if (getuid() == st.st_uid && st.st_size) { err = PSM2_OK; + segsz = st.st_size; break; } else { err = PSM2_SHMEM_SEGMENT_ERR; @@ -561,6 +549,7 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm "namespace exhausted."); goto fail; } + psmi_assert(segsz); dest_mapptr = mmap(NULL, segsz, PROT_READ | PROT_WRITE, MAP_SHARED, dest_shmfd, 0); @@ -613,45 +602,26 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm for (i = 0; i <= ptl->max_ep_idx; i++) { if (!psm3_epid_zero_internal(ptl->am_ep[i].epid)) - am_update_directory(&ptl->am_ep[i]); + am_update_directory(&ptl->am_ep[i], am_ctl_sizeof_seg(&ptl->am_ep[i])); } } for (i = 0; i < ptl->am_ep_size; i++) { psmi_assert(psm3_epid_cmp_internal(ptl->am_ep[i].epid, epid)); if (psm3_epid_zero_internal(ptl->am_ep[i].epid)) { + // populate our local copy of the peer's nodeinfo ptl->am_ep[i].epid = epid; ptl->am_ep[i].psm_verno = dest_nodeinfo->psm_verno; ptl->am_ep[i].pid = dest_nodeinfo->pid; - if (use_kassist) { - /* If we are able to use CMA assume everyone - * else on the node can also use it. - * Advertise that CMA is active via the - * feature flag. 
- */ - - if (psm3_cma_available()) { - ptl->am_ep[i].amsh_features |= - AMSH_HAVE_CMA; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_CMA; - } else { - ptl->psmi_kassist_mode = - PSMI_KASSIST_OFF; - use_kassist = 0; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; - } - } else - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; - _HFI_CONNDBG("KASSIST MODE: %s\n", - psmi_kassist_getmode(ptl->psmi_kassist_mode)); + ptl->am_ep[i].amsh_features = dest_nodeinfo->amsh_features; + _HFI_CONNDBG("Peer KASSIST: %d\n", + (ptl->am_ep[i].amsh_features & AMSH_HAVE_CMA) != 0); shmidx = *shmidx_o = i; _HFI_CONNDBG("Mapped epid %s into shmidx %d\n", psm3_epid_fmt_internal(epid, 0), shmidx); ptl->am_ep[i].amsh_shmbase = (uintptr_t) dest_mapptr; ptl->am_ep[i].amsh_qsizes = dest_nodeinfo->amsh_qsizes; if (i > ptl->max_ep_idx) ptl->max_ep_idx = i; + am_update_directory(&ptl->am_ep[shmidx], segsz); break; } } @@ -671,10 +641,6 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm * Initialize pointer structure and locks for endpoint shared-memory AM. 
*/ -#define AMSH_QSIZE(type) \ - PSMI_ALIGNUP(amsh_qelemsz.q ## type * amsh_qcounts.q ## type, \ - PSMI_PAGESIZE) - static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; @@ -689,10 +655,10 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) if ((err = psm3_shm_create(ptl_gen))) goto fail; - ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(reqFifoShort); - ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(reqFifoLong); - ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(repFifoShort); - ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(repFifoLong); + ptl->self_nodeinfo->amsh_qsizes.qreqFifoShort = AMSH_QSIZE(ptl, reqFifoShort); + ptl->self_nodeinfo->amsh_qsizes.qreqFifoLong = AMSH_QSIZE(ptl, reqFifoLong); + ptl->self_nodeinfo->amsh_qsizes.qrepFifoShort = AMSH_QSIZE(ptl, repFifoShort); + ptl->self_nodeinfo->amsh_qsizes.qrepFifoLong = AMSH_QSIZE(ptl, repFifoLong); /* We core dump right after here if we don't check the mmap */ @@ -710,38 +676,38 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) ptl->reqH.base = ptl->reqH.head = ptl->reqH.end = NULL; ptl->repH.base = ptl->repH.head = ptl->repH.end = NULL; - am_update_directory(ptl->self_nodeinfo); + am_update_directory(ptl->self_nodeinfo, am_ctl_sizeof_block(ptl)); ptl->reqH.head = ptl->reqH.base = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort)); ptl->reqH.end = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qreqFifoShort) + - amsh_qcounts.qreqFifoShort * amsh_qelemsz.qreqFifoShort); + ptl->qcounts.qreqFifoShort * (uintptr_t)ptl->qelemsz.qreqFifoShort); ptl->repH.head = ptl->repH.base = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort)); ptl->repH.end = (am_pkt_short_t *) (((uintptr_t)ptl->self_nodeinfo->qdir.qrepFifoShort) + - amsh_qcounts.qrepFifoShort * amsh_qelemsz.qrepFifoShort); + ptl->qcounts.qrepFifoShort * (uintptr_t)ptl->qelemsz.qrepFifoShort); 
am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->shortq, - amsh_qcounts.qreqFifoShort, - amsh_qelemsz.qreqFifoShort); + ptl->qcounts.qreqFifoShort, + ptl->qelemsz.qreqFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qreqH->longbulkq, - amsh_qcounts.qreqFifoLong, amsh_qelemsz.qreqFifoLong); + ptl->qcounts.qreqFifoLong, ptl->qelemsz.qreqFifoLong); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->shortq, - amsh_qcounts.qrepFifoShort, - amsh_qelemsz.qrepFifoShort); + ptl->qcounts.qrepFifoShort, + ptl->qelemsz.qrepFifoShort); am_ctl_qhdr_init(&ptl->self_nodeinfo->qdir.qrepH->longbulkq, - amsh_qcounts.qrepFifoLong, amsh_qelemsz.qrepFifoLong); + ptl->qcounts.qrepFifoLong, ptl->qelemsz.qrepFifoLong); /* Set bulkidx in every bulk packet */ am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qreqFifoLong, - amsh_qelemsz.qreqFifoLong, - amsh_qcounts.qreqFifoLong); + ptl->qelemsz.qreqFifoLong, + ptl->qcounts.qreqFifoLong); am_ctl_bulkpkt_init(ptl->self_nodeinfo->qdir.qrepFifoLong, - amsh_qelemsz.qrepFifoLong, - amsh_qcounts.qrepFifoLong); + ptl->qelemsz.qrepFifoLong, + ptl->qcounts.qrepFifoLong); /* install the old sighandler back */ sigaction(SIGSEGV, &action_stash.SIGSEGV_old_act, NULL); @@ -751,6 +717,7 @@ static psm2_error_t amsh_init_segment(ptl_t *ptl_gen) return err; } +/* unmap our own local shared memory segment (ptl->self_nodeinfo) */ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; @@ -796,7 +763,7 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) } } #endif - if (munmap((void *)shmbase, am_ctl_sizeof_block())) { + if (munmap((void *)shmbase, am_ctl_sizeof_block(ptl))) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, "Error with munmap of shared segment: %s", @@ -815,23 +782,21 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) * updated when a new epaddr is connected to or on every epaddr already * connected to whenever the shared memory segment is relocated via mremap. 
* - * @param epaddr Endpoint address for which to update local directory. + * @param ptl our local endpoint + * @param nodeinfo entry in directory to update + * @param segsz optional expected size of shared memory segment contents + * for sanity check (if 0 check is skipped) */ static -void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) +void am_update_directory(struct am_ctl_nodeinfo *nodeinfo, size_t segsz) { - uintptr_t base_this; - - base_this = nodeinfo->amsh_shmbase + - AMSH_BLOCK_HEADER_SIZE; - /* Request queues */ - nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) base_this; + nodeinfo->qdir.qreqH = (am_ctl_blockhdr_t *) + (nodeinfo->amsh_shmbase + AMSH_BLOCK_HEADER_SIZE); nodeinfo->qdir.qreqFifoShort = (am_pkt_short_t *) ((uintptr_t) nodeinfo->qdir.qreqH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); - nodeinfo->qdir.qreqFifoLong = (am_pkt_bulk_t *) ((uintptr_t) nodeinfo->qdir.qreqFifoShort + nodeinfo->amsh_qsizes.qreqFifoShort); @@ -840,7 +805,6 @@ void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) nodeinfo->qdir.qrepH = (am_ctl_blockhdr_t *) ((uintptr_t) nodeinfo->qdir.qreqFifoLong + nodeinfo->amsh_qsizes.qreqFifoLong); - nodeinfo->qdir.qrepFifoShort = (am_pkt_short_t *) ((uintptr_t) nodeinfo->qdir.qrepH + PSMI_ALIGNUP(sizeof(am_ctl_blockhdr_t), PSMI_PAGESIZE)); @@ -860,17 +824,11 @@ void am_update_directory(struct am_ctl_nodeinfo *nodeinfo) nodeinfo->qdir.qrepFifoLong); /* Sanity check */ - uintptr_t base_next = - (uintptr_t) nodeinfo->qdir.qrepFifoLong + - nodeinfo->amsh_qsizes.qrepFifoLong; - - // this assert can happen if shm Fifo settings inconsistent - // such as 1 rank enabling DSA and another not enabling DSA - if (base_next - base_this > am_ctl_sizeof_block()) { - _HFI_ERROR("Inconsistent shm, Fifo parameters delta=%lu > block=%lu. 
Aborting\n", - (unsigned long)(base_next - base_this), - (unsigned long)am_ctl_sizeof_block()); - psmi_assert_always(base_next - base_this <= am_ctl_sizeof_block()); + uintptr_t delta = am_ctl_sizeof_seg(nodeinfo); + if (segsz && delta != segsz) { + _HFI_ERROR("Inconsistent shm, Fifo parameters delta=%lu != segsz=%lu. Aborting\n", + (unsigned long)delta, (unsigned long) segsz); + psmi_assert_always(delta == segsz); } } @@ -947,7 +905,7 @@ amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t /* other setup */ ptl->am_ep[shmidx].epaddr = epaddr; - am_update_directory(&ptl->am_ep[shmidx]); + am_update_directory(&ptl->am_ep[shmidx], 0); /* Finally, add to table */ if ((err = psm3_epid_add(ptl->ep, epid, epaddr))) goto fail; @@ -990,7 +948,7 @@ amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) ptl->am_ep[shmidx].psm_verno = nodeinfo->psm_verno; ptl->am_ep[shmidx].pid = nodeinfo->pid; ptl->am_ep[shmidx].amsh_qsizes = nodeinfo->amsh_qsizes; - am_update_directory(&ptl->am_ep[shmidx]); + am_update_directory(&ptl->am_ep[shmidx], 0); return; } @@ -1227,7 +1185,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) */ if (((am_epaddr_t *) epaddr)->cstate_incoming == AMSH_CSTATE_INCOMING_DISC_REQUESTED) - err = psm3_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + err = psm3_do_unmap(&ptl->am_ep[shmidx]); req->epid_mask[i] = AMSH_CMASK_POSTREQ; } else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; @@ -1925,9 +1883,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, psm2_handler_t handler, psm2_amarg_t *args, int nargs, const void *src, size_t len, void *dst, int flags)) { -#ifdef PSM_DEBUG struct ptl_am *ptl = (struct ptl_am *)ptl_gen; -#endif uint16_t type; uint32_t bulkidx; uint16_t hidx = (uint16_t) handler; @@ -1952,7 +1908,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, } else { int i; - psmi_assert(len < 
amsh_qelemsz.qreqFifoLong); + psmi_assert(len <= AMLONG_MTU_DEST(ptl, destidx)); psmi_assert(src != NULL || nargs > NSHORT_ARGS); type = AMFMT_SHORT; @@ -1986,6 +1942,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, uint8_t *src_this = (uint8_t *) src; uint8_t *dst_this = (uint8_t *) dst; uint32_t bytes_this; + uint32_t mtu = AMLONG_MTU_DEST(ptl, destidx); #ifdef PSM_DSA int use_dsa = psm3_use_dsa(len); #endif @@ -1996,7 +1953,7 @@ psm3_amsh_generic_inner(uint32_t amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, is_reply ? "rep" : "req", src, dst, (uint32_t) len, hidx); while (bytes_left) { - bytes_this = min(bytes_left, AMLONG_MTU); + bytes_this = min(bytes_left, mtu); AMSH_POLL_UNTIL(ptl_gen, is_reply, (bulkpkt = am_ctl_getslot_long(ptl_gen, @@ -2162,6 +2119,7 @@ psm3_am_reqq_add(int amtype, ptl_t *ptl_gen, psm2_epaddr_t epaddr, ptl->psmi_am_reqq_fifo.lastp = &nreq->next; } +// process inbound packet on our local shm fifos static void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) { @@ -2206,12 +2164,12 @@ void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) bulkptr = (uintptr_t) ptl->self_nodeinfo->qdir. qreqFifoLong; - bulkptr += bulkidx * amsh_qelemsz.qreqFifoLong; + bulkptr += bulkidx * (uintptr_t)ptl->qelemsz.qreqFifoLong; } else { bulkptr = (uintptr_t) ptl->self_nodeinfo->qdir. 
qrepFifoLong; - bulkptr += bulkidx * amsh_qelemsz.qrepFifoLong; + bulkptr += bulkidx * (uintptr_t)ptl->qelemsz.qrepFifoLong; } break; default: @@ -2223,6 +2181,7 @@ void process_packet(ptl_t *ptl_gen, am_pkt_short_t *pkt, int isreq) } bulkpkt = (am_pkt_bulk_t *) bulkptr; + psmi_assert(bulkpkt->len <= AMLONG_MTU_LOCAL(ptl)); _HFI_VDBG("ep=%p mq=%p type=%d bulkidx=%d flag=%d/%d nargs=%d " "from_idx=%d pkt=%p/%p hidx=%d\n", ptl->ep, ptl->ep->mq, pkt->type, bulkidx, pkt->flag, @@ -2459,6 +2418,8 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, { uint32_t bytes_left = len; uint32_t bytes_this = 0; + ptl_t *ptl = epaddr->ptlctl->ptl; + uint32_t mtu = AMLONG_MTU_DEST((struct ptl_am *)ptl, ((am_epaddr_t *) epaddr)->shmidx); psm2_handler_t handler = mq_handler_hidx; @@ -2468,7 +2429,7 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, args[2].u32w0 = 0; psmi_assert(!(flags_user & PSM2_MQ_FLAG_SENDSYNC));// needs rndv - if (len <= AMLONG_MTU) { + if (len <= mtu) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else @@ -2480,15 +2441,15 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, do { args[2].u32w0 += bytes_this; - bytes_this = min(bytes_left, AMLONG_MTU); + bytes_this = min(bytes_left, mtu); /* Assume that shared-memory active messages are delivered in order */ if (flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psm3_am_reqq_add(AMREQUEST_SHORT, epaddr->ptlctl->ptl, + psm3_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, handler, args, 3, (void *)ubuf, bytes_this, NULL, 0); } else { - psm3_amsh_short_request(epaddr->ptlctl->ptl, epaddr, + psm3_amsh_short_request(ptl, epaddr, handler, args, 3, ubuf, bytes_this, 0); } @@ -2657,15 +2618,15 @@ int psm3_epaddr_pid(psm2_epaddr_t epaddr) } #if _HFI_DEBUGGING static -const char *psmi_kassist_getmode(int mode) +const char *psm3_kassist_getmode(int mode) { switch (mode) { - case PSMI_KASSIST_OFF: - return "kassist off"; - case 
PSMI_KASSIST_CMA_GET: - return "cma get"; - case PSMI_KASSIST_CMA_PUT: - return "cma put"; + case PSM3_KASSIST_OFF: + return "none"; + case PSM3_KASSIST_CMA_GET: + return "cma-get"; + case PSM3_KASSIST_CMA_PUT: + return "cma-put"; default: return "unknown"; } @@ -2673,10 +2634,21 @@ const char *psmi_kassist_getmode(int mode) #endif static -int psm3_get_kassist_mode() +int psm3_get_kassist_mode(int first_ep) { - /* Cuda PSM2 supports only KASSIST_CMA_GET */ - int mode = PSMI_KASSIST_CMA_GET; + /* GPU supports only KASSIST_CMA_GET or NONE */ + int mode = (first_ep?PSM3_KASSIST_MODE_DEFAULT:PSM3_KASSIST_OFF); +#ifdef PSM_FI + if_pf(PSM3_FAULTINJ_ENABLED()) { + PSM3_FAULTINJ_STATIC_DECL(fi_cma_notavail, "cma_notavail", + "CMA not available", + 1, SHM_FAULTINJ_CMA_NOTAVAIL); + if (PSM3_FAULTINJ_IS_FAULT(fi_cma_notavail, NULL, "")) + return PSM3_KASSIST_OFF; + } +#endif + if (! psm3_cma_available()) + return PSM3_KASSIST_OFF; #ifdef PSM_DSA // dsa_available is determined during psm3_init(), while kassist is // not checked until a shm ep is being opened. So dsa_available is @@ -2686,7 +2658,7 @@ int psm3_get_kassist_mode() // where kassist applies, so we must turn it off so DSA can // do the copies for all rndv shm messages if (psm3_dsa_available()) - return PSMI_KASSIST_OFF; + return PSM3_KASSIST_OFF; #endif union psmi_envvar_val env_kassist; @@ -2707,22 +2679,23 @@ int psm3_get_kassist_mode() #endif PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) - PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { + (first_ep?PSM3_KASSIST_MODE_DEFAULT_STRING:"none"), + &env_kassist)) { char *s = env_kassist.e_str; if ( #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ! 
PSMI_IS_GPU_ENABLED && #endif strcasecmp(s, "cma-put") == 0) - mode = PSMI_KASSIST_CMA_PUT; + mode = PSM3_KASSIST_CMA_PUT; else if (strcasecmp(s, "cma-get") == 0) - mode = PSMI_KASSIST_CMA_GET; + mode = PSM3_KASSIST_CMA_GET; else if (strcasecmp(s, "none") == 0) - mode = PSMI_KASSIST_OFF; + mode = PSM3_KASSIST_OFF; else { _HFI_INFO("Invalid value for PSM3_KASSIST_MODE ('%s') %-40s Using: cma-get\n", s, PSM3_KASSIST_MODE_HELP); - mode = PSMI_KASSIST_CMA_GET; + mode = PSM3_KASSIST_CMA_GET; } } return mode; @@ -2792,7 +2765,6 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, "Fatal error in " "connecting to shm segment"); } - am_update_directory(&ptl->am_ep[shmidx]); tok->shmidx = shmidx; } @@ -2890,7 +2862,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, */ cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_DISC_REQUESTED) { - err = psm3_do_unmap(ptl->am_ep[shmidx].amsh_shmbase); + err = psm3_do_unmap(&ptl->am_ep[shmidx]); psm3_epid_remove(epaddr->ptlctl->ep, epaddr->epid); } } @@ -2934,54 +2906,91 @@ psm3_amsh_am_get_parameters(psm2_ep_t ep, struct psm2_am_parameters *parameters) parameters->max_handlers = PSMI_AM_NUM_HANDLERS; parameters->max_nargs = PSMI_AM_MAX_ARGS; - parameters->max_request_short = AMLONG_MTU; - parameters->max_reply_short = AMLONG_MTU; + // we have not yet connected to our peers. If we are certain multi-ep + // is not going to be used, we can report our local MTU. + // Otherwise, to be safe we must report our smallest valid MTU. 
+ // This value is only used in psmx3 to indicate the max atomic size + // so a modest value is acceptable as most apps (such as intelSHMEM) + // will only do atomics on a single data item of <= 128 bits + if (psm3_multi_ep_enabled) { + parameters->max_request_short = AMLONG_PAYLOAD(AMLONG_SZ_MIN); + parameters->max_reply_short = AMLONG_PAYLOAD(AMLONG_SZ_MIN); + } else { + parameters->max_request_short = + AMLONG_MTU_LOCAL((struct ptl_am *)(ep->ptl_amsh.ptl)); + parameters->max_reply_short = + AMLONG_MTU_LOCAL((struct ptl_am *)(ep->ptl_amsh.ptl)); + } return PSM2_OK; } -static void amsh_fifo_getconfig() +// for multi-ep, we use different defaults for the additional EPs +// to avoid serialization within CMA +static void amsh_fifo_getconfig(struct ptl_am *ptl) { union psmi_envvar_val env_var; + // defaults + ptl->qcounts.qreqFifoShort = AMSHORT_Q_NO_DSA; + ptl->qcounts.qreqFifoLong = AMLONG_Q_NO_DSA; + ptl->qcounts.qrepFifoShort = AMSHORT_Q_NO_DSA; + ptl->qcounts.qrepFifoLong = AMLONG_Q_NO_DSA; + + ptl->qelemsz.qreqFifoShort = sizeof(am_pkt_short_t); + ptl->qelemsz.qreqFifoLong = AMLONG_SZ_NO_DSA; + ptl->qelemsz.qrepFifoShort = sizeof(am_pkt_short_t); + ptl->qelemsz.qrepFifoLong = AMLONG_SZ_NO_DSA; + #ifdef PSM_DSA if (psm3_dsa_available()) { // adjust defaults - amsh_qcounts.qreqFifoShort = AMSHORT_Q_DSA; - amsh_qcounts.qrepFifoShort = AMSHORT_Q_DSA; - amsh_qcounts.qreqFifoLong = AMLONG_Q_DSA; - amsh_qcounts.qrepFifoLong = AMLONG_Q_DSA; - amsh_qelemsz.qreqFifoLong = AMLONG_SZ_DSA; - amsh_qelemsz.qrepFifoLong = AMLONG_SZ_DSA; - } + ptl->qcounts.qreqFifoShort = AMSHORT_Q_DSA; + ptl->qcounts.qrepFifoShort = AMSHORT_Q_DSA; + ptl->qcounts.qreqFifoLong = AMLONG_Q_DSA; + ptl->qcounts.qrepFifoLong = AMLONG_Q_DSA; + + ptl->qelemsz.qreqFifoLong = AMLONG_SZ_DSA; + ptl->qelemsz.qrepFifoLong = AMLONG_SZ_DSA; + } else #endif + if (ptl->kassist_mode == PSM3_KASSIST_OFF + && psm3_get_mylocalrank_count() > 1 + && psm3_get_mylocalrank_count() <= 16) { + // adjust defaults for large 
message AI workloads + ptl->qelemsz.qreqFifoLong = AMLONG_SZ_MULTIEP; + ptl->qelemsz.qrepFifoLong = AMLONG_SZ_MULTIEP; + } psm3_getenv("PSM3_SHM_SHORT_Q_DEPTH", "Number of entries on shm undirectional short msg fifos", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)amsh_qcounts.qreqFifoShort, &env_var); - amsh_qcounts.qreqFifoShort = env_var.e_uint; - amsh_qcounts.qrepFifoShort = env_var.e_uint; + (union psmi_envvar_val)ptl->qcounts.qreqFifoShort, &env_var); + ptl->qcounts.qreqFifoShort = env_var.e_uint; + ptl->qcounts.qrepFifoShort = env_var.e_uint; psm3_getenv("PSM3_SHM_LONG_Q_DEPTH", "Number of entries on shm undirectional long msg fifos", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)amsh_qcounts.qreqFifoLong, &env_var); - amsh_qcounts.qreqFifoLong = env_var.e_uint; - amsh_qcounts.qrepFifoLong = env_var.e_uint; + (union psmi_envvar_val)ptl->qcounts.qreqFifoLong, &env_var); + ptl->qcounts.qreqFifoLong = env_var.e_uint; + ptl->qcounts.qrepFifoLong = env_var.e_uint; // PSM3_SHM_SHORT_MTU - untunable at sizeof(am_pkt_short_t) - psm3_getenv("PSM3_SHM_LONG_MTU", - "Size of buffers on shm undirectional long msg fifos", + psm3_getenv_range("PSM3_SHM_LONG_MTU", + "Size of buffers on shm undirectional long msg fifos", NULL, PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)amsh_qelemsz.qreqFifoLong, &env_var); - amsh_qelemsz.qreqFifoLong = env_var.e_uint; - amsh_qelemsz.qrepFifoLong = env_var.e_uint; + (union psmi_envvar_val)ptl->qelemsz.qreqFifoLong, + (union psmi_envvar_val)AMLONG_SZ_MIN, + (union psmi_envvar_val)AMLONG_SZ_MAX, + NULL, NULL, &env_var); + ptl->qelemsz.qreqFifoLong = env_var.e_uint; + ptl->qelemsz.qrepFifoLong = env_var.e_uint; _HFI_PRDBG("shm Q Short: %u of %u bytes, Long: %u of %u bytes\n", - amsh_qcounts.qreqFifoShort, amsh_qelemsz.qreqFifoShort, - amsh_qcounts.qreqFifoLong, amsh_qelemsz.qrepFifoLong); + ptl->qcounts.qreqFifoShort, ptl->qelemsz.qreqFifoShort, + 
ptl->qcounts.qreqFifoLong, ptl->qelemsz.qrepFifoLong); } /** @@ -2996,6 +3005,7 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; + int first_ep = (psm3_opened_endpoint_count == 0); /* Preconditions */ psmi_assert_always(ep != NULL); @@ -3011,8 +3021,14 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) ptl->connect_phase = 0; ptl->connect_incoming = 0; ptl->connect_outgoing = 0; + /* Get which kassist mode to use. */ + ptl->kassist_mode = psm3_get_kassist_mode(first_ep); + + _HFI_PRDBG("kassist_mode %d %s\n", + ptl->kassist_mode, + psm3_kassist_getmode(ptl->kassist_mode)); - amsh_fifo_getconfig(); + amsh_fifo_getconfig(ptl); #ifdef PSM_ONEAPI #ifndef PSM_HAVE_PIDFD @@ -3046,21 +3062,8 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) goto fail; ptl->self_nodeinfo->psm_verno = PSMI_VERNO; - if (ptl->psmi_kassist_mode != PSMI_KASSIST_OFF) { - if (psm3_cma_available()) { - ptl->self_nodeinfo->amsh_features |= - AMSH_HAVE_CMA; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_CMA; - } else { - ptl->psmi_kassist_mode = - PSMI_KASSIST_OFF; - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; - } - } else { - psm3_shm_mq_rv_thresh = - PSMI_MQ_RV_THRESH_NO_KASSIST; + if (ptl->kassist_mode != PSM3_KASSIST_OFF) { + ptl->self_nodeinfo->amsh_features |= AMSH_HAVE_CMA; } ptl->self_nodeinfo->pid = getpid(); ptl->self_nodeinfo->epid = ep->epid; diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 203b9512c3a..0796dbee9e9 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -132,14 +132,14 @@ typedef struct psmi_handlertab { #define PSMI_AM_DISC_REQ 3 #define PSMI_AM_DISC_REP 4 -#define PSMI_KASSIST_OFF 0x0 -#define PSMI_KASSIST_CMA_GET 0x1 -#define PSMI_KASSIST_CMA_PUT 0x2 +#define PSM3_KASSIST_OFF 0x0 +#define PSM3_KASSIST_CMA_GET 0x1 +#define PSM3_KASSIST_CMA_PUT 0x2 -#define 
PSMI_KASSIST_CMA 0x3 -#define PSMI_KASSIST_GET 0x1 -#define PSMI_KASSIST_PUT 0x2 -#define PSMI_KASSIST_MASK 0x3 +#define PSM3_KASSIST_CMA 0x3 +#define PSM3_KASSIST_GET 0x1 +#define PSM3_KASSIST_PUT 0x2 +#define PSM3_KASSIST_MASK 0x3 int psm3_epaddr_pid(psm2_epaddr_t epaddr); @@ -404,7 +404,7 @@ struct amsh_qdirectory { * Shared fifo element counts and sizes ****************************************** * These values are context-wide, they can only be set early on and can't be * - * modified at runtime. All endpoints are expected to use the same values. + * modified at runtime. Each endpoint could potentially use different values. */ typedef struct amsh_qinfo { @@ -424,6 +424,10 @@ struct amsh_qinfo { * * This structure is carefully arranged to optimize cache locality and * performance. Do not modify without careful and thorough analysis. + * + * In addition to the copies in ptl_am.am_ep and ptl_am.self_nodeinfo + * this is also placed at the beginning of the shared memory segment so + * our peers can get info about our version, epid, qsizes, features, etc */ struct am_ctl_nodeinfo { uint16_t psm_verno; @@ -433,7 +437,7 @@ struct am_ctl_nodeinfo { psm2_epaddr_t epaddr; uintptr_t amsh_shmbase; amsh_qinfo_t amsh_qsizes; - uint32_t amsh_features; + volatile uint32_t amsh_features; struct amsh_qdirectory qdir; } __attribute__((aligned(64))); @@ -450,7 +454,7 @@ struct ptl_am { int zero_polls; int amsh_only_polls; int max_ep_idx, am_ep_size; - int psmi_kassist_mode; + int kassist_mode; char *amsh_keyname; /* These three items carefully picked to fit in one cache line. 
*/ @@ -460,8 +464,8 @@ struct ptl_am { am_pkt_short_t amsh_empty_shortpkt; - struct am_ctl_nodeinfo *self_nodeinfo; - struct am_ctl_nodeinfo *am_ep; + struct am_ctl_nodeinfo *self_nodeinfo; /* our local advertized shm */ + struct am_ctl_nodeinfo *am_ep; /* local array w/copy of each peer's info */ #ifdef PSM_CUDA am_cuda_memhandle_cache_t memhandle_cache; #endif @@ -472,6 +476,9 @@ struct ptl_am { #define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) void *gpu_bounce_buf; // for H to D #endif + // qcounts and qelemsz tunable via amsh_fifo_getconfig() + amsh_qinfo_t qcounts; + amsh_qinfo_t qelemsz; } __attribute__((aligned(64))); #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index 8a38d22ad4d..a6af3c356ac 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -66,6 +66,24 @@ #include "am_oneapi_memhandle_cache.h" #endif +#ifdef PSM_FI +/* + * fault injection for psm3_cma_get() and psm3_cma_put(). + * since the reaction to cma faults is for the given endpoint to stop + * using CMA, this should be set to be quite rare and only 1 fault per + * endpoint can occur, then the endpoint stops using CMA altogether + */ +PSMI_ALWAYS_INLINE(int cma_do_fault(psm2_ep_t ep)) +{ + if_pf(PSM3_FAULTINJ_ENABLED()) { + PSM3_FAULTINJ_STATIC_DECL(fi, "cma_err", "CMA failure", + 0, SHM_FAULTINJ_CMA_ERR); + return PSM3_FAULTINJ_IS_FAULT(fi, ep, ""); + } else + return 0; +} +#endif + /* not reported yet, so just track in a global so can pass a pointer to * psm3_mq_handle_envelope and psm3_mq_handle_rts */ @@ -153,7 +171,8 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, } #endif - if ((ptl->psmi_kassist_mode & PSMI_KASSIST_GET) + // since we will do the cma_get, can decide based on local config of ptl + if ((ptl->kassist_mode & PSM3_KASSIST_GET) && req->req_data.recv_msglen > 0 && (pid = psm3_epaddr_pid(epaddr))) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -167,10 +186,18 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int 
was_posted, if (!ptl->gpu_bounce_buf) PSM3_GPU_HOST_ALLOC(&ptl->gpu_bounce_buf, AMSH_GPU_BOUNCE_BUF_SZ); while (cnt < req->req_data.recv_msglen) { + size_t res; size_t nbytes = min(req->req_data.recv_msglen-cnt, AMSH_GPU_BOUNCE_BUF_SZ); - size_t res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), +#ifdef PSM_FI + if_pf(cma_do_fault(ptl->ep)) + res = -1; + else +#endif + res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), ptl->gpu_bounce_buf, nbytes); + if (res == -1) + goto fail_cma; void *buf; psmi_assert_always(nbytes == res); if (PSMI_USE_GDR_COPY_RECV(nbytes) @@ -191,35 +218,42 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, PSM3_GPU_SYNCHRONIZE_MEMCPY(); } else { /* cma can be done in handler context or not. */ - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, + size_t nbytes; +#ifdef PSM_FI + if_pf(cma_do_fault(ptl->ep)) + nbytes = -1; + else +#endif + nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, req->req_data.buf, req->req_data.recv_msglen); + if (nbytes == -1) + goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); } #else /* cma can be done in handler context or not. */ - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, + size_t nbytes; +#ifdef PSM_FI + if_pf(cma_do_fault(ptl->ep)) + nbytes = -1; + else +#endif + nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, req->req_data.buf, req->req_data.recv_msglen); - if (nbytes == -1) { - ptl->psmi_kassist_mode = PSMI_KASSIST_OFF; - _HFI_ERROR("Reading from remote process' memory failed. 
Disabling CMA support\n"); - } - else { - psmi_assert_always(nbytes == req->req_data.recv_msglen); - cma_succeed = 1; - } + if (nbytes == -1) + goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); #endif + cma_succeed = 1; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) send_cts: -#endif args[0].u64w0 = (uint64_t) (uintptr_t) req->ptl_req_ptr; args[1].u64w0 = (uint64_t) (uintptr_t) req; args[2].u64w0 = (uint64_t) (uintptr_t) req->req_data.buf; args[3].u32w0 = req->req_data.recv_msglen; args[3].u32w1 = tok != NULL ? 1 : 0; - args[4].u32w0 = ptl->psmi_kassist_mode; // pass current kassist mode to the peer process + args[4].u32w0 = ptl->kassist_mode; // pass current kassist mode to the peer process if (tok != NULL) { psm3_am_reqq_add(AMREQUEST_SHORT, tok->ptl, @@ -235,12 +269,18 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, req->mq->stats.rx_shm_bytes += req->req_data.recv_msglen; /* 0-byte completion or we used kassist */ - if (pid || cma_succeed || + if (cma_succeed || req->req_data.recv_msglen == 0 || gpu_ipc_send_completion == 1) { psm3_mq_handle_rts_complete(req); } PSM2_LOG_MSG("leaving."); return PSM2_OK; + +fail_cma: + ptl->kassist_mode = PSM3_KASSIST_OFF; + ptl->self_nodeinfo->amsh_features &= ~AMSH_HAVE_CMA; + _HFI_ERROR("Reading from remote process' memory failed. Disabling CMA support\n"); + goto send_cts; } static @@ -417,22 +457,25 @@ psm3_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, if (msglen > 0) { rarg[0].u64w0 = args[1].u64w0; /* rreq */ - int kassist_mode = ((struct ptl_am *)ptl)->psmi_kassist_mode; + int kassist_mode = ((struct ptl_am *)ptl)->kassist_mode; int kassist_mode_peer = args[4].u32w0; - // In general, peer process(es) shall have the same kassist mode set, - // but due to dynamic CMA failure detection, we must align local and remote state, - // and make protocol to adopt to that potential change. 
- if (kassist_mode_peer == PSMI_KASSIST_OFF && (kassist_mode & PSMI_KASSIST_MASK)) { - ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; - goto no_kassist; - } - if (kassist_mode & PSMI_KASSIST_PUT) { + if (kassist_mode_peer & PSM3_KASSIST_GET) { + // peer did cma_get(), nothing for us to do + } else if (kassist_mode & PSM3_KASSIST_PUT) { + // we can do cma_put() int pid = psm3_epaddr_pid(tok->tok.epaddr_incoming); - size_t nbytes = psm3_cma_put(sreq->req_data.buf, pid, dest, msglen); + size_t nbytes; +#ifdef PSM_FI + if_pf(cma_do_fault(((struct ptl_am *)ptl)->ep)) + nbytes = -1; + else +#endif + nbytes = psm3_cma_put(sreq->req_data.buf, pid, dest, msglen); if (nbytes == -1) { _HFI_ERROR("Writing to remote process' memory failed. Disabling CMA support\n"); - ((struct ptl_am *)ptl)->psmi_kassist_mode = PSMI_KASSIST_OFF; + ((struct ptl_am *)ptl)->kassist_mode = PSM3_KASSIST_OFF; + ((struct ptl_am *)ptl)->self_nodeinfo->amsh_features &= ~AMSH_HAVE_CMA; goto no_kassist; } @@ -441,8 +484,8 @@ psm3_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, /* Send response that PUT is complete */ psm3_amsh_short_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, NULL, 0, 0); - } else if (!(kassist_mode & PSMI_KASSIST_MASK)) { - /* Only transfer if kassist is off, i.e. neither GET nor PUT. 
*/ + } else { + /* Only transfer if peer didn't do GET and we didn't do PUT */ no_kassist: psm3_amsh_long_reply(tok, mq_handler_rtsdone_hidx, rarg, 1, sreq->req_data.buf, msglen, dest, 0); diff --git a/prov/psm3/psm3/ptl_am/ptl_fwd.h b/prov/psm3/psm3/ptl_am/ptl_fwd.h index 85593aad847..09588cdda03 100644 --- a/prov/psm3/psm3/ptl_am/ptl_fwd.h +++ b/prov/psm3/psm3/ptl_am/ptl_fwd.h @@ -59,7 +59,4 @@ /* Symbol in am ptl */ extern struct ptl_ctl_init psm3_ptl_amsh; -extern int psm3_shm_mq_rv_thresh; -extern int psm3_shm_mq_gpu_rv_thresh; - #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_config.h b/prov/psm3/psm3/ptl_ips/ips_config.h index 6eb9db5ceaf..1a253aa4a23 100644 --- a/prov/psm3/psm3/ptl_ips/ips_config.h +++ b/prov/psm3/psm3/ptl_ips/ips_config.h @@ -65,6 +65,10 @@ #define DF_OPP_LIBRARY "libopasadb.so.1.0.0" #define DATA_VFABRIC_OFFSET 8 +#define IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT 32 +#define IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT 128 +#define IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT 16 + /* Send retransmission */ #define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 2bdd85a309c..221706ade25 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -105,8 +105,6 @@ struct ips_protoexp { psm_transfer_type_t ctrl_xfer_type; struct ips_scbctrl tid_scbc_rv; // pool of SCBs for TID sends - // for OPA this includes: TIDEXP, CTS, - // EXPTID_COMPLETION // For UD: CTS, ERR_CHK_RDMA, // ERR_CHK_RDMA_RESP mpool_t tid_desc_send_pool; diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/prov/psm3/psm3/ptl_ips/ips_path_rec.h index 17fa819a396..6ef9e5820b2 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.h +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.h @@ -67,18 +67,6 @@ /* Default size of path group hash table */ #define DF_PATH_GRP_HASH_SIZE 255 -/* Default size of CCT table. 
Must be multiple of 64 */ -#define DF_CCT_TABLE_SIZE 128 - -/* CCT max IPD delay. */ -#define DF_CCT_MAX_IPD_DELAY_US 21 - -/* CCA divisor shift */ -#define CCA_DIVISOR_SHIFT 14 - -/* CCA ipd mask */ -#define CCA_IPD_MASK 0x3FFF - /* A lot of these are IBTA specific defines that are available in other header * files. To minimize dependencies with PSM build process they are listed * here. Most of this is used to implement IBTA compliance features with PSM diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index f6c9c215bcb..372dd75ea56 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -81,6 +81,11 @@ #define CTRL_MSG_DISCONNECT_REQUEST_QUEUED 0x0080 #define CTRL_MSG_DISCONNECT_REPLY_QUEUED 0x0100 +#define CREDITS_INC_THRESH 2048 +// we are using 31 bits psn, and int16_t for psn diff on nak detection +// to play safe we set max credit to 16384 +#define IPS_MAX_CREDIT 16384 + #if defined(PSM_CUDA) || defined(PSM_ONEAPI) uint32_t gpudirect_rdma_send_limit; uint32_t gpudirect_rdma_recv_limit; @@ -106,6 +111,59 @@ void psmi_gpu_hostbuf_alloc_func(int is_alloc, void *context, void *obj) } #endif /* PSM_CUDA || PSM_ONEAPI */ +static int parse_flow_credits(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ntup = psm3_count_tuples(str); + int ret = psm3_parse_str_tuples(str, ntup, tvals); + if (ret < 0) + return ret; + // back compatibility - when only one value specified, set max=min, step=0 + // this also can make value check to be accurate + if (ntup == 1) { + tvals[1] = tvals[0]; + tvals[2] = 0; + } + if (tvals[0] < 0 || tvals[1] < 0 || tvals[2] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] > IPS_MAX_CREDIT || tvals[1] > IPS_MAX_CREDIT || tvals[2] > IPS_MAX_CREDIT) { + if (errstr_size) + snprintf(errstr, errstr_size, " Max allowed is %u", IPS_MAX_CREDIT); + return -2; + 
} + if (tvals[0] == 0 || tvals[1] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed on min, max"); + return -2; + } + if (tvals[1] > tvals[0] && tvals[2] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed on adjust when max > min"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min (%d) must be <= max (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_flow_credits(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2]}; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_flow_credits(val.e_str, errstr_size, errstr, tvals); +} + psm2_error_t psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, int num_of_send_bufs, int num_of_send_desc, uint32_t imm_size, @@ -133,16 +191,58 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, { /* Number of credits per flow */ union psmi_envvar_val env_flow_credits; + int tvals[3] = { + min(IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT, num_of_send_desc), + min(IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT, num_of_send_desc), + IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT + }; + char fcredits_def[32]; + snprintf(fcredits_def, sizeof(fcredits_def), "%d:%d:%d", tvals[0], tvals[1], tvals[2]); + + (void)psm3_getenv_range("PSM3_FLOW_CREDITS", + "Number of unacked packets (credits) per flow in ", + "Specified as min:max:adjust where min and max is the range of credits,\n" + "and adjust is the adjustment amount for adjusting credits", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, + (union psmi_envvar_val)fcredits_def, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_flow_credits, tvals, + &env_flow_credits); + if (parse_flow_credits(env_flow_credits.e_str, 0, NULL, tvals) < 0) { + // 
already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + if (tvals[0] > num_of_send_desc) { + tvals[0] = num_of_send_desc; + } + if (tvals[1] > num_of_send_desc) { + tvals[1] = num_of_send_desc; + } + + // set init flow credits. Use PSM2_FLOW_CREDITS when possible int df_flow_credits = min(PSM2_FLOW_CREDITS, num_of_send_desc); + if (df_flow_credits > tvals[0] && df_flow_credits < tvals[1]) { + proto->flow_credits = df_flow_credits; + } else { + proto->flow_credits = (tvals[0] + tvals[1]) / 2; + } + proto->min_credits = tvals[0]; + proto->max_credits = tvals[1]; + proto->credits_adjust = tvals[2]; + } - psm3_getenv("PSM3_FLOW_CREDITS", - "Number of unacked packets (credits) per flow (default is 64)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)df_flow_credits, - &env_flow_credits); - proto->flow_credits = env_flow_credits.e_uint; + { + union psmi_envvar_val env_thresh; + psm3_getenv_range("PSM3_CREDITS_INC_THRESH", + "Threshold for increasing credits", NULL, + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)CREDITS_INC_THRESH, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT16_MAX, + NULL, NULL, &env_thresh); + proto->credits_inc_thresh = env_thresh.e_uint; } + /* * Checksum packets within PSM. Default is off. * This is heavy weight and done in software so not recommended for @@ -197,7 +297,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->multirail_thresh_load_balance = env_thresh_load_balance.e_uint; } - /* Initialize IBTA related stuff (path record, SL2VL, CCA etc.) */ + /* Initialize IBTA related stuff (path record, etc.) */ if ((err = psm3_ips_ibta_init(proto))) goto fail; @@ -233,9 +333,6 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->flags |= IPS_PROTO_FLAG_COALESCE_ACKS; } - /* - * Initialize SDMA, otherwise, turn on all PIO. 
- */ // initialize sdma after PSM3_MR_CACHE_MODE proto->flags |= IPS_PROTO_FLAG_SPIO; @@ -316,13 +413,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, * If we enable tid-based expected rendezvous, the expected protocol code * handles its own rv scb buffers. If not, we have to enable eager-based * rendezvous and we allocate scb buffers for it. - * For UD PSM3_RDMA (ep->rdmamode) controls our use of RDMA for Rendezvous - * For STL100 PSM3_TID controls use of EXPTID for Rendezvous + * For verbs PSM3_RDMA (ep->rdmamode) controls our use of RDMA for Rendezvous */ protoexp_flags = proto->ep->rdmamode; // PSM3_RDMA - // protoexp implements RDMA for UD and TID for STL100 native. N/A to UDP - // when proto->protoexp is NULL, we will not attempt to use TID nor RDMA + // protoexp implements RDMA for verbs. N/A to sockets + // when proto->protoexp is NULL, we will not attempt to use RDMA { (void)protoexp_flags; // for UD, even when RDMA is enabled, we may fall back to LONG_DATA @@ -594,7 +690,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, ((uint64_t)env_mr_cache_size_mb.e_uint * (1024*1024)) / max(psm3_mq_max_window_rv(proto->mq, 0)/2, - proto->mq->hfi_thresh_rv)); + proto->mq->rndv_nic_thresh)); } else { // only send DMA, size based on smaller MRs default_cache_entries = max(default_cache_entries, @@ -692,7 +788,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) _HFI_DBG("GDR Copy: %d limit send=%u recv=%u gpu_rndv=%u GPU RDMA flags=0x%x limit send=%u recv=%u\n", is_gdr_copy_enabled, gdr_copy_limit_send, gdr_copy_limit_recv, - gpu_thresh_rndv, + psm3_gpu_thresh_rndv, proto->flags & (IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV |IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND), gpudirect_rdma_send_limit, gpudirect_rdma_recv_limit); @@ -946,7 +1042,7 @@ proto_sdma_init(struct ips_proto *proto) if (! 
is_gpudirect_enabled || !psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_SDMA)) env_sdma.e_uint = 0; - else + else psm3_getenv("PSM3_GPUDIRECT_SDMA", "UD GPU send dma flags (0 disables send dma, 1 enables), default 1", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, @@ -1449,6 +1545,7 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) // immediately ack the msg struct ips_scb_unackedq *unackedq = &flow->scb_unacked; flow->xmit_ack_num.psn_num = 1 + (__be32_to_cpu(scb->ips_lrh.bth[2]) & proto->psn_mask); + flow->xmit_ack_num.psn_num &= proto->psn_mask; psmi_assert(scb == STAILQ_FIRST(unackedq)); STAILQ_REMOVE_HEAD(unackedq, nextq); @@ -1517,6 +1614,22 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) #else proto->stats.pio_no_flow_credits++; #endif + if (flow->credits <= 0) { +// _HFI_VDBG("flow=%p next=%d first_os=%d delta=%d\n", flow, +// flow->xmit_seq_num.psn_num, flow->credits_inc_psn, +// flow->xmit_seq_num.psn_num - flow->credits_inc_psn); + if (flow->max_credits < proto->max_credits && !between(flow->credits_inc_psn, + (flow->credits_inc_psn + proto->credits_inc_thresh) & proto->psn_mask, + flow->xmit_seq_num.psn_num)) { + // adjust with a small "random" number to avoid potential oscillation + uint16_t actual_adjust = min(proto->credits_adjust + (flow->xmit_seq_num.psn_num & 0xF), + proto->max_credits - flow->max_credits); + flow->max_credits += actual_adjust; + flow->credits += actual_adjust; + flow->credits_inc_psn = flow->xmit_seq_num.psn_num; + _HFI_VDBG("Increased flow (%p) credits to %d\n", flow, flow->max_credits); + } + } psmi_timer_request(proto->timerq, flow->timer_send, get_cycles() + proto->timeout_send); } @@ -1620,9 +1733,8 @@ psm3_ips_proto_timer_ack_callback(struct psmi_timer *current_timer, scb->abs_timeout = t_cyc_next + scb->ack_timeout; if (done_local) { _HFI_VDBG - ("sending err_chk flow=%d with first=%d,last=%d\n", - flow->flowid, - STAILQ_FIRST(&flow->scb_unacked)->seq_num.psn_num, + ("sending 
err_chk flow=%p with first=%d, last=%d\n", + flow, scb->seq_num.psn_num, STAILQ_LAST(&flow->scb_unacked, ips_scb, nextq)->seq_num.psn_num); #ifdef PSM_BYTE_FLOW_CREDITS @@ -1639,23 +1751,29 @@ psm3_ips_proto_timer_ack_callback(struct psmi_timer *current_timer, flow->xmit_seq_num : SLIST_FIRST(&flow->scb_pend)->seq_num; - if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { - // for UD we use RC QP instead of STL100's TIDFLOW HW - // UDP has no RDMA - psmi_assert_always(0); // we don't allocate ips_flow for TID - message_type = OPCODE_ERR_CHK; // keep KlockWorks happy - } else { - PSM2_LOG_MSG("sending ERR_CHK message"); - message_type = OPCODE_ERR_CHK; - err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) + PSM2_LOG_MSG("sending ERR_CHK message"); + message_type = OPCODE_ERR_CHK; + err_chk_seq.psn_num = (err_chk_seq.psn_num - 1) & proto->psn_mask; - } ctrlscb.ips_lrh.bth[2] = __cpu_to_be32(err_chk_seq.psn_num); psm3_ips_proto_send_ctrl_message(flow, message_type, &flow->ipsaddr->ctrl_msg_queued, &ctrlscb, ctrlscb.cksum, 0); + flow->credits_inc_psn = scb->seq_num.psn_num; + // decrease flow credits + if (flow->max_credits > proto->min_credits) { + uint16_t actual_adjust = min(proto->credits_adjust + (flow->xmit_seq_num.psn_num & 0xF), + flow->max_credits - proto->min_credits); + flow->max_credits -= actual_adjust; + if (flow->credits > actual_adjust) { + flow->credits -= actual_adjust; + } else { + flow->credits = 0; + } + _HFI_VDBG("Decreased flow (%p) credits to %d\n", flow, flow->max_credits); + } } t_cyc_next = get_cycles() + scb->ack_timeout; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index 9c1b920f075..47bf7a50c1d 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -59,13 +59,13 @@ #include "ips_config.h" #include "psm_user.h" -#include "ips_tid.h" #include "ips_recvhdrq.h" #include "ips_epstate.h" #include "ips_proto_am.h" #include "ips_tidflow.h" #include "ips_path_rec.h" +#if 
defined(PSM_SOCKETS) && defined(USE_UDP) // when defined, this enables use of byte based flow credits in addition // to packet based. // It can help UDP to avoid overflowing the sockets kernel buffers. @@ -73,6 +73,7 @@ // memory at scale. // UD/RC, TCP and OPA HALs self configure so this has no effect #define PSM_BYTE_FLOW_CREDITS +#endif typedef enum ips_path_type { IPS_PATH_LOW_PRIORITY, @@ -328,7 +329,6 @@ typedef enum psm_transfer_type { typedef enum psm_protocol_type { PSM_PROTOCOL_GO_BACK_N = 0, - PSM_PROTOCOL_TIDFLOW, PSM_PROTOCOL_LAST /* Keep this the last protocol type */ } psm_protocol_type_t; @@ -369,6 +369,10 @@ struct ips_proto { #ifdef PSM_BYTE_FLOW_CREDITS uint32_t flow_credit_bytes; // credit limit in bytes #endif + uint16_t min_credits; // min credits + uint16_t max_credits; // max credits + uint16_t credits_adjust; // credit adjusting amount + uint16_t credits_inc_thresh; // credit increase threshold mpool_t pend_sends_pool; struct ips_ibta_compliance_fn ibta; struct ips_proto_stats stats; @@ -510,8 +514,6 @@ struct ips_flow { uint16_t protocol:3; /* go-back-n or tidflow */ uint16_t flags:8; /* flow state flags */ - // TBD - cwin only needed for OPA for CCA - uint16_t cwin; /* Size of congestion window in packets */ // to allow for good pipelining of send/ACK need to trigger an ack at // least every ack_interval packets (roughy flow_credits/4) or every // ack_inteval_bytes bytes (roughly flow_credit_bytes/4) whichever @@ -537,12 +539,14 @@ struct ips_flow { // For UDP, sockets has byte oriented buffering so we need to // impose a credit_bytes limit to allow sufficient pkt credits // but avoid sockets buffer overflow and recv side discards/flow control - int16_t credits; /* Current credits available to send on flow */ + int16_t credits; /* Current credits available to send on flow */ + uint16_t max_credits; /* credits limit */ + uint32_t credits_inc_psn; /* the reference pkt psn used for increasing credit. 
We increase */ + /* credit if current psn - credits_inc_psn > credit_inc_thresh */ #ifdef PSM_BYTE_FLOW_CREDITS int32_t credit_bytes; /* Current credit bytes avail to send on flow */ #endif uint32_t ack_index; /* Index of the last ACK message type in pending message queue */ - psmi_seqnum_t xmit_seq_num; /* next psn for xmit */ psmi_seqnum_t xmit_ack_num; /* last xmited psn acked + 1 */ psmi_seqnum_t recv_seq_num; /* next psn expect to recv */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_connect.c b/prov/psm3/psm3/ptl_ips/ips_proto_connect.c index fd729dc7d9c..024ecf13ef9 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_connect.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_connect.c @@ -317,10 +317,11 @@ ips_ipsaddr_set_req_params(struct ips_proto *proto, ipsaddr->connidx_outgoing = req->hdr.connidx; ipsaddr->runid_key = req->runid_key; /* ipsaddr->initpsn = req->initpsn; */ - _HFI_CONNDBG("%s -> %s: connidx_incoming=%u connidx_outgoing=%u\n", + _HFI_CONNDBG("%s -> %s: connidx_incoming=%u connidx_outgoing=%u flow=%p\n", psm3_epid_fmt_internal(proto->ep->epid, 0), psm3_epid_fmt_internal(ipsaddr->epaddr.epid, 1), - ipsaddr->connidx_incoming, ipsaddr->connidx_outgoing); + ipsaddr->connidx_incoming, ipsaddr->connidx_outgoing, + &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]); err = psm3_epid_set_hostname(psm3_epid_nid(((psm2_epaddr_t) ipsaddr)->epid), @@ -611,7 +612,8 @@ MOCKABLE(psm3_ips_flow_init)(struct ips_flow *flow, struct ips_proto *proto, flow->recv_seq_num.psn_val = 0; flow->xmit_ack_num.psn_val = 0; flow->flags = 0; - flow->credits = flow->cwin = proto->flow_credits; + flow->credits = proto->flow_credits; + flow->max_credits = proto->flow_credits; flow->ack_interval = max((proto->flow_credits >> 2) - 1, 1); flow->ack_counter = 0; #ifdef PSM_BYTE_FLOW_CREDITS diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index c39231b8679..4cc1ebc701b 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ 
b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -53,17 +53,16 @@ /* Copyright (c) 2016 Intel Corporation. All rights reserved. */ -// This file implements the TID protocol for STL100 and the RDMA -// protocol for UD mode. The majority of functons in this file (perhaps all) -// are not used when TID/RDMA is disabled via PSM3_TID o PSM3_RDMA respectively -// RDMA is N/A for UDP, so it will behave as if PSM3_RDMA is disabled +// This file implements the RDMA +// protocol for verbs mode. The majority of functons in this file (perhaps all) +// are not used when RDMA is disabled via PSM3_RDMA +// RDMA is N/A for sockets, so it will behave as if PSM3_RDMA is disabled // and not use functions in this file. #include "psm_user.h" #include "psm2_hal.h" #include "ips_scb.h" -#include "ips_tid.h" #include "ips_tidflow.h" #include "ips_proto.h" #include "ips_expected_proto.h" @@ -113,7 +112,6 @@ static void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, struct ips_gpu_hostbuf *chb_prev, uint32_t tsess_srcoff, uint32_t tsess_length, - uint32_t tsess_unaligned_start, psm2_chb_match_type_t type); #endif @@ -175,13 +173,11 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, if (err != PSM2_OK) goto fail; - if ((err = psm3_ips_scbctrl_init(ep, num_of_send_desc, 0, 0, 0, ips_tid_scbavail_callback, protoexp, &protoexp->tid_scbc_rv))) goto fail; - { union psmi_envvar_val env_rts_cts_interleave; @@ -256,75 +252,71 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #endif #endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (PSMI_IS_GPU_ENABLED) { + struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; + uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; + if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, + &rlim, &maxsz, &chunksz))) + goto fail; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - { - if (PSMI_IS_GPU_ENABLED) { - struct psmi_rlimit_mpool rlim = 
GPU_HOSTBUFFER_LIMITS; - uint32_t maxsz, chunksz, max_elements; - uint32_t pool_num_obj_max_total; - uint32_t small_pool_num_obj_max_total; - - if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, - &rlim, &maxsz, &chunksz))) - goto fail; - - /* the maxsz is the amount in MB, not the number of entries, - * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / - psm3_mq_max_window_rv(proto->mq, 1); - /* mpool requires max_elements to be power of 2. round down. */ - max_elements = 1 << (31 - __builtin_clz(max_elements)); - /* need at least 2 buffers */ - max_elements = max(2, max_elements); - protoexp->gpu_hostbuf_recv_cfg.bufsz = - psm3_mq_max_window_rv(proto->mq, 1); - - protoexp->gpu_hostbuf_pool_recv = - psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), - chunksz, max_elements, 0, - UNDEFINED, NULL, NULL, - psmi_gpu_hostbuf_alloc_func, - (void *) - &protoexp->gpu_hostbuf_recv_cfg); - - if (protoexp->gpu_hostbuf_pool_recv == NULL) { - err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, - "Couldn't allocate GPU host receive buffer pool"); - goto fail; - } - psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, - NULL, &pool_num_obj_max_total); - - protoexp->gpu_hostbuf_small_recv_cfg.bufsz = - GPU_SMALLHOSTBUF_SZ; - protoexp->gpu_hostbuf_pool_small_recv = - psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), - chunksz, max_elements, 0, - UNDEFINED, NULL, NULL, - psmi_gpu_hostbuf_alloc_func, - (void *) - &protoexp->gpu_hostbuf_small_recv_cfg); - - if (protoexp->gpu_hostbuf_pool_small_recv == NULL) { - err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, - "Couldn't allocate GPU host small receive buffer pool"); - goto fail; - } - psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, - NULL, &small_pool_num_obj_max_total); - _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", - small_pool_num_obj_max_total, - protoexp->gpu_hostbuf_small_recv_cfg.bufsz, - pool_num_obj_max_total, - 
protoexp->gpu_hostbuf_recv_cfg.bufsz); - PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); - STAILQ_INIT(&protoexp->gpupend_getreqsq); - } else { - protoexp->gpu_hostbuf_pool_recv = NULL; - protoexp->gpu_hostbuf_pool_small_recv = NULL; + /* the maxsz is the amount in MB, not the number of entries, + * since the element size depends on the window size */ + max_elements = (maxsz*1024*1024) / + psm3_mq_max_window_rv(proto->mq, 1); + /* mpool requires max_elements to be power of 2. round down. */ + max_elements = 1 << (31 - __builtin_clz(max_elements)); + /* need at least 2 buffers */ + max_elements = max(2, max_elements); + protoexp->gpu_hostbuf_recv_cfg.bufsz = + psm3_mq_max_window_rv(proto->mq, 1); + + protoexp->gpu_hostbuf_pool_recv = + psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_gpu_hostbuf_alloc_func, + (void *) + &protoexp->gpu_hostbuf_recv_cfg); + + if (protoexp->gpu_hostbuf_pool_recv == NULL) { + err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate GPU host receive buffer pool"); + goto fail; + } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, + NULL, &pool_num_obj_max_total); + + protoexp->gpu_hostbuf_small_recv_cfg.bufsz = + GPU_SMALLHOSTBUF_SZ; + protoexp->gpu_hostbuf_pool_small_recv = + psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), + chunksz, max_elements, 0, + UNDEFINED, NULL, NULL, + psmi_gpu_hostbuf_alloc_func, + (void *) + &protoexp->gpu_hostbuf_small_recv_cfg); + + if (protoexp->gpu_hostbuf_pool_small_recv == NULL) { + err = psm3_handle_error(proto->ep, PSM2_NO_MEMORY, + "Couldn't allocate GPU host small receive buffer pool"); + goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, + NULL, &small_pool_num_obj_max_total); + _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", + small_pool_num_obj_max_total, + protoexp->gpu_hostbuf_small_recv_cfg.bufsz, + pool_num_obj_max_total, + 
protoexp->gpu_hostbuf_recv_cfg.bufsz); + PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); + STAILQ_INIT(&protoexp->gpupend_getreqsq); + } else { + protoexp->gpu_hostbuf_pool_recv = NULL; + protoexp->gpu_hostbuf_pool_small_recv = NULL; } #endif psmi_assert(err == PSM2_OK); @@ -368,14 +360,11 @@ psm2_error_t psm3_ips_protoexp_fini(struct ips_protoexp *protoexp) if ((err = psm3_ips_scbctrl_fini(&protoexp->tid_scbc_rv))) goto fail; - /* finalize tid flow control. */ if ((err = psm3_ips_tf_fini(&protoexp->tfc))) goto fail; - psmi_free(protoexp); - fail: return err; } @@ -414,19 +403,16 @@ void ips_tid_mravail_callback(struct ips_proto *proto) #endif // PSM_HAVE_RDMA -// On STL100 ips_tf is a user space control for the HW tidflow which +// On STL100 ips_tf was a user space control for the HW tidflow which // would fully process most valid inbound EXPTID packets within an RV Window. -// For UD we maintain the user space control to help manage each active +// For verbs we maintain the user space control to help manage each active // RV window. // There is one CTS per RV window (typically 128K). -// For UD with RV, RDMA is used instread of EXPTID, with 1 RDMA per RV window. +// For verbs with RV, RDMA is used instread of EXPTID, with 1 RDMA per RV window // Typically there are 32 (HFI_TF_NFLOWS) configured. // The 32 is hard coded, could make it tunable. // The tidflow provides a natural pacing mechanism and limits the total amount -// of inflight EXPTID or RDMA incoming to given receiver. -// In addition on STL100 there is an upper bound on TIDs which limited total -// inbound DMA for a receiver to avoid 4MB. For smaller messages tidflow -// count may be the limit, for larger messages TIDs would be the limit. +// of inflight RDMA incoming to given receiver. /* New Tid Flows are available. If there are pending get requests put the * get timer on the timerq so it can be processed. 
*/ @@ -544,12 +530,10 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, tidflows = ips_tf_available(&protoexp->tfc); _HFI_MMDBG("available tidflow %u\n", tidflows); - if ( - tidflows > 0) + if (tidflows > 0) // get the actual TIDs and tidflows and send the CTS ips_tid_pendtids_timer_callback(&protoexp->timer_getreqs, 0); - else if ( - tidflows != -1) + else if (tidflows != -1) // out of TIDs, set a timer to try again later psmi_timer_request(protoexp->timerq, &protoexp->timer_getreqs, PSMI_TIMER_PRIO_1); @@ -732,7 +716,6 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) // so it cannot issue any sends directly, otherwise we will have a recursive // situation and potentially deeper recursion if more send CQEs found // key notes in this regard: -// OPA100 code which may send acks here is ifdef'ed out since N/A to RC QP RDMA // psm3_mq_handle_rts_complete - sets flags in req and queues it, no callbacks // psm3_mpool_put(tidsendc) - tid_desc_send_pool has no callback configured // ips_tid_mravail_callback - psmi_timer_request call queues timer for future @@ -1171,13 +1154,10 @@ int ips_protoexp_process_err_chk_rdma_resp(struct ips_recvhdrq_event *rcv_ev) #endif // defined(PSM_VERBS) #ifdef PSM_HAVE_RDMA -// Intermediate STL100 EXTID packets can be delivered to software when -// acks are requested. -// The final packet in a STL100 EXTID flow is also delivered to software -// to indicate the completion of the flow and can contain unaligned data. -// for RDMA Write we will simply use immediate data in the write -// to indicate the completed receive of the RDMA Write -// if we use RDMA Read, the local SQ Completion will indicate this +// Upon completion of an RDMA Write, a completion is delivered with +// immediate data. The immediate data is used +// to indicate the completed receive of the RDMA Write. +// If we use RDMA Read, the local SQ Completion will indicate this. 
#if defined(PSM_VERBS) // could build and pass a ips_recvhdrq_event or pass struct ips_recvhdrq // but all we really need is proto and len @@ -1270,7 +1250,7 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, /* Do some sanity checking */ psmi_assert_always(tidrecvc->state == TIDRECVC_STATE_BUSY); - // STL100 does this at the end of ips_protoexp_send_tid_completion + // STL100 did this at the end of ips_protoexp_send_tid_completion // TBD - seems like this should be done after ips_tid_recv_free // so we have more likelihood of getting freshly freed resources? if (tidrecvc->protoexp->tid_flags & IPS_PROTOEXP_FLAG_CTS_SERIALIZED) { @@ -1403,7 +1383,6 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, struct ips_gpu_hostbuf *chb_prev, uint32_t tsess_srcoff, uint32_t tsess_length, - uint32_t tsess_unaligned_start, psm2_chb_match_type_t type) { struct ips_proto *proto = protoexp->proto; @@ -1447,8 +1426,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, tidsendc->userbuf = (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = - (void *)((uintptr_t)tidsendc->userbuf + - tsess_unaligned_start); + (void *)((uintptr_t)tidsendc->userbuf); return; } } else { @@ -1467,8 +1445,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, tidsendc->userbuf = (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = - (void *)((uintptr_t)tidsendc->userbuf + - tsess_unaligned_start); + (void *)((uintptr_t)tidsendc->userbuf); return; } if ((tsess_srcoff > chb->offset) @@ -1489,8 +1466,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, (void *)((uintptr_t) chb->host_buf + tsess_srcoff - chb->offset); tidsendc->buffer = - (void *)((uintptr_t)tidsendc->userbuf + - tsess_unaligned_start ); + (void *)((uintptr_t)tidsendc->userbuf); return; } } @@ -1571,7 +1547,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, #if defined(PSM_SOCKETS) && 
PSMI_HAL_INST_CNT == 1 psmi_assert_always(0); // should not get here #elif defined(PSM_VERBS) - // for UD we do not need a ips_flow since we will use the RC QP and + // for verbs we do not need a ips_flow since we will use the RC QP and // then will use our main flow for the final RV completion control msg // The path record for use by RDMA will be selected when the connection // is established @@ -1646,7 +1622,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, chb, tid_list->tsess_srcoff, tid_list->tsess_length, - 0, rc); } else { // no match, need to prefetch @@ -1655,7 +1630,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, NULL, tid_list->tsess_srcoff, tid_list->tsess_length, - 0, PSMI_GPU_CONTINUE); } protoexp->proto->strat_stats.rndv_rdma_hbuf_send++; @@ -1678,7 +1652,6 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->rv_conn_count = 0; #endif - _HFI_EXP ("alloc tidsend=%4d tidrecv=%4d srcoff=%6d length=%6d" "\n", @@ -1686,7 +1659,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tid_list->tsess_srcoff, tid_list->tsess_length ); - // start sending TIDEXP packets + // start sending RDMA packets ips_tid_send_exp(tidsendc); /* Add as a pending op and ring up the timer */ @@ -1726,12 +1699,9 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) struct ips_proto *proto = protoexp->proto; psm2_error_t err = PSM2_OK; - // for STL100 native we would loop on ips_scb_prepare_tid_sendctrl and - // ips_proto_flow_enqueue to prepare EXPTID scbs for the TIDFLOW protocol - // and queue and issue them. Once they were all posted the is_complete - // flag would be set. For larger messages, it might take multiple - // attempts to get resources to queue everything in which case callbacks - // and timers ensure progress + // for STL100 the EXPTID scbs were sent by software and had to loop + // to get and queue scbs for the STL100 TIDFLOW protocol. 
+ // Once they were all posted the is_complete flag would be set. // For verbs we are delegating the RC Write "flow" to the NIC's RC QP // it will manage segmentation, sequence numbers and acks for the flow // so our job is done here after one call. @@ -1865,8 +1835,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) * */ -// we got a CTS and processed it. Now we can start sending EXPTID packets. -// For UD we will use RDMA instead of EXPTID +// we got a CTS and processed it. Now we can start sending RDMA packets. static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) { @@ -1978,7 +1947,7 @@ ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) while (!STAILQ_EMPTY(phead)) { tidsendc = STAILQ_FIRST(phead); - // we have some scb's and can use them to queue some more EXPTID packets + // we have some scb's and can use them to queue some more packets #if defined(PSM_VERBS) #ifdef RNDV_MOD if (tidsendc->rv_need_err_chk_rdma) @@ -2024,15 +1993,6 @@ ips_tid_pendsend_timer_callback(struct psmi_timer *timer, uint64_t current) } #endif // PSM_HAVE_RDMA -/* Right now, in the kernel we are allowing for virtually non-contiguous pages, - in a single call, and we are therefore locking one page at a time, but since - the intended use of this routine is for a single group of - virtually contiguous pages, that should change to improve - performance. That means possibly changing the calling MPI code. - Doing so gets rid of some of the loop stuff here, and in the driver, - and allows for a single call to the core VM code in the kernel, - rather than one per page, definitely improving performance. 
*/ - static psm2_error_t @@ -2261,9 +2221,8 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if ( - 1 /* due to unaligned recv using hostbuf, must always do this */ - ) { + /* due to unaligned recv using hostbuf, must always do this */ + { /* Before processing pending TID requests, first try to free up * any GPU host buffers that are now idle. */ struct ips_tid_get_gpupend *cphead = @@ -2392,8 +2351,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) psmi_assert(nbytes_this >= 4); - // for STL native the tids and tidflows available pace incoming TIDs - // for UD we still use tidflows available to pace incoming RDMA + // for verbs we use tidflows available to pace incoming RDMA if ((ret = ips_tf_available(&protoexp->tfc)) <= 0) { /* We're out of tidflow. If this process used all the resource, * the free callback will reschedule the operation, otherwise, @@ -2576,6 +2534,3 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) return err; } #endif // PSM_HAVE_RDMA - - - diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_header.h b/prov/psm3/psm3/ptl_ips/ips_proto_header.h index aa0e84c17a7..8f2ee039cb6 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_header.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_header.h @@ -148,7 +148,7 @@ struct ips_message_header { }; } PACK_SUFFIX; -/* desc_genc is up to 32 bits, but EXPTID header (and RDMA immediate data) +/* desc_genc is up to 32 bits, RDMA immediate data * only has room for 16 bits */ #define IPS_HDR_RDESCID_GENC_MASK 0xffff @@ -157,7 +157,7 @@ struct ips_message_header { * OpCodes in BTH[0], 24-31 bits. Order is important!!! 
*/ #define OPCODE_RESERVED 0xC0 /* reserved */ -/* TINY to EXPTID_COMPLETION/ERR_CHK_RDMA_RESP are level 2 packets */ +/* TINY to ERR_CHK_RDMA_RESP are level 2 packets */ /* sending queue keeps a copy and resends if timeout waiting for ack */ /* order and reliability maintained */ #define OPCODE_TINY 0xC1 /* 0 <= msglen <= 8 */ diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_help.h b/prov/psm3/psm3/ptl_ips/ips_proto_help.h index b584f8d7c5b..cdf02155a3e 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_help.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_help.h @@ -261,16 +261,10 @@ ips_scb_prepare_flow_inner(struct ips_proto *proto, struct ips_epaddr *ipsaddr, scb->abs_timeout = TIMEOUT_INFINITE; scb->scb_flags |= IPS_SEND_FLAG_PENDING; - if (flow->protocol == PSM_PROTOCOL_TIDFLOW) { - flow->xmit_seq_num.psn_seq += scb->nfrag; - scb->seq_num = flow->xmit_seq_num; - scb->seq_num.psn_seq--; - } else { - flow->xmit_seq_num.psn_num = + flow->xmit_seq_num.psn_num = (flow->xmit_seq_num.psn_num + scb->nfrag) & proto->psn_mask; - scb->seq_num.psn_num = + scb->seq_num.psn_num = (flow->xmit_seq_num.psn_num - 1) & proto->psn_mask; - } return; } diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index cdcc480e89a..a4f71ab8e5e 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -538,7 +538,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, } #endif - PSM2_LOG_EPM_COND((len > proto->mq->hfi_thresh_rv) && + PSM2_LOG_EPM_COND((len > proto->mq->rndv_nic_thresh) && proto->protoexp, OPCODE_LONG_RTS,PSM2_LOG_TX,proto->ep->epid, req->rts_peer->epid, "scb->ips_lrh.hdr_data.u32w0: %d",scb->ips_lrh.hdr_data.u32w0); @@ -556,8 +556,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, goto fail; #ifdef PSM_HAVE_REG_MR // TBD - we may want to include odd bytes at start - // and end of message in the RTS itself as opposed to being in last - // EXPTID payload packet's header + // and end of 
message in the RTS itself as opposed to using unaligned RDMA // then the RDMA Write can be better aligned and may perform better // Start registering memory for anticipated CTS requesting RDMA // TBD - we could reduce duation of memory pin by doing this only @@ -573,7 +572,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, // registration for zero length sync messages // PSM3_RDMA if disabled causes proto->protoexp == NULL if (! ips_scb_buffer(scb) && len - && len > proto->mq->hfi_thresh_rv + && len > proto->mq->rndv_nic_thresh && proto->protoexp /* expected tid recieve enabled */ && ips_epaddr_rdma_connected(ipsaddr) && !req->mr @@ -618,7 +617,7 @@ int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, { if ( !(flags_user & PSM2_MQ_FLAG_INJECT) && - len > gpu_thresh_rndv){ + len > psm3_gpu_thresh_rndv){ return 1; } @@ -798,7 +797,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user #endif // PSM_HAVE_REG_MR { ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; - // TBD for OPA flow_type could be DMA proto->strat_stats.short_cuCopy_isend++; proto->strat_stats.short_cuCopy_isend_bytes += len; } @@ -823,7 +821,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } else #endif { - // TBD for OPA flow_type could be DMA proto->strat_stats.short_copy_cpu_isend++; proto->strat_stats.short_copy_cpu_isend_bytes += len; } @@ -894,7 +891,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user psm3_epaddr_get_name(mq->ep->epid, 0), psm3_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid, 1), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2], req); - } else if (len <= mq->hfi_thresh_rv) { + } else if (len <= mq->rndv_nic_thresh) { req->send_msgoff = 0; req->rts_peer = (psm2_epaddr_t) ipsaddr; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -931,7 +928,6 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } else #endif { - // 
TBD for OPA flow_type could be DMA proto->strat_stats.eager_copy_cpu_isend++; proto->strat_stats.eager_copy_cpu_isend_bytes += len; } @@ -1130,7 +1126,6 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, #endif // PSM_HAVE_REG_MR { ips_scb_flags(scb) |= IPS_SEND_FLAG_PAYLOAD_BUF_GPU; - // TBD for OPA flow_type could be DMA proto->strat_stats.short_cuCopy_send++; proto->strat_stats.short_cuCopy_send_bytes += len; } @@ -1157,7 +1152,6 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else #endif { - // TBD for OPA flow_type could be DMA proto->strat_stats.short_copy_cpu_send++; proto->strat_stats.short_copy_cpu_send_bytes += len; } @@ -1240,7 +1234,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psm3_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid, 1), ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); - } else if (len <= mq->hfi_thresh_rv) { + } else if (len <= mq->rndv_nic_thresh) { // for FI_INJECT eager comes from user buffer, needs end to end ack psm2_mq_req_t req; @@ -1289,7 +1283,6 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else #endif { - // TBD for OPA flow_type could be DMA proto->strat_stats.eager_copy_cpu_send++; proto->strat_stats.eager_copy_cpu_send_bytes += len; } @@ -1390,7 +1383,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) /* Cases where we do not use TIDs: * 0) Received full message as payload to RTS, CTS is just an ack * 1) Recv on a host buffer, Send on a gpu buffer and len is <= 3 bytes - * 2) Recv on a host buffer, Send on a host buffer and len <= hfi_thresh_rv + * 2) Recv on a host buffer, Send on a host buffer and len <= rndv_nic_thresh * 3) Recv on gpu buf and len is <= 3 bytes * 4) Expected protocol not initialized. 
*/ @@ -1398,7 +1391,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) || (!req->is_buf_gpu_mem && ((req->is_sendbuf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV)|| (!req->is_sendbuf_gpu_mem && - req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv))) || + req->req_data.recv_msglen <= proto->mq->rndv_nic_thresh))) || (req->is_buf_gpu_mem && req->req_data.recv_msglen <= GPUDIRECT_THRESH_RV) || proto->protoexp == NULL /* no expected tid recieve */ #ifdef PSM_HAVE_REG_MR @@ -1411,7 +1404,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #ifdef PSM_HAVE_REG_MR || ! ips_epaddr_rdma_connected((ips_epaddr_t *) epaddr) #endif - || req->req_data.recv_msglen <= proto->mq->hfi_thresh_rv /* less rv theshold */ + || req->req_data.recv_msglen <= proto->mq->rndv_nic_thresh /* less rv theshold */ ) { /* no expected tid recieve */ #endif // PSM_CUDA || PSM_ONEAPI #ifdef PSM_HAVE_REG_MR @@ -1434,7 +1427,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) req->is_buf_gpu_mem, req->is_sendbuf_gpu_mem, #endif - proto->mq->hfi_thresh_rv, + proto->mq->rndv_nic_thresh, #ifdef PSM_HAVE_REG_MR proto->protoexp?ips_epaddr_rdma_connected((ips_epaddr_t *) epaddr):0, #endif @@ -1489,9 +1482,6 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) // buffers which match smaller messages can get MR cache hit for // various sized messages which may arrive in the buffer #ifdef PSM_HAVE_REG_MR - // TBD is this assert valid for OPA also? 
Should be since - // with pick LONG DATA above if recv_msgoff >= recv_msglen - // and send_msglen should == recv_msglen psmi_assert(req->req_data.send_msglen); // 0 len uses LONG_DATA above #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // for GPU receive buffer we need to sort things out at a lower level @@ -1591,73 +1581,66 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) psmi_assert(nbytes_left > 0); PSM2_LOG_MSG("entering."); - { - /* use PIO transfer */ - flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; - frag_size = flow->frag_size; - chunk_size = min(proto->ep->chunk_max_segs*frag_size, + flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + frag_size = flow->frag_size; + chunk_size = min(proto->ep->chunk_max_segs*frag_size, proto->ep->chunk_max_size); #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (req->is_buf_gpu_mem) { + if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR - // rare, but when RV connection not available, we - // can select LONG DATA for a GPU send buffer. Normally - // won't happen for GPU send >3 unless RDMA disabled - // or RV not connected - // TBD - no upper bound for send DMA here - // non-priority MR and will fallback if can't register - if (!req->mr && req->req_data.send_msglen > proto->iovec_gpu_thresh_eager) { - req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, - req->req_data.buf, req->req_data.send_msglen, - IBV_ACCESS_IS_GPU_ADDR); - } - if (req->mr) { - proto->strat_stats.rndv_long_gdr_send += dostats; - proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen; - } else -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - // for GPU send buffer <= 3, receiver can select - // LONG DATA and we can use GDRCopy - // must repin per attempt - if (req->req_data.send_msglen <= gdr_copy_limit_send && + // rare, but when RV connection not available, we + // can select LONG DATA for a GPU send buffer. 
Normally + // won't happen for GPU send >3 unless RDMA disabled + // or RV not connected + // TBD - no upper bound for send DMA here + // non-priority MR and will fallback if can't register + if (!req->mr && req->req_data.send_msglen > proto->iovec_gpu_thresh_eager) { + req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, + req->req_data.buf, req->req_data.send_msglen, + IBV_ACCESS_IS_GPU_ADDR); + } + if (req->mr) { + proto->strat_stats.rndv_long_gdr_send += dostats; + proto->strat_stats.rndv_long_gdr_send_bytes += dostats*req->req_data.send_msglen; + } else +#endif /* PSM_HAVE_REG_MR */ + // for GPU send buffer <= 3, receiver can select + // LONG DATA and we can use GDRCopy + // must repin per attempt + if (req->req_data.send_msglen <= gdr_copy_limit_send && 0 != (buf = (uintptr_t)psmi_hal_gdr_convert_gpu_to_host_addr( - (unsigned long)req->req_data.buf, - req->req_data.send_msglen, 0, proto->ep))) { - converted = 1; - proto->strat_stats.rndv_long_gdrcopy_send += dostats; - proto->strat_stats.rndv_long_gdrcopy_send_bytes += dostats*req->req_data.send_msglen; - } else { - buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; -#else - { -#endif - proto->strat_stats.rndv_long_cuCopy_send += dostats; - proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen; - } + (unsigned long)req->req_data.buf, + req->req_data.send_msglen, 0, proto->ep))) { + converted = 1; + proto->strat_stats.rndv_long_gdrcopy_send += dostats; + proto->strat_stats.rndv_long_gdrcopy_send_bytes += dostats*req->req_data.send_msglen; } else { -#endif + buf = (uintptr_t) req->req_data.buf + req->recv_msgoff; + proto->strat_stats.rndv_long_cuCopy_send += dostats; + proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen; + } + } else { +#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ #ifdef PSM_HAVE_REG_MR - // TBD - no upper bound for send DMA here - // non-priority MR and will fallback if can't register - if (!req->mr && 
req->req_data.send_msglen > proto->iovec_thresh_eager) { - req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, - req->req_data.buf, - req->req_data.send_msglen, 0); - } - if (req->mr) { - proto->strat_stats.rndv_long_dma_cpu_send += dostats; - proto->strat_stats.rndv_long_dma_cpu_send_bytes += dostats*req->req_data.send_msglen; - } else -#endif - { - proto->strat_stats.rndv_long_copy_cpu_send += dostats; - proto->strat_stats.rndv_long_copy_cpu_send_bytes += dostats*req->req_data.send_msglen; - } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // TBD - no upper bound for send DMA here + // non-priority MR and will fallback if can't register + if (!req->mr && req->req_data.send_msglen > proto->iovec_thresh_eager) { + req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, + req->req_data.buf, + req->req_data.send_msglen, 0); } -#endif + if (req->mr) { + proto->strat_stats.rndv_long_dma_cpu_send += dostats; + proto->strat_stats.rndv_long_dma_cpu_send_bytes += dostats*(uint64_t)req->req_data.send_msglen; + } else +#endif /* PSM_HAVE_REG_MR */ + { + proto->strat_stats.rndv_long_copy_cpu_send += dostats; + proto->strat_stats.rndv_long_copy_cpu_send_bytes += (uint64_t)dostats*req->req_data.send_msglen; + } +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) } +#endif do { /* @@ -1667,8 +1650,8 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) */ /* - * When tid code path is enabled, we don’t allocate scbc_rv - * objects. If the message is less than the hfi_thresh_rv, + * When tid code path is enabled, we don't allocate scbc_rv + * objects. If the message is less than the rndv_nic_thresh, * we normally use eager protocol to do the transfer. * However, if it is sync send, we use the rendezvous * rts/cts/rts-data protocol. 
@@ -1691,9 +1674,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) unaligned_bytes = nbytes_left & 0x3; if (unaligned_bytes) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (!req->is_buf_gpu_mem - || converted - ) + if (!req->is_buf_gpu_mem || converted) mq_copy_tiny_host_mem((uint32_t *)&scb->ips_lrh.mdata, (uint32_t *)buf, unaligned_bytes); else @@ -1721,8 +1702,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) // SDMA identifies GPU buffers itself. But PIO path needs flags - if (req->is_buf_gpu_mem - ) { + if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR if (!req->mr && !converted) #else @@ -1820,9 +1800,9 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) proto->epaddr_stats.cts_rdma_recv++; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - psmi_assert(p_hdr->data[1].u32w1 > min(gpu_thresh_rndv, mq->hfi_thresh_rv)); // msglen + psmi_assert(p_hdr->data[1].u32w1 > min(psm3_gpu_thresh_rndv, mq->rndv_nic_thresh)); // msglen #else - psmi_assert(p_hdr->data[1].u32w1 > mq->hfi_thresh_rv); // msglen + psmi_assert(p_hdr->data[1].u32w1 > mq->rndv_nic_thresh); // msglen #endif psmi_assert(proto->protoexp != NULL); @@ -1857,7 +1837,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) proto->psmi_logevent_tid_send_reqs.next_warning = 0; } else { flow = &rcv_ev->ipsaddr->flows[ips_proto_flowid(p_hdr)]; - flow->recv_seq_num.psn_num -= 1; /* Decrement seq number to NAK proper CTS */ + flow->recv_seq_num.psn_num = (flow->recv_seq_num.psn_num - 1) & proto->psn_mask; /* Decrement seq number to NAK proper CTS */ ips_proto_send_nak((struct ips_recvhdrq *)rcv_ev->recvq, flow); static unsigned int msg_cnt = 0; if (msg_cnt++ == 0) { /* Report the message only once */ @@ -2012,7 +1992,7 @@ psm3_ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) req->rts_peer = (psm2_epaddr_t) ipsaddr; req->rts_reqidx_peer = p_hdr->data[1].u32w0; - 
if (req->req_data.send_msglen > mq->hfi_thresh_rv) + if (req->req_data.send_msglen > mq->rndv_nic_thresh) { PSM2_LOG_EPM(OPCODE_LONG_RTS,PSM2_LOG_RX,req->rts_peer->epid,mq->ep->epid, "req->rts_reqidx_peer: %d",req->rts_reqidx_peer); diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_params.h b/prov/psm3/psm3/ptl_ips/ips_proto_params.h index 31148806fed..f288d6c54a1 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_params.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_params.h @@ -110,8 +110,7 @@ #define PSM_CRC_SIZE_IN_BYTES 8 /* - * version of protocol header (known to chip also). - * This value for OPA is defined in spec. + * version of protocol header */ #define IPS_PROTO_VERSION 0x1 @@ -199,7 +198,7 @@ /* Path selection policies: * * (a) Adaptive - Dynamically determine the least loaded paths using various - * feedback mechanism - Completion time via ACKs, NAKs, CCA using BECNs. + * feedback mechanism - Completion time via ACKs, NAKs, etc. * * (b) Static schemes - * (i) static_src - Use path keyed off source context diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c index 716599c8e05..2fbc0a0773b 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c @@ -224,8 +224,8 @@ pio_dma_ack_valid(struct ips_proto *proto, struct ips_flow *flow, /* NAK post process for any flow where an scb may describe more than 1 packet - * (OPA dma flow or GSO PIO flow). In which case we may need to resume in - * middle of scb. + * (verbs send dma flow or GSO PIO flow). In which case we may need to + * resume in middle of scb. 
*/ void psm3_ips_segmentation_nak_post_process(struct ips_proto *proto, struct ips_flow *flow) @@ -406,7 +406,7 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) SLIST_FIRST(scb_pend) = NULL; psmi_assert(flow->scb_num_pending == 0); /* Reset congestion window - all packets ACK'd */ - flow->credits = flow->cwin = proto->flow_credits; + flow->credits = flow->max_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS flow->credit_bytes = proto->flow_credit_bytes; @@ -445,29 +445,6 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ - { - /* Increase congestion window if flow is not congested */ - if_pf(flow->cwin < proto->flow_credits) { - // this only happens for OPA, so we don't have to - // increase ack_interval_bytes and flow_credit_bytes - // since we never decrease them for congestion - flow->credits += - min(flow->cwin << 1, - proto->flow_credits) - flow->cwin; - flow->cwin = min(flow->cwin << 1, proto->flow_credits); - flow->ack_interval = max((flow->credits >> 2) - 1, 1); -#ifdef PSM_BYTE_FLOW_CREDITS - //flow->credit_bytes += TBD - //flow->ack_interval_bytes = max((flow->credit_bytes >> 2) - 1, 1); - _HFI_VDBG("after grow cwin: flow_credits %d bytes %d\n", - flow->credits, flow->credit_bytes); -#else - _HFI_VDBG("after grow cwin: flow_credits %d\n", - flow->credits); -#endif - } - } - /* Reclaimed some credits - attempt to flush flow */ if (!SLIST_EMPTY(scb_pend)) flow->flush(flow, NULL); @@ -495,7 +472,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) struct ips_scb_unackedq *unackedq; struct ips_scb_pendlist *scb_pend; psmi_seqnum_t ack_seq_num, last_seq_num; - psm_protocol_type_t protocol; + //psm_protocol_type_t protocol; ips_epaddr_flow_t flowid; ips_scb_t *scb; @@ -506,7 +483,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) // we need to resend unacked packets starting with 
ack_seq_num. So check // psn of 1st NAK would like us to retransmit (e.g. don't -1 before check) if ((flowid = ips_proto_flowid(p_hdr)) < EP_NUM_FLOW_ENTRIES) { - protocol = PSM_PROTOCOL_GO_BACK_N; + //protocol = PSM_PROTOCOL_GO_BACK_N; psmi_assert(flowid < EP_NUM_FLOW_ENTRIES); flow = &ipsaddr->flows[flowid]; if (!pio_dma_ack_valid(proto, flow, ack_seq_num)) @@ -589,7 +566,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) SLIST_FIRST(scb_pend) = NULL; psmi_assert(flow->scb_num_pending == 0); /* Reset congestion window if all packets acknowledged */ - flow->credits = flow->cwin = proto->flow_credits; + flow->credits = flow->max_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS flow->credit_bytes = proto->flow_credit_bytes; @@ -628,10 +605,7 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) psmi_assert(!STAILQ_EMPTY(unackedq)); /* sanity for above loop */ - if (protocol == PSM_PROTOCOL_TIDFLOW) - // we don't put TID (aka RDMA) pkts on UD, shouldn't get NAKs about it - _HFI_ERROR("post processing, Got nak for TID flow, not allowed for UD\n"); - else if (scb->nfrag > 1) + if (scb->nfrag > 1) psm3_ips_segmentation_nak_post_process(proto, flow); /* Always cancel ACK timer as we are going to restart the flow */ @@ -665,19 +639,16 @@ int psm3_ips_proto_process_nak(struct ips_recvhdrq_event *rcv_ev) { int num_resent = 0; - /* Reclaim all credits upto congestion window only */ - flow->credits = flow->cwin; + /* Reclaim all credits */ + flow->credits = flow->max_credits; flow->ack_interval = max((flow->credits >> 2) - 1, 1); #ifdef PSM_BYTE_FLOW_CREDITS - // TBD cwin not implemented for UD and UDP so can predict - // credit_bytes here - psmi_assert(flow->cwin == proto->flow_credits); flow->credit_bytes = proto->flow_credit_bytes; flow->ack_interval_bytes = max((flow->credit_bytes >> 2) - 1, 1); - _HFI_VDBG("after reclaim cwin: flow_credits %d\n", - flow->credits); + _HFI_VDBG("after reclaim 
credits: flow->credits %d credit_bytes %u\n", + flow->credits, flow->credit_bytes); #else /* PSM_BYTE_FLOW_CREDITS */ - _HFI_VDBG("after reclaim cwin: flow_credits %d\n", + _HFI_VDBG("after reclaim credits: flow_credits %d\n", flow->credits); #endif /* PSM_BYTE_FLOW_CREDITS */ diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.h b/prov/psm3/psm3/ptl_ips/ips_scb.h index f51c9a27b67..97670116fdf 100644 --- a/prov/psm3/psm3/ptl_ips/ips_scb.h +++ b/prov/psm3/psm3/ptl_ips/ips_scb.h @@ -150,7 +150,7 @@ struct ips_scb { uint32_t scb_flags; /* When nfrag==1, frag_size and *remaining are undefined. * An scb can describe a large user buffer (nfrag>1) for segmentation - * (UDP GSO and OPA send DMA). + * (UDP GSO and verbs send DMA). * When such a buffer needs retransmission, the payload and payload_size * will be advanced to reflect what needs to be retransmitted. * *_remaining also are reduced to reflect what remains. diff --git a/prov/psm3/psm3/ptl_ips/ips_tid.c b/prov/psm3/psm3/ptl_ips/ips_tid.c deleted file mode 100644 index e7349dde133..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tid.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - diff --git a/prov/psm3/psm3/ptl_ips/ips_tid.h b/prov/psm3/psm3/ptl_ips/ips_tid.h deleted file mode 100644 index 6d31defc872..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tid.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. 
- - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -/* Copyright (c) 2003-2014 Intel Corporation. All rights reserved. */ - -/* included header files */ - -#ifndef _IPS_TID_H -#define _IPS_TID_H - -#endif /* _IPS_TID_H */ diff --git a/prov/psm3/psm3/ptl_ips/ips_tidcache.c b/prov/psm3/psm3/ptl_ips/ips_tidcache.c deleted file mode 100644 index f7588b83fe0..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tidcache.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - diff --git a/prov/psm3/psm3/ptl_ips/ips_tidcache.h b/prov/psm3/psm3/ptl_ips/ips_tidcache.h deleted file mode 100644 index 6d31284427e..00000000000 --- a/prov/psm3/psm3/ptl_ips/ips_tidcache.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2015 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2015 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef _IPS_TIDCACHE_H -#define _IPS_TIDCACHE_H - -#include -#include -#include -#include - -/* - * Design notes. - * - * PSM needs to call into driver to program receiving buffer pages to - * HFI gen1 hardware, each tid can be programmed with physically contiguous - * power-of-two pages from 1 pages to 512 pages. This procedure takes - * time. 
- * - * Lots of applications tend to re-use the same receiving buffer, caching - * such programmed tids in user space process will save time and improve - * application performance. - * - * This PSM tid registration caching design requires cooperation between - * PSM and driver. Here is what happen between PSM and driver. - * - * 1. PSM call into driver with a chunk of buffer with virtual address - * and length. - * 2. driver pins the buffer pages, program hardware with the physical - * pages, get a list of tids. - * 3. driver caches the tids with the corresponding virtual address in - * user space for each tid, and return the list of tids back to PSM. - * 4. PSM also caches the list of tids with the corresponding virtual - * address for each tid, and use the list of tids for transmission. - * 5. when process frees a buffer, kernel VM will catch the event and - * calls the callback in driver to notify that the virtual address - * range is gone in the process. - * 6. driver will search its cache system and find the tids with the - * removed virtual address, put these tid in an invalidation queue - * and notify PSM the event. - * 7. PSM will pick the event and remove the tids from its own cache - * as well. - * 8. PSM must check such invalidation event every time before searching - * its caching system to match tids for a 'new' buffer chunk. - * 9, when the caching system is full, and a new buffer chunk is asked - * to register, PSM picks a victim to remove. 
- */ - -typedef struct -{ - unsigned long start; /* start virtual address */ - uint32_t tidinfo; /* tid encoding */ - uint16_t length; /* length in pages */ - uint16_t invalidate; /* invalidate flag */ - uint16_t refcount; /* usage reference count */ - uint16_t i_prev; /* idle queue previous */ - uint16_t i_next; /* idle queue next */ -} rbtree_tidcache_mapitem_pl_t; - -typedef struct { - uint32_t ntid; /* tids are cached */ - uint32_t nidle; /* tids are idle */ -} rbtree_tidcache_map_pl_t; - -#define RBTREE_MI_PL rbtree_tidcache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_tidcache_map_pl_t - -#include "psm3_rbtree.h" - -/* - * Macro definition for easy programming. - */ - -#define NTID p_map->payload.ntid -#define REFCNT(x) p_map->root[x].payload.refcount -#define INVALIDATE(x) p_map->root[x].payload.invalidate - -#define LENGTH(x) p_map->root[x].payload.length -#define START(x) p_map->root[x].payload.start -#define END(x) (START(x) + (LENGTH(x)<<12)) - -/* - * Macro for idle tid queue management. - */ -#define NIDLE p_map->payload.nidle -#define IHEAD 0 -#define INEXT(x) p_map->root[x].payload.i_next -#define IPREV(x) p_map->root[x].payload.i_prev - -#define IDLE_REMOVE(x) do { \ - INEXT(IPREV(x)) = INEXT(x); \ - IPREV(INEXT(x)) = IPREV(x); \ - NIDLE--; \ - } while (0) - -#define IDLE_INSERT(x) do { \ - INEXT(x) = INEXT(IHEAD); \ - IPREV(x) = IHEAD; \ - IPREV(INEXT(IHEAD)) = x; \ - INEXT(IHEAD) = x; \ - NIDLE++; \ - } while (0) - -extern void ips_tidcache_map_init(cl_qmap_t *p_map, - cl_map_item_t* const root, - cl_map_item_t* const nil_item); - -#endif diff --git a/prov/psm3/psm3/ptl_ips/ips_tidflow.c b/prov/psm3/psm3/ptl_ips/ips_tidflow.c index dc7b7754d07..3305aedb865 100644 --- a/prov/psm3/psm3/ptl_ips/ips_tidflow.c +++ b/prov/psm3/psm3/ptl_ips/ips_tidflow.c @@ -59,9 +59,8 @@ #include "ips_expected_proto.h" #include "ips_tidflow.h" -// TBD - this is only needed for OPA or UD w/RNDV -// can reduce to just counting allocations on UD and -// not build for UDP. 
+// TBD - this is only needed for UD w/RNDV +// could omit from build for UDP. // Once that is done, could #ifdef PSMI_STATSTYPE_RDMA declaration // // TBD - move this into HAL and have init, fini, alloc, dealloc @@ -104,47 +103,40 @@ psm2_error_t psm3_ips_tf_init(struct ips_protoexp *protoexp, tfc->tidrecvc[tf_idx].rdescid._desc_genc = tf_idx; } - { - tfc->tf_ctrl = (struct ips_tf_ctrl *) - psmi_calloc(ep, UNDEFINED, 1, - sizeof(struct ips_tf_ctrl)); - if (tfc->tf_ctrl == NULL) { - return PSM2_NO_MEMORY; - } - } + tfc->tf_ctrl = (struct ips_tf_ctrl *)psmi_calloc(ep, UNDEFINED, 1, + sizeof(struct ips_tf_ctrl)); + if (tfc->tf_ctrl == NULL) + return PSM2_NO_MEMORY; /* * Only the master process can initialize. */ - { - tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; - tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; + tfc->tf_ctrl->tf_num_max = HFI_TF_NFLOWS; + tfc->tf_ctrl->tf_num_avail = HFI_TF_NFLOWS; - for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { -// USE_RC TBD this is bizzare. For native mode it works fine -// for UD/UDP mode it crashes at next_free assignment below on some systems + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { +// USE_RC TBD this is bizzare. +// For UD/UDP mode it crashes at next_free assignment below on some systems // but adding this print or moving next_free assignment to separate // loop works fine. Really odd if this is a compiler issue, but // I don't see any other reason. 
We should be single threaded here // enabling the empty call to tidflow_reset doesn't help -// stubbing tidflow_reset on native works fine, can't explain crash -// nor workaround - /* Update flow state */ - tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; - tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; - tfc->tf_ctrl->tf[tf_idx].next_gen = 0; +// Can't explain crash nor workaround + /* Update flow state */ + tfc->tf_ctrl->tf[tf_idx].state = TF_STATE_DEALLOCATED; + tfc->tf_ctrl->tf[tf_idx].tf_idx = tf_idx; + tfc->tf_ctrl->tf[tf_idx].next_gen = 0; #if 0 - tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; #endif - } + } #if 1 - for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { - tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; - } -#endif - tfc->tf_ctrl->tf_head = 0; + for (tf_idx = 0; tf_idx < HFI_TF_NFLOWS; tf_idx++) { + tfc->tf_ctrl->tf[tf_idx].next_free = tf_idx + 1; } +#endif + tfc->tf_ctrl->tf_head = 0; #if TF_ADD /* TF_ADD: Add a new stats type for tid flows in psm_stats.h */ @@ -179,12 +171,9 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, struct ips_tf_ctrl *ctrl = tfc->tf_ctrl; struct ips_tf_entry *entry; - if (!ctrl->tf_num_avail) { psmi_assert(ctrl->tf_head == HFI_TF_NFLOWS); *tidrecvc = NULL; - - return PSM2_EP_NO_RESOURCES; } @@ -192,7 +181,6 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, ctrl->tf_head = entry->next_free; ctrl->tf_num_avail--; - tfc->tf_num_total++; tfc->tf_num_inuse++; @@ -206,7 +194,6 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, psmi_assert((*tidrecvc)->rdescid._desc_idx == entry->tf_idx); psmi_assert_always(entry->next_gen < tfc->tf_gen_mask); - return PSM2_OK; } @@ -233,12 +220,10 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use tfc->tidrecvc[tf_idx].rdescid.u32w1++; } - entry->next_free = ctrl->tf_head; ctrl->tf_head = tf_idx; ctrl->tf_num_avail++; - tfc->tf_num_inuse--; /* If an available callback is registered 
invoke it */ if (((tfc->tf_num_inuse + 1) == ctrl->tf_num_max) && tfc->tf_avail_cb) @@ -246,4 +231,3 @@ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int use return PSM2_OK; } - diff --git a/prov/psm3/psm3/ptl_ips/ips_tidflow.h b/prov/psm3/psm3/ptl_ips/ips_tidflow.h index f3c29351bae..bfa2546c267 100644 --- a/prov/psm3/psm3/ptl_ips/ips_tidflow.h +++ b/prov/psm3/psm3/ptl_ips/ips_tidflow.h @@ -121,5 +121,4 @@ psm2_error_t psm3_ips_tf_allocate(struct ips_tf *tfc, /* Deallocate a tidflow */ psm2_error_t psm3_ips_tf_deallocate(struct ips_tf *tfc, uint32_t tf_idx, int used); - #endif diff --git a/prov/psm3/psm3/ptl_ips/ptl_ips.h b/prov/psm3/psm3/ptl_ips/ptl_ips.h index f5ab06d25ea..d33bd586adc 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_ips.h +++ b/prov/psm3/psm3/ptl_ips/ptl_ips.h @@ -60,10 +60,8 @@ #include "ips_proto.h" -struct gen1_ptl_shared; // OPA-only shared context - /* - * PTL at the ips level (for OPA) + * PTL at the ips level (for NIC) * * This PTL structure glues all the ips components together. 
* @@ -115,8 +113,6 @@ struct ptl_ips { /* context's status check timeout in cycles -- cached */ uint64_t status_cyc_timeout; - /* Shared contexts context - OPA only */ - struct gen1_ptl_shared *recvshc; /* Rcv thread context */ struct ptl_rcvthread *rcvthread; } diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index 2c697b1cf20..a990babb208 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -115,6 +115,7 @@ struct dsa_wq { uint32_t use_count; // how many threads assigned to this WQ uint32_t max_xfer_size; // Maximum supported transfer size uint8_t dedicated; // is this a dedicated (1) or shared (0) WQ + int fd; // Only valid if wq_reg is NULL }; static struct dsa_wq dsa_wqs[DSA_MAX_QUEUES]; static uint32_t dsa_my_num_wqs; @@ -123,9 +124,13 @@ static psmi_spinlock_t dsa_wq_lock; // protects dsa_wq.use_count // Each thread is assigned a DSA WQ on 1st memcpy +// These are only available in the thread, so we can only initialize them on +// 1st IO and we can't clear them since ep close could be called by main thread static __thread void *dsa_wq_reg = NULL; static __thread uint8_t dsa_wq_dedicated; static __thread uint32_t dsa_wq_xfer_limit; +static __thread int dsa_wq_fd = -1; + // we keep completion record in thread local storage instead of stack // this way if a DSA completion times out and arrives late it still has a @@ -156,6 +161,116 @@ static inline void movdir64b(struct dsa_hw_desc *desc, volatile void *reg) : : "a" (reg), "d" (desc)); } +/* + * Submit work to the shared workqueue. 
+ * + * Return: + * 0 == success + * -1 == Failure (timeout) + */ +static int dsa_swq_queue(struct dsa_hw_desc *desc, void *wq_reg, + struct dsa_stats *stats) +{ + uint64_t start_cycles, end_cycles; + int ret = 0; + + if (enqcmd(desc, wq_reg)) { + // must retry, limit attempts + start_cycles = get_cycles(); + end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT)/4; + while (enqcmd(desc, wq_reg)) { + if (get_cycles() > end_cycles) { + _HFI_INFO("DSA SWQ Enqueue Timeout\n"); + ret = -1; + stats->dsa_error++; + break; + } + } + if (!ret) + stats->dsa_swq_wait_ns += + cycles_to_nanosecs(get_cycles() - + start_cycles); + } else { + stats->dsa_swq_no_wait++; + } + + return ret; +} + +/* + * Submit work through the write call. + * + * Return: + * 0 == success + * -1 == Failure (timeout) + */ +static int dsa_write_queue(struct dsa_hw_desc *desc, int wq_fd, + struct dsa_stats *stats) +{ + uint64_t start_cycles, end_cycles; + int ret; + + ret = write(wq_fd, desc, sizeof(*desc)); + if (ret != sizeof(*desc)) { + _HFI_VDBG("DSA write failed: ret %d (%s)\n", + ret, strerror(errno)); + + /* Return if the err code is not "EAGAIN" */ + if (errno != EAGAIN) + return -1; + // must retry, limit attempts + start_cycles = get_cycles(); + end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT)/4; + ret = 0; + while (write(wq_fd, desc, sizeof(*desc)) != sizeof(*desc)) { + if (errno != EAGAIN) { + _HFI_INFO("DSA write failed: (%s)\n", + strerror(errno)); + ret = -1; + break; + } + if (get_cycles() > end_cycles) { + _HFI_INFO("DSA Write Enqueue Timeout\n"); + ret = -1; + stats->dsa_error++; + break; + } + } + if (!ret) + stats->dsa_wait_ns += + cycles_to_nanosecs(get_cycles() - + start_cycles); + } else { + stats->dsa_no_wait++; + ret = 0; + } + + return ret; +} + +/* + * Return: + * 0 -- Success + * -1 -- Failure + */ +static int dsa_submit(struct dsa_hw_desc *desc, void *wq_reg, + uint8_t wq_dedicated, int wq_fd, + struct dsa_stats *stats) +{ + int ret = 0; + + if (wq_reg) { + 
if (wq_dedicated) + /* use MOVDIR64B for DWQ */ + movdir64b(desc, wq_reg); + else + ret = dsa_swq_queue(desc, wq_reg, stats); + } else { + ret = dsa_write_queue(desc, wq_fd, stats); + } + + return ret; +} /* use DSA to copy a block of memory */ /* !rx-> copy from app to shm (sender), rx-> copy from shm to app (receiver) */ @@ -255,27 +370,14 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // make sure completion status zeroing fully written before post to HW //_mm_sfence(); { asm volatile("sfence":::"memory"); } - if (dsa_wq_dedicated) { - /* use MOVDIR64B for DWQ */ - movdir64b(&desc, dsa_wq_reg); - } else { - /* use ENQCMDS for SWQ */ - if (enqcmd(&desc, dsa_wq_reg)) { - // must retry, limit attempts - start_cycles = get_cycles(); - end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT)/4; - while (enqcmd(&desc, dsa_wq_reg)) { - if (get_cycles() > end_cycles) { - _HFI_INFO("Disabling DSA: DSA SWQ Enqueue Timeout\n"); - dsa_available = 0; - stats->dsa_error++; - goto memcpy_exit; - } - } - stats->dsa_swq_wait_ns += cycles_to_nanosecs(get_cycles() - start_cycles); - } else { - stats->dsa_swq_no_wait++; - } + + // Submit the work + if (dsa_submit(&desc, dsa_wq_reg, dsa_wq_dedicated, dsa_wq_fd, + stats)) { + // Fail to submit + _HFI_INFO("Disabling DSA: failed to submit work.\n"); + dsa_available = 0; + goto memcpy_exit; } if (cpu_n) { @@ -348,20 +450,31 @@ static void dsa_free_wqs(void) int proc; int i; + // free dsa_wqs, info relevant to our PROC for (i=0; i= 0) { + close(dsa_wqs[i].fd); + dsa_wqs[i].fd = -1; + } } + // free what we parsed for (proc=0; proc < dsa_num_proc; proc++) { for (i=0; ifd, &desc, sizeof(desc)); + if (ret == sizeof(desc)) { + ret = 0; + + start_cycles = get_cycles(); + end_cycles = start_cycles + nanosecs_to_cycles(DSA_TIMEOUT); + while (comp.status == 0) { + if (get_cycles() > end_cycles && comp.status == 0) { + _HFI_ERROR("DSA timed out.\n"); + return -1; + } + } + + if (comp.status != DSA_COMP_SUCCESS) + ret 
= -1; + } else { + _HFI_ERROR("write failed: ret %d (%s)\n", + ret, strerror(errno)); + ret = -1; + } + + return ret; +} + /* initialize DSA - call once per process */ /* Some invalid inputs and DSA initialization errors are treated as fatal errors * since if DSA gets initialized on some nodes, but not on others, the * inconsistency in shm FIFO sizes causes an obsure fatal error later in * PSM3 intialization. So make the error more obvious and fail sooner. + * + * Note: if this fails, caller may try again later, so must cleanup any + * globals or resources we allocate before return failure. */ int psm3_dsa_init(void) { @@ -518,6 +675,7 @@ int psm3_dsa_init(void) char *delim; int new_proc = 0; proc = 0; + dsa_num_wqs[proc] = 0; if (! temp) { _HFI_ERROR("Can't alloocate temp string"); @@ -564,8 +722,11 @@ int psm3_dsa_init(void) dsa_wq_mode[proc][dsa_num_wqs[proc]] = mode; dsa_wq_filename[proc][dsa_num_wqs[proc]] = psmi_strdup(PSMI_EP_NONE, s); dsa_num_wqs[proc]++; - if (new_proc) + if (new_proc) { proc++; + if (proc < DSA_MAX_PROC) + dsa_num_wqs[proc] = 0; + } s = delim+1; } while (delim); psmi_free(temp); @@ -680,8 +841,11 @@ int psm3_dsa_init(void) for (i=0; i= 0) return; // typical case, already picked one // rcvthread, pick last and don't count it @@ -761,6 +936,7 @@ static inline void psm3_dsa_pick_wq(void) dsa_wq_reg = dsa_wqs[sel].wq_reg; dsa_wq_dedicated = dsa_wqs[sel].dedicated; dsa_wq_xfer_limit = dsa_wqs[sel].max_xfer_size; + dsa_wq_fd = dsa_wqs[sel].fd; } diff --git a/prov/psm3/psm3/utils/utils_env.c b/prov/psm3/psm3/utils/utils_env.c index 55efb77bc2b..d2b3a68ca64 100644 --- a/prov/psm3/psm3/utils/utils_env.c +++ b/prov/psm3/psm3/utils/utils_env.c @@ -409,7 +409,7 @@ static int psm3_getenv_is_verblevel(int printlevel) // count number of fields in a str_tuple (field:field:....) // The number is number of colons + 1 -static int psm3_count_tuples(const char *str) +int psm3_count_tuples(const char *str) { int ret = 1; if (! 
str) diff --git a/prov/psm3/src/psmx3.h b/prov/psm3/src/psmx3.h index 5209f138d5f..35b12916f55 100644 --- a/prov/psm3/src/psmx3.h +++ b/prov/psm3/src/psmx3.h @@ -857,9 +857,9 @@ struct psmx3_env { }; #define PSMX3_MAX_UNITS PSMI_MAX_RAILS /* from psm_config.h */ +#define PSMX3_MAX_EPS 64 /* no real limit, used to report max_trx_ctxt */ struct psmx3_domain_info { int max_trx_ctxt; - int free_trx_ctxt; int num_units; /* total HW units found by PSM3 */ int num_reported_units; /* num entries in arrays below */ int num_active_units; /* total active found, >= num_reported_units */ @@ -867,8 +867,6 @@ struct psmx3_domain_info { int unit_is_active[PSMX3_MAX_UNITS]; int unit_id[PSMX3_MAX_UNITS]; /* PSM3 unit_id */ int addr_index[PSMX3_MAX_UNITS];/* PSM3 address index within unit_id */ - int unit_nctxts[PSMX3_MAX_UNITS]; - int unit_nfreectxts[PSMX3_MAX_UNITS]; char default_domain_name[PSMX3_MAX_UNITS * NAME_MAX]; /* autoselect:irdma0;irdma1;..... */ char default_fabric_name[PSMX3_MAX_UNITS * NAME_MAX]; /* RoCE 192.168.101.0/24;RoCE 192.168.102.0/24;.... */ }; diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index 87e8fc50bc8..c59de18c24e 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -115,6 +115,19 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } #endif +#ifdef HAVE___INT128 +#define CASE_INT_TYPE(FUNC,...) \ + case FI_INT8: FUNC(__VA_ARGS__,int8_t); break; \ + case FI_UINT8: FUNC(__VA_ARGS__,uint8_t); break; \ + case FI_INT16: FUNC(__VA_ARGS__,int16_t); break; \ + case FI_UINT16: FUNC(__VA_ARGS__,uint16_t); break; \ + case FI_INT32: FUNC(__VA_ARGS__,int32_t); break; \ + case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \ + case FI_INT64: FUNC(__VA_ARGS__,int64_t); break; \ + case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break; \ + case FI_INT128: FUNC(__VA_ARGS__,ofi_int128_t); break; \ + case FI_UINT128: FUNC(__VA_ARGS__,ofi_uint128_t); break; +#else #define CASE_INT_TYPE(FUNC,...) 
\ case FI_INT8: FUNC(__VA_ARGS__,int8_t); break; \ case FI_UINT8: FUNC(__VA_ARGS__,uint8_t); break; \ @@ -124,6 +137,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, case FI_UINT32: FUNC(__VA_ARGS__,uint32_t); break; \ case FI_INT64: FUNC(__VA_ARGS__,int64_t); break; \ case FI_UINT64: FUNC(__VA_ARGS__,uint64_t); break; +#endif #define CASE_FP_TYPE(FUNC,...) \ case FI_FLOAT: FUNC(__VA_ARGS__,float); break; \ @@ -168,6 +182,20 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, #define PSMX3_BXOR(dst,src) (dst) ^= (src) #define PSMX3_COPY(dst,src) (dst) = (src) +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +/* res is always CPU address, dst could be GPU address */ +#define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ + do { \ + /*int i;*/ \ + TYPE *d = (dst); \ + TYPE *r = (res); \ + psmx3_lock(&psmx3_atomic_lock, 1); \ + /* for (i=0; i<(cnt); i++) */ \ + /*r[i] = d[i];*/ \ + psm3_memcpy(r, d, sizeof(TYPE)*cnt); \ + psmx3_unlock(&psmx3_atomic_lock, 1); \ + } while (0) +#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ #define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ do { \ int i; \ @@ -178,7 +206,29 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, r[i] = d[i]; \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) +#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +/* src is always CPU address, dst could be GPU address */ +#define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ + do { \ + int i; \ + TYPE *d = (dst); \ + TYPE *s = (src); \ + psmx3_lock(&psmx3_atomic_lock, 1); \ + for (i=0; i temp) @@ -315,7 +315,7 @@ static uint64_t get_max_inject_size(void) { } if (have_shm) { - temp = MQ_SHM_THRESH_RNDV; + temp = PSM3_MQ_RNDV_SHM_THRESH; psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_THRESH"), &temp, 0, UINT_MAX); if (thresh_rv > temp) @@ -327,7 +327,7 @@ static uint64_t get_max_inject_size(void) { if (have_nic) { // GPU ips rendezvous 
threshold // sockets HAL avoids rendezvous, so this may be overly restrictive - temp = GPU_THRESH_RNDV; + temp = PSM3_GPU_THRESH_RNDV; // PSM3_CUDA_THRESH_RNDV depricated, use PSM3_GPU_THRESH_RNDV if set psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, 0, UINT_MAX); @@ -339,7 +339,7 @@ static uint64_t get_max_inject_size(void) { if (have_shm) { // GPU shm rendezvous threshold - temp = MQ_SHM_GPU_THRESH_RNDV; + temp = PSM3_MQ_RNDV_SHM_GPU_THRESH; psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, 0, UINT_MAX); if (thresh_rv > temp) @@ -596,19 +596,11 @@ void psmx3_update_prov_info(struct fi_info *info, ! psmx3_domain_info.default_domain_name[0]) unit = 0; - if (unit == PSMX3_DEFAULT_UNIT || !psmx3_env.multi_ep) { - p->domain_attr->tx_ctx_cnt = psmx3_domain_info.free_trx_ctxt; - p->domain_attr->rx_ctx_cnt = psmx3_domain_info.free_trx_ctxt; - p->domain_attr->max_ep_tx_ctx = psmx3_domain_info.max_trx_ctxt; - p->domain_attr->max_ep_rx_ctx = psmx3_domain_info.max_trx_ctxt; - p->domain_attr->max_ep_stx_ctx = psmx3_domain_info.max_trx_ctxt; - } else { - p->domain_attr->tx_ctx_cnt = psmx3_domain_info.unit_nfreectxts[unit]; - p->domain_attr->rx_ctx_cnt = psmx3_domain_info.unit_nfreectxts[unit]; - p->domain_attr->max_ep_tx_ctx = psmx3_domain_info.unit_nctxts[unit]; - p->domain_attr->max_ep_rx_ctx = psmx3_domain_info.unit_nctxts[unit]; - p->domain_attr->max_ep_stx_ctx = psmx3_domain_info.unit_nctxts[unit]; - } + p->domain_attr->tx_ctx_cnt = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->rx_ctx_cnt = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->max_ep_tx_ctx = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->max_ep_rx_ctx = psmx3_domain_info.max_trx_ctxt; + p->domain_attr->max_ep_stx_ctx = psmx3_domain_info.max_trx_ctxt; free(p->domain_attr->name); if (unit == PSMX3_DEFAULT_UNIT) diff --git a/prov/psm3/src/psmx3_av.c b/prov/psm3/src/psmx3_av.c index ac1d89ae531..53742afc121 100644 --- a/prov/psm3/src/psmx3_av.c +++ 
b/prov/psm3/src/psmx3_av.c @@ -234,11 +234,11 @@ void psmx3_epid_to_epaddr(struct psmx3_trx_ctxt *trx_ctxt, psmx3_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, "psm3_ep_connect returned error %s, remote epid=%s." "Try setting FI_PSM3_CONN_TIMEOUT " - "to a larger value (current: %d seconds).\n", + "to a larger value (current: %d seconds). Aborting\n", psm3_error_get_string(err), psm3_epid_fmt(epid, 0), psmx3_env.conn_timeout); else psmx3_log(&psmx3_prov, FI_LOG_WARN, FI_LOG_AV, __func__, __LINE__, - "psm3_ep_connect returned error %s, remote epid=%s.\n", + "psm3_ep_connect returned error %s, remote epid=%s. Aborting\n", psm3_error_get_string(err), psm3_epid_fmt(epid, 0)); abort(); diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index c20035a84de..29359d3ea34 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -391,16 +391,12 @@ static int psmx3_init_lib(void) static int psmx3_update_hfi_info(void) { unsigned short i, j, psmx3_unit; - int nctxts = 0; - int nfreectxts = 0; int multirail = 0; - int counted_unit; char *s = NULL; char unit_name[NAME_MAX]; char fabric_name[NAME_MAX]; uint32_t cnt = 0; uint32_t addr_cnt = 0; - int tmp_nctxts, tmp_nfreectxts; int unit_active; int ret; psm2_info_query_arg_t args[4]; @@ -459,25 +455,6 @@ static int psmx3_update_hfi_info(void) continue; } - if (PSM2_OK != psm3_info_query(PSM2_INFO_QUERY_NUM_FREE_CONTEXTS, - &tmp_nfreectxts, 1, args) || (tmp_nfreectxts < 0)) - { - PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, - "Failed to read number of free contexts from HFI unit_id %d\n", - i); - continue; - } - - if (PSM2_OK != psm3_info_query(PSM2_INFO_QUERY_NUM_CONTEXTS, - &tmp_nctxts, 1, args) || (tmp_nctxts < 0)) - { - PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, - "Failed to read number of contexts from HFI unit_id %d\n", - i); - continue; - } - - counted_unit = 0; for (j=0; j < addr_cnt; j++) { psmx3_unit = i * addr_cnt + j; args[1].port = 1; // VERBS_PORT @@ -506,12 +483,6 @@ static int 
psmx3_update_hfi_info(void) continue; } - if (! counted_unit) { - nctxts += tmp_nctxts; - nfreectxts += tmp_nfreectxts; - counted_unit = 1; - } - psmx3_domain_info.num_active_units++; /* for PSM3_MULTIRAIL only report 1 "autoselect" unit */ @@ -519,8 +490,6 @@ static int psmx3_update_hfi_info(void) psmx3_domain_info.unit_is_active[psmx3_unit] = 1; psmx3_domain_info.unit_id[psmx3_unit] = i; psmx3_domain_info.addr_index[psmx3_unit] = j; - psmx3_domain_info.unit_nctxts[psmx3_unit] = tmp_nctxts; - psmx3_domain_info.unit_nfreectxts[psmx3_unit] = tmp_nfreectxts; psmx3_domain_info.active_units[psmx3_domain_info.num_reported_units++] = psmx3_unit; } if (psmx3_domain_info.num_active_units == 1) { @@ -554,22 +523,19 @@ static int psmx3_update_hfi_info(void) } PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, - "hfi1 units: total %d, reported %d, active %d; " - "hfi1 contexts: total %d, free %d\n", + "psm3 units: total %d, reported %d, active %d\n", psmx3_domain_info.num_units, psmx3_domain_info.num_reported_units, - psmx3_domain_info.num_active_units, nctxts, nfreectxts); + psmx3_domain_info.num_active_units); if (psmx3_env.multi_ep) { - psmx3_domain_info.max_trx_ctxt = nctxts; - psmx3_domain_info.free_trx_ctxt = nfreectxts; + psmx3_domain_info.max_trx_ctxt = PSMX3_MAX_EPS; } else { psmx3_domain_info.max_trx_ctxt = 1; - psmx3_domain_info.free_trx_ctxt = (nfreectxts == 0) ? 0 : 1; } PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, - "Tx/Rx contexts: %d in total, %d available.\n", - psmx3_domain_info.max_trx_ctxt, psmx3_domain_info.free_trx_ctxt); + "Tx/Rx contexts: %d allowed per process.\n", + psmx3_domain_info.max_trx_ctxt); return 0; }