From 0dd815af85d2e375a5f62fed5c395576a00ae9f4 Mon Sep 17 00:00:00 2001 From: scott levy Date: Fri, 16 Dec 2022 09:05:31 -0700 Subject: [PATCH] Initial commit. --- LICENSE.md | 20 +++ README.md | 29 +++ data/bluefield_test.sh | 36 ++++ data/lookup_host_ip.py | 17 ++ data/process_nodelist_hosts.py | 63 +++++++ data/run-bf-test.sh | 10 ++ src/HodClient.cpp | 318 +++++++++++++++++++++++++++++++++ src/HodClient.hpp | 24 +++ src/HodClientApp.cpp | 25 +++ src/HodServer.cpp | 277 ++++++++++++++++++++++++++++ src/HodServer.hpp | 26 +++ src/HodServerApp.cpp | 27 +++ src/Makefile | 64 +++++++ src/README | 3 + 14 files changed, 939 insertions(+) create mode 100644 LICENSE.md create mode 100644 README.md create mode 100755 data/bluefield_test.sh create mode 100755 data/lookup_host_ip.py create mode 100755 data/process_nodelist_hosts.py create mode 100755 data/run-bf-test.sh create mode 100644 src/HodClient.cpp create mode 100644 src/HodClient.hpp create mode 100644 src/HodClientApp.cpp create mode 100644 src/HodServer.cpp create mode 100644 src/HodServer.hpp create mode 100644 src/HodServerApp.cpp create mode 100644 src/Makefile create mode 100644 src/README diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..995d709 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,20 @@ +MIT License + +Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC (NTESS). Under the terms +of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES +OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/README.md b/README.md new file mode 100644 index 0000000..e4fd8f7 --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +hod-carrier Overview +==================== +**What is "hod-carrier"?** + +The hod-carrier library is designed to move data between a host and a BlueField DPU using Infiniband +RDMA. The basic design is that the DPU runs a server and the host runs a client. The API is +structured so that a program executing on the host can issue a request to the server running on the +DPU to transfer data from host memory to BlueField2 DPU memory. + +This repository contains the code necessary to build four components: +- *libhodclient* : a software library containing the code necessary to run an instance of a + hod-carrier client + +- *libhodserver* : a software library containing the code necessary to run an instance of a + hod-carrier server + +- *hod_client_app* : a simple example of an executable that launches a hod-carrier client and uses + it request the server running on a BlueField DPU to transfer data from host memory to DPU memory. + +- *hod_server_app* : a simple example of an executable that launches a hod-carrier server and + services requests from a client instance running on the host processor. + +**Why "hod-carrier"?** + +A hod carrier is a person who delivers construction supplies (e.g., mortar, brick, stone) to +bricklayers and stone masons, commonly by employing a brick hod. A brick hod is an exceedingly +simple device to simplify the transport of these materials. Similarly, this software library is a +very simple device for moving data between a host and a BlueField DPU. As use cases emerge for +this service, we expect that the features and sophistication of this library will grow. diff --git a/data/bluefield_test.sh b/data/bluefield_test.sh new file mode 100755 index 0000000..059bcad --- /dev/null +++ b/data/bluefield_test.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +# (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +# Government retains certain rights in this software. + +PWD=`pwd` +HOSTLIST=`./process_nodelist_hosts.py` +CLIENTEXE=${PWD}/../src/build-host/hod_client_app +SERVEREXE=${PWD}/../src/build-bf/hod_server_app + +CLIENT_PORT=65432 +BF_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/../src/build-bf/ +ENV_VARS="export LD_LIBRARY_PATH=$BF_LIBRARY_PATH; " +ENV_VARS+="export IBVERBS_CLIENT_SOCKET_PORT=${CLIENT_PORT}; " + +echo "SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST" + +for HOST in $HOSTLIST; do + BF="${HOST}-bf" + CLIENTADDR=`./lookup_host_ip.py $HOST` + ENV_VARS+="export IBVERBS_CLIENT_IP_ADDR=$CLIENTADDR;" + ENV_VARS+="export IBVERBS_SERVER_DEVICE=mlx5_0;" + echo "CLIENTADDR = ${CLIENTADDR}" + echo "ENV_VARS = ${ENV_VARS}" + ssh -o "StrictHostKeyChecking no" $BF ${ENV_VARS} $SERVEREXE & +done + +HOST_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/../src/build-host/ +echo "HOST_LIBRARY_PATH=${HOST_LIBRARY_PATH}" +export IBVERBS_CLIENT_SOCKET_PORT=${CLIENT_PORT} +export IBVERBS_CLIENT_SOCKET_IF=eth1 +export LD_LIBRARY_PATH=$HOST_LIBRARY_PATH +srun -n 1 -N 1 $CLIENTEXE & + +wait diff --git a/data/lookup_host_ip.py b/data/lookup_host_ip.py new file mode 100755 index 0000000..47ac6df --- /dev/null +++ b/data/lookup_host_ip.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +# Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +# (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +# Government retains certain rights in this software. + +import socket +import sys + +def lookup_host_ip(hostname): + ipaddr = socket.gethostbyname(hostname) + return ipaddr + + +if __name__ == "__main__": + ipaddr = lookup_host_ip(sys.argv[1]) + print(ipaddr) diff --git a/data/process_nodelist_hosts.py b/data/process_nodelist_hosts.py new file mode 100755 index 0000000..0fe6dd9 --- /dev/null +++ b/data/process_nodelist_hosts.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +# Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +# (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +# Government retains certain rights in this software. + +import re +import sys +import os + +# [sllevy@klogin2 ~]$ srun -N 1 -n 1 echo $SLURM_JOB_NODELIST +# gn[3-4] + +nodelist_re = re.compile("(?P[a-z-]+(?:-bf)?)\[?(?P[0-9,-]+)\]?") +noderange_re = re.compile("(?P\d+)-(?P\d+)") + +# STRING for aggregating results +result = "" + +""" +if len(sys.argv) == 2: + slurm_nodelist = sys.argv[1] + ppn = int(sys.argv[2]) +else: + print("USAGE: %s " % (sys.argv[0])) + exit(-1) +""" + +slurm_nodelist = os.environ['SLURM_JOB_NODELIST'] +nodelist_strings = re.findall("[a-z-]+(?:-bf)?\[?[0-9,-]+\]?", slurm_nodelist) + +## There should be BlueField nodes +# !!!! DEEbug !!!! +#assert any(map(lambda x: "-bf" in x, nodelist_strings)) + +for nodelist_string in nodelist_strings: + if "-bf" in nodelist_string: + continue + nodelist_match = nodelist_re.match(nodelist_string) + if nodelist_match != None: + nodenumbers = nodelist_match.group("nodenumbers") + partition = nodelist_match.group("partition") + + for nodenumber in nodenumbers.split(","): + noderange_match = noderange_re.match(nodenumber) + if noderange_match == None: + start = int(nodenumber) + end = int(nodenumber) + else: + start = int(noderange_match.group("start")) + end = int(noderange_match.group("end")) + + for n in range(start,end+1): + if len(result) > 0: + #result += "," + result += " " + #result += ("%s%d:%d" % (partition,n,ppn)) + result += ("%s%d" % (partition,n)) + else: + print("WHOOPS: no match") + exit(-1) + +print(result) diff --git a/data/run-bf-test.sh b/data/run-bf-test.sh new file mode 100755 index 0000000..ea3cdb1 --- /dev/null +++ b/data/run-bf-test.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +# (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +# Government retains certain rights in this software. + +# ALL command line arguments to this script will be passed to the salloc command +# For example, on kahuna at SNL calling this script with "-p glinda" will run it on the partition +# that includes BF2 devices. +salloc -N 1 $@ ./bluefield_test.sh diff --git a/src/HodClient.cpp b/src/HodClient.cpp new file mode 100644 index 0000000..e356305 --- /dev/null +++ b/src/HodClient.cpp @@ -0,0 +1,318 @@ +// Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. + +#include "HodClient.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_CONNECTION + +#ifdef DEBUG_CONNECTION +#define dbg_connection(...) {printf("[CLIENT] "); printf(__VA_ARGS__); fflush(stdout);} +#else +#define dbg_connection(...) +#endif + +// In some cases, we want to use variables in debug messages that aren't otherwise used. +// This macro can be used to silence the compiler warnings associated with variables used this way +#ifdef DEBUG_CONNECTION +#define dbg_unused(v) +#else +#define dbg_unused(v) (void)v +#endif + +int Client::init() +{ + int rc = 0; + + dbg_connection("[ibverbs_transport] ENTER\n"); + + /* INITIALIZE socket */ + struct sockaddr_in server_addr, host_addr; + char *portno_string = getenv("IBVERBS_CLIENT_SOCKET_PORT"); + if( portno_string == nullptr ) { + printf("ERROR: Client port (IBVERBS_CLIENT_SOCKET_PORT) not set\n"); + return -1; + } + int portno = atoi(portno_string); + + host_sockfd = socket(AF_INET, SOCK_STREAM, 0); + + /* Get the IPv4 address of the socket network interface */ + struct ifreq ifr; + ifr.ifr_addr.sa_family = AF_INET; + char *client_socket_if = getenv("IBVERBS_CLIENT_SOCKET_IF"); + if( client_socket_if == nullptr ) { + printf("[CLIENT][ibverbs_connection] IBV_CLIENT_SOCKET_IF not found\n"); + return -1; + } + strncpy(ifr.ifr_name, client_socket_if, IFNAMSIZ-1); + ioctl(host_sockfd, SIOCGIFADDR, &ifr); + char *addr_str = inet_ntoa(((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr); + dbg_connection("FOUND address for: %s / %s\n", ifr.ifr_name, addr_str); + + host_addr.sin_addr = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr; + host_addr.sin_family = AF_INET; + host_addr.sin_port = htons(portno); + dbg_connection("BINDing to %s / %d\n", inet_ntoa(host_addr.sin_addr), portno); + rc = bind(host_sockfd, (struct sockaddr *) &host_addr, sizeof(host_addr)); + if( rc < 0 ) { + printf("[CLIENT] Failed to BIND to %s / %d (rc = %d / %s)\n", + addr_str, portno, rc, strerror(errno)); + return -1; + } + dbg_connection("LISTENING to %s / %d\n", inet_ntoa(host_addr.sin_addr), portno); + rc = listen(host_sockfd, 5); + if( rc < 0 ) { + printf("[CLIENT] LISTEN failed on %s / %d (rc = %d / %s)\n", + inet_ntoa(host_addr.sin_addr), portno, rc, strerror(errno)); + return -1; + } + + dbg_connection("[ibverbs_transport] initializing InfiniBand\n"); + + char *device_name = getenv("IBV_CLIENT_DEVICE"); + rc = ibv.open_device(device_name); + if (rc) { + printf("[ibverbs_transport] open_device FAILED"); + return -1; + } + + rc = ibv.setup_command_channel(); + if (rc) { + printf("[ibverbs_transport] setup_command_channel FAILED"); + return -1; + } + + rc = ibv.setup_rdma_channel(); + if (rc) { + printf("[ibverbs_transport] setup_rdma_channel FAILED"); + return -1; + } + + rc = ibv.create_qps(); + if (rc) { + printf("[ibverbs_transport] create_qps FAILED"); + return -1; + } + + dbg_connection("[ibverbs_transport] InfiniBand (ibverbs) Initialized\n"); + + char buffer[256]; + int n; + + socklen_t length = sizeof(server_addr); + server_sockfd = accept(host_sockfd, (struct sockaddr *)&server_addr, &length); + if( server_sockfd < 0 ) { + printf("[CLIENT] ACCEPT failed on %s / %d (rc = %d / %s)\n", + inet_ntoa(host_addr.sin_addr), portno, rc, strerror(errno)); + return -1; + } + dbg_connection("CLIENT success (server_sockfd (%p) = %d)!\n", &server_sockfd, server_sockfd); + + /* SEND RDMA qpn */ + memset(buffer, 0, 256); + sprintf(buffer, "%d", ibv.get_rdma_qpn()); + dbg_connection("SEND RDMA qpn = %s\n", buffer); + n = write(server_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + /* RECEIVE RDMA qpn */ + memset(buffer, 0, 256); + n = read(server_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes\n", n); + dbg_connection("RECEIVE RDMA qpn = %s\n", buffer); + ibv.set_peer_rdma_qpn(atoi(buffer)); + + /* SEND CMD qpn */ + memset(buffer, 0, 256); + sprintf(buffer, "%d", ibv.get_cmd_qpn()); + printf("SEND CMD qpn = %s\n", buffer); + n = write(server_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + /* RECEIVE CMD qpn */ + memset(buffer, 0, 256); + n = read(server_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes\n", n); + dbg_connection("RECEIVE CMD qpn = %s\n", buffer); + ibv.set_peer_cmd_qpn(atoi(buffer)); + + /* SEND lid */ + memset(buffer, 0, 256); + sprintf(buffer, "%d", ibv.get_lid()); + dbg_connection("SEND RDMA lid = %s\n", buffer); + n = write(server_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + /* RECEIVE lid */ + memset(buffer, 0, 256); + n = read(server_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes\n", n); + dbg_connection("RECEIVE RDMA lid = %s\n", buffer); + ibv.set_peer_lid(atoi(buffer)); + + if( ibv.transition_to_ready() < 0 ) return -1; + dbg_connection("[ibverbs_transport] READY\n"); + dbg_connection("CLIENT init complete (server_sockfd (%p) = %d)!\n", &server_sockfd, server_sockfd); + + return 0; +} + +#define BUFFER_SIZE 256 +int Client::run(char *rdma_buffer, size_t rdma_buffer_size) +{ + int rc; + dbg_connection("==== RUN ====\n"); + dbg_connection("rdma_buffer = %p / rdma_buffer_size = %ld\n", rdma_buffer, rdma_buffer_size); + + /* SEND MR */ + struct ibv_mr *local_ibv_mr = ibv.register_memory(rdma_buffer, rdma_buffer_size); + + // ------ SOCKET code ------- + int n; + char buffer[BUFFER_SIZE]; + + dbg_connection("(RDMA buffer = %p / local_ibv_mr = 0x%x)\n", + rdma_buffer, local_ibv_mr->rkey); + dbg_connection("(RDMA buffer = 0x%x 0x%x 0x%x\n", + rdma_buffer[0], rdma_buffer[1], rdma_buffer[2]); + + /* ---- */ + struct ibv_sge sge; + struct ibv_recv_wr recv_wr; + + memset(&sge, 0, sizeof(sge)); + memset(&recv_wr, 0, sizeof(recv_wr)); + sge.addr = (uint64_t)rdma_buffer; + sge.length = rdma_buffer_size; + sge.lkey = local_ibv_mr->lkey; + + recv_wr.next = nullptr; + recv_wr.sg_list = &sge; + recv_wr.num_sge = 1; + recv_wr.wr_id = (uint64_t)0xBADFACE; + + dbg_connection("post_recv() - cmd_msg=%p - sge.addr=%lx ; sge.length=%u ; sge.lkey=%x\n", + rdma_buffer, sge.addr, sge.length, sge.lkey); + + rc = ibv.post_recv_wr(&recv_wr); + if( rc < 0 ) { + printf("post_recv() FAILED\n"); + return -1; + } + + /* ---- */ + + /* SEND rdma_buffer, buffer size, and rkey */ + memset(buffer, 0, BUFFER_SIZE); + sprintf(buffer, "0x%lx:0x%lx:0x%x", (uint64_t)local_ibv_mr->addr, + (uint64_t)rdma_buffer_size, local_ibv_mr->rkey); + + n = write(server_sockfd, buffer, strlen(buffer)); + if( n < 0 ) { + printf("[ibverbs_transport] ERROR: %d(%s)\n", errno, strerror(errno)); + return -1; + } + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + dbg_connection("SEND buffer = %s\n", buffer); + + /* RECEIVE ACK */ + memset(buffer, 0, 256); + n = read(server_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes\n", n); + dbg_connection("RECEIVE ACK = %s\n", buffer); + + /* CLEAN UP completed transfers. */ + char *memory; + char *element = strtok(buffer,":"); + while( 1 ) { + element = strtok(NULL,":"); + if( element == NULL ) { + break; + } + memory = (char *)strtol(element, NULL, 0); + if( pending_transfers.size() > 1 ) { + printf("**** WHOOPS : Number of PENDING CHECKPOINTS = %ld\n", pending_transfers.size()); + } + while( !pending_transfers.empty() ) { + std::pair next = pending_transfers.front(); + pending_transfers.pop(); + if( next.first != memory ) { + printf("**ERROR** : out-of-order ACKNOWLEDGEMENT (ACKed %p / FOUND %p)\n", + memory, next.first); + } else { + dbg_connection("FOUND matching ADDRESS (FREE %p)\n", memory); + free(next.first); + ibv_dereg_mr(next.second); + break; + } + } + } + + pending_transfers.push(std::make_pair(rdma_buffer,local_ibv_mr)); + return 0; +} + +int Client::stop() +{ + /* SEND complete message */ + char buffer[256]; + memset(buffer, 0, 256); + sprintf(buffer, "%s", "COMPLETE"); + dbg_connection("SENDING complete message\n"); + int n = write(server_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SENT %d bytes\n", n); + + /* RECEIVE ACK */ + memset(buffer, 0, 256); + n = read(server_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes\n", n); + dbg_connection("RECEIVE ACK COMPLETE = %s\n", buffer); + if( strcmp("COMPLETE", buffer) != 0 ) { + printf("*** ERROR: COMPLETE ack not received\n"); + fflush(stdout); + } + + if( close(server_sockfd) != 0 ) { + printf("ERROR: FAILED to close SERVER socket\n"); + return -1; + } else { + dbg_connection("SUCCESSFULLY closed SERVER socket\n"); + server_sockfd = -1; + } + if( close(host_sockfd) != 0 ) { + printf("ERROR: FAILED to close HOST socket\n"); + return -1; + } else { + dbg_connection("SUCCESSFULLY closed HOST socket\n"); + host_sockfd = -1; + } + + return 0; +} diff --git a/src/HodClient.hpp b/src/HodClient.hpp new file mode 100644 index 0000000..fa75f1e --- /dev/null +++ b/src/HodClient.hpp @@ -0,0 +1,24 @@ +// Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. + +#include "Ibverbs.hpp" +#include +#include +#include + +class Client { + public: + Client() : ibv(), pending_transfers() {}; + int init(); + int run(char *buffer, size_t buffer_size); + int stop(); + private: + Ibverbs ibv; + int host_sockfd; + int server_sockfd; + + std::queue> pending_transfers; +}; + + diff --git a/src/HodClientApp.cpp b/src/HodClientApp.cpp new file mode 100644 index 0000000..696fcd5 --- /dev/null +++ b/src/HodClientApp.cpp @@ -0,0 +1,25 @@ +// Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. + +#include "HodClient.hpp" +#include + +int main(int argc, char **argv) +{ + char *rdma_buffer = (char *)malloc(256); + memset(rdma_buffer, 0, 256); + + for(int i = 0; i < 256; i++ ) { + rdma_buffer[i] = i ^ 0xFF; + } + + Client *client = new Client(); + client->init(); +#if 0 + client->run(rdma_buffer, 256); +#endif + client->run(rdma_buffer, 256); + client->stop(); + return 0; +} diff --git a/src/HodServer.cpp b/src/HodServer.cpp new file mode 100644 index 0000000..7795bfa --- /dev/null +++ b/src/HodServer.cpp @@ -0,0 +1,277 @@ +// Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. + +#include "HodServer.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_CONNECTION + +#ifdef DEBUG_CONNECTION +#define dbg_connection(...) {printf("[SERVER] "); printf(__VA_ARGS__); fflush(stdout);} +#define dbg_connection_raw(...) {printf(__VA_ARGS__); fflush(stdout);} +#else +#define dbg_connection(...) +#define dbg_connection_raw(...) +#endif + +// In some cases, we want to use variables in debug messages that aren't otherwise used. +// This macro can be used to silence the compiler warnings associated with variables used this way +#ifdef DEBUG_CONNECTION +#define dbg_unused(v) +#else +#define dbg_unused(v) (void)v +#endif + + +Server::Server() : completed_transfers(), completed_transfers_lock() +{ }; + +int Server::init(std::string addr_str) +{ + int rc = 0; + + char *device_name = getenv("IBVERBS_SERVER_DEVICE"); + if( device_name != nullptr ) { + dbg_connection("LOOKING for DEVICE --> %s\n", device_name); + } else { + printf("DEVICE name not specified\n"); + fflush(stdout); + return -1; + } + + rc = ibv.open_device(device_name); + if (rc) { + printf("[ibverbs_transport] open_device FAILED"); + return -1; + } + + rc = ibv.setup_command_channel(); + if (rc) { + printf("[ibverbs_transport] setup_command_channel FAILED"); + return -1; + } + + rc = ibv.setup_rdma_channel(); + if (rc) { + printf("[ibverbs_transport] setup_rdma_channel FAILED"); + return -1; + } + + rc = ibv.create_qps(); + if (rc) { + printf("[ibverbs_transport] create_qps FAILED"); + return -1; + } + + dbg_connection("InfiniBand (ibverbs) INITIALIZED\n"); + + // ------ SOCKET code ------- + int n; + char *portno_string = getenv("IBVERBS_CLIENT_SOCKET_PORT"); + if( portno_string == nullptr ) { + printf("ERROR: Client port (IBVERBS_CLIENT_SOCKET_PORT) not set\n"); + return -1; + } + int portno = atoi(portno_string); + struct sockaddr_in client_addr; + char buffer[256]; + + /* SERVER code */ + client_sockfd = socket(AF_INET, SOCK_STREAM, 0); + client_addr.sin_addr.s_addr = inet_addr(addr_str.c_str()); + client_addr.sin_family = AF_INET; + client_addr.sin_port = htons(portno); + dbg_connection("CONNECTing to %s / %d\n", addr_str.c_str(), portno); + dbg_connection("CONNECTing to %s / %d\n", inet_ntoa(client_addr.sin_addr), portno); + int number_of_attempts = 5; + for( int i = 0; i < number_of_attempts; i++ ) { + rc = connect(client_sockfd,(struct sockaddr *)&client_addr,sizeof(client_addr)); + if( rc == 0 ) { + // we did it + break; + } + sleep(1); + } + + if( rc < 0 ) { + printf("UNABLE to CONNECT (%d attempts / rc = %d / %s : errno = %d)\n", + number_of_attempts, rc, strerror(errno), errno); + return -1; + } + + /* RECEIVE RDMA qpn */ + memset(buffer, 0, 256); + n = read(client_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes (%s)\n", n, strerror(errno)); + dbg_connection("RECEIVE RDMA qpn = %s\n", buffer); + ibv.set_peer_rdma_qpn(atoi(buffer)); + + /* SEND RDMA qpn */ + memset(buffer, 0, 256); + sprintf(buffer, "%d", ibv.get_rdma_qpn()); + dbg_connection("SEND RDMA qpn = %s\n", buffer); + n = write(client_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + /* RECEIVE CMD qpn */ + memset(buffer, 0, 256); + n = read(client_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes (%s)\n", n, strerror(errno)); + dbg_connection("RECEIVE RDMA qpn = %s\n", buffer); + ibv.set_peer_cmd_qpn(atoi(buffer)); + + /* SEND CMD qpn */ + memset(buffer, 0, 256); + sprintf(buffer, "%d", ibv.get_cmd_qpn()); + dbg_connection("SEND RDMA qpn = %s\n", buffer); + n = write(client_sockfd, buffer, strlen(buffer)); + dbg_connection("SEND %d bytes\n", n); + + /* RECEIVE lid */ + memset(buffer, 0, 256); + n = read(client_sockfd, buffer, 255); + dbg_unused(n); + dbg_connection("RECEIVE %d bytes (%s)\n", n, strerror(errno)); + dbg_connection("RECEIVE RDMA lid = %s\n", buffer); + ibv.set_peer_lid(atoi(buffer)); + + /* SEND lid */ + memset(buffer, 0, 256); + sprintf(buffer, "%d", ibv.get_lid()); + dbg_connection("SEND RDMA lid = %s\n", buffer); + n = write(client_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + rc = ibv.transition_to_ready(); + if( rc < 0 ) { + printf("FAILED to transition to READY\n"); + fflush(stdout); + return -1; + } + + dbg_connection("Infiniband READY\n"); + + return 0; +} + +uint64_t Server::get_remote_buffer(uint64_t remote_buffer_addr, size_t remote_buffer_size, + uint32_t remote_buffer_key) +{ + ibv.get_remote_buffer(remote_buffer_addr, remote_buffer_size, remote_buffer_key); + completed_transfers_lock.lock(); + completed_transfers.push(remote_buffer_addr); + completed_transfers_lock.unlock(); + return remote_buffer_addr; +} + +#define BUFFER_SIZE 256 +int Server::start() +{ + std::vector> futures; + uint64_t remote_buffer_addr = (uint64_t)nullptr; + size_t remote_buffer_size = 0; + uint32_t remote_buffer_key = 0; + char buffer[BUFFER_SIZE]; + int n; + + while(1) { + /* RECEIVE remote buffer addr --OR-- COMPLETE message */ + memset(buffer, 0, 256); + n = read(client_sockfd, buffer, 255); + dbg_connection("RECEIVED %d BYTES\n", n); + if( strncmp(buffer, (char *)"COMPLETE", 256) == 0 ) { + dbg_connection("RECEIVED ACK --> wait for FUTUREs\n"); + + // MAKE sure all of the threads have completed + std::vector>::iterator it; + for( it = futures.begin(); it != futures.end(); ++it ) { + (*it).wait(); + } + futures.clear(); + dbg_connection("FUTUREs complete\n"); + + /* SEND ACK complete */ + memset(buffer, 0, 256); + sprintf(buffer, "COMPLETE"); + dbg_connection("SEND ACK COMPLETE\n"); + n = write(client_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + dbg_connection("CLOSING CLIENT socket (COMPLETE)\n"); + if( close(client_sockfd) != 0 ) { + printf("ERROR: Failed to CLOSE client socket"); + fflush(stdout); + return -1; + } + dbg_connection("CLIENT socket closed SUCCESSFULLY\n"); + client_sockfd = -1; + return 0; + } else if( n <= 0 ) { + dbg_connection("CLIENT socket closed unexpectedly (%s)\n", strerror(errno)); + fflush(stdout); + return -1; + } + + dbg_unused(n); + dbg_connection("RECEIVE %d bytes\n", n); + dbg_connection("RECEIVE RDMA remote buffer = %s\n", buffer); + + char *string_token; + + string_token = strtok(buffer,":"); + remote_buffer_addr = strtoul(string_token, NULL, 16); + string_token = strtok(NULL,":"); + remote_buffer_size = strtoul(string_token, NULL, 16); + string_token = strtok(NULL,":"); + remote_buffer_key = strtoul(string_token, NULL, 16); + + /* SEND ACK */ + memset(buffer, 0, 256); + char *p = buffer; + int n; + n = sprintf(buffer, "ACK"); + + completed_transfers_lock.lock(); + while( !completed_transfers.empty() ) { + p += n; + uint64_t addr = completed_transfers.front(); + completed_transfers.pop(); + n = sprintf(p,":0x%lx", addr); + } + completed_transfers_lock.unlock(); + + dbg_connection("SEND ACK RDMA remote buffer addr = %s\n", buffer); + n = write(client_sockfd, buffer, strlen(buffer)); + dbg_unused(n); + dbg_connection("SEND %d bytes\n", n); + + std::future future = std::async(&Server::get_remote_buffer, this, + remote_buffer_addr, + remote_buffer_size, + remote_buffer_key); + futures.push_back(move(future)); + } + return 0; +} diff --git a/src/HodServer.hpp b/src/HodServer.hpp new file mode 100644 index 0000000..a8836c9 --- /dev/null +++ b/src/HodServer.hpp @@ -0,0 +1,26 @@ +// Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. + +#include "Ibverbs.hpp" +#include +#include +#include +#include + +class Server { + public: + Server(); + int init(std::string addr_str); + int start(); + uint64_t get_remote_buffer(uint64_t remote_buffer_addr, size_t remote_buffer_size, + uint32_t remote_buffer_key); + private: + Ibverbs ibv; + int client_sockfd; + + std::queue completed_transfers; + std::mutex completed_transfers_lock; +}; + + diff --git a/src/HodServerApp.cpp b/src/HodServerApp.cpp new file mode 100644 index 0000000..f2d1b9e --- /dev/null +++ b/src/HodServerApp.cpp @@ -0,0 +1,27 @@ +// Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +// (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +// Government retains certain rights in this software. + +#include "HodServer.hpp" +#include + +int main(int argc, char **argv) +{ + const char *env_var_name = "IBVERBS_CLIENT_IP_ADDR"; + + Server *server = new Server; + printf("Server STARTing!\n"); + fflush(stdout); + + char *client_addr = getenv(env_var_name); + if( client_addr == nullptr ) { + printf("ERROR: environment variable (%s) not defined\n", env_var_name); + return -1; + } + std::string client_addr_str(client_addr); + + server->init(client_addr_str); + server->start(); + printf("Server COMPLETE!\n"); + return 0; +} diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..5fc5a89 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,64 @@ +# Copyright 2022 National Technology & Engineering Solutions of Sandia, LLC +# (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. +# Government retains certain rights in this software. + +.PHONY: clean-host clean-bf + +#CFLAGS := -Wall -Werror -g -O0 +CFLAGS := -Wall -fPIC -g -O0 -std=c++11 + +CC := gcc +CXX := g++ +LD := g++ + + +LDFLAGS := ${LDFLAGS} -libverbs + +APPNAMES := hod_server_app hod_client_app +LIBNAMES := libhodserver.so libhodclient.so + +HOSTBUILD := build-host/ +HOST_TARGET := host +BFBUILD := build-bf/ +BF_TARGET := bf + +all: + @echo "USE with 'make host' or 'make bf'" + +$(info BUILD objdir target) +ifeq ("$(MAKECMDGOALS)", "host") + OBJDIR := $(HOSTBUILD) +else ifeq ("$(MAKECMDGOALS)", "bf") + OBJDIR := $(BFBUILD) +endif +$(shell if [ ! -z $(OBJDIR) ] && [ ! -e $(OBJDIR) ]; then mkdir $(OBJDIR); fi;) + +APPS := $(addprefix $(OBJDIR),$(APPNAMES)) +LIBS := $(addprefix $(OBJDIR),$(LIBNAMES)) +$(info APPS = $(APPS)) + +host: ${LIBS} ${APPS} +bf: ${LIBS} ${APPS} + +%.o: ../%.cpp + $(CXX) $(CFLAGS) -c $< -o $@ + +$(OBJDIR)hod_server_app: $(OBJDIR)HodServerApp.o $(OBJDIR)libhodserver.so + ${LD} -o $@ $< ${LDFLAGS} -L$(OBJDIR) -lhodserver + +$(OBJDIR)libhodserver.so: $(OBJDIR)HodServer.o $(OBJDIR)Ibverbs.o + ${LD} -shared -o $@ $^ ${LDFLAGS} + +$(OBJDIR)libhodclient.so: $(OBJDIR)HodClient.o $(OBJDIR)Ibverbs.o + ${LD} -shared -o $@ $^ ${LDFLAGS} + +$(OBJDIR)hod_client_app: $(OBJDIR)HodClientApp.o $(OBJDIR)libhodclient.so + ${LD} -o $@ $< ${LDFLAGS} -L$(OBJDIR) -lhodclient + +print-% : ; @echo $* = $($*) + +clean-host: + rm -rf $(HOSTBUILD) + +clean-bf: + rm -rf $(BFBUILD) diff --git a/src/README b/src/README new file mode 100644 index 0000000..17959e3 --- /dev/null +++ b/src/README @@ -0,0 +1,3 @@ +To build on the host : "make host" --> uses build-host as the build directory + +To build on the BF : "make bf" --> uses build-bf as the build directory