diff --git a/scripts/adq/adq_setup.sh b/scripts/adq/adq_setup.sh new file mode 100755 index 000000000..1beb2a687 --- /dev/null +++ b/scripts/adq/adq_setup.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# https://blog.cloudflare.com/how-to-achieve-low-latency/ +# https://null.53bits.co.uk/index.php?page=numa-and-queue-affinity + + + +ICE_PATH= # +iface=eth0 +num_instance=24 +port=12300 +num_queues_tc0=2 +num_queues_tc1=32 +num_queues_tc2=32 +tc1offset=${num_queues_tc0} +# ipaddr=10.181.156.119 +host=$(hostname) +ipaddr=$(ifconfig|grep "inet "|grep broadcast|awk '{print $2}') + + +general_tuning() +{ + # disable firewall + service firewalld stop; systemctl mask firewalld + + # Enable latency-performance tuned profile + tuned-adm profile latency-performance + # check profile + cat /etc/tuned/active_profile + + # Set the CPU scaling governor to performance mode + x86_energy_perf_policy --hwp-enable 2>/dev/null + x86_energy_perf_policy performance + + # change ulimit + grep "hard nofile 65536" /etc/security/limits.conf + status=$? + if [[ "$status" == 1 ]]; then + echo "modify limits.conf" + echo -e '* hard nofile 65536'"\n"'* soft nofile 65536' | sudo tee -a /etc/security/limits.conf + fi + + + sysctl -w net.core.busy_poll=50000 + sysctl -w net.core.busy_read=50000 + sysctl -w net.core.somaxconn=4096 + sysctl -w net.core.netdev_max_backlog=8192 + sysctl -w net.ipv4.tcp_max_syn_backlog=16384 + sysctl -w net.core.rmem_max=16777216 + sysctl -w net.core.wmem_max=16777216 + sysctl -w net.ipv4.tcp_mem="764688 1019584 16777216" + sysctl -w net.ipv4.tcp_rmem="8192 87380 16777216" + sysctl -w net.ipv4.tcp_wmem="8192 65536 16777216" + sysctl -w net.ipv4.route.flush=1 + + # echo 800000 | sudo tee /proc/sys/net/ipv4/tcp_max_orphans + + + # Stop the irqbalance service. (Needed for interface interrupt affinity settings.) 
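+    # Optional verification sketch: read back the keys just set so it is easy to
+    # confirm on a given host that the busy-poll and buffer tuning actually took
+    # (the list simply mirrors the sysctl -w calls above).
+    for key in net.core.busy_poll net.core.busy_read net.core.somaxconn \
+               net.core.rmem_max net.core.wmem_max; do
+        echo "$key = $(sysctl -n $key)"
+    done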
+ systemctl stop irqbalance + echo kernel.numa_balancing=0 | sudo tee -a /etc/sysctl.conf + sysctl -p +} + +offline_cores() { + # assume we only keep 25 cores + # cat /proc/cpuinfo | grep -e processor -e "core id" -e "physical id" + lscpu + nproc=`nproc` + start=$((num_instance+1)) + end=$((nproc-1)) + for i in `seq ${start} ${end}`; do + echo 0 | sudo tee /sys/devices/system/cpu/cpu${i}/online + done +} + +turnoff_service() { + # turn off most background jobs to avoid interference +} + + +non_adq() +{ + # set wilson attributes to disable IRQ affinity + loony -H $host -d server set attribute irqaffinity:false + + # check current queues + ethtool -l $iface + + # find IRQ for the nic + ls /sys/class/net/$iface/device/msi_irqs/ + + # spread processing evenly between first 25 RX queues, and disable the other queues + sudo ethtool -X $iface equal 25 + # check the queue binding + sudo ethtool -x $iface + + + # IRQ affinity + let CPU=0 + cd /sys/class/net/$iface/device/msi_irqs/ || exit 1 + for IRQ in *; do + cat /proc/irq/$IRQ/smp_affinity_list; + echo $CPU > /proc/irq/$IRQ/smp_affinity_list + ((CPU+=1)) + done + + # Flow steering: this is used for initial mapping (connection start) + for ((i=0; i < num_queues_tc1+1; i++)); do + ethtool --config-ntuple $iface flow-type tcp4 dst-port $((port + i)) action $i + sleep 0.5 + done + + # receive flow steering, this is used for continuous packet steering + echo 3276800 > /proc/sys/net/core/rps_sock_flow_entries + for f in /sys/class/net/$iface/queues/rx-*/rps_flow_cnt; do + echo 32768 > $f; + done + + # Mellanox, turn on accelerated receive flow steering + ethtool -K $iface ntuple on + + # xps + ${ICE_PATH}/PROCGB/Linux/ice-1.0.4/scripts/set_xps_rxqs $iface + +} + + + + + +adq() +{ + modprobe ice + + # Enable hardware TC offload on the interface and turn off lldp + ethtool -K $iface hw-tc-offload on + ethtool --set-priv-flags $iface fw-lldp-agent off + + # verify settings + ethtool -k $iface | grep "hw-tc" + ethtool --show-priv-flags $iface + + read -s -n 1 -p "check hw-tc-offload is on before continue" + + /opt/iproute2/sbin/tc qdisc add dev $iface root mqprio num_tc 3 map 0 1 2 queues $num_queues_tc0@0 $num_queues_tc1@$num_queues_tc0 $num_queues_tc2@$((num_queues_tc0 + num_queues_tc1)) hw 1 mode channel + sleep 8 + /opt/iproute2/sbin/tc qdisc add dev $iface ingress + sleep 8 + + # create TC filters: one per pelikan instance + for ((i = 0; i < num_queues_tc1; i++)); do + /opt/iproute2/sbin/tc filter add dev $iface protocol ip ingress prio 1 flower dst_ip $ipaddr/32 ip_proto tcp dst_port $((port + i)) skip_sw hw_tc 1 + done + sleep 8 + + # check filter + /opt/iproute2/sbin/tc qdisc show dev $iface + /opt/iproute2/sbin/tc qdisc show dev $iface ingress + + # Set the interrupt moderation rate to a static value for Tx and turn off interrupt moderation for Rx + ethtool --coalesce ${iface} adaptive-rx off rx-usecs 0 + ethtool --coalesce ${iface} adaptive-tx off tx-usecs 500 + sleep 1 + + # config Intel Ethernet Flow Director (so that no two threads busy poll the same queue) + ethtool --features $iface ntuple on + ethtool --set-priv-flags $iface channel-inline-flow-director off + sleep 1 + + for ((i=0; i < num_queues_tc1; i++)); do + ethtool --config-ntuple $iface flow-type tcp4 dst-port $((port + i)) action $(((i % num_queues_tc1) + tc1offset)) + sleep 0.5 + done + + # check whether it is set correctly + ethtool --show-ntuple $iface + read -s -n 1 -p "check tc before continue" + + # Run the set_irq_affinity script for all interfaces + 
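+    # Optional check (a sketch): with -s, recent kernels/iproute2 print per-filter
+    # packet counters and an in_hw flag for flower rules, which confirms that the
+    # skip_sw filters added above were actually accepted by the NIC.
+    /opt/iproute2/sbin/tc -s filter show dev $iface ingress
+    # Once clients connect, the per-queue busy-poll counters (the same ones the
+    # watch helper greps) should increase on the TC1 queues:
+    # ethtool -S $iface | grep busy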
${ICE_PATH}/PROCGB/Linux/ice-1.0.4/scripts/set_irq_affinity -X all $iface + # Configure symmetric queues on the interface + ${ICE_PATH}/PROCGB/Linux/ice-1.0.4/scripts/set_xps_rxqs $iface + + + # create cgroup + sudo yum -y install libcgroup libcgroup-tools + cgroup_name="app_tc1" + cgcreate -g net_prio:${cgroup_name} + cgset -r net_prio.ifpriomap="$iface 1" ${cgroup_name} + + cat /sys/fs/cgroup/net_prio/${cgroup_name}/net_prio.ifpriomap + cat /sys/fs/cgroup/net_prio/${cgroup_name}/tasks + + # cgexec -g net_prio:${cgroup_name} --sticky $command +} + + +cleanup() { + # clean up + jobs -p | xargs kill &> /dev/null + cgdelete -g net_prio:${cgroup_name} +} + + +watch() { + watch -d -n 0.5 "ethtool -S $iface | grep busy | column" +} + + +general_tuning +offline_cores +turnoff_service + +# non_adq +adq + diff --git a/scripts/adq/adq_tuning.sh b/scripts/adq/adq_tuning.sh new file mode 100755 index 000000000..c85fce2db --- /dev/null +++ b/scripts/adq/adq_tuning.sh @@ -0,0 +1,232 @@ +#!/bin/bash + +# https://blog.cloudflare.com/how-to-achieve-low-latency/ +# https://null.53bits.co.uk/index.php?page=numa-and-queue-affinity + + + +ICE_PATH=/home/junchengy/adq/ice/ +iface=eth0 +num_instance=24 +port=12300 +num_queues_tc0=2 +num_queues_tc1=32 +num_queues_tc2=32 +tc1offset=${num_queues_tc0} +# ipaddr=10.181.156.119 +host=$(hostname) +ipaddr=$(ifconfig|grep "inet "|grep broadcast|awk '{print $2}') + + +general_tuning() +{ + # disable firewall + service firewalld stop; systemctl mask firewalld + + # Enable latency-performance tuned profile + tuned-adm profile latency-performance + # check profile + cat /etc/tuned/active_profile + + # Set the CPU scaling governor to performance mode + x86_energy_perf_policy --hwp-enable 2>/dev/null + x86_energy_perf_policy performance + + # change ulimit + grep "hard nofile 65536" /etc/security/limits.conf + status=$? + if [[ "$status" == 1 ]]; then + echo "modify limits.conf" + echo -e '* hard nofile 65536'"\n"'* soft nofile 65536' | sudo tee -a /etc/security/limits.conf + fi + + + sysctl -w net.core.busy_poll=50000 + sysctl -w net.core.busy_read=50000 + sysctl -w net.core.somaxconn=4096 + sysctl -w net.core.netdev_max_backlog=8192 + sysctl -w net.ipv4.tcp_max_syn_backlog=16384 + sysctl -w net.core.rmem_max=16777216 + sysctl -w net.core.wmem_max=16777216 + sysctl -w net.ipv4.tcp_mem="764688 1019584 16777216" + sysctl -w net.ipv4.tcp_rmem="8192 87380 16777216" + sysctl -w net.ipv4.tcp_wmem="8192 65536 16777216" + sysctl -w net.ipv4.route.flush=1 + + # echo 800000 | sudo tee /proc/sys/net/ipv4/tcp_max_orphans + + + # Stop the irqbalance service. (Needed for interface interrupt affinity settings.) 
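+    # The tunables above are applied with sysctl -w only; apart from
+    # kernel.numa_balancing (persisted below) they are lost on reboot. A sketch for
+    # keeping them across reboots, assuming the distro reads drop-ins from
+    # /etc/sysctl.d (the file name 90-adq-tuning.conf is an arbitrary choice):
+    # printf 'net.core.busy_poll=50000\nnet.core.busy_read=50000\n' | \
+    #     sudo tee /etc/sysctl.d/90-adq-tuning.conf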
+ systemctl stop irqbalance + echo kernel.numa_balancing=0 | sudo tee -a /etc/sysctl.conf + sysctl -p +} + +offline_cores() { + # assume we only keep 25 cores + # cat /proc/cpuinfo | grep -e processor -e "core id" -e "physical id" + lscpu + nproc=`nproc` + start=$((num_instance+1)) + end=$((nproc-1)) + for i in `seq ${start} ${end}`; do + echo 0 | sudo tee /sys/devices/system/cpu/cpu${i}/online + done +} + +turnoff_service() { + sudo puppet-util setbranch off + + sudo systemctl stop rezolus + sudo systemctl stop monit + sudo systemctl stop ntpdate + sudo systemctl stop osqueryd + sudo systemctl stop pcscd + sudo systemctl stop rsyslog + sudo systemctl stop scribe + sudo systemctl stop splunk + sudo systemctl stop tricorder + sudo systemctl stop tss-host-daemon + sudo systemctl stop twitcher + sudo systemctl stop twitterfw + sudo systemctl stop vexd + sudo systemctl stop fleetexec-server + sudo systemctl stop absorber.tss.production.host-daemon.13399.service +} + + +non_adq() +{ + # set wilson attributes to disable IRQ affinity + loony -H $host -d server set attribute irqaffinity:false + + # check current queues + ethtool -l $iface + + # find IRQ for the nic + ls /sys/class/net/$iface/device/msi_irqs/ + + # spread processing evenly between first 25 RX queues, and disable the other queues + sudo ethtool -X $iface equal 25 + # check the queue binding + sudo ethtool -x $iface + + + # IRQ affinity + let CPU=0 + cd /sys/class/net/$iface/device/msi_irqs/ || exit 1 + for IRQ in *; do + cat /proc/irq/$IRQ/smp_affinity_list; + echo $CPU > /proc/irq/$IRQ/smp_affinity_list + ((CPU+=1)) + done + + # Flow steering: this is used for initial mapping (connection start) + for ((i=0; i < num_queues_tc1+1; i++)); do + ethtool --config-ntuple $iface flow-type tcp4 dst-port $((port + i)) action $i + sleep 0.5 + done + + # receive flow steering, this is used for continuous packet steering + echo 3276800 > /proc/sys/net/core/rps_sock_flow_entries + for f in /sys/class/net/$iface/queues/rx-*/rps_flow_cnt; do + echo 32768 > $f; + done + + # Mellanox, turn on accelerated receive flow steering + ethtool -K $iface ntuple on + + # xps + ${ICE_PATH}/PROCGB/Linux/ice-1.0.4/scripts/set_xps_rxqs $iface + +} + + + + + +adq() +{ + modprobe ice + + # Enable hardware TC offload on the interface and turn off lldp + ethtool -K $iface hw-tc-offload on + ethtool --set-priv-flags $iface fw-lldp-agent off + + # verify settings + ethtool -k $iface | grep "hw-tc" + ethtool --show-priv-flags $iface + + read -s -n 1 -p "check hw-tc-offload is on before continue" + + /opt/iproute2/sbin/tc qdisc add dev $iface root mqprio num_tc 3 map 0 1 2 queues $num_queues_tc0@0 $num_queues_tc1@$num_queues_tc0 $num_queues_tc2@$((num_queues_tc0 + num_queues_tc1)) hw 1 mode channel + sleep 8 + /opt/iproute2/sbin/tc qdisc add dev $iface ingress + sleep 8 + + # create TC filters: one per pelikan instance + for ((i = 0; i < num_queues_tc1; i++)); do + /opt/iproute2/sbin/tc filter add dev $iface protocol ip ingress prio 1 flower dst_ip $ipaddr/32 ip_proto tcp dst_port $((port + i)) skip_sw hw_tc 1 + done + sleep 8 + + # check filter + /opt/iproute2/sbin/tc qdisc show dev $iface + /opt/iproute2/sbin/tc qdisc show dev $iface ingress + + # Set the interrupt moderation rate to a static value for Tx and turn off interrupt moderation for Rx + ethtool --coalesce ${iface} adaptive-rx off rx-usecs 0 + ethtool --coalesce ${iface} adaptive-tx off tx-usecs 500 + sleep 1 + + # config Intel Ethernet Flow Director (so that no two threads busy poll the same queue) + ethtool 
--features $iface ntuple on + ethtool --set-priv-flags $iface channel-inline-flow-director off + sleep 1 + + for ((i=0; i < num_queues_tc1; i++)); do + ethtool --config-ntuple $iface flow-type tcp4 dst-port $((port + i)) action $(((i % num_queues_tc1) + tc1offset)) + sleep 0.5 + done + + # check whether it is set correctly + ethtool --show-ntuple $iface + read -s -n 1 -p "check tc before continue" + + # Run the set_irq_affinity script for all interfaces + ${ICE_PATH}/PROCGB/Linux/ice-1.0.4/scripts/set_irq_affinity -X all $iface + # Configure symmetric queues on the interface + ${ICE_PATH}/PROCGB/Linux/ice-1.0.4/scripts/set_xps_rxqs $iface + + + # create cgroup + sudo yum -y install libcgroup libcgroup-tools + cgroup_name="app_tc1" + cgcreate -g net_prio:${cgroup_name} + cgset -r net_prio.ifpriomap="$iface 1" ${cgroup_name} + + cat /sys/fs/cgroup/net_prio/${cgroup_name}/net_prio.ifpriomap + cat /sys/fs/cgroup/net_prio/${cgroup_name}/tasks + + # cgexec -g net_prio:${cgroup_name} --sticky $command +} + + +cleanup() { + # clean up + jobs -p | xargs kill &> /dev/null + cgdelete -g net_prio:${cgroup_name} +} + + +watch() { + watch -d -n 0.5 "ethtool -S $iface | grep busy | column" +} + + +general_tuning +offline_cores +turnoff_service + +# non_adq +adq diff --git a/scripts/adq/analyaze.py b/scripts/adq/analyaze.py new file mode 100644 index 000000000..6a0dcce83 --- /dev/null +++ b/scripts/adq/analyaze.py @@ -0,0 +1,211 @@ +import os, sys +import argparse +import re +import glob +from collections import namedtuple +from pprint import pprint +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +from matplotlib.ticker import MaxNLocator + + +sys.path.append(os.path.expanduser("~/")) +sys.path.append(os.path.expanduser("~/myworkspace/")) + + +Config = namedtuple("Config", ["thrpt", "nconn", "item_sz"]) +LatencyRes = namedtuple('LatencyRes', + ["thrpt", "conn", "P25", "P50", "P75", "P90", "P99", "P999", "P9999"]) + +n_instance = 1 +n_host = 24 +FACTOR = n_instance * n_host + + +def parse_data(data_path): + result_started = False + regex_conn = re.compile(r"Connections: Attempts: (?P\d+) Opened: (?P\d+) Errors: (?P\d+) Timeouts: (?P\d+) Open: (?P\d+)") + regex_rps = re.compile(r"Rate: Request: (?P[0-9.]+) rps Response: (?P[0-9.]+) rps Connect: (?P[0-9.]+) cps") + regex_lat = re.compile(r"Request Latency \(us\): p25: (?P\d+) p50: (?P\d+) p75: (?P\d+) p90: (?P\d+) p99: (?P\d+) p999: (?P\d+) p9999: (?P\d+)") + + thrpt, conn, P25, P50, P75, P90, P99, P999, P9999 = 0, 0, 0, 0, 0, 0, 0, 0, 0 + with open(data_path) as ifile: + for line in ifile: + if not result_started and "Window: 2" not in line: + continue + if "Window: 2" in line: + result_started = True + continue + if "Connections" in line: + m = regex_conn.search(line) + conn = int(m["open"]) + continue + if "Rate:" in line: + m = regex_rps.search(line) + thrpt = float(m["req"]) / 1000000 + continue + if "Request Latency " in line: + m = regex_lat.search(line) + if m is None: + print("{}\n{}".format(data_path, line)) + else: + P25, P50, P75, P90, P99, P999, P9999 = \ + int(m["P25"]), int(m["P50"]), int(m["P75"]), int(m["P90"]),\ + int(m["P99"]), int(m["P999"]), int(m["P9999"]) + continue + + return LatencyRes(thrpt=thrpt * n_instance * n_host, conn=conn * n_host, + P25=P25, P50=P50, P75=P75, P90=P90, P99=P99, P999=P999, P9999=P9999) + +def print_data(data_path): + print("{:8} {:8} {:8} {:8} {:8} {:8} {:8} {:8} {:8} {:8}".format( + "P25", "P50", "P75", "P90", "P99", "P999", "P9999", "MQPS", + "connection", "item size" + )) + for thrpt 
in (0.5, 1, 2, ): + for item_size in (64, 4096): + for nconn in (100, 500, 1000, 2000, 5000, 10000, ): + configs = [] + for f in glob.glob("{}/rpcperf_{}_{}_{}*".format(data_path, thrpt, nconn, item_size)): + configs.append(parse_data(f)) + # print(configs[-1], f.split("/")[-1]) + if len(configs) == 0: + print("{:8} {:8} {:8} {:8} {:8} {:8} {:8} {:8.4} {:8} {:8}".format( + "noData", "noData", "noData", "noData", "noData", "noData", + "noData", "noData", "noData", "noData", + )) + else: + # print("{:8} {:8} {:8} {:8} {:8} {:8} {:8} {:8.4} {:8} {:8}".format( + # np.mean(sorted([config.P25 for config in configs])[:]).astype(int), + # np.mean(sorted([config.P50 for config in configs])[:]).astype(int), + # np.mean(sorted([config.P75 for config in configs])[:]).astype(int), + # np.mean(sorted([config.P90 for config in configs])[:]).astype(int), + # np.mean(sorted([config.P99 for config in configs])[:]).astype(int), + # np.mean(sorted([config.P999 for config in configs])[:]).astype(int), + # np.mean(sorted([config.P9999 for config in configs])[:]).astype(int), + # np.mean(sorted([config.thrpt for config in configs])[:]), nconn, item_size + # )) + print("{:8} {:8} {:8} {:8} {:8} {:8} {:8} {:8.4} {:8} {:8}".format( + np.median(sorted([config.P25 for config in configs])[:]).astype(int), + np.median(sorted([config.P50 for config in configs])[:]).astype(int), + np.median(sorted([config.P75 for config in configs])[:]).astype(int), + np.median(sorted([config.P90 for config in configs])[:]).astype(int), + np.median(sorted([config.P99 for config in configs])[:]).astype(int), + np.median(sorted([config.P999 for config in configs])[:]).astype(int), + np.median(sorted([config.P9999 for config in configs])[:]).astype(int), + np.median(sorted([config.thrpt for config in configs])[:]), nconn, item_size + )) + + print() + + +def get_data(data_path, host): + data_dict = {} + for f in os.listdir(data_path): + _, thrpt, nconn, item_sz, instance_idx, other = f.split("_") + _, _, curr_host = other.split(".") + if int(float(thrpt)) == 16 or curr_host != host: + continue + conf = Config(thrpt=float(thrpt), nconn=int(nconn), item_sz=int(item_sz)) + lat = parse_data("{}/{}".format(data_path, f)) + data_dict[conf] = lat + + pprint(data_dict) + return data_dict + +def plot_lat(data_path_no_adq, data_path_adq): + host = "hostname" + data_dict_no_adq = get_data(data_path_no_adq, host) + data_dict_adq = get_data(data_path_adq, host) + conn_counts = (100, 500, 1000, 2000, 5000, 10000, 25000) + x_ticks1 = [2*i+1.2 for i in range(len(conn_counts))] + x_ticks2 = [2*i+1.8 for i in range(len(conn_counts))] + + for thrpt in (0.5, 1, 2, ): + for item_sz in (64, 4096): + P50_na, P90_na, P99_na, P999_na, P9999_na = [], [], [], [], [] + P50_ad, P90_ad, P99_ad, P999_ad, P9999_ad = [], [], [], [], [] + for nconn in conn_counts: + conf = Config(float(thrpt), nconn=nconn, item_sz=item_sz) + if conf not in data_dict_no_adq: + P50_na.append(0) + P90_na.append(0) + P99_na.append(0) + P999_na.append(0) + else: + lat = data_dict_no_adq[conf] + P50_na.append(lat.P50) + P90_na.append(lat.P90) + P99_na.append(lat.P99) + P999_na.append(lat.P999) + P9999_na.append(lat.P9999) + + if conf not in data_dict_adq: + P50_ad.append(0) + P90_ad.append(0) + P99_ad.append(0) + P999_ad.append(0) + else: + lat = data_dict_adq[conf] + P50_ad.append(lat.P50) + P90_ad.append(lat.P90) + P99_ad.append(lat.P99) + P999_ad.append(lat.P999) + P9999_ad.append(lat.P9999) + + P50_na, P90_na, P99_na, P999_na, P9999_na = np.array(P50_na), np.array(P90_na), np.array(P99_na), 
np.array(P999_na), np.array(P9999_na) + P50_ad, P90_ad, P99_ad, P999_ad, P9999_ad = np.array(P50_ad), np.array(P90_ad), np.array(P99_ad), np.array(P999_ad), np.array(P9999_ad) + + plt.bar(x_ticks1, P50_na, width=0.48, hatch="/", color="red", alpha=0.64, edgecolor='white') + plt.bar(x_ticks1, P90_na, width=0.48, hatch="\\", color="green", alpha=0.64, bottom=P50_na, edgecolor='white') + plt.bar(x_ticks1, P99_na, width=0.48, hatch="*", color="blue", alpha=0.64, bottom=P50_na+P90_na, edgecolor='white') + plt.bar(x_ticks1, P999_na, width=0.48, hatch="o", color="grey", alpha=0.64, bottom=P50_na+P90_na+P99_na, edgecolor='white') + + plt.bar(x_ticks2, P50_ad, width=0.48, hatch="/", color="red", alpha=0.64, edgecolor='white') + plt.bar(x_ticks2, P90_ad, width=0.48, hatch="\\", color="green", alpha=0.64, bottom=P50_ad, edgecolor='white') + plt.bar(x_ticks2, P99_ad, width=0.48, hatch="*", color="blue", alpha=0.64, bottom=P50_ad+P90_ad, edgecolor='white') + plt.bar(x_ticks2, P999_ad, width=0.48, hatch="o", color="grey", alpha=0.64, bottom=P50_ad+P90_ad+P99_ad, edgecolor='white') + + max_y = max(np.max(P999_na), np.max(P999_ad)) + if max_y > 1e5: + yticks = (100, 1000, 5000, 10000, 100000) + elif max_y > 1e4: + yticks = (100, 1000, 2000, 5000, 10000) + elif max_y > 1e3: + yticks = (100, 500, 1000, 2000, 5000) + else: + yticks = (100, 200, 500, 1000, 2000) + plt.legend(["P50", "P90", "P99", "P999"], ncol=4) + plt.yscale("log") + plt.xticks(x_ticks1, conn_counts) + plt.yticks(yticks, yticks) + plt.grid(linestyle="--") + plt.ylabel("latency ($\mu$s)") + plt.xlabel("Connection count") + plt.title("Throughput {:.1f} M QPS, item size {}".format(thrpt, item_sz)) + plt.savefig("lat_{}_{}.png".format(thrpt, item_sz)) + plt.clf() + +def used(): + plot_lat("rpcperf_log_mellanox_40Gbps_tuning", "rpcperf_log_mellanox_adq3", ) + plot_lat("rpcperf_log_mellanox_adq3", "rpcperf_log_no_adq1") + plot_lat("rpcperf_log_no_adq_newfirmware", "rpcperf_log_adq_newfirmware") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=""" + parse rpcperf results and print/plot the results + """) + parser.add_argument('--mode', dest='mode', type=str, help='print or plot', default="print", ) + parser.add_argument('--data', dest='data', type=str, help='path to data folder', required=True) + parser.add_argument('--data2', dest='data2', type=str, help='path to data2 folder, required to plotting', required=False) + + args = parser.parse_args() + + if args.mode == "print": + print_data(args.data) + elif args.mode == "plot": + plot_lat(args.data, args.data2) + else: + parser.print_help() diff --git a/scripts/adq/client_config.py b/scripts/adq/client_config.py new file mode 100755 index 000000000..441bf948f --- /dev/null +++ b/scripts/adq/client_config.py @@ -0,0 +1,138 @@ +import argparse +from math import ceil +import os +import sys +import subprocess +import re + + +PREFIX = 'loadgen' +PELIKAN_ITEM_OVERHEAD = {"twemcache": 48, "segcache": 8, "slimcache": 0} +PELIKAN_SERVER_PORT = 12300 + + +def generate_config(rate, connections, ksize, vsize, mem_bytes, get_weight, set_weight, threads, backend): +# create rpcperf.toml + item_overhead = PELIKAN_ITEM_OVERHEAD[backend] + nkey = int(ceil(0.20 * mem_bytes / (ksize + vsize + item_overhead))) + conn_per_thread = int(connections / threads) + + config_str = ''' +[general] +clients = {threads} +poolsize = {connections} # this specifies number of connection per thread +# runtime ~= windows x duration +windows = {window} +interval = 60 +request_ratelimit = {rate} +soft_timeout 
= true + +tcp_nodelay = false +connect_ratelimit = 100 +connect_timeout = 2000000 +request_timeout = 100000 + + +[[keyspace]] +length = {ksize} +count = {nkey} +weight = 1 +commands = [ + {{action = "get", weight = {get_weight}}}, + {{action = "set", weight = {set_weight}}}, +] +values = [ + {{length = {vsize}, weight = 1}}, +]'''.format(threads=threads, connections=conn_per_thread, window=connections//6000+3, nkey=nkey, rate=rate, + ksize=ksize, vsize=vsize, get_weight=get_weight, set_weight=set_weight) + + with open('rpcperf.toml', 'w') as the_file: + the_file.write(config_str) + +def get_hw_thd_idx(): + regex = re.compile(r"NUMA node(?P\d+) CPU\(s\): *(?P\d+)-(?P\d+),\d+-\d+") + thd_idx_pair = [] + p = subprocess.run("lscpu|grep NUMA", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout = p.stdout.decode().split("\n") + numa_n = 1 + for line in stdout: + if "NUMA node(s)" in line: + numa_n = int(line.split(":")[1]) + m = regex.search(line) + if m: + thd_idx_pair.append((int(m.group("thd_start")), int(m.group("thd_end")))) + assert numa_n == len(thd_idx_pair), "{} {}".format(numa_n, thd_idx_pair) + + thd_idx = [] + for p in thd_idx_pair: + for i in range(p[0], p[1]+1): + thd_idx.append(i) + return thd_idx + + +def generate_runscript(binary, server_ip, instances): + # create test.sh + fname = 'test.sh' + with open(fname, 'w') as the_file: + # for i in range(instances): + # server_port = PELIKAN_SERVER_PORT + i + # the_file.write('ulimit -n 65536; ') + # the_file.write('{binary_file} --config {config_file}'.format( + # binary_file=binary, config_file='rpcperf.toml')) + # the_file.write(' --endpoint {server_ip}:{server_port}'.format( + # server_ip=server_ip, server_port=server_port)) + # the_file.write(' > rpcperf_{server_port}_{instance_idx}.log'.format( + # server_port=server_port, instance_idx=i)) + # the_file.write(' 2>&1 &\n') + + + server_port = PELIKAN_SERVER_PORT + the_file.write('ulimit -n 65536; ') + the_file.write('{binary_file} --config {config_file}'.format(binary_file=binary, config_file='rpcperf.toml')) + the_file.write(' --endpoint {server_ip}:{server_port}'.format( + server_ip=server_ip, server_port=server_port)) + the_file.write(' > rpcperf_{server_port}_{instance_idx}.log'.format( + server_port=server_port, instance_idx=0)) + the_file.write(' 2>&1 &\n') + + the_file.write(""" +sleep 100; +nrunning=1 +while [ $nrunning -gt 0 ] +do + nrunning=$(pgrep -c rpc-perf) + echo "$(date) $(hostname): $nrunning clients are still running" + sleep 10 +done +""") + os.chmod(fname, 0o777) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=""" + Generate all the client-side scripts/configs needed for a test run. 
+ """) + parser.add_argument('--binary', dest='binary', type=str, help='location of rpc-perf binary', required=True) + parser.add_argument('--backend', dest='backend', type=str, help='backend', required=True) + parser.add_argument('--prefix', dest='prefix', type=str, default=PREFIX, help='folder that contains all the other files to be generated') + parser.add_argument('--instances', dest='instances', type=int, help='number of instances', required=True) + parser.add_argument('--server_ip', dest='server_ip', type=str, help='server ip', required=True) + parser.add_argument('--rate', dest='rate', type=int, help='request rate per instance', required=True) + parser.add_argument('--connections', dest='connections', type=int, help='number of connections per instance', required=True) + parser.add_argument('--ksize', dest='ksize', type=int, help='key size', required=True) + parser.add_argument('--vsize', dest='vsize', type=int, help='value size', required=True) + parser.add_argument('--mem_bytes', dest='mem_bytes', type=int, help='memory size', required=True) + parser.add_argument('--get_weight', dest='get_weight', type=int, help='get weight (0-10)', required=True) + parser.add_argument('--set_weight', dest='set_weight', type=int, help='set weight (0-10)', required=True) + parser.add_argument('--threads', dest='threads', type=int, help='number of worker threads per rpc-perf', required=True) + + args = parser.parse_args() + + if not os.path.exists(args.prefix): + os.makedirs(args.prefix) + os.chdir(args.prefix) + + generate_config(args.rate, args.connections, + args.ksize, args.vsize, args.mem_bytes, + args.get_weight, args.set_weight, args.threads, args.backend) + generate_runscript(args.binary, args.server_ip, args.instances) diff --git a/scripts/adq/generate.sh b/scripts/adq/generate.sh new file mode 100755 index 000000000..215e39038 --- /dev/null +++ b/scripts/adq/generate.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +source params.sh > /dev/null + +show_help() { + echo 'generate.sh [-c [-r absolute/path/to/rpcperf] [-t server_ip]] [-s [-p absolute/path/to/pelikan]] [-m "path/to/pmem0 path/to/pmem1 ..."]' + echo 'Note that the first pmem path is bound to the first numa node, the second path is bound to the next numa node. One or more paths can be provided.' +} + +get_args() { + while getopts ":p:r:t:m:csh" opt; do + case "$opt" in + c) + client=true + ;; + s) + server=true + ;; + p) + pelikan=$OPTARG + ;; + r) + rpcperf=$OPTARG + ;; + t) + server_ip=$OPTARG + ;; + m) + pmem_paths=($OPTARG) + ;; + h) + show_help + exit 0 + ;; + \?) 
+ echo "unrecognized option $opt" + show_help + exit 1 + ;; + esac + done +} + +# pelikan configs +gen_pelikan() { + for size in "${size_configs[@]}"; do + vsize=$((size - ksize)) + prefix=pelikan_${size} + python3 server_config.py --prefix="$prefix" --binary="$pelikan_binary" --instances="$pelikan_instances" --mem_bytes "$mem_bytes" --ksize "$ksize" --vsize "$vsize" --use_adq $use_adq --worker_binding True --pmem_paths ${pmem_paths[@]} + done +} + +# rpc-perf configs +gen_rpcperf() { + backend=${pelikan_binary##*_} + for thrpt in "${thrpt_configs[@]}"; do + nhost=${#CLIENTS[@]} + rate=$(echo "${thrpt} * 1000000 / ${rpcperf_instances} / ${nhost}" | bc) + for conn in "${conn_configs[@]}"; do + conn_per_instance=$conn + conn_per_instance=$(echo "${conn} * ${pelikan_instances} / ${rpcperf_instances} / ${nhost}" | bc) +# conn_per_instance=$(echo "$conn / ${nhost} + 0.5" | bc -l) +# conn_per_instance=${conn_per_instance%.*} + for size in "${size_configs[@]}"; do + vsize=$((size - ksize)) + prefix=rpcperf_${thrpt}_${conn}_${size} + python3 client_config.py --prefix="$prefix" --backend="$backend" --binary="$rpcperf_binary" --server_ip="$server_ip" --instances="$rpcperf_instances" --rate="$rate" --connections="$conn_per_instance" --ksize="$ksize" --vsize "$vsize" --mem_bytes="$mem_bytes" --get_weight="${get_weight}" --set_weight="${set_weight}" --threads="${rpcperf_threads}" + done + done + done +} + +set -e +get_args "${@}" +if [ "$client" = true ]; then + gen_rpcperf +fi +if [ "$server" = true ]; then + gen_pelikan +fi + +set +e \ No newline at end of file diff --git a/scripts/adq/params.sh b/scripts/adq/params.sh new file mode 100755 index 000000000..bd3cd15ad --- /dev/null +++ b/scripts/adq/params.sh @@ -0,0 +1,28 @@ + +# per pelikan instance +export conn_configs=(100 500 1000 2000 5000 10000) +export mem_bytes=$((4 * 1024 * 1024 * 1024)) +export size_configs=(64 4096) +export thrpt_configs=(0.5 1 2) # M QPS +export pelikan_instances=24 +# export rpcperf_instances=24 # per host +export rpcperf_instances=1 # per host +export ksize=32 +export get_weight=9 +export set_weight=1 +export rpcperf_threads=20 + +export pmem_paths=() +export use_adq=1 + +export SERVER= +export CLIENTS=() + + +export pelikan_binary="$HOME/pelikan_twemcache" +export rpcperf_binary="$HOME/rpc-perf" + +rm hosts 2>/dev/null +for c in ${CLIENTS[@]}; do + echo $c >> hosts; +done diff --git a/scripts/adq/run.sh b/scripts/adq/run.sh new file mode 100755 index 000000000..ce2f683a6 --- /dev/null +++ b/scripts/adq/run.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# client is remote or localhost +# server is localhost + + +source params.sh > /dev/null +trap "echo ${pelikan_binary##*/}; sudo pkill ${pelikan_binary##*/}; exit" SIGHUP SIGINT SIGTERM + +setup() +{ + sudo rm -r /tmp/pelikan* 2>/dev/null + sudo rm -r rpcperf_* pelikan_* 2>/dev/null + sudo pkill -9 -f pelikan + mkdir rpcperf_log 2>/dev/null; + echo "${#CLIENTS[@]} clients" + pssh -O StrictHostKeyChecking=no -h hosts "pkill -9 rpc-perf 2>/dev/null; rm -rf /tmp/rpcperf* 2>/dev/null; sleep 2" + pscp.pssh -h hosts "${rpcperf_binary}" $HOME/; +} + +gen_conf() +{ + ./generate.sh -c -r $HOME/rpc-perf -s -p $HOME/pelikan_twemcache -t "${SERVER}" +} + +run_all_tests() +{ + for server_conf in pelikan_*; do + item_size=${server_conf##*_} + for client_conf in rpcperf_*_"${item_size}"; do + echo -e "####### start ${server_conf} \t----- ${client_conf} \t#######" + if [ -f "rpcperf_log/${client_conf}_12300_0.log" ]; then + echo skip ${client_conf} + continue + fi + ./run_one_test.sh -s 
"${server_conf}" -c "${client_conf}" -m "${CLIENTS}" + sleep 80 + done + done + tar cvf rpcperf_log.tar.gz rpcperf_log +} + + +setup +gen_conf +run_all_tests + +tput bel +date diff --git a/scripts/adq/run_one_test.sh b/scripts/adq/run_one_test.sh new file mode 100755 index 000000000..2dc1ba3c8 --- /dev/null +++ b/scripts/adq/run_one_test.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +source params.sh > /dev/null + +show_help() { + echo "runtest.sh -c -s -m <\"one or more client ips\">" +} + +get_args() { + while getopts ":c:s:m:h" opt; do + case "$opt" in + c) + client_config=$OPTARG + ;; + s) + server_config=$OPTARG + ;; + m) + clients=($OPTARG) + ;; + h) + show_help + exit 0 + ;; + \?) + echo "unrecognized option $opt" + show_help + exit 1 + ;; + esac + done +} + +server_launch() { + init_dir=$(pwd) + folder=${server_config##*/} + + sudo pkill -f ${pelikan_binary} 2>/dev/null + rm -rf "/tmp/${folder}" 2>/dev/null + + cp -r "${server_config}" /tmp/ + cd "/tmp/$folder" && ./warm-up.sh + cd "${init_dir}" || exit 1 +} + +client_run() { + folder=${client_config##*/} + nClient=${#CLIENTS[@]} + for i in $(seq 0 $((nClient-1))); do + client=${CLIENTS[$i]} + sleep 0.5 + ( + scp -rq "${client_config}" "$client:/tmp/" + port=$(echo "12300+ (${i} % ${pelikan_instances})" | bc) + echo client "$i $client - $client_config" port $port + ssh -q "$client" -tt "cd /tmp/${folder} && sed -i 's/12300/${port}/g' test.sh && ./test.sh; " + scp -q "${client}:/tmp/${folder}/rpcperf_${port}_0.log" rpcperf_log/${folder}_${port}_0.log.${client} +# for i in $(seq 0 $((rpcperf_instances-1))); do +# port=$((12300 + i)) +# scp -q "${client}:/tmp/${folder}/rpcperf_${port}_${i}.log" rpcperf_log/${folder}_${port}_${i}.log.${client} +# done + ) & + done + wait +} + +cleanup() { + sudo pkill -f ${pelikan_binary} +} + + +trap "cleanup; exit" SIGHUP SIGINT SIGTERM + +get_args "${@}" +server_launch +client_run +cleanup diff --git a/scripts/adq/server_config.py b/scripts/adq/server_config.py new file mode 100644 index 000000000..da0b39524 --- /dev/null +++ b/scripts/adq/server_config.py @@ -0,0 +1,200 @@ +import argparse +from math import ceil, floor, log +import os +import subprocess +import sys + +PREFIX = 'test' +PELIKAN_ADMIN_PORT = 9900 +PELIKAN_SERVER_PORT = 12300 +PELIKAN_ITEM_OVERHEAD = {"twemcache": 48, "segcache": 8, "slimcache": 0} +NUMA_NODE = 0 + +def generate_config(instances, ksize, vsize, mem_bytes, pmem_paths, engine, worker_binding): + # create top-level folders under prefix + try: + os.makedirs('config') + except: + pass + try: + os.makedirs('log') + except: + pass + + item_overhead = PELIKAN_ITEM_OVERHEAD[engine] + item_size = ksize + vsize + item_overhead + # because segcache does not perform in-place update + # we need to reduce the #keys + nkey = int(ceil(0.20 * mem_bytes / item_size)) + hash_power = int(ceil(log(nkey, 2))) + + # create twemcache|slimcache|segcache config file(s) + for i in range(instances): + admin_port = PELIKAN_ADMIN_PORT + i + server_port = PELIKAN_SERVER_PORT + i + config_file = '{engine}-{server_port}.config'.format(engine=engine, server_port=server_port) + + # String with common options for both twemcache, segcache and slimcache + config_str = """\ +daemonize: yes +admin_port: {admin_port} +server_port: {server_port} + +admin_tw_cap: 2000 + +buf_init_size: 4160 + +buf_sock_poolsize: 65536 + +debug_log_level: 4 +debug_log_file: log/{engine}-{server_port}.log +debug_log_nbuf: 1048576 + +prefill: yes +prefill_ksize: {ksize} +prefill_vsize: {vsize} +prefill_nkey: {nkey} + +request_poolsize: 65536 
+response_poolsize: 131072 + +time_type: 2 +""".format(admin_port=admin_port, server_port=server_port, ksize=ksize, vsize=vsize, nkey=nkey, engine=engine) + + # String with options specific for either twemcache or slimcache + pmem_path_str = "" + worker_pinning_str = "" + if engine == "slimcache": + datapool_param = "cuckoo_datapool" + engine_str = """\ + +cuckoo_item_size: {item_size} +cuckoo_nitem: {nkey} +cuckoo_datapool_prefault: yes +""".format(item_size=item_size, nkey=nkey) + elif engine == "twemcache": + datapool_param = "slab_datapool" + engine_str = """\ + +slab_evict_opt: 1 +slab_prealloc: yes +slab_hash_power: {hash_power} +slab_mem: {mem_bytes} +slab_size: 1048576 +slab_datapool_prefault: yes + +stats_intvl: 10000 +stats_log_file: log/twemcache-{server_port}.stats +""".format(hash_power=hash_power, mem_bytes=mem_bytes, server_port=server_port) + elif engine == "segcache": + datapool_param = "seg_datapool_path" + engine_str = """\ + +seg_evict_opt: 1 +seg_hash_power: {hash_power} +seg_mem: {mem_bytes} +seg_size: 1048576 + +stats_intvl: 10000 +stats_log_file: log/segcache-{server_port}.stats +""".format(hash_power=hash_power, mem_bytes=mem_bytes, server_port=server_port) + else: + raise RuntimeError("unknown binary {}".format(engine)) + + if worker_binding: + worker_pinning_str = """ + +worker_binding_core: {worker_binding_core} +""".format(worker_binding_core=i+1) + + # String with option specific for PMEM usage + if len(pmem_paths) > 0: + pmem_path_str = """\ + +{datapool_param}: {pmem_path} +""".format(datapool_param=datapool_param, pmem_path=os.path.join(pmem_paths[i%len(pmem_paths)], 'pool_{}'.format(server_port))) + + # Put it all together + config_str = config_str + engine_str + pmem_path_str + worker_pinning_str + with open(os.path.join('config', config_file),'w') as the_file: + the_file.write(config_str) + +def generate_runscript(binary, instances, pmem_paths_count, engine, use_adq, worker_binding): + # create bring-up.sh + fname = 'bring-up.sh' + numa_node_count = pmem_paths_count if pmem_paths_count > 0 else 1 + with open(fname, 'w') as the_file: + the_file.write("ulimit -n 65536;\n") + for i in range(instances): + config_file = os.path.join('config', '{engine}-{server_port}.config'.format(engine=engine, server_port=PELIKAN_SERVER_PORT+i)) + if not worker_binding: + the_file.write('sudo numactl --cpunodebind={numa_node} --preferred={numa_node} '.format( + numa_node=i%numa_node_count)) + + if use_adq: + the_file.write("sudo cgexec -g net_prio:{cgroup_name} --sticky ".format(cgroup_name="app_tc1")) + + the_file.write('sudo {binary_file} {config_file} > server.log 2>&1 \n'.format( + binary_file=binary, config_file=config_file)) + os.chmod(fname, 0o777) + + # create warm-up.sh + fname = 'warm-up.sh' + prefill_opt = "" + if engine == "slimcache": + prefill_opt = "prefilling cuckoo" + elif engine == "twemcache": + prefill_opt = "prefilling slab" + elif engine == "segcache": + prefill_opt = "prefilling seg" + + with open(fname, 'w') as the_file: + the_file.write(""" +./bring-up.sh + +nready=0 +while [ $nready -lt {instances} ] +do + nready=$(grep -l "{prefill_opt}" log/{engine}-*.log | wc -l) + echo "$(date): $nready out of {instances} servers are warmed up" + sleep 10 +done +""".format(instances=instances, prefill_opt=prefill_opt, engine=engine)) + os.chmod(fname, 0o777) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=""" + Generate all the server-side scripts/configs needed for a test run. 
+    """)
+    parser.add_argument('--binary', dest='binary', type=str, help='location of pelikan_twemcache|pelikan_slimcache binary', required=True)
+    parser.add_argument('--prefix', dest='prefix', type=str, default=PREFIX, help='folder that contains all the other files to be generated')
+    parser.add_argument('--instances', dest='instances', type=int, help='number of instances')
+    parser.add_argument('--ksize', dest='ksize', type=int, help='key size')
+    parser.add_argument('--vsize', dest='vsize', type=int, help='value size')
+    parser.add_argument('--mem_bytes', dest='mem_bytes', type=int, help='total capacity of heap memory, in bytes')
+    parser.add_argument('--use_adq', dest='use_adq', default=False, type=str, help='whether to use adq')
+    parser.add_argument('--worker_binding', dest='worker_binding', default=True, type=bool, help='whether binding worker thread to cores')
+    parser.add_argument('--pmem_paths', dest='pmem_paths', nargs='*', help='list of pmem mount points')
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.prefix):
+        os.makedirs(args.prefix)
+    os.chdir(args.prefix)
+    print(os.getcwd(), args.prefix)
+
+    engine = ""
+    binary_help_out = subprocess.run([args.binary, '--help'], stdout=subprocess.PIPE).stdout.decode()
+    for e in ("twemcache", "segcache", "slimcache"):
+        if e in binary_help_out:
+            engine = e
+            break
+    if len(engine) == 0:
+        print('Provided binary is not twemcache|segcache|slimcache. Only these engines are valid. Exiting...')
+        print("binary help output: {}".format(binary_help_out))
+        sys.exit()
+
+    use_adq = True if args.use_adq == "1" or args.use_adq == "True" else False
+    generate_config(args.instances, args.ksize, args.vsize, args.mem_bytes, args.pmem_paths, engine, args.worker_binding)
+    generate_runscript(args.binary, args.instances, len(args.pmem_paths), engine, use_adq, args.worker_binding)
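+
+# Example invocation, mirroring what generate.sh passes in from params.sh
+# (paths and sizes are placeholders; substitute your own):
+#
+#   python3 server_config.py --prefix=pelikan_4096 --binary=$HOME/pelikan_twemcache \
+#       --instances=24 --mem_bytes 4294967296 --ksize 32 --vsize 4064 \
+#       --use_adq 1 --worker_binding True --pmem_paths
+#
+# Note that --worker_binding is declared with type=bool, so argparse converts any
+# non-empty string (including "False") to True; combined with default=True, worker
+# pinning is effectively always on unless an empty string is passed.
+#
+# The generated bring-up.sh/warm-up.sh are consumed by run_one_test.sh, and the
+# rpc-perf logs it collects are summarized with analyaze.py --mode print.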