Skip to content

Commit acc0c20

Browse files
committed
bypass network stack for local sockets
1 parent c2215cf commit acc0c20

File tree

5 files changed

+101
-98
lines changed

5 files changed

+101
-98
lines changed

docs/localhost-bypass-stack.rst

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ What is it?
66
A TCP socket will go through all the phases of TCP/IP packet processing even
77
when the remote peer is a local TCP socket. The aim of this ebpf code is to
88
bypass this network stack processing for TCP client/server communication on
9-
local sockets. The sockmap_ primitive added to ebpf allows you to
10-
parse/redirect the traffic between the sockets.
9+
local sockets. The sockmap_ primitive added to ebpf allows one to parse/redirect
10+
the traffic between the sockets.
1111

12-
But why would anyone use TCP sockets locally?
12+
Why would anyone use TCP sockets locally?
1313
---------------------------------------------
1414
* Apps use TCP sockets for inter-process comm since their processes may be
1515
local or remote. By using TCP, their code works whether the other process is
@@ -24,7 +24,7 @@ But why would anyone use TCP sockets locally?
2424
primary/main container. This requires that in some cases (the TLS case
2525
mentioned) the traffic is proxied through the sidecar.
2626

27-
But why to bypass network stack?
27+
Why bypass the network stack?
2828
--------------------------------
2929
* Mainly for performance reasons. In a typical container env, a packet
3030
may be tossed locally between different sockets before the data finally
@@ -35,15 +35,24 @@ But why to bypass network stack?
3535
* sockmap_ integrates with kernel's strparser_ and allows one to parse, process, and
3636
update the payload before redirecting. Check: ktls-bpf_.
3737

38-
Any side-effects of bypassing?
38+
Side-effects of bypassing?
3939
------------------------------
4040
* Possibly. If you have tooling which depends on this traffic passing through
4141
network stack then yes. For example, you may have an iptables rule to do xyz.
4242
This xyz would not work anymore, since the data is directly tossed between
4343
the sockets.
4444

45-
Solution details
46-
================
45+
Solution details: How eBPF helps
46+
=================================
47+
48+
Use of sockmap
49+
--------------
50+
- Special SOCKMAP map
51+
- Ways to initialize this sockmap from userspace or kernelspace
52+
- Redirecting packets
53+
- Use of BPF_F_INGRESS flag
54+
- Parser and verdict eBPF functions
55+
Draw a picture to specify this
4756

4857
What are the pieces?
4958
--------------------

run_as_cgroupv2.sh

Lines changed: 19 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,7 @@ chktool timeout
1313
chktool bpftool
1414
chktool mountpoint
1515

16-
CGRP_MNT=/tmp/cgroupv2
17-
FOO=$CGRP_MNT/foo
18-
PINPT=/sys/fs/bpf/localbypass
19-
BO=bin/local-socket-bypass-sockops-kern.bo
16+
CGRP_MNT=/mnt/cgroup-test-work-dir
2017
BPF_MNT=/sys/fs/bpf
2118

2219
trap 'unload "$BASH_COMMAND"' EXIT
@@ -26,13 +23,17 @@ unload()
2623
[[ $? -ne 0 ]] && echo "Failed: [$1]"
2724
echo "unloading..."
2825
set +e
29-
bpftool cgroup detach $FOO sock_ops pinned $PINPT 2>/dev/null
30-
umount $CGRP_MNT
31-
[[ -d "$CGRP_MNT" ]] && rm -rf $CGRP_MNT
32-
umount $BPF_MNT
26+
27+
if [ -d "$CGRP_MNT" ]; then
28+
mountpoint $CGRP_MNT >/dev/null
29+
[[ $? -eq 0 ]] && umount $CGRP_MNT
30+
[[ -d "$CGRP_MNT" ]] && rm -rf $CGRP_MNT
31+
fi
32+
33+
mountpoint $BPF_MNT >/dev/null
34+
[[ $? -eq 0 ]] && umount $BPF_MNT
3335
}
3436

35-
# Inspired by: https://github.com/torvalds/linux/blob/master/samples/bpf/tcp_bpf.readme
3637
load()
3738
{
3839
mountpoint $BPF_MNT >/dev/null
@@ -41,28 +42,21 @@ load()
4142
mkdir -p $CGRP_MNT
4243
mountpoint $CGRP_MNT >/dev/null
4344
[[ $? -ne 0 ]] && mount -t cgroup2 none $CGRP_MNT
44-
45-
set -eE -o functrace
46-
mkdir -p $FOO
47-
echo $$ >> $FOO/cgroup.procs
48-
bpftool prog load $BO $PINPT
49-
bpftool cgroup attach $FOO sock_ops pinned $PINPT
50-
sleep 1
51-
./bin/local-socket-bypass-user.bin &
52-
bypass_pid=$!
5345
}
5446

55-
load
47+
#load
48+
49+
./bin/local-socket-bypass-user.bin ./run_localsock.sh
5650

5751
### Start of Action
5852

59-
python -m SimpleHTTPServer 12345 &
60-
srv_pid=$!
61-
sleep 1
62-
curl http://localhost:12345/
53+
#python -m SimpleHTTPServer 12345 &
54+
#srv_pid=$!
55+
#sleep 1
56+
#curl http://localhost:12345/
6357
#curl http://www.baidu.com
64-
kill $srv_pid $bypass_pid
58+
#kill $srv_pid $bypass_pid
6559

6660
### End of Action
6761

68-
timeout --preserve-status 1s bpftool prog tracelog #show whatever logs were captured
62+
#timeout --preserve-status 1s bpftool prog tracelog #show whatever logs were captured

run_localsock.sh

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,36 @@
11
#!/bin/bash
22

3-
chktool()
3+
# This is called from src/local-socket-bypass-user.c
4+
5+
trap 'cleanup' EXIT
6+
7+
cleanup()
48
{
5-
[[ $(which "$1") == "" ]] && echo "need util $1" && exit 2
9+
[[ "$srv_pid" != "" ]] && kill $srv_pid
610
}
711

8-
chktool curl
9-
chktool python
10-
chktool timeout
11-
chktool bpftool
12-
chktool mountpoint
12+
workload1()
13+
{
14+
python -m SimpleHTTPServer 12345 &
15+
srv_pid=$!
16+
sleep 1
17+
curl http://localhost:12345/
18+
#curl http://www.ietf.org/
19+
}
20+
21+
workload2()
22+
{
23+
echo "TOCLIXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" | nc -l 127.0.0.1 12345 &
24+
srv_pid=$!
25+
sleep 0.5
26+
i=1
27+
while [ $i -le 20 ]; do
28+
echo "TOSRV$i"
29+
sleep 0.2
30+
((i++))
31+
done | nc -w 1 127.0.0.1 12345
32+
}
1333

14-
python -m SimpleHTTPServer 12345 &
15-
srv_pid=$!
16-
sleep 1
17-
curl http://localhost:12345/
18-
kill $srv_pid
34+
workload2
1935

2036
timeout --preserve-status 1s bpftool prog tracelog #show whatever logs were captured

src/local-socket-bypass-kern.c

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,46 +8,62 @@
88
*/
99

1010
#include <uapi/linux/bpf.h>
11+
#include <linux/in.h>
1112
#include <linux/version.h>
1213

1314
#include "bpf_helpers.h"
1415

16+
#define SRV_IDX 0
17+
#define CLI_IDX 1
18+
1519
struct bpf_map_def SEC("maps") skmap = {
1620
.type = BPF_MAP_TYPE_SOCKMAP,
1721
.key_size = sizeof(int),
1822
.value_size = sizeof(unsigned int),
1923
.max_entries = 2,
2024
};
2125

22-
#if 1
2326
SEC("sk_skb1")
2427
int prog1(struct __sk_buff *skb)
2528
{
26-
bpf_printk("parser called\n");
29+
bpf_printk("lport=%d parser called %d\n", skb->local_port, skb->len);
2730
return skb->len;
2831
}
2932

3033
SEC("sk_skb2")
3134
int prog2(struct __sk_buff *skb)
3235
{
33-
uint32_t idx = 0;
34-
bpf_printk("verdict called\n");
35-
return bpf_sk_redirect_map(skb, &skmap, idx, 0);
36+
int verdict;
37+
__u32 lport = skb->local_port;
38+
uint32_t idx = CLI_IDX;
39+
40+
if (lport == 12345) {
41+
idx = SRV_IDX;
42+
}
43+
verdict = bpf_sk_redirect_map(skb, &skmap, idx, BPF_F_INGRESS);
44+
bpf_printk("lport=%d verdict %d\n", lport, skb->len);
45+
return verdict;
3646
}
37-
#endif
3847

3948
SEC("sockops")
4049
int sock_map_update(struct bpf_sock_ops *ops)
4150
{
42-
int op;
43-
op = (int) ops->op;
51+
__u32 lport = ops->local_port;
52+
__u32 rport = ops->remote_port;
53+
__u32 lip = ops->local_ip4;
54+
int op = (int) ops->op;
55+
56+
if (lip != 0x100007f) {
57+
return 0;
58+
}
4459

4560
if (op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB || op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) {
46-
uint32_t idx = 0;
47-
int ret;
48-
bpf_printk("Calling UPDATE: loc=0x%x:%d rem=0x%x\n",
49-
ops->local_ip4, ops->local_port, ops->remote_ip4);
50-
ret = bpf_sock_map_update(ops, &skmap, &idx, BPF_ANY);
61+
int idx = CLI_IDX, ret;
62+
if (lport == 12345) {
63+
idx = SRV_IDX;
64+
}
65+
bpf_printk("%d UPDATE: lport=%d\n", idx, lport);
66+
ret = bpf_sock_map_update(ops, &skmap, &idx, BPF_NOEXIST);
5167
if (ret) {
5268
bpf_printk("FAILED bpf_sock_map_update ret=%d\n", ret);
5369
}

src/local-socket-bypass-user.c

Lines changed: 9 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -17,41 +17,6 @@
1717
#define ERROR(...) { printf(__VA_ARGS__); }
1818
#define INFO(...) { printf(__VA_ARGS__); }
1919

20-
#if 0
21-
void load_prog(void)
22-
{
23-
int err, parse_prog, verdict_prog, sockops_prog;
24-
struct bpf_object *obj;
25-
struct bpf_map *skmap;
26-
27-
err = bpf_prog_load("bin/local-socket-bypass-sockops-kern.bo",
28-
BPF_PROG_TYPE_SOCK_OPS, &obj, &sockops_prog);
29-
if (err) FATAL("sockops bpf_prog_load failed err:%d\n", err);
30-
31-
skmap = bpf_object__find_map_by_name(obj, "skmap");
32-
if (!skmap) FATAL("cud not find skmap\n");
33-
34-
g_skmap_fd = bpf_map__fd(skmap);
35-
if (g_skmap_fd < 0) FATAL("Could not get the stat map\n");
36-
37-
/* Load parser, get skmap fd and attach fd to parser */
38-
err = bpf_prog_load("bin/local-socket-bypass-parse-kern.bo",
39-
BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog);
40-
if (err) FATAL("parse bpf_prog_load failed err:%d\n", err);
41-
42-
err = bpf_prog_attach(parse_prog, g_skmap_fd, BPF_SK_SKB_STREAM_PARSER, 0);
43-
if (err) FATAL("bpf_prog_attach failed err=%d\n", err);
44-
45-
/* Load verdict, skmap fd we already have and attach fd to verdict */
46-
err = bpf_prog_load("bin/local-socket-bypass-verdict-kern.bo",
47-
BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog);
48-
if (err) FATAL("verdict bpf_prog_load failed err:%d\n", err);
49-
50-
err = bpf_prog_attach(verdict_prog, g_skmap_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
51-
if (err) FATAL("bpf_prog_attach failed err=%d\n", err);
52-
}
53-
#endif
54-
5520
int prog_attach_type[] = {
5621
BPF_SK_SKB_STREAM_PARSER,
5722
BPF_SK_SKB_STREAM_VERDICT,
@@ -79,7 +44,7 @@ void load_prog(void)
7944
int ret;
8045
int i = 0;
8146

82-
obj = bpf_object__open("bin/local-socket-bypass-sockops-kern.bo");
47+
obj = bpf_object__open("bin/local-socket-bypass-kern.bo");
8348
err = libbpf_get_error(obj);
8449
if (err) {
8550
char err_buf[256];
@@ -105,12 +70,10 @@ void load_prog(void)
10570
g_skmap_fd = bpf_map__fd(skmap);
10671
if (g_skmap_fd < 0) FATAL("cud not find map\n");
10772

108-
ret = bpf_prog_attach(prog_fd[0], g_skmap_fd,
109-
BPF_SK_SKB_STREAM_PARSER, 0);
73+
ret = bpf_prog_attach(prog_fd[0], g_skmap_fd, BPF_SK_SKB_STREAM_PARSER, 0);
11074
if (ret) FATAL("attach sockmap to parser failed\n");
11175

112-
ret = bpf_prog_attach(prog_fd[1], g_skmap_fd,
113-
BPF_SK_SKB_STREAM_VERDICT, 0);
76+
ret = bpf_prog_attach(prog_fd[1], g_skmap_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
11477
if (ret) FATAL("attach sockmap to verdict failed\n");
11578

11679
ret = bpf_prog_attach(prog_fd[2], cgrp_fd, BPF_CGROUP_SOCK_OPS, 0);
@@ -122,7 +85,8 @@ int main(int argc, char *argv[])
12285
{
12386
int ret;
12487

125-
if (argc < 2) FATAL("Usage: %s <script-to-exec>\n", argv[0]);
88+
if (argc < 2)
89+
FATAL("Usage: %s <script-to-exec, ./run_localsock.sh>\n", argv[0]);
12690

12791
if (setup_cgroup_environment()) FATAL("setup cgrp failed\n");
12892
cgrp_fd = create_and_get_cgroup(CGRP_PATH);
@@ -132,6 +96,10 @@ int main(int argc, char *argv[])
13296
load_prog();
13397
ret = system(argv[1]);
13498
INFO("%s ret=%d\n", argv[1], ret);
99+
100+
bpf_prog_detach2(prog_fd[2], cgrp_fd, BPF_CGROUP_SOCK_OPS);
101+
bpf_prog_detach2(prog_fd[0], g_skmap_fd, BPF_SK_SKB_STREAM_PARSER);
102+
bpf_prog_detach2(prog_fd[1], g_skmap_fd, BPF_SK_SKB_STREAM_VERDICT);
135103

136104
cleanup_cgroup_environment();
137105
close(cgrp_fd);

0 commit comments

Comments
 (0)