Skip to content

Commit

Permalink
feat: support to maintain udp conn state (#493)
Browse files Browse the repository at this point in the history
  • Loading branch information
mzz2017 authored Apr 8, 2024
1 parent 7defd23 commit 605f005
Show file tree
Hide file tree
Showing 12 changed files with 201 additions and 55 deletions.
1 change: 1 addition & 0 deletions .clang-format
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: false
RemoveBracesLLVM: true

# Taken from:
# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/kernel-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:
strategy:
fail-fast: false
matrix:
kernel: [ '5.10-20240305.092417', '5.15-20240305.092417', '6.1-20240305.092417', '6.6-20240305.092417' ]
kernel: [ '5.15-20240305.092417', '6.1-20240305.092417', '6.6-20240305.092417' ]
timeout-minutes: 10
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ node_modules/
*.log
.build_tags
.checkpatch-camelcase.git.
venv
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,6 @@ ebpf: submodule clean-ebpf
go generate ./trace/trace.go && echo trace > $(BUILD_TAGS_FILE) || echo > $(BUILD_TAGS_FILE)

ebpf-lint:
./scripts/checkpatch.pl --no-tree --strict --no-summary --show-types --color=always control/kern/tproxy.c --ignore COMMIT_COMMENT_SYMBOL,NOT_UNIFIED_DIFF,COMMIT_LOG_LONG_LINE,LONG_LINE_COMMENT,VOLATILE,ASSIGN_IN_IF,PREFER_DEFINED_ATTRIBUTE_MACRO,CAMELCASE,LEADING_SPACE,OPEN_ENDED_LINE,SPACING
./scripts/checkpatch.pl --no-tree --strict --no-summary --show-types --color=always control/kern/tproxy.c --ignore COMMIT_COMMENT_SYMBOL,NOT_UNIFIED_DIFF,COMMIT_LOG_LONG_LINE,LONG_LINE_COMMENT,VOLATILE,ASSIGN_IN_IF,PREFER_DEFINED_ATTRIBUTE_MACRO,CAMELCASE,LEADING_SPACE,OPEN_ENDED_LINE,SPACING,BLOCK_COMMENT_STYLE

## End Ebpf
1 change: 1 addition & 0 deletions common/consts/ebpf.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ var (
ProgTypeSkLookupFeatureVersion = internal.Version{5, 9, 0}
SockmapFeatureVersion = internal.Version{5, 10, 0}
UserspaceBatchUpdateLpmTrieFeatureVersion = internal.Version{5, 13, 0}
BpfTimerFeatureVersion = internal.Version{5, 15, 0}
HelperBpfGetFuncIpVersionFeatureVersion = internal.Version{5, 15, 0}
)

Expand Down
2 changes: 1 addition & 1 deletion control/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ func NewControlPlane(
kernelVersion.String(),
requirement.String())
}
if requirement := consts.SockmapFeatureVersion; len(global.WanInterface) > 0 && kernelVersion.Less(requirement) {
if requirement := consts.BpfTimerFeatureVersion; len(global.WanInterface) > 0 && kernelVersion.Less(requirement) {
return nil, fmt.Errorf("your kernel version %v does not support bind to WAN; expect >=%v; remove wan_interface in config file and try again",
kernelVersion.String(),
requirement.String())
Expand Down
30 changes: 30 additions & 0 deletions control/control_plane_core.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,36 @@ func (c *controlPlaneCore) _bindWan(ifname string) error {
return nil
})

filterIngress := &netlink.BpfFilter{
FilterAttrs: netlink.FilterAttrs{
LinkIndex: link.Attrs().Index,
Parent: netlink.HANDLE_MIN_INGRESS,
Handle: netlink.MakeHandle(0x2023, 0b010+uint16(c.flip)),
Protocol: unix.ETH_P_ALL,
Priority: 1,
},
Fd: c.bpf.bpfPrograms.TproxyWanIngress.FD(),
Name: consts.AppName + "_wan_ingress",
DirectAction: true,
}
_ = netlink.FilterDel(filterIngress)
// Remove and add.
if !c.isReload {
// Clean up thoroughly.
filterIngressFlipped := deepcopy.Copy(filterIngress).(*netlink.BpfFilter)
filterIngressFlipped.FilterAttrs.Handle ^= 1
_ = netlink.FilterDel(filterIngressFlipped)
}
if err := netlink.FilterAdd(filterIngress); err != nil {
return fmt.Errorf("cannot attach ebpf object to filter ingress: %w", err)
}
c.deferFuncs = append(c.deferFuncs, func() error {
if err := netlink.FilterDel(filterIngress); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("FilterDel(%v:%v): %w", ifname, filterIngress.Name, err)
}
return nil
})

return nil
}

Expand Down
2 changes: 1 addition & 1 deletion control/kern/headers
Submodule headers updated 1 files
+40 −0 bpf_timer.h
205 changes: 159 additions & 46 deletions control/kern/tproxy.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "headers/bpf_core_read.h"
#include "headers/bpf_endian.h"
#include "headers/bpf_helpers.h"
#include "headers/bpf_timer.h"

// #define __DEBUG_ROUTING
// #define __PRINT_ROUTING_RESULT
Expand Down Expand Up @@ -76,6 +77,8 @@

#define ESOCKTNOSUPPORT 94 /* Socket type not supported */

#define TIMEOUT_UDP_CONN_STATE 3e11 /* 300s */

#define NDP_REDIRECT 137

enum { BPF_F_CURRENT_NETNS = -1 };
Expand Down Expand Up @@ -320,7 +323,8 @@ struct port_range {
*
* domain(geosite:cn, suffix: google.com) && l4proto(tcp) -> my_group
*
* pseudocode: domain(geosite:cn || suffix:google.com) && l4proto(tcp) -> my_group
* pseudocode: domain(geosite:cn || suffix:google.com) && l4proto(tcp) ->
* my_group
*
* A match_set can be: IP set geosite:cn, suffix google.com, tcp proto
*/
Expand Down Expand Up @@ -383,6 +387,19 @@ struct {
__uint(pinning, LIBBPF_PIN_BY_NAME);
} cookie_pid_map SEC(".maps");

struct udp_conn_state {
// pass

struct bpf_timer timer;
};

struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, MAX_DST_MAPPING_NUM);
__type(key, struct tuples_key);
__type(value, struct udp_conn_state);
} udp_conn_state_map SEC(".maps");

// Functions:

static __always_inline __u8 ipv4_get_dscp(const struct iphdr *iph)
Expand Down Expand Up @@ -564,7 +581,8 @@ parse_transport(const struct __sk_buff *skb, __u32 link_h_len,
__builtin_memset(udph, 0, sizeof(struct udphdr));

// bpf_printk("parse_transport: h_proto: %u ? %u %u", ethh->h_proto,
// bpf_htons(ETH_P_IP), bpf_htons(ETH_P_IPV6));
// bpf_htons(ETH_P_IP),
// bpf_htons(ETH_P_IPV6));
if (ethh->h_proto == bpf_htons(ETH_P_IP)) {
ret = bpf_skb_load_bytes(skb, offset, iph,
sizeof(struct iphdr));
Expand Down Expand Up @@ -1024,16 +1042,16 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);

/*
* ip rule add fwmark 0x8000000/0x8000000 table 2023
* ip route add local default dev lo table 2023
* ip -6 rule add fwmark 0x8000000/0x8000000 table 2023
* ip -6 route add local default dev lo table 2023
* ip rule del fwmark 0x8000000/0x8000000 table 2023
* ip route del local default dev lo table 2023
* ip -6 rule del fwmark 0x8000000/0x8000000 table 2023
* ip -6 route del local default dev lo table 2023
*/
* ip rule add fwmark 0x8000000/0x8000000 table 2023
* ip route add local default dev lo table 2023
* ip -6 rule add fwmark 0x8000000/0x8000000 table 2023
* ip -6 route add local default dev lo table 2023
* ip rule del fwmark 0x8000000/0x8000000 table 2023
* ip route del local default dev lo table 2023
* ip -6 rule del fwmark 0x8000000/0x8000000 table 2023
* ip -6 route del local default dev lo table 2023
*/
// Socket lookup and assign skb to existing socket connection.
struct bpf_sock_tuple tuple = { 0 };
__u32 tuple_size;
Expand Down Expand Up @@ -1119,12 +1137,14 @@ int tproxy_lan_ingress(struct __sk_buff *skb)
/// NOTICE: No pid pname info for LAN packet.
// // Maybe this packet is also in the host (such as docker) ?
// // I tried and it is false.
// __u64 cookie = bpf_get_socket_cookie(skb);
// struct pid_pname *pid_pname = bpf_map_lookup_elem(&cookie_pid_map,
// &cookie); if (pid_pname) {
// __builtin_memcpy(routing_result.pname, pid_pname->pname, TASK_COMM_LEN);
// routing_result.pid = pid_pname->pid;
// }
//__u64 cookie = bpf_get_socket_cookie(skb);
//struct pid_pname *pid_pname =
// bpf_map_lookup_elem(&cookie_pid_map, &cookie);
//if (pid_pname) {
// __builtin_memcpy(routing_result.pname, pid_pname->pname,
// TASK_COMM_LEN);
// routing_result.pid = pid_pname->pid;
//}

// Save routing result.
ret = bpf_map_update_elem(&routing_tuples_map, &tuples.five,
Expand Down Expand Up @@ -1208,28 +1228,113 @@ static __always_inline bool pid_is_control_plane(struct __sk_buff *skb,
if ((skb->mark & 0x100) == 0x100) {
bpf_printk("No pid_pname found. But it should not happen");
/*
* if (l4proto == IPPROTO_TCP) {
*if (tcph.syn && !tcph.ack) {
* bpf_printk("No pid_pname found. But it should not happen: local:%u "
* "(%u)[%llu]",
* bpf_ntohs(sport), l4proto, cookie);
*} else {
* bpf_printk("No pid_pname found. But it should not happen: (Old "
* "Connection): local:%u "
* "(%u)[%llu]",
* bpf_ntohs(sport), l4proto, cookie);
*}
* } else {
*bpf_printk("No pid_pname found. But it should not happen: local:%u "
* "(%u)[%llu]",
* bpf_ntohs(sport), l4proto, cookie);
* }
*/
* if (l4proto == IPPROTO_TCP) {
*if (tcph.syn && !tcph.ack) {
* bpf_printk("No pid_pname found. But it should not happen: local:%u "
* "(%u)[%llu]",
* bpf_ntohs(sport), l4proto, cookie);
*} else {
* bpf_printk("No pid_pname found. But it should not happen: (Old "
* "Connection): local:%u "
* "(%u)[%llu]",
* bpf_ntohs(sport), l4proto, cookie);
*}
* } else {
*bpf_printk("No pid_pname found. But it should not happen: local:%u "
* "(%u)[%llu]",
* bpf_ntohs(sport), l4proto, cookie);
* }
*/
return true;
}
return false;
}

static int refresh_udp_conn_state_timer_cb(void *_udp_conn_state_map,
struct tuples_key *key,
struct udp_conn_state *val)
{
bpf_map_delete_elem(&udp_conn_state_map, key);
return 0;
}

static __always_inline void copy_reversed_tuples(struct tuples_key *key,
struct tuples_key *dst)
{
__builtin_memset(dst, 0, sizeof(*dst));
dst->dip = key->sip;
dst->sip = key->dip;
dst->sport = key->dport;
dst->dport = key->sport;
dst->l4proto = key->l4proto;
}

static __always_inline int refresh_udp_conn_state_timer(struct tuples_key *key)
{
struct udp_conn_state new_output_state = { 0 };
int ret = bpf_map_update_elem(&udp_conn_state_map, key,
&new_output_state, BPF_ANY);
if (unlikely(ret))
return -EINVAL;
struct udp_conn_state *value =
bpf_map_lookup_elem(&udp_conn_state_map, key);
if (unlikely(!value))
return -EFAULT;

ret = bpf_timer_init(&value->timer, &udp_conn_state_map,
CLOCK_MONOTONIC);
if (unlikely(ret))
goto del;

ret = bpf_timer_set_callback(&value->timer,
refresh_udp_conn_state_timer_cb);
if (unlikely(ret))
goto del;

ret = bpf_timer_start(&value->timer, TIMEOUT_UDP_CONN_STATE, 0);
if (unlikely(ret))
goto del;

return 0;
del:
bpf_map_delete_elem(&udp_conn_state_map, key);
return -EFAULT;
}

SEC("tc/wan_ingress")
int tproxy_wan_ingress(struct __sk_buff *skb)
{
struct ethhdr ethh;
struct iphdr iph;
struct ipv6hdr ipv6h;
struct icmp6hdr icmp6h;
struct tcphdr tcph;
struct udphdr udph;
__u8 ihl;
__u8 l4proto;
__u32 link_h_len;

if (get_link_h_len(skb->ifindex, &link_h_len))
return TC_ACT_OK;
int ret = parse_transport(skb, link_h_len, &ethh, &iph, &ipv6h, &icmp6h,
&tcph, &udph, &ihl, &l4proto);
if (ret)
return TC_ACT_OK;
if (l4proto != IPPROTO_UDP)
return TC_ACT_PIPE;

struct tuples tuples;
struct tuples_key reversed_tuples_key;

get_tuples(skb, &tuples, &iph, &ipv6h, &tcph, &udph, l4proto);
copy_reversed_tuples(&tuples.five, &reversed_tuples_key);

if (refresh_udp_conn_state_timer(&reversed_tuples_key))
return TC_ACT_SHOT;

return TC_ACT_PIPE;
}

// Routing and redirect the packet back.
// We cannot modify the dest address here. So we cooperate with wan_ingress.
SEC("tc/wan_egress")
Expand All @@ -1239,7 +1344,7 @@ int tproxy_wan_egress(struct __sk_buff *skb)
if (skb->ingress_ifindex != NOWHERE_IFINDEX)
return TC_ACT_OK;
// if ((skb->mark & 0x80) == 0x80) {
// return TC_ACT_OK;
// return TC_ACT_OK;
// }

struct ethhdr ethh;
Expand Down Expand Up @@ -1401,6 +1506,13 @@ int tproxy_wan_egress(struct __sk_buff *skb)
flag[6] = tuples.dscp;
struct pid_pname *pid_pname;

if (bpf_map_lookup_elem(&udp_conn_state_map, &tuples.five)) {
if (refresh_udp_conn_state_timer(&tuples.five))
return TC_ACT_SHOT;

return TC_ACT_OK;
}

if (pid_is_control_plane(skb, &pid_pname)) {
// From control plane. Direct.
return TC_ACT_OK;
Expand Down Expand Up @@ -1491,20 +1603,21 @@ int tproxy_wan_egress(struct __sk_buff *skb)
SEC("tc/dae0peer_ingress")
int tproxy_dae0peer_ingress(struct __sk_buff *skb)
{
/* Only packets redirected from wan_egress or lan_ingress have this cb mark. */
/* Only packets redirected from wan_egress or lan_ingress have this cb mark.
*/
if (skb->cb[0] != TPROXY_MARK)
return TC_ACT_SHOT;

/* ip rule add fwmark 0x8000000/0x8000000 table 2023
* ip route add local default dev lo table 2023
*/
* ip route add local default dev lo table 2023
*/
skb->mark = TPROXY_MARK;
bpf_skb_change_type(skb, PACKET_HOST);

/* l4proto is stored in skb->cb[1] only for UDP and new TCP. As for
* established TCP, kernel can take care of socket lookup, so just
* return them to stack without calling bpf_sk_assign.
*/
* established TCP, kernel can take care of socket lookup, so just
* return them to stack without calling bpf_sk_assign.
*/
__u8 l4proto = skb->cb[1];

if (l4proto != 0)
Expand Down Expand Up @@ -1585,9 +1698,9 @@ static __always_inline int _update_map_elem_by_cookie(const __u64 cookie)
unsigned long arg_end = BPF_CORE_READ(current, mm, arg_end);

/*
* For string like: /usr/lib/sddm/sddm-helper --socket /tmp/sddm-auth1
* We extract "sddm-helper" from it.
*/
* For string like: /usr/lib/sddm/sddm-helper --socket /tmp/sddm-auth1
* We extract "sddm-helper" from it.
*/
unsigned long loc, j, last_slash = -1;
#pragma unroll
for (loc = 0, j = 0; j < MAX_ARG_LEN_TO_PROBE;
Expand All @@ -1609,7 +1722,7 @@ static __always_inline int _update_map_elem_by_cookie(const __u64 cookie)
(const void *)(arg_start + j));
if (ret) {
// bpf_printk("failed to read process name.0: [%ld, %ld]", arg_start,
// arg_end);
// arg_end);
// bpf_printk("_failed to read process name.0: %ld %ld", j, to_read);
return ret;
}
Expand Down
Loading

0 comments on commit 605f005

Please sign in to comment.