Skip to content

Commit

Permalink
bpf: fixup dst MAC for host networked nodeport backends
Browse files Browse the repository at this point in the history
Because routing happens based on the dest IP (which at that time is the
client's) before we get our hands on the packet at the HEP, the dest MAC is
likely different (within the same subnet) from the MAC of the node that
forwarded the nodeport traffic to us. Therefore, once we place the packet in
VXLAN and thus change the dest IP, we must fix up the dest MAC based on what
we recorded when we received the inbound packets.

This is a similar case to the MAC fixing for regular pods. The only
difference is that the src MAC is already set correctly if we are on the
right device.

We cannot (yet) handle the case where the routes to the original client
and to the node on the other side of the VXLAN tunnel use different
NICs. We would need to redirect the packet to the other NIC first.
  • Loading branch information
tomastigera committed Dec 10, 2020
1 parent f7878a5 commit 63351c0
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 5 deletions.
32 changes: 32 additions & 0 deletions bpf-gpl/tc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,38 @@ static CALI_BPF_INLINE struct fwd calico_tc_skb_accepted(struct __sk_buff *skb,
goto deny;

nat_encap:
/* We are about to encap return traffic that originated on the local host
* namespace - a host networked pod. Routing was based on the dst IP,
* which was the original client's IP at that time, not the node's that
* forwarded it. We need to fix it now.
*/
if (CALI_F_TO_HEP) {
struct arp_value *arpv;
struct arp_key arpk = {
.ip = state->ip_dst,
.ifindex = skb->ifindex,
};

arpv = cali_v4_arp_lookup_elem(&arpk);
if (!arpv) {
CALI_DEBUG("ARP lookup failed for %x dev %d at HEP\n",
bpf_ntohl(state->ip_dst), arpk.ifindex);
/* Don't drop it yet, we might get lucky and the MAC is correct */
} else {
if (skb_shorter(skb, sizeof(struct ethhdr))) {
reason = CALI_REASON_SHORT;
goto deny;
}
struct ethhdr *eth_hdr = (void *)(long)skb->data;
__builtin_memcpy(&eth_hdr->h_dest, arpv->mac_dst, ETH_ALEN);
if (state->ct_result.ifindex_fwd == skb->ifindex) {
/* No need to change src MAC, if we are at the right device */
} else {
/* FIXME we need to redirect to the right device */
}
}
}

if (vxlan_v4_encap(skb, state->ip_src, state->ip_dst)) {
reason = CALI_REASON_ENCAP_FAIL;
goto deny;
Expand Down
23 changes: 18 additions & 5 deletions bpf/ut/nat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -481,8 +481,8 @@ func TestNATNodePort(t *testing.T) {
arpKey := arp.NewKey(node1ip, 1 /* ifindex is always 1 in UT */)
Expect(arpMapN2).To(HaveKey(arpKey))
macDst := encapedPkt[0:6]
macScr := encapedPkt[6:12]
Expect(arpMapN2[arpKey]).To(Equal(arp.NewValue(macDst, macScr)))
macSrc := encapedPkt[6:12]
Expect(arpMapN2[arpKey]).To(Equal(arp.NewValue(macDst, macSrc)))

// try a spoofed tunnel packet, should be dropped and have no effect
runBpfTest(t, "calico_from_host_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) {
Expand Down Expand Up @@ -535,7 +535,7 @@ func TestNATNodePort(t *testing.T) {
// Response leaving workload at node 2
runBpfTest(t, "calico_from_workload_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) {
respPkt := udpResposeRaw(recvPkt)
// Change the MAC addressesso that we can observe that the right
// Change the MAC addresses so that we can observe that the right
// addresses were patched in.
copy(respPkt[:6], []byte{1, 2, 3, 4, 5, 6})
copy(respPkt[6:12], []byte{6, 5, 4, 3, 2, 1})
Expand All @@ -551,7 +551,7 @@ func TestNATNodePort(t *testing.T) {
ethR := ethL.(*layers.Ethernet)
Expect(ethR).To(layersMatchFields(&layers.Ethernet{
SrcMAC: macDst,
DstMAC: macScr,
DstMAC: macSrc,
EthernetType: layers.EthernetTypeIPv4,
}))

Expand Down Expand Up @@ -792,7 +792,11 @@ func TestNATNodePort(t *testing.T) {
runBpfTest(t, "calico_to_host_ep", rulesDefaultAllow, func(bpfrun bpfProgRunFn) {
respPkt := udpResposeRaw(recvPkt)

// No need to check MACs, no FIB, no forwarding, nopatching
// Change the MAC addresses so that we can observe that the right
// addresses were patched in.
macUntouched := []byte{6, 5, 4, 3, 2, 1}
copy(respPkt[:6], []byte{1, 2, 3, 4, 5, 6})
copy(respPkt[6:12], macUntouched)

res, err := bpfrun(respPkt)
Expect(err).NotTo(HaveOccurred())
Expand All @@ -801,6 +805,15 @@ func TestNATNodePort(t *testing.T) {
pktR := gopacket.NewPacket(res.dataOut, layers.LayerTypeEthernet, gopacket.Default)
fmt.Printf("pktR = %+v\n", pktR)

ethL := pktR.Layer(layers.LayerTypeEthernet)
Expect(ethL).NotTo(BeNil())
ethR := ethL.(*layers.Ethernet)
Expect(ethR).To(layersMatchFields(&layers.Ethernet{
SrcMAC: macUntouched, // Source is set by net stack and should not be touched.
DstMAC: macSrc,
EthernetType: layers.EthernetTypeIPv4,
}))

ipv4L := pktR.Layer(layers.LayerTypeIPv4)
Expect(ipv4L).NotTo(BeNil())
ipv4R := ipv4L.(*layers.IPv4)
Expand Down

0 comments on commit 63351c0

Please sign in to comment.