From aafb50989452aa94f5ca094f48da970644f33951 Mon Sep 17 00:00:00 2001
From: l1b0k
Date: Thu, 16 Jan 2025 10:17:05 +0800
Subject: [PATCH] bpf: lxc: defer CT_INGRESS entry creation for loopback connections

[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ]

Signed-off-by: l1b0k
---
 ..._INGRESS-entry-creation-for-loopback.patch | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch

diff --git a/policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch b/policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch
new file mode 100644
index 00000000..ebf024ae
--- /dev/null
+++ b/policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch
@@ -0,0 +1,175 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Julian Wiedmann
+Date: Wed, 16 Aug 2023 08:30:00 +0300
+Subject: bpf: lxc: defer CT_INGRESS entry creation for loopback connections
+
+[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ]
+
+Currently the CT_INGRESS entry for a loopback connection is already created
+when the first request leaves the client, as from-container creates the
+CT_EGRESS entry (see the loopback handling in ct_create4() for details).
+
+This is unusual - we would normally just create the CT_INGRESS entry as
+the first packet passes through to-container into the backend. But for
+loopback connections it is needed, so that
+1.) to-container finds a CT entry with .loopback set, and thus skips
+    network policy enforcement even for the first packet, and
+2.) the CT entry has its rev_nat_index field populated, and thus can
+    RevNAT replies in from-container.
+
+This approach conflicts with the fact that loopback replies skip the
+client's to-container path (to avoid network policy enforcement).
+
+Once the loopback connection is closed, the backend's from-container path
+observes the FIN / RST, and __ct_lookup4() updates the CT_INGRESS entry's
+lifetime to CT_CLOSE_TIMEOUT. But the client's to-container path will not
+observe the FIN / RST, and thus the CT_EGRESS entry's lifetime remains as
+CT_CONNECTION_LIFETIME_TCP. Therefore the CT_INGRESS entry will expire
+earlier, and potentially gets garbage-collected while the CT_EGRESS entry
+stays in place.
+
+If the same loopback connection is subsequently re-opened, the client's
+from-container path finds the CT_EGRESS entry and thus will *not* call
+ct_create4(). Consequently the CT_INGRESS entry is not created, and the
+backend will not apply the loopback-specific handling described above.
+Inbound requests are potentially dropped (due to network policy), and/or
+replies are not RevNATed.
+
+Fix this up by letting the backend path create its CT_INGRESS entry as
+usual. It just needs a bit of detection code in its CT_NEW handling to
+understand that the first packet belongs to a .loopback connection, and
+populate its own CT_INGRESS entry accordingly.
+
+[ Backport note: Addressed some minor conflicts, skipped tests. ]
+
+Signed-off-by: Julian Wiedmann
+Signed-off-by: Quentin Monnet
+---
+ bpf/bpf_lxc.c       | 25 +++++++++++++++++++++
+ bpf/lib/conntrack.h | 52 ++++++++++++++++++---------------------
+ 2 files changed, 45 insertions(+), 32 deletions(-)
+
+diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c
+index 0a18c35c9a..1eb8a2b784 100644
+--- a/bpf/bpf_lxc.c
++++ b/bpf/bpf_lxc.c
+@@ -983,6 +983,16 @@ ct_recreate4:
+ 	 * been passed to the stack.
+ 	 */
+ 	if (is_defined(ENABLE_ROUTING) || hairpin_flow) {
++		/* Hairpin requests need to pass through the backend's to-container
++		 * path, to create a CT_INGRESS entry with .lb_loopback set. This
++		 * drives RevNAT in the backend's from-container path.
++		 *
++		 * Hairpin replies are fully RevNATed in the backend's from-container
++		 * path. Thus they don't match the CT_EGRESS entry, and we can't rely
++		 * on a CT_REPLY result that would provide bypass of ingress policy.
++		 * Thus manually skip the ingress policy path.
++		 */
++		bool bypass_ingress_policy = hairpin_flow && ct_status == CT_REPLY;
+ 		struct endpoint_info *ep;
+ 
+ 		/* Lookup IPv4 address, this will return a match if:
+@@ -1006,7 +1016,8 @@ ct_recreate4:
+ 			policy_clear_mark(ctx);
+ 			/* If the packet is from L7 LB it is coming from the host */
+ 			return ipv4_local_delivery(ctx, ETH_HLEN, SECLABEL, ip4,
+-						   ep, METRIC_EGRESS, from_l7lb, hairpin_flow);
++						   ep, METRIC_EGRESS, from_l7lb,
++						   bypass_ingress_policy);
+ 		}
+ 	}
+ 
+@@ -1833,7 +1844,19 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, enum ct_status
+ 	 * define policy rules to allow pods to talk to themselves. We still
+ 	 * want to execute the conntrack logic so that replies can be correctly
+ 	 * matched.
++	 *
++	 * If ip4.saddr is IPV4_LOOPBACK, this is almost certainly a loopback
++	 * connection. Populate
++	 *   - .loopback, so that policy enforcement is bypassed, and
++	 *   - .rev_nat_index, so that replies can be RevNATed.
+ 	 */
++	if (ret == CT_NEW && ip4->saddr == IPV4_LOOPBACK &&
++	    ct_has_loopback_egress_entry4(get_ct_map4(tuple), tuple,
++					  &ct_state_new.rev_nat_index)) {
++		ct_state_new.loopback = true;
++		goto skip_policy_enforcement;
++	}
++
+ 	if (unlikely(ct_state->loopback))
+ 		goto skip_policy_enforcement;
+ #endif /* ENABLE_PER_PACKET_LB && !DISABLE_LOOPBACK_LB */
+diff --git a/bpf/lib/conntrack.h b/bpf/lib/conntrack.h
+index 6a31c64b3a..5b3afdf83b 100644
+--- a/bpf/lib/conntrack.h
++++ b/bpf/lib/conntrack.h
+@@ -941,37 +941,6 @@ static __always_inline int ct_create4(const void *map_main,
+ 		return DROP_CT_CREATE_FAILED;
+ 	}
+ 
+-	if (ct_state->addr && ct_state->loopback) {
+-		__u8 flags = tuple->flags;
+-		__be32 saddr, daddr;
+-
+-		saddr = tuple->saddr;
+-		daddr = tuple->daddr;
+-
+-		/* We are looping back into the origin endpoint through a
+-		 * service, set up a conntrack tuple for the reply to ensure we
+-		 * do rev NAT before attempting to route the destination
+-		 * address which will not point back to the right source.
+-		 */
+-		tuple->flags = TUPLE_F_IN;
+-		if (dir == CT_INGRESS) {
+-			tuple->saddr = ct_state->addr;
+-			tuple->daddr = ct_state->svc_addr;
+-		} else {
+-			tuple->saddr = ct_state->svc_addr;
+-			tuple->daddr = ct_state->addr;
+-		}
+-
+-		if (map_update_elem(map_main, tuple, &entry, 0) < 0) {
+-			send_signal_ct_fill_up(ctx, SIGNAL_PROTO_V4);
+-			return DROP_CT_CREATE_FAILED;
+-		}
+-
+-		tuple->saddr = saddr;
+-		tuple->daddr = daddr;
+-		tuple->flags = flags;
+-	}
+-
+ 	if (map_related != NULL) {
+ 		/* Create an ICMP entry to relate errors */
+ 		struct ipv4_ct_tuple icmp_tuple = {
+@@ -996,6 +965,27 @@ static __always_inline int ct_create4(const void *map_main,
+ 	return 0;
+ }
+ 
++#ifndef DISABLE_LOOPBACK_LB
++static __always_inline bool
++ct_has_loopback_egress_entry4(const void *map, struct ipv4_ct_tuple *tuple,
++			      __u16 *rev_nat_index)
++{
++	__u8 flags = tuple->flags;
++	struct ct_entry *entry;
++
++	tuple->flags = TUPLE_F_OUT;
++	entry = map_lookup_elem(map, tuple);
++	tuple->flags = flags;
++
++	if (entry && entry->lb_loopback) {
++		*rev_nat_index = entry->rev_nat_index;
++		return true;
++	}
++
++	return false;
++}
++#endif
++
+ /* The function tries to determine whether the flow identified by the given
+  * CT_INGRESS tuple belongs to a NodePort traffic (i.e., outside client => N/S
+  * LB => local backend).
+-- 
+2.39.5 (Apple Git-154)
+
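
Note (not part of the patch itself): the stand-alone C sketch below condenses the backend-side detection that the ipv4_policy() hunk adds, for readers who want to trace the logic outside the datapath. On a CT_NEW result for a packet whose source is IPV4_LOOPBACK, the backend looks up the client's CT_EGRESS entry by temporarily flipping the tuple to TUPLE_F_OUT; if that entry is marked as loopback, it copies the rev_nat_index and flags its own new CT_INGRESS state as loopback, so policy enforcement is skipped and replies can be RevNATed. The struct layouts, the IPV4_LOOPBACK value and the one-slot "map" are simplified, hypothetical stand-ins, not the real Cilium types or BPF map helpers.

/* Stand-alone illustration of the deferred CT_INGRESS creation logic.
 * All types and the one-slot "map" are simplified stand-ins, not the
 * real Cilium definitions.  Build with: cc -o loopback-sketch sketch.c
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TUPLE_F_OUT	0	/* egress-oriented tuple (client's CT_EGRESS entry) */
#define TUPLE_F_IN	1	/* ingress-oriented tuple (backend's CT_INGRESS entry) */
#define IPV4_LOOPBACK	0x0100007fU	/* 127.0.0.1, fixed byte order for the sketch */

struct ipv4_ct_tuple {
	uint32_t saddr;
	uint32_t daddr;
	uint8_t flags;
};

struct ct_entry {
	bool lb_loopback;	/* set when the client's from-container path created the entry */
	uint16_t rev_nat_index;
};

/* One-slot stand-in for the per-endpoint BPF CT map. */
static struct ipv4_ct_tuple ct_key;
static struct ct_entry ct_val;
static bool ct_used;

static const struct ct_entry *ct_lookup(const struct ipv4_ct_tuple *key)
{
	if (ct_used && key->saddr == ct_key.saddr &&
	    key->daddr == ct_key.daddr && key->flags == ct_key.flags)
		return &ct_val;
	return NULL;
}

/* Same shape as the new ct_has_loopback_egress_entry4() helper: flip the
 * tuple to the egress orientation, look for a loopback CT_EGRESS entry,
 * and hand back its rev_nat_index so replies can be RevNATed later.
 */
static bool has_loopback_egress_entry(struct ipv4_ct_tuple *tuple,
				      uint16_t *rev_nat_index)
{
	uint8_t flags = tuple->flags;
	const struct ct_entry *entry;

	tuple->flags = TUPLE_F_OUT;
	entry = ct_lookup(tuple);
	tuple->flags = flags;

	if (entry && entry->lb_loopback) {
		*rev_nat_index = entry->rev_nat_index;
		return true;
	}
	return false;
}

int main(void)
{
	/* The client's from-container path already created the CT_EGRESS
	 * entry for this hairpin connection. */
	ct_key = (struct ipv4_ct_tuple){ .saddr = IPV4_LOOPBACK,
					 .daddr = 0x0202000a, .flags = TUPLE_F_OUT };
	ct_val = (struct ct_entry){ .lb_loopback = true, .rev_nat_index = 7 };
	ct_used = true;

	/* The backend's to-container path now sees the first packet (CT_NEW)
	 * and decides whether to create a loopback CT_INGRESS entry. */
	struct ipv4_ct_tuple pkt = { .saddr = IPV4_LOOPBACK,
				     .daddr = 0x0202000a, .flags = TUPLE_F_IN };
	uint16_t rev_nat_index = 0;
	bool loopback = false;

	if (pkt.saddr == IPV4_LOOPBACK &&
	    has_loopback_egress_entry(&pkt, &rev_nat_index))
		loopback = true;	/* skip policy and remember how to RevNAT replies */

	printf("loopback=%d rev_nat_index=%u\n", loopback, (unsigned)rev_nat_index);
	return 0;
}

Because the backend now creates the CT_INGRESS entry itself, its lifetime is driven by the traffic the backend actually sees, which is what removes the premature-expiry problem described in the commit message.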