From aafb50989452aa94f5ca094f48da970644f33951 Mon Sep 17 00:00:00 2001
From: l1b0k
Date: Thu, 16 Jan 2025 10:17:05 +0800
Subject: [PATCH] bpf: lxc: defer CT_INGRESS entry creation for loopback connections

[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ]

Signed-off-by: l1b0k
---
 ..._INGRESS-entry-creation-for-loopback.patch | 175 ++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch

diff --git a/policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch b/policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch
new file mode 100644
index 00000000..ebf024ae
--- /dev/null
+++ b/policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch
@@ -0,0 +1,175 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Julian Wiedmann
+Date: Wed, 16 Aug 2023 08:30:00 +0300
+Subject: bpf: lxc: defer CT_INGRESS entry creation for loopback connections
+
+[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ]
+
+Currently the CT_INGRESS entry for a loopback connection is already created
+when the first request leaves the client, as from-container creates the
+CT_EGRESS entry (see the loopback handling in ct_create4() for details).
+
+This is unusual - we would normally just create the CT_INGRESS entry as
+the first packet passes through to-container into the backend. But for
+loopback connections it is needed, so that
+1.) to-container finds a CT entry with .loopback set, and thus skips
+    network policy enforcement even for the first packet, and
+2.) the CT entry has its rev_nat_index field populated, and thus can
+    RevNAT replies in from-container.
+
+This approach conflicts with the fact that loopback replies skip the
+client's to-container path (to avoid network policy enforcement).
+
+Once the loopback connection is closed, the backend's from-container path
+observes the FIN / RST, and __ct_lookup4() updates the CT_INGRESS entry's
+lifetime to CT_CLOSE_TIMEOUT. But the client's to-container path will not
+observe the FIN / RST, and thus the CT_EGRESS entry's lifetime remains as
+CT_CONNECTION_LIFETIME_TCP. Therefore the CT_INGRESS entry will expire
+earlier, and potentially gets garbage-collected while the CT_EGRESS entry
+stays in place.
+
+If the same loopback connection is subsequently re-opened, the client's
+from-container path finds the CT_EGRESS entry and thus will *not* call
+ct_create4(). Consequently the CT_INGRESS entry is not created, and the
+backend will not apply the loopback-specific handling described above.
+Inbound requests are potentially dropped (due to network policy), and/or
+replies are not RevNATed.
+
+Fix this up by letting the backend path create its CT_INGRESS entry as
+usual. It just needs a bit of detection code in its CT_NEW handling to
+understand that the first packet belongs to a .loopback connection, and
+populate its own CT_INGRESS entry accordingly.
+
+[ Backport note: Addressed some minor conflicts, skipped tests. ]
+
+Signed-off-by: Julian Wiedmann
+Signed-off-by: Quentin Monnet
+---
+ bpf/bpf_lxc.c       | 25 +++++++++++++++++++++
+ bpf/lib/conntrack.h | 52 ++++++++++++++++++---------------------
+ 2 files changed, 45 insertions(+), 32 deletions(-)
+
+diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c
+index 0a18c35c9a..1eb8a2b784 100644
+--- a/bpf/bpf_lxc.c
++++ b/bpf/bpf_lxc.c
+@@ -983,6 +983,16 @@ ct_recreate4:
+ 	 * been passed to the stack.
+ 	 */
+ 	if (is_defined(ENABLE_ROUTING) || hairpin_flow) {
++		/* Hairpin requests need to pass through the backend's to-container
++		 * path, to create a CT_INGRESS entry with .lb_loopback set. This
++		 * drives RevNAT in the backend's from-container path.
++		 *
++		 * Hairpin replies are fully RevNATed in the backend's from-container
++		 * path. Thus they don't match the CT_EGRESS entry, and we can't rely
++		 * on a CT_REPLY result that would provide bypass of ingress policy.
++		 * Thus manually skip the ingress policy path.
++		 */
++		bool bypass_ingress_policy = hairpin_flow && ct_status == CT_REPLY;
+ 		struct endpoint_info *ep;
+ 
+ 		/* Lookup IPv4 address, this will return a match if:
+@@ -1006,7 +1016,8 @@ ct_recreate4:
+ 			policy_clear_mark(ctx);
+ 			/* If the packet is from L7 LB it is coming from the host */
+ 			return ipv4_local_delivery(ctx, ETH_HLEN, SECLABEL, ip4,
+-						   ep, METRIC_EGRESS, from_l7lb, hairpin_flow);
++						   ep, METRIC_EGRESS, from_l7lb,
++						   bypass_ingress_policy);
+ 		}
+ 	}
+ 
+@@ -1833,7 +1844,19 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, enum ct_status
+ 	 * define policy rules to allow pods to talk to themselves. We still
+ 	 * want to execute the conntrack logic so that replies can be correctly
+ 	 * matched.
++	 *
++	 * If ip4.saddr is IPV4_LOOPBACK, this is almost certainly a loopback
++	 * connection. Populate
++	 *   - .loopback, so that policy enforcement is bypassed, and
++	 *   - .rev_nat_index, so that replies can be RevNATed.
+ 	 */
++	if (ret == CT_NEW && ip4->saddr == IPV4_LOOPBACK &&
++	    ct_has_loopback_egress_entry4(get_ct_map4(tuple), tuple,
++					  &ct_state_new.rev_nat_index)) {
++		ct_state_new.loopback = true;
++		goto skip_policy_enforcement;
++	}
++
+ 	if (unlikely(ct_state->loopback))
+ 		goto skip_policy_enforcement;
+ #endif /* ENABLE_PER_PACKET_LB && !DISABLE_LOOPBACK_LB */
+diff --git a/bpf/lib/conntrack.h b/bpf/lib/conntrack.h
+index 6a31c64b3a..5b3afdf83b 100644
+--- a/bpf/lib/conntrack.h
++++ b/bpf/lib/conntrack.h
+@@ -941,37 +941,6 @@ static __always_inline int ct_create4(const void *map_main,
+ 		return DROP_CT_CREATE_FAILED;
+ 	}
+ 
+-	if (ct_state->addr && ct_state->loopback) {
+-		__u8 flags = tuple->flags;
+-		__be32 saddr, daddr;
+-
+-		saddr = tuple->saddr;
+-		daddr = tuple->daddr;
+-
+-		/* We are looping back into the origin endpoint through a
+-		 * service, set up a conntrack tuple for the reply to ensure we
+-		 * do rev NAT before attempting to route the destination
+-		 * address which will not point back to the right source.
+-		 */
+-		tuple->flags = TUPLE_F_IN;
+-		if (dir == CT_INGRESS) {
+-			tuple->saddr = ct_state->addr;
+-			tuple->daddr = ct_state->svc_addr;
+-		} else {
+-			tuple->saddr = ct_state->svc_addr;
+-			tuple->daddr = ct_state->addr;
+-		}
+-
+-		if (map_update_elem(map_main, tuple, &entry, 0) < 0) {
+-			send_signal_ct_fill_up(ctx, SIGNAL_PROTO_V4);
+-			return DROP_CT_CREATE_FAILED;
+-		}
+-
+-		tuple->saddr = saddr;
+-		tuple->daddr = daddr;
+-		tuple->flags = flags;
+-	}
+-
+ 	if (map_related != NULL) {
+ 		/* Create an ICMP entry to relate errors */
+ 		struct ipv4_ct_tuple icmp_tuple = {
+@@ -996,6 +965,27 @@ static __always_inline int ct_create4(const void *map_main,
+ 	return 0;
+ }
+ 
++#ifndef DISABLE_LOOPBACK_LB
++static __always_inline bool
++ct_has_loopback_egress_entry4(const void *map, struct ipv4_ct_tuple *tuple,
++			      __u16 *rev_nat_index)
++{
++	__u8 flags = tuple->flags;
++	struct ct_entry *entry;
++
++	tuple->flags = TUPLE_F_OUT;
++	entry = map_lookup_elem(map, tuple);
++	tuple->flags = flags;
++
++	if (entry && entry->lb_loopback) {
++		*rev_nat_index = entry->rev_nat_index;
++		return true;
++	}
++
++	return false;
++}
++#endif
++
+ /* The function tries to determine whether the flow identified by the given
+  * CT_INGRESS tuple belongs to a NodePort traffic (i.e., outside client => N/S
+  * LB => local backend).
+-- 
+2.39.5 (Apple Git-154)
+
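
Note (not part of the patch itself): the stand-alone C sketch below condenses the backend-side detection that the ipv4_policy() hunk adds, for readers who want to trace the logic outside the datapath. On a CT_NEW result for a packet whose source is IPV4_LOOPBACK, the backend looks up the client's CT_EGRESS entry by temporarily flipping the tuple to TUPLE_F_OUT; if that entry is marked as loopback, it copies the rev_nat_index and flags its own new CT_INGRESS state as loopback, so policy enforcement is skipped and replies can be RevNATed. The struct layouts, the IPV4_LOOPBACK value and the one-slot "map" are simplified, hypothetical stand-ins, not the real Cilium types or BPF map helpers.

/* Stand-alone illustration of the deferred CT_INGRESS creation logic.
 * All types and the one-slot "map" are simplified stand-ins, not the
 * real Cilium definitions.  Build with: cc -o loopback-sketch sketch.c
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TUPLE_F_OUT	0	/* egress-oriented tuple (client's CT_EGRESS entry) */
#define TUPLE_F_IN	1	/* ingress-oriented tuple (backend's CT_INGRESS entry) */
#define IPV4_LOOPBACK	0x0100007fU	/* 127.0.0.1, fixed byte order for the sketch */

struct ipv4_ct_tuple {
	uint32_t saddr;
	uint32_t daddr;
	uint8_t flags;
};

struct ct_entry {
	bool lb_loopback;	/* set when the client's from-container path created the entry */
	uint16_t rev_nat_index;
};

/* One-slot stand-in for the per-endpoint BPF CT map. */
static struct ipv4_ct_tuple ct_key;
static struct ct_entry ct_val;
static bool ct_used;

static const struct ct_entry *ct_lookup(const struct ipv4_ct_tuple *key)
{
	if (ct_used && key->saddr == ct_key.saddr &&
	    key->daddr == ct_key.daddr && key->flags == ct_key.flags)
		return &ct_val;
	return NULL;
}

/* Same shape as the new ct_has_loopback_egress_entry4() helper: flip the
 * tuple to the egress orientation, look for a loopback CT_EGRESS entry,
 * and hand back its rev_nat_index so replies can be RevNATed later.
 */
static bool has_loopback_egress_entry(struct ipv4_ct_tuple *tuple,
				      uint16_t *rev_nat_index)
{
	uint8_t flags = tuple->flags;
	const struct ct_entry *entry;

	tuple->flags = TUPLE_F_OUT;
	entry = ct_lookup(tuple);
	tuple->flags = flags;

	if (entry && entry->lb_loopback) {
		*rev_nat_index = entry->rev_nat_index;
		return true;
	}
	return false;
}

int main(void)
{
	/* The client's from-container path already created the CT_EGRESS
	 * entry for this hairpin connection. */
	ct_key = (struct ipv4_ct_tuple){ .saddr = IPV4_LOOPBACK,
					 .daddr = 0x0202000a, .flags = TUPLE_F_OUT };
	ct_val = (struct ct_entry){ .lb_loopback = true, .rev_nat_index = 7 };
	ct_used = true;

	/* The backend's to-container path now sees the first packet (CT_NEW)
	 * and decides whether to create a loopback CT_INGRESS entry. */
	struct ipv4_ct_tuple pkt = { .saddr = IPV4_LOOPBACK,
				     .daddr = 0x0202000a, .flags = TUPLE_F_IN };
	uint16_t rev_nat_index = 0;
	bool loopback = false;

	if (pkt.saddr == IPV4_LOOPBACK &&
	    has_loopback_egress_entry(&pkt, &rev_nat_index))
		loopback = true;	/* skip policy and remember how to RevNAT replies */

	printf("loopback=%d rev_nat_index=%u\n", loopback, (unsigned)rev_nat_index);
	return 0;
}

Because the backend now creates the CT_INGRESS entry itself, its lifetime is driven by the traffic the backend actually sees, which is what removes the premature-expiry problem described in the commit message.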