forked from AliyunContainerService/terway
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request AliyunContainerService#764 from l1b0k/fix/hairpin
bpf: lxc: defer CT_INGRESS entry creation for loopback connections
- Loading branch information
Showing
1 changed file
with
175 additions
and
0 deletions.
There are no files selected for viewing
175 changes: 175 additions & 0 deletions
175
policy/cilium/0038-bpf-lxc-defer-CT_INGRESS-entry-creation-for-loopback.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 | ||
From: Julian Wiedmann <[email protected]> | ||
Date: Wed, 16 Aug 2023 08:30:00 +0300 | ||
Subject: bpf: lxc: defer CT_INGRESS entry creation for loopback connections | ||
|
||
[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ] | ||
|
||
Currently the CT_INGRESS entry for a loopback connection is already created | ||
when the first request leaves the client, as from-container creates the | ||
CT_EGRESS entry (see the loopback handling in ct_create4() for details). | ||
|
||
This is unusual - we would normally just create the CT_INGRESS entry as | ||
the first packet passes through to-container into the backend. But for | ||
loopback connections it is needed, so that | ||
1.) to-container finds a CT entry with .loopback set, and thus skips | ||
network policy enforcement even for the first packet, and | ||
2.) the CT entry has its rev_nat_index field populated, and thus can | ||
RevNAT replies in from-container. | ||
|
||
This approach conflicts with the fact that loopback replies skip the | ||
client's to-container path (to avoid network policy enforcement). | ||
|
||
Once the loopback connection is closed, the backend's from-container path | ||
observes the FIN / RST, and __ct_lookup4() updates the CT_INGRESS entry's | ||
lifetime to CT_CLOSE_TIMEOUT. But the client's to-container path will not | ||
observe the FIN / RST, and thus the CT_EGRESS entry's lifetime remains as | ||
CT_CONNECTION_LIFETIME_TCP. Therefore the CT_INGRESS entry will expire | ||
earlier, and potentially gets garbage-collected while the CT_EGRESS entry | ||
stays in place. | ||
|
||
If the same loopback connection is subsequently re-opened, the client's | ||
from-container path finds the CT_EGRESS entry and thus will *not* call | ||
ct_create4(). Consequently the CT_INGRESS entry is not created, and the | ||
backend will not apply the loopback-specific handling described above. | ||
Inbound requests are potentially dropped (due to network policy), and/or | ||
replies are not RevNATed. | ||
|
||
Fix this up by letting the backend path create its CT_INGRESS entry as | ||
usual. It just needs a bit of detection code in its CT_NEW handling to | ||
understand that the first packet belongs to a .loopback connection, and | ||
populate its own CT_INGRESS entry accordingly. | ||
|
||
[ Backport note: Addressed some minor conflicts, skipped tests. ] | ||
|
||
Signed-off-by: Julian Wiedmann <[email protected]> | ||
Signed-off-by: Quentin Monnet <[email protected]> | ||
--- | ||
bpf/bpf_lxc.c | 25 +++++++++++++++++++++- | ||
bpf/lib/conntrack.h | 52 ++++++++++++++++++--------------------------- | ||
2 files changed, 45 insertions(+), 32 deletions(-) | ||
|
||
diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c | ||
index 0a18c35c9a..1eb8a2b784 100644 | ||
--- a/bpf/bpf_lxc.c | ||
+++ b/bpf/bpf_lxc.c | ||
@@ -983,6 +983,16 @@ ct_recreate4: | ||
* been passed to the stack. | ||
*/ | ||
if (is_defined(ENABLE_ROUTING) || hairpin_flow) { | ||
+ /* Hairpin requests need to pass through the backend's to-container | ||
+ * path, to create a CT_INGRESS entry with .lb_loopback set. This | ||
+ * drives RevNAT in the backend's from-container path. | ||
+ * | ||
+ * Hairpin replies are fully RevNATed in the backend's from-container | ||
+ * path. Thus they don't match the CT_EGRESS entry, and we can't rely | ||
+ * on a CT_REPLY result that would provide bypass of ingress policy. | ||
+ * Thus manually skip the ingress policy path. | ||
+ */ | ||
+ bool bypass_ingress_policy = hairpin_flow && ct_status == CT_REPLY; | ||
struct endpoint_info *ep; | ||
|
||
/* Lookup IPv4 address, this will return a match if: | ||
@@ -1006,7 +1016,8 @@ ct_recreate4: | ||
policy_clear_mark(ctx); | ||
/* If the packet is from L7 LB it is coming from the host */ | ||
return ipv4_local_delivery(ctx, ETH_HLEN, SECLABEL, ip4, | ||
- ep, METRIC_EGRESS, from_l7lb, hairpin_flow); | ||
+ ep, METRIC_EGRESS, from_l7lb, | ||
+ bypass_ingress_policy); | ||
} | ||
} | ||
|
||
@@ -1833,7 +1844,19 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, enum ct_status | ||
* define policy rules to allow pods to talk to themselves. We still | ||
* want to execute the conntrack logic so that replies can be correctly | ||
* matched. | ||
+ * | ||
+ * If ip4.saddr is IPV4_LOOPBACK, this is almost certainly a loopback | ||
+ * connection. Populate | ||
+ * - .loopback, so that policy enforcement is bypassed, and | ||
+ * - .rev_nat_index, so that replies can be RevNATed. | ||
*/ | ||
+ if (ret == CT_NEW && ip4->saddr == IPV4_LOOPBACK && | ||
+ ct_has_loopback_egress_entry4(get_ct_map4(tuple), tuple, | ||
+ &ct_state_new.rev_nat_index)) { | ||
+ ct_state_new.loopback = true; | ||
+ goto skip_policy_enforcement; | ||
+ } | ||
+ | ||
if (unlikely(ct_state->loopback)) | ||
goto skip_policy_enforcement; | ||
#endif /* ENABLE_PER_PACKET_LB && !DISABLE_LOOPBACK_LB */ | ||
diff --git a/bpf/lib/conntrack.h b/bpf/lib/conntrack.h | ||
index 6a31c64b3a..5b3afdf83b 100644 | ||
--- a/bpf/lib/conntrack.h | ||
+++ b/bpf/lib/conntrack.h | ||
@@ -941,37 +941,6 @@ static __always_inline int ct_create4(const void *map_main, | ||
return DROP_CT_CREATE_FAILED; | ||
} | ||
|
||
- if (ct_state->addr && ct_state->loopback) { | ||
- __u8 flags = tuple->flags; | ||
- __be32 saddr, daddr; | ||
- | ||
- saddr = tuple->saddr; | ||
- daddr = tuple->daddr; | ||
- | ||
- /* We are looping back into the origin endpoint through a | ||
- * service, set up a conntrack tuple for the reply to ensure we | ||
- * do rev NAT before attempting to route the destination | ||
- * address which will not point back to the right source. | ||
- */ | ||
- tuple->flags = TUPLE_F_IN; | ||
- if (dir == CT_INGRESS) { | ||
- tuple->saddr = ct_state->addr; | ||
- tuple->daddr = ct_state->svc_addr; | ||
- } else { | ||
- tuple->saddr = ct_state->svc_addr; | ||
- tuple->daddr = ct_state->addr; | ||
- } | ||
- | ||
- if (map_update_elem(map_main, tuple, &entry, 0) < 0) { | ||
- send_signal_ct_fill_up(ctx, SIGNAL_PROTO_V4); | ||
- return DROP_CT_CREATE_FAILED; | ||
- } | ||
- | ||
- tuple->saddr = saddr; | ||
- tuple->daddr = daddr; | ||
- tuple->flags = flags; | ||
- } | ||
- | ||
if (map_related != NULL) { | ||
/* Create an ICMP entry to relate errors */ | ||
struct ipv4_ct_tuple icmp_tuple = { | ||
@@ -996,6 +965,27 @@ static __always_inline int ct_create4(const void *map_main, | ||
return 0; | ||
} | ||
|
||
+#ifndef DISABLE_LOOPBACK_LB | ||
+static __always_inline bool | ||
+ct_has_loopback_egress_entry4(const void *map, struct ipv4_ct_tuple *tuple, | ||
+ __u16 *rev_nat_index) | ||
+{ | ||
+ __u8 flags = tuple->flags; | ||
+ struct ct_entry *entry; | ||
+ | ||
+ tuple->flags = TUPLE_F_OUT; | ||
+ entry = map_lookup_elem(map, tuple); | ||
+ tuple->flags = flags; | ||
+ | ||
+ if (entry && entry->lb_loopback) { | ||
+ *rev_nat_index = entry->rev_nat_index; | ||
+ return true; | ||
+ } | ||
+ | ||
+ return false; | ||
+} | ||
+#endif | ||
+ | ||
/* The function tries to determine whether the flow identified by the given | ||
* CT_INGRESS tuple belongs to a NodePort traffic (i.e., outside client => N/S | ||
* LB => local backend). | ||
-- | ||
2.39.5 (Apple Git-154) | ||
|