Commit aafb509

bpf: lxc: defer CT_INGRESS entry creation for loopback connections

[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ]
Signed-off-by: l1b0k <[email protected]>

1 parent ff90520
1 file changed: 175 additions & 0 deletions
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <[email protected]>
Date: Wed, 16 Aug 2023 08:30:00 +0300
Subject: bpf: lxc: defer CT_INGRESS entry creation for loopback connections

[ upstream commit dbf4351c4eff1857cf374886804eb876e970b992 ]

Currently the CT_INGRESS entry for a loopback connection is already created
when the first request leaves the client, as from-container creates the
CT_EGRESS entry (see the loopback handling in ct_create4() for details).
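
[ Illustration: a condensed sketch of that pre-patch ct_create4()
  behaviour, based on the hunk removed below. from-container calls
  it with dir == CT_EGRESS: ]

    if (ct_state->addr && ct_state->loopback) {
	    /* Alongside the CT_EGRESS entry being created, also seed
	     * the reply-direction (CT_INGRESS) entry. The real code
	     * saves/restores the tuple fields and handles map-update
	     * failure.
	     */
	    tuple->flags = TUPLE_F_IN;
	    tuple->saddr = ct_state->svc_addr;
	    tuple->daddr = ct_state->addr;
	    map_update_elem(map_main, tuple, &entry, 0);
    }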

This is unusual - we would normally just create the CT_INGRESS entry as
the first packet passes through to-container into the backend. But for
loopback connections it is needed earlier, so that
1.) to-container finds a CT entry with .loopback set, and thus skips
    network policy enforcement even for the first packet, and
2.) the CT entry has its rev_nat_index field populated, and thus can
    RevNAT replies in from-container (see the sketch below).
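
[ Illustration: how the two fields get consumed. 1.) is quoted from
  the to-container context below; for 2.), lb4_rev_nat() is named
  only as an example of the reverse-NAT call site: ]

    if (unlikely(ct_state->loopback))	/* 1.) bypass policy */
	    goto skip_policy_enforcement;

    /* 2.) on the reply path, ct_state->rev_nat_index selects the
     * reverse-NAT mapping, e.g. via lb4_rev_nat().
     */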

This approach conflicts with the fact that loopback replies skip the
client's to-container path (to avoid network policy enforcement).

Once the loopback connection is closed, the backend's from-container path
observes the FIN / RST, and __ct_lookup4() updates the CT_INGRESS entry's
lifetime to CT_CLOSE_TIMEOUT. But the client's to-container path will not
observe the FIN / RST, and thus the CT_EGRESS entry's lifetime remains at
CT_CONNECTION_LIFETIME_TCP. Therefore the CT_INGRESS entry expires
earlier, and potentially gets garbage-collected while the CT_EGRESS entry
stays in place.
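
[ Illustration: a minimal model of the two lifetimes. The helper name
  is hypothetical, and the concrete timeout values are assumed
  defaults (both are configurable): ]

    static __always_inline __u32 ct_lifetime_sketch(bool saw_fin_or_rst)
    {
	    /* seconds-granularity clock, as used by the CT code */
	    __u32 now = bpf_ktime_get_sec();

	    if (saw_fin_or_rst)		/* backend's CT_INGRESS entry */
		    return now + CT_CLOSE_TIMEOUT;		/* e.g. 10s */
	    /* client's CT_EGRESS entry never sees the close */
	    return now + CT_CONNECTION_LIFETIME_TCP;	/* e.g. 6h */
    }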

If the same loopback connection is subsequently re-opened, the client's
from-container path finds the CT_EGRESS entry and thus will *not* call
ct_create4(). Consequently the CT_INGRESS entry is not created, and the
backend will not apply the loopback-specific handling described above.
Inbound requests are potentially dropped (due to network policy), and/or
replies are not RevNATed.
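
[ Illustration: an assumed shape of that from-container sequence: ]

    ret = ct_lookup4(get_ct_map4(&tuple), &tuple, ctx, l4_off,
		     CT_EGRESS, &ct_state, &monitor);
    /* ret == CT_ESTABLISHED: the stale .loopback CT_EGRESS entry
     * still matches, so the CT_NEW branch - the only place that
     * would call ct_create4() - is never taken, and the backend's
     * CT_INGRESS entry is not re-created.
     */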

Fix this up by letting the backend path create its CT_INGRESS entry as
usual. It just needs a bit of detection code in its CT_NEW handling to
understand that the first packet belongs to a .loopback connection, and
populate its own CT_INGRESS entry accordingly.

[ Backport note: Addressed some minor conflicts, skipped tests. ]

Signed-off-by: Julian Wiedmann <[email protected]>
Signed-off-by: Quentin Monnet <[email protected]>
---
 bpf/bpf_lxc.c       | 25 +++++++++++++++++++++-
 bpf/lib/conntrack.h | 52 ++++++++++++++++++---------------------
 2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/bpf/bpf_lxc.c b/bpf/bpf_lxc.c
index 0a18c35c9a..1eb8a2b784 100644
--- a/bpf/bpf_lxc.c
+++ b/bpf/bpf_lxc.c
@@ -983,6 +983,16 @@ ct_recreate4:
 	 * been passed to the stack.
 	 */
 	if (is_defined(ENABLE_ROUTING) || hairpin_flow) {
+		/* Hairpin requests need to pass through the backend's to-container
+		 * path, to create a CT_INGRESS entry with .lb_loopback set. This
+		 * drives RevNAT in the backend's from-container path.
+		 *
+		 * Hairpin replies are fully RevNATed in the backend's from-container
+		 * path. Thus they don't match the CT_EGRESS entry, and we can't rely
+		 * on a CT_REPLY result that would provide bypass of ingress policy.
+		 * Thus manually skip the ingress policy path.
+		 */
+		bool bypass_ingress_policy = hairpin_flow && ct_status == CT_REPLY;
 		struct endpoint_info *ep;
 
 		/* Lookup IPv4 address, this will return a match if:
@@ -1006,7 +1016,8 @@ ct_recreate4:
 			policy_clear_mark(ctx);
 			/* If the packet is from L7 LB it is coming from the host */
 			return ipv4_local_delivery(ctx, ETH_HLEN, SECLABEL, ip4,
-						   ep, METRIC_EGRESS, from_l7lb, hairpin_flow);
+						   ep, METRIC_EGRESS, from_l7lb,
+						   bypass_ingress_policy);
 		}
 	}
 
@@ -1833,7 +1844,19 @@ ipv4_policy(struct __ctx_buff *ctx, int ifindex, __u32 src_label, enum ct_status
 	 * define policy rules to allow pods to talk to themselves. We still
 	 * want to execute the conntrack logic so that replies can be correctly
 	 * matched.
+	 *
+	 * If ip4.saddr is IPV4_LOOPBACK, this is almost certainly a loopback
+	 * connection. Populate
+	 * - .loopback, so that policy enforcement is bypassed, and
+	 * - .rev_nat_index, so that replies can be RevNATed.
 	 */
+	if (ret == CT_NEW && ip4->saddr == IPV4_LOOPBACK &&
+	    ct_has_loopback_egress_entry4(get_ct_map4(tuple), tuple,
+					  &ct_state_new.rev_nat_index)) {
+		ct_state_new.loopback = true;
+		goto skip_policy_enforcement;
+	}
+
 	if (unlikely(ct_state->loopback))
 		goto skip_policy_enforcement;
 #endif /* ENABLE_PER_PACKET_LB && !DISABLE_LOOPBACK_LB */
diff --git a/bpf/lib/conntrack.h b/bpf/lib/conntrack.h
index 6a31c64b3a..5b3afdf83b 100644
--- a/bpf/lib/conntrack.h
+++ b/bpf/lib/conntrack.h
@@ -941,37 +941,6 @@ static __always_inline int ct_create4(const void *map_main,
 		return DROP_CT_CREATE_FAILED;
 	}
 
-	if (ct_state->addr && ct_state->loopback) {
-		__u8 flags = tuple->flags;
-		__be32 saddr, daddr;
-
-		saddr = tuple->saddr;
-		daddr = tuple->daddr;
-
-		/* We are looping back into the origin endpoint through a
-		 * service, set up a conntrack tuple for the reply to ensure we
-		 * do rev NAT before attempting to route the destination
-		 * address which will not point back to the right source.
-		 */
-		tuple->flags = TUPLE_F_IN;
-		if (dir == CT_INGRESS) {
-			tuple->saddr = ct_state->addr;
-			tuple->daddr = ct_state->svc_addr;
-		} else {
-			tuple->saddr = ct_state->svc_addr;
-			tuple->daddr = ct_state->addr;
-		}
-
-		if (map_update_elem(map_main, tuple, &entry, 0) < 0) {
-			send_signal_ct_fill_up(ctx, SIGNAL_PROTO_V4);
-			return DROP_CT_CREATE_FAILED;
-		}
-
-		tuple->saddr = saddr;
-		tuple->daddr = daddr;
-		tuple->flags = flags;
-	}
-
 	if (map_related != NULL) {
 		/* Create an ICMP entry to relate errors */
 		struct ipv4_ct_tuple icmp_tuple = {
@@ -996,6 +965,27 @@ static __always_inline int ct_create4(const void *map_main,
 	return 0;
 }
 
+#ifndef DISABLE_LOOPBACK_LB
+static __always_inline bool
+ct_has_loopback_egress_entry4(const void *map, struct ipv4_ct_tuple *tuple,
+			      __u16 *rev_nat_index)
+{
+	__u8 flags = tuple->flags;
+	struct ct_entry *entry;
+
+	tuple->flags = TUPLE_F_OUT;
+	entry = map_lookup_elem(map, tuple);
+	tuple->flags = flags;
+
+	if (entry && entry->lb_loopback) {
+		*rev_nat_index = entry->rev_nat_index;
+		return true;
+	}
+
+	return false;
+}
+#endif
+
 /* The function tries to determine whether the flow identified by the given
  * CT_INGRESS tuple belongs to a NodePort traffic (i.e., outside client => N/S
  * LB => local backend).
-- 
2.39.5 (Apple Git-154)