Skip to content

Commit 88adb1a

Browse files
authored
Tunneled LSO/CSO Support (#614)
This PR makes use of in-progress illumos bits to detect common offloads between underlay devices, and advertises LSO/CSO capabilities on `opte` ports, relying on illumos's emulation functionality to split packets and insert checksums if not on `cxgbe` (or other compatible NIC). This allows for a few nice things: * Guests will send us TCP packets without an L4 checksum and up to ~64KiB in size. - The guest and OPTE spend less time computing/updating checksums. - OPTE has to process fewer packets – hooray! Ultimately, that's less processing time per payload byte. - If sending out over the NIC, then either the NIC or illumos's LSO emulation path is responsible for splitting the packet into TCP packets which will not violate the MTU. See below on what we tweak here. - If sending in the loopback path, we get to hand these packets directly to the target guest without splitting them apart. We do need to insert the checksum ourselves, however. * Guests will send us UDP packets without an L4 checksum. - Again, the guest and OPTE spend less time computing/updating checksums. Ultimately, we have control over the MSS we advertise to the NIC for LSO. The useful part of this is that when we know we are using a purely rack-internal path, we can elevate the MSS up to `MTU - overheads`. I've added a system of 'well-known' `ActionMeta` KV pairs which allow for layers like `overlay` to propagate this knowledge out. Given that the use of a larger MTU vastly reduces inbound packet rate on the receive half (the main bottleneck today), this gets us to around 8Gbps iPerf for rack-local traffic (and 17Gbps for two or more parallel streams) on `glasgow`. On `dublin` with a full control plane, this resolves to around 4Gbps sled-to-sled and 14Gbps sled-local between Linux VMs – we want to investigate the drop here once this is on dogfood.
illumos as a guest doesn't do much better than before since it does not advertise GRO (illumos-as-host will fragment the packets) or LSO (it will never send TCP packets > MTU). - [x] Cleanup. - [x] Finalise illumos interfaces. [stlouis#663](https://code.oxide.computer/c/illumos-gate/+/485) - [x] Manually fill checksums in guest loopback cases. - [x] Have viona perform LSO if guests do not advertise/expect LRO. [illumos#17032](https://code.illumos.org/c/illumos-gate/+/3942) Closes #328, closes #329. Based on #688.
1 parent c3cf635 commit 88adb1a

File tree

19 files changed

+1259
-179
lines changed

19 files changed

+1259
-179
lines changed

Cargo.lock

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/illumos-sys-hdrs/Cargo.toml

+4-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,7 @@ repository.workspace = true
88

99
[features]
1010
default = []
11-
kernel = []
11+
kernel = []
12+
13+
[dependencies]
14+
bitflags.workspace = true

crates/illumos-sys-hdrs/src/lib.rs

+6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ pub mod kernel;
1212
#[cfg(feature = "kernel")]
1313
pub use kernel::*;
1414

15+
pub mod mac;
16+
1517
use core::ptr;
1618
use core::sync::atomic::AtomicI32;
1719
use core::sync::atomic::AtomicI64;
@@ -246,6 +248,8 @@ pub struct dblk_t {
246248
pub db_struioun: u64, // imprecise
247249
pub db_fthdr: *const c_void, // imprecise
248250
pub db_credp: *const c_void, // imprecise
251+
252+
pub db_meoi: [u8; 16], // imprecise
249253
}
250254

251255
impl Default for dblk_t {
@@ -269,6 +273,8 @@ impl Default for dblk_t {
269273
db_struioun: 0,
270274
db_fthdr: ptr::null(),
271275
db_credp: ptr::null(),
276+
277+
db_meoi: Default::default(),
272278
}
273279
}
274280
}

crates/illumos-sys-hdrs/src/mac.rs

+194
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
// This Source Code Form is subject to the terms of the Mozilla Public
2+
// License, v. 2.0. If a copy of the MPL was not distributed with this
3+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
5+
// Copyright 2025 Oxide Computer Company
6+
7+
#[cfg(feature = "kernel")]
8+
use crate::mblk_t;
9+
use bitflags::bitflags;
10+
11+
// ======================================================================
12+
// uts/common/sys/mac_provider.h
13+
// ======================================================================
14+
15+
bitflags! {
16+
#[repr(C)]
17+
#[derive(Clone, Copy, Debug, Default)]
18+
/// Flags which denote the valid fields of a `mac_ether_offload_info_t`
19+
/// or `mac_ether_tun_info_t`.
20+
///
21+
/// These are derived from `mac_ether_offload_flags_t` (mac_provider.h,
22+
/// omitting the `MEOI_` prefix).
23+
pub struct MacEtherOffloadFlags: u32 {
24+
/// `l2hlen` and `l3proto` are set.
25+
const L2INFO_SET = 1 << 0;
26+
/// `l3hlen` and `l4proto` are set.
27+
const L3INFO_SET = 1 << 1;
28+
/// `l4hlen` is set.
29+
const L4INFO_SET = 1 << 2;
30+
/// `tunhlen` is set.
31+
const TUNINFO_SET = 1 << 3;
32+
/// The ethernet header contains a VLAN tag.
33+
const VLAN_TAGGED = 1 << 4;
34+
/// The packet is fragmented at L3, and this packet is not the last
35+
/// fragment. L4 headers are expected to be present.
36+
const L3_FRAG_MORE = 1 << 5;
37+
/// The packet is fragmented at L3, and this packet is not the first
38+
/// fragment.
39+
const L3_FRAG_OFFSET = 1 << 6;
40+
}
41+
}
42+
43+
#[repr(C)]
44+
#[derive(Copy, Clone, Debug, Default, Eq, PartialEq)]
45+
/// The type of tunnel in use for a packet's outermost layer.
46+
///
47+
/// These are derived from `mac_ether_tun_type_t` (mac_provider.h,
48+
/// omitting the `METT_` prefix).
49+
pub struct MacTunType(u32);
50+
51+
impl MacTunType {
52+
pub const NONE: Self = Self(0);
53+
pub const GENEVE: Self = Self(1);
54+
pub const VXLAN: Self = Self(2);
55+
}
56+
57+
#[repr(C)]
58+
#[derive(Clone, Copy, Debug, Default)]
59+
/// Parsed header lengths and protocol identifiers for one layer of a
/// packet, laid out with C representation so that it can be handed by
/// pointer to `mac_ether_set_pktinfo` (as `outer_info` / `inner_info`).
///
/// `meoi_flags` states which of the remaining fields are valid:
/// * `L2INFO_SET`: `meoi_l2hlen` and `meoi_l3proto`.
/// * `L3INFO_SET`: `meoi_l3hlen` and `meoi_l4proto`.
/// * `L4INFO_SET`: `meoi_l4hlen`.
/// * `TUNINFO_SET`: `meoi_tunhlen`.
///
/// NOTE(review): presumably mirrors the C `mac_ether_offload_info_t` in
/// mac_provider.h field-for-field; confirm the layout against the header
/// whenever the illumos definition changes.
pub struct mac_ether_offload_info_t {
60+
pub meoi_flags: MacEtherOffloadFlags,
61+
pub meoi_tuntype: MacTunType,
62+
pub meoi_len: u32,
63+
pub meoi_l2hlen: u8,
64+
pub meoi_l3proto: u16,
65+
pub meoi_l3hlen: u16,
66+
pub meoi_l4proto: u8,
67+
pub meoi_l4hlen: u8,
68+
pub meoi_tunhlen: u16,
69+
}
70+
71+
#[cfg(feature = "kernel")]
72+
unsafe extern "C" {
73+
pub fn lso_info_set(mp: *mut mblk_t, mss: u32, flags: u32);
74+
pub fn lso_info_cleanup(mp: *mut mblk_t);
75+
pub fn mac_hcksum_set(
76+
mp: *mut mblk_t,
77+
start: u32,
78+
stuff: u32,
79+
end: u32,
80+
value: u32,
81+
flags: u32,
82+
);
83+
pub fn mac_hcksum_get(
84+
mp: *mut mblk_t,
85+
start: *mut u32,
86+
stuff: *mut u32,
87+
end: *mut u32,
88+
value: *mut u32,
89+
flags: *mut u32,
90+
);
91+
pub fn mac_lso_get(mp: *mut mblk_t, mss: *mut u32, flags: *mut u32);
92+
pub fn mac_ether_set_pktinfo(
93+
mp: *mut mblk_t,
94+
outer_info: *const mac_ether_offload_info_t,
95+
inner_info: *const mac_ether_offload_info_t,
96+
);
97+
}
98+
99+
// ======================================================================
100+
// uts/common/sys/pattr.h
101+
// ======================================================================
102+
103+
bitflags! {
104+
/// Flags which denote checksum and LSO state for an `mblk_t`.
105+
///
106+
/// These are derived from `#define`s in pattr.h.
107+
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
108+
pub struct MblkOffloadFlags: u32 {
109+
/// Tx: IPv4 header checksum must be computed by hardware.
110+
const HCK_IPV4_HDRCKSUM = 1 << 0;
111+
/// Rx: IPv4 header checksum was verified correct by hardware.
112+
const HCK_IPV4_HDRCKSUM_OK = Self::HCK_IPV4_HDRCKSUM.bits();
113+
/// * Tx: Compute partial checksum based on start/stuff/end offsets.
114+
/// * Rx: Partial checksum computed and attached.
115+
const HCK_PARTIALCKSUM = 1 << 1;
116+
/// * Tx: Compute full (pseudo + l4 + payload) cksum for this packet.
117+
/// * Rx: Full checksum was computed in hardware, and is attached.
118+
const HCK_FULLCKSUM = 1 << 2;
119+
/// Rx: Hardware has verified that L3/L4 checksums are correct.
120+
const HCK_FULLCKSUM_OK = 1 << 3;
121+
/// Tx: Hardware must perform LSO.
122+
const HW_LSO = 1 << 4;
123+
/// Tx: The inner frame's IPv4 header checksum must be computed by
124+
/// hardware.
125+
const HCK_INNER_V4CKSUM = 1 << 5;
126+
/// Rx: The inner frame's IPv4 header checksum was verified correct by
127+
/// hardware.
128+
const HCK_INNER_V4CKSUM_OK = 1 << 6;
129+
/// * Tx: Compute inner L4 partial checksum based on MEOI parse offsets.
130+
const HCK_INNER_PARTIAL = 1 << 7;
131+
/// * Tx: Compute full (pseudo + l4 + payload) cksum for this packet's
132+
/// inner L4.
133+
const HCK_INNER_FULL = 1 << 8;
134+
/// Rx: Hardware has verified that inner L3/L4 checksums are correct.
135+
const HCK_INNER_FULL_OK = 1 << 9;
136+
/// The union of all checksum-related flags.
137+
const HCK_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() |
138+
Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits() |
139+
Self::HCK_FULLCKSUM_OK.bits() | Self::HCK_INNER_V4CKSUM.bits() |
140+
Self::HCK_INNER_V4CKSUM_OK.bits() | Self::HCK_INNER_PARTIAL.bits() |
141+
Self::HCK_INNER_FULL.bits() | Self::HCK_INNER_FULL_OK.bits();
142+
/// The union of all checksum-related flags used in the transmit path
143+
/// (i.e., indicating missing checksums).
144+
const HCK_TX_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() |
145+
Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits() |
146+
Self::HCK_INNER_V4CKSUM.bits() | Self::HCK_INNER_PARTIAL.bits() |
147+
Self::HCK_INNER_FULL.bits();
148+
/// The union of all checksum-related flags used in the transmit path
149+
/// for outer headers (untunnelled packets and encap layers).
150+
const HCK_OUTER_TX_FLAGS = Self::HCK_IPV4_HDRCKSUM.bits() |
151+
Self::HCK_PARTIALCKSUM.bits() | Self::HCK_FULLCKSUM.bits();
152+
/// The union of all checksum-related flags for outer headers (untunnelled
153+
/// packets and encap layers).
154+
const HCK_OUTER_FLAGS = Self::HCK_OUTER_TX_FLAGS.bits() |
155+
Self::HCK_IPV4_HDRCKSUM_OK.bits() | Self::HCK_FULLCKSUM_OK.bits();
156+
/// The union of all checksum-related flags used in the transmit path
157+
/// for inner headers (tunnelled packets).
158+
const HCK_INNER_TX_FLAGS = Self::HCK_INNER_V4CKSUM.bits() |
159+
Self::HCK_INNER_PARTIAL.bits() | Self::HCK_INNER_FULL.bits();
160+
/// The union of all checksum-related flags for inner headers (tunnelled
161+
/// packets).
162+
const HCK_INNER_FLAGS = Self::HCK_INNER_TX_FLAGS.bits() |
163+
Self::HCK_INNER_V4CKSUM_OK.bits() | Self::HCK_INNER_FULL_OK.bits();
164+
/// The union of all LSO-related flags.
165+
const HW_LSO_FLAGS = Self::HW_LSO.bits();
166+
}
167+
}
168+
169+
impl MblkOffloadFlags {
170+
/// Move any outer offload flags to the inner layer, as part of
171+
/// encapsulation.
///
/// The returned set has every outer *and* inner checksum flag of `self`
/// removed, and then gains the inner counterpart of each outer flag that
/// was present. Flags outside both checksum groups (e.g. `HW_LSO`) are
/// carried over unchanged; inner flags already set on `self` are not.
///
/// `HCK_IPV4_HDRCKSUM_OK` shares its bit with `HCK_IPV4_HDRCKSUM`, so that
/// bit always maps to `HCK_INNER_V4CKSUM` (never `HCK_INNER_V4CKSUM_OK`).
172+
pub fn shift_in(self) -> Self {
173+
// Start from `self` minus all checksum flags, keeping only unrelated bits.
let mut out =
174+
self.difference(Self::HCK_INNER_FLAGS.union(Self::HCK_OUTER_FLAGS));
175+
176+
// Outer IPv4 header checksum request -> inner IPv4 header checksum.
if self.contains(Self::HCK_IPV4_HDRCKSUM) {
177+
out |= Self::HCK_INNER_V4CKSUM;
178+
}
179+
180+
// Outer partial L4 checksum -> inner partial L4 checksum.
if self.contains(Self::HCK_PARTIALCKSUM) {
181+
out |= Self::HCK_INNER_PARTIAL;
182+
}
183+
184+
// Outer full L4 checksum -> inner full L4 checksum.
if self.contains(Self::HCK_FULLCKSUM) {
185+
out |= Self::HCK_INNER_FULL;
186+
}
187+
188+
// Outer "verified OK" marker -> inner "verified OK" marker.
if self.contains(Self::HCK_FULLCKSUM_OK) {
189+
out |= Self::HCK_INNER_FULL_OK;
190+
}
191+
192+
out
193+
}
194+
}

lib/opte/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ opte-api.workspace = true
2929

3030
ingot.workspace = true
3131

32-
bitflags.workspace = true
32+
bitflags = { workspace = true , features = ["serde"] }
3333
cfg-if.workspace = true
3434
crc32fast = { workspace = true, optional = true }
3535
dyn-clone.workspace = true

0 commit comments

Comments
 (0)