diff --git a/docs/design/features/offload_XDP/design_and_limitations.md b/docs/design/features/offload_XDP/design_and_limitations.md new file mode 100644 index 00000000..891926f5 --- /dev/null +++ b/docs/design/features/offload_XDP/design_and_limitations.md @@ -0,0 +1,30 @@ + +The Netronome SmartNIC has the following limitations: + +- It only supports maps of type BPF_MAP_TYPE_ARRAY and BPF_MAP_TYPE_HASH. +- Constraints on the size of the maps: + - The total number of map entries on the NIC must be less than 3,072,000; + - Each entry is limited to a maximum of 64 bytes. +- Constraints on the size of the program: the SmartNIC accepts at most 2800 instructions. +- It does not support XDP_REDIRECT. + + + +Considering the limitations above, the forwarding function of the bouncers and dividers is offloaded to the NIC. + + + +The first packet between two endpoints passes through the bouncer/divider; this forwarding function is implemented on the NIC. In contrast, every packet (not only the first) passes through the XDP program in the endpoint host's kernel. Its main function is to redirect rather than forward, so it is not a target for offloading. 
+ + + +The offloaded workflow is as follows (The black line represents the logic of the mizar, and the red line represents the partial function offloaded to SmartNIC): + +![workflow of offloading XDP](offload_XDP_workflow.png) \ No newline at end of file diff --git a/docs/design/features/offload_XDP/offload_XDP_workflow.png b/docs/design/features/offload_XDP/offload_XDP_workflow.png new file mode 100644 index 00000000..bf1ada5e Binary files /dev/null and b/docs/design/features/offload_XDP/offload_XDP_workflow.png differ diff --git a/etc/deploy/deploy.mizar.dev.yaml b/etc/deploy/deploy.mizar.dev.yaml index df3fa1d7..649ce740 100644 --- a/etc/deploy/deploy.mizar.dev.yaml +++ b/etc/deploy/deploy.mizar.dev.yaml @@ -527,6 +527,8 @@ spec: value: 'false' - name: FEATUREGATE_BWQOS value: 'false' + - name: FEATUREGATE_OFFLOAD_XDP + value: 'false' securityContext: privileged: true volumes: diff --git a/mizar/common/common.py b/mizar/common/common.py index c31c928f..f367924f 100644 --- a/mizar/common/common.py +++ b/mizar/common/common.py @@ -28,6 +28,7 @@ import datetime import json import dateutil.parser +import yaml from kubernetes import watch, client, config from kubernetes.client.rest import ApiException from ctypes.util import find_library @@ -434,6 +435,35 @@ def conf_list_has_max_elements(conf, conf_list): return True return False +def supported_offload_xdp_itf_names(): + """ + According to the list of NIC names, corresponding logic interface names are returned. 
+ """ + logical_itf_names = [] + with open("/var/mizar/supported_xdp_offload_nics.yaml", "r", encoding="utf-8") as f: + supported_nic_names = yaml.safe_load(f) or {} # safe loader; an empty file parses to None, treat it as "no supported NICs" + + rc, data = run_cmd("lspci -mm | grep 'Ethernet controller'") + if rc is not None: + logging.info("Failure running \"lspci -mm | grep 'Ethernet controller'\" with rc:" + f'''{rc}''') + return logical_itf_names + + eth_crtls = [i for i in data.split('\n') if i] + for eth_crtl in eth_crtls: + for vendor_name in supported_nic_names.keys(): + for model_name in supported_nic_names[vendor_name]: + if vendor_name.lower() in eth_crtl.lower() and model_name.lower() in eth_crtl.lower(): + pci_num = eth_crtl.split()[0] + rc, data = run_cmd("ls -l /sys/class/net | grep %s" % pci_num) + if rc is not None: + continue + else: + logical_itfs = [i for i in data.split('\n') if i] + for logical_itf in logical_itfs: + logical_itf_names.append(logical_itf.split('/')[-1]) + + return logical_itf_names + def get_default_itf(): """ Assuming "ip route" returns the following format: diff --git a/mizar/daemon/app.py b/mizar/daemon/app.py index 5868b09c..8fdd23be 100644 --- a/mizar/daemon/app.py +++ b/mizar/daemon/app.py @@ -94,6 +94,11 @@ def init(benchmark=False): output = r.stdout.read().decode().strip() logging.info("Removed existing XDP program: {}".format(output)) + cmd = "nsenter -t 1 -m -u -n -i ip link set dev " + f'''{default_itf}''' + " xdpoffload off" + r = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + output = r.stdout.read().decode().strip() + logging.info("Removed existing offload XDP program: {}".format(output)) + cmd = "nsenter -t 1 -m -u -n -i /trn_bin/transitd >transitd.log &" r = subprocess.Popen(cmd, shell=True) logging.info("Running transitd") @@ -119,6 +124,24 @@ def init(benchmark=False): output = r.stdout.read().decode().strip() logging.info("Running load-transit-xdp: {}".format(output)) + # Offload XDP program removes codes about debugging for size limitation. 
+ if os.getenv('FEATUREGATE_OFFLOAD_XDP', 'false').lower() in ('true', '1'): + if default_itf in supported_offload_xdp_itf_names(): + config = { + "xdp_path": "/trn_xdp/trn_transit_xdp_hardware_offload_ebpf.o", + "pcapfile": "/bpffs/transit_xdp_offload.pcap", + "xdp_flag": CONSTANTS.XDP_OFFLOAD + } + config = json.dumps(config) + cmd = (f'''nsenter -t 1 -m -u -n -i /trn_bin/transit -s {nodeip} load-transit-xdp -i {default_itf} -j '{config}' ''') + r = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + output = r.stdout.read().decode().strip() + logging.info("Running load-transit-xdp with offload mode: {}".format(output)) + else: + logging.info("Offloading transit XDP functionality not supported for interface {}".format(default_itf)) + else: + logging.info("Offload XDP feature is disabled.") + if os.getenv('FEATUREGATE_BWQOS', 'false').lower() in ('false', '0'): logging.info("Bandwidth QoS feature is disabled.") return diff --git a/src/dmn/test/test_dmn.c b/src/dmn/test/test_dmn.c index 78ee6851..627d9f96 100644 --- a/src/dmn/test/test_dmn.c +++ b/src/dmn/test/test_dmn.c @@ -400,6 +400,7 @@ static void do_lo_xdp_load(void) xdp_intf.interface = itf; xdp_intf.xdp_path = xdp_path; xdp_intf.pcapfile = pcapfile; + xdp_intf.xdp_flag = XDP_FLAGS_SKB_MODE; int *rc; expect_function_call(__wrap_bpf_map_update_elem); @@ -408,6 +409,23 @@ static void do_lo_xdp_load(void) assert_int_equal(*rc, 0); } +static void do_lo_offload_xdp_load(void) +{ + rpc_trn_xdp_intf_t xdp_intf; + char itf[] = "lo"; + char xdp_path[] = "/path/to/xdp/object/file"; + char pcapfile[] = "/path/to/bpf/pinned/map"; + xdp_intf.interface = itf; + xdp_intf.xdp_path = xdp_path; + xdp_intf.pcapfile = pcapfile; + xdp_intf.xdp_flag = XDP_FLAGS_HW_MODE; + + int *rc; + expect_function_call(__wrap_bpf_map_update_elem); + rc = load_transit_xdp_1_svc(&xdp_intf, NULL); + assert_int_equal(*rc, 0); +} + static void do_lo_xdp_unload(void) { int *rc; @@ -466,6 +484,27 @@ static void test_update_vpc_1_svc(void 
**state) assert_int_equal(*rc, 0); } +static void test_update_offload_vpc_1_svc(void **state) +{ + UNUSED(state); + + char itf[] = "lo"; + uint32_t routers[] = { 0x100000a, 0x200000a }; + + struct rpc_trn_vpc_t vpc1 = { + .interface = itf, + .tunid = 3, + .routers_ips = { .routers_ips_len = 2, + .routers_ips_val = routers } + + }; + + int *rc; + expect_function_calls(__wrap_bpf_map_update_elem, 2); + rc = update_vpc_1_svc(&vpc1, NULL); + assert_int_equal(*rc, 0); +} + static void test_update_net_1_svc(void **state) { UNUSED(state); @@ -488,6 +527,28 @@ static void test_update_net_1_svc(void **state) assert_int_equal(*rc, 0); } +static void test_update_offload_net_1_svc(void **state) +{ + UNUSED(state); + + char itf[] = "lo"; + uint32_t switches[] = { 0x100000a, 0x200000a }; + + struct rpc_trn_network_t net1 = { + .interface = itf, + .prefixlen = 16, + .tunid = 3, + .netip = 0xa, + .switches_ips = { .switches_ips_len = 2, + .switches_ips_val = switches } + }; + + int *rc; + expect_function_calls(__wrap_bpf_map_update_elem, 2); + rc = update_net_1_svc(&net1, NULL); + assert_int_equal(*rc, 0); +} + static void test_update_ep_1_svc(void **state) { UNUSED(state); @@ -516,6 +577,34 @@ static void test_update_ep_1_svc(void **state) assert_int_equal(*rc, 0); } +static void test_update_offload_ep_1_svc(void **state) +{ + UNUSED(state); + + char itf[] = "lo"; + char vitf[] = "veth0"; + char hosted_itf[] = "veth"; + uint32_t remote[] = { 0x200000a }; + char mac[6] = { 1, 2, 3, 4, 5, 6 }; + + struct rpc_trn_endpoint_t ep1 = { + .interface = itf, + .ip = 0x100000a, + .eptype = 1, + .remote_ips = { .remote_ips_len = 1, .remote_ips_val = remote }, + .hosted_interface = hosted_itf, + .veth = vitf, + .tunid = 3, + }; + + memcpy(ep1.mac, mac, sizeof(char) * 6); + + int *rc; + expect_function_calls(__wrap_bpf_map_update_elem, 3); + rc = update_ep_1_svc(&ep1, NULL); + assert_int_equal(*rc, 0); +} + #if 0 static void test_update_agent_ep_1_svc(void **state) { @@ -1487,6 +1576,32 @@ 
static void test_delete_vpc_1_svc(void **state) assert_int_equal(*rc, RPC_TRN_ERROR); } +static void test_delete_offload_vpc_1_svc(void **state) +{ + UNUSED(state); + char itf[] = "lo"; + struct rpc_trn_vpc_key_t vpc_key = { .interface = itf, .tunid = 3 }; + int *rc; + + /* Test delete_vpc_1 with valid vp_ckey */ + will_return(__wrap_bpf_map_delete_elem, TRUE); + will_return(__wrap_bpf_map_delete_elem, TRUE); + expect_function_calls(__wrap_bpf_map_delete_elem, 2); + rc = delete_vpc_1_svc(&vpc_key, NULL); + assert_int_equal(*rc, 0); + + /* Test delete_vpc_1 with invalid vpc_key */ + will_return(__wrap_bpf_map_delete_elem, FALSE); + expect_function_call(__wrap_bpf_map_delete_elem); + rc = delete_vpc_1_svc(&vpc_key, NULL); + assert_int_equal(*rc, RPC_TRN_FATAL); + + /* Test delete_vpc_1 with invalid interface*/ + vpc_key.interface = ""; + rc = delete_vpc_1_svc(&vpc_key, NULL); + assert_int_equal(*rc, RPC_TRN_ERROR); +} + static void test_delete_net_1_svc(void **state) { UNUSED(state); @@ -1517,6 +1632,38 @@ static void test_delete_net_1_svc(void **state) assert_int_equal(*rc, RPC_TRN_ERROR); } +static void test_delete_offload_net_1_svc(void **state) +{ + UNUSED(state); + char itf[] = "lo"; + struct rpc_trn_network_key_t net_key = { + .interface = itf, + .prefixlen = 16, + .tunid = 3, + .netip = 0xa, + }; + int *rc; + + /* Test delete_net_1 with valid net_key */ + will_return(__wrap_bpf_map_delete_elem, TRUE); + will_return(__wrap_bpf_map_delete_elem, TRUE); + expect_function_calls(__wrap_bpf_map_delete_elem, 2); + rc = delete_net_1_svc(&net_key, NULL); + assert_int_equal(*rc, 0); + + /* Test delete_net_1 with invalid net_key */ + will_return(__wrap_bpf_map_delete_elem, FALSE); + expect_function_call(__wrap_bpf_map_delete_elem); + rc = delete_net_1_svc(&net_key, NULL); + assert_int_equal(*rc, RPC_TRN_ERROR); + + /* Test delete_net_1 with invalid interface*/ + net_key.interface = ""; + rc = delete_net_1_svc(&net_key, NULL); + assert_int_equal(*rc, RPC_TRN_ERROR); +} + + 
static void test_delete_ep_1_svc(void **state) { UNUSED(state); @@ -1566,6 +1713,56 @@ static void test_delete_ep_1_svc(void **state) assert_int_equal(*rc, RPC_TRN_ERROR); } +static void test_delete_offload_ep_1_svc(void **state) +{ + UNUSED(state); + char itf[] = "lo"; + struct rpc_trn_endpoint_key_t ep_key = { + .interface = itf, + .ip = 0x100000a, + .tunid = 3, + }; + int *rc; + + uint32_t remote[] = { 0x200000a }; + char mac[6] = { 1, 2, 3, 4, 5, 6 }; + + struct endpoint_t ep_val; + ep_val.eptype = 1; + ep_val.nremote_ips = 1; + ep_val.remote_ips[0] = remote[0]; + ep_val.hosted_iface = 1; + memcpy(ep_val.mac, mac, sizeof(mac)); + + /* Test delete_ep_1 with valid ep_key */ + will_return(__wrap_bpf_map_lookup_elem, &ep_val); + will_return(__wrap_bpf_map_lookup_elem, NULL); + will_return(__wrap_bpf_map_lookup_elem, NULL); + will_return(__wrap_bpf_map_lookup_elem, NULL); + will_return(__wrap_bpf_map_delete_elem, TRUE); + will_return(__wrap_bpf_map_delete_elem, TRUE); + expect_function_call(__wrap_bpf_map_lookup_elem); + expect_function_calls(__wrap_bpf_map_delete_elem, 2); + rc = delete_ep_1_svc(&ep_key, NULL); + assert_int_equal(*rc, 0); + + /* Test delete_ep_1 with invalid ep_key */ + will_return(__wrap_bpf_map_lookup_elem, &ep_val); + will_return(__wrap_bpf_map_lookup_elem, NULL); + will_return(__wrap_bpf_map_lookup_elem, NULL); + will_return(__wrap_bpf_map_lookup_elem, NULL); + will_return(__wrap_bpf_map_delete_elem, FALSE); + expect_function_call(__wrap_bpf_map_lookup_elem); + expect_function_call(__wrap_bpf_map_delete_elem); + rc = delete_ep_1_svc(&ep_key, NULL); + assert_int_equal(*rc, RPC_TRN_ERROR); + + /* Test delete_ep_1 with invalid interface*/ + ep_key.interface = ""; + rc = delete_ep_1_svc(&ep_key, NULL); + assert_int_equal(*rc, RPC_TRN_ERROR); +} + static void test_delete_agent_ep_1_svc(void **state) { UNUSED(state); @@ -1656,6 +1853,16 @@ static int groupSetup(void **state) return 0; } +static int offload_groupSetup(void **state) +{ + UNUSED(state); 
+ TRN_LOG_INIT("transitd_offload_unit"); + trn_itf_table_init(); + do_lo_xdp_load(); + do_lo_offload_xdp_load(); + return 0; +} + /** * This is run once after all group tests */ @@ -1709,7 +1916,18 @@ int main() cmocka_unit_test(test_delete_agent_network_policy_protocol_port_1_svc) }; + const struct CMUnitTest offload_tests[] = { + cmocka_unit_test(test_update_offload_vpc_1_svc), + cmocka_unit_test(test_update_offload_net_1_svc), + cmocka_unit_test(test_update_offload_ep_1_svc), + cmocka_unit_test(test_delete_offload_vpc_1_svc), + cmocka_unit_test(test_delete_offload_net_1_svc), + cmocka_unit_test(test_delete_offload_ep_1_svc) + }; + int result = cmocka_run_group_tests(tests, groupSetup, groupTeardown); + result |= cmocka_run_group_tests(offload_tests, offload_groupSetup, groupTeardown); /* OR, not assign: keep first-group failures in the exit status */ + return result; } diff --git a/src/dmn/trn_rpc_protocol_handlers_1.c b/src/dmn/trn_rpc_protocol_handlers_1.c index 085ff9b9..308abf29 100644 --- a/src/dmn/trn_rpc_protocol_handlers_1.c +++ b/src/dmn/trn_rpc_protocol_handlers_1.c @@ -616,53 +616,73 @@ int *load_transit_xdp_1_svc(rpc_trn_xdp_intf_t *xdp_intf, struct svc_req *rqstp) struct user_metadata_t empty_md; struct user_metadata_t *md = trn_itf_table_find(itf); - if (md) { - TRN_LOG_INFO("Transit XDP for interface %s already exist.", itf); - return &result; - } + if (xdp_flag == XDP_FLAGS_HW_MODE) { + if (!md) { + TRN_LOG_ERROR("Cannot find interface metadata for %s", itf); + result = RPC_TRN_FATAL; + return &result; + } - TRN_LOG_INFO("Loading transit XDP for interface %s.", itf); - md = malloc(sizeof(struct user_metadata_t)); - if (!md) { - TRN_LOG_ERROR("Failure allocating memory for user_metadata_t"); - result = RPC_TRN_FATAL; - goto error; - } + // Metadata has been initialized in XDP_FLAGS_SKB_MODE + rc = trn_user_metadata_init_offload(md, itf, kern_path, xdp_flag); + if (rc != 0) { + TRN_LOG_ERROR( + "Failure initializing or loading transit XDP offload program for interface %s", + itf); + result = RPC_TRN_FATAL; + return 
&result; + } - memset(md, 0, sizeof(struct user_metadata_t)); - // Set all interface index slots to unused - int i; - for (i = 0; i < TRAN_MAX_ITF; i++) { - md->itf_idx[i] = TRAN_UNUSED_ITF_IDX; - } + TRN_LOG_INFO("Successfully loaded transit XDP offload on interface %s", itf); + } else { + if (md) { + TRN_LOG_INFO("Transit XDP for interface %s already exist.", itf); + return &result; + } - strcpy(md->pcapfile, xdp_intf->pcapfile); - md->pcapfile[255] = '\0'; - md->xdp_flags = xdp_intf->xdp_flag; + TRN_LOG_INFO("Loading transit XDP for interface %s.", itf); + md = malloc(sizeof(struct user_metadata_t)); + if (!md) { + TRN_LOG_ERROR("Failure allocating memory for user_metadata_t"); + result = RPC_TRN_FATAL; + goto error; + } - TRN_LOG_DEBUG("load_transit_xdp_1 path: %s, pcap: %s", - xdp_intf->xdp_path, xdp_intf->pcapfile); + memset(md, 0, sizeof(struct user_metadata_t)); + // Set all interface index slots to unused + int i; + for (i = 0; i < TRAN_MAX_ITF; i++) { + md->itf_idx[i] = TRAN_UNUSED_ITF_IDX; + } - rc = trn_user_metadata_init(md, itf, kern_path, md->xdp_flags); - if (rc != 0) { - TRN_LOG_ERROR( - "Failure initializing or loading transit XDP program for interface %s", - itf); - result = RPC_TRN_FATAL; - goto error; - } + strcpy(md->pcapfile, xdp_intf->pcapfile); + md->pcapfile[255] = '\0'; + md->xdp_flags = xdp_intf->xdp_flag; - rc = trn_itf_table_insert(itf, md); - if (rc != 0) { - TRN_LOG_ERROR( - "Failure populating interface table when loading XDP program on %s", - itf); - result = RPC_TRN_ERROR; - unload_error = true; - goto error; - } + TRN_LOG_DEBUG("load_transit_xdp_1 path: %s, pcap: %s", + xdp_intf->xdp_path, xdp_intf->pcapfile); - TRN_LOG_INFO("Successfully loaded transit XDP on interface %s", itf); + rc = trn_user_metadata_init(md, itf, kern_path, md->xdp_flags); + if (rc != 0) { + TRN_LOG_ERROR( + "Failure initializing or loading transit XDP program for interface %s", + itf); + result = RPC_TRN_FATAL; + goto error; + } + + rc = 
trn_itf_table_insert(itf, md); + if (rc != 0) { + TRN_LOG_ERROR( + "Failure populating interface table when loading XDP program on %s", + itf); + result = RPC_TRN_ERROR; + unload_error = true; + goto error; + } + + TRN_LOG_INFO("Successfully loaded transit XDP on interface %s", itf); + } return &result; diff --git a/src/dmn/trn_transit_xdp_usr.c b/src/dmn/trn_transit_xdp_usr.c index e69cfa78..7dc9b1e7 100644 --- a/src/dmn/trn_transit_xdp_usr.c +++ b/src/dmn/trn_transit_xdp_usr.c @@ -220,6 +220,25 @@ int trn_update_network(struct user_metadata_t *md, struct network_key_t *netkey, TRN_LOG_ERROR("Store network mapping failed (err:%d)", err); return 1; } + + if (md->xdp_flags == XDP_FLAGS_HW_MODE) { + struct network_offload_t net_offload = { 0 }; /* zeroed: only the first nswitches slots are copied below, the rest must not carry stack garbage into the map */ + if (net->nswitches > TRAN_MAX_NSWITCH_OFFLOAD) { + TRN_LOG_ERROR("Store offloaded network mapping failed for exceeding TRAN_MAX_NSWITCH_OFFLOAD"); + return 1; + } + + net_offload.prefixlen = net->prefixlen; + memcpy(net_offload.nip, net->nip, sizeof(net_offload.nip)); + net_offload.nswitches = net->nswitches; + memcpy(net_offload.switches_ips, net->switches_ips, net->nswitches * sizeof(net_offload.switches_ips[0])); + err = bpf_map_update_elem(md->networks_offload_map_fd, netkey, &net_offload, 0); + if (err) { + TRN_LOG_ERROR("Store offloaded network mapping failed (err:%d)", err); + return 1; + } + } + return 0; } @@ -280,6 +299,25 @@ int trn_update_endpoint(struct user_metadata_t *md, return 1; } + if (md->xdp_flags == XDP_FLAGS_HW_MODE) { + struct endpoint_offload_t ep_offload = { 0 }; /* zeroed: only the first nremote_ips slots are copied below, the rest must not carry stack garbage into the map */ + if (ep->nremote_ips > TRAN_MAX_REMOTES_OFFLOAD) { + TRN_LOG_ERROR("Store offloaded endpoint mapping failed for exceeding TRAN_MAX_REMOTES_OFFLOAD"); + return 1; + } + + ep_offload.eptype = ep->eptype; + ep_offload.nremote_ips = ep->nremote_ips; + memcpy(ep_offload.remote_ips, ep->remote_ips, ep->nremote_ips * sizeof(ep_offload.remote_ips[0])); + ep_offload.hosted_iface = ep->hosted_iface; + memcpy(ep_offload.mac, ep->mac, sizeof(ep->mac)); + err = 
bpf_map_update_elem(md->endpoints_offload_map_fd, epkey, &ep_offload, 0); + if (err) { + TRN_LOG_ERROR("Store offloaded endpoint mapping failed (err:%d).", err); + return 1; + } + } + return 0; } @@ -291,6 +329,23 @@ int trn_update_vpc(struct user_metadata_t *md, struct vpc_key_t *vpckey, TRN_LOG_ERROR("Store VPCs mapping failed (err:%d).", err); return 1; } + + if (md->xdp_flags == XDP_FLAGS_HW_MODE) { + struct vpc_offload_t vpc_offload = { 0 }; /* zeroed: only the first nrouters slots are copied below, the rest must not carry stack garbage into the map */ + if (vpc->nrouters > TRAN_MAX_NROUTER_OFFLOAD) { + TRN_LOG_ERROR("Store offloaded vpc mapping failed for exceeding TRAN_MAX_NROUTER_OFFLOAD"); + return 1; + } + + vpc_offload.nrouters = vpc->nrouters; + memcpy(vpc_offload.routers_ips, vpc->routers_ips, vpc->nrouters * sizeof(vpc_offload.routers_ips[0])); + err = bpf_map_update_elem(md->vpc_offload_map_fd, vpckey, &vpc_offload, 0); + if (err) { + TRN_LOG_ERROR("Store offloaded vpc mapping failed (err:%d).", err); + return 1; + } + } + return 0; } @@ -491,6 +546,15 @@ int trn_delete_network(struct user_metadata_t *md, struct network_key_t *netkey) TRN_LOG_ERROR("Deleting network mapping failed (err:%d).", err); return 1; } + + if (md->xdp_flags == XDP_FLAGS_HW_MODE) { + err = bpf_map_delete_elem(md->networks_offload_map_fd, netkey); + if (err) { + TRN_LOG_ERROR("Deleting offload network mapping failed (err:%d).", err); + return 1; + } + } + return 0; } @@ -518,6 +582,15 @@ int trn_delete_endpoint(struct user_metadata_t *md, return 1; } + if (md->xdp_flags == XDP_FLAGS_HW_MODE) { + err = bpf_map_delete_elem(md->endpoints_offload_map_fd, epkey); + if (err) { + TRN_LOG_ERROR("Deleting offload endpoint mapping failed (err:%d).", + err); + return 1; + } + } + return 0; } @@ -528,6 +601,15 @@ int trn_delete_vpc(struct user_metadata_t *md, struct vpc_key_t *vpckey) TRN_LOG_ERROR("Deleting vpc mapping failed (err:%d).", err); return 1; } + + if (md->xdp_flags == XDP_FLAGS_HW_MODE) { + err = bpf_map_delete_elem(md->vpc_offload_map_fd, vpckey); + if (err) { + TRN_LOG_ERROR("Deleting offload 
vpc mapping failed (err:%d).", err); + return 1; + } + } + return 0; } @@ -645,6 +727,82 @@ int trn_user_metadata_init(struct user_metadata_t *md, char *itf, return 0; } +int trn_user_metadata_init_offload(struct user_metadata_t *md, char *itf, + char *kern_path, int xdp_flags) +{ + int rc; + struct rlimit r = { RLIM_INFINITY, RLIM_INFINITY }; + struct bpf_prog_load_attr prog_load_attr = { .prog_type = + BPF_PROG_TYPE_XDP, + .file = kern_path }; + __u32 info_len = sizeof(md->info_offload); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + TRN_LOG_ERROR("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + md->ifindex = if_nametoindex(itf); + prog_load_attr.ifindex = md->ifindex; + if (!md->ifindex) { + TRN_LOG_ERROR("if_nametoindex"); + return 1; + } + + md->eth.ip = trn_get_interface_ipv4(md->ifindex); + md->eth.iface_index = md->ifindex; + + // offload_xdp cannot reuse the pinned maps(network policy) + if (bpf_prog_load_xattr(&prog_load_attr, &md->obj_offload, &md->prog_offload_fd)) { + TRN_LOG_ERROR("Error loading bpf: %s", kern_path); + return 1; + } + + // map_init + md->networks_offload_map = bpf_map__next(NULL, md->obj_offload); + md->vpc_offload_map = bpf_map__next(md->networks_offload_map, md->obj_offload); + md->endpoints_offload_map = bpf_map__next(md->vpc_offload_map, md->obj_offload); + md->interface_config_offload_map = bpf_map__next(md->endpoints_offload_map, md->obj_offload); + if (!md->endpoints_offload_map || !md->interface_config_offload_map || + !md->networks_offload_map || !md->vpc_offload_map) { + TRN_LOG_ERROR("Failure finding offloaded maps objects."); + return 1; + } + md->networks_offload_map_fd = bpf_map__fd(md->networks_offload_map); + md->vpc_offload_map_fd = bpf_map__fd(md->vpc_offload_map); + md->endpoints_offload_map_fd = bpf_map__fd(md->endpoints_offload_map); + md->interface_config_offload_map_fd = bpf_map__fd(md->interface_config_offload_map); + // map_init done + + if (!md->prog_offload_fd) { + TRN_LOG_ERROR("load_bpf_file: %s.", 
strerror(errno)); + return 1; + } + + if (bpf_set_link_xdp_fd(md->ifindex, md->prog_offload_fd, xdp_flags) < 0) { + TRN_LOG_ERROR("link set xdp_offload fd failed - %s.", strerror(errno)); + return 1; + } + + rc = bpf_obj_get_info_by_fd(md->prog_offload_fd, &md->info_offload, &info_len); + if (rc != 0) { + TRN_LOG_ERROR("can't get prog info - %s.", strerror(errno)); + return rc; + } + md->prog_offload_id = md->info_offload.id; + + // As the config of original Transit Program already has the itf_idx, set Offload Program as the same config + int k = 0; + rc = bpf_map_update_elem(md->interface_config_offload_map_fd, &k, &md->eth, 0); + if (rc != 0) { + TRN_LOG_ERROR("Failed to store interface data."); + return 1; + } + + md->xdp_flags = xdp_flags; // overwrite xdp_flags with XDP_OFFLOAD + return 0; +} + uint32_t trn_get_interface_ipv4(int itf_idx) { int fd; diff --git a/src/dmn/trn_transit_xdp_usr.h b/src/dmn/trn_transit_xdp_usr.h index c17ff797..e71a2223 100644 --- a/src/dmn/trn_transit_xdp_usr.h +++ b/src/dmn/trn_transit_xdp_usr.h @@ -117,6 +117,8 @@ struct user_metadata_t { __u32 xdp_flags; int prog_fd; __u32 prog_id; + int prog_offload_fd; + __u32 prog_offload_id; char pcapfile[256]; int itf_idx[TRAN_MAX_ITF]; @@ -148,6 +150,10 @@ struct user_metadata_t { int ing_namespace_label_policy_map_fd; int ing_pod_and_namespace_label_policy_map_fd; int tx_stats_map_fd; + int networks_offload_map_fd; + int vpc_offload_map_fd; + int endpoints_offload_map_fd; + int interface_config_offload_map_fd; struct bpf_map *jmp_table_map; struct bpf_map *networks_map; @@ -177,9 +183,15 @@ struct user_metadata_t { struct bpf_map *ing_namespace_label_policy_map; struct bpf_map *ing_pod_and_namespace_label_policy_map; struct bpf_map *tx_stats_map; + struct bpf_map *networks_offload_map; + struct bpf_map *vpc_offload_map; + struct bpf_map *endpoints_offload_map; + struct bpf_map *interface_config_offload_map; struct bpf_prog_info info; struct bpf_object *obj; + struct bpf_prog_info 
info_offload; + struct bpf_object *obj_offload; /* Array of programs at different stages. Currently supporting only one extra tail-call */ struct ebpf_prog_stage_t ebpf_progs[TRAN_MAX_PROG]; @@ -221,6 +233,9 @@ int trn_delete_network(struct user_metadata_t *md, int trn_user_metadata_init(struct user_metadata_t *md, char *itf, char *kern_path, int xdp_flags); +int trn_user_metadata_init_offload(struct user_metadata_t *md, char *itf, + char *kern_path, int xdp_flags); + uint32_t trn_get_interface_ipv4(int itf_idx); int trn_add_prog(struct user_metadata_t *md, unsigned int prog_idx, diff --git a/src/include/trn_datamodel.h b/src/include/trn_datamodel.h index c20f1fd6..99f42199 100644 --- a/src/include/trn_datamodel.h +++ b/src/include/trn_datamodel.h @@ -32,6 +32,9 @@ #define TRAN_MAX_NSWITCH 128 #define TRAN_MAX_NROUTER 128 #define TRAN_MAX_REMOTES 128 +#define TRAN_MAX_NSWITCH_OFFLOAD 5 +#define TRAN_MAX_NROUTER_OFFLOAD 5 +#define TRAN_MAX_REMOTES_OFFLOAD 7 #define TRAN_MAX_ITF 128 #define TRAN_UNUSED_ITF_IDX -1 @@ -95,6 +98,14 @@ struct endpoint_t { unsigned char mac[6]; } __attribute__((packed, aligned(4))); +struct endpoint_offload_t { + __u32 eptype; + __u32 nremote_ips; + __u32 remote_ips[TRAN_MAX_REMOTES_OFFLOAD]; //cause the size of remote_ips[TRAN_MAX_REMOTES] is too big to offload + int hosted_iface; + unsigned char mac[6]; +} __attribute__((packed, aligned(4))); + struct packet_metadata_key_t { __u32 tunip[3]; } __attribute__((packed)); @@ -126,6 +137,13 @@ struct network_t { __u32 switches_ips[TRAN_MAX_NSWITCH]; } __attribute__((packed, aligned(4))); +struct network_offload_t { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u32 nip[3]; + __u32 nswitches; + __u32 switches_ips[TRAN_MAX_NSWITCH_OFFLOAD]; +} __attribute__((packed, aligned(4))); + struct vpc_key_t { union { __be64 tunnel_id; @@ -137,6 +155,11 @@ struct vpc_t { __u32 routers_ips[TRAN_MAX_NROUTER]; } __attribute__((packed, aligned(4))); +struct vpc_offload_t { + __u32 
nrouters; + __u32 routers_ips[TRAN_MAX_NROUTER_OFFLOAD]; +} __attribute__((packed, aligned(4))); + struct tunnel_iface_t { int iface_index; __u32 ip; diff --git a/src/xdp/trn_transit_xdp_hardware_offload.c b/src/xdp/trn_transit_xdp_hardware_offload.c new file mode 100644 index 00000000..8ac560ee --- /dev/null +++ b/src/xdp/trn_transit_xdp_hardware_offload.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * @file trn_transit_xdp_hardware_offload.c + * @author Peng Yang (@yangpenger) + * + * @brief Offloads functions of bouncers and dividers about Direct Path. + * This offloaded program works before original Transit XDP program, + * i.e., multiple programs on the same XDP interface. + * Thus, non-offload functions are performed by original Transit XDP program. + * + * @copyright Copyright (c) 2019 The Authors. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "extern/bpf_endian.h" +#include "extern/bpf_helpers.h" +#include "trn_datamodel.h" +#include "trn_kern.h" + +int _version SEC("version") = 1; + +struct bpf_map_def SEC("maps") networks_offload_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct network_key_t), + .value_size = sizeof(struct network_offload_t), + .max_entries = 1000001, + .map_flags = 0, +}; +BPF_ANNOTATE_KV_PAIR(networks_offload_map, struct network_key_t, struct network_offload_t); + +struct bpf_map_def SEC("maps") vpc_offload_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct vpc_key_t), + .value_size = sizeof(struct vpc_offload_t), + .max_entries = 1000001, + .map_flags = 0, +}; +BPF_ANNOTATE_KV_PAIR(vpc_offload_map, struct vpc_key_t, struct vpc_offload_t); + +struct bpf_map_def SEC("maps") endpoints_offload_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct endpoint_key_t), + .value_size = sizeof(struct endpoint_offload_t), + .max_entries = 1000001, + .map_flags = 0, +}; +BPF_ANNOTATE_KV_PAIR(endpoints_offload_map, struct endpoint_key_t, struct endpoint_offload_t); + +struct bpf_map_def SEC("maps") interface_config_offload_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(struct tunnel_iface_t), + .max_entries = 1, + .map_flags = 0, +}; +BPF_ANNOTATE_KV_PAIR(interface_config_offload_map, int, struct tunnel_iface_t); + +static __inline int trn_rewrite_remote_mac(struct transit_packet *pkt) +{ + /* The TTL must have been decremented before this step, Drop the + packet if TTL is zero */ + if (!pkt->ip->ttl) + return XDP_DROP; + + struct endpoint_offload_t *remote_ep; + struct endpoint_key_t epkey; + epkey.tunip[0] = 0; + epkey.tunip[1] = 0; + epkey.tunip[2] = pkt->ip->daddr; + /* Get the remote_mac address based on the value of the outer dest IP */ + 
remote_ep = bpf_map_lookup_elem(&endpoints_offload_map, &epkey); + if (!remote_ep) { + return XDP_DROP; + } + + trn_set_src_mac(pkt->data, pkt->eth->h_dest); + trn_set_dst_mac(pkt->data, remote_ep->mac); + + if (pkt->ip->tos & IPTOS_MINCOST) { + return XDP_PASS; + } + + return XDP_TX; +}
+
+/*
+ * Forward a packet whose destination endpoint is not known locally.
+ *
+ * The destination network is looked up by exact match on
+ * (tunnel id, inner_dst_ip & 0xFFFF). If it is found, the outer destination
+ * is rewritten to the first switch of that network; otherwise the outer
+ * destination is rewritten to the first router of the packet's VPC.
+ *
+ * inner_src_ip is accepted for signature compatibility and is not used.
+ *
+ * Returns the verdict of trn_rewrite_remote_mac(), or XDP_DROP when the
+ * network entry is inconsistent with the key or no VPC entry exists.
+ */
+static __inline int trn_router_handle_pkt(struct transit_packet *pkt,
+                                          __u32 inner_src_ip,
+                                          __u32 inner_dst_ip)
+{
+    __be64 tunnel_id = trn_vni_to_tunnel_id(pkt->geneve->vni);
+    struct network_key_t nkey;
+    struct network_offload_t *net;
+
+    /* The SmartNIC does not support BPF_MAP_TYPE_LPM_TRIE, so match with
+       an exact prefix length (80) instead of a longest-prefix lookup. */
+    nkey.prefixlen = 80;
+    __builtin_memcpy(&nkey.nip[0], &tunnel_id, sizeof(tunnel_id));
+    /* Obtain the network number by the mask; the subnet prefix length is
+       fixed at 16.
+       NOTE(review): inner_dst_ip comes straight from the packet, so this
+       mask selects the first two octets only on a little-endian target;
+       confirm for the offload target. */
+    nkey.nip[2] = inner_dst_ip & 0xFFFF;
+    net = bpf_map_lookup_elem(&networks_offload_map, &nkey);
+
+    if (net) {
+        /* Validate the entry before modifying the packet, so a dropped
+           packet is never touched. */
+        if (net->nip[0] != nkey.nip[0] || net->nip[1] != nkey.nip[1]) {
+            return XDP_DROP;
+        }
+
+        /* Record the current outer destination IP and MAC in the RTS
+           option before the outer header is rewritten. */
+        pkt->rts_opt->rts_data.host.ip = pkt->ip->daddr;
+        __builtin_memcpy(pkt->rts_opt->rts_data.host.mac,
+                         pkt->eth->h_dest, 6 * sizeof(unsigned char));
+
+        /* Only send to the first switch. */
+        __u32 swidx = 0;
+        trn_set_src_dst_ip_csum(pkt, pkt->ip->daddr,
+                                net->switches_ips[swidx]);
+
+        return trn_rewrite_remote_mac(pkt);
+    }
+
+    /* Network not found: forward the packet to the VPC router. */
+    struct vpc_key_t vpckey;
+    struct vpc_offload_t *vpc;
+
+    vpckey.tunnel_id = tunnel_id;
+    vpc = bpf_map_lookup_elem(&vpc_offload_map, &vpckey);
+    if (!vpc) {
+        return XDP_DROP;
+    }
+
+    /* Only send to the first router. */
+    __u32 routeridx = 0;
+    trn_set_src_dst_ip_csum(pkt, pkt->ip->daddr,
+                            vpc->routers_ips[routeridx]);
+
+    return trn_rewrite_remote_mac(pkt);
+}
+
+/*
+ * Switch-level forwarding decision for an inner packet.
+ *
+ * Looks up the destination endpoint by (tunnel id, inner_dst_ip):
+ *   - not found and the packet carries a scaled-endpoint MODIFY option:
+ *     XDP_PASS;
+ *   - not found otherwise: delegate to trn_router_handle_pkt();
+ *   - found and hosted on an interface (hosted_iface != -1), or a scaled
+ *     endpoint: XDP_PASS;
+ *   - found with no remote IPs: XDP_DROP;
+ *   - otherwise rewrite the outer destination to the endpoint's first
+ *     remote IP and return the verdict of trn_rewrite_remote_mac().
+ *
+ * orig_src_ip is accepted for signature compatibility and is not used.
+ */
+static __inline int trn_switch_handle_pkt(struct transit_packet *pkt,
+                                          __u32 inner_src_ip,
+                                          __u32 inner_dst_ip, __u32 orig_src_ip)
+{
+    __be64 tunnel_id = trn_vni_to_tunnel_id(pkt->geneve->vni);
+    struct endpoint_offload_t *ep;
+    struct endpoint_key_t epkey;
+
+    __builtin_memcpy(&epkey.tunip[0], &tunnel_id, sizeof(tunnel_id));
+    epkey.tunip[2] = inner_dst_ip;
+
+    /* Get the endpoint based on the inner destination IP and the VNI. */
+    ep = bpf_map_lookup_elem(&endpoints_offload_map, &epkey);
+    if (!ep) {
+        if (pkt->scaled_ep_opt->type == TRN_GNV_SCALED_EP_OPT_TYPE &&
+            pkt->scaled_ep_opt->scaled_ep_data.msg_type ==
+                TRN_SCALED_EP_MODIFY) {
+            return XDP_PASS;
+        }
+
+        return trn_router_handle_pkt(pkt, inner_src_ip, inner_dst_ip);
+    }
+
+    /* The packet may be sent first to a gw mac address */
+    trn_set_dst_mac(pkt->inner_eth, ep->mac);
+
+    // TODO: Currently all endpoints are attached to one host, for some
+    // ep types, they will have multiple attachments (e.g. LB endpoint).
+    if (ep->hosted_iface != -1) {
+        return XDP_PASS;
+    }
+
+    if (ep->eptype == TRAN_SCALED_EP) {
+        return XDP_PASS;
+    }
+
+    if (ep->nremote_ips == 0) {
+        return XDP_DROP;
+    }
+
+    trn_set_src_dst_ip_csum(pkt, pkt->ip->daddr, ep->remote_ips[0]);
+
+    return trn_rewrite_remote_mac(pkt);
+}
+
+/*
+ * Parse the inner IPv4 header (and the TCP/UDP ports when present) into
+ * pkt->inner_ipv4_tuple, then hand the packet to trn_switch_handle_pkt().
+ *
+ * Returns XDP_ABORTED when a header would run past data_end, XDP_PASS for
+ * a scaled-endpoint MODIFY message whose source endpoint is hosted on an
+ * interface (hosted_iface != -1), otherwise the switch handler's verdict.
+ *
+ * NOTE(review): the L4 header offset uses sizeof(*pkt->inner_ip), i.e. it
+ * assumes an inner IPv4 header without options.
+ */
+static __inline int trn_process_inner_ip(struct transit_packet *pkt)
+{
+    pkt->inner_ip = (void *)pkt->inner_eth + pkt->inner_eth_off;
+
+    if (pkt->inner_ip + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    /* For whatever compiler reason, we need to perform reverse flow modification
+       in this function instead of trn_switch_handle_pkt so we keep the orig_src_ip */
+    __u32 orig_src_ip = pkt->inner_ip->saddr;
+
+    pkt->inner_ipv4_tuple.saddr = pkt->inner_ip->saddr;
+    pkt->inner_ipv4_tuple.daddr = pkt->inner_ip->daddr;
+    pkt->inner_ipv4_tuple.protocol = pkt->inner_ip->protocol;
+    pkt->inner_ipv4_tuple.sport = 0;
+    pkt->inner_ipv4_tuple.dport = 0;
+
+    if (pkt->inner_ipv4_tuple.protocol == IPPROTO_TCP) {
+        pkt->inner_tcp = (void *)pkt->inner_ip + sizeof(*pkt->inner_ip);
+        if (pkt->inner_tcp + 1 > pkt->data_end) {
+            return XDP_ABORTED;
+        }
+
+        pkt->inner_ipv4_tuple.sport = pkt->inner_tcp->source;
+        pkt->inner_ipv4_tuple.dport = pkt->inner_tcp->dest;
+    }
+
+    if (pkt->inner_ipv4_tuple.protocol == IPPROTO_UDP) {
+        pkt->inner_udp = (void *)pkt->inner_ip + sizeof(*pkt->inner_ip);
+        if (pkt->inner_udp + 1 > pkt->data_end) {
+            return XDP_ABORTED;
+        }
+
+        pkt->inner_ipv4_tuple.sport = pkt->inner_udp->source;
+        pkt->inner_ipv4_tuple.dport = pkt->inner_udp->dest;
+    }
+
+    __be64 tunnel_id = trn_vni_to_tunnel_id(pkt->geneve->vni);
+
+    /* Lookup the source endpoint */
+    struct endpoint_offload_t *src_ep;
+    struct endpoint_key_t src_epkey;
+    __builtin_memcpy(&src_epkey.tunip[0], &tunnel_id, sizeof(tunnel_id));
+    src_epkey.tunip[2] = pkt->inner_ip->saddr;
+    src_ep = bpf_map_lookup_elem(&endpoints_offload_map, &src_epkey);
+
+    /* A scaled-endpoint MODIFY message whose source endpoint is hosted on
+       an interface is passed on instead of being forwarded here. */
+    if (pkt->scaled_ep_opt->type == TRN_GNV_SCALED_EP_OPT_TYPE &&
+        pkt->scaled_ep_opt->scaled_ep_data.msg_type ==
+            TRN_SCALED_EP_MODIFY &&
+        src_ep && src_ep->hosted_iface != -1) {
+        return XDP_PASS;
+    }
+
+    return trn_switch_handle_pkt(pkt, pkt->inner_ip->saddr,
+                                 pkt->inner_ip->daddr, orig_src_ip);
+}
+
+/*
+ * Handle an inner ARP packet.
+ *
+ * Only Ethernet/IPv4 ARP requests and replies are accepted; anything else
+ * is dropped. An ARP request for a known endpoint that is not hosted on an
+ * interface is answered in place: the packet is turned into an ARP reply
+ * carrying the endpoint's MAC, sender and target are swapped, and the
+ * result is forwarded through trn_switch_handle_pkt(). Every other ARP
+ * packet is forwarded unchanged through trn_switch_handle_pkt().
+ *
+ * Returns XDP_ABORTED when a field would run past data_end, XDP_DROP for
+ * unsupported ARP packets or when a simple endpoint's remote host entry is
+ * missing, otherwise the switch handler's verdict.
+ */
+static __inline int trn_process_inner_arp(struct transit_packet *pkt)
+{
+    unsigned char *sha;
+    unsigned char *tha = NULL;
+    struct endpoint_offload_t *ep;
+    struct endpoint_key_t epkey;
+    struct endpoint_offload_t *remote_ep;
+    __u32 *sip, *tip;
+
+    pkt->inner_arp = (void *)pkt->inner_eth + sizeof(*pkt->inner_eth);
+    if (pkt->inner_arp + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->inner_arp->ar_pro != bpf_htons(ETH_P_IP) ||
+        pkt->inner_arp->ar_hrd != bpf_htons(ARPHRD_ETHER)) {
+        return XDP_DROP;
+    }
+
+    if (pkt->inner_arp->ar_op != bpf_htons(ARPOP_REPLY) &&
+        pkt->inner_arp->ar_op != bpf_htons(ARPOP_REQUEST)) {
+        return XDP_DROP;
+    }
+
+    /* The ARP payload follows the fixed header: sender hardware address,
+       sender IP, target hardware address, target IP. Each field is bounds
+       checked before it is dereferenced. */
+    sha = (unsigned char *)(pkt->inner_arp + 1);
+    if (sha + ETH_ALEN > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    sip = (__u32 *)(sha + ETH_ALEN);
+    if (sip + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    tha = (unsigned char *)sip + sizeof(__u32);
+    if (tha + ETH_ALEN > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    tip = (__u32 *)(tha + ETH_ALEN);
+    if ((void *)tip + sizeof(__u32) > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    __be64 tunnel_id = trn_vni_to_tunnel_id(pkt->geneve->vni);
+
+    __builtin_memcpy(&epkey.tunip[0], &tunnel_id, sizeof(tunnel_id));
+    epkey.tunip[2] = *tip;
+    ep = bpf_map_lookup_elem(&endpoints_offload_map, &epkey);
+    /* Don't respond to arp if endpoint is not found, or it is local to host */
+    if (!ep || ep->hosted_iface != -1 ||
+        pkt->inner_arp->ar_op != bpf_htons(ARPOP_REQUEST)) {
+        return trn_switch_handle_pkt(pkt, *sip, *tip, *sip);
+    }
+
+    /* Respond to ARP */
+    pkt->inner_arp->ar_op = bpf_htons(ARPOP_REPLY);
+    trn_set_arp_ha(tha, sha);
+    trn_set_arp_ha(sha, ep->mac);
+
+    __u32 tmp_ip = *sip;
+    *sip = *tip;
+    *tip = tmp_ip;
+
+    /* Set the sender mac address to the ep mac address */
+    trn_set_src_mac(pkt->inner_eth, ep->mac);
+
+    if (ep->eptype == TRAN_SIMPLE_EP) {
+        /* Look up the host entry (tunnel id 0) for the endpoint's first
+           remote IP. */
+        epkey.tunip[0] = 0;
+        epkey.tunip[1] = 0;
+        epkey.tunip[2] = ep->remote_ips[0];
+        remote_ep = bpf_map_lookup_elem(&endpoints_offload_map, &epkey);
+        if (!remote_ep) {
+            return XDP_DROP;
+        }
+
+        /* For a simple endpoint, write the RTS option on behalf of the
+           target endpoint. */
+        pkt->rts_opt->rts_data.host.ip = ep->remote_ips[0];
+        __builtin_memcpy(pkt->rts_opt->rts_data.host.mac,
+                         remote_ep->mac, 6 * sizeof(unsigned char));
+    } else {
+        trn_reset_rts_opt(pkt);
+    }
+
+    /* trn_switch_handle_pkt() looks up the (swapped) target endpoint
+       itself, so no lookup is repeated here. */
+    return trn_switch_handle_pkt(pkt, *sip, *tip, *sip);
+}
+
+/*
+ * Locate the inner Ethernet header behind the Geneve header and dispatch
+ * on its EtherType: ARP goes to trn_process_inner_arp(), IPv4 goes to
+ * trn_process_inner_ip(), and everything else is dropped.
+ *
+ * Returns XDP_ABORTED when the inner Ethernet header would run past
+ * data_end.
+ */
+static __inline int trn_process_inner_eth(struct transit_packet *pkt)
+{
+    pkt->inner_eth = (void *)pkt->geneve + pkt->gnv_hdr_len;
+    pkt->inner_eth_off = sizeof(*pkt->inner_eth);
+    if (pkt->inner_eth + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    /* ARP */
+    if (pkt->inner_eth->h_proto == bpf_htons(ETH_P_ARP)) {
+        return trn_process_inner_arp(pkt);
+    }
+
+    /* Check the INNER EtherType. The outer one was already verified to be
+       IPv4 in trn_process_eth(), so testing it here could never reject an
+       unsupported inner frame. */
+    if (pkt->inner_eth->h_proto != bpf_htons(ETH_P_IP)) {
+        return XDP_DROP;
+    }
+
+    return trn_process_inner_ip(pkt);
+}
+
+/*
+ * Parse the Geneve header and its two leading options (RTS option followed
+ * by the scaled-endpoint option), then continue with the inner Ethernet
+ * frame.
+ *
+ * Returns XDP_ABORTED when a header/option would run past data_end or an
+ * option does not carry TRN_GNV_OPT_CLASS, and XDP_PASS when the Geneve
+ * payload is not ETH_P_TEB.
+ */
+static __inline int trn_process_geneve(struct transit_packet *pkt)
+{
+    pkt->geneve = (void *)pkt->udp + sizeof(*pkt->udp);
+    if (pkt->geneve + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->geneve->proto_type != bpf_htons(ETH_P_TEB)) {
+        return XDP_PASS;
+    }
+
+    /* opt_len is multiplied by 4 to obtain the option length in bytes. */
+    pkt->gnv_opt_len = pkt->geneve->opt_len * 4;
+    pkt->gnv_hdr_len = sizeof(*pkt->geneve) + pkt->gnv_opt_len;
+    pkt->rts_opt = (void *)&pkt->geneve->options[0];
+    if (pkt->rts_opt + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->rts_opt->opt_class != TRN_GNV_OPT_CLASS) {
+        return XDP_ABORTED;
+    }
+
+    // TODO: process options
+    pkt->scaled_ep_opt = (void *)pkt->rts_opt + sizeof(*pkt->rts_opt);
+    if (pkt->scaled_ep_opt + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->scaled_ep_opt->opt_class != TRN_GNV_OPT_CLASS) {
+        return XDP_ABORTED;
+    }
+
+    return trn_process_inner_eth(pkt);
+}
+
+/*
+ * Parse the outer UDP header. Packets whose destination port is not
+ * GEN_DSTPORT are passed on (XDP_PASS); the rest continue with Geneve
+ * processing.
+ *
+ * NOTE(review): the UDP offset uses sizeof(*pkt->ip), i.e. it assumes an
+ * outer IPv4 header without options.
+ */
+static __inline int trn_process_udp(struct transit_packet *pkt)
+{
+    /* Get the UDP header */
+    pkt->udp = (void *)pkt->ip + sizeof(*pkt->ip);
+    if (pkt->udp + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->udp->dest != GEN_DSTPORT) {
+        return XDP_PASS;
+    }
+
+    return trn_process_geneve(pkt);
+}
+
+/*
+ * Parse the outer IPv4 header. Non-UDP packets are passed on (XDP_PASS);
+ * packets with a zero TTL or a destination address other than this
+ * interface's address are dropped; the rest continue with UDP processing.
+ */
+static __inline int trn_process_ip(struct transit_packet *pkt)
+{
+    /* Get the IP header */
+    pkt->ip = (void *)pkt->eth + pkt->eth_off;
+    if (pkt->ip + 1 > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->ip->protocol != IPPROTO_UDP) {
+        return XDP_PASS;
+    }
+
+    if (!pkt->ip->ttl) {
+        return XDP_DROP;
+    }
+
+    /* Only process packets designated to this interface!
+     * In functional tests - relying on docker0 - we see such packets!
+     */
+    if (pkt->ip->daddr != pkt->itf_ipv4) {
+        return XDP_DROP;
+    }
+
+    return trn_process_udp(pkt);
+}
+
+/*
+ * Parse the outer Ethernet header. Non-IPv4 frames are passed on
+ * (XDP_PASS); IPv4 frames continue with trn_process_ip().
+ */
+static __inline int trn_process_eth(struct transit_packet *pkt)
+{
+    pkt->eth = pkt->data;
+    pkt->eth_off = sizeof(*pkt->eth);
+    if (pkt->data + pkt->eth_off > pkt->data_end) {
+        return XDP_ABORTED;
+    }
+
+    if (pkt->eth->h_proto != bpf_htons(ETH_P_IP)) {
+        return XDP_PASS;
+    }
+
+    return trn_process_ip(pkt);
+}
+
+/*
+ * XDP entry point (ELF section "transit").
+ *
+ * Builds the per-packet context from the xdp_md, loads this interface's
+ * IPv4 address and index from entry 0 of interface_config_offload_map,
+ * and starts parsing at the outer Ethernet header.
+ *
+ * Returns XDP_ABORTED when the interface configuration entry is missing,
+ * otherwise the verdict of the processing chain.
+ */
+SEC("transit")
+int _transit(struct xdp_md *ctx)
+{
+    struct transit_packet pkt;
+    pkt.data = (void *)(long)ctx->data;
+    pkt.data_end = (void *)(long)ctx->data_end;
+    pkt.xdp = ctx;
+    struct tunnel_iface_t *itf;
+    int k = 0;
+    itf = bpf_map_lookup_elem(&interface_config_offload_map, &k);
+    if (!itf) {
+        return XDP_ABORTED;
+    }
+
+    pkt.itf_ipv4 = itf->ip;
+    pkt.itf_idx = itf->iface_index;
+
+    return trn_process_eth(&pkt);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/supported_xdp_offload_nics.yaml b/supported_xdp_offload_nics.yaml new file mode 100644 index 00000000..f71af639 --- /dev/null +++ b/supported_xdp_offload_nics.yaml @@ -0,0 +1,5 @@ +# The list of NICs that with offload XDP support +# vender_name: +# - "model_name" +Netronome: +- "Device 4000"