diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2026-04-06 18:43:54 -0700 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2026-04-06 18:43:54 -0700 |
| commit | 270c0637b906d23b59cb119dabce350d44d2471b (patch) | |
| tree | fa4cf971d3683863336527e903125c89797975be | |
| parent | 1caa871bb0615e2b68aa11bb7b453eeac770ea1d (diff) | |
| parent | 62838e363e4f0753d43b2b78e9d3f6ad3c9102ec (diff) | |
Merge branch 'xsk-tailroom-reservation-and-mtu-validation'
Maciej Fijalkowski says:
====================
xsk: tailroom reservation and MTU validation
here we fix a long-standing issue regarding multi-buffer scenario in ZC
mode - we have not been providing space at the end of the buffer where
multi-buffer XDP works on skb_shared_info. This has been brought to our
attention via [0].
Unaligned mode does not get any specific treatment, it is user's
responsibility to properly handle XSK addresses in queues.
With adjustments included here in this set against xskxceiver I have
been able to pass the full test suite on ice.
[0]: https://community.intel.com/t5/Ethernet-Products/X710-XDP-Packet-Corruption-Issue-DRV-MODE-Zero-Copy-Multi-Buffer/m-p/1724208
====================
Link: https://patch.msgid.link/20260402154958.562179-1-maciej.fijalkowski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | include/net/xdp_sock.h | 2 | ||||
| -rw-r--r-- | include/net/xdp_sock_drv.h | 23 | ||||
| -rw-r--r-- | net/xdp/xdp_umem.c | 3 | ||||
| -rw-r--r-- | net/xdp/xsk.c | 4 | ||||
| -rw-r--r-- | net/xdp/xsk_buff_pool.c | 32 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/prog_tests/test_xsk.c | 55 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/prog_tests/test_xsk.h | 23 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/prog_tests/xsk.c | 19 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/progs/xsk_xdp_progs.c | 4 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/xskxceiver.c | 23 |
10 files changed, 150 insertions, 38 deletions
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 23e8861e8b25..ebac60a3d8a1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -14,7 +14,7 @@ #include <linux/mm.h> #include <net/sock.h> -#define XDP_UMEM_SG_FLAG (1 << 1) +#define XDP_UMEM_SG_FLAG BIT(3) struct net_device; struct xsk_queue; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 6b9ebae2dc95..46797645a0c2 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -41,16 +41,37 @@ static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) return XDP_PACKET_HEADROOM + pool->headroom; } +static inline u32 xsk_pool_get_tailroom(bool mbuf) +{ + return mbuf ? SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : 0; +} + static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } -static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +static inline u32 __xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +{ + u32 frame_size = __xsk_pool_get_rx_frame_size(pool); + struct xdp_umem *umem = pool->umem; + bool mbuf; + + /* Reserve tailroom only for zero-copy pools that opted into + * multi-buffer. The reserved area is used for skb_shared_info, + * matching the XDP core's xdp_data_hard_end() layout. + */ + mbuf = pool->dev && (umem->flags & XDP_UMEM_SG_FLAG); + frame_size -= xsk_pool_get_tailroom(mbuf); + + return ALIGN_DOWN(frame_size, 128); +} + static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) { return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 066ce07c506d..58da2f4f4397 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -203,7 +203,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (!unaligned_chunks && chunks_rem) return -EINVAL; - if (headroom >= chunk_size - XDP_PACKET_HEADROOM) + if (headroom > chunk_size - XDP_PACKET_HEADROOM - + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) - 128) return -EINVAL; if (mr->flags & XDP_UMEM_TX_METADATA_LEN) { diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6149f6a79897..c8ef9e427c9c 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -239,7 +239,7 @@ static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + u32 frame_size = __xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; @@ -338,7 +338,7 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; - if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 37b7a68b89b3..cd7bc50872f6 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -10,6 +10,8 @@ #include "xdp_umem.h" #include "xsk.h" +#define ETH_PAD_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) + void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { if (!xs->tx) @@ -157,8 +159,12 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { + u32 needed = netdev->mtu + ETH_PAD_LEN; + u32 segs = netdev->xdp_zc_max_segs; + bool mbuf = flags & XDP_USE_SG; bool force_zc, force_copy; struct netdev_bpf bpf; + u32 frame_size; int err = 0; ASSERT_RTNL(); @@ -178,7 +184,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (err) return err; - if (flags & XDP_USE_SG) + if (mbuf) pool->umem->flags |= XDP_UMEM_SG_FLAG; if (flags & XDP_USE_NEED_WAKEUP) @@ -200,8 +206,24 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } - if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { - err = -EOPNOTSUPP; + if (mbuf) { + if (segs == 1) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + } else { + segs = 1; + } + + /* open-code xsk_pool_get_rx_frame_size() as pool->dev is not + * set yet at this point; we are before getting down to driver + */ + frame_size = __xsk_pool_get_rx_frame_size(pool) - + xsk_pool_get_tailroom(mbuf); + frame_size = ALIGN_DOWN(frame_size, 128); + + if (needed > frame_size * segs) { + err = -EINVAL; goto err_unreg_pool; } @@ -247,6 +269,10 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct xdp_umem *umem = umem_xs->umem; flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; + + if (umem->flags & XDP_UMEM_SG_FLAG) + flags |= XDP_USE_SG; + if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.c b/tools/testing/selftests/bpf/prog_tests/test_xsk.c index 7e38ec6e656b..7950c504ed28 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.c @@ -179,25 +179,6 @@ int xsk_configure_socket(struct xsk_socket_info *xsk, struct xsk_umem_info *umem return xsk_socket__create(&xsk->xsk, ifobject->ifindex, 0, umem->umem, rxr, txr, &cfg); } -#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" -static unsigned int get_max_skb_frags(void) -{ - unsigned int max_skb_frags = 0; - FILE *file; - - file = fopen(MAX_SKB_FRAGS_PATH, "r"); - if (!file) { - ksft_print_msg("Error opening %s\n", MAX_SKB_FRAGS_PATH); - return 0; - } - - if (fscanf(file, "%u", &max_skb_frags) != 1) - ksft_print_msg("Error reading %s\n", MAX_SKB_FRAGS_PATH); - - fclose(file); - return max_skb_frags; -} - static int set_ring_size(struct ifobject *ifobj) { int ret; @@ -1978,15 +1959,17 @@ int testapp_headroom(struct test_spec *test) int testapp_stats_rx_dropped(struct test_spec *test) { + u32 umem_tr = test->ifobj_tx->umem_tailroom; + if (test->mode == TEST_MODE_ZC) { ksft_print_msg("Can not run RX_DROPPED test for ZC mode\n"); return TEST_SKIP; } - if (pkt_stream_replace_half(test, MIN_PKT_SIZE * 4, 0)) + if (pkt_stream_replace_half(test, (MIN_PKT_SIZE * 3) + umem_tr, 0)) return TEST_FAILURE; test->ifobj_rx->umem->frame_headroom = test->ifobj_rx->umem->frame_size - - XDP_PACKET_HEADROOM - MIN_PKT_SIZE * 3; + XDP_PACKET_HEADROOM - (MIN_PKT_SIZE * 2) - umem_tr; if (pkt_stream_receive_half(test)) return TEST_FAILURE; test->ifobj_rx->validation_func = validate_rx_dropped; @@ -2242,11 +2225,7 @@ int testapp_too_many_frags(struct test_spec *test) if (test->mode == TEST_MODE_ZC) { max_frags = test->ifobj_tx->xdp_zc_max_segs; } else { - max_frags = get_max_skb_frags(); - if (!max_frags) { - ksft_print_msg("Can't get MAX_SKB_FRAGS from system, using default (17)\n"); - max_frags = 17; - } + max_frags = test->ifobj_tx->max_skb_frags; max_frags += 1; } @@ -2551,16 +2530,34 @@ int testapp_adjust_tail_shrink_mb(struct test_spec *test) int testapp_adjust_tail_grow(struct test_spec *test) { + if (test->mode == TEST_MODE_SKB) + return TEST_SKIP; + /* Grow by 4 bytes for testing purpose */ return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); } int testapp_adjust_tail_grow_mb(struct test_spec *test) { + u32 grow_size; + + if (test->mode == TEST_MODE_SKB) + return TEST_SKIP; + + /* worst case scenario is when underlying setup will work on 3k + * buffers, let us account for it; given that we will use 6k as + * pkt_len, expect that it will be broken down to 2 descs each + * with 3k payload; + * + * 4k is truesize, 3k payload, 256 HR, 320 TR; + */ + grow_size = XSK_UMEM__MAX_FRAME_SIZE - + XSK_UMEM__LARGE_FRAME_SIZE - + XDP_PACKET_HEADROOM - + test->ifobj_tx->umem_tailroom; test->mtu = MAX_ETH_JUMBO_SIZE; - /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ - return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, - XSK_UMEM__LARGE_FRAME_SIZE * 2); + + return testapp_adjust_tail(test, grow_size, XSK_UMEM__LARGE_FRAME_SIZE * 2); } int testapp_tx_queue_consumer(struct test_spec *test) diff --git a/tools/testing/selftests/bpf/prog_tests/test_xsk.h b/tools/testing/selftests/bpf/prog_tests/test_xsk.h index 8fc78a057de0..1ab8aee4ce56 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_xsk.h +++ b/tools/testing/selftests/bpf/prog_tests/test_xsk.h @@ -31,6 +31,9 @@ #define SOCK_RECONF_CTR 10 #define USLEEP_MAX 10000 +#define MAX_SKB_FRAGS_PATH "/proc/sys/net/core/max_skb_frags" +#define SMP_CACHE_BYTES_PATH "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size" + extern bool opt_verbose; #define print_verbose(x...) do { if (opt_verbose) ksft_print_msg(x); } while (0) @@ -45,6 +48,24 @@ static inline u64 ceil_u64(u64 a, u64 b) return (a + b - 1) / b; } +static inline unsigned int read_procfs_val(const char *path) +{ + unsigned int read_val = 0; + FILE *file; + + file = fopen(path, "r"); + if (!file) { + ksft_print_msg("Error opening %s\n", path); + return 0; + } + + if (fscanf(file, "%u", &read_val) != 1) + ksft_print_msg("Error reading %s\n", path); + + fclose(file); + return read_val; +} + /* Simple test */ enum test_mode { TEST_MODE_SKB, @@ -115,6 +136,8 @@ struct ifobject { int mtu; u32 bind_flags; u32 xdp_zc_max_segs; + u32 umem_tailroom; + u32 max_skb_frags; bool tx_on; bool rx_on; bool use_poll; diff --git a/tools/testing/selftests/bpf/prog_tests/xsk.c b/tools/testing/selftests/bpf/prog_tests/xsk.c index dd4c35c0e428..6e2f63ee2a6c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xsk.c +++ b/tools/testing/selftests/bpf/prog_tests/xsk.c @@ -62,6 +62,7 @@ int configure_ifobj(struct ifobject *tx, struct ifobject *rx) static void test_xsk(const struct test_spec *test_to_run, enum test_mode mode) { + u32 max_frags, umem_tailroom, cache_line_size; struct ifobject *ifobj_tx, *ifobj_rx; struct test_spec test; int ret; @@ -84,6 +85,24 @@ static void test_xsk(const struct test_spec *test_to_run, enum test_mode mode) ifobj_tx->set_ring.default_rx = ifobj_tx->ring.rx_pending; } + cache_line_size = read_procfs_val(SMP_CACHE_BYTES_PATH); + if (!cache_line_size) + cache_line_size = 64; + + max_frags = read_procfs_val(MAX_SKB_FRAGS_PATH); + if (!max_frags) + max_frags = 17; + + ifobj_tx->max_skb_frags = max_frags; + ifobj_rx->max_skb_frags = max_frags; + + /* 48 bytes is a part of skb_shared_info w/o frags array; + * 16 bytes is sizeof(skb_frag_t) + */ + umem_tailroom = ALIGN(48 + (max_frags * 16), cache_line_size); + ifobj_tx->umem_tailroom = umem_tailroom; + ifobj_rx->umem_tailroom = umem_tailroom; + if (!ASSERT_OK(init_iface(ifobj_rx, worker_testapp_validate_rx), "init RX")) goto delete_rx; if (!ASSERT_OK(init_iface(ifobj_tx, worker_testapp_validate_tx), "init TX")) diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c index 683306db8594..023d8befd4ca 100644 --- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c +++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c @@ -26,8 +26,10 @@ SEC("xdp.frags") int xsk_def_prog(struct xdp_md *xdp) SEC("xdp.frags") int xsk_xdp_drop(struct xdp_md *xdp) { + static unsigned int drop_idx; + /* Drop every other packet */ - if (idx++ % 2) + if (drop_idx++ % 2) return XDP_DROP; return bpf_redirect_map(&xsk, 0, XDP_DROP); diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 05b3cebc5ca9..7dad8556a722 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -80,6 +80,7 @@ #include <linux/mman.h> #include <linux/netdev.h> #include <linux/ethtool.h> +#include <linux/align.h> #include <arpa/inet.h> #include <net/if.h> #include <locale.h> @@ -333,6 +334,7 @@ static void print_tests(void) int main(int argc, char **argv) { const size_t total_tests = ARRAY_SIZE(tests) + ARRAY_SIZE(ci_skip_tests); + u32 cache_line_size, max_frags, umem_tailroom; struct pkt_stream *rx_pkt_stream_default; struct pkt_stream *tx_pkt_stream_default; struct ifobject *ifobj_tx, *ifobj_rx; @@ -354,6 +356,27 @@ int main(int argc, char **argv) setlocale(LC_ALL, ""); + cache_line_size = read_procfs_val(SMP_CACHE_BYTES_PATH); + if (!cache_line_size) { + ksft_print_msg("Can't get SMP_CACHE_BYTES from system, using default (64)\n"); + cache_line_size = 64; + } + + max_frags = read_procfs_val(MAX_SKB_FRAGS_PATH); + if (!max_frags) { + ksft_print_msg("Can't get MAX_SKB_FRAGS from system, using default (17)\n"); + max_frags = 17; + } + ifobj_tx->max_skb_frags = max_frags; + ifobj_rx->max_skb_frags = max_frags; + + /* 48 bytes is a part of skb_shared_info w/o frags array; + * 16 bytes is sizeof(skb_frag_t) + */ + umem_tailroom = ALIGN(48 + (max_frags * 16), cache_line_size); + ifobj_tx->umem_tailroom = umem_tailroom; + ifobj_rx->umem_tailroom = umem_tailroom; + parse_command_line(ifobj_tx, ifobj_rx, argc, argv); if (opt_print_tests) { |
