summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-11-03 18:11:43 -0800
committerJakub Kicinski <kuba@kernel.org>2025-11-03 18:11:44 -0800
commitff371a7e73c8e624d6a28684d839ed074ac97a2b (patch)
treede994a7538b465d0efd3e4d9d6de9a776b2648af
parent998b5d9683d9c415bd3ad0dcc9e8f2d0d34afaa4 (diff)
parentadd3c1324a8912b1abbf7afeb976fb719563f454 (diff)
Merge branch 'add-support-to-do-threaded-napi-busy-poll'
Samiullah Khawaja says: ==================== Add support to do threaded napi busy poll Extend the already existing support of threaded napi poll to do continuous busy polling. This is used for doing continuous polling of napi to fetch descriptors from backing RX/TX queues for low latency applications. Allow enabling of threaded busypoll using netlink so this can be enabled on a set of dedicated napis for low latency applications. Once enabled user can fetch the PID of the kthread doing NAPI polling and set affinity, priority and scheduler for it depending on the low-latency requirements. Extend the netlink interface to allow enabling/disabling threaded busypolling at individual napi level. We use this for our AF_XDP based hard low-latency usecase with usecs level latency requirement. For our usecase we want low jitter and stable latency at P99. Following is an analysis and comparison of available (and compatible) busy poll interfaces for a low latency usecase with stable P99. This can be suitable for applications that want very low latency at the expense of cpu usage and efficiency. Already existing APIs (SO_BUSYPOLL and epoll) allow busy polling a NAPI backing a socket, but the missing piece is a mechanism to busy poll a NAPI instance in a dedicated thread while ignoring available events or packets, regardless of the userspace API. Most existing mechanisms are designed to work in a pattern where you poll until new packets or events are received, after which userspace is expected to handle them. As a result, one has to hack together a solution using a mechanism intended to receive packets or events, not to simply NAPI poll. NAPI threaded busy polling, on the other hand, provides this capability natively, independent of any userspace API. This makes it really easy to setup and manage. For analysis we use an AF_XDP based benchmarking tool `xsk_rr`. The description of the tool and how it tries to simulate the real workload is following, - It sends UDP packets between 2 machines. - The client machine sends packets at a fixed frequency. To maintain the frequency of the packet being sent, we use open-loop sampling. That is the packets are sent in a separate thread. - The server replies to the packet inline by reading the pkt from the recv ring and replies using the tx ring. - To simulate the application processing time, we use a configurable delay in usecs on the client side after a reply is received from the server. The xsk_rr tool is posted separately as an RFC for tools/testing/selftest. We use this tool with following napi polling configurations, - Interrupts only - SO_BUSYPOLL (inline in the same thread where the client receives the packet). - SO_BUSYPOLL (separate thread and separate core) - Threaded NAPI busypoll System is configured using following script in all 4 cases, ``` echo 0 | sudo tee /sys/class/net/eth0/threaded echo 0 | sudo tee /proc/sys/kernel/timer_migration echo off | sudo tee /sys/devices/system/cpu/smt/control sudo ethtool -L eth0 rx 1 tx 1 sudo ethtool -G eth0 rx 1024 echo 0 | sudo tee /proc/sys/net/core/rps_sock_flow_entries echo 0 | sudo tee /sys/class/net/eth0/queues/rx-0/rps_cpus # pin IRQs on CPU 2 IRQS="$(gawk '/eth0-(TxRx-)?1/ {match($1, /([0-9]+)/, arr); \ print arr[0]}' < /proc/interrupts)" for irq in "${IRQS}"; \ do echo 2 | sudo tee /proc/irq/$irq/smp_affinity_list; done echo -1 | sudo tee /proc/sys/kernel/sched_rt_runtime_us for i in /sys/devices/virtual/workqueue/*/cpumask; \ do echo $i; echo 1,2,3,4,5,6 > $i; done if [[ -z "$1" ]]; then echo 400 | sudo tee /proc/sys/net/core/busy_read echo 100 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs echo 15000 | sudo tee /sys/class/net/eth0/gro_flush_timeout fi sudo ethtool -C eth0 adaptive-rx off adaptive-tx off rx-usecs 0 tx-usecs 0 if [[ "$1" == "enable_threaded" ]]; then echo 0 | sudo tee /proc/sys/net/core/busy_poll echo 0 | sudo tee /proc/sys/net/core/busy_read echo 100 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs echo 15000 | sudo tee /sys/class/net/eth0/gro_flush_timeout NAPI_ID=$(ynl --family netdev --output-json --do queue-get \ --json '{"ifindex": '${IFINDEX}', "id": '0', "type": "rx"}' | jq '."napi-id"') ynl --family netdev --json '{"id": "'${NAPI_ID}'", "threaded": "busy-poll"}' NAPI_T=$(ynl --family netdev --output-json --do napi-get \ --json '{"id": "'$NAPI_ID'"}' | jq '."pid"') sudo chrt -f -p 50 $NAPI_T # pin threaded poll thread to CPU 2 sudo taskset -pc 2 $NAPI_T fi if [[ "$1" == "enable_interrupt" ]]; then echo 0 | sudo tee /proc/sys/net/core/busy_read echo 0 | sudo tee /sys/class/net/eth0/napi_defer_hard_irqs echo 15000 | sudo tee /sys/class/net/eth0/gro_flush_timeout fi ``` To enable various configurations, script can be run as following, - Interrupt Only ``` <script> enable_interrupt ``` - SO_BUSYPOLL (no arguments to script) ``` <script> ``` - NAPI threaded busypoll ``` <script> enable_threaded ``` Once configured, the workload is run with various configurations using following commands. Set period (1/frequency) and delay in usecs to produce results for packet frequency and application processing delay. ## Interrupt Only and SO_BUSYPOLL (inline) - Server ``` sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \ -D <IP-dest> -S <IP-src> -M <MAC-dst> -m <MAC-src> -p 54321 -h -v ``` - Client ``` sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \ -S <IP-src> -D <IP-dest> -m <MAC-src> -M <MAC-dst> -p 54321 \ -P <Period-usecs> -d <Delay-usecs> -T -l 1 -v ``` ## SO_BUSYPOLL(done in separate core using recvfrom) Argument -t spawns a separate thread and continuously calls recvfrom. - Server ``` sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \ -D <IP-dest> -S <IP-src> -M <MAC-dst> -m <MAC-src> -p 54321 \ -h -v -t ``` - Client ``` sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \ -S <IP-src> -D <IP-dest> -m <MAC-src> -M <MAC-dst> -p 54321 \ -P <Period-usecs> -d <Delay-usecs> -T -l 1 -v -t ``` ## NAPI Threaded Busy Poll Argument -n skips the recvfrom call as there is no recv kick needed. - Server ``` sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \ -D <IP-dest> -S <IP-src> -M <MAC-dst> -m <MAC-src> -p 54321 \ -h -v -n ``` - Client ``` sudo chrt -f 50 taskset -c 3-5 ./xsk_rr -o 0 -B 400 -i eth0 -4 \ -S <IP-src> -D <IP-dest> -m <MAC-src> -M <MAC-dst> -p 54321 \ -P <Period-usecs> -d <Delay-usecs> -T -l 1 -v -n ``` | Experiment | interrupts | SO_BUSYPOLL | SO_BUSYPOLL(separate) | NAPI threaded | |---|---|---|---|---| | 12 Kpkt/s + 0us delay | | | | | | | p5: 12700 | p5: 12900 | p5: 13300 | p5: 12800 | | | p50: 13100 | p50: 13600 | p50: 14100 | p50: 13000 | | | p95: 13200 | p95: 13800 | p95: 14400 | p95: 13000 | | | p99: 13200 | p99: 13800 | p99: 14400 | p99: 13000 | | 32 Kpkt/s + 30us delay | | | | | | | p5: 19900 | p5: 16600 | p5: 13100 | p5: 12800 | | | p50: 21100 | p50: 17000 | p50: 13700 | p50: 13000 | | | p95: 21200 | p95: 17100 | p95: 14000 | p95: 13000 | | | p99: 21200 | p99: 17100 | p99: 14000 | p99: 13000 | | 125 Kpkt/s + 6us delay | | | | | | | p5: 14600 | p5: 17100 | p5: 13300 | p5: 12900 | | | p50: 15400 | p50: 17400 | p50: 13800 | p50: 13100 | | | p95: 15600 | p95: 17600 | p95: 14000 | p95: 13100 | | | p99: 15600 | p99: 17600 | p99: 14000 | p99: 13100 | | 12 Kpkt/s + 78us delay | | | | | | | p5: 14100 | p5: 16700 | p5: 13200 | p5: 12600 | | | p50: 14300 | p50: 17100 | p50: 13900 | p50: 12800 | | | p95: 14300 | p95: 17200 | p95: 14200 | p95: 12800 | | | p99: 14300 | p99: 17200 | p99: 14200 | p99: 12800 | | 25 Kpkt/s + 38us delay | | | | | | | p5: 19900 | p5: 16600 | p5: 13000 | p5: 12700 | | | p50: 21000 | p50: 17100 | p50: 13800 | p50: 12900 | | | p95: 21100 | p95: 17100 | p95: 14100 | p95: 12900 | | | p99: 21100 | p99: 17100 | p99: 14100 | p99: 12900 | ## Observations - Here without application processing all the approaches give the same latency within 1usecs range and NAPI threaded gives minimum latency. - With application processing the latency increases by 3-4usecs when doing inline polling. - Using a dedicated core to drive napi polling keeps the latency same even with application processing. This is observed both in userspace and threaded napi (in kernel). - Using napi threaded polling in kernel gives lower latency by 1-2usecs as compared to userspace driven polling in separate core. - Even on a dedicated core, SO_BUSYPOLL adds around 1-2usecs of latency. This is because it doesn't continuously busy poll until events are ready. Instead, it returns after polling only once, requiring the process to re-invoke the syscall for each poll, which requires a new enter/leave kernel cycle and the setup/teardown of the busy poll for every single poll attempt. - With application processing userspace will get the packet from recv ring and spend some time doing application processing and then do napi polling. While application processing is happening a dedicated core doing napi polling can pull the packet of the NAPI RX queue and populate the AF_XDP recv ring. This means that when the application thread is done with application processing it has new packets ready to recv and process in recv ring. - Napi threaded busy polling in the kernel with a dedicated core gives the consistent P5-P99 latency. Note well that threaded napi busy-polling has not been shown to yield efficiency or throughput benefits. In contrast, dedicating an entire core to busy-polling one NAPI (NIC queue) is rather inefficient. However, in certain specific use cases, this mechanism results in lower packet processing latency. The experimental testing reported here only covers those use cases and does not present a comprehensive evaluation of threaded napi busy-polling. Following histogram is generated to measure the time spent in recvfrom while using inline thread with SO_BUSYPOLL. The histogram is generated using the following bpftrace command. In this experiment there are 32K packets per second and the application processing delay is 30usecs. This is to measure whether there is significant time spent pulling packets from the descriptor queue that it will affect the overall latency if done inline. ``` bpftrace -e ' kprobe:xsk_recvmsg { @start[tid] = nsecs; } kretprobe:xsk_recvmsg { if (@start[tid]) { $sample = (nsecs - @start[tid]); @xsk_recvfrom_hist = hist($sample); delete(@start[tid]); } } END { clear(@start);}' ``` Here in case of inline busypolling around 35 percent of calls are taking 1-2usecs and around 50 percent are taking 0.5-2usecs. @xsk_recvfrom_hist: [128, 256) 24073 |@@@@@@@@@@@@@@@@@@@@@@ | [256, 512) 55633 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 20974 |@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 34234 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [2K, 4K) 3266 |@@@ | [4K, 8K) 19 | | ==================== Link: https://patch.msgid.link/20251028203007.575686-1-skhawaja@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/netlink/specs/netdev.yaml5
-rw-r--r--Documentation/networking/napi.rst50
-rw-r--r--include/linux/netdevice.h4
-rw-r--r--include/uapi/linux/netdev.h1
-rw-r--r--net/core/dev.c58
-rw-r--r--net/core/dev.h3
-rw-r--r--net/core/netdev-genl-gen.c2
-rw-r--r--tools/include/uapi/linux/netdev.h1
-rwxr-xr-xtools/testing/selftests/net/busy_poll_test.sh24
-rw-r--r--tools/testing/selftests/net/busy_poller.c16
10 files changed, 145 insertions, 19 deletions
diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index e00d3fa1c152..10c412b7433f 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -88,7 +88,7 @@ definitions:
-
name: napi-threaded
type: enum
- entries: [disabled, enabled]
+ entries: [disabled, enabled, busy-poll]
attribute-sets:
-
@@ -291,7 +291,8 @@ attribute-sets:
name: threaded
doc: Whether the NAPI is configured to operate in threaded polling
mode. If this is set to enabled then the NAPI context operates
- in threaded polling mode.
+ in threaded polling mode. If this is set to busy-poll, then the
+ threaded polling mode also busy polls.
type: u32
enum: napi-threaded
-
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
index 7dd60366f4ff..4e008efebb35 100644
--- a/Documentation/networking/napi.rst
+++ b/Documentation/networking/napi.rst
@@ -263,7 +263,9 @@ are not well known).
Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
selected sockets or using the global ``net.core.busy_poll`` and
``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
-also exists.
+also exists. Threaded polling of NAPI also has a mode to busy poll for
+packets (:ref:`threaded busy polling<threaded_busy_poll>`) using the NAPI
+processing kthread.
epoll-based busy polling
------------------------
@@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is
the recommended usage, because otherwise setting ``irq-suspend-timeout``
might not have any discernible effect.
+.. _threaded_busy_poll:
+
+Threaded NAPI busy polling
+--------------------------
+
+Threaded NAPI busy polling extends threaded NAPI and adds support to do
+continuous busy polling of the NAPI. This can be useful for forwarding or
+AF_XDP applications.
+
+Threaded NAPI busy polling can be enabled on per NIC queue basis using Netlink.
+
+For example, using the following script:
+
+.. code-block:: bash
+
+ $ ynl --family netdev --do napi-set \
+ --json='{"id": 66, "threaded": "busy-poll"}'
+
+The kernel will create a kthread that busy polls on this NAPI.
+
+The user may elect to set the CPU affinity of this kthread to an unused CPU
+core to improve how often the NAPI is polled at the expense of wasted CPU
+cycles. Note that this will keep the CPU core busy with 100% usage.
+
+Once threaded busy polling is enabled for a NAPI, PID of the kthread can be
+retrieved using Netlink so the affinity of the kthread can be set up.
+
+For example, the following script can be used to fetch the PID:
+
+.. code-block:: bash
+
+ $ ynl --family netdev --do napi-get --json='{"id": 66}'
+
+This will output something like following, the pid `258` is the PID of the
+kthread that is polling this NAPI.
+
+.. code-block:: bash
+
+ $ {'defer-hard-irqs': 0,
+ 'gro-flush-timeout': 0,
+ 'id': 66,
+ 'ifindex': 2,
+ 'irq-suspend-timeout': 0,
+ 'pid': 258,
+ 'threaded': 'busy-poll'}
+
.. _threaded:
Threaded NAPI
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c1e5042c5e7..e808071dbb7d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -423,11 +423,12 @@ enum {
NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
- NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_IN_BUSY_POLL, /* Do not rearm NAPI interrupt */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */
NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */
+ NAPI_STATE_THREADED_BUSY_POLL, /* The threaded NAPI poller will busy poll */
};
enum {
@@ -442,6 +443,7 @@ enum {
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED),
NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER),
+ NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL),
};
enum gro_result {
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 48eb49aa03d4..048c8de1a130 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
enum netdev_napi_threaded {
NETDEV_NAPI_THREADED_DISABLED,
NETDEV_NAPI_THREADED_ENABLED,
+ NETDEV_NAPI_THREADED_BUSY_POLL,
};
enum {
diff --git a/net/core/dev.c b/net/core/dev.c
index dccc1176f3c6..2c1de5fb97d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7089,7 +7089,8 @@ static void napi_stop_kthread(struct napi_struct *napi)
*/
if ((val & NAPIF_STATE_SCHED_THREADED) ||
!(val & NAPIF_STATE_SCHED)) {
- new = val & (~NAPIF_STATE_THREADED);
+ new = val & (~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL));
} else {
msleep(20);
continue;
@@ -7113,6 +7114,16 @@ static void napi_stop_kthread(struct napi_struct *napi)
napi->thread = NULL;
}
+static void napi_set_threaded_state(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded_mode)
+{
+ bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+ bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
int napi_set_threaded(struct napi_struct *napi,
enum netdev_napi_threaded threaded)
{
@@ -7139,7 +7150,7 @@ int napi_set_threaded(struct napi_struct *napi,
} else {
/* Make sure kthread is created before THREADED bit is set. */
smp_mb__before_atomic();
- assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ napi_set_threaded_state(napi, threaded);
}
return 0;
@@ -7531,7 +7542,9 @@ void napi_disable_locked(struct napi_struct *n)
}
new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
- new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+ new &= ~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL |
+ NAPIF_STATE_PREFER_BUSY_POLL);
} while (!try_cmpxchg(&n->state, &val, new));
hrtimer_cancel(&n->timer);
@@ -7743,7 +7756,7 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static void napi_threaded_poll_loop(struct napi_struct *napi)
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
@@ -7772,22 +7785,47 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
}
skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
+
+ /* When busy poll is enabled, the old packets are not flushed in
+ * napi_complete_done. So flush them here.
+ */
+ if (busy_poll)
+ gro_flush_normal(&napi->gro, HZ >= 1000);
local_bh_enable();
+ /* Call cond_resched here to avoid watchdog warnings. */
+ if (repoll || busy_poll) {
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
+ }
+
if (!repoll)
break;
-
- rcu_softirq_qs_periodic(last_qs);
- cond_resched();
}
}
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
+ bool want_busy_poll;
+ bool in_busy_poll;
+ unsigned long val;
+
+ while (!napi_thread_wait(napi)) {
+ val = READ_ONCE(napi->state);
+
+ want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+ in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
- while (!napi_thread_wait(napi))
- napi_threaded_poll_loop(napi);
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ want_busy_poll = false;
+
+ if (want_busy_poll != in_busy_poll)
+ assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+ want_busy_poll);
+
+ napi_threaded_poll_loop(napi, want_busy_poll);
+ }
return 0;
}
@@ -13097,7 +13135,7 @@ static void run_backlog_napi(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
- napi_threaded_poll_loop(&sd->backlog);
+ napi_threaded_poll_loop(&sd->backlog, false);
}
static void backlog_napi_setup(unsigned int cpu)
diff --git a/net/core/dev.h b/net/core/dev.h
index 900880e8b5b4..4d872a79bafb 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
{
+ if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+ return NETDEV_NAPI_THREADED_BUSY_POLL;
+
if (test_bit(NAPI_STATE_THREADED, &n->state))
return NETDEV_NAPI_THREADED_ENABLED;
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index e9a2a6f26cb7..ff20435c45d2 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
[NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
- [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1),
+ [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
};
/* NETDEV_CMD_BIND_TX - do */
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 48eb49aa03d4..048c8de1a130 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
enum netdev_napi_threaded {
NETDEV_NAPI_THREADED_DISABLED,
NETDEV_NAPI_THREADED_ENABLED,
+ NETDEV_NAPI_THREADED_BUSY_POLL,
};
enum {
diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh
index 7d2d40812074..5ec1c85c1623 100755
--- a/tools/testing/selftests/net/busy_poll_test.sh
+++ b/tools/testing/selftests/net/busy_poll_test.sh
@@ -27,6 +27,8 @@ NAPI_DEFER_HARD_IRQS=100
GRO_FLUSH_TIMEOUT=50000
SUSPEND_TIMEOUT=20000000
+NAPI_THREADED_MODE_BUSY_POLL=2
+
setup_ns()
{
set -e
@@ -62,6 +64,9 @@ cleanup_ns()
test_busypoll()
{
suspend_value=${1:-0}
+ napi_threaded_value=${2:-0}
+ prefer_busy_poll_value=${3:-$PREFER_BUSY_POLL}
+
tmp_file=$(mktemp)
out_file=$(mktemp)
@@ -73,10 +78,11 @@ test_busypoll()
-b${SERVER_IP} \
-m${MAX_EVENTS} \
-u${BUSY_POLL_USECS} \
- -P${PREFER_BUSY_POLL} \
+ -P${prefer_busy_poll_value} \
-g${BUSY_POLL_BUDGET} \
-i${NSIM_SV_IFIDX} \
-s${suspend_value} \
+ -t${napi_threaded_value} \
-o${out_file}&
wait_local_port_listen nssv ${SERVER_PORT} tcp
@@ -109,6 +115,15 @@ test_busypoll_with_suspend()
return $?
}
+test_busypoll_with_napi_threaded()
+{
+ # Only enable napi threaded poll. Set suspend timeout and prefer busy
+ # poll to 0.
+ test_busypoll 0 ${NAPI_THREADED_MODE_BUSY_POLL} 0
+
+ return $?
+}
+
###
### Code start
###
@@ -154,6 +169,13 @@ if [ $? -ne 0 ]; then
exit 1
fi
+test_busypoll_with_napi_threaded
+if [ $? -ne 0 ]; then
+ echo "test_busypoll_with_napi_threaded failed"
+ cleanup_ns
+ exit 1
+fi
+
echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK
echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c
index 04c7ff577bb8..3a81f9c94795 100644
--- a/tools/testing/selftests/net/busy_poller.c
+++ b/tools/testing/selftests/net/busy_poller.c
@@ -65,15 +65,16 @@ static uint32_t cfg_busy_poll_usecs;
static uint16_t cfg_busy_poll_budget;
static uint8_t cfg_prefer_busy_poll;
-/* IRQ params */
+/* NAPI params */
static uint32_t cfg_defer_hard_irqs;
static uint64_t cfg_gro_flush_timeout;
static uint64_t cfg_irq_suspend_timeout;
+static enum netdev_napi_threaded cfg_napi_threaded_poll = NETDEV_NAPI_THREADED_DISABLED;
static void usage(const char *filepath)
{
error(1, 0,
- "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -i<ifindex>",
+ "Usage: %s -p<port> -b<addr> -m<max_events> -u<busy_poll_usecs> -P<prefer_busy_poll> -g<busy_poll_budget> -o<outfile> -d<defer_hard_irqs> -r<gro_flush_timeout> -s<irq_suspend_timeout> -t<napi_threaded_poll> -i<ifindex>",
filepath);
}
@@ -86,7 +87,7 @@ static void parse_opts(int argc, char **argv)
if (argc <= 1)
usage(argv[0]);
- while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) {
+ while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:t:")) != -1) {
/* most options take integer values, except o and b, so reduce
* code duplication a bit for the common case by calling
* strtoull here and leave bounds checking and casting per
@@ -168,6 +169,12 @@ static void parse_opts(int argc, char **argv)
cfg_ifindex = (int)tmp;
break;
+ case 't':
+ if (tmp > 2)
+ error(1, ERANGE, "napi threaded poll value must be 0-2");
+
+ cfg_napi_threaded_poll = (enum netdev_napi_threaded)tmp;
+ break;
}
}
@@ -247,6 +254,9 @@ static void setup_queue(void)
netdev_napi_set_req_set_irq_suspend_timeout(set_req,
cfg_irq_suspend_timeout);
+ if (cfg_napi_threaded_poll)
+ netdev_napi_set_req_set_threaded(set_req, cfg_napi_threaded_poll);
+
if (netdev_napi_set(ys, set_req))
error(1, 0, "can't set NAPI params: %s\n", yerr.msg);