bpf, sockmap: Fix FIONREAD for sockmap

[ Upstream commit 929e30f9312514902133c45e51c79088421ab084 ] A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new msg_tot_len field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. Note that we intentionally do not include sk_receive_queue data in the FIONREAD result. Data in sk_receive_queue has not yet been processed by the BPF verdict program, and may be redirected to other sockets or dropped. Including it would create semantic ambiguity since this data may never be readable by the user. Unix and VSOCK sockets have similar issues, but fixing them is outside the scope of this patch as it would require more intrusive changes. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev> Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com> Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
author: Jiayuan Chen <jiayuan.chen@linux.dev> 2026-01-24 19:32:44 +0800
committer: Sasha Levin <sashal@kernel.org> 2026-03-04 07:19:32 -0500
commit: 9681044e45c95c4e59f19f6316291d2b91eed3d2 (patch)
tree: 4116b533aa2f199ac1671b970fd20f6110d103c7 /include/linux
parent: acaf1ea47bbfbc776bf8893532dccbf2d2167ae8 (diff)
1 files changed, 66 insertions, 2 deletions
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 5c5a2d65184c..e923f1c24ce4 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -93,6 +93,8 @@ struct sk_psock {
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
 	spinlock_t			ingress_lock;
+	/** @msg_tot_len: Total bytes queued in ingress_msg list. */
+	u32				msg_tot_len;
 	unsigned long			state;
 	struct list_head		link;
 	spinlock_t			link_lock;
@@ -312,6 +314,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock)
+{
+	/* Used by ioctl to read msg_tot_len only; lock-free for performance */
+	return READ_ONCE(psock->msg_tot_len);
+}
+
+static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff)
+{
+	/* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock().
+	 * ingress_lock should be held to prevent concurrent updates to msg_tot_len
+	 */
+	WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff);
+}
+
+static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	sk_psock_msg_len_add_locked(psock, diff);
+	spin_unlock_bh(&psock->ingress_lock);
+}
+
 static inline bool sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
@@ -320,6 +343,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 		list_add_tail(&msg->list, &psock->ingress_msg);
+		sk_psock_msg_len_add_locked(psock, msg->sg.size);
 		ret = true;
 	} else {
 		sk_msg_free(psock->sk, msg);
@@ -336,18 +360,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
 
 	spin_lock_bh(&psock->ingress_lock);
 	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
-	if (msg)
+	if (msg) {
 		list_del(&msg->list);
+		sk_psock_msg_len_add_locked(psock, -msg->sg.size);
+	}
 	spin_unlock_bh(&psock->ingress_lock);
 	return msg;
 }
 
+static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock)
+{
+	return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+}
+
 static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
 {
 	struct sk_msg *msg;
 
 	spin_lock_bh(&psock->ingress_lock);
-	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	msg = sk_psock_peek_msg_locked(psock);
 	spin_unlock_bh(&psock->ingress_lock);
 	return msg;
 }
@@ -511,6 +542,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
+/* for tcp only, sk is locked */
+static inline ssize_t sk_psock_msg_inq(struct sock *sk)
+{
+	struct sk_psock *psock;
+	ssize_t inq = 0;
+
+	psock = sk_psock_get(sk);
+	if (likely(psock)) {
+		inq = sk_psock_get_msg_len_nolock(psock);
+		sk_psock_put(sk, psock);
+	}
+	return inq;
+}
+
+/* for udp only, sk is not locked */
+static inline ssize_t sk_msg_first_len(struct sock *sk)
+{
+	struct sk_psock *psock;
+	struct sk_msg *msg;
+	ssize_t inq = 0;
+
+	psock = sk_psock_get(sk);
+	if (likely(psock)) {
+		spin_lock_bh(&psock->ingress_lock);
+		msg = sk_psock_peek_msg_locked(psock);
+		if (msg)
+			inq = msg->sg.size;
+		spin_unlock_bh(&psock->ingress_lock);
+		sk_psock_put(sk, psock);
+	}
+	return inq;
+}
+
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
author	Jiayuan Chen <jiayuan.chen@linux.dev>	2026-01-24 19:32:44 +0800
committer	Sasha Levin <sashal@kernel.org>	2026-03-04 07:19:32 -0500
commit	9681044e45c95c4e59f19f6316291d2b91eed3d2 (patch)
tree	4116b533aa2f199ac1671b970fd20f6110d103c7 /include/linux
parent	acaf1ea47bbfbc776bf8893532dccbf2d2167ae8 (diff)