summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorMartin KaFai Lau <martin.lau@kernel.org>2025-02-20 13:07:22 -0800
committerMartin KaFai Lau <martin.lau@kernel.org>2025-02-20 14:30:16 -0800
commit68b92ac494eb767cff5826372328c10e24b2e25a (patch)
tree5a8d58615af649ddc8416ef380aaaafd5448b388 /include
parent09bc97bcf868af16a2cc78a1b8b6c9d31cbabd23 (diff)
parentf4924aec58dd9e14779f4bc11a6bf3a830a42a6c (diff)
Merge branch 'net-timestamp-bpf-extension-to-equip-applications-transparently'
Jason Xing says: ==================== net-timestamp: bpf extension to equip applications transparently "Timestamping is key to debugging network stack latency. With SO_TIMESTAMPING, bugs that are otherwise incorrectly assumed to be network issues can be attributed to the kernel." This is extracted from the talk "SO_TIMESTAMPING: Powering Fleetwide RPC Monitoring" given by Willem de Bruijn at netdevconf 0x17. There are a few areas that need optimization for easier use and lower performance impact, which I highlighted and mainly discussed at netconf 2024 with Willem de Bruijn and John Fastabend: uAPI compatibility, extra system call overhead, and the need for application modification. I initially managed to solve these issues by writing a kernel module that hooks various key functions. However, this approach is not suitable for the next kernel release. Therefore, a BPF extension was proposed. Along the way, Martin KaFai Lau provided invaluable suggestions about BPF. Many thanks here! This series adds the BPF networking timestamping infrastructure by reusing most of the tx timestamping callback that is currently enabled by SO_TIMESTAMPING. This series also adds TX timestamping support for TCP. The RX timestamping and UDP support will be added in the future. ==================== Link: https://patch.msgid.link/20250220072940.99994-1-kerneljasonxing@gmail.com Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/filter.h1
-rw-r--r--include/linux/skbuff.h12
-rw-r--r--include/net/sock.h10
-rw-r--r--include/net/tcp.h7
-rw-r--r--include/uapi/linux/bpf.h30
5 files changed, 55 insertions, 5 deletions
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a3ea46281595..d36d5d5180b1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1508,6 +1508,7 @@ struct bpf_sock_ops_kern {
void *skb_data_end;
u8 op;
u8 is_fullsock;
+ u8 is_locked_tcp_sock;
u8 remaining_opt_len;
u64 temp; /* temp and everything after is not
* initialized to 0 before calling
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bb2b751d274a..0b4f1889500d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,7 @@ struct skb_shared_hwtstamps {
/* Definitions for tx_flags in struct skb_shared_info */
enum {
/* generate hardware time stamp */
- SKBTX_HW_TSTAMP = 1 << 0,
+ SKBTX_HW_TSTAMP_NOBPF = 1 << 0,
/* generate software time stamp when queueing packet to NIC */
SKBTX_SW_TSTAMP = 1 << 1,
@@ -489,10 +489,16 @@ enum {
/* generate software time stamp when entering packet scheduling */
SKBTX_SCHED_TSTAMP = 1 << 6,
+
+ /* used for bpf extension when a bpf program is loaded */
+ SKBTX_BPF = 1 << 7,
};
+#define SKBTX_HW_TSTAMP (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)
+
#define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \
- SKBTX_SCHED_TSTAMP)
+ SKBTX_SCHED_TSTAMP | \
+ SKBTX_BPF)
#define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | \
SKBTX_HW_TSTAMP_USE_CYCLES | \
SKBTX_ANY_SW_TSTAMP)
@@ -4564,7 +4570,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
skb_clone_tx_timestamp(skb);
- if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
+ if (skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF))
skb_tstamp_tx(skb, NULL);
}
diff --git a/include/net/sock.h b/include/net/sock.h
index 60ebf3c7b229..2f6b55c59c16 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -303,6 +303,7 @@ struct sk_filter;
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
@@ -525,6 +526,8 @@ struct sock {
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+ u8 sk_bpf_cb_flags;
void *sk_user_data;
#ifdef CONFIG_SECURITY
@@ -2921,6 +2924,13 @@ int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
void sock_enable_timestamps(struct sock *sk);
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+#else
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+}
+#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7fd2d7fa4532..ae6c95b01012 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -964,10 +964,12 @@ struct tcp_skb_cb {
__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
- __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
+#define TSTAMP_ACK_SK 0x1
+#define TSTAMP_ACK_BPF 0x2
+ __u8 txstamp_ack:2, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
- unused:5;
+ unused:4;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
@@ -2657,6 +2659,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_owned_by_me(sk);
}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2acf9b336371..defa5bb881f4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6913,6 +6913,12 @@ enum {
BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
};
+enum {
+ SK_BPF_CB_TX_TIMESTAMPING = 1<<0,
+ SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
+ SK_BPF_CB_TX_TIMESTAMPING
+};
+
/* List of known BPF sock_ops operators.
* New entries can only be added at the end
*/
@@ -7025,6 +7031,29 @@ enum {
* by the kernel or the
* earlier bpf-progs.
*/
+ BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing
+ * through dev layer when
+ * SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send
+ * to the nic when SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when
+ * SK_BPF_CB_TX_TIMESTAMPING feature
+ * is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the
+ * same sendmsg call are acked
+ * when SK_BPF_CB_TX_TIMESTAMPING
+ * feature is on.
+ */
+ BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall
+ * is triggered. It's used to correlate
+ * sendmsg timestamp with corresponding
+ * tskey.
+ */
};
/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
@@ -7091,6 +7120,7 @@ enum {
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+ SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */
};
enum {