From 5d5de3a431d87ac51d43da8d796891d014975ab7 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Thu, 16 Feb 2023 10:48:21 +0800
Subject: bpf: Only allocate one bpf_mem_cache for bpf_cpumask_ma

The size of bpf_cpumask is fixed, so there is no need to allocate many
bpf_mem_caches for bpf_cpumask_ma; just one bpf_mem_cache is enough.
Also add comments for bpf_mem_alloc_init() in bpf_mem_alloc.h to prevent
future misuse.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230216024821.2202916-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3e164b8efaa9..a7104af61ab4 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -14,6 +14,13 @@ struct bpf_mem_alloc {
 	struct work_struct work;
 };
 
+/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
+ * Alloc and free are done with bpf_mem_cache_{alloc,free}().
+ *
+ * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
+ * Alloc and free are done with bpf_mem_{alloc,free}() and the size of
+ * the returned object is given by the size argument of bpf_mem_alloc().
+ */
 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 
-- 
cgit v1.2.3


From 7e0dac2807e6c4ae8c56941d74971fdb0763b4f9 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:45 -0800
Subject: bpf: Refactor process_dynptr_func

This change cleans up process_dynptr_func's flow to be more intuitive
and updates some comments with more context.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-3-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  3 ---
 kernel/bpf/verifier.c        | 62 ++++++++++++++++++++++----------------------
 2 files changed, 31 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index cf1bb1cf4a7b..b26ff2a8f63b 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -616,9 +616,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
 			   enum bpf_arg_type arg_type);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size);
-struct bpf_call_arg_meta;
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
-			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta);
 
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5cb8b623f639..e0e00509846b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -959,39 +959,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				       int spi)
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
+	int spi;
+
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return false;
 
-	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
-	 * will do check_mem_access to check and update stack bounds later, so
-	 * return true for that case.
+	spi = dynptr_get_spi(env, reg);
+
+	/* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+	 * error because this just means the stack state hasn't been updated yet.
+	 * We will do check_mem_access to check and update stack bounds later.
 	 */
-	if (spi < 0)
-		return spi == -ERANGE;
-	/* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
-	 * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
-	 * ensure dynptr objects at the slots we are touching are completely
-	 * destructed before we reinitialize them for a new one. For referenced
-	 * ones, destroy_if_dynptr_stack_slot returns an error early instead of
-	 * delaying it until the end where the user will get "Unreleased
+	if (spi < 0 && spi != -ERANGE)
+		return false;
+
+	/* We don't need to check if the stack slots are marked by previous
+	 * dynptr initializations because we allow overwriting existing unreferenced
+	 * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+	 * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+	 * touching are completely destructed before we reinitialize them for a new
+	 * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+	 * instead of delaying it until the end where the user will get "Unreleased
 	 * reference" error.
 	 */
 	return true;
 }
 
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				     int spi)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
-	int i;
+	int i, spi;
 
-	/* This already represents first slot of initialized bpf_dynptr */
+	/* This already represents first slot of initialized bpf_dynptr.
+	 *
+	 * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+	 * check_func_arg_reg_off's logic, so we don't need to check its
+	 * offset and alignment.
+	 */
 	if (reg->type == CONST_PTR_TO_DYNPTR)
 		return true;
 
+	spi = dynptr_get_spi(env, reg);
 	if (spi < 0)
 		return false;
 	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
@@ -6215,11 +6225,10 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
  * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
  * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
  */
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
-			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+			       enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
 {
 	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
-	int spi = 0;
 
 	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
 	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -6228,15 +6237,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 		verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
 		return -EFAULT;
 	}
-	/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
-	 * check_func_arg_reg_off's logic. We only need to check offset
-	 * and its alignment for PTR_TO_STACK.
-	 */
-	if (reg->type == PTR_TO_STACK) {
-		spi = dynptr_get_spi(env, reg);
-		if (spi < 0 && spi != -ERANGE)
-			return spi;
-	}
 
 	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for
 	 *		 constructing a mutable bpf_dynptr object.
@@ -6254,7 +6254,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 	 *		 to.
 	 */
 	if (arg_type & MEM_UNINIT) {
-		if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
+		if (!is_dynptr_reg_valid_uninit(env, reg)) {
 			verbose(env, "Dynptr has to be an uninitialized dynptr\n");
 			return -EINVAL;
 		}
@@ -6277,7 +6277,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
 			return -EINVAL;
 		}
 
-		if (!is_dynptr_reg_valid_init(env, reg, spi)) {
+		if (!is_dynptr_reg_valid_init(env, reg)) {
 			verbose(env,
 				"Expected an initialized dynptr as arg #%d\n",
 				regno);
-- 
cgit v1.2.3


From 8357b366cbb09b17c90e2cd758360a6bd2ea7507 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:47 -0800
Subject: bpf: Define no-ops for externally called bpf dynptr functions

Some bpf dynptr functions will be called from places where, if
CONFIG_BPF_SYSCALL is not set, the dynptr function is undefined.
For example, when skb type dynptrs are added in the next commit,
dynptr functions are called from net/core/filter.c.

This patch defines no-op implementations of these dynptr functions
so that they do not break compilation by being an undefined reference.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-5-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 75 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520b238abd5a..296841a31749 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1124,6 +1124,33 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func(
 	return bpf_func(ctx, insnsi);
 }
 
+/* the implementation of the opaque uapi struct bpf_dynptr */
+struct bpf_dynptr_kern {
+	void *data;
+	/* Size represents the number of usable bytes of dynptr data.
+	 * If for example the offset is at 4 for a local dynptr whose data is
+	 * of type u64, the number of usable bytes is 4.
+	 *
+	 * The upper 8 bits are reserved. It is as follows:
+	 * Bits 0 - 23 = size
+	 * Bits 24 - 30 = dynptr type
+	 * Bit 31 = whether dynptr is read-only
+	 */
+	u32 size;
+	u32 offset;
+} __aligned(8);
+
+enum bpf_dynptr_type {
+	BPF_DYNPTR_TYPE_INVALID,
+	/* Points to memory that is local to the bpf program */
+	BPF_DYNPTR_TYPE_LOCAL,
+	/* Underlying data is a ringbuf record */
+	BPF_DYNPTR_TYPE_RINGBUF,
+};
+
+int bpf_dynptr_check_size(u32 size);
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
+
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
 int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
@@ -2266,6 +2293,11 @@ static inline bool has_current_bpf_ctx(void)
 }
 
 void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog);
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+		     enum bpf_dynptr_type type, u32 offset, u32 size);
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -2495,6 +2527,19 @@ static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog)
 static inline void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
 }
+
+static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
+				   enum bpf_dynptr_type type, u32 offset, u32 size)
+{
+}
+
+static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+}
+
+static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@ -2913,36 +2958,6 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 num_args, struct bpf_bprintf_data *data);
 void bpf_bprintf_cleanup(struct bpf_bprintf_data *data);
 
-/* the implementation of the opaque uapi struct bpf_dynptr */
-struct bpf_dynptr_kern {
-	void *data;
-	/* Size represents the number of usable bytes of dynptr data.
-	 * If for example the offset is at 4 for a local dynptr whose data is
-	 * of type u64, the number of usable bytes is 4.
-	 *
-	 * The upper 8 bits are reserved. It is as follows:
-	 * Bits 0 - 23 = size
-	 * Bits 24 - 30 = dynptr type
-	 * Bit 31 = whether dynptr is read-only
-	 */
-	u32 size;
-	u32 offset;
-} __aligned(8);
-
-enum bpf_dynptr_type {
-	BPF_DYNPTR_TYPE_INVALID,
-	/* Points to memory that is local to the bpf program */
-	BPF_DYNPTR_TYPE_LOCAL,
-	/* Underlying data is a kernel-produced ringbuf record */
-	BPF_DYNPTR_TYPE_RINGBUF,
-};
-
-void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
-		     enum bpf_dynptr_type type, u32 offset, u32 size);
-void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
-int bpf_dynptr_check_size(u32 size);
-u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr);
-
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
 void bpf_cgroup_atype_put(int cgroup_atype);
-- 
cgit v1.2.3


From b5964b968ac64c2ec2debee7518499113b27c34e Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:50 -0800
Subject: bpf: Add skb dynptrs

Add skb dynptrs, which are dynptrs whose underlying pointer points
to an skb. The dynptr acts on skb data. skb dynptrs have two main
benefits. One is that they allow operations on sizes that are not
statically known at compile-time (e.g. variable-sized accesses).
Another is that parsing the packet data through dynptrs (instead of
through direct access of skb->data and skb->data_end) can be more
ergonomic and less brittle (e.g. it does not need manual 'if' checks
for being within bounds of data_end).

For bpf prog types that don't support writes on skb data, the dynptr is
read-only (bpf_dynptr_write() will return an error).

For reads and writes through the bpf_dynptr_read() and bpf_dynptr_write()
interfaces, reading and writing from/to data in the head as well as from/to
non-linear paged buffers is supported. Data slices through the
bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and
bpf_dynptr_slice_rdwr() (added in subsequent commit) should be used.

For examples of how skb dynptrs can be used, please see the attached
selftests.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-8-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 14 +++++++-
 include/linux/filter.h         | 18 ++++++++++
 include/uapi/linux/bpf.h       | 13 ++++++--
 kernel/bpf/btf.c               | 18 ++++++++++
 kernel/bpf/helpers.c           | 76 ++++++++++++++++++++++++++++++++++--------
 kernel/bpf/verifier.c          | 61 +++++++++++++++++++++++++++++++++
 net/core/filter.c              | 67 +++++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 13 ++++++--
 8 files changed, 261 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 296841a31749..e7436d7615b0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -607,11 +607,14 @@ enum bpf_type_flag {
 	 */
 	NON_OWN_REF		= BIT(14 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to sk_buff */
+	DYNPTR_TYPE_SKB		= BIT(15 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
-#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF)
+#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1146,6 +1149,8 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_LOCAL,
 	/* Underlying data is a ringbuf record */
 	BPF_DYNPTR_TYPE_RINGBUF,
+	/* Underlying data is a sk_buff */
+	BPF_DYNPTR_TYPE_SKB,
 };
 
 int bpf_dynptr_check_size(u32 size);
@@ -2846,6 +2851,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 				struct bpf_insn *insn_buf,
 				struct bpf_prog *prog,
 				u32 *target_size);
+int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+			       struct bpf_dynptr_kern *ptr);
 #else
 static inline bool bpf_sock_common_is_valid_access(int off, int size,
 						   enum bpf_access_type type,
@@ -2867,6 +2874,11 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
 {
 	return 0;
 }
+static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+					     struct bpf_dynptr_kern *ptr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 #ifdef CONFIG_INET
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1727898f1641..de18e844d15e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1542,4 +1542,22 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index
 	return XDP_REDIRECT;
 }
 
+#ifdef CONFIG_NET
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+			  u32 len, u64 flags);
+#else /* CONFIG_NET */
+static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
+				       void *to, u32 len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
+					const void *from, u32 len, u64 flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_NET */
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 62ce1f5d1b1d..d0351d30e551 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5325,11 +5325,17 @@ union bpf_attr {
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
- *		*flags* is currently unused.
+ *
+ *		*flags* must be 0 except for skb-type dynptrs.
+ *
+ *		For skb-type dynptrs:
+ *		    *  For *flags*, please see the flags accepted by
+ *		       **bpf_skb_store_bytes**\ ().
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr or if *flags* is not 0.
+ *		is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ *		other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
  *
  * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
@@ -5337,6 +5343,9 @@ union bpf_attr {
  *
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
+ *
+ *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 84cca8473873..ef2d8969ed1f 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -207,6 +207,11 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_TRACING,
 	BTF_KFUNC_HOOK_SYSCALL,
 	BTF_KFUNC_HOOK_FMODRET,
+	BTF_KFUNC_HOOK_CGROUP_SKB,
+	BTF_KFUNC_HOOK_SCHED_ACT,
+	BTF_KFUNC_HOOK_SK_SKB,
+	BTF_KFUNC_HOOK_SOCKET_FILTER,
+	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -7708,6 +7713,19 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_TRACING;
 	case BPF_PROG_TYPE_SYSCALL:
 		return BTF_KFUNC_HOOK_SYSCALL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return BTF_KFUNC_HOOK_CGROUP_SKB;
+	case BPF_PROG_TYPE_SCHED_ACT:
+		return BTF_KFUNC_HOOK_SCHED_ACT;
+	case BPF_PROG_TYPE_SK_SKB:
+		return BTF_KFUNC_HOOK_SK_SKB;
+	case BPF_PROG_TYPE_SOCKET_FILTER:
+		return BTF_KFUNC_HOOK_SOCKET_FILTER;
+	case BPF_PROG_TYPE_LWT_OUT:
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_XMIT:
+	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+		return BTF_KFUNC_HOOK_LWT;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index abdcc52f90a6..e8e2414d1587 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1420,11 +1420,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
 	return ptr->size & DYNPTR_RDONLY_BIT;
 }
 
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+	ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
 static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
 {
 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+	return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
 u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_SIZE_MASK;
@@ -1497,6 +1507,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
 	   u32, offset, u64, flags)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
 	if (!src->data || flags)
@@ -1506,13 +1517,23 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
 	if (err)
 		return err;
 
-	/* Source and destination may possibly overlap, hence use memmove to
-	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
-	 * pointing to overlapping PTR_TO_MAP_VALUE regions.
-	 */
-	memmove(dst, src->data + src->offset + offset, len);
+	type = bpf_dynptr_get_type(src);
 
-	return 0;
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		/* Source and destination may possibly overlap, hence use memmove to
+		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
+		 */
+		memmove(dst, src->data + src->offset + offset, len);
+		return 0;
+	case BPF_DYNPTR_TYPE_SKB:
+		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+	default:
+		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+		return -EFAULT;
+	}
 }
 
 static const struct bpf_func_proto bpf_dynptr_read_proto = {
@@ -1529,22 +1550,36 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
 BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
 	   u32, len, u64, flags)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
-	if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
+	if (!dst->data || bpf_dynptr_is_rdonly(dst))
 		return -EINVAL;
 
 	err = bpf_dynptr_check_off_len(dst, offset, len);
 	if (err)
 		return err;
 
-	/* Source and destination may possibly overlap, hence use memmove to
-	 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
-	 * pointing to overlapping PTR_TO_MAP_VALUE regions.
-	 */
-	memmove(dst->data + dst->offset + offset, src, len);
+	type = bpf_dynptr_get_type(dst);
 
-	return 0;
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		if (flags)
+			return -EINVAL;
+		/* Source and destination may possibly overlap, hence use memmove to
+		 * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+		 * pointing to overlapping PTR_TO_MAP_VALUE regions.
+		 */
+		memmove(dst->data + dst->offset + offset, src, len);
+		return 0;
+	case BPF_DYNPTR_TYPE_SKB:
+		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+					     flags);
+	default:
+		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+		return -EFAULT;
+	}
 }
 
 static const struct bpf_func_proto bpf_dynptr_write_proto = {
@@ -1560,6 +1595,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
 
 BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
 {
+	enum bpf_dynptr_type type;
 	int err;
 
 	if (!ptr->data)
@@ -1572,7 +1608,19 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
 	if (bpf_dynptr_is_rdonly(ptr))
 		return 0;
 
-	return (unsigned long)(ptr->data + ptr->offset + offset);
+	type = bpf_dynptr_get_type(ptr);
+
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return (unsigned long)(ptr->data + ptr->offset + offset);
+	case BPF_DYNPTR_TYPE_SKB:
+		/* skb dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+		return 0;
+	default:
+		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+		return 0;
+	}
 }
 
 static const struct bpf_func_proto bpf_dynptr_data_proto = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d052aa5800de..4f5fce16543b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -750,6 +750,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_LOCAL;
 	case DYNPTR_TYPE_RINGBUF:
 		return BPF_DYNPTR_TYPE_RINGBUF;
+	case DYNPTR_TYPE_SKB:
+		return BPF_DYNPTR_TYPE_SKB;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -6295,6 +6297,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 			case DYNPTR_TYPE_RINGBUF:
 				err_extra = "ringbuf";
 				break;
+			case DYNPTR_TYPE_SKB:
+				err_extra = "skb ";
+				break;
 			default:
 				err_extra = "<unknown>";
 				break;
@@ -6737,6 +6742,24 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
 	return state->stack[spi].spilled_ptr.ref_obj_id;
 }
 
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+					    struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi;
+
+	if (reg->type == CONST_PTR_TO_DYNPTR)
+		return reg->dynptr.type;
+
+	spi = __get_spi(reg->off);
+	if (spi < 0) {
+		verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+		return BPF_DYNPTR_TYPE_INVALID;
+	}
+
+	return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 			  struct bpf_call_arg_meta *meta,
 			  const struct bpf_func_proto *fn,
@@ -8383,6 +8406,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 		break;
 	}
+	case BPF_FUNC_dynptr_write:
+	{
+		enum bpf_dynptr_type dynptr_type;
+		struct bpf_reg_state *reg;
+
+		reg = get_dynptr_arg_reg(env, fn, regs);
+		if (!reg)
+			return -EFAULT;
+
+		dynptr_type = dynptr_get_type(env, reg);
+		if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+			return -EFAULT;
+
+		if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+			/* this will trigger clear_all_pkt_pointers(), which will
+			 * invalidate all dynptr slices associated with the skb
+			 */
+			changes_data = true;
+
+		break;
+	}
 	case BPF_FUNC_user_ringbuf_drain:
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_user_ringbuf_callback_state);
@@ -8898,6 +8942,7 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_remove,
 	KF_bpf_rbtree_add,
 	KF_bpf_rbtree_first,
+	KF_bpf_dynptr_from_skb,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8912,6 +8957,7 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8928,6 +8974,7 @@ BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9682,6 +9729,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			if (is_kfunc_arg_uninit(btf, &args[i]))
 				dynptr_arg_type |= MEM_UNINIT;
 
+			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+				dynptr_arg_type |= DYNPTR_TYPE_SKB;
+
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
 				return ret;
@@ -16356,6 +16406,17 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+		bool seen_direct_write = env->seen_direct_write;
+		bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+		if (is_rdonly)
+			insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly);
+
+		/* restore env->seen_direct_write to its original value, since
+		 * may_access_direct_pkt_data mutates it
+		 */
+		env->seen_direct_write = seen_direct_write;
 	}
 	return 0;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 1d6f165923bf..f3afa31a9b10 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1721,6 +1721,12 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+			  u32 len, u64 flags)
+{
+	return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
+}
+
 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
 	   void *, to, u32, len)
 {
@@ -1751,6 +1757,11 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+{
+	return ____bpf_skb_load_bytes(skb, offset, to, len);
+}
+
 BPF_CALL_4(bpf_flow_dissector_load_bytes,
 	   const struct bpf_flow_dissector *, ctx, u32, offset,
 	   void *, to, u32, len)
@@ -11621,3 +11632,59 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 
 	return func;
 }
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+__bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
+				    struct bpf_dynptr_kern *ptr__uninit)
+{
+	if (flags) {
+		bpf_dynptr_set_null(ptr__uninit);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+
+	return 0;
+}
+__diag_pop();
+
+int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+			       struct bpf_dynptr_kern *ptr__uninit)
+{
+	int err;
+
+	err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
+	if (err)
+		return err;
+
+	bpf_dynptr_set_rdonly(ptr__uninit);
+
+	return 0;
+}
+
+BTF_SET8_START(bpf_kfunc_check_set_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
+BTF_SET8_END(bpf_kfunc_check_set_skb)
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_skb,
+};
+
+static int __init bpf_kfunc_init(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+}
+late_initcall(bpf_kfunc_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 62ce1f5d1b1d..d0351d30e551 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5325,11 +5325,17 @@ union bpf_attr {
  *	Description
  *		Write *len* bytes from *src* into *dst*, starting from *offset*
  *		into *dst*.
- *		*flags* is currently unused.
+ *
+ *		*flags* must be 0 except for skb-type dynptrs.
+ *
+ *		For skb-type dynptrs:
+ *		    *  For *flags*, please see the flags accepted by
+ *		       **bpf_skb_store_bytes**\ ().
  *	Return
  *		0 on success, -E2BIG if *offset* + *len* exceeds the length
  *		of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
- *		is a read-only dynptr or if *flags* is not 0.
+ *		is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs,
+ *		other errors correspond to errors returned by **bpf_skb_store_bytes**\ ().
  *
  * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
  *	Description
@@ -5337,6 +5343,9 @@ union bpf_attr {
  *
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
+ *
+ *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
  *		read-only, if the dynptr is invalid, or if the offset and length
-- 
cgit v1.2.3


From 05421aecd4ed65da0dc17b0c3c13779ef334e9e5 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:51 -0800
Subject: bpf: Add xdp dynptrs

Add xdp dynptrs, which are dynptrs whose underlying pointer points
to an xdp_buff. The dynptr acts on xdp data. xdp dynptrs have two main
benefits. One is that they allow operations on sizes that are not
statically known at compile-time (eg variable-sized accesses).
Another is that parsing the packet data through dynptrs (instead of
through direct access of xdp->data and xdp->data_end) can be more
ergonomic and less brittle (eg does not need manual if checking for
being within bounds of data_end).

For reads and writes on the dynptr, this includes reading/writing
from/to and across fragments. Data slices through the bpf_dynptr_data
API are not supported; instead bpf_dynptr_slice() and
bpf_dynptr_slice_rdwr() should be used.

For examples of how xdp dynptrs can be used, please see the attached
selftests.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-9-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 +++++++-
 include/linux/filter.h         | 14 ++++++++++++++
 include/uapi/linux/bpf.h       |  2 +-
 kernel/bpf/helpers.c           |  9 ++++++++-
 kernel/bpf/verifier.c          | 10 ++++++++++
 net/core/filter.c              | 37 +++++++++++++++++++++++++++++++++++--
 tools/include/uapi/linux/bpf.h |  2 +-
 7 files changed, 76 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e7436d7615b0..23ec684e660d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -610,11 +610,15 @@ enum bpf_type_flag {
 	/* DYNPTR points to sk_buff */
 	DYNPTR_TYPE_SKB		= BIT(15 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to xdp_buff */
+	DYNPTR_TYPE_XDP		= BIT(16 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
-#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB)
+#define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
+				 | DYNPTR_TYPE_XDP)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1151,6 +1155,8 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_RINGBUF,
 	/* Underlying data is a sk_buff */
 	BPF_DYNPTR_TYPE_SKB,
+	/* Underlying data is a xdp_buff */
+	BPF_DYNPTR_TYPE_XDP,
 };
 
 int bpf_dynptr_check_size(u32 size);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index de18e844d15e..3f6992261ec5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1546,6 +1546,8 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index
 int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
 int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
 			  u32 len, u64 flags);
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
 #else /* CONFIG_NET */
 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
 				       void *to, u32 len)
@@ -1558,6 +1560,18 @@ static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset,
+				       void *buf, u32 len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
+					void *buf, u32 len)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_NET */
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d0351d30e551..faa304c926cf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5344,7 +5344,7 @@ union bpf_attr {
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
  *
- *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		skb and xdp type dynptrs may not use bpf_dynptr_data. They should
  *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e8e2414d1587..114a875a05b1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1530,6 +1530,8 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
 		return 0;
 	case BPF_DYNPTR_TYPE_SKB:
 		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+	case BPF_DYNPTR_TYPE_XDP:
+		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
 		return -EFAULT;
@@ -1576,6 +1578,10 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
 	case BPF_DYNPTR_TYPE_SKB:
 		return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
 					     flags);
+	case BPF_DYNPTR_TYPE_XDP:
+		if (flags)
+			return -EINVAL;
+		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
 		return -EFAULT;
@@ -1615,7 +1621,8 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
 	case BPF_DYNPTR_TYPE_RINGBUF:
 		return (unsigned long)(ptr->data + ptr->offset + offset);
 	case BPF_DYNPTR_TYPE_SKB:
-		/* skb dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+	case BPF_DYNPTR_TYPE_XDP:
+		/* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
 		return 0;
 	default:
 		WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4f5fce16543b..5e42946e53ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -752,6 +752,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_RINGBUF;
 	case DYNPTR_TYPE_SKB:
 		return BPF_DYNPTR_TYPE_SKB;
+	case DYNPTR_TYPE_XDP:
+		return BPF_DYNPTR_TYPE_XDP;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -6300,6 +6302,9 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 			case DYNPTR_TYPE_SKB:
 				err_extra = "skb ";
 				break;
+			case DYNPTR_TYPE_XDP:
+				err_extra = "xdp ";
+				break;
 			default:
 				err_extra = "<unknown>";
 				break;
@@ -8943,6 +8948,7 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_add,
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
+	KF_bpf_dynptr_from_xdp,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8958,6 +8964,7 @@ BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8975,6 +8982,7 @@ BTF_ID(func, bpf_rbtree_remove)
 BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9731,6 +9739,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 			if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
 				dynptr_arg_type |= DYNPTR_TYPE_SKB;
+			else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+				dynptr_arg_type |= DYNPTR_TYPE_XDP;
 
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index f3afa31a9b10..c692046fa7f6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3839,7 +3839,7 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_1(bpf_xdp_get_buff_len, struct  xdp_buff*, xdp)
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
 {
 	return xdp_get_buff_len(xdp);
 }
@@ -3999,6 +3999,11 @@ static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+	return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
+}
+
 BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
 	   void *, buf, u32, len)
 {
@@ -4026,6 +4031,11 @@ static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+	return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
+}
+
 static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
@@ -11648,6 +11658,19 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
 
 	return 0;
 }
+
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
+				    struct bpf_dynptr_kern *ptr__uninit)
+{
+	if (flags) {
+		bpf_dynptr_set_null(ptr__uninit);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+
+	return 0;
+}
 __diag_pop();
 
 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
@@ -11668,11 +11691,20 @@ BTF_SET8_START(bpf_kfunc_check_set_skb)
 BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
 BTF_SET8_END(bpf_kfunc_check_set_skb)
 
+BTF_SET8_START(bpf_kfunc_check_set_xdp)
+BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_SET8_END(bpf_kfunc_check_set_xdp)
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_skb,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_xdp,
+};
+
 static int __init bpf_kfunc_init(void)
 {
 	int ret;
@@ -11685,6 +11717,7 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
 }
 late_initcall(bpf_kfunc_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d0351d30e551..faa304c926cf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5344,7 +5344,7 @@ union bpf_attr {
  *		*len* must be a statically known value. The returned data slice
  *		is invalidated whenever the dynptr is invalidated.
  *
- *		skb type dynptrs may not use bpf_dynptr_data. They should
+ *		skb and xdp type dynptrs may not use bpf_dynptr_data. They should
  *		instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
  *	Return
  *		Pointer to the underlying dynptr data, NULL if the dynptr is
-- 
cgit v1.2.3


From 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845 Mon Sep 17 00:00:00 2001
From: Joanne Koong <joannelkoong@gmail.com>
Date: Wed, 1 Mar 2023 07:49:52 -0800
Subject: bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr

Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr.
The user must pass in a buffer to store the contents of the data slice
if a direct pointer to the data cannot be obtained.

For skb and xdp type dynptrs, these two APIs are the only way to obtain
a data slice. However, for other types of dynptrs, there is no
difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data.

For skb type dynptrs, the data is copied into the user provided buffer
if any of the data is not in the linear portion of the skb. For xdp type
dynptrs, the data is copied into the user provided buffer if the requested
slice spans across xdp frags.

If the skb is cloned and a call to bpf_dynptr_slice_rdwr is made, then
the skb will be uncloned (see bpf_unclone_prologue()).

Please note that any bpf_dynptr_write() automatically invalidates any prior
data slices of the skb dynptr. This is because the skb may be cloned or
may need to pull its paged buffer into the head. As such, any
bpf_dynptr_write() will automatically have its prior data slices
invalidated, even if the write is to data in the skb head of an uncloned
skb. Please note as well that any other helper calls that change the
underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data
slices of the skb dynptr as well, for the same reasons.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Link: https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h         |  14 +++++
 include/uapi/linux/bpf.h       |   5 ++
 kernel/bpf/helpers.c           | 138 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c          | 127 +++++++++++++++++++++++++++++++++++--
 net/core/filter.c              |   6 +-
 tools/include/uapi/linux/bpf.h |   5 ++
 6 files changed, 288 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 3f6992261ec5..efa5d4a1677e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1548,6 +1548,9 @@ int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
 			  u32 len, u64 flags);
 int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
 int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+		      void *buf, unsigned long len, bool flush);
 #else /* CONFIG_NET */
 static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
 				       void *to, u32 len)
@@ -1572,6 +1575,17 @@ static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
 {
 	return -EOPNOTSUPP;
 }
+
+static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+	return NULL;
+}
+
+static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
+				    unsigned long len, bool flush)
+{
+	/* No-op without CONFIG_NET; return type matches the void prototype. */
+}
 #endif /* CONFIG_NET */
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index faa304c926cf..c9699304aed2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5329,6 +5329,11 @@ union bpf_attr {
  *		*flags* must be 0 except for skb-type dynptrs.
  *
  *		For skb-type dynptrs:
+ *		    *  All data slices of the dynptr are automatically
+ *		       invalidated after **bpf_dynptr_write**\ (). This is
+ *		       because writing may pull the skb and change the
+ *		       underlying packet buffer.
+ *
  *		    *  For *flags*, please see the flags accepted by
  *		       **bpf_skb_store_bytes**\ ().
  *	Return
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 114a875a05b1..648b29e78b84 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2193,6 +2193,142 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
 	return p;
 }
 
+/**
+ * bpf_dynptr_slice - Obtain a read-only pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+				   void *buffer, u32 buffer__szk)
+{
+	enum bpf_dynptr_type type;
+	u32 len = buffer__szk;
+	int err;
+
+	if (!ptr->data)
+		return 0;
+
+	err = bpf_dynptr_check_off_len(ptr, offset, len);
+	if (err)
+		return 0;
+
+	type = bpf_dynptr_get_type(ptr);
+
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return ptr->data + ptr->offset + offset;
+	case BPF_DYNPTR_TYPE_SKB:
+		return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+	case BPF_DYNPTR_TYPE_XDP:
+	{
+		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+		if (xdp_ptr)
+			return xdp_ptr;
+
+		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+		return buffer;
+	}
+	default:
+		WARN_ONCE(true, "unknown dynptr type %d\n", type);
+		return 0;
+	}
+}
+
+/**
+ * bpf_dynptr_slice_rdwr - Obtain a writable pointer to the dynptr data.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice_rdwr and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point either directly to the dynptr
+ * data at the requested offset or to the buffer if a direct data pointer cannot
+ * be obtained (example: the requested slice is in the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ *	return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ *	bpf_dynptr_write(&dynptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * @returns: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+					void *buffer, u32 buffer__szk)
+{
+	if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+		return 0;
+
+	/* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+	 *
+	 * For skb-type dynptrs, it is safe to write into the returned pointer
+	 * if the bpf program allows skb data writes. There are two possibilities
+	 * that may occur when calling bpf_dynptr_slice_rdwr:
+	 *
+	 * 1) The requested slice is in the head of the skb. In this case, the
+	 * returned pointer is directly to skb data, and if the skb is cloned, the
+	 * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+	 * The pointer can be directly written into.
+	 *
+	 * 2) Some portion of the requested slice is in the paged buffer area.
+	 * In this case, the requested data will be copied out into the buffer
+	 * and the returned pointer will be a pointer to the buffer. The skb
+	 * will not be pulled. To persist the write, the user will need to call
+	 * bpf_dynptr_write(), which will pull the skb and commit the write.
+	 *
+	 * Similarly for xdp programs, if the requested slice is not across xdp
+	 * fragments, then a direct pointer will be returned, otherwise the data
+	 * will be copied out into the buffer and the user will need to call
+	 * bpf_dynptr_write() to commit changes.
+	 */
+	return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
 __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
 {
 	return obj;
@@ -2262,6 +2398,8 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
 BTF_ID_FLAGS(func, bpf_rdonly_cast)
 BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5e42946e53ab..a856896e835a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -759,6 +759,22 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 	}
 }
 
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+	switch (type) {
+	case BPF_DYNPTR_TYPE_LOCAL:
+		return DYNPTR_TYPE_LOCAL;
+	case BPF_DYNPTR_TYPE_RINGBUF:
+		return DYNPTR_TYPE_RINGBUF;
+	case BPF_DYNPTR_TYPE_SKB:
+		return DYNPTR_TYPE_SKB;
+	case BPF_DYNPTR_TYPE_XDP:
+		return DYNPTR_TYPE_XDP;
+	default:
+		return 0;
+	}
+}
+
 static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
 {
 	return type == BPF_DYNPTR_TYPE_RINGBUF;
@@ -1681,6 +1697,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
 	       reg->type == PTR_TO_PACKET_END;
 }
 
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+	return base_type(reg->type) == PTR_TO_MEM &&
+		(reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
 /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
 static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
 				    enum bpf_reg_type which)
@@ -7429,6 +7451,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
 
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
  * are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -7436,7 +7461,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	struct bpf_reg_state *reg;
 
 	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
-		if (reg_is_pkt_pointer_any(reg))
+		if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
 			mark_reg_invalid(env, reg);
 	}));
 }
@@ -8688,6 +8713,11 @@ struct bpf_kfunc_call_arg_meta {
 	struct {
 		struct btf_field *field;
 	} arg_rbtree_root;
+	struct {
+		enum bpf_dynptr_type type;
+		u32 id;
+	} initialized_dynptr;
+	u64 mem_size;
 };
 
 static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
@@ -8761,6 +8791,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
 	return __kfunc_param_match_suffix(btf, arg, "__sz");
 }
 
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+					const struct btf_param *arg,
+					const struct bpf_reg_state *reg)
+{
+	const struct btf_type *t;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
 static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
 {
 	return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -8949,6 +8992,8 @@ enum special_kfunc_type {
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
+	KF_bpf_dynptr_slice,
+	KF_bpf_dynptr_slice_rdwr,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -8965,6 +9010,8 @@ BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -8983,6 +9030,8 @@ BTF_ID(func, bpf_rbtree_add)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
 
 static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -9062,7 +9111,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_CALLBACK;
 
-	if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+
+	if (argno + 1 < nargs &&
+	    (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+	     is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
 		arg_mem_size = true;
 
 	/* This is the catch all argument type of register types supported by
@@ -9745,6 +9797,18 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
 			if (ret < 0)
 				return ret;
+
+			if (!(dynptr_arg_type & MEM_UNINIT)) {
+				int id = dynptr_id(env, reg);
+
+				if (id < 0) {
+					verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+					return id;
+				}
+				meta->initialized_dynptr.id = id;
+				meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+			}
+
 			break;
 		}
 		case KF_ARG_PTR_TO_LIST_HEAD:
@@ -9840,14 +9904,33 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				return ret;
 			break;
 		case KF_ARG_PTR_TO_MEM_SIZE:
-			ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+		{
+			struct bpf_reg_state *size_reg = &regs[regno + 1];
+			const struct btf_param *size_arg = &args[i + 1];
+
+			ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
 			if (ret < 0) {
 				verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
 				return ret;
 			}
-			/* Skip next '__sz' argument */
+
+			if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+				if (meta->arg_constant.found) {
+					verbose(env, "verifier internal error: only one constant argument permitted\n");
+					return -EFAULT;
+				}
+				if (!tnum_is_const(size_reg->var_off)) {
+					verbose(env, "R%d must be a known constant\n", regno + 1);
+					return -EINVAL;
+				}
+				meta->arg_constant.found = true;
+				meta->arg_constant.value = size_reg->var_off.value;
+			}
+
+			/* Skip next '__sz' or '__szk' argument */
 			i++;
 			break;
+		}
 		case KF_ARG_PTR_TO_CALLBACK:
 			meta->subprogno = reg->subprogno;
 			break;
@@ -10082,6 +10165,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
 				regs[BPF_REG_0].btf = desc_btf;
 				regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+			} else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+				   meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+				enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+				mark_reg_known_zero(env, regs, BPF_REG_0);
+
+				if (!meta.arg_constant.found) {
+					verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+					return -EFAULT;
+				}
+
+				regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+				/* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+				regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+				if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+					regs[BPF_REG_0].type |= MEM_RDONLY;
+				} else {
+					/* this will set env->seen_direct_write to true */
+					if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+						verbose(env, "the prog does not allow writes to packet data\n");
+						return -EINVAL;
+					}
+				}
+
+				if (!meta.initialized_dynptr.id) {
+					verbose(env, "verifier internal error: no dynptr id\n");
+					return -EFAULT;
+				}
+				regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+				/* we don't need to set BPF_REG_0's ref obj id
+				 * because packet slices are not refcounted (see
+				 * dynptr_type_refcounted)
+				 */
 			} else {
 				verbose(env, "kernel function %s unhandled dynamic return type\n",
 					meta.func_name);
diff --git a/net/core/filter.c b/net/core/filter.c
index c692046fa7f6..8f3124e06133 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3894,8 +3894,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
-			     void *buf, unsigned long len, bool flush)
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+		      void *buf, unsigned long len, bool flush)
 {
 	unsigned long ptr_len, ptr_off = 0;
 	skb_frag_t *next_frag, *end_frag;
@@ -3941,7 +3941,7 @@ static void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
 	}
 }
 
-static void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
 {
 	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
 	u32 size = xdp->data_end - xdp->data;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index faa304c926cf..c9699304aed2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5329,6 +5329,11 @@ union bpf_attr {
  *		*flags* must be 0 except for skb-type dynptrs.
  *
  *		For skb-type dynptrs:
+ *		    *  All data slices of the dynptr are automatically
+ *		       invalidated after **bpf_dynptr_write**\ (). This is
+ *		       because writing may pull the skb and change the
+ *		       underlying packet buffer.
+ *
  *		    *  For *flags*, please see the flags accepted by
  *		       **bpf_skb_store_bytes**\ ().
  *	Return
-- 
cgit v1.2.3


From 9db44fdd8105da00669d425acab887c668df75f6 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sat, 25 Feb 2023 16:40:09 +0100
Subject: bpf: Support kptrs in local storage maps

Enable support for kptrs in local storage maps by wiring up the freeing
of these kptrs from map value. Freeing of bpf_local_storage_map is only
delayed in case there are special fields, therefore bpf_selem_free_*
path can also only dereference smap safely in that case. This is
recorded using a bool utilizing a hole in bpf_local_storage_elem. It
could have been tagged in the pointer value smap using the lowest bit
(since alignment > 1), but since there was already a hole I went with
the simpler option. Only the map structure freeing is delayed using RCU
barriers, as the buckets aren't used when selem is being freed, so they
can be freed once all readers of the bucket lists can no longer access
it.

Cc: Martin KaFai Lau <martin.lau@kernel.org>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 +++++
 kernel/bpf/bpf_local_storage.c    | 48 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/syscall.c              |  6 ++++-
 kernel/bpf/verifier.c             | 12 ++++++----
 4 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 6d37a40cd90e..0fe92986412b 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -74,6 +74,12 @@ struct bpf_local_storage_elem {
 	struct hlist_node snode;	/* Linked to bpf_local_storage */
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
+	bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU
+			    * callbacks? bpf_local_storage_map_free only
+			    * executes rcu_barrier when there are special
+			    * fields, this field remembers that to ensure we
+			    * don't access already freed smap in sdata.
+			    */
 	/* 8 bytes hole */
 	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 58da17ae5124..2bdd722fe293 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -85,6 +85,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (selem) {
 		if (value)
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
+		/* No need to call check_and_init_map_value as memory is zero init */
 		return selem;
 	}
 
@@ -113,10 +114,25 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
 	struct bpf_local_storage_elem *selem;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	/* The can_use_smap bool is set whenever we need to free additional
+	 * fields in selem data before freeing selem. bpf_local_storage_map_free
+	 * only executes rcu_barrier to wait for RCU callbacks when it has
+	 * special fields, hence we can only conditionally dereference smap, as
+	 * by this time the map might have already been freed without waiting
+	 * for our call_rcu callback if it did not have any special fields.
+	 */
+	if (selem->can_use_smap)
+		bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data);
+	kfree(selem);
+}
+
+static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu)
+{
+	/* Free directly if Tasks Trace RCU GP also implies RCU GP */
 	if (rcu_trace_implies_rcu_gp())
-		kfree(selem);
+		bpf_selem_free_rcu(rcu);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(rcu, bpf_selem_free_rcu);
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -170,9 +186,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
 	if (use_trace_rcu)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu);
 	else
-		kfree_rcu(selem, rcu);
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
 
 	return free_local_storage;
 }
@@ -240,6 +256,11 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 	hlist_add_head_rcu(&selem->map_node, &b->list);
 	raw_spin_unlock_irqrestore(&b->lock, flags);
+
+	/* If our data will have special fields, smap will wait for us to use
+	 * its record in bpf_selem_free_* RCU callbacks before freeing itself.
+	 */
+	selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record);
 }
 
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
@@ -723,6 +744,25 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
+	/* Only delay freeing of smap, buckets are not needed anymore */
 	kvfree(smap->buckets);
+
+	/* When local storage has special fields, callbacks for
+	 * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using
+	 * the map BTF record, we need to execute an RCU barrier to wait for
+	 * them as the record will be freed right after our map_free callback.
+	 */
+	if (!IS_ERR_OR_NULL(smap->map.record)) {
+		rcu_barrier_tasks_trace();
+		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
+		 * is true, because while call_rcu invocation is skipped in that
+		 * case in bpf_selem_free_tasks_trace_rcu (and all local storage
+		 * maps pass use_trace_rcu = true), there can be call_rcu
+		 * callbacks based on use_trace_rcu = false in the earlier while
+		 * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock
+		 * called from owner's free path.
+		 */
+		rcu_barrier();
+	}
 	bpf_map_area_free(smap);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index da117a2a83b2..eb50025b03c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1063,7 +1063,11 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY &&
-				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+				    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+				    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+				    map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+				    map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+				    map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
 					ret = -EOPNOTSUPP;
 					goto free_map_tab;
 				}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a856896e835a..bf580f246a01 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7222,22 +7222,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		break;
 	case BPF_MAP_TYPE_SK_STORAGE:
 		if (func_id != BPF_FUNC_sk_storage_get &&
-		    func_id != BPF_FUNC_sk_storage_delete)
+		    func_id != BPF_FUNC_sk_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_INODE_STORAGE:
 		if (func_id != BPF_FUNC_inode_storage_get &&
-		    func_id != BPF_FUNC_inode_storage_delete)
+		    func_id != BPF_FUNC_inode_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_TASK_STORAGE:
 		if (func_id != BPF_FUNC_task_storage_get &&
-		    func_id != BPF_FUNC_task_storage_delete)
+		    func_id != BPF_FUNC_task_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGRP_STORAGE:
 		if (func_id != BPF_FUNC_cgrp_storage_get &&
-		    func_id != BPF_FUNC_cgrp_storage_delete)
+		    func_id != BPF_FUNC_cgrp_storage_delete &&
+		    func_id != BPF_FUNC_kptr_xchg)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_BLOOM_FILTER:
-- 
cgit v1.2.3


From f71f8530494bb5ab43d3369ef0ce8373eb1ee077 Mon Sep 17 00:00:00 2001
From: Tero Kristo <tero.kristo@linux.intel.com>
Date: Thu, 2 Mar 2023 13:46:13 +0200
Subject: bpf: Add support for absolute value BPF timers

Add a new flag BPF_F_TIMER_ABS that can be passed to bpf_timer_start()
to start an absolute value timer instead of the default relative value.
This makes the timer expire at an exact point in time, instead of a time
with latencies induced by both the BPF and timer subsystems.

Suggested-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Signed-off-by: Tero Kristo <tero.kristo@linux.intel.com>
Link: https://lore.kernel.org/r/20230302114614.2985072-2-tero.kristo@linux.intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 15 +++++++++++++++
 kernel/bpf/helpers.c           | 11 +++++++++--
 tools/include/uapi/linux/bpf.h | 15 +++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c9699304aed2..976b194eb775 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4969,6 +4969,12 @@ union bpf_attr {
  *		different maps if key/value layout matches across maps.
  *		Every bpf_timer_set_callback() can have different callback_fn.
  *
+ *		*flags* can be one of:
+ *
+ *		**BPF_F_TIMER_ABS**
+ *			Start the timer in absolute expire value instead of the
+ *			default relative one.
+ *
  *	Return
  *		0 on success.
  *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -7097,4 +7103,13 @@ struct bpf_core_relo {
 	enum bpf_core_relo_kind kind;
 };
 
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ *       relative to current time.
+ */
+enum {
+	BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6fc0d6c44e4c..12f12e879bcf 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1264,10 +1264,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 {
 	struct bpf_hrtimer *t;
 	int ret = 0;
+	enum hrtimer_mode mode;
 
 	if (in_nmi())
 		return -EOPNOTSUPP;
-	if (flags)
+	if (flags > BPF_F_TIMER_ABS)
 		return -EINVAL;
 	__bpf_spin_lock_irqsave(&timer->lock);
 	t = timer->timer;
@@ -1275,7 +1276,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
 		ret = -EINVAL;
 		goto out;
 	}
-	hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+
+	if (flags & BPF_F_TIMER_ABS)
+		mode = HRTIMER_MODE_ABS_SOFT;
+	else
+		mode = HRTIMER_MODE_REL_SOFT;
+
+	hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
 out:
 	__bpf_spin_unlock_irqrestore(&timer->lock);
 	return ret;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c9699304aed2..976b194eb775 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4969,6 +4969,12 @@ union bpf_attr {
  *		different maps if key/value layout matches across maps.
  *		Every bpf_timer_set_callback() can have different callback_fn.
  *
+ *		*flags* can be one of:
+ *
+ *		**BPF_F_TIMER_ABS**
+ *			Start the timer in absolute expire value instead of the
+ *			default relative one.
+ *
  *	Return
  *		0 on success.
  *		**-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
@@ -7097,4 +7103,13 @@ struct bpf_core_relo {
 	enum bpf_core_relo_kind kind;
 };
 
+/*
+ * Flags to control bpf_timer_start() behaviour.
+ *     - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is
+ *       relative to current time.
+ */
+enum {
+	BPF_F_TIMER_ABS = (1ULL << 0),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 20c09d92faeefb8536f705d3a4629e0dc314c8a1 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 2 Mar 2023 20:14:43 -0800
Subject: bpf: Introduce kptr_rcu.

The life time of certain kernel structures like 'struct cgroup' is protected by RCU.
Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
Dereference of other kptr-s returns PTR_UNTRUSTED.

For example:
struct map_value {
   struct cgroup __kptr *cgrp;
};

SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
{
  struct cgroup *cg, *cg2;

  cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
  bpf_kptr_xchg(&v->cgrp, cg);

  cg2 = v->cgrp; // This is new feature introduced by this patch.
  // cg2 is PTR_MAYBE_NULL | MEM_RCU.
  // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero

  if (cg2)
    bpf_cgroup_ancestor(cg2, level); // safe to do.
}

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230303041446.3630-4-alexei.starovoitov@gmail.com
---
 Documentation/bpf/kfuncs.rst                       | 12 +++--
 include/linux/btf.h                                |  2 +-
 kernel/bpf/helpers.c                               |  6 ++-
 kernel/bpf/verifier.c                              | 55 ++++++++++++++++++----
 net/bpf/test_run.c                                 |  3 +-
 .../selftests/bpf/progs/cgrp_kfunc_failure.c       |  2 +-
 tools/testing/selftests/bpf/progs/map_kptr_fail.c  |  4 +-
 tools/testing/selftests/bpf/verifier/calls.c       |  2 +-
 tools/testing/selftests/bpf/verifier/map_kptr.c    |  2 +-
 9 files changed, 65 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index b5d9b0d446bc..69eccf6f98ef 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -249,11 +249,13 @@ added later.
 2.4.8 KF_RCU flag
 -----------------
 
-The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument.
-When used together with KF_ACQUIRE, it indicates the kfunc should have a
-single argument which must be a trusted argument or a MEM_RCU pointer.
-The argument may have reference count of 0 and the kfunc must take this
-into consideration.
+The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
+KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
+that the objects are valid and there is no use-after-free. The pointers are not
+NULL, but the object's refcount could have reached zero. The kfuncs need to
+consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE
+pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely
+also be KF_RET_NULL.
 
 .. _KF_deprecated_flag:
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 49e0fe6d8274..556b3e2e7471 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -70,7 +70,7 @@
 #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
-#define KF_RCU          (1 << 7) /* kfunc only takes rcu pointer arguments */
+#define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 12f12e879bcf..637ac4e92e75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2163,8 +2163,10 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
 	if (level > cgrp->level || level < 0)
 		return NULL;
 
+	/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
 	ancestor = cgrp->ancestors[level];
-	cgroup_get(ancestor);
+	if (!cgroup_tryget(ancestor))
+		return NULL;
 	return ancestor;
 }
 
@@ -2382,7 +2384,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b834f3d2d81a..a095055d7ef4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4218,7 +4218,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
 			       struct bpf_reg_state *reg, u32 regno)
 {
 	const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
-	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
+	int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
 	const char *reg_name = "";
 
 	/* Only unreferenced case accepts untrusted pointers */
@@ -4285,6 +4285,34 @@ bad_type:
 	return -EINVAL;
 }
 
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+	return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(struct, cgroup)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+	if (!btf_is_kernel(btf))
+		return false;
+	return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+	const struct btf_field_kptr *kptr = &field->kptr;
+
+	return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+}
+
 static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 				 int value_regno, int insn_idx,
 				 struct btf_field *kptr_field)
@@ -4319,7 +4347,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
 		 * value from map as PTR_TO_BTF_ID, with the correct type.
 		 */
 		mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
-				kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+				kptr_field->kptr.btf_id,
+				rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
+				PTR_MAYBE_NULL | MEM_RCU :
+				PTR_MAYBE_NULL | PTR_UNTRUSTED);
 		/* For mark_ptr_or_null_reg */
 		val_reg->id = ++env->id_gen;
 	} else if (class == BPF_STX) {
@@ -5163,10 +5194,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	 * An RCU-protected pointer can also be deemed trusted if we are in an
 	 * RCU read region. This case is handled below.
 	 */
-	if (nested_ptr_is_trusted(env, reg, off))
+	if (nested_ptr_is_trusted(env, reg, off)) {
 		flag |= PTR_TRUSTED;
-	else
+		/*
+		 * task->cgroups is trusted. It provides a stronger guarantee
+		 * than __rcu tag on 'cgroups' field in 'struct task_struct'.
+		 * Clear MEM_RCU in such case.
+		 */
+		flag &= ~MEM_RCU;
+	} else {
 		flag &= ~PTR_TRUSTED;
+	}
 
 	if (flag & MEM_RCU) {
 		/* Mark value register as MEM_RCU only if it is protected by
@@ -5175,11 +5213,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
 		 * it could be null in some cases.
 		 */
-		if (!env->cur_state->active_rcu_lock ||
-		    !(is_trusted_reg(reg) || is_rcu_reg(reg)))
-			flag &= ~MEM_RCU;
-		else
+		if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
 			flag |= PTR_MAYBE_NULL;
+		else
+			flag &= ~MEM_RCU;
 	} else if (reg->type & MEM_RCU) {
 		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
 		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
@@ -9676,7 +9713,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			return -EINVAL;
 		}
 
-		if (is_kfunc_trusted_args(meta) &&
+		if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
 		    (register_is_null(reg) || type_may_be_null(reg->type))) {
 			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
 			return -EACCES;
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6f3d654b3339..6a8b33a103a4 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -737,6 +737,7 @@ __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
 
 __bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
 {
+	/* p != NULL, but p->cnt could be 0 */
 }
 
 __bpf_kfunc void bpf_kfunc_call_test_destructive(void)
@@ -784,7 +785,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
-BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
 BTF_SET8_END(test_sk_check_kfunc_ids)
diff --git a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
index 4ad7fe24966d..b42291ed9586 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_kfunc_failure.c
@@ -205,7 +205,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
 }
 
 SEC("tp_btf/cgroup_mkdir")
-__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket")
+__failure __msg("expects refcounted")
 int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path)
 {
 	struct __cgrps_kfunc_map_value *v;
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_fail.c b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
index e19e2a5f38cf..08f9ec18c345 100644
--- a/tools/testing/selftests/bpf/progs/map_kptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/map_kptr_fail.c
@@ -281,7 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_")
+__failure __msg("R1 type=rcu_ptr_or_null_ expected=percpu_ptr_")
 int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
 {
 	struct map_value *v;
@@ -316,7 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
 }
 
 SEC("?tc")
-__failure __msg("R2 type=untrusted_ptr_ expected=ptr_")
+__failure __msg("R2 must be referenced")
 int reject_untrusted_xchg(struct __sk_buff *ctx)
 {
 	struct prog_test_ref_kfunc *p;
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 289ed202ec66..9a326a800e5c 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -243,7 +243,7 @@
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "R1 must be referenced",
+	.errstr = "R1 must be",
 },
 {
 	"calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
diff --git a/tools/testing/selftests/bpf/verifier/map_kptr.c b/tools/testing/selftests/bpf/verifier/map_kptr.c
index 6914904344c0..d775ccb01989 100644
--- a/tools/testing/selftests/bpf/verifier/map_kptr.c
+++ b/tools/testing/selftests/bpf/verifier/map_kptr.c
@@ -336,7 +336,7 @@
 	.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	.fixup_map_kptr = { 1 },
 	.result = REJECT,
-	.errstr = "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_",
+	.errstr = "R1 type=rcu_ptr_or_null_ expected=percpu_ptr_",
 },
 {
 	"map_kptr: ref: reject off != 0",
-- 
cgit v1.2.3


From 6fcd486b3a0a628c41f12b3a7329a18a2c74b351 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Thu, 2 Mar 2023 20:14:46 -0800
Subject: bpf: Refactor RCU enforcement in the verifier.

bpf_rcu_read_lock/unlock() are only available in clang compiled kernels. Lack
of such key mechanism makes it impossible for sleepable bpf programs to use RCU
pointers.

Allow bpf_rcu_read_lock/unlock() in GCC compiled kernels (though GCC doesn't
support btf_type_tag yet) and allowlist certain field dereferences in important
data structures like task_struct, cgroup, socket that are used by sleepable
programs either as RCU pointer or full trusted pointer (which is valid outside
of RCU CS). Use BTF_TYPE_SAFE_RCU and BTF_TYPE_SAFE_TRUSTED macros for such
tagging. They will be removed once GCC supports btf_type_tag.

With that refactor check_ptr_to_btf_access(). Make it strict in enforcing
PTR_TRUSTED and PTR_UNTRUSTED while deprecating old PTR_TO_BTF_ID without
modifier flags. There is a chance that this strict enforcement might break
existing programs (especially on GCC compiled kernels), but this cleanup has to
start sooner rather than later. Note PTR_TO_CTX access still yields old deprecated
PTR_TO_BTF_ID. Once it's converted to strict PTR_TRUSTED or PTR_UNTRUSTED the
kfuncs and helpers will be able to default to KF_TRUSTED_ARGS. KF_RCU will
remain as a weaker version of KF_TRUSTED_ARGS where obj refcnt could be 0.

Adjust rcu_read_lock selftest to run on gcc and clang compiled kernels.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230303041446.3630-7-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h                                |   2 +-
 include/linux/bpf_verifier.h                       |   1 -
 kernel/bpf/btf.c                                   |  16 +-
 kernel/bpf/cpumask.c                               |  40 ++---
 kernel/bpf/verifier.c                              | 178 ++++++++++++++-------
 .../selftests/bpf/prog_tests/cgrp_local_storage.c  |  14 +-
 .../selftests/bpf/prog_tests/rcu_read_lock.c       |  16 +-
 .../selftests/bpf/progs/cgrp_ls_sleepable.c        |   4 +-
 .../testing/selftests/bpf/progs/cpumask_failure.c  |   2 +-
 .../selftests/bpf/progs/nested_trust_failure.c     |   2 +-
 tools/testing/selftests/bpf/progs/rcu_read_lock.c  |   6 +-
 tools/testing/selftests/bpf/verifier/calls.c       |   2 +-
 12 files changed, 173 insertions(+), 110 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 23ec684e660d..d3456804f7aa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2279,7 +2279,7 @@ struct bpf_core_ctx {
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off);
+				int off, const char *suffix);
 
 bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 			       const struct btf *reg_btf, u32 reg_id,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b26ff2a8f63b..18538bad2b8c 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -537,7 +537,6 @@ struct bpf_verifier_env {
 	bool bypass_spec_v1;
 	bool bypass_spec_v4;
 	bool seen_direct_write;
-	bool rcu_tag_supported;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index c5e1d6955491..a8cb09e5973b 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6163,6 +6163,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 	const char *tname, *mname, *tag_value;
 	u32 vlen, elem_id, mid;
 
+	*flag = 0;
 again:
 	tname = __btf_name_by_offset(btf, t->name_off);
 	if (!btf_type_is_struct(t)) {
@@ -6329,6 +6330,15 @@ error:
 		 * of this field or inside of this struct
 		 */
 		if (btf_type_is_struct(mtype)) {
+			if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
+			    btf_type_vlen(mtype) != 1)
+				/*
+				 * walking unions yields untrusted pointers
+				 * with exception of __bpf_md_ptr and other
+				 * unions with a single member
+				 */
+				*flag |= PTR_UNTRUSTED;
+
 			/* our field must be inside that union or struct */
 			t = mtype;
 
@@ -6373,7 +6383,7 @@ error:
 			stype = btf_type_skip_modifiers(btf, mtype->type, &id);
 			if (btf_type_is_struct(stype)) {
 				*next_btf_id = id;
-				*flag = tmp_flag;
+				*flag |= tmp_flag;
 				return WALK_PTR;
 			}
 		}
@@ -8357,7 +8367,7 @@ out:
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off)
+				int off, const char *suffix)
 {
 	struct btf *btf = reg->btf;
 	const struct btf_type *walk_type, *safe_type;
@@ -8374,7 +8384,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 
 	tname = btf_name_by_offset(btf, walk_type->name_off);
 
-	ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+	ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
 	if (ret < 0)
 		return false;
 
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 2b3fbbfebdc5..b6587ec40f1b 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -427,26 +427,26 @@ BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
 BTF_SET8_END(cpumask_kfunc_btf_ids)
 
 static const struct btf_kfunc_id_set cpumask_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a095055d7ef4..c2adf3c24c64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5073,29 +5073,76 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
 	return 0;
 }
 
-#define BTF_TYPE_SAFE_NESTED(__type)  __PASTE(__type, __safe_fields)
+#define BTF_TYPE_SAFE_RCU(__type)  __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_TRUSTED(__type)  __PASTE(__type, __safe_trusted)
 
-BTF_TYPE_SAFE_NESTED(struct task_struct) {
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
+
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
 	const cpumask_t *cpus_ptr;
 	struct css_set __rcu *cgroups;
+	struct task_struct __rcu *real_parent;
+	struct task_struct *group_leader;
 };
 
-BTF_TYPE_SAFE_NESTED(struct css_set) {
+BTF_TYPE_SAFE_RCU(struct css_set) {
 	struct cgroup *dfl_cgrp;
 };
 
-static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
-				  struct bpf_reg_state *reg,
-				  int off)
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+	__bpf_md_ptr(struct seq_file *, seq);
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct task_struct *, task);
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+	struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+	struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct dentry) {
+	/* no negative dentry-s in places where bpf can see it */
+	struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct socket) {
+	struct sock *sk;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+			struct bpf_reg_state *reg,
+			int off)
 {
-	/* If its parent is not trusted, it can't regain its trusted status. */
-	if (!is_trusted_reg(reg))
-		return false;
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
 
-	BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
-	BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct css_set));
+	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu");
+}
 
-	return btf_nested_type_is_trusted(&env->log, reg, off);
+static bool type_is_trusted(struct bpf_verifier_env *env,
+			    struct bpf_reg_state *reg,
+			    int off)
+{
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
+
+	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted");
 }
 
 static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5181,49 +5228,58 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	if (ret < 0)
 		return ret;
 
-	/* If this is an untrusted pointer, all pointers formed by walking it
-	 * also inherit the untrusted flag.
-	 */
-	if (type_flag(reg->type) & PTR_UNTRUSTED)
-		flag |= PTR_UNTRUSTED;
+	if (ret != PTR_TO_BTF_ID) {
+		/* just mark; */
 
-	/* By default any pointer obtained from walking a trusted pointer is no
-	 * longer trusted, unless the field being accessed has explicitly been
-	 * marked as inheriting its parent's state of trust.
-	 *
-	 * An RCU-protected pointer can also be deemed trusted if we are in an
-	 * RCU read region. This case is handled below.
-	 */
-	if (nested_ptr_is_trusted(env, reg, off)) {
-		flag |= PTR_TRUSTED;
-		/*
-		 * task->cgroups is trusted. It provides a stronger guarantee
-		 * than __rcu tag on 'cgroups' field in 'struct task_struct'.
-		 * Clear MEM_RCU in such case.
+	} else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+		/* If this is an untrusted pointer, all pointers formed by walking it
+		 * also inherit the untrusted flag.
+		 */
+		flag = PTR_UNTRUSTED;
+
+	} else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+		/* By default any pointer obtained from walking a trusted pointer is no
+		 * longer trusted, unless the field being accessed has explicitly been
+		 * marked as inheriting its parent's state of trust (either full or RCU).
+		 * For example:
+		 * 'cgroups' pointer is untrusted if task->cgroups dereference
+		 * happened in a sleepable program outside of bpf_rcu_read_lock()
+		 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+		 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+		 *
+		 * A regular RCU-protected pointer with __rcu tag can also be deemed
+		 * trusted if we are in an RCU CS. Such pointer can be NULL.
 		 */
-		flag &= ~MEM_RCU;
+		if (type_is_trusted(env, reg, off)) {
+			flag |= PTR_TRUSTED;
+		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+			if (type_is_rcu(env, reg, off)) {
+				/* ignore __rcu tag and mark it MEM_RCU */
+				flag |= MEM_RCU;
+			} else if (flag & MEM_RCU) {
+				/* __rcu tagged pointers can be NULL */
+				flag |= PTR_MAYBE_NULL;
+			} else if (flag & (MEM_PERCPU | MEM_USER)) {
+				/* keep as-is */
+			} else {
+				/* walking unknown pointers yields untrusted pointer */
+				flag = PTR_UNTRUSTED;
+			}
+		} else {
+			/*
+			 * If not in RCU CS or MEM_RCU pointer can be NULL then
+			 * aggressively mark as untrusted otherwise such
+			 * pointers will be plain PTR_TO_BTF_ID without flags
+			 * and will be allowed to be passed into helpers for
+			 * compat reasons.
+			 */
+			flag = PTR_UNTRUSTED;
+		}
 	} else {
+		/* Old compat. Deprecated */
 		flag &= ~PTR_TRUSTED;
 	}
 
-	if (flag & MEM_RCU) {
-		/* Mark value register as MEM_RCU only if it is protected by
-		 * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
-		 * itself can already indicate trustedness inside the rcu
-		 * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
-		 * it could be null in some cases.
-		 */
-		if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
-			flag |= PTR_MAYBE_NULL;
-		else
-			flag &= ~MEM_RCU;
-	} else if (reg->type & MEM_RCU) {
-		/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
-		 * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
-		 */
-		flag |= PTR_UNTRUSTED;
-	}
-
 	if (atype == BPF_READ && value_regno >= 0)
 		mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
 
@@ -10049,10 +10105,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
-	if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
-		verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
-		return -EACCES;
-	}
 
 	if (env->cur_state->active_rcu_lock) {
 		struct bpf_func_state *state;
@@ -14911,8 +14963,22 @@ static int do_check(struct bpf_verifier_env *env)
 				 * src_reg == stack|map in some other branch.
 				 * Reject it.
 				 */
-				verbose(env, "same insn cannot be used with different pointers\n");
-				return -EINVAL;
+				if (base_type(src_reg_type) == PTR_TO_BTF_ID &&
+				    base_type(*prev_src_type) == PTR_TO_BTF_ID) {
+					/*
+					 * Have to support a use case when one path through
+					 * the program yields TRUSTED pointer while another
+					 * is UNTRUSTED. Fallback to UNTRUSTED to generate
+					 * BPF_PROBE_MEM.
+					 */
+					*prev_src_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+				} else {
+					verbose(env,
+						"The same insn cannot be used with different pointers: %s",
+						reg_type_str(env, src_reg_type));
+					verbose(env, " != %s\n", reg_type_str(env, *prev_src_type));
+					return -EINVAL;
+				}
 			}
 
 		} else if (class == BPF_STX) {
@@ -17984,8 +18050,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 	env->bypass_spec_v1 = bpf_bypass_spec_v1();
 	env->bypass_spec_v4 = bpf_bypass_spec_v4();
 	env->bpf_capable = bpf_capable();
-	env->rcu_tag_supported = btf_vmlinux &&
-		btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
 
 	if (is_priv)
 		env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
index 2cc759956e3b..63e776f4176e 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgrp_local_storage.c
@@ -193,7 +193,7 @@ out:
 	cgrp_ls_sleepable__destroy(skel);
 }
 
-static void test_no_rcu_lock(__u64 cgroup_id)
+static void test_yes_rcu_lock(__u64 cgroup_id)
 {
 	struct cgrp_ls_sleepable *skel;
 	int err;
@@ -204,7 +204,7 @@ static void test_no_rcu_lock(__u64 cgroup_id)
 
 	skel->bss->target_pid = syscall(SYS_gettid);
 
-	bpf_program__set_autoload(skel->progs.no_rcu_lock, true);
+	bpf_program__set_autoload(skel->progs.yes_rcu_lock, true);
 	err = cgrp_ls_sleepable__load(skel);
 	if (!ASSERT_OK(err, "skel_load"))
 		goto out;
@@ -220,7 +220,7 @@ out:
 	cgrp_ls_sleepable__destroy(skel);
 }
 
-static void test_rcu_lock(void)
+static void test_no_rcu_lock(void)
 {
 	struct cgrp_ls_sleepable *skel;
 	int err;
@@ -229,7 +229,7 @@ static void test_rcu_lock(void)
 	if (!ASSERT_OK_PTR(skel, "skel_open"))
 		return;
 
-	bpf_program__set_autoload(skel->progs.yes_rcu_lock, true);
+	bpf_program__set_autoload(skel->progs.no_rcu_lock, true);
 	err = cgrp_ls_sleepable__load(skel);
 	ASSERT_ERR(err, "skel_load");
 
@@ -256,10 +256,10 @@ void test_cgrp_local_storage(void)
 		test_negative();
 	if (test__start_subtest("cgroup_iter_sleepable"))
 		test_cgroup_iter_sleepable(cgroup_fd, cgroup_id);
+	if (test__start_subtest("yes_rcu_lock"))
+		test_yes_rcu_lock(cgroup_id);
 	if (test__start_subtest("no_rcu_lock"))
-		test_no_rcu_lock(cgroup_id);
-	if (test__start_subtest("rcu_lock"))
-		test_rcu_lock();
+		test_no_rcu_lock();
 
 	close(cgroup_fd);
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
index 447d8560ecb6..3f1f58d3a729 100644
--- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c
@@ -25,10 +25,10 @@ static void test_success(void)
 
 	bpf_program__set_autoload(skel->progs.get_cgroup_id, true);
 	bpf_program__set_autoload(skel->progs.task_succ, true);
-	bpf_program__set_autoload(skel->progs.no_lock, true);
 	bpf_program__set_autoload(skel->progs.two_regions, true);
 	bpf_program__set_autoload(skel->progs.non_sleepable_1, true);
 	bpf_program__set_autoload(skel->progs.non_sleepable_2, true);
+	bpf_program__set_autoload(skel->progs.task_trusted_non_rcuptr, true);
 	err = rcu_read_lock__load(skel);
 	if (!ASSERT_OK(err, "skel_load"))
 		goto out;
@@ -69,6 +69,7 @@ out:
 
 static const char * const inproper_region_tests[] = {
 	"miss_lock",
+	"no_lock",
 	"miss_unlock",
 	"non_sleepable_rcu_mismatch",
 	"inproper_sleepable_helper",
@@ -99,7 +100,6 @@ out:
 }
 
 static const char * const rcuptr_misuse_tests[] = {
-	"task_untrusted_non_rcuptr",
 	"task_untrusted_rcuptr",
 	"cross_rcu_region",
 };
@@ -128,17 +128,8 @@ out:
 
 void test_rcu_read_lock(void)
 {
-	struct btf *vmlinux_btf;
 	int cgroup_fd;
 
-	vmlinux_btf = btf__load_vmlinux_btf();
-	if (!ASSERT_OK_PTR(vmlinux_btf, "could not load vmlinux BTF"))
-		return;
-	if (btf__find_by_name_kind(vmlinux_btf, "rcu", BTF_KIND_TYPE_TAG) < 0) {
-		test__skip();
-		goto out;
-	}
-
 	cgroup_fd = test__join_cgroup("/rcu_read_lock");
 	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /rcu_read_lock"))
 		goto out;
@@ -153,6 +144,5 @@ void test_rcu_read_lock(void)
 	if (test__start_subtest("negative_tests_rcuptr_misuse"))
 		test_rcuptr_misuse();
 	close(cgroup_fd);
-out:
-	btf__free(vmlinux_btf);
+out:;
 }
diff --git a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
index 2d11ed528b6f..7615dc23d301 100644
--- a/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
+++ b/tools/testing/selftests/bpf/progs/cgrp_ls_sleepable.c
@@ -49,7 +49,7 @@ int no_rcu_lock(void *ctx)
 	if (task->pid != target_pid)
 		return 0;
 
-	/* ptr_to_btf_id semantics. should work. */
+	/* task->cgroups is untrusted in sleepable prog outside of RCU CS */
 	cgrp = task->cgroups->dfl_cgrp;
 	ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0,
 				   BPF_LOCAL_STORAGE_GET_F_CREATE);
@@ -71,7 +71,7 @@ int yes_rcu_lock(void *ctx)
 
 	bpf_rcu_read_lock();
 	cgrp = task->cgroups->dfl_cgrp;
-	/* cgrp is untrusted and cannot pass to bpf_cgrp_storage_get() helper. */
+	/* cgrp is trusted under RCU CS */
 	ptr = bpf_cgrp_storage_get(&map_a, cgrp, 0, BPF_LOCAL_STORAGE_GET_F_CREATE);
 	if (ptr)
 		cgroup_id = cgrp->kn->id;
diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c
index 33e8e86dd090..c16f7563b84e 100644
--- a/tools/testing/selftests/bpf/progs/cpumask_failure.c
+++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c
@@ -44,7 +44,7 @@ int BPF_PROG(test_alloc_double_release, struct task_struct *task, u64 clone_flag
 }
 
 SEC("tp_btf/task_newtask")
-__failure __msg("bpf_cpumask_acquire args#0 expected pointer to STRUCT bpf_cpumask")
+__failure __msg("must be referenced")
 int BPF_PROG(test_acquire_wrong_cpumask, struct task_struct *task, u64 clone_flags)
 {
 	struct bpf_cpumask *cpumask;
diff --git a/tools/testing/selftests/bpf/progs/nested_trust_failure.c b/tools/testing/selftests/bpf/progs/nested_trust_failure.c
index 14aff7676436..0d1aa6bbace4 100644
--- a/tools/testing/selftests/bpf/progs/nested_trust_failure.c
+++ b/tools/testing/selftests/bpf/progs/nested_trust_failure.c
@@ -17,7 +17,7 @@ char _license[] SEC("license") = "GPL";
  */
 
 SEC("tp_btf/task_newtask")
-__failure __msg("R2 must be referenced or trusted")
+__failure __msg("R2 must be")
 int BPF_PROG(test_invalid_nested_user_cpus, struct task_struct *task, u64 clone_flags)
 {
 	bpf_cpumask_test_cpu(0, task->user_cpus_ptr);
diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
index 5cecbdbbb16e..7250bb76d18a 100644
--- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c
+++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c
@@ -81,7 +81,7 @@ int no_lock(void *ctx)
 {
 	struct task_struct *task, *real_parent;
 
-	/* no bpf_rcu_read_lock(), old code still works */
+	/* old style ptr_to_btf_id is not allowed in sleepable */
 	task = bpf_get_current_task_btf();
 	real_parent = task->real_parent;
 	(void)bpf_task_storage_get(&map_a, real_parent, 0, 0);
@@ -286,13 +286,13 @@ out:
 }
 
 SEC("?fentry.s/" SYS_PREFIX "sys_getpgid")
-int task_untrusted_non_rcuptr(void *ctx)
+int task_trusted_non_rcuptr(void *ctx)
 {
 	struct task_struct *task, *group_leader;
 
 	task = bpf_get_current_task_btf();
 	bpf_rcu_read_lock();
-	/* the pointer group_leader marked as untrusted */
+	/* the pointer group_leader is explicitly marked as trusted */
 	group_leader = task->real_parent->group_leader;
 	(void)bpf_task_storage_get(&map_a, group_leader, 0, 0);
 	bpf_rcu_read_unlock();
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 9a326a800e5c..5702fc9761ef 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -181,7 +181,7 @@
 	},
 	.result_unpriv = REJECT,
 	.result = REJECT,
-	.errstr = "negative offset ptr_ ptr R1 off=-4 disallowed",
+	.errstr = "ptr R1 off=-4 disallowed",
 },
 {
 	"calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset",
-- 
cgit v1.2.3


From e768e3c5aab44ee63f58649d4c8cbbb3270e5c06 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Fri, 3 Mar 2023 15:15:42 +0100
Subject: bpf: Use separate RCU callbacks for freeing selem

Martin suggested that instead of using a byte in the hole (which he has
a use for in his future patch) in bpf_local_storage_elem, we can
dispatch a different call_rcu callback based on whether we need to free
special fields in bpf_local_storage_elem data. The free path, described
in commit 9db44fdd8105 ("bpf: Support kptrs in local storage maps"),
only waits for call_rcu callbacks when there are special (kptrs, etc.)
fields in the map value, hence it is necessary that we only access
smap in this case.

Therefore, dispatch different RCU callbacks based on whether the BPF map
has a valid btf_record, so that smap's btf_record is dereferenced and
used only when it is valid.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20230303141542.300068-1-memxor@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 ---
 kernel/bpf/bpf_local_storage.c    | 79 ++++++++++++++++++++++++---------------
 2 files changed, 49 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 0fe92986412b..6d37a40cd90e 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -74,12 +74,6 @@ struct bpf_local_storage_elem {
 	struct hlist_node snode;	/* Linked to bpf_local_storage */
 	struct bpf_local_storage __rcu *local_storage;
 	struct rcu_head rcu;
-	bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU
-			    * callbacks? bpf_local_storage_map_free only
-			    * executes rcu_barrier when there are special
-			    * fields, this field remembers that to ensure we
-			    * don't access already freed smap in sdata.
-			    */
 	/* 8 bytes hole */
 	/* The data is stored in another cacheline to minimize
 	 * the number of cachelines access during a cache hit.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 2bdd722fe293..3d320393a12c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -109,30 +109,36 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 		kfree_rcu(local_storage, rcu);
 }
 
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_fields_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
+	struct bpf_local_storage_map *smap;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	/* The can_use_smap bool is set whenever we need to free additional
-	 * fields in selem data before freeing selem. bpf_local_storage_map_free
-	 * only executes rcu_barrier to wait for RCU callbacks when it has
-	 * special fields, hence we can only conditionally dereference smap, as
-	 * by this time the map might have already been freed without waiting
-	 * for our call_rcu callback if it did not have any special fields.
-	 */
-	if (selem->can_use_smap)
-		bpf_obj_free_fields(SDATA(selem)->smap->map.record, SDATA(selem)->data);
+	/* protected by the rcu_barrier*() */
+	smap = rcu_dereference_protected(SDATA(selem)->smap, true);
+	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
 	kfree(selem);
 }
 
-static void bpf_selem_free_tasks_trace_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu)
 {
 	/* Free directly if Tasks Trace RCU GP also implies RCU GP */
 	if (rcu_trace_implies_rcu_gp())
-		bpf_selem_free_rcu(rcu);
+		bpf_selem_free_fields_rcu(rcu);
+	else
+		call_rcu(rcu, bpf_selem_free_fields_rcu);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(selem);
 	else
-		call_rcu(rcu, bpf_selem_free_rcu);
+		kfree_rcu(selem, rcu);
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -145,6 +151,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
+	struct btf_record *rec;
 	void *owner;
 
 	smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
@@ -185,10 +192,26 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	if (use_trace_rcu)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_tasks_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	/* A different RCU callback is chosen whenever we need to free
+	 * additional fields in selem data before freeing selem.
+	 * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU
+	 * callbacks when it has special fields, hence we can only conditionally
+	 * dereference smap, as by this time the map might have already been
+	 * freed without waiting for our call_rcu callback if it did not have
+	 * any special fields.
+	 */
+	rec = smap->map.record;
+	if (use_trace_rcu) {
+		if (!IS_ERR_OR_NULL(rec))
+			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
+		else
+			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+	} else {
+		if (!IS_ERR_OR_NULL(rec))
+			call_rcu(&selem->rcu, bpf_selem_free_fields_rcu);
+		else
+			kfree_rcu(selem, rcu);
+	}
 
 	return free_local_storage;
 }
@@ -256,11 +279,6 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	RCU_INIT_POINTER(SDATA(selem)->smap, smap);
 	hlist_add_head_rcu(&selem->map_node, &b->list);
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-
-	/* If our data will have special fields, smap will wait for us to use
-	 * its record in bpf_selem_free_* RCU callbacks before freeing itself.
-	 */
-	selem->can_use_smap = !IS_ERR_OR_NULL(smap->map.record);
 }
 
 void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
@@ -748,19 +766,20 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	kvfree(smap->buckets);
 
 	/* When local storage has special fields, callbacks for
-	 * bpf_selem_free_rcu and bpf_selem_free_tasks_trace_rcu will keep using
-	 * the map BTF record, we need to execute an RCU barrier to wait for
-	 * them as the record will be freed right after our map_free callback.
+	 * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will
+	 * keep using the map BTF record, we need to execute an RCU barrier to
+	 * wait for them as the record will be freed right after our map_free
+	 * callback.
 	 */
 	if (!IS_ERR_OR_NULL(smap->map.record)) {
 		rcu_barrier_tasks_trace();
 		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
 		 * is true, because while call_rcu invocation is skipped in that
-		 * case in bpf_selem_free_tasks_trace_rcu (and all local storage
-		 * maps pass use_trace_rcu = true), there can be call_rcu
-		 * callbacks based on use_trace_rcu = false in the earlier while
-		 * ((selem = ...)) loop or from bpf_local_storage_unlink_nolock
-		 * called from owner's free path.
+		 * case in bpf_selem_free_fields_trace_rcu (and all local
+		 * storage maps pass use_trace_rcu = true), there can be
+		 * call_rcu callbacks based on use_trace_rcu = false in the
+		 * while ((selem = ...)) loop above or when owner's free path
+		 * calls bpf_local_storage_unlink_nolock.
 		 */
 		rcu_barrier();
 	}
-- 
cgit v1.2.3


From d509c55cda22096a1836e35b03f66e1ef411c0c2 Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Wed, 1 Mar 2023 12:09:13 +0200
Subject: wifi: nl80211: Update the documentation of
 NL80211_SCAN_FLAG_COLOCATED_6GHZ

Add a detailed description of the NL80211_SCAN_FLAG_COLOCATED_6GHZ
flag.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.487ab04feb39.I5129fd61841332474693046241586f057b134c3c@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f14621a954e1..c22eeb18b996 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6510,8 +6510,14 @@ enum nl80211_timeout_reason {
  * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with
  *	%NL80211_ATTR_SCAN_FREQ_KHZ. This also means
  *	%NL80211_ATTR_SCAN_FREQUENCIES will not be included.
- * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by
- *	2.4/5 GHz APs
+ * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for collocated APs reported by
+ *	2.4/5 GHz APs. When the flag is set, the scan logic will use the
+ *	information from the RNR element found in beacons/probe responses
+ *	received on the 2.4/5 GHz channels to actively scan only the 6GHz
+ *	channels on which APs are expected to be found. Note that when not set,
+ *	the scan logic would scan all 6GHz channels, but since transmission of
+ *	probe requests on non PSC channels is limited, it is highly likely that
+ *	these channels would passively be scanned.
  */
 enum nl80211_scan_flags {
 	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
-- 
cgit v1.2.3


From 6ff9efcfc2dc256480b252321818e0111b9399a2 Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Wed, 1 Mar 2023 12:09:20 +0200
Subject: wifi: wireless: cleanup unused function parameters

In the past ftype was used for deciding about 6G DUP beacon, but the
logic was removed and ftype is not needed anymore.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.98d4761b809b.I255f5ecd77cb24fcf2f1641bb5833ea2d121296e@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  4 +---
 net/wireless/scan.c    | 21 +++++----------------
 2 files changed, 6 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f115b2550309..03b911abd772 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6814,13 +6814,11 @@ enum cfg80211_bss_frame_type {
  * @ie: IEs
  * @ielen: length of IEs
  * @band: enum nl80211_band of the channel
- * @ftype: frame type
  *
  * Returns the channel number, or -1 if none could be determined.
  */
 int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
-				    enum nl80211_band band,
-				    enum cfg80211_bss_frame_type ftype);
+				    enum nl80211_band band);
 
 /**
  * cfg80211_inform_bss_data - inform cfg80211 of a new BSS
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index c23709d852bc..a1382255fab3 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1810,8 +1810,7 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 }
 
 int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
-				    enum nl80211_band band,
-				    enum cfg80211_bss_frame_type ftype)
+				    enum nl80211_band band)
 {
 	const struct element *tmp;
 
@@ -1868,15 +1867,14 @@ EXPORT_SYMBOL(cfg80211_get_ies_channel_number);
 static struct ieee80211_channel *
 cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen,
 			 struct ieee80211_channel *channel,
-			 enum nl80211_bss_scan_width scan_width,
-			 enum cfg80211_bss_frame_type ftype)
+			 enum nl80211_bss_scan_width scan_width)
 {
 	u32 freq;
 	int channel_number;
 	struct ieee80211_channel *alt_channel;
 
 	channel_number = cfg80211_get_ies_channel_number(ie, ielen,
-							 channel->band, ftype);
+							 channel->band);
 
 	if (channel_number < 0) {
 		/* No channel information in frame payload */
@@ -1954,7 +1952,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 		return NULL;
 
 	channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan,
-					   data->scan_width, ftype);
+					   data->scan_width);
 	if (!channel)
 		return NULL;
 
@@ -2388,7 +2386,6 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 	size_t ielen, min_hdr_len = offsetof(struct ieee80211_mgmt,
 					     u.probe_resp.variable);
 	int bss_type;
-	enum cfg80211_bss_frame_type ftype;
 
 	BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) !=
 			offsetof(struct ieee80211_mgmt, u.beacon.variable));
@@ -2425,16 +2422,8 @@ cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy,
 			variable = ext->u.s1g_beacon.variable;
 	}
 
-	if (ieee80211_is_beacon(mgmt->frame_control))
-		ftype = CFG80211_BSS_FTYPE_BEACON;
-	else if (ieee80211_is_probe_resp(mgmt->frame_control))
-		ftype = CFG80211_BSS_FTYPE_PRESP;
-	else
-		ftype = CFG80211_BSS_FTYPE_UNKNOWN;
-
 	channel = cfg80211_get_bss_channel(wiphy, variable,
-					   ielen, data->chan, data->scan_width,
-					   ftype);
+					   ielen, data->chan, data->scan_width);
 	if (!channel)
 		return NULL;
 
-- 
cgit v1.2.3


From cbbaf2bb829b6c4ef911d4a725fc9b1fadc1e43f Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 1 Mar 2023 12:09:21 +0200
Subject: wifi: nl80211: add a command to enable/disable HW timestamping

Add a command to enable and disable HW timestamping of TM and FTM
frames. HW timestamping can be enabled for a specific mac address
or for all addresses.

The low level driver will indicate the number of peers for which HW
timestamping can be enabled concurrently, and this information will be
passed to userspace.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.05678d7b1c17.Iccc08869ea8156f1c71a3111a47f86dd56234bd0@changeid
[switch to needing netdev UP, minor edits]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 27 +++++++++++++++++++++++++++
 include/uapi/linux/nl80211.h | 22 ++++++++++++++++++++++
 net/wireless/nl80211.c       | 37 +++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h      | 17 +++++++++++++++++
 net/wireless/trace.h         | 25 +++++++++++++++++++++++++
 5 files changed, 128 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 03b911abd772..f0da61c6ec4b 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -827,6 +827,18 @@ struct cfg80211_fils_aad {
 	const u8 *anonce;
 };
 
+/**
+ * struct cfg80211_set_hw_timestamp - enable/disable HW timestamping
+ * @macaddr: peer MAC address. NULL to enable/disable HW timestamping for all
+ *	addresses.
+ * @enable: if set, enable HW timestamping for the specified MAC address.
+ *	Otherwise disable HW timestamping for the specified MAC address.
+ */
+struct cfg80211_set_hw_timestamp {
+	const u8 *macaddr;
+	bool enable;
+};
+
 /**
  * cfg80211_get_chandef_type - return old channel type from chandef
  * @chandef: the channel definition
@@ -4330,6 +4342,8 @@ struct mgmt_frame_regs {
  * @add_link_station: Add a link to a station.
  * @mod_link_station: Modify a link of a station.
  * @del_link_station: Remove a link of a station.
+ *
+ * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -4683,6 +4697,8 @@ struct cfg80211_ops {
 				    struct link_station_parameters *params);
 	int	(*del_link_station)(struct wiphy *wiphy, struct net_device *dev,
 				    struct link_station_del_parameters *params);
+	int	(*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev,
+				    struct cfg80211_set_hw_timestamp *hwts);
 };
 
 /*
@@ -5139,6 +5155,8 @@ struct wiphy_iftype_akm_suites {
 	int n_akm_suites;
 };
 
+#define CFG80211_HW_TIMESTAMP_ALL_PEERS	0xffff
+
 /**
  * struct wiphy - wireless hardware description
  * @mtx: mutex for the data (structures) of this device
@@ -5348,6 +5366,13 @@ struct wiphy_iftype_akm_suites {
  *	NL80211_MAX_NR_AKM_SUITES in order to avoid compatibility issues with
  *	legacy userspace and maximum allowed value is
  *	CFG80211_MAX_NUM_AKM_SUITES.
+ *
+ * @hw_timestamp_max_peers: maximum number of peers that the driver supports
+ *	enabling HW timestamping for concurrently. Setting this field to a
+ *	non-zero value indicates that the driver supports HW timestamping.
+ *	A value of %CFG80211_HW_TIMESTAMP_ALL_PEERS indicates the driver
+ *	supports enabling HW timestamping for all peers (i.e. no need to
+ *	specify a mac address).
  */
 struct wiphy {
 	struct mutex mtx;
@@ -5496,6 +5521,8 @@ struct wiphy {
 	u8 ema_max_profile_periodicity;
 	u16 max_num_akm_suites;
 
+	u16 hw_timestamp_max_peers;
+
 	char priv[] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index c22eeb18b996..c8520c150f9c 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1299,6 +1299,16 @@
  * @NL80211_CMD_MODIFY_LINK_STA: Modify a link of an MLD station
  * @NL80211_CMD_REMOVE_LINK_STA: Remove a link of an MLD station
  *
+ * @NL80211_CMD_SET_HW_TIMESTAMP: Enable/disable HW timestamping of Timing
+ *	measurement and Fine timing measurement frames. If %NL80211_ATTR_MAC
+ *	is included, enable/disable HW timestamping only for frames to/from the
+ *	specified MAC address. Otherwise enable/disable HW timestamping for
+ *	all TM/FTM frames (including ones that were enabled with specific MAC
+ *	address). If %NL80211_ATTR_HW_TIMESTAMP_ENABLED is not included, disable
+ *	HW timestamping.
+ *	The number of peers that HW timestamping can be enabled for concurrently
+ *	is indicated by %NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1550,6 +1560,8 @@ enum nl80211_commands {
 	NL80211_CMD_MODIFY_LINK_STA,
 	NL80211_CMD_REMOVE_LINK_STA,
 
+	NL80211_CMD_SET_HW_TIMESTAMP,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -2775,6 +2787,13 @@ enum nl80211_commands {
  *	indicates that the sub-channel is punctured. Higher 16 bits are
  *	reserved.
  *
+ * @NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS: Maximum number of peers that HW
+ *	timestamping can be enabled for concurrently (u16), a wiphy attribute.
+ *	A value of 0xffff indicates setting for all peers (i.e. not specifying
+ *	an address with %NL80211_CMD_SET_HW_TIMESTAMP) is supported.
+ * @NL80211_ATTR_HW_TIMESTAMP_ENABLED: Indicates whether HW timestamping should
+ *	be enabled or not (flag attribute).
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3306,6 +3325,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_PUNCT_BITMAP,
 
+	NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
+	NL80211_ATTR_HW_TIMESTAMP_ENABLED,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 112b4bb009c8..ab0497efdd37 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -806,6 +806,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT },
 	[NL80211_ATTR_PUNCT_BITMAP] = NLA_POLICY_RANGE(NLA_U8, 0, 0xffff),
+
+	[NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS] = { .type = NLA_U16 },
+	[NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -2964,6 +2967,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_MLO)
 			nla_put_flag(msg, NL80211_ATTR_MLO_SUPPORT);
 
+		if (rdev->wiphy.hw_timestamp_max_peers &&
+		    nla_put_u16(msg, NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
+				rdev->wiphy.hw_timestamp_max_peers))
+			goto nla_put_failure;
+
 		/* done */
 		state->split_start = 0;
 		break;
@@ -16162,6 +16170,29 @@ nl80211_remove_link_station(struct sk_buff *skb, struct genl_info *info)
 	return ret;
 }
 
+static int nl80211_set_hw_timestamp(struct sk_buff *skb,
+				    struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_set_hw_timestamp hwts = {};
+
+	if (!rdev->wiphy.hw_timestamp_max_peers)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_MAC] &&
+	    rdev->wiphy.hw_timestamp_max_peers != CFG80211_HW_TIMESTAMP_ALL_PEERS)
+		return -EOPNOTSUPP;
+
+	if (info->attrs[NL80211_ATTR_MAC])
+		hwts.macaddr = nla_data(info->attrs[NL80211_ATTR_MAC]);
+
+	hwts.enable =
+		nla_get_flag(info->attrs[NL80211_ATTR_HW_TIMESTAMP_ENABLED]);
+
+	return rdev_set_hw_timestamp(rdev, dev, &hwts);
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -17336,6 +17367,12 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
 					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
+	{
+		.cmd = NL80211_CMD_SET_HW_TIMESTAMP,
+		.doit = nl80211_set_hw_timestamp,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 13b209a8db28..2e497cf26ef2 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1494,4 +1494,21 @@ rdev_del_link_station(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int
+rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev,
+		      struct net_device *dev,
+		      struct cfg80211_set_hw_timestamp *hwts)
+{
+	struct wiphy *wiphy = &rdev->wiphy;
+	int ret;
+
+	if (!rdev->ops->set_hw_timestamp)
+		return -EOPNOTSUPP;
+
+	trace_rdev_set_hw_timestamp(wiphy, dev, hwts);
+	ret = rdev->ops->set_hw_timestamp(wiphy, dev, hwts);
+	trace_rdev_return_int(wiphy, ret);
+
+	return ret;
+}
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ca7474eec723..f3fcfc4fcce5 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3918,6 +3918,31 @@ TRACE_EVENT(rdev_del_link_station,
 		  __entry->link_id)
 );
 
+TRACE_EVENT(rdev_set_hw_timestamp,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 struct cfg80211_set_hw_timestamp *hwts),
+
+	TP_ARGS(wiphy, netdev, hwts),
+
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		MAC_ENTRY(macaddr)
+		__field(bool, enable)
+	),
+
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		MAC_ASSIGN(macaddr, hwts->macaddr);
+		__entry->enable = hwts->enable;
+	),
+
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac %pM, enable: %u",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr,
+		  __entry->enable)
+);
+
 #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From 81202305f7c282c356c337dded8472d884acd94b Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Wed, 1 Mar 2023 12:09:22 +0200
Subject: wifi: mac80211: add support for set_hw_timestamp command

Support the set_hw_timestamp callback for enabling and disabling HW
timestamping if the low level driver supports it.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.700ded7badde.Ib2f7c228256ce313a04d3d9f9ecc6c7b9aa602bb@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  6 ++++++
 net/mac80211/cfg.c     | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 219fd15893b0..6946c9d95aec 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4230,6 +4230,9 @@ struct ieee80211_prep_tx_info {
  *	Note that a sta can also be inserted or removed with valid links,
  *	i.e. passed to @sta_add/@sta_state with sta->valid_links not zero.
  *	In fact, cannot change from having valid_links and not having them.
+ * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is
+ *	not restored at HW reset by mac80211 so drivers need to take care of
+ *	that.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4589,6 +4592,9 @@ struct ieee80211_ops {
 				struct ieee80211_vif *vif,
 				struct ieee80211_sta *sta,
 				u16 old_links, u16 new_links);
+	int (*set_hw_timestamp)(struct ieee80211_hw *hw,
+				struct ieee80211_vif *vif,
+				struct cfg80211_set_hw_timestamp *hwts);
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 8eb342300868..7e90f4a81962 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4904,6 +4904,22 @@ ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev,
 	return ret;
 }
 
+static int ieee80211_set_hw_timestamp(struct wiphy *wiphy,
+				      struct net_device *dev,
+				      struct cfg80211_set_hw_timestamp *hwts)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+
+	if (!local->ops->set_hw_timestamp)
+		return -EOPNOTSUPP;
+
+	if (!check_sdata_in_driver(sdata))
+		return -EIO;
+
+	return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts);
+}
+
 const struct cfg80211_ops mac80211_config_ops = {
 	.add_virtual_intf = ieee80211_add_iface,
 	.del_virtual_intf = ieee80211_del_iface,
@@ -5014,4 +5030,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.add_link_station = ieee80211_add_link_station,
 	.mod_link_station = ieee80211_mod_link_station,
 	.del_link_station = ieee80211_del_link_station,
+	.set_hw_timestamp = ieee80211_set_hw_timestamp,
 };
-- 
cgit v1.2.3


From 4c532321bf90288dae6b07a3f52279bfde842a80 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 1 Mar 2023 12:09:23 +0200
Subject: wifi: cfg80211/mac80211: report link ID on control port RX

For control port RX, report the link ID for MLO.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.fe06dfc3791b.Iddcab94789cafe336417be406072ce8a6312fc2d@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h |  5 +++--
 net/mac80211/rx.c      |  2 +-
 net/wireless/nl80211.c | 15 ++++++++++-----
 net/wireless/trace.h   | 11 +++++++----
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f0da61c6ec4b..7cebba1c4135 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -8126,6 +8126,7 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
  *	responsible for any cleanup.  The caller must also ensure that
  *	skb->protocol is set appropriately.
  * @unencrypted: Whether the frame was received unencrypted
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
  *
  * This function is used to inform userspace about a received control port
  * frame.  It should only be used if userspace indicated it wants to receive
@@ -8136,8 +8137,8 @@ void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie,
  *
  * Return: %true if the frame was passed to userspace
  */
-bool cfg80211_rx_control_port(struct net_device *dev,
-			      struct sk_buff *skb, bool unencrypted);
+bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
+			      bool unencrypted, int link_id);
 
 /**
  * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 095bcd2552bb..f63ed6b91d9b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2591,7 +2591,7 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
 		struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 		bool noencrypt = !(status->flag & RX_FLAG_DECRYPTED);
 
-		cfg80211_rx_control_port(dev, skb, noencrypt);
+		cfg80211_rx_control_port(dev, skb, noencrypt, rx->link_id);
 		dev_kfree_skb(skb);
 	} else {
 		struct ethhdr *ehdr = (void *)skb_mac_header(skb);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ab0497efdd37..85f714e1af87 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -18754,7 +18754,9 @@ EXPORT_SYMBOL(cfg80211_mgmt_tx_status_ext);
 
 static int __nl80211_rx_control_port(struct net_device *dev,
 				     struct sk_buff *skb,
-				     bool unencrypted, gfp_t gfp)
+				     bool unencrypted,
+				     int link_id,
+				     gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -18786,6 +18788,8 @@ static int __nl80211_rx_control_port(struct net_device *dev,
 			      NL80211_ATTR_PAD) ||
 	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
 	    nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) ||
+	    (link_id >= 0 &&
+	     nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) ||
 	    (unencrypted && nla_put_flag(msg,
 					 NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT)))
 		goto nla_put_failure;
@@ -18804,13 +18808,14 @@ static int __nl80211_rx_control_port(struct net_device *dev,
 	return -ENOBUFS;
 }
 
-bool cfg80211_rx_control_port(struct net_device *dev,
-			      struct sk_buff *skb, bool unencrypted)
+bool cfg80211_rx_control_port(struct net_device *dev, struct sk_buff *skb,
+			      bool unencrypted, int link_id)
 {
 	int ret;
 
-	trace_cfg80211_rx_control_port(dev, skb, unencrypted);
-	ret = __nl80211_rx_control_port(dev, skb, unencrypted, GFP_ATOMIC);
+	trace_cfg80211_rx_control_port(dev, skb, unencrypted, link_id);
+	ret = __nl80211_rx_control_port(dev, skb, unencrypted, link_id,
+					GFP_ATOMIC);
 	trace_cfg80211_return_bool(ret == 0);
 	return ret == 0;
 }
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index f3fcfc4fcce5..716a1fa70069 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3165,14 +3165,15 @@ TRACE_EVENT(cfg80211_control_port_tx_status,
 
 TRACE_EVENT(cfg80211_rx_control_port,
 	TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
-		 bool unencrypted),
-	TP_ARGS(netdev, skb, unencrypted),
+		 bool unencrypted, int link_id),
+	TP_ARGS(netdev, skb, unencrypted, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		__field(int, len)
 		MAC_ENTRY(from)
 		__field(u16, proto)
 		__field(bool, unencrypted)
+		__field(int, link_id)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
@@ -3180,10 +3181,12 @@ TRACE_EVENT(cfg80211_rx_control_port,
 		MAC_ASSIGN(from, eth_hdr(skb)->h_source);
 		__entry->proto = be16_to_cpu(skb->protocol);
 		__entry->unencrypted = unencrypted;
+		__entry->link_id = link_id;
 	),
-	TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s",
+	TP_printk(NETDEV_PR_FMT ", len=%d, %pM, proto: 0x%x, unencrypted: %s, link: %d",
 		  NETDEV_PR_ARG, __entry->len, __entry->from,
-		  __entry->proto, BOOL_TO_STR(__entry->unencrypted))
+		  __entry->proto, BOOL_TO_STR(__entry->unencrypted),
+		  __entry->link_id)
 );
 
 TRACE_EVENT(cfg80211_cqm_rssi_notify,
-- 
cgit v1.2.3


From e1f113cc67870375eae0c7b84c2a40cc6388d903 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 1 Mar 2023 12:09:25 +0200
Subject: wifi: mac80211: add pointer from bss_conf to vif

While often not needed, this considerably simplifies going from a link
specific bss_config to the vif. This helps with e.g. creating link
specific debugfs entries inside drivers.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.46f701a10ed5.I20390b2a8165ff222d66585915689206ea93222b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 3 +++
 net/mac80211/link.c    | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 6946c9d95aec..2a1874d994e0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -534,6 +534,7 @@ struct ieee80211_fils_discovery {
  * This structure keeps information about a BSS (and an association
  * to that BSS) that can change during the lifetime of the BSS.
  *
+ * @vif: reference to owning VIF
  * @addr: (link) address used locally
  * @link_id: link ID, or 0 for non-MLO
  * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE
@@ -675,6 +676,8 @@ struct ieee80211_fils_discovery {
  *	bandwidth
  */
 struct ieee80211_bss_conf {
+	struct ieee80211_vif *vif;
+
 	const u8 *bssid;
 	unsigned int link_id;
 	u8 addr[ETH_ALEN] __aligned(2);
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 8c8869cc1fb4..e6f9fce1dadb 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -34,6 +34,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 	link->link_id = link_id;
 	link->conf = link_conf;
 	link_conf->link_id = link_id;
+	link_conf->vif = &sdata->vif;
 
 	INIT_WORK(&link->csa_finalize_work,
 		  ieee80211_csa_finalize_work);
-- 
cgit v1.2.3


From 170cd6a66d9a164180eb4dc72d50afa6ce1ce566 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 1 Mar 2023 12:09:27 +0200
Subject: wifi: mac80211: add netdev per-link debugfs data and driver hook

This adds the infrastructure to have netdev specific per-link data both
for mac80211 and the driver in debugfs. For the driver, a new callback
is added which is only used if MLO is supported.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.fb4c947e4df8.I69b3516ddf4c8a7501b395f652d6063444ecad63@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h        |  10 ++
 net/mac80211/debugfs_netdev.c | 222 ++++++++++++++++++++++++++++++++----------
 net/mac80211/debugfs_netdev.h |  16 +++
 net/mac80211/driver-ops.c     |  25 ++++-
 net/mac80211/driver-ops.h     |  16 +++
 net/mac80211/ieee80211_i.h    |   4 +
 net/mac80211/link.c           |   4 +
 7 files changed, 242 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 2a1874d994e0..5df9eb828a58 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3844,6 +3844,12 @@ struct ieee80211_prep_tx_info {
  *	the station. See @sta_pre_rcu_remove if needed.
  *	This callback can sleep.
  *
+ * @link_add_debugfs: Drivers can use this callback to add debugfs files
+ *	when a link is added to a mac80211 vif. This callback should be within
+ *	a CONFIG_MAC80211_DEBUGFS conditional. This callback can sleep.
+ *	For non-MLO the callback will be called once for the default bss_conf
+ *	with the vif's directory rather than a separate subdirectory.
+ *
  * @sta_add_debugfs: Drivers can use this callback to add debugfs files
  *	when a station is added to mac80211's station list. This callback
  *	should be within a CONFIG_MAC80211_DEBUGFS conditional. This
@@ -4325,6 +4331,10 @@ struct ieee80211_ops {
 	int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			  struct ieee80211_sta *sta);
 #ifdef CONFIG_MAC80211_DEBUGFS
+	void (*link_add_debugfs)(struct ieee80211_hw *hw,
+				 struct ieee80211_vif *vif,
+				 struct ieee80211_bss_conf *link_conf,
+				 struct dentry *dir);
 	void (*sta_add_debugfs)(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct ieee80211_sta *sta,
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 187bb22c0bbd..b0cef37eb394 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -23,16 +23,16 @@
 #include "driver-ops.h"
 
 static ssize_t ieee80211_if_read(
-	struct ieee80211_sub_if_data *sdata,
+	void *data,
 	char __user *userbuf,
 	size_t count, loff_t *ppos,
-	ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
+	ssize_t (*format)(const void *, char *, int))
 {
 	char buf[200];
 	ssize_t ret = -EINVAL;
 
 	read_lock(&dev_base_lock);
-	ret = (*format)(sdata, buf, sizeof(buf));
+	ret = (*format)(data, buf, sizeof(buf));
 	read_unlock(&dev_base_lock);
 
 	if (ret >= 0)
@@ -42,10 +42,10 @@ static ssize_t ieee80211_if_read(
 }
 
 static ssize_t ieee80211_if_write(
-	struct ieee80211_sub_if_data *sdata,
+	void *data,
 	const char __user *userbuf,
 	size_t count, loff_t *ppos,
-	ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int))
+	ssize_t (*write)(void *, const char *, int))
 {
 	char buf[64];
 	ssize_t ret;
@@ -58,64 +58,64 @@ static ssize_t ieee80211_if_write(
 	buf[count] = '\0';
 
 	rtnl_lock();
-	ret = (*write)(sdata, buf, count);
+	ret = (*write)(data, buf, count);
 	rtnl_unlock();
 
 	return ret;
 }
 
-#define IEEE80211_IF_FMT(name, field, format_string)			\
+#define IEEE80211_IF_FMT(name, type, field, format_string)		\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata, char *buf,		\
+	const type *data, char *buf,					\
 	int buflen)							\
 {									\
-	return scnprintf(buf, buflen, format_string, sdata->field);	\
+	return scnprintf(buf, buflen, format_string, data->field);	\
 }
-#define IEEE80211_IF_FMT_DEC(name, field)				\
-		IEEE80211_IF_FMT(name, field, "%d\n")
-#define IEEE80211_IF_FMT_HEX(name, field)				\
-		IEEE80211_IF_FMT(name, field, "%#x\n")
-#define IEEE80211_IF_FMT_LHEX(name, field)				\
-		IEEE80211_IF_FMT(name, field, "%#lx\n")
+#define IEEE80211_IF_FMT_DEC(name, type, field)				\
+		IEEE80211_IF_FMT(name, type, field, "%d\n")
+#define IEEE80211_IF_FMT_HEX(name, type, field)				\
+		IEEE80211_IF_FMT(name, type, field, "%#x\n")
+#define IEEE80211_IF_FMT_LHEX(name, type, field)			\
+		IEEE80211_IF_FMT(name, type, field, "%#lx\n")
 
-#define IEEE80211_IF_FMT_HEXARRAY(name, field)				\
+#define IEEE80211_IF_FMT_HEXARRAY(name, type, field)			\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata,			\
+	const type *data,						\
 	char *buf, int buflen)						\
 {									\
 	char *p = buf;							\
 	int i;								\
-	for (i = 0; i < sizeof(sdata->field); i++) {			\
+	for (i = 0; i < sizeof(data->field); i++) {			\
 		p += scnprintf(p, buflen + buf - p, "%.2x ",		\
-				 sdata->field[i]);			\
+				 data->field[i]);			\
 	}								\
 	p += scnprintf(p, buflen + buf - p, "\n");			\
 	return p - buf;							\
 }
 
-#define IEEE80211_IF_FMT_ATOMIC(name, field)				\
+#define IEEE80211_IF_FMT_ATOMIC(name, type, field)			\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata,			\
+	const type *data,						\
 	char *buf, int buflen)						\
 {									\
-	return scnprintf(buf, buflen, "%d\n", atomic_read(&sdata->field));\
+	return scnprintf(buf, buflen, "%d\n", atomic_read(&data->field));\
 }
 
-#define IEEE80211_IF_FMT_MAC(name, field)				\
+#define IEEE80211_IF_FMT_MAC(name, type, field)				\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata, char *buf,		\
+	const type *data, char *buf,					\
 	int buflen)							\
 {									\
-	return scnprintf(buf, buflen, "%pM\n", sdata->field);		\
+	return scnprintf(buf, buflen, "%pM\n", data->field);		\
 }
 
-#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field)			\
+#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, type, field)		\
 static ssize_t ieee80211_if_fmt_##name(					\
-	const struct ieee80211_sub_if_data *sdata,			\
+	const type *data,						\
 	char *buf, int buflen)						\
 {									\
 	return scnprintf(buf, buflen, "%d\n",				\
-			 jiffies_to_msecs(sdata->field));		\
+			 jiffies_to_msecs(data->field));		\
 }
 
 #define _IEEE80211_IF_FILE_OPS(name, _read, _write)			\
@@ -126,43 +126,67 @@ static const struct file_operations name##_ops = {			\
 	.llseek = generic_file_llseek,					\
 }
 
-#define _IEEE80211_IF_FILE_R_FN(name)					\
+#define _IEEE80211_IF_FILE_R_FN(name, type)				\
 static ssize_t ieee80211_if_read_##name(struct file *file,		\
 					char __user *userbuf,		\
 					size_t count, loff_t *ppos)	\
 {									\
+	ssize_t (*fn)(const void *, char *, int) = (void *)		\
+		((ssize_t (*)(const type, char *, int))			\
+		 ieee80211_if_fmt_##name);				\
 	return ieee80211_if_read(file->private_data,			\
-				 userbuf, count, ppos,			\
-				 ieee80211_if_fmt_##name);		\
+				 userbuf, count, ppos, fn);		\
 }
 
-#define _IEEE80211_IF_FILE_W_FN(name)					\
+#define _IEEE80211_IF_FILE_W_FN(name, type)				\
 static ssize_t ieee80211_if_write_##name(struct file *file,		\
 					 const char __user *userbuf,	\
 					 size_t count, loff_t *ppos)	\
 {									\
+	ssize_t (*fn)(void *, const char *, int) = (void *)		\
+		((ssize_t (*)(type, const char *, int))			\
+		 ieee80211_if_parse_##name);				\
 	return ieee80211_if_write(file->private_data, userbuf, count,	\
-				  ppos, ieee80211_if_parse_##name);	\
+				  ppos, fn);				\
 }
 
 #define IEEE80211_IF_FILE_R(name)					\
-	_IEEE80211_IF_FILE_R_FN(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_sub_if_data *)	\
 	_IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name, NULL)
 
 #define IEEE80211_IF_FILE_W(name)					\
-	_IEEE80211_IF_FILE_W_FN(name)					\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_sub_if_data *)	\
 	_IEEE80211_IF_FILE_OPS(name, NULL, ieee80211_if_write_##name)
 
 #define IEEE80211_IF_FILE_RW(name)					\
-	_IEEE80211_IF_FILE_R_FN(name)					\
-	_IEEE80211_IF_FILE_W_FN(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_sub_if_data *)	\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_sub_if_data *)	\
 	_IEEE80211_IF_FILE_OPS(name, ieee80211_if_read_##name,		\
 			       ieee80211_if_write_##name)
 
 #define IEEE80211_IF_FILE(name, field, format)				\
-	IEEE80211_IF_FMT_##format(name, field)				\
+	IEEE80211_IF_FMT_##format(name, struct ieee80211_sub_if_data, field) \
 	IEEE80211_IF_FILE_R(name)
 
+/* Same but with a link_ prefix in the ops variable name and different type */
+#define IEEE80211_IF_LINK_FILE_R(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_OPS(link_##name, ieee80211_if_read_##name, NULL)
+
+#define IEEE80211_IF_LINK_FILE_W(name)					\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_OPS(link_##name, NULL, ieee80211_if_write_##name)
+
+#define IEEE80211_IF_LINK_FILE_RW(name)					\
+	_IEEE80211_IF_FILE_R_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_W_FN(name, struct ieee80211_link_data *)	\
+	_IEEE80211_IF_FILE_OPS(link_##name, ieee80211_if_read_##name,	\
+			       ieee80211_if_write_##name)
+
+#define IEEE80211_IF_LINK_FILE(name, field, format)				\
+	IEEE80211_IF_FMT_##format(name, struct ieee80211_link_data, field) \
+	IEEE80211_IF_LINK_FILE_R(name)
+
 /* common attributes */
 IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[NL80211_BAND_2GHZ],
 		  HEX);
@@ -207,9 +231,9 @@ IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_5ghz);
 
 IEEE80211_IF_FILE(flags, flags, HEX);
 IEEE80211_IF_FILE(state, state, LHEX);
-IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC);
-IEEE80211_IF_FILE(ap_power_level, deflink.ap_power_level, DEC);
-IEEE80211_IF_FILE(user_power_level, deflink.user_power_level, DEC);
+IEEE80211_IF_LINK_FILE(txpower, conf->txpower, DEC);
+IEEE80211_IF_LINK_FILE(ap_power_level, ap_power_level, DEC);
+IEEE80211_IF_LINK_FILE(user_power_level, user_power_level, DEC);
 
 static ssize_t
 ieee80211_if_fmt_hw_queues(const struct ieee80211_sub_if_data *sdata,
@@ -236,9 +260,10 @@ IEEE80211_IF_FILE(bssid, deflink.u.mgd.bssid, MAC);
 IEEE80211_IF_FILE(aid, vif.cfg.aid, DEC);
 IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS);
 
-static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
+static int ieee80211_set_smps(struct ieee80211_link_data *link,
 			      enum ieee80211_smps_mode smps_mode)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	int err;
 
@@ -256,7 +281,7 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
 		return -EOPNOTSUPP;
 
 	sdata_lock(sdata);
-	err = __ieee80211_request_smps_mgd(sdata, &sdata->deflink, smps_mode);
+	err = __ieee80211_request_smps_mgd(link->sdata, link, smps_mode);
 	sdata_unlock(sdata);
 
 	return err;
@@ -269,24 +294,24 @@ static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
 	[IEEE80211_SMPS_DYNAMIC] = "dynamic",
 };
 
-static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_link_data *link,
 				     char *buf, int buflen)
 {
-	if (sdata->vif.type == NL80211_IFTYPE_STATION)
+	if (link->sdata->vif.type == NL80211_IFTYPE_STATION)
 		return snprintf(buf, buflen, "request: %s\nused: %s\n",
-				smps_modes[sdata->deflink.u.mgd.req_smps],
-				smps_modes[sdata->deflink.smps_mode]);
+				smps_modes[link->u.mgd.req_smps],
+				smps_modes[link->smps_mode]);
 	return -EINVAL;
 }
 
-static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_parse_smps(struct ieee80211_link_data *link,
 				       const char *buf, int buflen)
 {
 	enum ieee80211_smps_mode mode;
 
 	for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) {
 		if (strncmp(buf, smps_modes[mode], buflen) == 0) {
-			int err = ieee80211_set_smps(sdata, mode);
+			int err = ieee80211_set_smps(link, mode);
 			if (!err)
 				return buflen;
 			return err;
@@ -295,7 +320,7 @@ static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
 
 	return -EINVAL;
 }
-IEEE80211_IF_FILE_RW(smps);
+IEEE80211_IF_LINK_FILE_RW(smps);
 
 static ssize_t ieee80211_if_parse_tkip_mic_test(
 	struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
@@ -595,6 +620,8 @@ static ssize_t ieee80211_if_parse_active_links(struct ieee80211_sub_if_data *sda
 }
 IEEE80211_IF_FILE_RW(active_links);
 
+IEEE80211_IF_LINK_FILE(addr, conf->addr, MAC);
+
 #ifdef CONFIG_MAC80211_MESH
 IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC);
 
@@ -685,7 +712,6 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata)
 	DEBUGFS_ADD(bssid);
 	DEBUGFS_ADD(aid);
 	DEBUGFS_ADD(beacon_timeout);
-	DEBUGFS_ADD_MODE(smps, 0600);
 	DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
 	DEBUGFS_ADD_MODE(beacon_loss, 0200);
 	DEBUGFS_ADD_MODE(uapsd_queues, 0600);
@@ -788,9 +814,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
 
 	DEBUGFS_ADD(flags);
 	DEBUGFS_ADD(state);
-	DEBUGFS_ADD(txpower);
-	DEBUGFS_ADD(user_power_level);
-	DEBUGFS_ADD(ap_power_level);
 
 	if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
 		add_common_files(sdata);
@@ -820,6 +843,31 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
 	}
 }
 
+#undef DEBUGFS_ADD_MODE
+#undef DEBUGFS_ADD
+
+#define DEBUGFS_ADD_MODE(dentry, name, mode) \
+	debugfs_create_file(#name, mode, dentry, \
+			    link, &link_##name##_ops)
+
+#define DEBUGFS_ADD(dentry, name) DEBUGFS_ADD_MODE(dentry, name, 0400)
+
+static void add_link_files(struct ieee80211_link_data *link,
+			   struct dentry *dentry)
+{
+	DEBUGFS_ADD(dentry, txpower);
+	DEBUGFS_ADD(dentry, user_power_level);
+	DEBUGFS_ADD(dentry, ap_power_level);
+
+	switch (link->sdata->vif.type) {
+	case NL80211_IFTYPE_STATION:
+		DEBUGFS_ADD_MODE(dentry, smps, 0600);
+		break;
+	default:
+		break;
+	}
+}
+
 void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
 {
 	char buf[10+IFNAMSIZ];
@@ -830,6 +878,9 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
 	sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
 							sdata->vif.debugfs_dir);
 	add_files(sdata);
+
+	if (!(sdata->local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
+		add_link_files(&sdata->deflink, sdata->vif.debugfs_dir);
 }
 
 void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
@@ -855,3 +906,66 @@ void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
 	sprintf(buf, "netdev:%s", sdata->name);
 	debugfs_rename(dir->d_parent, dir, dir->d_parent, buf);
 }
+
+void ieee80211_link_debugfs_add(struct ieee80211_link_data *link)
+{
+	char link_dir_name[10];
+
+	if (WARN_ON(!link->sdata->vif.debugfs_dir))
+		return;
+
+	/* For now, this should not be called for non-MLO capable drivers */
+	if (WARN_ON(!(link->sdata->local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)))
+		return;
+
+	snprintf(link_dir_name, sizeof(link_dir_name),
+		 "link-%d", link->link_id);
+
+	link->debugfs_dir =
+		debugfs_create_dir(link_dir_name,
+				   link->sdata->vif.debugfs_dir);
+
+	DEBUGFS_ADD(link->debugfs_dir, addr);
+	add_link_files(link, link->debugfs_dir);
+}
+
+void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link)
+{
+	if (!link->sdata->vif.debugfs_dir || !link->debugfs_dir) {
+		link->debugfs_dir = NULL;
+		return;
+	}
+
+	if (link->debugfs_dir == link->sdata->vif.debugfs_dir) {
+		WARN_ON(link != &link->sdata->deflink);
+		link->debugfs_dir = NULL;
+		return;
+	}
+
+	debugfs_remove_recursive(link->debugfs_dir);
+	link->debugfs_dir = NULL;
+}
+
+void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link)
+{
+	if (WARN_ON(!link->debugfs_dir))
+		return;
+
+	drv_link_add_debugfs(link->sdata->local, link->sdata,
+			     link->conf, link->debugfs_dir);
+}
+
+void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link)
+{
+	if (!link || !link->debugfs_dir)
+		return;
+
+	if (WARN_ON(link->debugfs_dir == link->sdata->vif.debugfs_dir))
+		return;
+
+	/* Recreate the directory excluding the driver data */
+	debugfs_remove_recursive(link->debugfs_dir);
+	link->debugfs_dir = NULL;
+
+	ieee80211_link_debugfs_add(link);
+}
diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h
index a7e9d8d518f9..99e688dcabd6 100644
--- a/net/mac80211/debugfs_netdev.h
+++ b/net/mac80211/debugfs_netdev.h
@@ -10,6 +10,12 @@
 void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata);
 void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata);
 void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata);
+
+void ieee80211_link_debugfs_add(struct ieee80211_link_data *link);
+void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link);
+
+void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link);
+void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link);
 #else
 static inline void ieee80211_debugfs_add_netdev(
 	struct ieee80211_sub_if_data *sdata)
@@ -20,6 +26,16 @@ static inline void ieee80211_debugfs_remove_netdev(
 static inline void ieee80211_debugfs_rename_netdev(
 	struct ieee80211_sub_if_data *sdata)
 {}
+
+static inline void ieee80211_link_debugfs_add(struct ieee80211_link_data *link)
+{}
+static inline void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link)
+{}
+
+static inline void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link)
+{}
+static inline void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link)
+{}
 #endif
 
 #endif /* __IEEE80211_DEBUGFS_NETDEV_H */
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index cfb09e4aed4d..30cd0c905a24 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -8,6 +8,7 @@
 #include "trace.h"
 #include "driver-ops.h"
 #include "debugfs_sta.h"
+#include "debugfs_netdev.h"
 
 int drv_start(struct ieee80211_local *local)
 {
@@ -477,6 +478,10 @@ int drv_change_vif_links(struct ieee80211_local *local,
 			 u16 old_links, u16 new_links,
 			 struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS])
 {
+	struct ieee80211_link_data *link;
+	unsigned long links_to_add;
+	unsigned long links_to_rem;
+	unsigned int link_id;
 	int ret = -EOPNOTSUPP;
 
 	might_sleep();
@@ -487,13 +492,31 @@ int drv_change_vif_links(struct ieee80211_local *local,
 	if (old_links == new_links)
 		return 0;
 
+	links_to_add = ~old_links & new_links;
+	links_to_rem = old_links & ~new_links;
+
+	for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) {
+		link = rcu_access_pointer(sdata->link[link_id]);
+
+		ieee80211_link_debugfs_drv_remove(link);
+	}
+
 	trace_drv_change_vif_links(local, sdata, old_links, new_links);
 	if (local->ops->change_vif_links)
 		ret = local->ops->change_vif_links(&local->hw, &sdata->vif,
 						   old_links, new_links, old);
 	trace_drv_return_int(local, ret);
 
-	return ret;
+	if (ret)
+		return ret;
+
+	for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) {
+		link = rcu_access_pointer(sdata->link[link_id]);
+
+		ieee80211_link_debugfs_drv_add(link);
+	}
+
+	return 0;
 }
 
 int drv_change_sta_links(struct ieee80211_local *local,
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 5d13a3dfd366..a68d606e6987 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -465,6 +465,22 @@ static inline void drv_sta_remove(struct ieee80211_local *local,
 }
 
 #ifdef CONFIG_MAC80211_DEBUGFS
+static inline void drv_link_add_debugfs(struct ieee80211_local *local,
+					struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_bss_conf *link_conf,
+					struct dentry *dir)
+{
+	might_sleep();
+
+	sdata = get_bss_sdata(sdata);
+	if (!check_sdata_in_driver(sdata))
+		return;
+
+	if (local->ops->link_add_debugfs)
+		local->ops->link_add_debugfs(&local->hw, &sdata->vif,
+					     link_conf, dir);
+}
+
 static inline void drv_sta_add_debugfs(struct ieee80211_local *local,
 				       struct ieee80211_sub_if_data *sdata,
 				       struct ieee80211_sta *sta,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index ecc232eb1ee8..3d4edc25a69e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -999,6 +999,10 @@ struct ieee80211_link_data {
 	struct ieee80211_tx_queue_params tx_conf[IEEE80211_NUM_ACS];
 
 	struct ieee80211_bss_conf *conf;
+
+#ifdef CONFIG_MAC80211_DEBUGFS
+	struct dentry *debugfs_dir;
+#endif
 };
 
 struct ieee80211_sub_if_data {
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index e6f9fce1dadb..e82db88a47f8 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -10,6 +10,7 @@
 #include "ieee80211_i.h"
 #include "driver-ops.h"
 #include "key.h"
+#include "debugfs_netdev.h"
 
 void ieee80211_link_setup(struct ieee80211_link_data *link)
 {
@@ -61,6 +62,8 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
 		default:
 			WARN_ON(1);
 		}
+
+		ieee80211_link_debugfs_add(link);
 	}
 }
 
@@ -94,6 +97,7 @@ static void ieee80211_tear_down_links(struct ieee80211_sub_if_data *sdata,
 		if (WARN_ON(!link))
 			continue;
 		ieee80211_remove_link_keys(link, &keys);
+		ieee80211_link_debugfs_remove(link);
 		ieee80211_link_stop(link);
 	}
 
-- 
cgit v1.2.3


From 18cbf7c089ba70fefe1b4c01af28753cabfbf38f Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Wed, 1 Mar 2023 12:09:34 +0200
Subject: wifi: radiotap: Add EHT radiotap definitions

This is based on the new EHT TLV definitions for the 11be standard, see
https://www.radiotap.org/fields/EHT.html and
https://www.radiotap.org/fields/U-SIG.html.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.254b19fffe41.I4ce78e2c558da6e5a708a8d68d61b5d7b3eb3746@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 187 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 185 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 598f53d2a3a0..0fc2667a9a5d 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2017		Intel Deutschland GmbH
- * Copyright (c) 2018-2019, 2021 Intel Corporation
+ * Copyright (c) 2018-2019, 2021-2022 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -82,11 +82,14 @@ enum ieee80211_radiotap_presence {
 	IEEE80211_RADIOTAP_HE_MU = 24,
 	IEEE80211_RADIOTAP_ZERO_LEN_PSDU = 26,
 	IEEE80211_RADIOTAP_LSIG = 27,
+	IEEE80211_RADIOTAP_TLV = 28,
 
 	/* valid in every it_present bitmap, even vendor namespaces */
 	IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE = 29,
 	IEEE80211_RADIOTAP_VENDOR_NAMESPACE = 30,
-	IEEE80211_RADIOTAP_EXT = 31
+	IEEE80211_RADIOTAP_EXT = 31,
+	IEEE80211_RADIOTAP_EHT_USIG = 33,
+	IEEE80211_RADIOTAP_EHT = 34,
 };
 
 /* for IEEE80211_RADIOTAP_FLAGS */
@@ -360,6 +363,186 @@ enum ieee80211_radiotap_zero_len_psdu_type {
 	IEEE80211_RADIOTAP_ZERO_LEN_PSDU_VENDOR			= 0xff,
 };
 
+struct ieee80211_radiotap_tlv {
+	__le16 type;
+	__le16 len;
+	u8 data[];
+} __packed;
+
+/* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33)
+ * see www.radiotap.org/fields/U-SIG.html for details
+ */
+struct ieee80211_radiotap_eht_usig {
+	__le32 common;
+	__le32 value;
+	__le32 mask;
+} __packed;
+
+/* ieee80211_radiotap_eht - content of EHT tlv (type 34)
+ * see www.radiotap.org/fields/EHT.html for details
+ */
+struct ieee80211_radiotap_eht {
+	__le32 known;
+	__le32 data[9];
+	__le32 user_info[];
+} __packed;
+
+/* Known field for EHT TLV
+ * The suffix of each name indicates what the field applies to, as follows:
+ * O - OFDMA (including TB), M - MU-MIMO, S - EHT sounding.
+ */
+enum ieee80211_radiotap_eht_known {
+	IEEE80211_RADIOTAP_EHT_KNOWN_SPATIAL_REUSE		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_KNOWN_GI				= 0x00000004,
+	IEEE80211_RADIOTAP_EHT_KNOWN_EHT_LTF			= 0x00000010,
+	IEEE80211_RADIOTAP_EHT_KNOWN_LDPC_EXTRA_SYM_OM		= 0x00000020,
+	IEEE80211_RADIOTAP_EHT_KNOWN_PRE_PADD_FACOR_OM		= 0x00000040,
+	IEEE80211_RADIOTAP_EHT_KNOWN_PE_DISAMBIGUITY_OM		= 0x00000080,
+	IEEE80211_RADIOTAP_EHT_KNOWN_DISREGARD_O		= 0x00000100,
+	IEEE80211_RADIOTAP_EHT_KNOWN_DISREGARD_S		= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_KNOWN_CRC1			= 0x00002000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_TAIL1			= 0x00004000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_CRC2_O			= 0x00008000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_TAIL2_O			= 0x00010000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_NSS_S			= 0x00020000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_BEAMFORMED_S		= 0x00040000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_NR_NON_OFDMA_USERS_M	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_ENCODING_BLOCK_CRC_M	= 0x00100000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_ENCODING_BLOCK_TAIL_M	= 0x00200000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_RU_MRU_SIZE_OM		= 0x00400000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_RU_MRU_INDEX_OM		= 0x00800000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_RU_ALLOC_TB_FMT		= 0x01000000,
+	IEEE80211_RADIOTAP_EHT_KNOWN_PRIMARY_80			= 0x02000000,
+};
+
+enum ieee80211_radiotap_eht_data {
+	/* Data 0 */
+	IEEE80211_RADIOTAP_EHT_DATA0_SPATIAL_REUSE		= 0x00000078,
+	IEEE80211_RADIOTAP_EHT_DATA0_GI				= 0x00000180,
+	IEEE80211_RADIOTAP_EHT_DATA0_LTF			= 0x00000600,
+	IEEE80211_RADIOTAP_EHT_DATA0_EHT_LTF			= 0x00003800,
+	IEEE80211_RADIOTAP_EHT_DATA0_LDPC_EXTRA_SYM_OM		= 0x00004000,
+	IEEE80211_RADIOTAP_EHT_DATA0_PRE_PADD_FACOR_OM		= 0x00018000,
+	IEEE80211_RADIOTAP_EHT_DATA0_PE_DISAMBIGUITY_OM		= 0x00020000,
+	IEEE80211_RADIOTAP_EHT_DATA0_DISREGARD_S		= 0x000c0000,
+	IEEE80211_RADIOTAP_EHT_DATA0_DISREGARD_O		= 0x003c0000,
+	IEEE80211_RADIOTAP_EHT_DATA0_CRC1_O			= 0x03c00000,
+	IEEE80211_RADIOTAP_EHT_DATA0_TAIL1_O			= 0xfc000000,
+	/* Data 1 */
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_SIZE			= 0x0000001f,
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_INDEX			= 0x00001fe0,
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_ALLOC_CC_1_1_1		= 0x003fe000,
+	IEEE80211_RADIOTAP_EHT_DATA1_RU_ALLOC_CC_1_1_1_KNOWN	= 0x00400000,
+	IEEE80211_RADIOTAP_EHT_DATA1_PRIMARY_80			= 0xc0000000,
+	/* Data 2 */
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_1		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_1_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_1_1_2		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_1_1_2_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_2		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA2_RU_ALLOC_CC_2_1_2_KNOWN	= 0x20000000,
+	/* Data 3 */
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_1		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_1_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_2_2_1		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_2_2_1_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_2		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA3_RU_ALLOC_CC_1_2_2_KNOWN	= 0x20000000,
+	/* Data 4 */
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_2		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_2_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_1_2_3		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_1_2_3_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_3		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA4_RU_ALLOC_CC_2_2_3_KNOWN	= 0x20000000,
+	/* Data 5 */
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_4		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_4_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_2_2_4		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_2_2_4_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_5		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA5_RU_ALLOC_CC_1_2_5_KNOWN	= 0x20000000,
+	/* Data 6 */
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_5		= 0x000001ff,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_5_KNOWN	= 0x00000200,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_1_2_6		= 0x0007fc00,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_1_2_6_KNOWN	= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_6		= 0x1ff00000,
+	IEEE80211_RADIOTAP_EHT_DATA6_RU_ALLOC_CC_2_2_6_KNOWN	= 0x20000000,
+	/* Data 7 */
+	IEEE80211_RADIOTAP_EHT_DATA7_CRC2_O			= 0x0000000f,
+	IEEE80211_RADIOTAP_EHT_DATA7_TAIL_2_O			= 0x000003f0,
+	IEEE80211_RADIOTAP_EHT_DATA7_NSS_S			= 0x0000f000,
+	IEEE80211_RADIOTAP_EHT_DATA7_BEAMFORMED_S		= 0x00010000,
+	IEEE80211_RADIOTAP_EHT_DATA7_NUM_OF_NON_OFDMA_USERS	= 0x000e0000,
+	IEEE80211_RADIOTAP_EHT_DATA7_USER_ENCODING_BLOCK_CRC	= 0x00f00000,
+	IEEE80211_RADIOTAP_EHT_DATA7_USER_ENCODING_BLOCK_TAIL	= 0x3f000000,
+	/* Data 8 */
+	IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_PS_160	= 0x00000001,
+	IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_B0		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_DATA8_RU_ALLOC_TB_FMT_B7_B1	= 0x000001fc,
+};
+
+enum ieee80211_radiotap_eht_user_info {
+	IEEE80211_RADIOTAP_EHT_USER_INFO_STA_ID_KNOWN		= 0x00000001,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_MCS_KNOWN		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_CODING_KNOWN		= 0x00000004,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_NSS_KNOWN_O		= 0x00000010,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_BEAMFORMING_KNOWN_O	= 0x00000020,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_SPATIAL_CONFIG_KNOWN_M	= 0x00000040,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_DATA_FOR_USER		= 0x00000080,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_STA_ID			= 0x0007ff00,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_CODING			= 0x00080000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_MCS			= 0x00f00000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_NSS_O			= 0x0f000000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_BEAMFORMING_O		= 0x20000000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_SPATIAL_CONFIG_M	= 0x3f000000,
+	IEEE80211_RADIOTAP_EHT_USER_INFO_RESEVED_c0000000	= 0xc0000000,
+};
+
+enum ieee80211_radiotap_eht_usig_common {
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER_KNOWN	= 0x00000001,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW_KNOWN		= 0x00000002,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL_KNOWN		= 0x00000004,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR_KNOWN	= 0x00000008,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP_KNOWN		= 0x00000010,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BAD_USIG_CRC		= 0x00000020,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_PHY_VER		= 0x00007000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BW			= 0x00038000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_UL_DL		= 0x00040000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_BSS_COLOR		= 0x01f80000,
+	IEEE80211_RADIOTAP_EHT_USIG_COMMON_TXOP			= 0xfe000000,
+};
+
+enum ieee80211_radiotap_eht_usig_mu {
+	/* MU-USIG-1 */
+	IEEE80211_RADIOTAP_EHT_USIG1_MU_B20_B24_DISREGARD	= 0x0000001f,
+	IEEE80211_RADIOTAP_EHT_USIG1_MU_B25_VALIDATE		= 0x00000020,
+	/* MU-USIG-2 */
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B0_B1_PPDU_TYPE		= 0x000000c0,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B2_VALIDATE		= 0x00000100,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B3_B7_PUNCTURED_INFO	= 0x00003e00,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B8_VALIDATE		= 0x00004000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B9_B10_SIG_MCS		= 0x00018000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B11_B15_EHT_SIG_SYMBOLS = 0x003e0000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B16_B19_CRC		= 0x03c00000,
+	IEEE80211_RADIOTAP_EHT_USIG2_MU_B20_B25_TAIL		= 0xfc000000,
+};
+
+enum ieee80211_radiotap_eht_usig_tb {
+	/* TB-USIG-1 */
+	IEEE80211_RADIOTAP_EHT_USIG1_TB_B20_B25_DISREGARD	= 0x0000001f,
+
+	/* TB-USIG-2 */
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B0_B1_PPDU_TYPE		= 0x000000c0,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B2_VALIDATE		= 0x00000100,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B3_B6_SPATIAL_REUSE_1	= 0x00001e00,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B7_B10_SPATIAL_REUSE_2	= 0x0001e000,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B11_B15_DISREGARD	= 0x003e0000,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B16_B19_CRC		= 0x03c00000,
+	IEEE80211_RADIOTAP_EHT_USIG2_TB_B20_B25_TAIL		= 0xfc000000,
+};
+
 /**
  * ieee80211_get_radiotap_len - get radiotap header length
  */
-- 
cgit v1.2.3


From 9179dff82598ab8b4e88dcc93c9e26a2594efd1a Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Wed, 1 Mar 2023 12:09:35 +0200
Subject: wifi: mac80211: add support for driver adding radiotap TLVs

The new TLV format enables adding TLVs after the fixed
fields in radiotap, as part of the radiotap header.
Support this and move vendor data to the TLV format,
allowing a reuse of the RX_FLAG_RADIOTAP_VENDOR_DATA as
the new RX_FLAG_RADIOTAP_TLV_AT_END flag.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230301115906.b18fd5da8477.I576400ec40a7b35ef97a3b09a99b3a49e9174786@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 27 ++++-----
 drivers/net/wireless/mac80211_hwsim.c         | 45 ++++++++-------
 include/net/ieee80211_radiotap.h              | 20 +++++++
 include/net/mac80211.h                        | 44 +++-----------
 net/mac80211/rx.c                             | 82 +++++++++------------------
 5 files changed, 91 insertions(+), 127 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 549dbe0be223..d1769464d75b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -209,33 +209,34 @@ static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm,
 					    struct sk_buff *skb)
 {
 	struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);
-	struct ieee80211_vendor_radiotap *radiotap;
-	const int size = sizeof(*radiotap) + sizeof(__le16);
+	struct ieee80211_radiotap_vendor_tlv *radiotap;
+	const u16 vendor_data_len = sizeof(mvm->cur_aid);
+	const u16 padding = ALIGN(vendor_data_len, 4) - vendor_data_len;
 
 	if (!mvm->cur_aid)
 		return;
 
-	/* ensure alignment */
-	BUILD_BUG_ON((size + 2) % 4);
+	radiotap = skb_put(skb, sizeof(*radiotap) + vendor_data_len + padding);
+	radiotap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);
+	radiotap->len = cpu_to_le16(sizeof(*radiotap) -
+				    sizeof(struct ieee80211_radiotap_tlv) +
+				    vendor_data_len);
 
-	radiotap = skb_put(skb, size + 2);
-	radiotap->align = 1;
 	/* Intel OUI */
 	radiotap->oui[0] = 0xf6;
 	radiotap->oui[1] = 0x54;
 	radiotap->oui[2] = 0x25;
 	/* radiotap sniffer config sub-namespace */
-	radiotap->subns = 1;
-	radiotap->present = 0x1;
-	radiotap->len = size - sizeof(*radiotap);
-	radiotap->pad = 2;
-
+	radiotap->oui_subtype = 1;
+	radiotap->vendor_type = 0;
+	/* clear reserved field */
+	radiotap->reserved = 0;
 	/* fill the data now */
 	memcpy(radiotap->data, &mvm->cur_aid, sizeof(mvm->cur_aid));
 	/* and clear the padding */
-	memset(radiotap->data + sizeof(__le16), 0, radiotap->pad);
+	memset(radiotap->data + vendor_data_len, 0, padding);
 
-	rx_status->flag |= RX_FLAG_RADIOTAP_VENDOR_DATA;
+	rx_status->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 }
 
 /* iwl_mvm_pass_packet_to_mac80211 - passes the packet for mac80211 */
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index e9b9340a97dc..152617034d19 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1534,37 +1534,38 @@ static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb)
 	 * the values accordingly.
 	 */
 #ifdef HWSIM_RADIOTAP_OUI
-	struct ieee80211_vendor_radiotap *rtap;
+	struct ieee80211_radiotap_vendor_tlv *rtap;
+	static const char vendor_data[8] = "ABCDEFGH";
+
+	// Make sure no padding is needed
+	BUILD_BUG_ON(sizeof(vendor_data) % 4);
+	/* this is the last radiotap info before the mac header, so
+	 * skb_reset_mac_header for mac80211 to know the end of
+	 * the radiotap TLV/beginning of the 802.11 header
+	 */
+	skb_reset_mac_header(skb);
 
 	/*
 	 * Note that this code requires the headroom in the SKB
 	 * that was allocated earlier.
 	 */
-	rtap = skb_push(skb, sizeof(*rtap) + 8 + 4);
+	rtap = skb_push(skb, sizeof(*rtap) + sizeof(vendor_data));
+
+	rtap->len = cpu_to_le16(sizeof(*rtap) -
+				sizeof(struct ieee80211_radiotap_tlv) +
+				sizeof(vendor_data));
+	rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);
+
 	rtap->oui[0] = HWSIM_RADIOTAP_OUI[0];
 	rtap->oui[1] = HWSIM_RADIOTAP_OUI[1];
 	rtap->oui[2] = HWSIM_RADIOTAP_OUI[2];
-	rtap->subns = 127;
+	rtap->oui_subtype = 127;
+	/* clear reserved field */
+	rtap->reserved = 0;
+	rtap->vendor_type = 0;
+	memcpy(rtap->data, vendor_data, sizeof(vendor_data));
 
-	/*
-	 * Radiotap vendor namespaces can (and should) also be
-	 * split into fields by using the standard radiotap
-	 * presence bitmap mechanism. Use just BIT(0) here for
-	 * the presence bitmap.
-	 */
-	rtap->present = BIT(0);
-	/* We have 8 bytes of (dummy) data */
-	rtap->len = 8;
-	/* For testing, also require it to be aligned */
-	rtap->align = 8;
-	/* And also test that padding works, 4 bytes */
-	rtap->pad = 4;
-	/* push the data */
-	memcpy(rtap->data, "ABCDEFGH", 8);
-	/* make sure to clear padding, mac80211 doesn't */
-	memset(rtap->data + 8, 0, 4);
-
-	IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_VENDOR_DATA;
+	IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 #endif
 }
 
diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 0fc2667a9a5d..95436686d3fe 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -369,6 +369,26 @@ struct ieee80211_radiotap_tlv {
 	u8 data[];
 } __packed;
 
+/**
+ * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information
+ * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE
+ * @len: length of data
+ * @oui: radiotap vendor namespace OUI
+ * @oui_subtype: radiotap vendor sub namespace
+ * @vendor_type: radiotap vendor type
+ * @reserved: should always be set to zero (to avoid leaking memory)
+ * @data: the actual vendor namespace data
+ */
+struct ieee80211_radiotap_vendor_tlv {
+	__le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */
+	__le16 len;
+	u8 oui[3];
+	u8 oui_subtype;
+	__le16 vendor_type;
+	__le16 reserved;
+	u8 data[];
+} __packed;
+
 /* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33)
  * see www.radiotap.org/fields/U-SIG.html for details
  */
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5df9eb828a58..3a43ce5fd4ec 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1375,9 +1375,12 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
  *	subframes share the same sequence number. Reported subframes can be
  *	either regular MSDU or singly A-MSDUs. Subframes must not be
  *	interleaved with other frames.
- * @RX_FLAG_RADIOTAP_VENDOR_DATA: This frame contains vendor-specific
- *	radiotap data in the skb->data (before the frame) as described by
- *	the &struct ieee80211_vendor_radiotap.
+ * @RX_FLAG_RADIOTAP_TLV_AT_END: This frame contains radiotap TLVs in the
+ *	skb->data (before the 802.11 header).
+ *	If used, the SKB's mac_header pointer must be set to point
+ *	to the 802.11 header after the TLVs, and any padding added after TLV
+ *	data to align to 4 must be cleared by the driver putting the TLVs
+ *	in the skb.
  * @RX_FLAG_ALLOW_SAME_PN: Allow the same PN as same packet before.
  *	This is used for AMSDU subframes which can have the same PN as
  *	the first subframe.
@@ -1429,7 +1432,7 @@ enum mac80211_rx_flags {
 	RX_FLAG_ONLY_MONITOR		= BIT(17),
 	RX_FLAG_SKIP_MONITOR		= BIT(18),
 	RX_FLAG_AMSDU_MORE		= BIT(19),
-	RX_FLAG_RADIOTAP_VENDOR_DATA	= BIT(20),
+	RX_FLAG_RADIOTAP_TLV_AT_END	= BIT(20),
 	RX_FLAG_MIC_STRIPPED		= BIT(21),
 	RX_FLAG_ALLOW_SAME_PN		= BIT(22),
 	RX_FLAG_ICV_STRIPPED		= BIT(23),
@@ -1569,39 +1572,6 @@ ieee80211_rx_status_to_khz(struct ieee80211_rx_status *rx_status)
 	       (rx_status->freq_offset ? 500 : 0);
 }
 
-/**
- * struct ieee80211_vendor_radiotap - vendor radiotap data information
- * @present: presence bitmap for this vendor namespace
- *	(this could be extended in the future if any vendor needs more
- *	 bits, the radiotap spec does allow for that)
- * @align: radiotap vendor namespace alignment. This defines the needed
- *	alignment for the @data field below, not for the vendor namespace
- *	description itself (which has a fixed 2-byte alignment)
- *	Must be a power of two, and be set to at least 1!
- * @oui: radiotap vendor namespace OUI
- * @subns: radiotap vendor sub namespace
- * @len: radiotap vendor sub namespace skip length, if alignment is done
- *	then that's added to this, i.e. this is only the length of the
- *	@data field.
- * @pad: number of bytes of padding after the @data, this exists so that
- *	the skb data alignment can be preserved even if the data has odd
- *	length
- * @data: the actual vendor namespace data
- *
- * This struct, including the vendor data, goes into the skb->data before
- * the 802.11 header. It's split up in mac80211 using the align/oui/subns
- * data.
- */
-struct ieee80211_vendor_radiotap {
-	u32 present;
-	u8 align;
-	u8 oui[3];
-	u8 subns;
-	u8 pad;
-	u16 len;
-	u8 data[];
-} __packed;
-
 /**
  * enum ieee80211_conf_flags - configuration flags
  *
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index f63ed6b91d9b..0255c5745e1c 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -55,7 +55,7 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb,
 	/* After pulling radiotap header, clear all flags that indicate
 	 * info in skb->data.
 	 */
-	status->flag &= ~(RX_FLAG_RADIOTAP_VENDOR_DATA |
+	status->flag &= ~(RX_FLAG_RADIOTAP_TLV_AT_END |
 			  RX_FLAG_RADIOTAP_LSIG |
 			  RX_FLAG_RADIOTAP_HE_MU |
 			  RX_FLAG_RADIOTAP_HE);
@@ -126,9 +126,6 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 	/* allocate extra bitmaps */
 	if (status->chains)
 		len += 4 * hweight8(status->chains);
-	/* vendor presence bitmap */
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)
-		len += 4;
 
 	if (ieee80211_have_rx_timestamp(status)) {
 		len = ALIGN(len, 8);
@@ -190,34 +187,28 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local,
 		len += 2 * hweight8(status->chains);
 	}
 
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		struct ieee80211_vendor_radiotap *rtap;
-		int vendor_data_offset = 0;
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END) {
+		int tlv_offset = 0;
 
 		/*
 		 * The position to look at depends on the existence (or non-
 		 * existence) of other elements, so take that into account...
 		 */
 		if (status->flag & RX_FLAG_RADIOTAP_HE)
-			vendor_data_offset +=
+			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_he);
 		if (status->flag & RX_FLAG_RADIOTAP_HE_MU)
-			vendor_data_offset +=
+			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_he_mu);
 		if (status->flag & RX_FLAG_RADIOTAP_LSIG)
-			vendor_data_offset +=
+			tlv_offset +=
 				sizeof(struct ieee80211_radiotap_lsig);
 
-		rtap = (void *)&skb->data[vendor_data_offset];
+		/* ensure 4 byte alignment for TLV */
+		len = ALIGN(len, 4);
 
-		/* alignment for fixed 6-byte vendor data header */
-		len = ALIGN(len, 2);
-		/* vendor data header */
-		len += 6;
-		if (WARN_ON(rtap->align == 0))
-			rtap->align = 1;
-		len = ALIGN(len, rtap->align);
-		len += rtap->len + rtap->pad;
+		/* TLVs until the mac header */
+		len += skb_mac_header(skb) - &skb->data[tlv_offset];
 	}
 
 	return len;
@@ -313,9 +304,9 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 	u32 it_present_val;
 	u16 rx_flags = 0;
 	u16 channel_flags = 0;
+	u32 tlvs_len = 0;
 	int mpdulen, chain;
 	unsigned long chains = status->chains;
-	struct ieee80211_vendor_radiotap rtap = {};
 	struct ieee80211_radiotap_he he = {};
 	struct ieee80211_radiotap_he_mu he_mu = {};
 	struct ieee80211_radiotap_lsig lsig = {};
@@ -336,18 +327,17 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		skb_pull(skb, sizeof(lsig));
 	}
 
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		rtap = *(struct ieee80211_vendor_radiotap *)skb->data;
-		/* rtap.len and rtap.pad are undone immediately */
-		skb_pull(skb, sizeof(rtap) + rtap.len + rtap.pad);
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END) {
+		/* data points at the TLVs; all other info was pulled off */
+		tlvs_len = skb_mac_header(skb) - skb->data;
 	}
 
 	mpdulen = skb->len;
 	if (!(has_fcs && ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)))
 		mpdulen += FCS_LEN;
 
-	rthdr = skb_push(skb, rtap_len);
-	memset(rthdr, 0, rtap_len - rtap.len - rtap.pad);
+	rthdr = skb_push(skb, rtap_len - tlvs_len);
+	memset(rthdr, 0, rtap_len - tlvs_len);
 	it_present = &rthdr->it_present;
 
 	/* radiotap header, set always present flags */
@@ -369,13 +359,8 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 				 BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL);
 	}
 
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		it_present_val |= BIT(IEEE80211_RADIOTAP_VENDOR_NAMESPACE) |
-				  BIT(IEEE80211_RADIOTAP_EXT);
-		put_unaligned_le32(it_present_val, it_present);
-		it_present++;
-		it_present_val = rtap.present;
-	}
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END)
+		it_present_val |= BIT(IEEE80211_RADIOTAP_TLV);
 
 	put_unaligned_le32(it_present_val, it_present);
 
@@ -706,22 +691,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 		*pos++ = status->chain_signal[chain];
 		*pos++ = chain;
 	}
-
-	if (status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA) {
-		/* ensure 2 byte alignment for the vendor field as required */
-		if ((pos - (u8 *)rthdr) & 1)
-			*pos++ = 0;
-		*pos++ = rtap.oui[0];
-		*pos++ = rtap.oui[1];
-		*pos++ = rtap.oui[2];
-		*pos++ = rtap.subns;
-		put_unaligned_le16(rtap.len, pos);
-		pos += 2;
-		/* align the actual payload as requested */
-		while ((pos - (u8 *)rthdr) & (rtap.align - 1))
-			*pos++ = 0;
-		/* data (and possible padding) already follows */
-	}
 }
 
 static struct sk_buff *
@@ -797,6 +766,13 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	bool only_monitor = false;
 	unsigned int min_head_len;
 
+	if (WARN_ON_ONCE(status->flag & RX_FLAG_RADIOTAP_TLV_AT_END &&
+			 !skb_mac_header_was_set(origskb))) {
+		/* with this skb no way to know where frame payload starts */
+		dev_kfree_skb(origskb);
+		return NULL;
+	}
+
 	if (status->flag & RX_FLAG_RADIOTAP_HE)
 		rtap_space += sizeof(struct ieee80211_radiotap_he);
 
@@ -806,12 +782,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
 	if (status->flag & RX_FLAG_RADIOTAP_LSIG)
 		rtap_space += sizeof(struct ieee80211_radiotap_lsig);
 
-	if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) {
-		struct ieee80211_vendor_radiotap *rtap =
-			(void *)(origskb->data + rtap_space);
-
-		rtap_space += sizeof(*rtap) + rtap->len + rtap->pad;
-	}
+	if (status->flag & RX_FLAG_RADIOTAP_TLV_AT_END)
+		rtap_space += skb_mac_header(origskb) - &origskb->data[rtap_space];
 
 	min_head_len = rtap_space;
 
-- 
cgit v1.2.3


From 5383bfff5261422f843de72a4089da9b152291b0 Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Sat, 18 Feb 2023 01:50:05 +0800
Subject: wifi: mac80211: introduce ieee80211_refresh_tx_agg_session_timer()

This allows low-level drivers to refresh the tx agg session timer, usually
based on stats queried from the firmware. In particular, some mt76 devices
that support .net_fill_forward_path bypass mac80211, which leads to tx BA
session timeouts with clients that set a timeout in their AddBA response to
our request, even if our request is without a timeout.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/7c3f72eac1c34921cd84a462e60d71e125862152.1676616450.git.ryder.lee@mediatek.com
[slightly clarify commit message, add note about RCU]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 14 ++++++++++++++
 net/mac80211/agg-tx.c  | 17 +++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 3a43ce5fd4ec..722b99e54ad2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5973,6 +5973,20 @@ void ieee80211_queue_delayed_work(struct ieee80211_hw *hw,
 				  struct delayed_work *dwork,
 				  unsigned long delay);
 
+/**
+ * ieee80211_refresh_tx_agg_session_timer - Refresh a tx agg session timer.
+ * @sta: the station whose tx agg session timer is refreshed
+ * @tid: the TID to BA on.
+ *
+ * This function allows a low-level driver to refresh the tx agg session
+ * timer to maintain the BA session; the session level will still be
+ * managed by mac80211.
+ *
+ * Note: must be called in an RCU critical section.
+ */
+void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *sta,
+					    u16 tid);
+
 /**
  * ieee80211_start_tx_ba_session - Start a tx Block Ack session.
  * @sta: the station for which to start a BA session
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index f9514bacbd4a..3b651e7f5a73 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -554,6 +554,23 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	ieee80211_send_addba_with_timeout(sta, tid_tx);
 }
 
+void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *pubsta,
+					    u16 tid)
+{
+	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+	struct tid_ampdu_tx *tid_tx;
+
+	if (WARN_ON_ONCE(tid >= IEEE80211_NUM_TIDS))
+		return;
+
+	tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+	if (!tid_tx)
+		return;
+
+	tid_tx->last_tx = jiffies;
+}
+EXPORT_SYMBOL(ieee80211_refresh_tx_agg_session_timer);
+
 /*
  * After accepting the AddBA Response we activated a timer,
  * resetting it after each frame that we send.
-- 
cgit v1.2.3


From f4d1181e4759c9c6c97c86cda2cf2d1ddb6a74d2 Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Sat, 18 Feb 2023 01:48:59 +0800
Subject: wifi: mac80211: add EHT MU-MIMO related flags in ieee80211_bss_conf

Similar to VHT/HE, this is used to pass MU-MIMO configurations
from user space (e.g. hostapd) to the driver.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/8d9966c4c1e77cb1ade77d42bdc49905609192e9.1676628065.git.ryder.lee@mediatek.com
[move into combined if statement, reset on !eht]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  9 +++++++++
 net/mac80211/cfg.c     | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 722b99e54ad2..410cd3daaa59 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -674,6 +674,12 @@ struct ieee80211_fils_discovery {
  * @he_full_ul_mumimo: does this BSS support the reception (AP) or transmission
  *	(non-AP STA) of an HE TB PPDU on an RU that spans the entire PPDU
  *	bandwidth
+ * @eht_su_beamformer: in AP-mode, does this BSS enable operation as an EHT SU
+ *	beamformer
+ * @eht_su_beamformee: in AP-mode, does this BSS enable operation as an EHT SU
+ *	beamformee
+ * @eht_mu_beamformer: in AP-mode, does this BSS enable operation as an EHT MU
+ *	beamformer
  */
 struct ieee80211_bss_conf {
 	struct ieee80211_vif *vif;
@@ -761,6 +767,9 @@ struct ieee80211_bss_conf {
 	bool he_su_beamformee;
 	bool he_mu_beamformer;
 	bool he_full_ul_mumimo;
+	bool eht_su_beamformer;
+	bool eht_su_beamformee;
+	bool eht_mu_beamformer;
 };
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 2c8fa19c2f24..9789008626a5 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1299,6 +1299,22 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	if (params->eht_cap) {
 		link_conf->eht_puncturing = params->punct_bitmap;
 		changed |= BSS_CHANGED_EHT_PUNCTURING;
+
+		link_conf->eht_su_beamformer =
+			params->eht_cap->fixed.phy_cap_info[0] &
+				IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER;
+		link_conf->eht_su_beamformee =
+			params->eht_cap->fixed.phy_cap_info[0] &
+				IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE;
+		link_conf->eht_mu_beamformer =
+			params->eht_cap->fixed.phy_cap_info[7] &
+				(IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
+				 IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ |
+				 IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ);
+	} else {
+		link_conf->eht_su_beamformer = false;
+		link_conf->eht_su_beamformee = false;
+		link_conf->eht_mu_beamformer = false;
 	}
 
 	if (sdata->vif.type == NL80211_IFTYPE_AP &&
-- 
cgit v1.2.3


From 2ad7dd9425408bf0ca524102808059c05bad169e Mon Sep 17 00:00:00 2001
From: Ryder Lee <ryder.lee@mediatek.com>
Date: Sat, 18 Feb 2023 01:49:25 +0800
Subject: wifi: mac80211: add LDPC related flags in ieee80211_bss_conf

This is utilized to pass LDPC configurations from user space
(i.e. hostapd) to driver.

Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/1de696aaa34efd77a926eb657b8c0fda05aaa177.1676628065.git.ryder.lee@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  6 ++++++
 net/mac80211/cfg.c     | 11 +++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 410cd3daaa59..f12edca660ba 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -657,6 +657,9 @@ struct ieee80211_fils_discovery {
  *	write-protected by sdata_lock and local->mtx so holding either is fine
  *	for read access.
  * @color_change_color: the bss color that will be used after the change.
+ * @ht_ldpc: in AP mode, indicates interface has HT LDPC capability.
+ * @vht_ldpc: in AP mode, indicates interface has VHT LDPC capability.
+ * @he_ldpc: in AP mode, indicates interface has HE LDPC capability.
  * @vht_su_beamformer: in AP mode, does this BSS support operation as an VHT SU
  *	beamformer
  * @vht_su_beamformee: in AP mode, does this BSS support operation as an VHT SU
@@ -759,6 +762,9 @@ struct ieee80211_bss_conf {
 	bool color_change_active;
 	u8 color_change_color;
 
+	bool ht_ldpc;
+	bool vht_ldpc;
+	bool he_ldpc;
 	bool vht_su_beamformer;
 	bool vht_su_beamformee;
 	bool vht_mu_beamformer;
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 9789008626a5..760ad934f9e1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1252,7 +1252,15 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	prev_beacon_int = link_conf->beacon_int;
 	link_conf->beacon_int = params->beacon_interval;
 
+	if (params->ht_cap)
+		link_conf->ht_ldpc =
+			params->ht_cap->cap_info &
+				cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING);
+
 	if (params->vht_cap) {
+		link_conf->vht_ldpc =
+			params->vht_cap->vht_cap_info &
+				cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC);
 		link_conf->vht_su_beamformer =
 			params->vht_cap->vht_cap_info &
 				cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE);
@@ -1282,6 +1290,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	}
 
 	if (params->he_cap) {
+		link_conf->he_ldpc =
+			params->he_cap->phy_cap_info[1] &
+				IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD;
 		link_conf->he_su_beamformer =
 			params->he_cap->phy_cap_info[3] &
 				IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER;
-- 
cgit v1.2.3


From 6933486133ecf71bbe273d7ac72cfc4a51286af3 Mon Sep 17 00:00:00 2001
From: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Date: Thu, 12 Jan 2023 06:54:13 +0530
Subject: wifi: nl80211: Add support for randomizing TA of auth and deauth
 frames

Add support to use a random local address in authentication and
deauthentication frames sent to an unassociated peer when the driver
supports it.

The driver needs to configure receive behavior to accept frames with
the random transmit address specified in TX path authentication frames
while the frame exchange is pending, and such frames need to be
acknowledged similarly to frames sent to the local permanent address
when this random address functionality is used.

This capability allows use of randomized transmit address for PASN
authentication frames to improve privacy of WLAN clients.

Signed-off-by: Veerendranath Jakkam <quic_vjakkam@quicinc.com>
Link: https://lore.kernel.org/r/20230112012415.167556-2-quic_vjakkam@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  5 ++++
 net/wireless/mlme.c          | 55 +++++++++++++++++++++++++++++---------------
 2 files changed, 41 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index c8520c150f9c..9a0ac0363f1f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6348,6 +6348,10 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_SECURE_NAN: Device supports NAN Pairing which enables
  *	authentication, data encryption and message integrity.
  *
+ * @NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA: Device supports randomized TA
+ *	in authentication and deauthentication frames sent to unassociated peer
+ *	using @NL80211_CMD_FRAME.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -6418,6 +6422,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE,
 	NL80211_EXT_FEATURE_PUNCT,
 	NL80211_EXT_FEATURE_SECURE_NAN,
+	NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 81d3f40d6235..ac059cefbeb3 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -673,6 +673,39 @@ static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr)
 	return ether_addr_equal(addr, wdev_address(wdev));
 }
 
+static bool cfg80211_allowed_random_address(struct wireless_dev *wdev,
+					    const struct ieee80211_mgmt *mgmt)
+{
+	if (ieee80211_is_auth(mgmt->frame_control) ||
+	    ieee80211_is_deauth(mgmt->frame_control)) {
+		/* Allow random TA to be used with authentication and
+		 * deauthentication frames if the driver has indicated support.
+		 */
+		if (wiphy_ext_feature_isset(
+			    wdev->wiphy,
+			    NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA))
+			return true;
+	} else if (ieee80211_is_action(mgmt->frame_control) &&
+		   mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) {
+		/* Allow random TA to be used with Public Action frames if the
+		 * driver has indicated support.
+		 */
+		if (!wdev->connected &&
+		    wiphy_ext_feature_isset(
+			    wdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
+			return true;
+
+		if (wdev->connected &&
+		    wiphy_ext_feature_isset(
+			    wdev->wiphy,
+			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
+			return true;
+	}
+
+	return false;
+}
+
 int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			  struct wireless_dev *wdev,
 			  struct cfg80211_mgmt_tx_params *params, u64 *cookie)
@@ -774,25 +807,9 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev,
 			return err;
 	}
 
-	if (!cfg80211_allowed_address(wdev, mgmt->sa)) {
-		/* Allow random TA to be used with Public Action frames if the
-		 * driver has indicated support for this. Otherwise, only allow
-		 * the local address to be used.
-		 */
-		if (!ieee80211_is_action(mgmt->frame_control) ||
-		    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
-			return -EINVAL;
-		if (!wdev->connected &&
-		    !wiphy_ext_feature_isset(
-			    &rdev->wiphy,
-			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA))
-			return -EINVAL;
-		if (wdev->connected &&
-		    !wiphy_ext_feature_isset(
-			    &rdev->wiphy,
-			    NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED))
-			return -EINVAL;
-	}
+	if (!cfg80211_allowed_address(wdev, mgmt->sa) &&
+	    !cfg80211_allowed_random_address(wdev, mgmt))
+		return -EINVAL;
 
 	/* Transmit the management frame as requested by user space */
 	return rdev_mgmt_tx(rdev, wdev, params, cookie);
-- 
cgit v1.2.3


From 2d5bcdcda8799cf21f9ab84598c946dd320207a2 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Tue, 7 Mar 2023 08:14:06 -0700
Subject: bpf: Increase size of BTF_ID_LIST without CONFIG_DEBUG_INFO_BTF again

After commit 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr"),
clang builds without CONFIG_DEBUG_INFO_BTF warn:

  kernel/bpf/verifier.c:10298:24: warning: array index 16 is past the end of the array (that has type 'u32[16]' (aka 'unsigned int[16]')) [-Warray-bounds]
                                     meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
                                                     ^                  ~~~~~~~~~~~~~~~~~~~~~~~~
  kernel/bpf/verifier.c:9150:1: note: array 'special_kfunc_list' declared here
  BTF_ID_LIST(special_kfunc_list)
  ^
  include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST'
  #define BTF_ID_LIST(name) static u32 __maybe_unused name[16];
                            ^
  1 warning generated.

A warning of this nature was previously addressed by
commit beb3d47d1d3d ("bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set")
but there have been new kfuncs added since then.

Quadruple the size of the CONFIG_DEBUG_INFO_BTF=n definition so that
this problem is unlikely to show up for some time.

Link: https://github.com/ClangBuiltLinux/linux/issues/1810
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Tom Rix <trix@redhat.com>
Link: https://lore.kernel.org/r/20230307-bpf-kfuncs-warray-bounds-v1-1-00ad3191f3a6@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf_ids.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index 3a4f7cd882ca..00950cc03bff 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -204,7 +204,7 @@ extern struct btf_id_set8 name;
 
 #else
 
-#define BTF_ID_LIST(name) static u32 __maybe_unused name[16];
+#define BTF_ID_LIST(name) static u32 __maybe_unused name[64];
 #define BTF_ID(prefix, name)
 #define BTF_ID_FLAGS(prefix, name, ...)
 #define BTF_ID_UNUSED
-- 
cgit v1.2.3


From 90a5527d7686d3ebe0dd2a831356a6c7d7dc31bc Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:45:58 +0000
Subject: bpf: add new map ops ->map_mem_usage

Add a new map ops ->map_mem_usage to print the memory usage of a
bpf map.

This is a preparation for the followup change.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-2-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/syscall.c | 15 +++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d3456804f7aa..9059520bbb5e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -161,6 +161,8 @@ struct bpf_map_ops {
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);
 
+	u64 (*map_mem_usage)(const struct bpf_map *map);
+
 	/* BTF id of struct allocated by map_alloc */
 	int *map_btf_id;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ede5f987484f..7c96e68859ec 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -771,16 +771,15 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
 }
 
 #ifdef CONFIG_PROC_FS
-/* Provides an approximation of the map's memory footprint.
- * Used only to provide a backward compatibility and display
- * a reasonable "memlock" info.
- */
-static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
 {
 	unsigned long size;
 
-	size = round_up(map->key_size + bpf_map_value_size(map), 8);
+	if (map->ops->map_mem_usage)
+		return map->ops->map_mem_usage(map);
 
+	size = round_up(map->key_size + bpf_map_value_size(map), 8);
 	return round_up(map->max_entries * size, PAGE_SIZE);
 }
 
@@ -803,7 +802,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "max_entries:\t%u\n"
 		   "map_flags:\t%#x\n"
 		   "map_extra:\t%#llx\n"
-		   "memlock:\t%lu\n"
+		   "memlock:\t%llu\n"
 		   "map_id:\t%u\n"
 		   "frozen:\t%u\n",
 		   map->map_type,
@@ -812,7 +811,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->max_entries,
 		   map->map_flags,
 		   (unsigned long long)map->map_extra,
-		   bpf_map_memory_footprint(map),
+		   bpf_map_memory_usage(map),
 		   map->id,
 		   READ_ONCE(map->frozen));
 	if (type) {
-- 
cgit v1.2.3


From 7490b7f1c02ef825ef98f7230662049d4a464a21 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:46:11 +0000
Subject: bpf, net: bpf_local_storage memory usage

A new helper is introduced into bpf_local_storage map to calculate the
memory usage. This helper is also used by other maps like
bpf_cgrp_storage, bpf_inode_storage, bpf_task_storage, etc.

Note that currently the dynamically allocated storage elements are not
counted in the usage, since it will take extra runtime overhead in the
elements update or delete path. So let's put it aside now, and implement
it in the future when someone really needs it.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-15-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  1 +
 kernel/bpf/bpf_cgrp_storage.c     |  1 +
 kernel/bpf/bpf_inode_storage.c    |  1 +
 kernel/bpf/bpf_local_storage.c    | 10 ++++++++++
 kernel/bpf/bpf_task_storage.c     |  1 +
 net/core/bpf_sk_storage.c         |  1 +
 6 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 6d37a40cd90e..d934248b8e81 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -164,5 +164,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);
 
 void bpf_local_storage_free_rcu(struct rcu_head *rcu);
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
 
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 6cdf6d9ed91d..9ae07aedaf23 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -221,6 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = {
 	.map_update_elem = bpf_cgrp_storage_update_elem,
 	.map_delete_elem = bpf_cgrp_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = cgroup_storage_ptr,
 };
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 05f4c66c9089..43e2619c8167 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -223,6 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
 	.map_update_elem = bpf_fd_inode_storage_update_elem,
 	.map_delete_elem = bpf_fd_inode_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = inode_storage_ptr,
 };
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 3d320393a12c..d3ba3f2db640 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -685,6 +685,16 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 	return free_storage;
 }
 
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+	struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+	u64 usage = sizeof(*smap);
+
+	/* The dynamically allocated selems are not counted currently. */
+	usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+	return usage;
+}
+
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
 			    struct bpf_local_storage_cache *cache)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1e486055a523..20f942229f3c 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -335,6 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = {
 	.map_update_elem = bpf_pid_task_storage_update_elem,
 	.map_delete_elem = bpf_pid_task_storage_delete_elem,
 	.map_check_btf = bpf_local_storage_map_check_btf,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 	.map_btf_id = &bpf_local_storage_map_btf_id[0],
 	.map_owner_storage_ptr = task_storage_ptr,
 };
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index bb378c33f542..7a36353dbc22 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -324,6 +324,7 @@ const struct bpf_map_ops sk_storage_map_ops = {
 	.map_local_storage_charge = bpf_sk_storage_charge,
 	.map_local_storage_uncharge = bpf_sk_storage_uncharge,
 	.map_owner_storage_ptr = bpf_sk_storage_ptr,
+	.map_mem_usage = bpf_local_storage_map_mem_usage,
 };
 
 const struct bpf_func_proto bpf_sk_storage_get_proto = {
-- 
cgit v1.2.3


From b4fd0d672bca001632d7291b5b162b08e065b815 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:46:13 +0000
Subject: bpf, net: xskmap memory usage

A new helper is introduced to calculate xskmap memory usage.

The xskmap memory usage can be dynamically changed when we add or remove
a xsk_map_node. Hence we need to track the count of xsk_map_node to get
its memory usage.

The result as follows,
- before
10: xskmap  name count_map  flags 0x0
        key 4B  value 4B  max_entries 65536  memlock 524288B

- after
10: xskmap  name count_map  flags 0x0 <<< no elements case
        key 4B  value 4B  max_entries 65536  memlock 524608B

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-17-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp_sock.h |  1 +
 net/xdp/xskmap.c       | 13 +++++++++++++
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3057e1a4a11c..e96a1151ec75 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -38,6 +38,7 @@ struct xdp_umem {
 struct xsk_map {
 	struct bpf_map map;
 	spinlock_t lock; /* Synchronize map updates */
+	atomic_t count;
 	struct xdp_sock __rcu *xsk_map[];
 };
 
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 771d0fa90ef5..0c38d7175922 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -24,6 +24,7 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
 		return ERR_PTR(-ENOMEM);
 
 	bpf_map_inc(&map->map);
+	atomic_inc(&map->count);
 
 	node->map = map;
 	node->map_entry = map_entry;
@@ -32,8 +33,11 @@ static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
 
 static void xsk_map_node_free(struct xsk_map_node *node)
 {
+	struct xsk_map *map = node->map;
+
 	bpf_map_put(&node->map->map);
 	kfree(node);
+	atomic_dec(&map->count);
 }
 
 static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
@@ -85,6 +89,14 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 	return &m->map;
 }
 
+static u64 xsk_map_mem_usage(const struct bpf_map *map)
+{
+	struct xsk_map *m = container_of(map, struct xsk_map, map);
+
+	return struct_size(m, xsk_map, map->max_entries) +
+		   (u64)atomic_read(&m->count) * sizeof(struct xsk_map_node);
+}
+
 static void xsk_map_free(struct bpf_map *map)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
@@ -267,6 +279,7 @@ const struct bpf_map_ops xsk_map_ops = {
 	.map_update_elem = xsk_map_update_elem,
 	.map_delete_elem = xsk_map_delete_elem,
 	.map_check_btf = map_check_no_btf,
+	.map_mem_usage = xsk_map_mem_usage,
 	.map_btf_id = &xsk_map_btf_ids[0],
 	.map_redirect = xsk_map_redirect,
 };
-- 
cgit v1.2.3


From 9629363cd05642fe43aded44938adec067ad1da3 Mon Sep 17 00:00:00 2001
From: Yafang Shao <laoar.shao@gmail.com>
Date: Sun, 5 Mar 2023 12:46:14 +0000
Subject: bpf: offload map memory usage

A new helper is introduced to calculate offload map memory usage. But
currently the memory dynamically allocated in netdev dev_ops, like
nsim_map_update_elem, is not counted. Let's just put it aside now.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
Link: https://lore.kernel.org/r/20230305124615.12358-18-laoar.shao@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 6 ++++++
 kernel/bpf/offload.c | 6 ++++++
 kernel/bpf/syscall.c | 1 +
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9059520bbb5e..6792a7940e1e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2624,6 +2624,7 @@ static inline bool bpf_map_is_offloaded(struct bpf_map *map)
 
 struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
 void bpf_map_offload_map_free(struct bpf_map *map);
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map);
 int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 			      const union bpf_attr *kattr,
 			      union bpf_attr __user *uattr);
@@ -2695,6 +2696,11 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 {
 }
 
+static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+	return 0;
+}
+
 static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 					    const union bpf_attr *kattr,
 					    union bpf_attr __user *uattr)
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 0c85e06f7ea7..d9c9f45e3529 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -563,6 +563,12 @@ void bpf_map_offload_map_free(struct bpf_map *map)
 	bpf_map_area_free(offmap);
 }
 
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+	/* The memory dynamically allocated in netdev dev_ops is not counted */
+	return sizeof(struct bpf_offloaded_map);
+}
+
 int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
 {
 	struct bpf_offloaded_map *offmap = map_to_offmap(map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7c96e68859ec..053409d951d2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -105,6 +105,7 @@ const struct bpf_map_ops bpf_map_offload_ops = {
 	.map_alloc = bpf_map_offload_map_alloc,
 	.map_free = bpf_map_offload_map_free,
 	.map_check_btf = map_check_no_btf,
+	.map_mem_usage = bpf_map_offload_map_mem_usage,
 };
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
-- 
cgit v1.2.3


From 11a2638d120b9d998916efb6fc55c6422e469ffa Mon Sep 17 00:00:00 2001
From: Mordechay Goodstein <mordechay.goodstein@intel.com>
Date: Sun, 5 Mar 2023 14:16:19 +0200
Subject: wifi: radiotap: separate vendor TLV into header/content

To be able to use a general function later for any kind of
TLV, separate the vendor TLV header/content in the structs.

Signed-off-by: Mordechay Goodstein <mordechay.goodstein@intel.com>
Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
Link: https://lore.kernel.org/r/20230305124407.8ac5195bb3e6.I19ad99c1ad3108453aede64bddf6ef1a7c4a0b74@changeid
[separate from the original combined patch]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 16 ++++++++--------
 drivers/net/wireless/mac80211_hwsim.c         | 14 +++++++-------
 include/net/ieee80211_radiotap.h              | 20 ++++++++++++++------
 3 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
index 2db4f68becff..71a6555f90d9 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c
@@ -223,18 +223,18 @@ static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm,
 				    vendor_data_len);
 
 	/* Intel OUI */
-	radiotap->oui[0] = 0xf6;
-	radiotap->oui[1] = 0x54;
-	radiotap->oui[2] = 0x25;
+	radiotap->content.oui[0] = 0xf6;
+	radiotap->content.oui[1] = 0x54;
+	radiotap->content.oui[2] = 0x25;
 	/* radiotap sniffer config sub-namespace */
-	radiotap->oui_subtype = 1;
-	radiotap->vendor_type = 0;
+	radiotap->content.oui_subtype = 1;
+	radiotap->content.vendor_type = 0;
 	/* clear reserved field */
-	radiotap->reserved = 0;
+	radiotap->content.reserved = 0;
 	/* fill the data now */
-	memcpy(radiotap->data, &mvm->cur_aid, sizeof(mvm->cur_aid));
+	memcpy(radiotap->content.data, &mvm->cur_aid, sizeof(mvm->cur_aid));
 	/* and clear the padding */
-	memset(radiotap->data + vendor_data_len, 0, padding);
+	memset(radiotap->content.data + vendor_data_len, 0, padding);
 
 	rx_status->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 }
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 152617034d19..f4bdc243ea0d 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1556,14 +1556,14 @@ static void mac80211_hwsim_add_vendor_rtap(struct sk_buff *skb)
 				sizeof(vendor_data));
 	rtap->type = cpu_to_le16(IEEE80211_RADIOTAP_VENDOR_NAMESPACE);
 
-	rtap->oui[0] = HWSIM_RADIOTAP_OUI[0];
-	rtap->oui[1] = HWSIM_RADIOTAP_OUI[1];
-	rtap->oui[2] = HWSIM_RADIOTAP_OUI[2];
-	rtap->oui_subtype = 127;
+	rtap->content.oui[0] = HWSIM_RADIOTAP_OUI[0];
+	rtap->content.oui[1] = HWSIM_RADIOTAP_OUI[1];
+	rtap->content.oui[2] = HWSIM_RADIOTAP_OUI[2];
+	rtap->content.oui_subtype = 127;
 	/* clear reserved field */
-	rtap->reserved = 0;
-	rtap->vendor_type = 0;
-	memcpy(rtap->data, vendor_data, sizeof(vendor_data));
+	rtap->content.reserved = 0;
+	rtap->content.vendor_type = 0;
+	memcpy(rtap->content.data, vendor_data, sizeof(vendor_data));
 
 	IEEE80211_SKB_RXCB(skb)->flag |= RX_FLAG_RADIOTAP_TLV_AT_END;
 #endif
diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 95436686d3fe..f980a72f2ce6 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -370,18 +370,14 @@ struct ieee80211_radiotap_tlv {
 } __packed;
 
 /**
- * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information
- * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE
- * @len: length of data
+ * struct ieee80211_radiotap_vendor_content - radiotap vendor data content
  * @oui: radiotap vendor namespace OUI
  * @oui_subtype: radiotap vendor sub namespace
  * @vendor_type: radiotap vendor type
  * @reserved: should always be set to zero (to avoid leaking memory)
  * @data: the actual vendor namespace data
  */
-struct ieee80211_radiotap_vendor_tlv {
-	__le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */
-	__le16 len;
+struct ieee80211_radiotap_vendor_content {
 	u8 oui[3];
 	u8 oui_subtype;
 	__le16 vendor_type;
@@ -389,6 +385,18 @@ struct ieee80211_radiotap_vendor_tlv {
 	u8 data[];
 } __packed;
 
+/**
+ * struct ieee80211_radiotap_vendor_tlv - vendor radiotap data information
+ * @type: should always be set to IEEE80211_RADIOTAP_VENDOR_NAMESPACE
+ * @len: length of data
+ * @content: vendor content see @ieee80211_radiotap_vendor_content
+ */
+struct ieee80211_radiotap_vendor_tlv {
+	__le16 type; /* IEEE80211_RADIOTAP_VENDOR_NAMESPACE */
+	__le16 len;
+	struct ieee80211_radiotap_vendor_content content;
+};
+
 /* ieee80211_radiotap_eht_usig - content of U-SIG tlv (type 33)
  * see www.radiotap.org/fields/U-SIG.html for details
  */
-- 
cgit v1.2.3


From 0194b64578e905dc8f112e641a71c306bd58ddde Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 6 Mar 2023 22:51:35 +0100
Subject: net: phy: improve phy_read_poll_timeout

cond sometimes is (val & MASK), which may result in a false positive
if val is a negative errno. We shouldn't evaluate cond if val < 0.
This has no functional impact here, but it's not nice.
Therefore switch order of the checks.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/6d8274ac-4344-23b4-d9a3-cad4c39517d4@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/phy.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 36bf0bbc8efa..fefd5091bc24 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1130,16 +1130,15 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum)
 #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \
 				timeout_us, sleep_before_read) \
 ({ \
-	int __ret = read_poll_timeout(phy_read, val, (cond) || val < 0, \
+	int __ret = read_poll_timeout(phy_read, val, val < 0 || (cond), \
 		sleep_us, timeout_us, sleep_before_read, phydev, regnum); \
-	if (val <  0) \
+	if (val < 0) \
 		__ret = val; \
 	if (__ret) \
 		phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \
 	__ret; \
 })
 
-
 /**
  * __phy_read - convenience function for reading a given PHY register
  * @phydev: the phy_device struct
-- 
cgit v1.2.3


From 40bbae583ec38ea31e728bf42a4ea72bded22ab6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 6 Mar 2023 20:43:13 +0000
Subject: net: remove enum skb_free_reason

enum skb_drop_reason is more generic, we can adopt it instead.

Provide dev_kfree_skb_irq_reason() and dev_kfree_skb_any_reason().

This means drivers can use more precise drop reasons if they want to.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Yunsheng Lin <linyunsheng@huawei.com>
Link: https://lore.kernel.org/r/20230306204313.10492-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 18 +++++++-----------
 net/core/dev.c            | 20 +++++++++-----------
 2 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6a14b7b11766..ee483071cf59 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,7 @@
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
+#include <net/dropreason.h>
 
 struct netpoll_info;
 struct device;
@@ -3804,13 +3805,8 @@ static inline unsigned int get_netdev_rx_queue_index(
 
 int netif_get_num_default_rss_queues(void);
 
-enum skb_free_reason {
-	SKB_REASON_CONSUMED,
-	SKB_REASON_DROPPED,
-};
-
-void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason);
-void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason);
 
 /*
  * It is not allowed to call kfree_skb() or consume_skb() from hardware
@@ -3833,22 +3829,22 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason);
  */
 static inline void dev_kfree_skb_irq(struct sk_buff *skb)
 {
-	__dev_kfree_skb_irq(skb, SKB_REASON_DROPPED);
+	dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 }
 
 static inline void dev_consume_skb_irq(struct sk_buff *skb)
 {
-	__dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED);
+	dev_kfree_skb_irq_reason(skb, SKB_CONSUMED);
 }
 
 static inline void dev_kfree_skb_any(struct sk_buff *skb)
 {
-	__dev_kfree_skb_any(skb, SKB_REASON_DROPPED);
+	dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 }
 
 static inline void dev_consume_skb_any(struct sk_buff *skb)
 {
-	__dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
+	dev_kfree_skb_any_reason(skb, SKB_CONSUMED);
 }
 
 u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
diff --git a/net/core/dev.c b/net/core/dev.c
index 253584777101..c7853192563d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3075,7 +3075,7 @@ void __netif_schedule(struct Qdisc *q)
 EXPORT_SYMBOL(__netif_schedule);
 
 struct dev_kfree_skb_cb {
-	enum skb_free_reason reason;
+	enum skb_drop_reason reason;
 };
 
 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
@@ -3108,7 +3108,7 @@ void netif_tx_wake_queue(struct netdev_queue *dev_queue)
 }
 EXPORT_SYMBOL(netif_tx_wake_queue);
 
-void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	unsigned long flags;
 
@@ -3128,18 +3128,16 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL(__dev_kfree_skb_irq);
+EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
 
-void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	if (in_hardirq() || irqs_disabled())
-		__dev_kfree_skb_irq(skb, reason);
-	else if (unlikely(reason == SKB_REASON_DROPPED))
-		kfree_skb(skb);
+		dev_kfree_skb_irq_reason(skb, reason);
 	else
-		consume_skb(skb);
+		kfree_skb_reason(skb, reason);
 }
-EXPORT_SYMBOL(__dev_kfree_skb_any);
+EXPORT_SYMBOL(dev_kfree_skb_any_reason);
 
 
 /**
@@ -5020,11 +5018,11 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 			clist = clist->next;
 
 			WARN_ON(refcount_read(&skb->users));
-			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
+			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
 				trace_consume_skb(skb, net_tx_action);
 			else
 				trace_kfree_skb(skb, net_tx_action,
-						SKB_DROP_REASON_NOT_SPECIFIED);
+						get_kfree_skb_cb(skb)->reason);
 
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
-- 
cgit v1.2.3


From 4386b921857793440ebd4db3d6b70639149c7074 Mon Sep 17 00:00:00 2001
From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Date: Fri, 24 Feb 2023 10:52:51 +0100
Subject: netfilter: bridge: introduce broute meta statement

nftables equivalent for ebtables -t broute.

Implement broute meta statement to set br_netfilter_broute flag
in skb to force a packet to be routed instead of being bridged.

Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 +
 net/bridge/netfilter/nft_meta_bridge.c   | 71 ++++++++++++++++++++++++++++++--
 2 files changed, 70 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ff677f3a6cad..9c6f02c26054 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -931,6 +931,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_TIME_HOUR: hour of day (in seconds)
  * @NFT_META_SDIF: slave device interface index
  * @NFT_META_SDIFNAME: slave device interface name
+ * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -969,6 +970,7 @@ enum nft_meta_keys {
 	NFT_META_TIME_HOUR,
 	NFT_META_SDIF,
 	NFT_META_SDIFNAME,
+	NFT_META_BRI_BROUTE,
 	__NFT_META_IIFTYPE,
 };
 
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index c3ecd77e25cb..bd4d1b4d745f 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -8,6 +8,9 @@
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nft_meta.h>
 #include <linux/if_bridge.h>
+#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+
+#include "../br_private.h"
 
 static const struct net_device *
 nft_meta_get_bridge(const struct net_device *dev)
@@ -102,6 +105,50 @@ static const struct nft_expr_ops nft_meta_bridge_get_ops = {
 	.reduce		= nft_meta_get_reduce,
 };
 
+static void nft_meta_bridge_set_eval(const struct nft_expr *expr,
+				     struct nft_regs *regs,
+				     const struct nft_pktinfo *pkt)
+{
+	const struct nft_meta *meta = nft_expr_priv(expr);
+	u32 *sreg = &regs->data[meta->sreg];
+	struct sk_buff *skb = pkt->skb;
+	u8 value8;
+
+	switch (meta->key) {
+	case NFT_META_BRI_BROUTE:
+		value8 = nft_reg_load8(sreg);
+		BR_INPUT_SKB_CB(skb)->br_netfilter_broute = !!value8;
+		break;
+	default:
+		nft_meta_set_eval(expr, regs, pkt);
+	}
+}
+
+static int nft_meta_bridge_set_init(const struct nft_ctx *ctx,
+				    const struct nft_expr *expr,
+				    const struct nlattr * const tb[])
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int len;
+	int err;
+
+	priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+	switch (priv->key) {
+	case NFT_META_BRI_BROUTE:
+		len = sizeof(u8);
+		break;
+	default:
+		return nft_meta_set_init(ctx, expr, tb);
+	}
+
+	priv->len = len;
+	err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
 static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
 				       const struct nft_expr *expr)
 {
@@ -120,15 +167,33 @@ static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
 	return false;
 }
 
+static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx,
+					const struct nft_expr *expr,
+					const struct nft_data **data)
+{
+	struct nft_meta *priv = nft_expr_priv(expr);
+	unsigned int hooks;
+
+	switch (priv->key) {
+	case NFT_META_BRI_BROUTE:
+		hooks = 1 << NF_BR_PRE_ROUTING;
+		break;
+	default:
+		return nft_meta_set_validate(ctx, expr, data);
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
 static const struct nft_expr_ops nft_meta_bridge_set_ops = {
 	.type		= &nft_meta_bridge_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_meta)),
-	.eval		= nft_meta_set_eval,
-	.init		= nft_meta_set_init,
+	.eval		= nft_meta_bridge_set_eval,
+	.init		= nft_meta_bridge_set_init,
 	.destroy	= nft_meta_set_destroy,
 	.dump		= nft_meta_set_dump,
 	.reduce		= nft_meta_bridge_set_reduce,
-	.validate	= nft_meta_set_validate,
+	.validate	= nft_meta_bridge_set_validate,
 };
 
 static const struct nft_expr_ops *
-- 
cgit v1.2.3


From 10369080454d87ee5b2db211ce947cb3118f0e13 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 Mar 2023 14:59:59 +0000
Subject: net: reclaim skb->scm_io_uring bit

Commit 0091bfc81741 ("io_uring/af_unix: defer registered
files gc to io_uring release") added one bit to struct sk_buff.

This structure is critical for networking, and we try very hard
to not add bloat on it, unless absolutely required.

For instance, we can use a specific destructor as a wrapper
around unix_destruct_scm(), to identify skbs that unix_gc()
has to special case.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Pavel Begunkov <asml.silence@gmail.com>
Cc: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Cc: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 include/net/af_unix.h  | 1 +
 io_uring/rsrc.c        | 3 +--
 net/unix/garbage.c     | 2 +-
 net/unix/scm.c         | 6 ++++++
 5 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff7ad331fb82..fe661011644b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -810,7 +810,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@csum_level: indicates the number of consecutive checksums found in
  *		the packet minus one that have been verified as
  *		CHECKSUM_UNNECESSARY (max 3)
- *	@scm_io_uring: SKB holds io_uring registered files
  *	@dst_pending_confirm: need to confirm neighbour
  *	@decrypted: Decrypted SKB
  *	@slow_gro: state present at GRO time, slower prepare step required
@@ -989,7 +988,6 @@ struct sk_buff {
 #endif
 	__u8			slow_gro:1;
 	__u8			csum_not_inet:1;
-	__u8			scm_io_uring:1;
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 480fa579787e..45ebde587138 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -11,6 +11,7 @@
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_destruct_scm(struct sk_buff *skb);
+void io_uring_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a59fc02de598..27ceda3b50cf 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -867,8 +867,7 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
 
 		UNIXCB(skb).fp = fpl;
 		skb->sk = sk;
-		skb->scm_io_uring = 1;
-		skb->destructor = unix_destruct_scm;
+		skb->destructor = io_uring_destruct_scm;
 		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 	}
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index dc2763540393..2405f0f9af31 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -305,7 +305,7 @@ void unix_gc(void)
 	 * release.path eventually putting registered files.
 	 */
 	skb_queue_walk_safe(&hitlist, skb, next_skb) {
-		if (skb->scm_io_uring) {
+		if (skb->destructor == io_uring_destruct_scm) {
 			__skb_unlink(skb, &hitlist);
 			skb_queue_tail(&skb->sk->sk_receive_queue, skb);
 		}
diff --git a/net/unix/scm.c b/net/unix/scm.c
index aa27a02478dc..f9152881d77f 100644
--- a/net/unix/scm.c
+++ b/net/unix/scm.c
@@ -152,3 +152,9 @@ void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 EXPORT_SYMBOL(unix_destruct_scm);
+
+void io_uring_destruct_scm(struct sk_buff *skb)
+{
+	unix_destruct_scm(skb);
+}
+EXPORT_SYMBOL(io_uring_destruct_scm);
-- 
cgit v1.2.3


From 28e144cf5f72ce1c304571bc448e37c27495903a Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 7 Mar 2023 16:31:30 -0500
Subject: netfilter: move br_nf_check_hbh_len to utils

Rename br_nf_check_hbh_len() to nf_ip6_check_hbh_len() and move it
to netfilter utils, so that it can be used by other modules, like
ovs and tc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter_ipv6.h |  2 ++
 net/bridge/br_netfilter_ipv6.c | 55 +-----------------------------------------
 net/netfilter/utils.c          | 52 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 48314ade1506..7834c0be2831 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -197,6 +197,8 @@ static inline int nf_cookie_v6_check(const struct ipv6hdr *iph,
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 			unsigned int dataoff, u_int8_t protocol);
 
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen);
+
 int ipv6_netfilter_init(void);
 void ipv6_netfilter_fini(void);
 
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index a0d6dfb3e255..550039dfc31a 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -40,59 +40,6 @@
 #include <linux/sysctl.h>
 #endif
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
- * anyway
- */
-static int br_nf_check_hbh_len(struct sk_buff *skb, u32 *plen)
-{
-	int len, off = sizeof(struct ipv6hdr);
-	unsigned char *nh;
-
-	if (!pskb_may_pull(skb, off + 8))
-		return -1;
-	nh = (unsigned char *)(ipv6_hdr(skb) + 1);
-	len = (nh[1] + 1) << 3;
-
-	if (!pskb_may_pull(skb, off + len))
-		return -1;
-	nh = skb_network_header(skb);
-
-	off += 2;
-	len -= 2;
-	while (len > 0) {
-		int optlen;
-
-		if (nh[off] == IPV6_TLV_PAD1) {
-			off++;
-			len--;
-			continue;
-		}
-		if (len < 2)
-			return -1;
-		optlen = nh[off + 1] + 2;
-		if (optlen > len)
-			return -1;
-
-		if (nh[off] == IPV6_TLV_JUMBO) {
-			u32 pkt_len;
-
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				return -1;
-			pkt_len = ntohl(*(__be32 *)(nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				return -1;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				return -1;
-			*plen = pkt_len;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-
-	return len ? -1 : 0;
-}
-
 int br_validate_ipv6(struct net *net, struct sk_buff *skb)
 {
 	const struct ipv6hdr *hdr;
@@ -112,7 +59,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
 		goto inhdr_error;
 
 	pkt_len = ntohs(hdr->payload_len);
-	if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb, &pkt_len))
+	if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len))
 		goto drop;
 
 	if (pkt_len + ip6h_len > skb->len) {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 2182d361e273..acef4155f0da 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -215,3 +215,55 @@ int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
 	}
 	return ret;
 }
+
+/* Only get and check the lengths, not do any hop-by-hop stuff. */
+int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen)
+{
+	int len, off = sizeof(struct ipv6hdr);
+	unsigned char *nh;
+
+	if (!pskb_may_pull(skb, off + 8))
+		return -ENOMEM;
+	nh = (unsigned char *)(ipv6_hdr(skb) + 1);
+	len = (nh[1] + 1) << 3;
+
+	if (!pskb_may_pull(skb, off + len))
+		return -ENOMEM;
+	nh = skb_network_header(skb);
+
+	off += 2;
+	len -= 2;
+	while (len > 0) {
+		int optlen;
+
+		if (nh[off] == IPV6_TLV_PAD1) {
+			off++;
+			len--;
+			continue;
+		}
+		if (len < 2)
+			return -EBADMSG;
+		optlen = nh[off + 1] + 2;
+		if (optlen > len)
+			return -EBADMSG;
+
+		if (nh[off] == IPV6_TLV_JUMBO) {
+			u32 pkt_len;
+
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				return -EBADMSG;
+			pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				return -EBADMSG;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				return -EBADMSG;
+			*plen = pkt_len;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+
+	return len ? -EBADMSG : 0;
+}
+EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
-- 
cgit v1.2.3


From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:15 -0800
Subject: bpf: add iterator kfuncs registration and validation logic

Add ability to register kfuncs that implement BPF open-coded iterator
contract and enforce naming and function proto convention. Enforcement
happens at the time of kfunc registration and significantly simplifies
the rest of iterators logic in the verifier.

More details follow in subsequent patches, but we enforce the following
conditions.

All kfuncs (constructor, next, destructor) have to be named consistently
as bpf_iter_<type>_{new,next,destroy}(), respectively. <type> represents
iterator type, and iterator state should be represented as a matching
`struct bpf_iter_<type>` state type. Also, all iter kfuncs should have
a pointer to this `struct bpf_iter_<type>` as the very first argument.

Additionally:
  - Constructor, i.e., bpf_iter_<type>_new(), can have arbitrary extra
  number of arguments. Return type is not enforced either.
  - Next method, i.e., bpf_iter_<type>_next(), has to return a pointer
  type and should have exactly one argument: `struct bpf_iter_<type> *`
  (const/volatile/restrict and typedefs are ignored).
  - Destructor, i.e., bpf_iter_<type>_destroy(), should return void and
  should have exactly one argument, similar to the next method.
  - struct bpf_iter_<type> size is enforced to be positive and
  a multiple of 8 bytes (to fit stack slots correctly).

Such strictness and consistency allows to build generic helpers
abstracting important, but boilerplate, details to be able to use
open-coded iterators effectively and ergonomically (see bpf_for_each()
in subsequent patches). It also simplifies the verifier logic in some
places. At the same time, this doesn't hurt generality of possible
iterator implementations. Win-win.

Constructor kfunc is marked with a new KF_ITER_NEW flag, next method is
marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course),
while destructor kfunc is marked as KF_ITER_DESTROY.

Additionally, we add a trivial kfunc name validation: it should be
a valid non-NULL and non-empty string.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |   2 +
 include/linux/btf.h          |   4 ++
 kernel/bpf/btf.c             | 112 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 117 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 18538bad2b8c..e2dc7f064449 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -59,6 +59,8 @@ struct bpf_active_lock {
 	u32 id;
 };
 
+#define ITER_PREFIX "bpf_iter_"
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 556b3e2e7471..1bba0827e8c4 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -71,6 +71,10 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 #define KF_RCU          (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
+/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */
+#define KF_ITER_NEW     (1 << 8) /* kfunc implements BPF iter constructor */
+#define KF_ITER_NEXT    (1 << 9) /* kfunc implements BPF iter next method */
+#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */
 
 /*
  * Tag marking a kernel function as a kfunc. This is meant to minimize the
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a8cb09e5973b..71758cd15b07 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7596,6 +7596,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
 BTF_TRACING_TYPE_xxx
 #undef BTF_TRACING_TYPE
 
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+				 const struct btf_type *func, u32 func_flags)
+{
+	u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+	const char *name, *sfx, *iter_name;
+	const struct btf_param *arg;
+	const struct btf_type *t;
+	char exp_name[128];
+	u32 nr_args;
+
+	/* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+	if (!flags || (flags & (flags - 1)))
+		return -EINVAL;
+
+	/* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+	nr_args = btf_type_vlen(func);
+	if (nr_args < 1)
+		return -EINVAL;
+
+	arg = &btf_params(func)[0];
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!t || !btf_type_is_ptr(t))
+		return -EINVAL;
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+	if (!t || !__btf_type_is_struct(t))
+		return -EINVAL;
+
+	name = btf_name_by_offset(btf, t->name_off);
+	if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+		return -EINVAL;
+
+	/* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+	 * fit nicely in stack slots
+	 */
+	if (t->size == 0 || (t->size % 8))
+		return -EINVAL;
+
+	/* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+	 * naming pattern
+	 */
+	iter_name = name + sizeof(ITER_PREFIX) - 1;
+	if (flags & KF_ITER_NEW)
+		sfx = "new";
+	else if (flags & KF_ITER_NEXT)
+		sfx = "next";
+	else /* (flags & KF_ITER_DESTROY) */
+		sfx = "destroy";
+
+	snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+	if (strcmp(func_name, exp_name))
+		return -EINVAL;
+
+	/* only iter constructor should have extra arguments */
+	if (!(flags & KF_ITER_NEW) && nr_args != 1)
+		return -EINVAL;
+
+	if (flags & KF_ITER_NEXT) {
+		/* bpf_iter_<type>_next() should return pointer */
+		t = btf_type_skip_modifiers(btf, func->type, NULL);
+		if (!t || !btf_type_is_ptr(t))
+			return -EINVAL;
+	}
+
+	if (flags & KF_ITER_DESTROY) {
+		/* bpf_iter_<type>_destroy() should return void */
+		t = btf_type_by_id(btf, func->type);
+		if (!t || !btf_type_is_void(t))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+	const struct btf_type *func;
+	const char *func_name;
+	int err;
+
+	/* any kfunc should be FUNC -> FUNC_PROTO */
+	func = btf_type_by_id(btf, func_id);
+	if (!func || !btf_type_is_func(func))
+		return -EINVAL;
+
+	/* sanity check kfunc name */
+	func_name = btf_name_by_offset(btf, func->name_off);
+	if (!func_name || !func_name[0])
+		return -EINVAL;
+
+	func = btf_type_by_id(btf, func->type);
+	if (!func || !btf_type_is_func_proto(func))
+		return -EINVAL;
+
+	if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+		err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 /* Kernel Function (kfunc) BTF ID set registration API */
 
 static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7772,7 +7874,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 				       const struct btf_kfunc_id_set *kset)
 {
 	struct btf *btf;
-	int ret;
+	int ret, i;
 
 	btf = btf_get_module_btf(kset->owner);
 	if (!btf) {
@@ -7789,7 +7891,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
+	for (i = 0; i < kset->set->cnt; i++) {
+		ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+					     kset->set->pairs[i].flags);
+		if (ret)
+			goto err_out;
+	}
+
 	ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
 	btf_put(btf);
 	return ret;
 }
-- 
cgit v1.2.3


From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:16 -0800
Subject: bpf: add support for open-coded iterator loops

Teach verifier about the concept of the open-coded (or inline) iterators.

This patch adds generic iterator loop verification logic, new STACK_ITER
stack slot type to contain iterator state, and necessary kfunc plumbing
for iterator's constructor, destructor and next methods. Next patch
implements first specific iterator (numbers iterator for implementing
for() loop logic). Such split allows to have more focused commits for
verifier logic and a separate commit that we could point to later when
demonstrating what it takes to add a new kind of iterator.

Each kind of iterator has its own associated struct bpf_iter_<type>,
where <type> denotes a specific type of iterator. struct bpf_iter_<type>
state is supposed to live on BPF program stack, so there will be no way
to change its size later on without breaking backwards compatibility, so
choose wisely! But given this struct is specific to a given <type> of
iterator, this allows a lot of flexibility: simple iterators could be
fine with just one stack slot (8 bytes), like numbers iterator in the
next patch, while some other more complicated iterators might need way
more to keep their iterator state. Either way, such design allows to
avoid runtime memory allocations, which otherwise would be necessary if
we fixed on-the-stack size and it turned out to be too small for a given
iterator implementation.

The way BPF verifier logic is implemented, there are no artificial
restrictions on a number of active iterators, it should work correctly
using multiple active iterators at the same time. This also means you
can have multiple nested iteration loops. struct bpf_iter_<type>
reference can be safely passed to subprograms as well.

General flow is easiest to demonstrate with a simple example using
number iterator implemented in next patch. Here's the simplest possible
loop:

  struct bpf_iter_num it;
  int *v;

  bpf_iter_num_new(&it, 2, 5);
  while ((v = bpf_iter_num_next(&it))) {
      bpf_printk("X = %d", *v);
  }
  bpf_iter_num_destroy(&it);

Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is
exclusive and is not returned. This matches similar APIs (e.g., slices
in Go or Rust) that implement a range of elements, where end index is
non-inclusive.

In the above example, we see a trio of functions:
  - constructor, bpf_iter_num_new(), which initializes iterator state
  (struct bpf_iter_num it) on the stack. If any of the input arguments
  are invalid, constructor should make sure to still initialize it such
  that subsequent bpf_iter_num_next() calls will return NULL. I.e., on
  error, return error and construct empty iterator.
  - next method, bpf_iter_num_next(), which accepts pointer to iterator
  state and produces an element. Next method should always return
  a pointer. The contract with the BPF verifier is that the next method will
  always eventually return NULL when elements are exhausted. Once NULL is
  returned, subsequent next calls should keep returning NULL. In the
  case of numbers iterator, bpf_iter_num_next() returns a pointer to an int
  (storage for this integer is inside the iterator state itself),
  which can be dereferenced after corresponding NULL check.
  - once done with the iterator, it's mandated that user cleans up its
  state with the call to destructor, bpf_iter_num_destroy() in this
  case. Destructor frees up any resources and marks stack space used by
  struct bpf_iter_num as usable for something else.

Any other iterator implementation will have to implement at least these
three methods. It is enforced that for any given type of iterator only
applicable constructor/destructor/next are callable. I.e., verifier
ensures you can't pass number iterator state into, say, cgroup
iterator's next method.

It is important to keep the naming pattern consistent to be able to
create generic macros to help with BPF iter usability. E.g., one
of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h
in selftests, which allows to utilize iterator "trio" nicely without
having to code the above somewhat tedious loop explicitly every time.
This is enforced at kfunc registration point by one of the previous
patches in this series.

At the implementation level, iterator state tracking for verification
purposes is very similar to dynptr. We add STACK_ITER stack slot type,
reserve necessary number of slots, depending on
sizeof(struct bpf_iter_<type>), and keep track of necessary extra state
in the "main" slot, which is marked with non-zero ref_obj_id. Other
slots are also marked as STACK_ITER, but have zero ref_obj_id. This is
simpler than having a separate "is_first_slot" flag.

Another big distinction is that STACK_ITER is *always refcounted*, which
simplifies implementation without sacrificing usability. So no need for
extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new
constructors, etc. Keeping it simple here.

As far as the verification logic goes, there are two extensive comments:
in process_iter_next_call() and iter_active_depths_differ() explaining
some important and sometimes subtle aspects. Please refer to them for
details.

But from 10,000-foot point of view, next methods are the points of
forking a verification state, which are conceptually similar to what
verifier is doing when validating conditional jump. We branch out at
a `call bpf_iter_<type>_next` instruction and simulate two outcomes:
NULL (iteration is done) and non-NULL (new element is returned). NULL is
simulated first and is supposed to reach exit without looping. After
that non-NULL case is validated and it either reaches exit (for trivial
examples with no real loop), or reaches another `call bpf_iter_<type>_next`
instruction with the state equivalent to already (partially) validated
one. State equivalency at that point means we technically are going to
be looping forever without "breaking out" of the established "state
envelope" (i.e., subsequent iterations don't add any new knowledge or
constraints to the verifier state, so running 1, 2, 10, or a million of
them doesn't matter). But taking into account the contract stating that
iterator next method *has to* return NULL eventually, we can conclude
that loop body is safe and will eventually terminate. Given we validated
logic outside of the loop (NULL case), and concluded that loop body is
safe (though potentially looping many times), verifier can claim safety
of the overall program logic.

The rest of the patch is necessary plumbing for state tracking, marking,
validation, and necessary further kfunc plumbing to allow implementing
iterator constructor, destructor, and next methods.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  23 ++
 kernel/bpf/verifier.c        | 595 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 610 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e2dc7f064449..0c052bc79940 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -61,6 +61,12 @@ struct bpf_active_lock {
 
 #define ITER_PREFIX "bpf_iter_"
 
+enum bpf_iter_state {
+	BPF_ITER_STATE_INVALID, /* for non-first slot */
+	BPF_ITER_STATE_ACTIVE,
+	BPF_ITER_STATE_DRAINED,
+};
+
 struct bpf_reg_state {
 	/* Ordering of fields matters.  See states_equal() */
 	enum bpf_reg_type type;
@@ -105,6 +111,18 @@ struct bpf_reg_state {
 			bool first_slot;
 		} dynptr;
 
+		/* For bpf_iter stack slots */
+		struct {
+			/* BTF container and BTF type ID describing
+			 * struct bpf_iter_<type> of an iterator state
+			 */
+			struct btf *btf;
+			u32 btf_id;
+			/* packing following two fields to fit iter state into 16 bytes */
+			enum bpf_iter_state state:2;
+			int depth:30;
+		} iter;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -143,6 +161,8 @@ struct bpf_reg_state {
 	 * same reference to the socket, to determine proper reference freeing.
 	 * For stack slots that are dynptrs, this is used to track references to
 	 * the dynptr to determine proper reference freeing.
+	 * Similarly to dynptrs, we use ID to track "belonging" of a reference
+	 * to a specific instance of bpf_iter.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -213,9 +233,11 @@ enum bpf_stack_slot_type {
 	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
 	 */
 	STACK_DYNPTR,
+	STACK_ITER,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+
 #define BPF_DYNPTR_SIZE		sizeof(struct bpf_dynptr_kern)
 #define BPF_DYNPTR_NR_SLOTS		(BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
@@ -450,6 +472,7 @@ struct bpf_insn_aux_data {
 	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */
+	bool is_iter_next; /* bpf_iter_<type>_next() kfunc call */
 	u8 alu_state; /* used in combination with alu_limit */
 
 	/* below fields are initialized once */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d40fba6a1c0..45a082284464 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -302,6 +302,10 @@ struct bpf_kfunc_call_arg_meta {
 		enum bpf_dynptr_type type;
 		u32 id;
 	} initialized_dynptr;
+	struct {
+		u8 spi;
+		u8 frameno;
+	} iter;
 	u64 mem_size;
 };
 
@@ -668,6 +672,7 @@ static char slot_type_char[] = {
 	[STACK_MISC]	= 'm',
 	[STACK_ZERO]	= '0',
 	[STACK_DYNPTR]	= 'd',
+	[STACK_ITER]	= 'i',
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -742,6 +747,11 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re
 	return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
 }
 
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+	return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
 static const char *kernel_type_name(const struct btf* btf, u32 id)
 {
 	return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
@@ -766,6 +776,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
 	}
 }
 
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+	if (!btf || btf_id == 0)
+		return "<invalid>";
+
+	/* we already validated that type is valid and has conforming name */
+	return kernel_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+	switch (state) {
+	case BPF_ITER_STATE_ACTIVE:
+		return "active";
+	case BPF_ITER_STATE_DRAINED:
+		return "drained";
+	case BPF_ITER_STATE_INVALID:
+		return "<invalid>";
+	default:
+		WARN_ONCE(1, "unknown iter state %d\n", state);
+		return "<unknown>";
+	}
+}
+
 static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
 {
 	env->scratched_regs |= 1U << regno;
@@ -1118,6 +1152,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
 	}
 }
 
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+				 struct bpf_reg_state *reg, int insn_idx,
+				 struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j, id;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	id = acquire_reference_state(env, insn_idx);
+	if (id < 0)
+		return id;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		__mark_reg_known_zero(st);
+		st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+		st->live |= REG_LIVE_WRITTEN;
+		st->ref_obj_id = i == 0 ? id : 0;
+		st->iter.btf = btf;
+		st->iter.btf_id = btf_id;
+		st->iter.state = BPF_ITER_STATE_ACTIVE;
+		st->iter.depth = 0;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_ITER;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return spi;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		if (i == 0)
+			WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+		__mark_reg_not_init(env, st);
+
+		/* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+		st->live |= REG_LIVE_WRITTEN;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			slot->slot_type[j] = STACK_INVALID;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+				     struct bpf_reg_state *reg, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+	 * will do check_mem_access to check and update stack bounds later, so
+	 * return true for that case.
+	 */
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi == -ERANGE)
+		return true;
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] == STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				   struct btf *btf, u32 btf_id, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i, j;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0)
+		return false;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_stack_state *slot = &state->stack[spi - i];
+		struct bpf_reg_state *st = &slot->spilled_ptr;
+
+		/* only main (first) slot has ref_obj_id set */
+		if (i == 0 && !st->ref_obj_id)
+			return false;
+		if (i != 0 && st->ref_obj_id)
+			return false;
+		if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+			return false;
+
+		for (j = 0; j < BPF_REG_SIZE; j++)
+			if (slot->slot_type[j] != STACK_ITER)
+				return false;
+	}
+
+	return true;
+}
+
+/* Check if given stack slot is "special":
+ *   - spilled register state (STACK_SPILL);
+ *   - dynptr state (STACK_DYNPTR);
+ *   - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+	enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+	switch (type) {
+	case STACK_SPILL:
+	case STACK_DYNPTR:
+	case STACK_ITER:
+		return true;
+	case STACK_INVALID:
+	case STACK_MISC:
+	case STACK_ZERO:
+		return false;
+	default:
+		WARN_ONCE(1, "unknown stack slot type %d\n", type);
+		return true;
+	}
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -1267,6 +1452,19 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			if (reg->ref_obj_id)
 				verbose(env, "(ref_id=%d)", reg->ref_obj_id);
 			break;
+		case STACK_ITER:
+			/* only main slot has ref_obj_id set; skip others */
+			reg = &state->stack[i].spilled_ptr;
+			if (!reg->ref_obj_id)
+				continue;
+
+			verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+			print_liveness(env, reg->live);
+			verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+				iter_type_str(reg->iter.btf, reg->iter.btf_id),
+				reg->ref_obj_id, iter_state_str(reg->iter.state),
+				reg->iter.depth);
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		default:
@@ -2710,6 +2908,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
 			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
 }
 
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+			  int spi, int nr_slots)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int err, i;
+
+	for (i = 0; i < nr_slots; i++) {
+		struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+		err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+		if (err)
+			return err;
+
+		mark_stack_slot_scratched(env, spi - i);
+	}
+
+	return 0;
+}
+
 /* This function is supposed to be used by the following 32-bit optimization
  * code only. It returns TRUE if the source or destination register operates
  * on 64-bit, otherwise return FALSE.
@@ -3691,8 +3908,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 
 		/* regular write of data into stack destroys any spilled ptr */
 		state->stack[spi].spilled_ptr.type = NOT_INIT;
-		/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
-		if (is_spilled_reg(&state->stack[spi]))
+		/* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+		if (is_stack_slot_special(&state->stack[spi]))
 			for (i = 0; i < BPF_REG_SIZE; i++)
 				scrub_spilled_slot(&state->stack[spi].slot_type[i]);
 
@@ -6506,6 +6723,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
 	return err;
 }
 
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+	struct bpf_func_state *state = func(env, reg);
+
+	return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+	/* btf_check_iter_kfuncs() guarantees that first argument of any iter
+	 * kfunc is iter state pointer
+	 */
+	return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+			    struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	const struct btf_type *t;
+	const struct btf_param *arg;
+	int spi, err, i, nr_slots;
+	u32 btf_id;
+
+	/* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+	arg = &btf_params(meta->func_proto)[0];
+	t = btf_type_skip_modifiers(meta->btf, arg->type, NULL);	/* PTR */
+	t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id);	/* STRUCT */
+	nr_slots = t->size / BPF_REG_SIZE;
+
+	spi = iter_get_spi(env, reg, nr_slots);
+	if (spi < 0 && spi != -ERANGE)
+		return spi;
+
+	meta->iter.spi = spi;
+	meta->iter.frameno = reg->frameno;
+
+	if (is_iter_new_kfunc(meta)) {
+		/* bpf_iter_<type>_new() expects pointer to uninit iter state */
+		if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+			verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+			err = check_mem_access(env, insn_idx, regno,
+					       i, BPF_DW, BPF_WRITE, -1, false);
+			if (err)
+				return err;
+		}
+
+		err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+		if (err)
+			return err;
+	} else {
+		/* iter_next() or iter_destroy() expect initialized iter state*/
+		if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+			verbose(env, "expected an initialized iter_%s as arg #%d\n",
+				iter_type_str(meta->btf, btf_id), regno);
+			return -EINVAL;
+		}
+
+		err = mark_iter_read(env, reg, spi, nr_slots);
+		if (err)
+			return err;
+
+		meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+		if (is_iter_destroy_kfunc(meta)) {
+			err = unmark_stack_slots_iter(env, reg, nr_slots);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * call (it could be in exactly the same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+				  struct bpf_kfunc_call_arg_meta *meta)
+{
+	struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+	struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+	struct bpf_reg_state *cur_iter, *queued_iter;
+	int iter_frameno = meta->iter.frameno;
+	int iter_spi = meta->iter.spi;
+
+	BTF_TYPE_EMIT(struct bpf_iter);
+
+	cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+	if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+	    cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+		verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+			cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+		return -EFAULT;
+	}
+
+	if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+		/* branch out active iter state */
+		queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+		if (!queued_st)
+			return -ENOMEM;
+
+		queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+		queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+		queued_iter->iter.depth++;
+
+		queued_fr = queued_st->frame[queued_st->curframe];
+		mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+	}
+
+	/* switch to DRAINED state, but keep the depth unchanged */
+	/* mark current iter state as drained and assume returned NULL */
+	cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+	__mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+	return 0;
+}
+
 static bool arg_type_is_mem_size(enum bpf_arg_type type)
 {
 	return type == ARG_CONST_SIZE ||
@@ -9099,6 +9513,7 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_ALLOC_BTF_ID,  /* Allocated object */
 	KF_ARG_PTR_TO_KPTR,	     /* PTR_TO_KPTR but type specific */
 	KF_ARG_PTR_TO_DYNPTR,
+	KF_ARG_PTR_TO_ITER,
 	KF_ARG_PTR_TO_LIST_HEAD,
 	KF_ARG_PTR_TO_LIST_NODE,
 	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */
@@ -9220,6 +9635,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
+	if (is_kfunc_arg_iter(meta, argno))
+		return KF_ARG_PTR_TO_ITER;
+
 	if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_LIST_HEAD;
 
@@ -9848,6 +10266,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			break;
 		case KF_ARG_PTR_TO_KPTR:
 		case KF_ARG_PTR_TO_DYNPTR:
+		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
 		case KF_ARG_PTR_TO_LIST_NODE:
 		case KF_ARG_PTR_TO_RB_ROOT:
@@ -9944,6 +10363,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 
 			break;
 		}
+		case KF_ARG_PTR_TO_ITER:
+			ret = process_iter_arg(env, regno, insn_idx, meta);
+			if (ret < 0)
+				return ret;
+			break;
 		case KF_ARG_PTR_TO_LIST_HEAD:
 			if (reg->type != PTR_TO_MAP_VALUE &&
 			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
@@ -10148,6 +10572,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	desc_btf = meta.btf;
 	insn_aux = &env->insn_aux_data[insn_idx];
 
+	insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
+
 	if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
 		verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
 		return -EACCES;
@@ -10436,6 +10862,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			mark_btf_func_reg_size(env, regno, t->size);
 	}
 
+	if (is_iter_next_kfunc(&meta)) {
+		err = process_iter_next_call(env, insn_idx, &meta);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -13548,6 +13980,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			 * async state will be pushed for further exploration.
 			 */
 			mark_prune_point(env, t);
+		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+			struct bpf_kfunc_call_arg_meta meta;
+
+			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+			if (ret == 0 && is_iter_next_kfunc(&meta))
+				mark_prune_point(env, t);
+		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
 	case BPF_JA:
@@ -14301,6 +14740,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 	 * didn't use them
 	 */
 	for (i = 0; i < old->allocated_stack; i++) {
+		struct bpf_reg_state *old_reg, *cur_reg;
+
 		spi = i / BPF_REG_SIZE;
 
 		if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -14357,9 +14798,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 				return false;
 			break;
 		case STACK_DYNPTR:
-		{
-			const struct bpf_reg_state *old_reg, *cur_reg;
-
 			old_reg = &old->stack[spi].spilled_ptr;
 			cur_reg = &cur->stack[spi].spilled_ptr;
 			if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -14367,7 +14805,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
 			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
 				return false;
 			break;
-		}
+		case STACK_ITER:
+			old_reg = &old->stack[spi].spilled_ptr;
+			cur_reg = &cur->stack[spi].spilled_ptr;
+			/* iter.depth is not compared between states as it
+			 * doesn't matter for correctness and would otherwise
+			 * prevent convergence; we maintain it only to prevent
+			 * infinite loop check triggering, see
+			 * iter_active_depths_differ()
+			 */
+			if (old_reg->iter.btf != cur_reg->iter.btf ||
+			    old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+			    old_reg->iter.state != cur_reg->iter.state ||
+			    /* ignore {old_reg,cur_reg}->iter.depth, see above */
+			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+				return false;
+			break;
 		case STACK_MISC:
 		case STACK_ZERO:
 		case STACK_INVALID:
@@ -14626,6 +15079,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
 	return true;
 }
 
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "infinite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ *   0: again:                          ; set up iter_next() call args
+ *   1:   r1 = &it                      ; <CHECKPOINT HERE>
+ *   2:   call bpf_iter_num_next        ; this is iter_next() call
+ *   3:   if r0 == 0 goto done
+ *   4:   ... something useful here ...
+ *   5:   goto again                    ; another iteration
+ *   6: done:
+ *   7:   r1 = &it
+ *   8:   call bpf_iter_num_destroy     ; clean up iter state
+ *   9:   exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in the way).
+ *
+ * When we first come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * infinitely looping:
+ *
+ *   struct bpf_iter_num it;
+ *   int *p, x;
+ *
+ *   bpf_iter_num_new(&it, 0, 10);
+ *   while ((p = bpf_iter_num_next(&it))) {
+ *       x = *p;
+ *       while (x--) {} // <<-- infinite loop here
+ *   }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+	struct bpf_reg_state *slot, *cur_slot;
+	struct bpf_func_state *state;
+	int i, fr;
+
+	for (fr = old->curframe; fr >= 0; fr--) {
+		state = old->frame[fr];
+		for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+			if (state->stack[i].slot_type[0] != STACK_ITER)
+				continue;
+
+			slot = &state->stack[i].spilled_ptr;
+			if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+				continue;
+
+			cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+			if (cur_slot->iter.depth != slot->iter.depth)
+				return true;
+		}
+	}
+	return false;
+}
 
 static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
@@ -14673,8 +15212,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 				 * Since the verifier still needs to catch infinite loops
 				 * inside async callbacks.
 				 */
-			} else if (states_maybe_looping(&sl->state, cur) &&
-				   states_equal(env, &sl->state, cur)) {
+				goto skip_inf_loop_check;
+			}
+			/* BPF open-coded iterators loop detection is special.
+			 * states_maybe_looping() logic is too simplistic in detecting
+			 * states that *might* be equivalent, because it doesn't know
+			 * about ID remapping, so don't even perform it.
+			 * See process_iter_next_call() and iter_active_depths_differ()
+			 * for overview of the logic. When current and one of parent
+			 * states are detected as equivalent, it's a good thing: we prove
+			 * convergence and can stop simulating further iterations.
+			 * It's safe to assume that iterator loop will finish, taking into
+			 * account iter_next() contract of eventually returning
+			 * sticky NULL result.
+			 */
+			if (is_iter_next_insn(env, insn_idx)) {
+				if (states_equal(env, &sl->state, cur)) {
+					struct bpf_func_state *cur_frame;
+					struct bpf_reg_state *iter_state, *iter_reg;
+					int spi;
+
+					cur_frame = cur->frame[cur->curframe];
+					/* btf_check_iter_kfuncs() enforces that
+					 * iter state pointer is always the first arg
+					 */
+					iter_reg = &cur_frame->regs[BPF_REG_1];
+					/* current state is valid due to states_equal(),
+					 * so we can assume valid iter and reg state,
+					 * no need for extra (re-)validations
+					 */
+					spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+					iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+					if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+						goto hit;
+				}
+				goto skip_inf_loop_check;
+			}
+			/* attempt to detect infinite loop to avoid unnecessary doomed work */
+			if (states_maybe_looping(&sl->state, cur) &&
+			    states_equal(env, &sl->state, cur) &&
+			    !iter_active_depths_differ(&sl->state, cur)) {
 				verbose_linfo(env, insn_idx, "; ");
 				verbose(env, "infinite loop detected at insn %d\n", insn_idx);
 				return -EINVAL;
@@ -14691,6 +15268,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * This threshold shouldn't be too high either, since states
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
+skip_inf_loop_check:
 			if (!env->test_state_freq &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
@@ -14698,6 +15276,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			goto miss;
 		}
 		if (states_equal(env, &sl->state, cur)) {
+hit:
 			sl->hit_cnt++;
 			/* reached equivalent register/stack state,
 			 * prune the search.
-- 
cgit v1.2.3


From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 8 Mar 2023 10:41:17 -0800
Subject: bpf: implement numbers iterator

Implement the first open-coded iterator type over a range of integers.

Its public API consists of:
  - bpf_iter_num_new() constructor, which accepts [start, end) range
    (that is, start is inclusive, end is exclusive).
  - bpf_iter_num_next() which will keep returning read-only pointer to int
    until the range is exhausted, at which point NULL will be returned.
    If bpf_iter_num_next() keeps being called after this, NULL will be
    persistently returned.
  - bpf_iter_num_destroy() destructor, which needs to be called at some
    point to clean up iterator state. BPF verifier enforces that iterator
    destructor is called at some point before BPF program exits.

Note that `start = end = X` is a valid combination to setup an empty
iterator. bpf_iter_num_new() will return 0 (success) for any such
combination.

If bpf_iter_num_new() detects invalid combination of input arguments, it
returns error, resets iterator state to, effectively, empty iterator, so
any subsequent call to bpf_iter_num_next() will keep returning NULL.

BPF verifier has no knowledge that returned integers are in the
[start, end) value range, as both `start` and `end` are not statically
known and enforced: they are runtime values.

While the implementation is pretty trivial, some care needs to be taken
to avoid overflows and underflows. Subsequent selftests will validate
correctness of [start, end) semantics, especially around extremes
(INT_MIN and INT_MAX).

Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can
be specified.

bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded
BPF loops and bpf_loop() helper and is the basis for implementing
ergonomic BPF loops with no statically known or verified bounds.
Subsequent patches implement bpf_for() macro, demonstrating how this can
be wrapped into something that works and feels like a normal for() loop
in C language.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 +++--
 include/uapi/linux/bpf.h       |  8 +++++
 kernel/bpf/bpf_iter.c          | 70 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c           |  3 ++
 tools/include/uapi/linux/bpf.h |  8 +++++
 5 files changed, 95 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6792a7940e1e..e64ff1e89fb2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1617,8 +1617,12 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
-/* Maximum number of loops for bpf_loop */
-#define BPF_MAX_LOOPS	BIT(23)
+/* Maximum number of loops for bpf_loop and bpf_iter_num.
+ * It's enum to expose it (and thus make it discoverable) through BTF.
+ */
+enum {
+	BPF_MAX_LOOPS = 8 * 1024 * 1024,
+};
 
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
 	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
 	.arg4_type	= ARG_ANYTHING,
 };
+
+struct bpf_iter_num_kern {
+	int cur; /* current value, inclusive */
+	int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+	BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+	BTF_TYPE_EMIT(struct btf_iter_num);
+
+	/* start == end is legit, it's an empty range and we'll just get NULL
+	 * on first (and any subsequent) bpf_iter_num_next() call
+	 */
+	if (start > end) {
+		s->cur = s->end = 0;
+		return -EINVAL;
+	}
+
+	/* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+	if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+		s->cur = s->end = 0;
+		return -E2BIG;
+	}
+
+	/* user will call bpf_iter_num_next() first,
+	 * which will set s->cur to exactly start value;
+	 * underflow shouldn't matter
+	 */
+	s->cur = start - 1;
+	s->end = end;
+
+	return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	/* check failed initialization or if we are done (same behavior);
+	 * need to be careful about overflow, so convert to s64 for checks,
+	 * e.g., if s->cur == s->end == INT_MAX, we can't just do
+	 * s->cur + 1 >= s->end
+	 */
+	if ((s64)(s->cur + 1) >= s->end) {
+		s->cur = s->end = 0;
+		return NULL;
+	}
+
+	s->cur++;
+
+	return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+	struct bpf_iter_num_kern *s = (void *)it;
+
+	s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 637ac4e92e75..f9b7eeedce08 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2411,6 +2411,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
 BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
 BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
 BTF_SET8_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 976b194eb775..4abddb668a10 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7112,4 +7112,12 @@ enum {
 	BPF_F_TIMER_ABS = (1ULL << 0),
 };
 
+/* BPF numbers iterator state */
+struct bpf_iter_num {
+	/* opaque iterator state; having __u64 here allows to preserve correct
+	 * alignment requirements in vmlinux.h, generated from BTF
+	 */
+	__u64 __opaque[1];
+} __attribute__((aligned(8)));
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 4821a076eb602a6238528e9ebafeac853c833415 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 7 Mar 2023 16:23:26 -0500
Subject: sctp: add fair capacity stream scheduler

As it says in rfc8260#section-3.5 about the fair capacity scheduler:

   A fair capacity distribution between the streams is used.  This
   scheduler considers the lengths of the messages of each stream and
   schedules them in a specific way to maintain an equal capacity for
   all streams.  The details are implementation dependent.  Interleaving
   user messages allows for a better realization of the fair capacity
   usage.

This patch adds Fair Capacity Scheduler based on the foundations added
by commit 5bbbbe32a431 ("sctp: introduce stream scheduler foundations"):

A fc_list and a fc_length are added into struct sctp_stream_out_ext and
a fc_list is added into struct sctp_stream. In .enqueue, when there are
chunks enqueued into a stream, this stream will be linked into stream->
fc_list by its fc_list ordered by its fc_length. In .dequeue, it always
picks up the 1st skb from stream->fc_list. In .dequeue_done, fc_length
is increased by the chunk's len and the stream's location in
stream->fc_list is updated according to its new fc_length.

Note that when the new fc_length overflows in .dequeue_done, instead of
resetting all fc_lengths to 0, we only reduce them by U32_MAX / 4 to
avoid a moment of imbalance in the scheduling, as Marcelo suggested.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sctp/stream_sched.h |   1 +
 include/net/sctp/structs.h      |   7 ++
 include/uapi/linux/sctp.h       |   3 +-
 net/sctp/Makefile               |   3 +-
 net/sctp/stream_sched.c         |   1 +
 net/sctp/stream_sched_fc.c      | 183 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 net/sctp/stream_sched_fc.c

(limited to 'include')

diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index fa00dc20a0d7..913170710adb 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -58,5 +58,6 @@ void sctp_sched_ops_register(enum sctp_sched_type sched,
 			     struct sctp_sched_ops *sched_ops);
 void sctp_sched_ops_prio_init(void);
 void sctp_sched_ops_rr_init(void);
+void sctp_sched_ops_fc_init(void);
 
 #endif /* __sctp_stream_sched_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index e1f6e7fc2b11..2f1c9f50b352 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1429,6 +1429,10 @@ struct sctp_stream_out_ext {
 		struct {
 			struct list_head rr_list;
 		};
+		struct {
+			struct list_head fc_list;
+			__u32 fc_length;
+		};
 	};
 };
 
@@ -1475,6 +1479,9 @@ struct sctp_stream {
 			/* The next stream in line */
 			struct sctp_stream_out_ext *rr_next;
 		};
+		struct {
+			struct list_head fc_list;
+		};
 	};
 	struct sctp_stream_interleave *si;
 };
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index ed7d4ecbf53d..6814c5a1c4bc 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1211,7 +1211,8 @@ enum sctp_sched_type {
 	SCTP_SS_DEFAULT = SCTP_SS_FCFS,
 	SCTP_SS_PRIO,
 	SCTP_SS_RR,
-	SCTP_SS_MAX = SCTP_SS_RR
+	SCTP_SS_FC,
+	SCTP_SS_MAX = SCTP_SS_FC
 };
 
 /* Probe Interval socket option */
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index e845e4588535..0448398408d8 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -13,7 +13,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
-	  stream_sched_rr.o stream_interleave.o
+	  stream_sched_rr.o stream_sched_fc.o \
+	  stream_interleave.o
 
 sctp_diag-y := diag.o
 
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 330067002deb..1ebd14ef8daa 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -124,6 +124,7 @@ void sctp_sched_ops_init(void)
 	sctp_sched_ops_fcfs_init();
 	sctp_sched_ops_prio_init();
 	sctp_sched_ops_rr_init();
+	sctp_sched_ops_fc_init();
 }
 
 static void sctp_sched_free_sched(struct sctp_stream *stream)
diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c
new file mode 100644
index 000000000000..b336c2f5486b
--- /dev/null
+++ b/net/sctp/stream_sched_fc.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2022
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions manipulate sctp stream queue/scheduling.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email address(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
+#include <linux/list.h>
+#include <net/sctp/sctp.h>
+#include <net/sctp/sm.h>
+#include <net/sctp/stream_sched.h>
+
+/* Fair Capacity handling
+ * RFC 8260 section 3.5
+ */
+static void sctp_sched_fc_unsched_all(struct sctp_stream *stream);
+
+static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid,
+			     __u16 weight, gfp_t gfp)
+{
+	return 0;
+}
+
+static int sctp_sched_fc_get(struct sctp_stream *stream, __u16 sid,
+			     __u16 *value)
+{
+	return 0;
+}
+
+static int sctp_sched_fc_init(struct sctp_stream *stream)
+{
+	INIT_LIST_HEAD(&stream->fc_list);
+
+	return 0;
+}
+
+static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid,
+				  gfp_t gfp)
+{
+	struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext;
+
+	INIT_LIST_HEAD(&soute->fc_list);
+	soute->fc_length = 0;
+
+	return 0;
+}
+
+static void sctp_sched_fc_free_sid(struct sctp_stream *stream, __u16 sid)
+{
+}
+
+static void sctp_sched_fc_sched(struct sctp_stream *stream,
+				struct sctp_stream_out_ext *soute)
+{
+	struct sctp_stream_out_ext *pos;
+
+	if (!list_empty(&soute->fc_list))
+		return;
+
+	list_for_each_entry(pos, &stream->fc_list, fc_list)
+		if (pos->fc_length >= soute->fc_length)
+			break;
+	list_add_tail(&soute->fc_list, &pos->fc_list);
+}
+
+static void sctp_sched_fc_enqueue(struct sctp_outq *q,
+				  struct sctp_datamsg *msg)
+{
+	struct sctp_stream *stream;
+	struct sctp_chunk *ch;
+	__u16 sid;
+
+	ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list);
+	sid = sctp_chunk_stream_no(ch);
+	stream = &q->asoc->stream;
+	sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext);
+}
+
+static struct sctp_chunk *sctp_sched_fc_dequeue(struct sctp_outq *q)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute;
+	struct sctp_chunk *ch;
+
+	/* Bail out quickly if queue is empty */
+	if (list_empty(&q->out_chunk_list))
+		return NULL;
+
+	/* Find which chunk is next */
+	if (stream->out_curr)
+		soute = stream->out_curr->ext;
+	else
+		soute = list_entry(stream->fc_list.next, struct sctp_stream_out_ext, fc_list);
+	ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list);
+
+	sctp_sched_dequeue_common(q, ch);
+	return ch;
+}
+
+static void sctp_sched_fc_dequeue_done(struct sctp_outq *q,
+				       struct sctp_chunk *ch)
+{
+	struct sctp_stream *stream = &q->asoc->stream;
+	struct sctp_stream_out_ext *soute, *pos;
+	__u16 sid, i;
+
+	sid = sctp_chunk_stream_no(ch);
+	soute = SCTP_SO(stream, sid)->ext;
+	/* reduce all fc_lengths by U32_MAX / 4 if the current fc_length overflows. */
+	if (soute->fc_length > U32_MAX - ch->skb->len) {
+		for (i = 0; i < stream->outcnt; i++) {
+			pos = SCTP_SO(stream, i)->ext;
+			if (!pos)
+				continue;
+			if (pos->fc_length <= (U32_MAX >> 2)) {
+				pos->fc_length = 0;
+				continue;
+			}
+			pos->fc_length -= (U32_MAX >> 2);
+		}
+	}
+	soute->fc_length += ch->skb->len;
+
+	if (list_empty(&soute->outq)) {
+		list_del_init(&soute->fc_list);
+		return;
+	}
+
+	pos = soute;
+	list_for_each_entry_continue(pos, &stream->fc_list, fc_list)
+		if (pos->fc_length >= soute->fc_length)
+			break;
+	list_move_tail(&soute->fc_list, &pos->fc_list);
+}
+
+static void sctp_sched_fc_sched_all(struct sctp_stream *stream)
+{
+	struct sctp_association *asoc;
+	struct sctp_chunk *ch;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) {
+		__u16 sid = sctp_chunk_stream_no(ch);
+
+		if (SCTP_SO(stream, sid)->ext)
+			sctp_sched_fc_sched(stream, SCTP_SO(stream, sid)->ext);
+	}
+}
+
+static void sctp_sched_fc_unsched_all(struct sctp_stream *stream)
+{
+	struct sctp_stream_out_ext *soute, *tmp;
+
+	list_for_each_entry_safe(soute, tmp, &stream->fc_list, fc_list)
+		list_del_init(&soute->fc_list);
+}
+
+static struct sctp_sched_ops sctp_sched_fc = {
+	.set = sctp_sched_fc_set,
+	.get = sctp_sched_fc_get,
+	.init = sctp_sched_fc_init,
+	.init_sid = sctp_sched_fc_init_sid,
+	.free_sid = sctp_sched_fc_free_sid,
+	.enqueue = sctp_sched_fc_enqueue,
+	.dequeue = sctp_sched_fc_dequeue,
+	.dequeue_done = sctp_sched_fc_dequeue_done,
+	.sched_all = sctp_sched_fc_sched_all,
+	.unsched_all = sctp_sched_fc_unsched_all,
+};
+
+void sctp_sched_ops_fc_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc);
+}
-- 
cgit v1.2.3


From 42d452e7709fdb4d42376d2a97369e22cc80a5d2 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 7 Mar 2023 16:23:27 -0500
Subject: sctp: add weighted fair queueing stream scheduler

As it says in rfc8260#section-3.6 about the weighted fair queueing
scheduler:

   A Weighted Fair Queueing scheduler between the streams is used.  The
   weight is configurable per outgoing SCTP stream.  This scheduler
   considers the lengths of the messages of each stream and schedules
   them in a specific way to use the capacity according to the given
   weights.  If the weight of stream S1 is n times the weight of stream
   S2, the scheduler should assign to stream S1 n times the capacity it
   assigns to stream S2.  The details are implementation dependent.
   Interleaving user messages allows for a better realization of the
   capacity usage according to the given weights.

This patch adds Weighted Fair Queueing Scheduler actually based on
the code of Fair Capacity Scheduler by adding fc_weight into struct
sctp_stream_out_ext and taking it into account when sorting stream->
fc_list in sctp_sched_fc_sched() and sctp_sched_fc_dequeue_done().

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sctp/stream_sched.h |  1 +
 include/net/sctp/structs.h      |  1 +
 include/uapi/linux/sctp.h       |  3 ++-
 net/sctp/stream_sched.c         |  1 +
 net/sctp/stream_sched_fc.c      | 50 +++++++++++++++++++++++++++++++++++++----
 5 files changed, 51 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index 913170710adb..572d73fdcd5e 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -59,5 +59,6 @@ void sctp_sched_ops_register(enum sctp_sched_type sched,
 void sctp_sched_ops_prio_init(void);
 void sctp_sched_ops_rr_init(void);
 void sctp_sched_ops_fc_init(void);
+void sctp_sched_ops_wfq_init(void);
 
 #endif /* __sctp_stream_sched_h__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 2f1c9f50b352..a0933efd93c3 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1432,6 +1432,7 @@ struct sctp_stream_out_ext {
 		struct {
 			struct list_head fc_list;
 			__u32 fc_length;
+			__u16 fc_weight;
 		};
 	};
 };
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 6814c5a1c4bc..b7d91d4cf0db 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -1212,7 +1212,8 @@ enum sctp_sched_type {
 	SCTP_SS_PRIO,
 	SCTP_SS_RR,
 	SCTP_SS_FC,
-	SCTP_SS_MAX = SCTP_SS_FC
+	SCTP_SS_WFQ,
+	SCTP_SS_MAX = SCTP_SS_WFQ
 };
 
 /* Probe Interval socket option */
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 1ebd14ef8daa..e843760e9aaa 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -125,6 +125,7 @@ void sctp_sched_ops_init(void)
 	sctp_sched_ops_prio_init();
 	sctp_sched_ops_rr_init();
 	sctp_sched_ops_fc_init();
+	sctp_sched_ops_wfq_init();
 }
 
 static void sctp_sched_free_sched(struct sctp_stream *stream)
diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c
index b336c2f5486b..4bd18a497a6d 100644
--- a/net/sctp/stream_sched_fc.c
+++ b/net/sctp/stream_sched_fc.c
@@ -19,11 +19,32 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/stream_sched.h>
 
-/* Fair Capacity handling
- * RFC 8260 section 3.5
+/* Fair Capacity and Weighted Fair Queueing handling
+ * RFC 8260 section 3.5 and 3.6
  */
 static void sctp_sched_fc_unsched_all(struct sctp_stream *stream);
 
+static int sctp_sched_wfq_set(struct sctp_stream *stream, __u16 sid,
+			      __u16 weight, gfp_t gfp)
+{
+	struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext;
+
+	if (!weight)
+		return -EINVAL;
+
+	soute->fc_weight = weight;
+	return 0;
+}
+
+static int sctp_sched_wfq_get(struct sctp_stream *stream, __u16 sid,
+			      __u16 *value)
+{
+	struct sctp_stream_out_ext *soute = SCTP_SO(stream, sid)->ext;
+
+	*value = soute->fc_weight;
+	return 0;
+}
+
 static int sctp_sched_fc_set(struct sctp_stream *stream, __u16 sid,
 			     __u16 weight, gfp_t gfp)
 {
@@ -50,6 +71,7 @@ static int sctp_sched_fc_init_sid(struct sctp_stream *stream, __u16 sid,
 
 	INIT_LIST_HEAD(&soute->fc_list);
 	soute->fc_length = 0;
+	soute->fc_weight = 1;
 
 	return 0;
 }
@@ -67,7 +89,8 @@ static void sctp_sched_fc_sched(struct sctp_stream *stream,
 		return;
 
 	list_for_each_entry(pos, &stream->fc_list, fc_list)
-		if (pos->fc_length >= soute->fc_length)
+		if ((__u64)pos->fc_length * soute->fc_weight >=
+		    (__u64)soute->fc_length * pos->fc_weight)
 			break;
 	list_add_tail(&soute->fc_list, &pos->fc_list);
 }
@@ -137,7 +160,8 @@ static void sctp_sched_fc_dequeue_done(struct sctp_outq *q,
 
 	pos = soute;
 	list_for_each_entry_continue(pos, &stream->fc_list, fc_list)
-		if (pos->fc_length >= soute->fc_length)
+		if ((__u64)pos->fc_length * soute->fc_weight >=
+		    (__u64)soute->fc_length * pos->fc_weight)
 			break;
 	list_move_tail(&soute->fc_list, &pos->fc_list);
 }
@@ -181,3 +205,21 @@ void sctp_sched_ops_fc_init(void)
 {
 	sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc);
 }
+
+static struct sctp_sched_ops sctp_sched_wfq = {
+	.set = sctp_sched_wfq_set,
+	.get = sctp_sched_wfq_get,
+	.init = sctp_sched_fc_init,
+	.init_sid = sctp_sched_fc_init_sid,
+	.free_sid = sctp_sched_fc_free_sid,
+	.enqueue = sctp_sched_fc_enqueue,
+	.dequeue = sctp_sched_fc_dequeue,
+	.dequeue_done = sctp_sched_fc_dequeue_done,
+	.sched_all = sctp_sched_fc_sched_all,
+	.unsched_all = sctp_sched_fc_unsched_all,
+};
+
+void sctp_sched_ops_wfq_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_WFQ, &sctp_sched_wfq);
+}
-- 
cgit v1.2.3


From 5a70f4a63000ba68004fb3c1aaf2f90303dd228f Mon Sep 17 00:00:00 2001
From: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
Date: Thu, 9 Mar 2023 14:38:23 +0100
Subject: bpf: Fix a typo for BPF_F_ANY_ALIGNMENT in bpf.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix s/BPF_PROF_LOAD/BPF_PROG_LOAD/ typo in the documentation comment
for BPF_F_ANY_ALIGNMENT in bpf.h.

Signed-off-by: Michael Weiß <michael.weiss@aisec.fraunhofer.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20230309133823.944097-1-michael.weiss@aisec.fraunhofer.de
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4abddb668a10..d8c534e05b0a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1108,7 +1108,7 @@ enum bpf_link_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will allow any alignment whatsoever.  On platforms
  * with strict alignment requirements for loads ands stores (such
  * as sparc and mips) the verifier validates that all loads and
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4abddb668a10..d8c534e05b0a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1108,7 +1108,7 @@ enum bpf_link_type {
  */
 #define BPF_F_STRICT_ALIGNMENT	(1U << 0)
 
-/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will allow any alignment whatsoever.  On platforms
  * with strict alignment requirements for loads ands stores (such
  * as sparc and mips) the verifier validates that all loads and
-- 
cgit v1.2.3


From 6978052448f9eb19f7b03243ac0416104e5ee50d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 8 Mar 2023 15:20:06 +0100
Subject: netlink: remove unused 'compare' function

No users in the tree.  Tested with allmodconfig build.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230308142006.20879-1-fw@strlen.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h  | 1 -
 net/netlink/af_netlink.c | 2 --
 net/netlink/af_netlink.h | 1 -
 3 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index c43ac7690eca..3e8743252167 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -50,7 +50,6 @@ struct netlink_kernel_cfg {
 	struct mutex	*cb_mutex;
 	int		(*bind)(struct net *net, int group);
 	void		(*unbind)(struct net *net, int group);
-	bool		(*compare)(struct net *net, struct sock *sk);
 };
 
 struct sock *__netlink_kernel_create(struct net *net, int unit,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index c64277659753..877f1da1a8ac 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2097,8 +2097,6 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
 			nl_table[unit].bind = cfg->bind;
 			nl_table[unit].unbind = cfg->unbind;
 			nl_table[unit].flags = cfg->flags;
-			if (cfg->compare)
-				nl_table[unit].compare = cfg->compare;
 		}
 		nl_table[unit].registered = 1;
 	} else {
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 5f454c8de6a4..90a3198a9b7f 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -64,7 +64,6 @@ struct netlink_table {
 	struct module		*module;
 	int			(*bind)(struct net *net, int group);
 	void			(*unbind)(struct net *net, int group);
-	bool			(*compare)(struct net *net, struct sock *sock);
 	int			registered;
 };
 
-- 
cgit v1.2.3


From 62423bd2d2e231951245d77740a58027a2d81ef9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 8 Mar 2023 18:26:48 +0000
Subject: net: sched: remove qdisc_watchdog->last_expires

This field mirrors hrtimer softexpires, we can instead
use the existing helpers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20230308182648.1150762-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h | 1 -
 net/sched/sch_api.c     | 6 ++++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index 2016839991a4..bb0bd69fb655 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -64,7 +64,6 @@ static inline psched_time_t psched_get_time(void)
 }
 
 struct qdisc_watchdog {
-	u64		last_expires;
 	struct hrtimer	timer;
 	struct Qdisc	*qdisc;
 };
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index aba789c30a2e..fdb8f429333d 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -639,14 +639,16 @@ void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
 		return;
 
 	if (hrtimer_is_queued(&wd->timer)) {
+		u64 softexpires;
+
+		softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
 		/* If timer is already set in [expires, expires + delta_ns],
 		 * do not reprogram it.
 		 */
-		if (wd->last_expires - expires <= delta_ns)
+		if (softexpires - expires <= delta_ns)
 			return;
 	}
 
-	wd->last_expires = expires;
 	hrtimer_start_range_ns(&wd->timer,
 			       ns_to_ktime(expires),
 			       delta_ns,
-- 
cgit v1.2.3


From 76b9bf965c98c9b53ef7420b3b11438dbd764f92 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Wed, 8 Mar 2023 11:23:13 +0200
Subject: neighbour: delete neigh_lookup_nodev as not used

neigh_lookup_nodev isn't used in the kernel after removal
of DECnet. So let's remove it.

Fixes: 1202cdd66531 ("Remove DECnet support from kernel")
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/eb5656200d7964b2d177a36b77efa3c597d6d72d.1678267343.git.leonro@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/neighbour.h |  2 --
 net/core/neighbour.c    | 31 -------------------------------
 2 files changed, 33 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 2f2a6023fb0e..234799ca527e 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -336,8 +336,6 @@ void neigh_table_init(int index, struct neigh_table *tbl);
 int neigh_table_clear(int index, struct neigh_table *tbl);
 struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 			       struct net_device *dev);
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
-				     const void *pkey);
 struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
 				 struct net_device *dev, bool want_ref);
 static inline struct neighbour *neigh_create(struct neigh_table *tbl,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6798f6d2423b..0116b0ff91a7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -627,37 +627,6 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 }
 EXPORT_SYMBOL(neigh_lookup);
 
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
-				     const void *pkey)
-{
-	struct neighbour *n;
-	unsigned int key_len = tbl->key_len;
-	u32 hash_val;
-	struct neigh_hash_table *nht;
-
-	NEIGH_CACHE_STAT_INC(tbl, lookups);
-
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(tbl->nht);
-	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
-
-	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
-	     n != NULL;
-	     n = rcu_dereference_bh(n->next)) {
-		if (!memcmp(n->primary_key, pkey, key_len) &&
-		    net_eq(dev_net(n->dev), net)) {
-			if (!refcount_inc_not_zero(&n->refcnt))
-				n = NULL;
-			NEIGH_CACHE_STAT_INC(tbl, hits);
-			break;
-		}
-	}
-
-	rcu_read_unlock_bh();
-	return n;
-}
-EXPORT_SYMBOL(neigh_lookup_nodev);
-
 static struct neighbour *
 ___neigh_create(struct neigh_table *tbl, const void *pkey,
 		struct net_device *dev, u32 flags,
-- 
cgit v1.2.3


From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 9 Mar 2023 22:01:49 -0800
Subject: bpf: ensure state checkpointing at iter_next() call sites

State equivalence check and checkpointing performed in is_state_visited()
employs certain heuristics to try to save memory by avoiding state checkpoints
if not enough jumps and instructions happened since last checkpoint. This leads
to unpredictability of whether a particular instruction will be checkpointed
and how regularly. While normally this is not causing much problems (except
inconveniences for predictable verifier tests, which we overcome with
BPF_F_TEST_STATE_FREQ flag), turns out it's not the case for open-coded
iterators.

Checking and saving state checkpoints at iter_next() call is crucial for fast
convergence of open-coded iterator loop logic, so we need to force it. If we
don't do that, is_state_visited() might skip saving a checkpoint, causing
unnecessarily long sequence of not checkpointed instructions and jumps, leading
to exhaustion of jump history buffer, and potentially other undesired outcomes.
It is expected that with correct open-coded iterators convergence will happen
quickly, so we don't run a risk of exhausting memory.

This patch adds, in addition to prune and jump instruction marks, also a
"forced checkpoint" mark, and makes sure that any iter_next() call instruction
is marked as such.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  6 +++++-
 kernel/bpf/verifier.c        | 31 ++++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0c052bc79940..81d525d057c7 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -477,8 +477,12 @@ struct bpf_insn_aux_data {
 
 	/* below fields are initialized once */
 	unsigned int orig_idx; /* original instruction index */
-	bool prune_point;
 	bool jmp_point;
+	bool prune_point;
+	/* ensure we check state equivalence and save state checkpoint and
+	 * this instruction, regardless of any heuristics
+	 */
+	bool force_checkpoint;
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 45a082284464..13fd4c893f3b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13865,6 +13865,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
 	return env->insn_aux_data[insn_idx].prune_point;
 }
 
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+	env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+	return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
 enum {
 	DONE_EXPLORING = 0,
 	KEEP_EXPLORING = 1,
@@ -13984,8 +13995,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
 			struct bpf_kfunc_call_arg_meta meta;
 
 			ret = fetch_kfunc_meta(env, insn, &meta, NULL);
-			if (ret == 0 && is_iter_next_kfunc(&meta))
+			if (ret == 0 && is_iter_next_kfunc(&meta)) {
 				mark_prune_point(env, t);
+				/* Checking and saving state checkpoints at iter_next() call
+				 * is crucial for fast convergence of open-coded iterator loop
+				 * logic, so we need to force it. If we don't do that,
+				 * is_state_visited() might skip saving a checkpoint, causing
+				 * unnecessarily long sequence of not checkpointed
+				 * instructions and jumps, leading to exhaustion of jump
+				 * history buffer, and potentially other undesired outcomes.
+				 * It is expected that with correct open-coded iterators
+				 * convergence will happen quickly, so we don't run a risk of
+				 * exhausting memory.
+				 */
+				mark_force_checkpoint(env, t);
+			}
 		}
 		return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
 
@@ -15172,7 +15196,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *sl, **pprev;
 	struct bpf_verifier_state *cur = env->cur_state, *new;
 	int i, j, err, states_cnt = 0;
-	bool add_new_state = env->test_state_freq ? true : false;
+	bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+	bool add_new_state = force_new_state;
 
 	/* bpf progs typically have pruning point every 4 instructions
 	 * http://vger.kernel.org/bpfconf2019.html#session-1
@@ -15269,7 +15294,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * at the end of the loop are likely to be useful in pruning.
 			 */
 skip_inf_loop_check:
-			if (!env->test_state_freq &&
+			if (!force_new_state &&
 			    env->jmps_processed - env->prev_jmps_processed < 20 &&
 			    env->insn_processed - env->prev_insn_processed < 100)
 				add_new_state = false;
-- 
cgit v1.2.3


From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:20 -0800
Subject: bpf: Move a few bpf_local_storage functions to static scope

This patch moves the bpf_local_storage_free_rcu() and
bpf_selem_unlink_map() to static because they are
not used outside of bpf_local_storage.c.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 3 ---
 kernel/bpf/bpf_local_storage.c    | 4 ++--
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index d934248b8e81..502ad7093f13 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);
 
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem);
-
 struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
@@ -163,7 +161,6 @@ struct bpf_local_storage_data *
 bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 			 void *value, u64 map_flags, gfp_t gfp_flags);
 
-void bpf_local_storage_free_rcu(struct rcu_head *rcu);
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
 
 #endif /* _BPF_LOCAL_STORAGE_H */
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index d3ba3f2db640..1904a4245ebe 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -95,7 +95,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
@@ -251,7 +251,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 	hlist_add_head_rcu(&selem->snode, &local_storage->list);
 }
 
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
 {
 	struct bpf_local_storage_map *smap;
 	struct bpf_local_storage_map_bucket *b;
-- 
cgit v1.2.3


From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:21 -0800
Subject: bpf: Refactor codes into bpf_local_storage_destroy

This patch first renames bpf_local_storage_unlink_nolock to
bpf_local_storage_destroy(). It better reflects that it is only
used when the storage's owner (sk/task/cgrp/inode) is being kfree().

All of bpf_local_storage_destroy()'s callers take the spin lock and
then free the storage. This patch also moves these two steps into
bpf_local_storage_destroy().

This is a preparation work for a later patch that uses
bpf_mem_cache_alloc/free in the bpf_local_storage.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 2 +-
 kernel/bpf/bpf_cgrp_storage.c     | 9 +--------
 kernel/bpf/bpf_inode_storage.c    | 8 +-------
 kernel/bpf/bpf_local_storage.c    | 8 ++++++--
 kernel/bpf/bpf_task_storage.c     | 9 +--------
 net/core/bpf_sk_storage.c         | 8 +-------
 6 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 502ad7093f13..5908a954ddc2 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
 			 struct bpf_local_storage_map *smap,
 			 bool cacheit_lockit);
 
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage);
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
 
 void bpf_local_storage_map_free(struct bpf_map *map,
 				struct bpf_local_storage_cache *cache,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 9ae07aedaf23..492594d69a86 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
 void bpf_cgrp_storage_free(struct cgroup *cgroup)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_cgroup_storage = false;
-	unsigned long flags;
 
 	rcu_read_lock();
 	local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
@@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
 	}
 
 	bpf_cgrp_storage_lock();
-	raw_spin_lock_irqsave(&local_storage->lock, flags);
-	free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_local_storage_destroy(local_storage);
 	bpf_cgrp_storage_unlock();
 	rcu_read_unlock();
-
-	if (free_cgroup_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static struct bpf_local_storage_data *
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 43e2619c8167..2d25bcfa371b 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
 void bpf_inode_storage_free(struct inode *inode)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_inode_storage = false;
 	struct bpf_storage_blob *bsb;
 
 	bsb = bpf_inode(inode);
@@ -72,13 +71,8 @@ void bpf_inode_storage_free(struct inode *inode)
 		return;
 	}
 
-	raw_spin_lock_bh(&local_storage->lock);
-	free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_bh(&local_storage->lock);
+	bpf_local_storage_destroy(local_storage);
 	rcu_read_unlock();
-
-	if (free_inode_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 1904a4245ebe..e19f9f50a60d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -652,11 +652,12 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 	return 0;
 }
 
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
 	struct bpf_local_storage_elem *selem;
 	bool free_storage = false;
 	struct hlist_node *n;
+	unsigned long flags;
 
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
@@ -667,6 +668,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 	 * when unlinking elem from the local_storage->list and
 	 * the map's bucket->list.
 	 */
+	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
 		/* Always unlink from map before unlinking from
 		 * local_storage.
@@ -681,8 +683,10 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
 		free_storage = bpf_selem_unlink_storage_nolock(
 			local_storage, selem, false, false);
 	}
+	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
-	return free_storage;
+	if (free_storage)
+		kfree_rcu(local_storage, rcu);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 20f942229f3c..4dcef28744d1 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
 void bpf_task_storage_free(struct task_struct *task)
 {
 	struct bpf_local_storage *local_storage;
-	bool free_task_storage = false;
-	unsigned long flags;
 
 	rcu_read_lock();
 
@@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)
 	}
 
 	bpf_task_storage_lock();
-	raw_spin_lock_irqsave(&local_storage->lock, flags);
-	free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
-	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+	bpf_local_storage_destroy(local_storage);
 	bpf_task_storage_unlock();
 	rcu_read_unlock();
-
-	if (free_task_storage)
-		kfree_rcu(local_storage, rcu);
 }
 
 static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 7a36353dbc22..8f56438c104b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -49,7 +49,6 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 void bpf_sk_storage_free(struct sock *sk)
 {
 	struct bpf_local_storage *sk_storage;
-	bool free_sk_storage = false;
 
 	rcu_read_lock();
 	sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -58,13 +57,8 @@ void bpf_sk_storage_free(struct sock *sk)
 		return;
 	}
 
-	raw_spin_lock_bh(&sk_storage->lock);
-	free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage);
-	raw_spin_unlock_bh(&sk_storage->lock);
+	bpf_local_storage_destroy(sk_storage);
 	rcu_read_unlock();
-
-	if (free_sk_storage)
-		kfree_rcu(sk_storage, rcu);
 }
 
 static void bpf_sk_storage_map_free(struct bpf_map *map)
-- 
cgit v1.2.3


From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:24 -0800
Subject: bpf: Remember smap in bpf_local_storage

This patch remembers which smap triggers the allocation
of a 'struct bpf_local_storage' object. The local_storage is
allocated when the very first selem is added to the owner.
The smap pointer is needed when using the bpf_mem_cache_free
in a later patch because it needs to free to the correct
smap's bpf_mem_alloc object.

When a selem is being removed, it needs to check if it is
the selem that triggers the creation of the local_storage.
If it is, the local_storage->smap pointer will be reset to NULL.
This NULL reset is done under the local_storage->lock in
bpf_selem_unlink_storage_nolock() when a selem is being removed.
Also note that the local_storage may not go away even if
local_storage->smap is NULL because there may be other
selems still stored in the local_storage.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h | 1 +
 kernel/bpf/bpf_local_storage.c    | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 5908a954ddc2..613b1805ed9f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -83,6 +83,7 @@ struct bpf_local_storage_elem {
 
 struct bpf_local_storage {
 	struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE];
+	struct bpf_local_storage_map __rcu *smap;
 	struct hlist_head list; /* List of bpf_local_storage_elem */
 	void *owner;		/* The object that owns the above "list" of
 				 * bpf_local_storage_elem.
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 70df8dcb2066..5585dbfd9c66 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -213,6 +213,9 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 			kfree_rcu(selem, rcu);
 	}
 
+	if (rcu_access_pointer(local_storage->smap) == smap)
+		RCU_INIT_POINTER(local_storage->smap, NULL);
+
 	return free_local_storage;
 }
 
@@ -368,6 +371,7 @@ int bpf_local_storage_alloc(void *owner,
 		goto uncharge;
 	}
 
+	RCU_INIT_POINTER(storage->smap, smap);
 	INIT_HLIST_HEAD(&storage->list);
 	raw_spin_lock_init(&storage->lock);
 	storage->owner = owner;
-- 
cgit v1.2.3


From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:25 -0800
Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage

This patch repurposes use_trace_rcu to mean
whether the freed memory can be reused immediately or not.
use_trace_rcu is renamed to reuse_now. Other than
the boolean test being reversed, it should be a no-op.

The following explains the reason for the rename and how it will
be used in a later patch.

In a later patch, bpf_mem_cache_alloc/free will be used
in the bpf_local_storage. The bpf mem allocator will reuse
the freed memory immediately. Some of the free paths in
bpf_local_storage do not support memory being reused immediately.
These paths are the "delete" elem cases from the bpf_*_storage_delete()
helper and the map_delete_elem() syscall. Note that "delete" elem
before the owner's (sk/task/cgrp/inode) lifetime ended is not
the common usage for the local storage.

The common free path, bpf_local_storage_destroy(), can reuse the
memory immediately. This common path means the storage stays with
its owner until the owner is destroyed.

The above mentioned "delete" elem paths that cannot
reuse immediately always have 'use_trace_rcu == true'.
The cases that are safe for immediate reuse always have
'use_trace_rcu == false'. Instead of adding another arg
in a later patch, this patch repurposes this arg
to reuse_now and has the test logic reversed.

In a later patch, 'reuse_now == true' will free to the
bpf_mem_cache_free() where the memory can be reused
immediately. 'reuse_now == false' will go through the
call_rcu_tasks_trace().

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  2 +-
 kernel/bpf/bpf_cgrp_storage.c     |  2 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 24 ++++++++++++------------
 kernel/bpf/bpf_task_storage.c     |  2 +-
 net/core/bpf_sk_storage.c         |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 613b1805ed9f..18a31add2255 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
 				   struct bpf_local_storage_elem *selem);
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu);
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now);
 
 void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 			struct bpf_local_storage_elem *selem);
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 492594d69a86..c975cacdd16b 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -121,7 +121,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 	return 0;
 }
 
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 2d25bcfa371b..ad2ab0187e45 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -122,7 +122,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 5585dbfd9c66..70c34a948c3c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -147,7 +147,7 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
  */
 static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
 					    struct bpf_local_storage_elem *selem,
-					    bool uncharge_mem, bool use_trace_rcu)
+					    bool uncharge_mem, bool reuse_now)
 {
 	struct bpf_local_storage_map *smap;
 	bool free_local_storage;
@@ -201,7 +201,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	 * any special fields.
 	 */
 	rec = smap->map.record;
-	if (use_trace_rcu) {
+	if (!reuse_now) {
 		if (!IS_ERR_OR_NULL(rec))
 			call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
 		else
@@ -220,7 +220,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 }
 
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
-				     bool use_trace_rcu)
+				     bool reuse_now)
 {
 	struct bpf_local_storage *local_storage;
 	bool free_local_storage = false;
@@ -235,11 +235,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, true, use_trace_rcu);
+			local_storage, selem, true, reuse_now);
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage) {
-		if (use_trace_rcu)
+		if (!reuse_now)
 			call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_rcu);
 		else
@@ -284,14 +284,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 }
 
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
 {
 	/* Always unlink from map before unlinking from local_storage
 	 * because selem will be freed after successfully unlinked from
 	 * the local_storage.
 	 */
 	bpf_selem_unlink_map(selem);
-	bpf_selem_unlink_storage(selem, use_trace_rcu);
+	bpf_selem_unlink_storage(selem, reuse_now);
 }
 
 /* If cacheit_lockit is false, this lookup function is lockless */
@@ -538,7 +538,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 	if (old_sdata) {
 		bpf_selem_unlink_map(SELEM(old_sdata));
 		bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
-						false, true);
+						false, false);
 	}
 
 unlock:
@@ -651,7 +651,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 		 * of the loop will set the free_cgroup_storage to true.
 		 */
 		free_storage = bpf_selem_unlink_storage_nolock(
-			local_storage, selem, false, false);
+			local_storage, selem, false, true);
 	}
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
@@ -745,7 +745,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 				migrate_disable();
 				this_cpu_inc(*busy_counter);
 			}
-			bpf_selem_unlink(selem, false);
+			bpf_selem_unlink(selem, true);
 			if (busy_counter) {
 				this_cpu_dec(*busy_counter);
 				migrate_enable();
@@ -783,8 +783,8 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 		/* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
 		 * is true, because while call_rcu invocation is skipped in that
 		 * case in bpf_selem_free_fields_trace_rcu (and all local
-		 * storage maps pass use_trace_rcu = true), there can be
-		 * call_rcu callbacks based on use_trace_rcu = false in the
+		 * storage maps pass reuse_now = false), there can be
+		 * call_rcu callbacks based on reuse_now = true in the
 		 * while ((selem = ...)) loop above or when owner's free path
 		 * calls bpf_local_storage_unlink_nolock.
 		 */
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 4dcef28744d1..c88cc04c17c1 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -168,7 +168,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
 	if (!nobusy)
 		return -EBUSY;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 8f56438c104b..a5f185b8e50a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -40,7 +40,7 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
 	if (!sdata)
 		return -ENOENT;
 
-	bpf_selem_unlink(SELEM(sdata), true);
+	bpf_selem_unlink(SELEM(sdata), false);
 
 	return 0;
 }
-- 
cgit v1.2.3


From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Tue, 7 Mar 2023 22:59:28 -0800
Subject: bpf: Add bpf_selem_free()

This patch refactors the selem freeing logic into bpf_selem_free().
It is a preparation work for a later patch using
bpf_mem_cache_alloc/free. The other kfree(selem) cases
are also changed to bpf_selem_free(..., reuse_now = true).

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  4 ++++
 kernel/bpf/bpf_local_storage.c    | 21 ++++++++++++++-------
 net/core/bpf_sk_storage.c         |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 18a31add2255..a34f61467a2f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -152,6 +152,10 @@ struct bpf_local_storage_elem *
 bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
 		bool charge_mem, gfp_t gfp_flags);
 
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+		    struct bpf_local_storage_map *smap,
+		    bool reuse_now);
+
 int
 bpf_local_storage_alloc(void *owner,
 			struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 146e9caeda96..512943aac435 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -125,6 +125,17 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
 		call_rcu(rcu, bpf_selem_free_rcu);
 }
 
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+		    struct bpf_local_storage_map *smap,
+		    bool reuse_now)
+{
+	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+	if (!reuse_now)
+		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+	else
+		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+}
+
 /* local_storage->lock must be held and selem->local_storage == local_storage.
  * The caller must ensure selem->smap is still valid to be
  * dereferenced for its smap->elem_size and smap->cache_idx.
@@ -175,11 +186,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	    SDATA(selem))
 		RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
 
-	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	if (!reuse_now)
-		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	bpf_selem_free(selem, smap, reuse_now);
 
 	if (rcu_access_pointer(local_storage->smap) == smap)
 		RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -423,7 +430,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
 
 		err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
 		if (err) {
-			kfree(selem);
+			bpf_selem_free(selem, smap, true);
 			mem_uncharge(smap, owner, smap->elem_size);
 			return ERR_PTR(err);
 		}
@@ -517,7 +524,7 @@ unlock_err:
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 	if (selem) {
 		mem_uncharge(smap, owner, smap->elem_size);
-		kfree(selem);
+		bpf_selem_free(selem, smap, true);
 	}
 	return ERR_PTR(err);
 }
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index a5f185b8e50a..24c3dc0d62e5 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -197,7 +197,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
 		} else {
 			ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
 			if (ret) {
-				kfree(copy_selem);
+				bpf_selem_free(copy_selem, smap, true);
 				atomic_sub(smap->elem_size,
 					   &newsk->sk_omem_alloc);
 				bpf_map_put(map);
-- 
cgit v1.2.3


From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 9 Mar 2023 10:01:08 -0800
Subject: bpf: Change btf_record_find enum parameter to field_mask

btf_record_find's 3rd parameter can be multiple enum btf_field_type's
masked together. The function is called with BPF_KPTR in two places in
verifier.c, so it works with masked values already.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 2 +-
 kernel/bpf/syscall.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e64ff1e89fb2..3a38db315f7f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog);
 void bpf_map_free_id(struct bpf_map *map);
 
 struct btf_field *btf_record_find(const struct btf_record *rec,
-				  u32 offset, enum btf_field_type type);
+				  u32 offset, u32 field_mask);
 void btf_record_free(struct btf_record *rec);
 void bpf_map_free_record(struct bpf_map *map);
 struct btf_record *btf_record_dup(const struct btf_record *rec);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f406dfa13792..cc4b7684910c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b)
 }
 
 struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
-				  enum btf_field_type type)
+				  u32 field_mask)
 {
 	struct btf_field *field;
 
-	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
 		return NULL;
 	field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
-	if (!field || !(field->type & type))
+	if (!field || !(field->type & field_mask))
 		return NULL;
 	return field;
 }
-- 
cgit v1.2.3


From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Fri, 10 Mar 2023 15:07:41 -0800
Subject: bpf: Support __kptr to local kptrs

If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module
BTF - it must have been allocated by bpf_obj_new and therefore must be
free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local
kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new.

This patch adds support for treating __kptr-tagged pointers to "local
kptrs" as having an implicit bpf_obj_drop destructor for referenced kptr
acquire / release semantics. Consider the following example:

  struct node_data {
          long key;
          long data;
          struct bpf_rb_node node;
  };

  struct map_value {
          struct node_data __kptr *node;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __type(key, int);
          __type(value, struct map_value);
          __uint(max_entries, 1);
  } some_nodes SEC(".maps");

If struct node_data had a matching definition in kernel BTF, the verifier would
expect a destructor for the type to be registered. Since struct node_data does
not match any type in kernel BTF, the verifier knows that there is no kfunc
that provides a PTR_TO_BTF_ID to this type, and that such a PTR_TO_BTF_ID can
only come from bpf_obj_new. So instead of searching for a registered dtor,
a bpf_obj_drop dtor can be assumed.

This allows the runtime to properly destruct such kptrs in
bpf_obj_free_fields, which enables maps to clean up map_vals w/ such
kptrs when going away.

Implementation notes:
  * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr.
    Before this patch, the variable would only ever point to vmlinux or
    module BTFs, but now it can point to some program BTF for local kptr
    type. It's later used to populate the (btf, btf_id) pair in kptr btf
    field.
  * It's necessary to btf_get the program BTF when populating btf_field
    for local kptr. btf_record_free later does a btf_put.
  * Behavior for non-local referenced kptrs is not modified, as
    bpf_find_btf_id helper only searches vmlinux and module BTFs for
    matching BTF type. If such a type is found, btf_field_kptr's btf will
    pass btf_is_kernel check, and the associated release function is
    some one-argument dtor. If btf_is_kernel check fails, associated
    release function is two-arg bpf_obj_drop_impl. Before this patch
    only btf_field_kptr's w/ kernel or module BTFs were created.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 11 ++++++++++-
 include/linux/btf.h  |  2 --
 kernel/bpf/btf.c     | 37 ++++++++++++++++++++++++++++---------
 kernel/bpf/helpers.c | 11 ++++++++---
 kernel/bpf/syscall.c | 14 +++++++++++++-
 5 files changed, 59 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3a38db315f7f..756b85f0d0d3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -189,10 +189,19 @@ enum btf_field_type {
 				 BPF_RB_NODE | BPF_RB_ROOT,
 };
 
+typedef void (*btf_dtor_kfunc_t)(void *);
+typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
+
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	btf_dtor_kfunc_t dtor;
+	union {
+		/* dtor used if btf_is_kernel(btf), otherwise the type
+		 * is program-allocated and obj_drop is used
+		 */
+		btf_dtor_kfunc_t dtor;
+		btf_dtor_obj_drop obj_drop;
+	};
 	u32 btf_id;
 };
 
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1bba0827e8c4..d53b10cc55f2 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -121,8 +121,6 @@ struct btf_struct_metas {
 	struct btf_struct_meta types[];
 };
 
-typedef void (*btf_dtor_kfunc_t)(void *);
-
 extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 37779ceefd09..66fad7a16b6c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3551,12 +3551,17 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
 	struct module *mod = NULL;
 	const struct btf_type *t;
-	struct btf *kernel_btf;
+	/* If a matching btf type is found in kernel or module BTFs, kptr_btf
+	 * is that BTF, otherwise it's program BTF
+	 */
+	struct btf *kptr_btf;
 	int ret;
 	s32 id;
 
@@ -3565,7 +3570,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 	 */
 	t = btf_type_by_id(btf, info->kptr.type_id);
 	id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
-			     &kernel_btf);
+			     &kptr_btf);
+	if (id == -ENOENT) {
+		/* btf_parse_kptr should only be called w/ btf = program BTF */
+		WARN_ON_ONCE(btf_is_kernel(btf));
+
+		/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+		 * kptr allocated via bpf_obj_new
+		 */
+		field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
+		id = info->kptr.type_id;
+		kptr_btf = (struct btf *)btf;
+		btf_get(kptr_btf);
+		goto found_dtor;
+	}
 	if (id < 0)
 		return id;
 
@@ -3582,20 +3600,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		 * can be used as a referenced pointer and be stored in a map at
 		 * the same time.
 		 */
-		dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+		dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
 		if (dtor_btf_id < 0) {
 			ret = dtor_btf_id;
 			goto end_btf;
 		}
 
-		dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+		dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
 		if (!dtor_func) {
 			ret = -ENOENT;
 			goto end_btf;
 		}
 
-		if (btf_is_module(kernel_btf)) {
-			mod = btf_try_get_module(kernel_btf);
+		if (btf_is_module(kptr_btf)) {
+			mod = btf_try_get_module(kptr_btf);
 			if (!mod) {
 				ret = -ENXIO;
 				goto end_btf;
@@ -3605,7 +3623,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		/* We already verified dtor_func to be btf_type_is_func
 		 * in register_btf_id_dtor_kfuncs.
 		 */
-		dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+		dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
 		addr = kallsyms_lookup_name(dtor_func_name);
 		if (!addr) {
 			ret = -EINVAL;
@@ -3614,14 +3632,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		field->kptr.dtor = (void *)addr;
 	}
 
+found_dtor:
 	field->kptr.btf_id = id;
-	field->kptr.btf = kernel_btf;
+	field->kptr.btf = kptr_btf;
 	field->kptr.module = mod;
 	return 0;
 end_mod:
 	module_put(mod);
 end_btf:
-	btf_put(kernel_btf);
+	btf_put(kptr_btf);
 	return ret;
 }
 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f9b7eeedce08..77d64b6951b9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1896,14 +1896,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+	if (rec)
+		bpf_obj_free_fields(rec, p);
+	bpf_mem_free(&bpf_global_ma, p);
+}
+
 __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
 {
 	struct btf_struct_meta *meta = meta__ign;
 	void *p = p__alloc;
 
-	if (meta)
-		bpf_obj_free_fields(meta->record, p);
-	bpf_mem_free(&bpf_global_ma, p);
+	__bpf_obj_drop_impl(p, meta ? meta->record : NULL);
 }
 
 static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cc4b7684910c..0684febc447a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -659,8 +659,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		return;
 	fields = rec->fields;
 	for (i = 0; i < rec->cnt; i++) {
+		struct btf_struct_meta *pointee_struct_meta;
 		const struct btf_field *field = &fields[i];
 		void *field_ptr = obj + field->offset;
+		void *xchgd_field;
 
 		switch (fields[i].type) {
 		case BPF_SPIN_LOCK:
@@ -672,7 +674,17 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
 		case BPF_KPTR_REF:
-			field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+			xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+			if (!btf_is_kernel(field->kptr.btf)) {
+				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+									   field->kptr.btf_id);
+				WARN_ON_ONCE(!pointee_struct_meta);
+				field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
+								  pointee_struct_meta->record :
+								  NULL);
+			} else {
+				field->kptr.dtor(xchgd_field);
+			}
 			break;
 		case BPF_LIST_HEAD:
 			if (WARN_ON_ONCE(rec->spin_lock_off < 0))
-- 
cgit v1.2.3


From 22df776a9a866713d9decfb92b633bcfdb571954 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 15 Feb 2023 17:30:33 -0600
Subject: tasks: Extract rcu_users out of union

In commit 3fbd7ee285b2b ("tasks: Add a count of task RCU users"), a
count on the number of RCU users was added to struct task_struct. This
was done so as to enable the removal of task_rcu_dereference(), and
allow tasks to be protected by RCU even after exiting and being removed
from the runqueue. In this commit, the 'refcount_t rcu_users' field that
keeps track of this refcount was put into a union co-located with
'struct rcu_head rcu', so as to avoid taking up any extra space in
task_struct. This was possible to do safely, because the field was only
ever decremented by a static set of specific callers, and then never
incremented again.

While this restriction of there only being a small, static set of users
of this field has worked fine, it prevents us from leveraging the field
to use RCU to protect tasks in other contexts.

During tracing, for example, it would be useful to be able to collect
some tasks that performed a certain operation, put them in a map, and
then periodically summarize who they are, which cgroup they're in, how
much CPU time they've utilized, etc. While this can currently be done
with 'usage', it becomes tricky when a task is already in a map, or if a
reference should only be taken if a task is valid and will not soon be
reaped. Ideally, we could do something like pass a reference to a map
value, and then try to acquire a reference to the task in an RCU read
region by using refcount_inc_not_zero().

Similarly, in sched_ext, schedulers are using integer pids to remember
tasks, and then looking them up with find_task_by_pid_ns(). This is
slow, error prone, and adds complexity. It would be more convenient and
performant if BPF schedulers could instead store tasks directly in maps,
and then leverage RCU to ensure they can be safely accessed with low
overhead.

Finally, overloading fields like this is error prone. Someone that wants
to use 'rcu_users' could easily overlook the fact that once the rcu
callback is scheduled, the refcount will go back to being nonzero, thus
precluding the use of refcount_inc_not_zero(). Furthermore, as described
below, it's possible to extract the fields of the union without changing
the size of task_struct.

There are several possible ways to enable this:

1. The lightest touch approach is likely the one proposed in this patch,
   which is to simply extract 'rcu_users' and 'rcu' from the union, so
   that scheduling the 'rcu' callback doesn't overwrite the 'rcu_users'
   refcount. If we have a trusted task pointer, this would allow us to
   use refcount_inc_not_zero() inside of an RCU region to determine if we
   can safely acquire a reference to the task and store it in a map. As
   mentioned below, this can be done without changing the size of
   task_struct, by moving the location of the union to another location
   that has padding gaps we can fill in.

2. Removing 'refcount_t rcu_users', and instead having the entire task
   be freed in an rcu callback. This is likely the most sound overall
   design, though it changes the behavioral semantics exposed to
   callers, who currently expect that a task that's successfully looked
   up in e.g. the pid_list with find_task_by_pid_ns(), can always have a
   'usage' reference acquired on them, as it's guaranteed to be >
   0 until after the next gp. In order for this approach to work, we'd
   have to audit all callers. This approach also slightly changes
   behavior observed by user space by not invoking
   trace_sched_process_free() until the whole task_struct is actually being
   freed, rather than just after it's exited. It also may change
   timings, as memory will be freed in an RCU callback rather than
   immediately when the final 'usage' refcount drops to 0. This also is
   arguably a benefit, as it provides more predictable performance to
   callers who are refcounting tasks.

3. There may be other solutions as well that don't require changing the
   layout of task_struct. For example, we could possibly do something
   complex from the BPF side, such as listen for task exit and remove a
   task from a map when the task is exiting. This would likely require
   significant custom handling for task_struct in the verifier, so a
   more generalizable solution is likely warranted.

As mentioned above, this patch proposes the lightest-touch approach
which allows callers elsewhere in the kernel to use 'rcu_users' to
ensure the lifetime of a task, by extracting 'rcu_users' and 'rcu' from
the union. There is no size change in task_struct with this patch.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: David Vernet <void@manifault.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20230215233033.889644-1-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/sched.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..b11b4517760f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,11 +1318,6 @@ struct task_struct {
 
 	struct tlbflush_unmap_batch	tlb_ubc;
 
-	union {
-		refcount_t		rcu_users;
-		struct rcu_head		rcu;
-	};
-
 	/* Cache last used pipe for splice(): */
 	struct pipe_inode_info		*splice_pipe;
 
@@ -1459,6 +1454,8 @@ struct task_struct {
 	unsigned long			saved_state_change;
 # endif
 #endif
+	struct rcu_head			rcu;
+	refcount_t			rcu_users;
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
-- 
cgit v1.2.3


From be50da3e9d4ad1958f7b11322d44d94d5c25a4c1 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 9 Mar 2023 10:45:59 +0100
Subject: net: virtio_net: implement exact header length guest feature

Virtio spec introduced a feature VIRTIO_NET_F_GUEST_HDRLEN which, when
set, implies that the device benefits from knowing the exact size
of the header. For compatibility, to signal to the device that
the header is reliable, the driver also needs to set this feature.
Without this feature set by the driver, the device has to figure
out the header size itself.

Quoting the original virtio spec:
"hdr_len is a hint to the device as to how much of the header needs to
 be kept to copy into each packet"

"a hint" might not be clear to the reader: does it mean "maybe like
that" or "exactly like that"? This feature just makes it crystal clear
and lets the device count on hdr_len being filled in with the exact
length of the header.

Also note the spec already has following note about hdr_len:
"Due to various bugs in implementations, this field is not useful
 as a guarantee of the transport header size."

Without this feature the device needs to parse the header in core
data path handling. Accurate information helps the device to eliminate
such header parsing and directly use the hardware accelerators
for GSO operation.

virtio_net_hdr_from_skb() sets hdr_len to skb_headlen(skb), so the
driver already fills in the correct value. Introduce the
feature and advertise it.

Note that virtio spec also includes following note for device
implementation:
"Caution should be taken by the implementation so as to prevent
 a malicious driver from attacking the device by setting
 an incorrect hdr_len."

There is a plan to support this feature in our emulated device.
A device of SolidRun offers this feature bit. They claim this feature
will save the device a few cycles for every GSO packet.

Link: https://docs.oasis-open.org/virtio/virtio/v1.2/cs01/virtio-v1.2-cs01.html#x1-230006x3
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Alvaro Karsz <alvaro.karsz@solid-run.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20230309094559.917857-1-jiri@resnulli.us
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/virtio_net.c        | 6 ++++--
 include/uapi/linux/virtio_net.h | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fb5e68ed3ec2..e85b03988733 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -62,7 +62,8 @@ static const unsigned long guest_offloads[] = {
 	VIRTIO_NET_F_GUEST_UFO,
 	VIRTIO_NET_F_GUEST_CSUM,
 	VIRTIO_NET_F_GUEST_USO4,
-	VIRTIO_NET_F_GUEST_USO6
+	VIRTIO_NET_F_GUEST_USO6,
+	VIRTIO_NET_F_GUEST_HDRLEN
 };
 
 #define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
@@ -4213,7 +4214,8 @@ static struct virtio_device_id id_table[] = {
 	VIRTIO_NET_F_CTRL_MAC_ADDR, \
 	VIRTIO_NET_F_MTU, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, \
 	VIRTIO_NET_F_SPEED_DUPLEX, VIRTIO_NET_F_STANDBY, \
-	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL
+	VIRTIO_NET_F_RSS, VIRTIO_NET_F_HASH_REPORT, VIRTIO_NET_F_NOTF_COAL, \
+	VIRTIO_NET_F_GUEST_HDRLEN
 
 static unsigned int features[] = {
 	VIRTNET_FEATURES,
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index b4062bed186a..12c1c9699935 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -61,6 +61,7 @@
 #define VIRTIO_NET_F_GUEST_USO6	55	/* Guest can handle USOv6 in. */
 #define VIRTIO_NET_F_HOST_USO	56	/* Host can handle USO in. */
 #define VIRTIO_NET_F_HASH_REPORT  57	/* Supports hash report */
+#define VIRTIO_NET_F_GUEST_HDRLEN  59	/* Guest provides the exact hdr_len value. */
 #define VIRTIO_NET_F_RSS	  60	/* Supports RSS RX steering */
 #define VIRTIO_NET_F_RSC_EXT	  61	/* extended coalescing info */
 #define VIRTIO_NET_F_STANDBY	  62	/* Act as standby for another device
-- 
cgit v1.2.3


From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 13 Mar 2023 14:46:41 -0700
Subject: bpf: Disable migration when freeing stashed local kptr using obj drop

When a local kptr is stashed in a map and freed when the map goes away,
currently an error like the below appears:

[   39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875
[   39.196549] caller is bpf_mem_free+0x56/0xc0
[   39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G           O       6.2.0-13016-g22df776a9a86 #4477
[   39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
[   39.198949] Workqueue: events_unbound bpf_map_free_deferred
[   39.199470] Call Trace:
[   39.199703]  <TASK>
[   39.199911]  dump_stack_lvl+0x60/0x70
[   39.200267]  check_preemption_disabled+0xbf/0xe0
[   39.200704]  bpf_mem_free+0x56/0xc0
[   39.201032]  ? bpf_obj_new_impl+0xa0/0xa0
[   39.201430]  bpf_obj_free_fields+0x1cd/0x200
[   39.201838]  array_map_free+0xad/0x220
[   39.202193]  ? finish_task_switch+0xe5/0x3c0
[   39.202614]  bpf_map_free_deferred+0xea/0x210
[   39.203006]  ? lockdep_hardirqs_on_prepare+0xe/0x220
[   39.203460]  process_one_work+0x64f/0xbe0
[   39.203822]  ? pwq_dec_nr_in_flight+0x110/0x110
[   39.204264]  ? do_raw_spin_lock+0x107/0x1c0
[   39.204662]  ? lockdep_hardirqs_on_prepare+0xe/0x220
[   39.205107]  worker_thread+0x74/0x7a0
[   39.205451]  ? process_one_work+0xbe0/0xbe0
[   39.205818]  kthread+0x171/0x1a0
[   39.206111]  ? kthread_complete_and_exit+0x20/0x20
[   39.206552]  ret_from_fork+0x1f/0x30
[   39.206886]  </TASK>

This happens because the call to __bpf_obj_drop_impl I added in the patch
adding support for stashing local kptrs doesn't disable migration. Prior
to that patch, __bpf_obj_drop_impl logic only ran when called by a BPF
program, whereas now it can be called from the map free path, so it's
necessary to explicitly disable migration.

Also, refactor a bit to just call __bpf_obj_drop_impl directly instead
of bothering w/ dtor union and setting pointer-to-obj_drop.

Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs")
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 12 ++++--------
 kernel/bpf/btf.c     |  4 +---
 kernel/bpf/syscall.c | 10 +++++++---
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 756b85f0d0d3..71cc92a4ba48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -190,18 +190,14 @@ enum btf_field_type {
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
-typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *);
 
 struct btf_field_kptr {
 	struct btf *btf;
 	struct module *module;
-	union {
-		/* dtor used if btf_is_kernel(btf), otherwise the type
-		 * is program-allocated and obj_drop is used
-		 */
-		btf_dtor_kfunc_t dtor;
-		btf_dtor_obj_drop obj_drop;
-	};
+	/* dtor used if btf_is_kernel(btf), otherwise the type is
+	 * program-allocated, dtor is NULL,  and __bpf_obj_drop_impl is used
+	 */
+	btf_dtor_kfunc_t dtor;
 	u32 btf_id;
 };
 
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 66fad7a16b6c..b7e5a5510b91 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3551,8 +3551,6 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
 	return -EINVAL;
 }
 
-extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
-
 static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 			  struct btf_field_info *info)
 {
@@ -3578,7 +3576,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
 		/* Type exists only in program BTF. Assume that it's a MEM_ALLOC
 		 * kptr allocated via bpf_obj_new
 		 */
-		field->kptr.dtor = (void *)&__bpf_obj_drop_impl;
+		field->kptr.dtor = NULL;
 		id = info->kptr.type_id;
 		kptr_btf = (struct btf *)btf;
 		btf_get(kptr_btf);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0684febc447a..5b88301a2ae0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
@@ -679,9 +681,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 				pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
 									   field->kptr.btf_id);
 				WARN_ON_ONCE(!pointee_struct_meta);
-				field->kptr.obj_drop(xchgd_field, pointee_struct_meta ?
-								  pointee_struct_meta->record :
-								  NULL);
+				migrate_disable();
+				__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+								 pointee_struct_meta->record :
+								 NULL);
+				migrate_enable();
 			} else {
 				field->kptr.dtor(xchgd_field);
 			}
-- 
cgit v1.2.3


From 27d7fdf06fdb84455ff585b58c8034e2fab42583 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <zwisler@google.com>
Date: Mon, 13 Mar 2023 14:56:27 -0600
Subject: bpf: use canonical ftrace path

The canonical location for the tracefs filesystem is at /sys/kernel/tracing.

But, from Documentation/trace/ftrace.rst:

  Before 4.1, all ftrace tracing control files were within the debugfs
  file system, which is typically located at /sys/kernel/debug/tracing.
  For backward compatibility, when mounting the debugfs file system,
  the tracefs file system will be automatically mounted at:

  /sys/kernel/debug/tracing

Many comments and samples in the bpf code still refer to this older
debugfs path, so let's update them to avoid confusion.  There are a few
spots where the bpf code explicitly checks both tracefs and debugfs
(tools/bpf/bpftool/tracelog.c and tools/lib/api/fs/fs.c) and I've left
those alone so that the tools can continue to work with both paths.

Signed-off-by: Ross Zwisler <zwisler@google.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20230313205628.1058720-2-zwisler@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h           | 8 ++++----
 samples/bpf/cpustat_kern.c         | 4 ++--
 samples/bpf/hbm.c                  | 4 ++--
 samples/bpf/ibumad_kern.c          | 4 ++--
 samples/bpf/lwt_len_hist.sh        | 2 +-
 samples/bpf/offwaketime_kern.c     | 2 +-
 samples/bpf/task_fd_query_user.c   | 4 ++--
 samples/bpf/test_lwt_bpf.sh        | 2 +-
 samples/bpf/test_overhead_tp.bpf.c | 4 ++--
 tools/include/uapi/linux/bpf.h     | 8 ++++----
 10 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d8c534e05b0a..13129df937cd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1647,17 +1647,17 @@ union bpf_attr {
  * 	Description
  * 		This helper is a "printk()-like" facility for debugging. It
  * 		prints a message defined by format *fmt* (of size *fmt_size*)
- * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		to file *\/sys/kernel/tracing/trace* from TraceFS, if
  * 		available. It can take up to three additional **u64**
  * 		arguments (as an eBPF helpers, the total number of arguments is
  * 		limited to five).
  *
  * 		Each time the helper is called, it appends a line to the trace.
- * 		Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * 		open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * 		Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * 		open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
  * 		The format of the trace is customizable, and the exact output
  * 		one will get depends on the options set in
- * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*\/sys/kernel/tracing/trace_options* (see also the
  * 		*README* file under the same directory). However, it usually
  * 		defaults to something like:
  *
diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c
index 5aefd19cdfa1..944f13fe164a 100644
--- a/samples/bpf/cpustat_kern.c
+++ b/samples/bpf/cpustat_kern.c
@@ -76,8 +76,8 @@ struct {
 
 /*
  * The trace events for cpu_idle and cpu_frequency are taken from:
- * /sys/kernel/debug/tracing/events/power/cpu_idle/format
- * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
+ * /sys/kernel/tracing/events/power/cpu_idle/format
+ * /sys/kernel/tracing/events/power/cpu_frequency/format
  *
  * These two events have same format, so define one common structure.
  */
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index 516fbac28b71..ff58ec43f56a 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -65,7 +65,7 @@ static void Usage(void);
 static void read_trace_pipe2(void);
 static void do_error(char *msg, bool errno_flag);
 
-#define DEBUGFS "/sys/kernel/debug/tracing/"
+#define TRACEFS "/sys/kernel/tracing/"
 
 static struct bpf_program *bpf_prog;
 static struct bpf_object *obj;
@@ -77,7 +77,7 @@ static void read_trace_pipe2(void)
 	FILE *outf;
 	char *outFname = "hbm_out.log";
 
-	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	trace_fd = open(TRACEFS "trace_pipe", O_RDONLY, 0);
 	if (trace_fd < 0) {
 		printf("Error opening trace_pipe\n");
 		return;
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
index 9b193231024a..f07474c72525 100644
--- a/samples/bpf/ibumad_kern.c
+++ b/samples/bpf/ibumad_kern.c
@@ -39,8 +39,8 @@ struct {
 /* Taken from the current format defined in
  * include/trace/events/ib_umad.h
  * and
- * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read/format
- * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_write/format
+ * /sys/kernel/tracing/events/ib_umad/ib_umad_read/format
+ * /sys/kernel/tracing/events/ib_umad/ib_umad_write/format
  */
 struct ib_umad_rw_args {
 	u64 pad;
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
index 7078bfcc4f4d..381b2c634784 100755
--- a/samples/bpf/lwt_len_hist.sh
+++ b/samples/bpf/lwt_len_hist.sh
@@ -5,7 +5,7 @@ NS1=lwt_ns1
 VETH0=tst_lwt1a
 VETH1=tst_lwt1b
 BPF_PROG=lwt_len_hist.bpf.o
-TRACE_ROOT=/sys/kernel/debug/tracing
+TRACE_ROOT=/sys/kernel/tracing
 
 function cleanup {
 	# To reset saved histogram, remove pinned map
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
index eb4d94742e6b..23f12b47e9e5 100644
--- a/samples/bpf/offwaketime_kern.c
+++ b/samples/bpf/offwaketime_kern.c
@@ -110,7 +110,7 @@ static inline int update_counts(void *ctx, u32 pid, u64 delta)
 }
 
 #if 1
-/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+/* taken from /sys/kernel/tracing/events/sched/sched_switch/format */
 struct sched_switch_args {
 	unsigned long long pad;
 	char prev_comm[TASK_COMM_LEN];
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index a33d74bd3a4b..1e61f2180470 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -235,7 +235,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
 	struct bpf_link *link;
 	ssize_t bytes;
 
-	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+	snprintf(buf, sizeof(buf), "/sys/kernel/tracing/%s_events",
 		 event_type);
 	kfd = open(buf, O_WRONLY | O_TRUNC, 0);
 	CHECK_PERROR_RET(kfd < 0);
@@ -252,7 +252,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
 	close(kfd);
 	kfd = -1;
 
-	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+	snprintf(buf, sizeof(buf), "/sys/kernel/tracing/events/%ss/%s/id",
 		 event_type, event_alias);
 	efd = open(buf, O_RDONLY, 0);
 	CHECK_PERROR_RET(efd < 0);
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
index 2e9f5126963b..0bf2d0f6bf4b 100755
--- a/samples/bpf/test_lwt_bpf.sh
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -21,7 +21,7 @@ IP_LOCAL="192.168.99.1"
 
 PROG_SRC="test_lwt_bpf.c"
 BPF_PROG="test_lwt_bpf.o"
-TRACE_ROOT=/sys/kernel/debug/tracing
+TRACE_ROOT=/sys/kernel/tracing
 CONTEXT_INFO=$(cat ${TRACE_ROOT}/trace_options | grep context)
 
 function lookup_mac()
diff --git a/samples/bpf/test_overhead_tp.bpf.c b/samples/bpf/test_overhead_tp.bpf.c
index 67cab3881969..8b498328e961 100644
--- a/samples/bpf/test_overhead_tp.bpf.c
+++ b/samples/bpf/test_overhead_tp.bpf.c
@@ -7,7 +7,7 @@
 #include "vmlinux.h"
 #include <bpf/bpf_helpers.h>
 
-/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
+/* from /sys/kernel/tracing/events/task/task_rename/format */
 struct task_rename {
 	__u64 pad;
 	__u32 pid;
@@ -21,7 +21,7 @@ int prog(struct task_rename *ctx)
 	return 0;
 }
 
-/* from /sys/kernel/debug/tracing/events/fib/fib_table_lookup/format */
+/* from /sys/kernel/tracing/events/fib/fib_table_lookup/format */
 struct fib_table_lookup {
 	__u64 pad;
 	__u32 tb_id;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d8c534e05b0a..13129df937cd 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1647,17 +1647,17 @@ union bpf_attr {
  * 	Description
  * 		This helper is a "printk()-like" facility for debugging. It
  * 		prints a message defined by format *fmt* (of size *fmt_size*)
- * 		to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * 		to file *\/sys/kernel/tracing/trace* from TraceFS, if
  * 		available. It can take up to three additional **u64**
  * 		arguments (as an eBPF helpers, the total number of arguments is
  * 		limited to five).
  *
  * 		Each time the helper is called, it appends a line to the trace.
- * 		Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
- * 		open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * 		Lines are discarded while *\/sys/kernel/tracing/trace* is
+ * 		open, use *\/sys/kernel/tracing/trace_pipe* to avoid this.
  * 		The format of the trace is customizable, and the exact output
  * 		one will get depends on the options set in
- * 		*\/sys/kernel/debug/tracing/trace_options* (see also the
+ * 		*\/sys/kernel/tracing/trace_options* (see also the
  * 		*README* file under the same directory). However, it usually
  * 		defaults to something like:
  *
-- 
cgit v1.2.3


From 2c854e5fcd7e243f5a7cf6a6afa0ef83060c903c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Mon, 13 Mar 2023 22:55:51 +0100
Subject: net: page_pool, skbuff: make skb_mark_for_recycle() always available

skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, this creates
unneeded complication when using it in the generic code. For now, it's
only used in the drivers always selecting Page Pool, so this works.
Move the guards so that preprocessor will cut out only the operation
itself and the function will still be a noop on !PAGE_POOL systems,
but available there as well.
No functional changes.

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230313215553.1045175-3-aleksander.lobakin@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe661011644b..3f3a2a82a86b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
 #endif
 }
 
-#ifdef CONFIG_PAGE_POOL
 static inline void skb_mark_for_recycle(struct sk_buff *skb)
 {
+#ifdef CONFIG_PAGE_POOL
 	skb->pp_recycle = 1;
-}
 #endif
+}
 
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
-- 
cgit v1.2.3


From d4e492338d11937c55841b1279287280d6e35894 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Mon, 13 Mar 2023 22:55:53 +0100
Subject: xdp: remove unused {__,}xdp_release_frame()

__xdp_build_skb_from_frame() was the last user of
{__,}xdp_release_frame(), which detaches pages from the page_pool.
All the consumers now recycle Page Pool skbs and pages, except mlx5,
stmmac and tsnep drivers, which use page_pool_release_page() directly
(might change one day). It's safe to assume this functionality is not
needed anymore and can be removed (in favor of recycling).

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230313215553.1045175-5-aleksander.lobakin@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 29 -----------------------------
 net/core/xdp.c    | 15 ---------------
 2 files changed, 44 deletions(-)

(limited to 'include')

diff --git a/include/net/xdp.h b/include/net/xdp.h
index d517bfac937b..5393b3ebe56e 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -317,35 +317,6 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq);
 void xdp_return_frame_bulk(struct xdp_frame *xdpf,
 			   struct xdp_frame_bulk *bq);
 
-/* When sending xdp_frame into the network stack, then there is no
- * return point callback, which is needed to release e.g. DMA-mapping
- * resources with page_pool.  Thus, have explicit function to release
- * frame resources.
- */
-void __xdp_release_frame(void *data, struct xdp_mem_info *mem);
-static inline void xdp_release_frame(struct xdp_frame *xdpf)
-{
-	struct xdp_mem_info *mem = &xdpf->mem;
-	struct skb_shared_info *sinfo;
-	int i;
-
-	/* Curr only page_pool needs this */
-	if (mem->type != MEM_TYPE_PAGE_POOL)
-		return;
-
-	if (likely(!xdp_frame_has_frags(xdpf)))
-		goto out;
-
-	sinfo = xdp_get_shared_info_from_frame(xdpf);
-	for (i = 0; i < sinfo->nr_frags; i++) {
-		struct page *page = skb_frag_page(&sinfo->frags[i]);
-
-		__xdp_release_frame(page_address(page), mem);
-	}
-out:
-	__xdp_release_frame(xdpf->data, mem);
-}
-
 static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
 {
 	struct skb_shared_info *sinfo;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a2237cfca8e9..8d3ad315f18d 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -531,21 +531,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
 
-/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
-void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
-{
-	struct xdp_mem_allocator *xa;
-	struct page *page;
-
-	rcu_read_lock();
-	xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-	page = virt_to_head_page(data);
-	if (xa)
-		page_pool_release_page(xa->page_pool, page);
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(__xdp_release_frame);
-
 void xdp_attachment_setup(struct xdp_attachment_info *info,
 			  struct netdev_bpf *bpf)
 {
-- 
cgit v1.2.3


From b071af523579df7341cabf0f16fc661125e9a13f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 13 Mar 2023 20:17:31 +0000
Subject: neighbour: annotate lockless accesses to n->nud_state

We have many lockless accesses to n->nud_state.

Before adding another one in the following patch,
add annotations to readers and writers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_core.c  |  4 ++--
 include/net/neighbour.h         |  2 +-
 net/bridge/br_arp_nd_proxy.c    |  4 ++--
 net/bridge/br_netfilter_hooks.c |  3 ++-
 net/core/filter.c               |  4 ++--
 net/core/neighbour.c            | 28 ++++++++++++++--------------
 net/ipv4/arp.c                  |  8 ++++----
 net/ipv4/fib_semantics.c        |  4 ++--
 net/ipv4/nexthop.c              |  4 ++--
 net/ipv4/route.c                |  2 +-
 net/ipv6/ip6_output.c           |  2 +-
 net/ipv6/ndisc.c                |  4 ++--
 net/ipv6/route.c                |  2 +-
 13 files changed, 36 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index b1b179effe2a..f2c30214cae8 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1863,7 +1863,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 		struct vxlan_fdb *f;
 		struct sk_buff	*reply;
 
-		if (!(n->nud_state & NUD_CONNECTED)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
 			neigh_release(n);
 			goto out;
 		}
@@ -2027,7 +2027,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
 		struct vxlan_fdb *f;
 		struct sk_buff *reply;
 
-		if (!(n->nud_state & NUD_CONNECTED)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
 			neigh_release(n);
 			goto out;
 		}
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 234799ca527e..c8d39bba2a0d 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -464,7 +464,7 @@ static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
 
 	if (READ_ONCE(neigh->used) != now)
 		WRITE_ONCE(neigh->used, now);
-	if (!(neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)))
+	if (!(READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)))
 		return __neigh_event_send(neigh, skb, immediate_ok);
 	return 0;
 }
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index e5e48c6e35d7..b45c00c01dea 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -192,7 +192,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
 	if (n) {
 		struct net_bridge_fdb_entry *f;
 
-		if (!(n->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 			neigh_release(n);
 			return;
 		}
@@ -452,7 +452,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
 	if (n) {
 		struct net_bridge_fdb_entry *f;
 
-		if (!(n->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 			neigh_release(n);
 			return;
 		}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 638a4d5359db..3e3065bc0465 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -277,7 +277,8 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
 		struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 		int ret;
 
-		if ((neigh->nud_state & NUD_CONNECTED) && neigh->hh.hh_len) {
+		if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
+		    READ_ONCE(neigh->hh.hh_len)) {
 			neigh_hh_bridge(&neigh->hh, skb);
 			skb->dev = nf_bridge->physindev;
 			ret = br_handle_frame_finish(net, sk, skb);
diff --git a/net/core/filter.c b/net/core/filter.c
index 50f649f1b4a9..d052fac28d02 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5871,7 +5871,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	else
 		neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
 
-	if (!neigh || !(neigh->nud_state & NUD_VALID))
+	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
 	memcpy(params->dmac, neigh->ha, ETH_ALEN);
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
@@ -5992,7 +5992,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 	 * not needed here.
 	 */
 	neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
-	if (!neigh || !(neigh->nud_state & NUD_VALID))
+	if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
 		return BPF_FIB_LKUP_RET_NO_NEIGH;
 	memcpy(params->dmac, neigh->ha, ETH_ALEN);
 	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0116b0ff91a7..90d399b3f980 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1093,13 +1093,13 @@ static void neigh_timer_handler(struct timer_list *t)
 					  neigh->used +
 					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
 			neigh_dbg(2, "neigh %p is delayed\n", neigh);
-			neigh->nud_state = NUD_DELAY;
+			WRITE_ONCE(neigh->nud_state, NUD_DELAY);
 			neigh->updated = jiffies;
 			neigh_suspect(neigh);
 			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
 		} else {
 			neigh_dbg(2, "neigh %p is suspected\n", neigh);
-			neigh->nud_state = NUD_STALE;
+			WRITE_ONCE(neigh->nud_state, NUD_STALE);
 			neigh->updated = jiffies;
 			neigh_suspect(neigh);
 			notify = 1;
@@ -1109,14 +1109,14 @@ static void neigh_timer_handler(struct timer_list *t)
 				   neigh->confirmed +
 				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
 			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
-			neigh->nud_state = NUD_REACHABLE;
+			WRITE_ONCE(neigh->nud_state, NUD_REACHABLE);
 			neigh->updated = jiffies;
 			neigh_connect(neigh);
 			notify = 1;
 			next = neigh->confirmed + neigh->parms->reachable_time;
 		} else {
 			neigh_dbg(2, "neigh %p is probed\n", neigh);
-			neigh->nud_state = NUD_PROBE;
+			WRITE_ONCE(neigh->nud_state, NUD_PROBE);
 			neigh->updated = jiffies;
 			atomic_set(&neigh->probes, 0);
 			notify = 1;
@@ -1130,7 +1130,7 @@ static void neigh_timer_handler(struct timer_list *t)
 
 	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
 	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
-		neigh->nud_state = NUD_FAILED;
+		WRITE_ONCE(neigh->nud_state, NUD_FAILED);
 		notify = 1;
 		neigh_invalidate(neigh);
 		goto out;
@@ -1179,7 +1179,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
 			atomic_set(&neigh->probes,
 				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
 			neigh_del_timer(neigh);
-			neigh->nud_state = NUD_INCOMPLETE;
+			WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
 			neigh->updated = now;
 			if (!immediate_ok) {
 				next = now + 1;
@@ -1191,7 +1191,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
 			}
 			neigh_add_timer(neigh, next);
 		} else {
-			neigh->nud_state = NUD_FAILED;
+			WRITE_ONCE(neigh->nud_state, NUD_FAILED);
 			neigh->updated = jiffies;
 			write_unlock_bh(&neigh->lock);
 
@@ -1201,7 +1201,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
 	} else if (neigh->nud_state & NUD_STALE) {
 		neigh_dbg(2, "neigh %p is delayed\n", neigh);
 		neigh_del_timer(neigh);
-		neigh->nud_state = NUD_DELAY;
+		WRITE_ONCE(neigh->nud_state, NUD_DELAY);
 		neigh->updated = jiffies;
 		neigh_add_timer(neigh, jiffies +
 				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
@@ -1313,7 +1313,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 	neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
 	if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
 		new = old & ~NUD_PERMANENT;
-		neigh->nud_state = new;
+		WRITE_ONCE(neigh->nud_state, new);
 		err = 0;
 		goto out;
 	}
@@ -1322,7 +1322,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 		neigh_del_timer(neigh);
 		if (old & NUD_CONNECTED)
 			neigh_suspect(neigh);
-		neigh->nud_state = new;
+		WRITE_ONCE(neigh->nud_state, new);
 		err = 0;
 		notify = old & NUD_VALID;
 		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1401,7 +1401,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
 						((new & NUD_REACHABLE) ?
 						 neigh->parms->reachable_time :
 						 0)));
-		neigh->nud_state = new;
+		WRITE_ONCE(neigh->nud_state, new);
 		notify = 1;
 	}
 
@@ -1488,7 +1488,7 @@ void __neigh_set_probe_once(struct neighbour *neigh)
 	neigh->updated = jiffies;
 	if (!(neigh->nud_state & NUD_FAILED))
 		return;
-	neigh->nud_state = NUD_INCOMPLETE;
+	WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
 	atomic_set(&neigh->probes, neigh_max_probes(neigh));
 	neigh_add_timer(neigh,
 			jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
@@ -3198,7 +3198,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 			}
 			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
 				break;
-			if (n->nud_state & ~NUD_NOARP)
+			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
 			n = rcu_dereference_bh(n->next);
@@ -3240,7 +3240,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 			if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
 				break;
 
-			if (n->nud_state & ~NUD_NOARP)
+			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
 			n = rcu_dereference_bh(n->next);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4f7237661afb..9456f5bb35e5 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -375,7 +375,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
 	if (probes < 0) {
-		if (!(neigh->nud_state & NUD_VALID))
+		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
 			pr_debug("trying to ucast probe in NUD_INVALID\n");
 		neigh_ha_snapshot(dst_ha, neigh, dev);
 		dst_hw = dst_ha;
@@ -1123,7 +1123,7 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev)
 
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		if (!(neigh->nud_state & NUD_NOARP)) {
+		if (!(READ_ONCE(neigh->nud_state) & NUD_NOARP)) {
 			read_lock_bh(&neigh->lock);
 			memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
 			r->arp_flags = arp_state_to_flags(neigh);
@@ -1144,12 +1144,12 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
 	struct neigh_table *tbl = &arp_tbl;
 
 	if (neigh) {
-		if ((neigh->nud_state & NUD_VALID) && !force) {
+		if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) {
 			neigh_release(neigh);
 			return 0;
 		}
 
-		if (neigh->nud_state & ~NUD_NOARP)
+		if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP)
 			err = neigh_update(neigh, NULL, NUD_FAILED,
 					   NEIGH_UPDATE_F_OVERRIDE|
 					   NEIGH_UPDATE_F_ADMIN, 0);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3bb890a40ed7..574ff450c4d2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -563,7 +563,7 @@ static int fib_detect_death(struct fib_info *fi, int order,
 		n = NULL;
 
 	if (n) {
-		state = n->nud_state;
+		state = READ_ONCE(n->nud_state);
 		neigh_release(n);
 	} else {
 		return 0;
@@ -2202,7 +2202,7 @@ static bool fib_good_nh(const struct fib_nh *nh)
 		else
 			n = NULL;
 		if (n)
-			state = n->nud_state;
+			state = READ_ONCE(n->nud_state);
 
 		rcu_read_unlock_bh();
 	}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index d8ef05347fd9..e28a99f1996b 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1128,7 +1128,7 @@ static bool ipv6_good_nh(const struct fib6_nh *nh)
 
 	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 	if (n)
-		state = n->nud_state;
+		state = READ_ONCE(n->nud_state);
 
 	rcu_read_unlock_bh();
 
@@ -1145,7 +1145,7 @@ static bool ipv4_good_nh(const struct fib_nh *nh)
 	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 				      (__force u32)nh->fib_nh_gw4);
 	if (n)
-		state = n->nud_state;
+		state = READ_ONCE(n->nud_state);
 
 	rcu_read_unlock_bh();
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index de6e3515ab4f..232009d216c4 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -784,7 +784,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 	if (!n)
 		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 	if (!IS_ERR(n)) {
-		if (!(n->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 			neigh_event_send(n, NULL);
 		} else {
 			if (fib_lookup(net, fl4, &res, 0) == 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4ce3f9d3bc8a..e5ed39a3c65f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1153,7 +1153,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	rcu_read_lock_bh();
 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 				      rt6_nexthop(rt, &fl6->daddr));
-	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
+	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
 	rcu_read_unlock_bh();
 
 	if (err) {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index c4be62c99f73..18634ebd20a4 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -745,7 +745,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 		saddr = &ipv6_hdr(skb)->saddr;
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
 	if (probes < 0) {
-		if (!(neigh->nud_state & NUD_VALID)) {
+		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
 			ND_PRINTK(1, dbg,
 				  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
 				  __func__, target);
@@ -1090,7 +1090,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
 		u8 old_flags = neigh->flags;
 		struct net *net = dev_net(dev);
 
-		if (neigh->nud_state & NUD_FAILED)
+		if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
 			goto out;
 
 		/*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fdb03df2287..25c00c6f5131 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -638,7 +638,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 	idev = __in6_dev_get(dev);
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 	if (neigh) {
-		if (neigh->nud_state & NUD_VALID)
+		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
 			goto out;
 
 		write_lock(&neigh->lock);
-- 
cgit v1.2.3


From f947568e258038d3c2f8f38a9a7dabaca36643ec Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.ibm.com>
Date: Mon, 13 Mar 2023 11:10:31 +0100
Subject: net/smc: Introduce explicit check for v2 support

Previously, v2 support was derived from a very specific format of the SEID
as part of the SMC-D codebase. Make this part of the SMC-D device API, so
implementers do not need to adhere to a specific SEID format.

Signed-off-by: Stefan Raspl <raspl@linux.ibm.com>
Reviewed-and-tested-by: Jan Karcher <jaka@linux.ibm.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Signed-off-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/s390/net/ism_drv.c | 7 +++++++
 include/net/smc.h          | 1 +
 net/smc/smc_ism.c          | 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index eb7e13486087..1c73d32966f1 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -842,6 +842,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx,
 	return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size);
 }
 
+static int smcd_supports_v2(void)
+{
+	return SYSTEM_EID.serial_number[0] != '0' ||
+		SYSTEM_EID.type[0] != '0';
+}
+
 static u64 smcd_get_local_gid(struct smcd_dev *smcd)
 {
 	return ism_get_local_gid(smcd->priv);
@@ -869,6 +875,7 @@ static const struct smcd_ops ism_ops = {
 	.reset_vlan_required = smcd_reset_vlan_required,
 	.signal_event = smcd_signal_ieq,
 	.move_data = smcd_move,
+	.supports_v2 = smcd_supports_v2,
 	.get_system_eid = ism_get_seid,
 	.get_local_gid = smcd_get_local_gid,
 	.get_chid = smcd_get_chid,
diff --git a/include/net/smc.h b/include/net/smc.h
index 597cb9381182..a002552be29c 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -67,6 +67,7 @@ struct smcd_ops {
 	int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
 			 bool sf, unsigned int offset, void *data,
 			 unsigned int size);
+	int (*supports_v2)(void);
 	u8* (*get_system_eid)(void);
 	u64 (*get_local_gid)(struct smcd_dev *dev);
 	u16 (*get_chid)(struct smcd_dev *dev);
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 3b0b7710c6b0..fbee2493091f 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -429,7 +429,7 @@ static void smcd_register_dev(struct ism_dev *ism)
 		u8 *system_eid = NULL;
 
 		system_eid = smcd->ops->get_system_eid();
-		if (system_eid[24] != '0' || system_eid[28] != '0') {
+		if (smcd->ops->supports_v2()) {
 			smc_ism_v2_capable = true;
 			memcpy(smc_ism_v2_system_eid, system_eid,
 			       SMC_MAX_EID_LEN);
-- 
cgit v1.2.3


From a02d83f9947d8f71904eda4de046630c3eb6802c Mon Sep 17 00:00:00 2001
From: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Date: Mon, 13 Mar 2023 12:32:11 +0100
Subject: scm: fix MSG_CTRUNC setting condition for SO_PASSSEC

Currently, the kernel sets the MSG_CTRUNC flag if the msg_control buffer
wasn't provided and SO_PASSCRED was set or if there was pending SCM_RIGHTS.

For some reason we have no corresponding check for SO_PASSSEC.

In the recvmsg(2) doc we have:
       MSG_CTRUNC
              indicates that some control data was discarded due to lack
              of space in the buffer for ancillary data.

So, we need to set MSG_CTRUNC flag for all types of SCM.

This change can break applications that don't check the MSG_CTRUNC flag.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Leon Romanovsky <leon@kernel.org>
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>

v2:
- commit message was rewritten according to Eric's suggestion
Acked-by: Paul Moore <paul@paul-moore.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/scm.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/scm.h b/include/net/scm.h
index 1ce365f4c256..585adc1346bd 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -105,16 +105,27 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc
 		}
 	}
 }
+
+static inline bool scm_has_secdata(struct socket *sock)
+{
+	return test_bit(SOCK_PASSSEC, &sock->flags);
+}
 #else
 static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
 { }
+
+static inline bool scm_has_secdata(struct socket *sock)
+{
+	return false;
+}
 #endif /* CONFIG_SECURITY_NETWORK */
 
 static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 				struct scm_cookie *scm, int flags)
 {
 	if (!msg->msg_control) {
-		if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp)
+		if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp ||
+		    scm_has_secdata(sock))
 			msg->msg_flags |= MSG_CTRUNC;
 		scm_destroy(scm);
 		return;
-- 
cgit v1.2.3


From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 10 Mar 2023 08:40:59 +0100
Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules

This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm
to functions located in modules:

1. The verifier tries to find the address to attach to in kallsyms. This
   is always done by searching the entire kallsyms, not respecting the
   module in which the function is located. Such an approach causes an
   incorrect attachment address to be computed if the function to attach
   to is shadowed by a function of the same name located earlier in
   kallsyms.

2. If the address to attach to is located in a module, the module
   reference is only acquired in register_fentry. If the module is
   unloaded between the place where the address is found
   (bpf_check_attach_target in the verifier) and register_fentry, it is
   possible that another module is loaded to the same address which may
   lead to potential errors.

Since the attachment must contain the BTF of the program to attach to,
we extract the module from it and search for the function address in the
correct module (resolving problem no. 1). Then, the module reference is
taken directly in bpf_check_attach_target and stored in the bpf program
(in bpf_prog_aux). The reference is only released when the program is
unloaded (resolving problem no. 2).

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/syscall.c     |  6 ++++++
 kernel/bpf/trampoline.c  | 28 ----------------------------
 kernel/bpf/verifier.c    | 18 +++++++++++++++++-
 kernel/module/internal.h |  5 +++++
 5 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 71cc92a4ba48..3ef98fb92987 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1103,6 +1103,7 @@ struct bpf_trampoline {
 struct bpf_attach_target_info {
 	struct btf_func_model fmodel;
 	long tgt_addr;
+	struct module *tgt_mod;
 	const char *tgt_name;
 	const struct btf_type *tgt_type;
 };
@@ -1406,6 +1407,7 @@ struct bpf_prog_aux {
 	 * main prog always has linfo_idx == 0
 	 */
 	u32 linfo_idx;
+	struct module *mod;
 	u32 num_exentries;
 	struct exception_table_entry *extable;
 	union {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5b88301a2ae0..099e9068bcdd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2067,6 +2067,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
 {
 	bpf_prog_kallsyms_del_all(prog);
 	btf_put(prog->aux->btf);
+	module_put(prog->aux->mod);
 	kvfree(prog->aux->jited_linfo);
 	kvfree(prog->aux->linfo);
 	kfree(prog->aux->kfunc_tab);
@@ -3113,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
 		if (err)
 			goto out_unlock;
 
+		if (tgt_info.tgt_mod) {
+			module_put(prog->aux->mod);
+			prog->aux->mod = tgt_info.tgt_mod;
+		}
+
 		tr = bpf_trampoline_get(key, &tgt_info);
 		if (!tr) {
 			err = -ENOMEM;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..f61d5138b12b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,6 @@
 #include <linux/btf.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/rcupdate_wait.h>
-#include <linux/module.h>
 #include <linux/static_call.h>
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
@@ -172,26 +171,6 @@ out:
 	return tr;
 }
 
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
-	struct module *mod;
-	int err = 0;
-
-	preempt_disable();
-	mod = __module_text_address((unsigned long) tr->func.addr);
-	if (mod && !try_module_get(mod))
-		err = -ENOENT;
-	preempt_enable();
-	tr->mod = mod;
-	return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
-	module_put(tr->mod);
-	tr->mod = NULL;
-}
-
 static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 {
 	void *ip = tr->func.addr;
@@ -202,8 +181,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
 	else
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
 
-	if (!ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
@@ -238,9 +215,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		tr->func.ftrace_managed = true;
 	}
 
-	if (bpf_trampoline_module_get(tr))
-		return -ENOENT;
-
 	if (tr->func.ftrace_managed) {
 		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
 		ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
@@ -248,8 +222,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
 		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
 	}
 
-	if (ret)
-		bpf_trampoline_module_put(tr);
 	return ret;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2bbd89279070..60793f793ca6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
+#include "../module/internal.h"
 
 #include "disasm.h"
 
@@ -18307,6 +18308,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	const char *tname;
 	struct btf *btf;
 	long addr = 0;
+	struct module *mod = NULL;
 
 	if (!btf_id) {
 		bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -18480,8 +18482,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			else
 				addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
 		} else {
-			addr = kallsyms_lookup_name(tname);
+			if (btf_is_module(btf)) {
+				mod = btf_try_get_module(btf);
+				if (mod)
+					addr = find_kallsyms_symbol_value(mod, tname);
+				else
+					addr = 0;
+			} else {
+				addr = kallsyms_lookup_name(tname);
+			}
 			if (!addr) {
+				module_put(mod);
 				bpf_log(log,
 					"The address of function %s cannot be found\n",
 					tname);
@@ -18521,11 +18532,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 				break;
 			}
 			if (ret) {
+				module_put(mod);
 				bpf_log(log, "%s is not sleepable\n", tname);
 				return ret;
 			}
 		} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
 			if (tgt_prog) {
+				module_put(mod);
 				bpf_log(log, "can't modify return codes of BPF programs\n");
 				return -EINVAL;
 			}
@@ -18534,6 +18547,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    !check_attach_modify_return(addr, tname))
 				ret = 0;
 			if (ret) {
+				module_put(mod);
 				bpf_log(log, "%s() is not modifiable\n", tname);
 				return ret;
 			}
@@ -18544,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 	tgt_info->tgt_addr = addr;
 	tgt_info->tgt_name = tname;
 	tgt_info->tgt_type = t;
+	tgt_info->tgt_mod = mod;
 	return 0;
 }
 
@@ -18623,6 +18638,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	/* store info about the attachment target that will be used later */
 	prog->aux->attach_func_proto = tgt_info.tgt_type;
 	prog->aux->attach_func_name = tgt_info.tgt_name;
+	prog->aux->mod = tgt_info.tgt_mod;
 
 	if (tgt_prog) {
 		prog->aux->saved_dst_prog_type = tgt_prog->type;
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..5c9170f9135c 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -256,6 +256,11 @@ static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From c1fef618d611b31964ab397aa0bf0611da94bade Mon Sep 17 00:00:00 2001
From: Sandipan Patra <spatra@nvidia.com>
Date: Mon, 13 Mar 2023 22:42:23 -0700
Subject: net/mlx5: Implement thermal zone

Implement thermal zone support for mlx5-based HW. The NIC
uses the temperature sensor provided by the ASIC to report the current
temperature to the thermal core.

Signed-off-by: Sandipan Patra <spatra@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Link: https://lore.kernel.org/r/20230314054234.267365-5-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/Makefile  |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c    |   6 ++
 drivers/net/ethernet/mellanox/mlx5/core/thermal.c | 108 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/thermal.h |  20 ++++
 include/linux/mlx5/driver.h                       |   3 +
 include/linux/mlx5/mlx5_ifc.h                     |  26 ++++++
 6 files changed, 164 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/thermal.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/thermal.h

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 8d4e25cc54ea..6c2f1d4a58ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -77,6 +77,7 @@ mlx5_core-$(CONFIG_MLX5_ESWITCH)   += esw/acl/helper.o \
 
 mlx5_core-$(CONFIG_MLX5_BRIDGE)    += esw/bridge.o en/rep/bridge.o
 
+mlx5_core-$(CONFIG_THERMAL)        += thermal.o
 mlx5_core-$(CONFIG_MLX5_MPFS)      += lib/mpfs.o
 mlx5_core-$(CONFIG_VXLAN)          += lib/vxlan.o
 mlx5_core-$(CONFIG_PTP_1588_CLOCK) += lib/clock.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 0ff0eb660495..644c889f9a32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -52,6 +52,7 @@
 #include <linux/version.h>
 #include <net/devlink.h>
 #include "mlx5_core.h"
+#include "thermal.h"
 #include "lib/eq.h"
 #include "fs_core.h"
 #include "lib/mpfs.h"
@@ -1768,6 +1769,10 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err);
 
+	err = mlx5_thermal_init(dev);
+	if (err)
+		dev_err(&pdev->dev, "mlx5_thermal_init failed with error code %d\n", err);
+
 	pci_save_state(pdev);
 	devlink_register(devlink);
 	return 0;
@@ -1796,6 +1801,7 @@ static void remove_one(struct pci_dev *pdev)
 	set_bit(MLX5_BREAK_FW_WAIT, &dev->intf_state);
 	devlink_unregister(devlink);
 	mlx5_sriov_disable(pdev);
+	mlx5_thermal_uninit(dev);
 	mlx5_crdump_disable(dev);
 	mlx5_drain_health_wq(dev);
 	mlx5_uninit_one(dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/thermal.c b/drivers/net/ethernet/mellanox/mlx5/core/thermal.c
new file mode 100644
index 000000000000..e47fa6fb836f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/thermal.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/thermal.h>
+#include <linux/err.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+#include "thermal.h"
+
+#define MLX5_THERMAL_POLL_INT_MSEC	1000
+#define MLX5_THERMAL_NUM_TRIPS		0
+#define MLX5_THERMAL_ASIC_SENSOR_INDEX	0
+
+/* Bit string indicating the writeability of trip points if any */
+#define MLX5_THERMAL_TRIP_MASK	(BIT(MLX5_THERMAL_NUM_TRIPS) - 1)
+
+struct mlx5_thermal {
+	struct mlx5_core_dev *mdev;
+	struct thermal_zone_device *tzdev;
+};
+
+static int mlx5_thermal_get_mtmp_temp(struct mlx5_core_dev *mdev, u32 id, int *p_temp)
+{
+	u32 mtmp_out[MLX5_ST_SZ_DW(mtmp_reg)] = {};
+	u32 mtmp_in[MLX5_ST_SZ_DW(mtmp_reg)] = {};
+	int err;
+
+	MLX5_SET(mtmp_reg, mtmp_in, sensor_index, id);
+
+	err = mlx5_core_access_reg(mdev, mtmp_in,  sizeof(mtmp_in),
+				   mtmp_out, sizeof(mtmp_out),
+				   MLX5_REG_MTMP, 0, 0);
+
+	if (err)
+		return err;
+
+	*p_temp = MLX5_GET(mtmp_reg, mtmp_out, temperature);
+
+	return 0;
+}
+
+static int mlx5_thermal_get_temp(struct thermal_zone_device *tzdev,
+				 int *p_temp)
+{
+	struct mlx5_thermal *thermal = tzdev->devdata;
+	struct mlx5_core_dev *mdev = thermal->mdev;
+	int err;
+
+	err = mlx5_thermal_get_mtmp_temp(mdev, MLX5_THERMAL_ASIC_SENSOR_INDEX, p_temp);
+
+	if (err)
+		return err;
+
+	/* The unit of temp returned is in 0.125 C. The thermal
+	 * framework expects the value in 0.001 C.
+	 */
+	*p_temp *= 125;
+
+	return 0;
+}
+
+static struct thermal_zone_device_ops mlx5_thermal_ops = {
+	.get_temp = mlx5_thermal_get_temp,
+};
+
+int mlx5_thermal_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_thermal *thermal;
+	struct thermal_zone_device *tzd;
+	const char *data = "mlx5";
+
+	tzd = thermal_zone_get_zone_by_name(data);
+	if (!IS_ERR(tzd))
+		return 0;
+
+	thermal = kzalloc(sizeof(*thermal), GFP_KERNEL);
+	if (!thermal)
+		return -ENOMEM;
+
+	thermal->mdev = mdev;
+	thermal->tzdev = thermal_zone_device_register(data,
+						      MLX5_THERMAL_NUM_TRIPS,
+						      MLX5_THERMAL_TRIP_MASK,
+						      thermal,
+						      &mlx5_thermal_ops,
+						      NULL, 0, MLX5_THERMAL_POLL_INT_MSEC);
+	if (IS_ERR(thermal->tzdev)) {
+		dev_err(mdev->device, "Failed to register thermal zone device (%s) %ld\n",
+			data, PTR_ERR(thermal->tzdev));
+		kfree(thermal);
+		return -EINVAL;
+	}
+
+	mdev->thermal = thermal;
+	return 0;
+}
+
+void mlx5_thermal_uninit(struct mlx5_core_dev *mdev)
+{
+	if (!mdev->thermal)
+		return;
+
+	thermal_zone_device_unregister(mdev->thermal->tzdev);
+	kfree(mdev->thermal);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/thermal.h b/drivers/net/ethernet/mellanox/mlx5/core/thermal.h
new file mode 100644
index 000000000000..7d752c122192
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/thermal.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef __MLX5_THERMAL_DRIVER_H
+#define __MLX5_THERMAL_DRIVER_H
+
+#if IS_ENABLED(CONFIG_THERMAL)
+int mlx5_thermal_init(struct mlx5_core_dev *mdev);
+void mlx5_thermal_uninit(struct mlx5_core_dev *mdev);
+#else
+static inline int mlx5_thermal_init(struct mlx5_core_dev *mdev)
+{
+	mdev->thermal = NULL;
+	return 0;
+}
+
+static inline void mlx5_thermal_uninit(struct mlx5_core_dev *mdev) { }
+#endif
+
+#endif /* __MLX5_THERMAL_DRIVER_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209..7a898113b6b7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -134,6 +134,7 @@ enum {
 	MLX5_REG_PCAM		 = 0x507f,
 	MLX5_REG_NODE_DESC	 = 0x6001,
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+	MLX5_REG_MTMP		 = 0x900A,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MFRL		 = 0x9028,
 	MLX5_REG_MLCR		 = 0x902b,
@@ -731,6 +732,7 @@ struct mlx5_fw_tracer;
 struct mlx5_vxlan;
 struct mlx5_geneve;
 struct mlx5_hv_vhca;
+struct mlx5_thermal;
 
 #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
 #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
@@ -808,6 +810,7 @@ struct mlx5_core_dev {
 	struct mlx5_rsc_dump    *rsc_dump;
 	u32                      vsc_addr;
 	struct mlx5_hv_vhca	*hv_vhca;
+	struct mlx5_thermal	*thermal;
 };
 
 struct mlx5_db {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 66d76e97a087..d2c164f0778c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -10869,6 +10869,31 @@ struct mlx5_ifc_mrtc_reg_bits {
 	u8         time_l[0x20];
 };
 
+struct mlx5_ifc_mtmp_reg_bits {
+	u8         reserved_at_0[0x14];
+	u8         sensor_index[0xc];
+
+	u8         reserved_at_20[0x10];
+	u8         temperature[0x10];
+
+	u8         mte[0x1];
+	u8         mtr[0x1];
+	u8         reserved_at_42[0xe];
+	u8         max_temperature[0x10];
+
+	u8         tee[0x2];
+	u8         reserved_at_62[0xe];
+	u8         temp_threshold_hi[0x10];
+
+	u8         reserved_at_80[0x10];
+	u8         temp_threshold_lo[0x10];
+
+	u8         reserved_at_a0[0x20];
+
+	u8         sensor_name_hi[0x20];
+	u8         sensor_name_lo[0x20];
+};
+
 union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_bufferx_reg_bits bufferx_reg;
 	struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout;
@@ -10931,6 +10956,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_mfrl_reg_bits mfrl_reg;
 	struct mlx5_ifc_mtutc_reg_bits mtutc_reg;
 	struct mlx5_ifc_mrtc_reg_bits mrtc_reg;
+	struct mlx5_ifc_mtmp_reg_bits mtmp_reg;
 	u8         reserved_at_0[0x60e0];
 };
 
-- 
cgit v1.2.3


From 028522e2844393abc44f7bd7477eb4a455f01579 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Mon, 13 Mar 2023 22:42:29 -0700
Subject: net/mlx5: Move needed PTYS functions to core layer

Downstream patches require devlink params to access the PTYS register, so
move the needed functions from mlx5e to the core layer.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Link: https://lore.kernel.org/r/20230314054234.267365-11-saeed@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/en/params.c    |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/port.c  | 157 +--------------------
 drivers/net/ethernet/mellanox/mlx5/core/en/port.h  |  14 --
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  12 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c  |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/port.c     | 151 ++++++++++++++++++++
 include/linux/mlx5/port.h                          |  16 +++
 8 files changed, 179 insertions(+), 177 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index a21bd1179477..561da78d3b5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -553,7 +553,7 @@ bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 	u32 link_speed = 0;
 	u32 pci_bw = 0;
 
-	mlx5e_port_max_linkspeed(mdev, &link_speed);
+	mlx5_port_max_linkspeed(mdev, &link_speed);
 	pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);
 	mlx5_core_dbg_once(mdev, "Max link speed = %d, PCI BW = %d\n",
 			   link_speed, pci_bw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
index 505ba41195b9..dbe2b19a9570 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -32,101 +32,6 @@
 
 #include "port.h"
 
-/* speed in units of 1Mb */
-static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = {
-	[MLX5E_1000BASE_CX_SGMII] = 1000,
-	[MLX5E_1000BASE_KX]       = 1000,
-	[MLX5E_10GBASE_CX4]       = 10000,
-	[MLX5E_10GBASE_KX4]       = 10000,
-	[MLX5E_10GBASE_KR]        = 10000,
-	[MLX5E_20GBASE_KR2]       = 20000,
-	[MLX5E_40GBASE_CR4]       = 40000,
-	[MLX5E_40GBASE_KR4]       = 40000,
-	[MLX5E_56GBASE_R4]        = 56000,
-	[MLX5E_10GBASE_CR]        = 10000,
-	[MLX5E_10GBASE_SR]        = 10000,
-	[MLX5E_10GBASE_ER]        = 10000,
-	[MLX5E_40GBASE_SR4]       = 40000,
-	[MLX5E_40GBASE_LR4]       = 40000,
-	[MLX5E_50GBASE_SR2]       = 50000,
-	[MLX5E_100GBASE_CR4]      = 100000,
-	[MLX5E_100GBASE_SR4]      = 100000,
-	[MLX5E_100GBASE_KR4]      = 100000,
-	[MLX5E_100GBASE_LR4]      = 100000,
-	[MLX5E_100BASE_TX]        = 100,
-	[MLX5E_1000BASE_T]        = 1000,
-	[MLX5E_10GBASE_T]         = 10000,
-	[MLX5E_25GBASE_CR]        = 25000,
-	[MLX5E_25GBASE_KR]        = 25000,
-	[MLX5E_25GBASE_SR]        = 25000,
-	[MLX5E_50GBASE_CR2]       = 50000,
-	[MLX5E_50GBASE_KR2]       = 50000,
-};
-
-static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
-	[MLX5E_SGMII_100M]			= 100,
-	[MLX5E_1000BASE_X_SGMII]		= 1000,
-	[MLX5E_5GBASE_R]			= 5000,
-	[MLX5E_10GBASE_XFI_XAUI_1]		= 10000,
-	[MLX5E_40GBASE_XLAUI_4_XLPPI_4]		= 40000,
-	[MLX5E_25GAUI_1_25GBASE_CR_KR]		= 25000,
-	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2]	= 50000,
-	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR]	= 50000,
-	[MLX5E_CAUI_4_100GBASE_CR4_KR4]		= 100000,
-	[MLX5E_100GAUI_2_100GBASE_CR2_KR2]	= 100000,
-	[MLX5E_200GAUI_4_200GBASE_CR4_KR4]	= 200000,
-	[MLX5E_400GAUI_8]			= 400000,
-	[MLX5E_100GAUI_1_100GBASE_CR_KR]	= 100000,
-	[MLX5E_200GAUI_2_200GBASE_CR2_KR2]	= 200000,
-	[MLX5E_400GAUI_4_400GBASE_CR4_KR4]	= 400000,
-};
-
-bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev)
-{
-	struct mlx5e_port_eth_proto eproto;
-	int err;
-
-	if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet))
-		return true;
-
-	err = mlx5_port_query_eth_proto(mdev, 1, true, &eproto);
-	if (err)
-		return false;
-
-	return !!eproto.cap;
-}
-
-static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev,
-				     const u32 **arr, u32 *size,
-				     bool force_legacy)
-{
-	bool ext = force_legacy ? false : mlx5e_ptys_ext_supported(mdev);
-
-	*size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) :
-		      ARRAY_SIZE(mlx5e_link_speed);
-	*arr  = ext ? mlx5e_ext_link_speed : mlx5e_link_speed;
-}
-
-int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
-			      struct mlx5e_port_eth_proto *eproto)
-{
-	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
-	int err;
-
-	if (!eproto)
-		return -EINVAL;
-
-	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port);
-	if (err)
-		return err;
-
-	eproto->cap   = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
-					   eth_proto_capability);
-	eproto->admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_admin);
-	eproto->oper  = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
-	return 0;
-}
-
 void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status,
 				 u8 *an_disable_cap, u8 *an_disable_admin)
 {
@@ -172,30 +77,14 @@ int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable,
 			    sizeof(out), MLX5_REG_PTYS, 0, 1);
 }
 
-u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
-			  bool force_legacy)
-{
-	unsigned long temp = eth_proto_oper;
-	const u32 *table;
-	u32 speed = 0;
-	u32 max_size;
-	int i;
-
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
-	i = find_first_bit(&temp, max_size);
-	if (i < max_size)
-		speed = table[i];
-	return speed;
-}
-
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 {
-	struct mlx5e_port_eth_proto eproto;
+	struct mlx5_port_eth_proto eproto;
 	bool force_legacy = false;
 	bool ext;
 	int err;
 
-	ext = mlx5e_ptys_ext_supported(mdev);
+	ext = mlx5_ptys_ext_supported(mdev);
 	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
 	if (err)
 		goto out;
@@ -205,7 +94,7 @@ int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 		if (err)
 			goto out;
 	}
-	*speed = mlx5e_port_ptys2speed(mdev, eproto.oper, force_legacy);
+	*speed = mlx5_port_ptys2speed(mdev, eproto.oper, force_legacy);
 	if (!(*speed))
 		err = -EINVAL;
 
@@ -213,46 +102,6 @@ out:
 	return err;
 }
 
-int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
-{
-	struct mlx5e_port_eth_proto eproto;
-	u32 max_speed = 0;
-	const u32 *table;
-	u32 max_size;
-	bool ext;
-	int err;
-	int i;
-
-	ext = mlx5e_ptys_ext_supported(mdev);
-	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
-	if (err)
-		return err;
-
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size, false);
-	for (i = 0; i < max_size; ++i)
-		if (eproto.cap & MLX5E_PROT_MASK(i))
-			max_speed = max(max_speed, table[i]);
-
-	*speed = max_speed;
-	return 0;
-}
-
-u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
-			       bool force_legacy)
-{
-	u32 link_modes = 0;
-	const u32 *table;
-	u32 max_size;
-	int i;
-
-	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
-	for (i = 0; i < max_size; ++i) {
-		if (table[i] == speed)
-			link_modes |= MLX5E_PROT_MASK(i);
-	}
-	return link_modes;
-}
-
 int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out)
 {
 	int sz = MLX5_ST_SZ_BYTES(pbmc_reg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
index 3f474e370828..d1da225f35da 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.h
@@ -36,25 +36,11 @@
 #include <linux/mlx5/driver.h>
 #include "en.h"
 
-struct mlx5e_port_eth_proto {
-	u32 cap;
-	u32 admin;
-	u32 oper;
-};
-
-int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
-			      struct mlx5e_port_eth_proto *eproto);
 void mlx5_port_query_eth_autoneg(struct mlx5_core_dev *dev, u8 *an_status,
 				 u8 *an_disable_cap, u8 *an_disable_admin);
 int mlx5_port_set_eth_ptys(struct mlx5_core_dev *dev, bool an_disable,
 			   u32 proto_admin, bool ext);
-u32 mlx5e_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
-			  bool force_legacy);
 int mlx5e_port_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
-int mlx5e_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
-u32 mlx5e_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
-			       bool force_legacy);
-bool mlx5e_ptys_ext_supported(struct mlx5_core_dev *mdev);
 int mlx5e_port_query_pbmc(struct mlx5_core_dev *mdev, void *out);
 int mlx5e_port_set_pbmc(struct mlx5_core_dev *mdev, void *in);
 int mlx5e_port_query_sbpr(struct mlx5_core_dev *mdev, u32 desc, u8 dir,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 7708acc9b2ab..53c35147f29b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -220,7 +220,7 @@ static void mlx5e_ethtool_get_speed_arr(struct mlx5_core_dev *mdev,
 					struct ptys2ethtool_config **arr,
 					u32 *size)
 {
-	bool ext = mlx5e_ptys_ext_supported(mdev);
+	bool ext = mlx5_ptys_ext_supported(mdev);
 
 	*arr = ext ? ptys2ext_ethtool_table : ptys2legacy_ethtool_table;
 	*size = ext ? ARRAY_SIZE(ptys2ext_ethtool_table) :
@@ -895,7 +895,7 @@ static void get_speed_duplex(struct net_device *netdev,
 	if (!netif_carrier_ok(netdev))
 		goto out;
 
-	speed = mlx5e_port_ptys2speed(priv->mdev, eth_proto_oper, force_legacy);
+	speed = mlx5_port_ptys2speed(priv->mdev, eth_proto_oper, force_legacy);
 	if (!speed) {
 		if (data_rate_oper)
 			speed = 100 * data_rate_oper;
@@ -980,7 +980,7 @@ static void get_lp_advertising(struct mlx5_core_dev *mdev, u32 eth_proto_lp,
 			       struct ethtool_link_ksettings *link_ksettings)
 {
 	unsigned long *lp_advertising = link_ksettings->link_modes.lp_advertising;
-	bool ext = mlx5e_ptys_ext_supported(mdev);
+	bool ext = mlx5_ptys_ext_supported(mdev);
 
 	ptys2ethtool_adver_link(lp_advertising, eth_proto_lp, ext);
 }
@@ -1160,7 +1160,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 				     const struct ethtool_link_ksettings *link_ksettings)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5e_port_eth_proto eproto;
+	struct mlx5_port_eth_proto eproto;
 	const unsigned long *adver;
 	bool an_changes = false;
 	u8 an_disable_admin;
@@ -1180,7 +1180,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 	autoneg = link_ksettings->base.autoneg;
 	speed = link_ksettings->base.speed;
 
-	ext_supported = mlx5e_ptys_ext_supported(mdev);
+	ext_supported = mlx5_ptys_ext_supported(mdev);
 	ext = ext_requested(autoneg, adver, ext_supported);
 	if (!ext_supported && ext)
 		return -EOPNOTSUPP;
@@ -1194,7 +1194,7 @@ int mlx5e_ethtool_set_link_ksettings(struct mlx5e_priv *priv,
 		goto out;
 	}
 	link_modes = autoneg == AUTONEG_ENABLE ? ethtool2ptys_adver_func(adver) :
-		mlx5e_port_speed2linkmodes(mdev, speed, !ext);
+		mlx5_port_speed2linkmodes(mdev, speed, !ext);
 
 	err = mlx5e_speed_validate(priv->netdev, ext, link_modes, autoneg);
 	if (err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 70b8d2dfa751..79dd8ad5ede7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1108,7 +1108,7 @@ mlx5e_hairpin_params_init(struct mlx5e_hairpin_params *hairpin_params,
 
 	hairpin_params->mdev = mdev;
 	/* set hairpin pair per each 50Gbs share of the link */
-	mlx5e_port_max_linkspeed(mdev, &link_speed);
+	mlx5_port_max_linkspeed(mdev, &link_speed);
 	link_speed = max_t(u32, link_speed, 50000);
 	link_speed64 = link_speed;
 	do_div(link_speed64, 50000);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 75015d370922..7c79476cc5f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -744,7 +744,7 @@ static int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char *
 	u64 value;
 	int err;
 
-	err = mlx5e_port_max_linkspeed(mdev, &link_speed_max);
+	err = mlx5_port_max_linkspeed(mdev, &link_speed_max);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "Failed to get link maximum speed");
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index a1548e6bfb35..0daeb4b72cca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -1054,3 +1054,154 @@ out:
 	kfree(out);
 	return err;
 }
+
+/* speed in units of 1Mb */
+static const u32 mlx5e_link_speed[MLX5E_LINK_MODES_NUMBER] = {
+	[MLX5E_1000BASE_CX_SGMII] = 1000,
+	[MLX5E_1000BASE_KX]       = 1000,
+	[MLX5E_10GBASE_CX4]       = 10000,
+	[MLX5E_10GBASE_KX4]       = 10000,
+	[MLX5E_10GBASE_KR]        = 10000,
+	[MLX5E_20GBASE_KR2]       = 20000,
+	[MLX5E_40GBASE_CR4]       = 40000,
+	[MLX5E_40GBASE_KR4]       = 40000,
+	[MLX5E_56GBASE_R4]        = 56000,
+	[MLX5E_10GBASE_CR]        = 10000,
+	[MLX5E_10GBASE_SR]        = 10000,
+	[MLX5E_10GBASE_ER]        = 10000,
+	[MLX5E_40GBASE_SR4]       = 40000,
+	[MLX5E_40GBASE_LR4]       = 40000,
+	[MLX5E_50GBASE_SR2]       = 50000,
+	[MLX5E_100GBASE_CR4]      = 100000,
+	[MLX5E_100GBASE_SR4]      = 100000,
+	[MLX5E_100GBASE_KR4]      = 100000,
+	[MLX5E_100GBASE_LR4]      = 100000,
+	[MLX5E_100BASE_TX]        = 100,
+	[MLX5E_1000BASE_T]        = 1000,
+	[MLX5E_10GBASE_T]         = 10000,
+	[MLX5E_25GBASE_CR]        = 25000,
+	[MLX5E_25GBASE_KR]        = 25000,
+	[MLX5E_25GBASE_SR]        = 25000,
+	[MLX5E_50GBASE_CR2]       = 50000,
+	[MLX5E_50GBASE_KR2]       = 50000,
+};
+
+static const u32 mlx5e_ext_link_speed[MLX5E_EXT_LINK_MODES_NUMBER] = {
+	[MLX5E_SGMII_100M] = 100,
+	[MLX5E_1000BASE_X_SGMII] = 1000,
+	[MLX5E_5GBASE_R] = 5000,
+	[MLX5E_10GBASE_XFI_XAUI_1] = 10000,
+	[MLX5E_40GBASE_XLAUI_4_XLPPI_4] = 40000,
+	[MLX5E_25GAUI_1_25GBASE_CR_KR] = 25000,
+	[MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2] = 50000,
+	[MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR] = 50000,
+	[MLX5E_CAUI_4_100GBASE_CR4_KR4] = 100000,
+	[MLX5E_100GAUI_2_100GBASE_CR2_KR2] = 100000,
+	[MLX5E_200GAUI_4_200GBASE_CR4_KR4] = 200000,
+	[MLX5E_400GAUI_8] = 400000,
+	[MLX5E_100GAUI_1_100GBASE_CR_KR] = 100000,
+	[MLX5E_200GAUI_2_200GBASE_CR2_KR2] = 200000,
+	[MLX5E_400GAUI_4_400GBASE_CR4_KR4] = 400000,
+};
+
+int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
+			      struct mlx5_port_eth_proto *eproto)
+{
+	u32 out[MLX5_ST_SZ_DW(ptys_reg)];
+	int err;
+
+	if (!eproto)
+		return -EINVAL;
+
+	err = mlx5_query_port_ptys(dev, out, sizeof(out), MLX5_PTYS_EN, port);
+	if (err)
+		return err;
+
+	eproto->cap   = MLX5_GET_ETH_PROTO(ptys_reg, out, ext,
+					   eth_proto_capability);
+	eproto->admin = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_admin);
+	eproto->oper  = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
+	return 0;
+}
+
+bool mlx5_ptys_ext_supported(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_port_eth_proto eproto;
+	int err;
+
+	if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet))
+		return true;
+
+	err = mlx5_port_query_eth_proto(mdev, 1, true, &eproto);
+	if (err)
+		return false;
+
+	return !!eproto.cap;
+}
+
+static void mlx5e_port_get_speed_arr(struct mlx5_core_dev *mdev,
+				     const u32 **arr, u32 *size,
+				     bool force_legacy)
+{
+	bool ext = force_legacy ? false : mlx5_ptys_ext_supported(mdev);
+
+	*size = ext ? ARRAY_SIZE(mlx5e_ext_link_speed) :
+		      ARRAY_SIZE(mlx5e_link_speed);
+	*arr  = ext ? mlx5e_ext_link_speed : mlx5e_link_speed;
+}
+
+u32 mlx5_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
+			 bool force_legacy)
+{
+	unsigned long temp = eth_proto_oper;
+	const u32 *table;
+	u32 speed = 0;
+	u32 max_size;
+	int i;
+
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
+	i = find_first_bit(&temp, max_size);
+	if (i < max_size)
+		speed = table[i];
+	return speed;
+}
+
+u32 mlx5_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
+			      bool force_legacy)
+{
+	u32 link_modes = 0;
+	const u32 *table;
+	u32 max_size;
+	int i;
+
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, force_legacy);
+	for (i = 0; i < max_size; ++i) {
+		if (table[i] == speed)
+			link_modes |= MLX5E_PROT_MASK(i);
+	}
+	return link_modes;
+}
+
+int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+	struct mlx5_port_eth_proto eproto;
+	u32 max_speed = 0;
+	const u32 *table;
+	u32 max_size;
+	bool ext;
+	int err;
+	int i;
+
+	ext = mlx5_ptys_ext_supported(mdev);
+	err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
+	if (err)
+		return err;
+
+	mlx5e_port_get_speed_arr(mdev, &table, &max_size, false);
+	for (i = 0; i < max_size; ++i)
+		if (eproto.cap & MLX5E_PROT_MASK(i))
+			max_speed = max(max_speed, table[i]);
+
+	*speed = max_speed;
+	return 0;
+}
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index e96ee1e348cb..98b2e1e149f9 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -141,6 +141,12 @@ enum mlx5_ptys_width {
 	MLX5_PTYS_WIDTH_12X	= 1 << 4,
 };
 
+struct mlx5_port_eth_proto {
+	u32 cap;
+	u32 admin;
+	u32 oper;
+};
+
 #define MLX5E_PROT_MASK(link_mode) (1U << link_mode)
 #define MLX5_GET_ETH_PROTO(reg, out, ext, field)	\
 	(ext ? MLX5_GET(reg, out, ext_##field) :	\
@@ -218,4 +224,14 @@ int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state);
 int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state);
 int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio);
 int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio);
+
+int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext,
+			      struct mlx5_port_eth_proto *eproto);
+bool mlx5_ptys_ext_supported(struct mlx5_core_dev *mdev);
+u32 mlx5_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper,
+			 bool force_legacy);
+u32 mlx5_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed,
+			      bool force_legacy);
+int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
+
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From 053fdaa841bd1af9fe9c2c30bba81119059aac95 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 14 Mar 2023 15:13:08 -0500
Subject: nfc: mrvl: Move platform_data struct into driver

There are no users of nfcmrvl platform_data struct outside of the
driver and none will be added, so move it into the driver.

Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/nfc/nfcmrvl/nfcmrvl.h         | 30 ++++++++++++++++++++--
 include/linux/platform_data/nfcmrvl.h | 48 -----------------------------------
 2 files changed, 28 insertions(+), 50 deletions(-)
 delete mode 100644 include/linux/platform_data/nfcmrvl.h

(limited to 'include')

diff --git a/drivers/nfc/nfcmrvl/nfcmrvl.h b/drivers/nfc/nfcmrvl/nfcmrvl.h
index 165bd0a95190..0f22b3233f73 100644
--- a/drivers/nfc/nfcmrvl/nfcmrvl.h
+++ b/drivers/nfc/nfcmrvl/nfcmrvl.h
@@ -8,8 +8,6 @@
 #ifndef _NFCMRVL_H_
 #define _NFCMRVL_H_
 
-#include <linux/platform_data/nfcmrvl.h>
-
 #include "fw_dnld.h"
 
 /* Define private flags: */
@@ -50,6 +48,34 @@ enum nfcmrvl_phy {
 	NFCMRVL_PHY_SPI		= 3,
 };
 
+struct nfcmrvl_platform_data {
+	/*
+	 * Generic
+	 */
+
+	/* GPIO that is wired to RESET_N signal */
+	int reset_n_io;
+	/* Tell if transport is muxed in HCI one */
+	unsigned int hci_muxed;
+
+	/*
+	 * UART specific
+	 */
+
+	/* Tell if UART needs flow control at init */
+	unsigned int flow_control;
+	/* Tell if firmware supports break control for power management */
+	unsigned int break_control;
+
+
+	/*
+	 * I2C specific
+	 */
+
+	unsigned int irq;
+	unsigned int irq_polarity;
+};
+
 struct nfcmrvl_private {
 
 	unsigned long flags;
diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h
deleted file mode 100644
index 9e75ac8d19be..000000000000
--- a/include/linux/platform_data/nfcmrvl.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) 2015, Marvell International Ltd.
- *
- * This software file (the "File") is distributed by Marvell International
- * Ltd. under the terms of the GNU General Public License Version 2, June 1991
- * (the "License").  You may use, redistribute and/or modify this File in
- * accordance with the terms and conditions of the License, a copy of which
- * is available on the worldwide web at
- * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
- *
- * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
- * ARE EXPRESSLY DISCLAIMED.  The License provides additional details about
- * this warranty disclaimer.
- */
-
-#ifndef _NFCMRVL_PTF_H_
-#define _NFCMRVL_PTF_H_
-
-struct nfcmrvl_platform_data {
-	/*
-	 * Generic
-	 */
-
-	/* GPIO that is wired to RESET_N signal */
-	int reset_n_io;
-	/* Tell if transport is muxed in HCI one */
-	unsigned int hci_muxed;
-
-	/*
-	 * UART specific
-	 */
-
-	/* Tell if UART needs flow control at init */
-	unsigned int flow_control;
-	/* Tell if firmware supports break control for power management */
-	unsigned int break_control;
-
-
-	/*
-	 * I2C specific
-	 */
-
-	unsigned int irq;
-	unsigned int irq_polarity;
-};
-
-#endif /* _NFCMRVL_PTF_H_ */
-- 
cgit v1.2.3


From bd7fc6e1957c2102866f9e464c1f2302e891b7e9 Mon Sep 17 00:00:00 2001
From: Shradha Gupta <shradhagupta@linux.microsoft.com>
Date: Wed, 15 Mar 2023 04:55:13 -0700
Subject: net: mana: Add new MANA VF performance counters for easier
 troubleshooting

Extend the performance counter stats in the 'ethtool -S <interface>' output
for MANA VF to facilitate troubleshooting.

Tested-on: Ubuntu22
Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c      | 62 ++++++++++++++++++++--
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 52 +++++++++++++++++-
 include/net/mana/mana.h                            | 18 +++++++
 3 files changed, 128 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 6120f2b6684f..492474b4d8aa 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -156,6 +156,7 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	struct mana_txq *txq;
 	struct mana_cq *cq;
 	int err, len;
+	u16 ihs;
 
 	if (unlikely(!apc->port_is_up))
 		goto tx_drop;
@@ -166,6 +167,7 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	txq = &apc->tx_qp[txq_idx].txq;
 	gdma_sq = txq->gdma_sq;
 	cq = &apc->tx_qp[txq_idx].tx_cq;
+	tx_stats = &txq->stats;
 
 	pkg.tx_oob.s_oob.vcq_num = cq->gdma_id;
 	pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame;
@@ -179,10 +181,17 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 
 	pkg.tx_oob.s_oob.pkt_fmt = pkt_fmt;
 
-	if (pkt_fmt == MANA_SHORT_PKT_FMT)
+	if (pkt_fmt == MANA_SHORT_PKT_FMT) {
 		pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_short_oob);
-	else
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->short_pkt_fmt++;
+		u64_stats_update_end(&tx_stats->syncp);
+	} else {
 		pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_oob);
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->long_pkt_fmt++;
+		u64_stats_update_end(&tx_stats->syncp);
+	}
 
 	pkg.wqe_req.inline_oob_data = &pkg.tx_oob;
 	pkg.wqe_req.flags = 0;
@@ -232,9 +241,35 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 						 &ipv6_hdr(skb)->daddr, 0,
 						 IPPROTO_TCP, 0);
 		}
+
+		if (skb->encapsulation) {
+			ihs = skb_inner_tcp_all_headers(skb);
+			u64_stats_update_begin(&tx_stats->syncp);
+			tx_stats->tso_inner_packets++;
+			tx_stats->tso_inner_bytes += skb->len - ihs;
+			u64_stats_update_end(&tx_stats->syncp);
+		} else {
+			if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
+				ihs = skb_transport_offset(skb) + sizeof(struct udphdr);
+			} else {
+				ihs = skb_tcp_all_headers(skb);
+				if (ipv6_has_hopopt_jumbo(skb))
+					ihs -= sizeof(struct hop_jumbo_hdr);
+			}
+
+			u64_stats_update_begin(&tx_stats->syncp);
+			tx_stats->tso_packets++;
+			tx_stats->tso_bytes += skb->len - ihs;
+			u64_stats_update_end(&tx_stats->syncp);
+		}
+
 	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 		csum_type = mana_checksum_info(skb);
 
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->csum_partial++;
+		u64_stats_update_end(&tx_stats->syncp);
+
 		if (csum_type == IPPROTO_TCP) {
 			pkg.tx_oob.s_oob.is_outer_ipv4 = ipv4;
 			pkg.tx_oob.s_oob.is_outer_ipv6 = ipv6;
@@ -254,8 +289,12 @@ netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 		}
 	}
 
-	if (mana_map_skb(skb, apc, &pkg))
+	if (mana_map_skb(skb, apc, &pkg)) {
+		u64_stats_update_begin(&tx_stats->syncp);
+		tx_stats->mana_map_err++;
+		u64_stats_update_end(&tx_stats->syncp);
 		goto free_sgl_ptr;
+	}
 
 	skb_queue_tail(&txq->pending_skbs, skb);
 
@@ -1038,6 +1077,8 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 	if (comp_read < 1)
 		return;
 
+	apc->eth_stats.tx_cqes = comp_read;
+
 	for (i = 0; i < comp_read; i++) {
 		struct mana_tx_comp_oob *cqe_oob;
 
@@ -1064,6 +1105,7 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 		case CQE_TX_VLAN_TAGGING_VIOLATION:
 			WARN_ONCE(1, "TX: CQE error %d: ignored.\n",
 				  cqe_oob->cqe_hdr.cqe_type);
+			apc->eth_stats.tx_cqe_err++;
 			break;
 
 		default:
@@ -1072,6 +1114,7 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 			 */
 			WARN_ONCE(1, "TX: Unexpected CQE type %d: HW BUG?\n",
 				  cqe_oob->cqe_hdr.cqe_type);
+			apc->eth_stats.tx_cqe_unknown_type++;
 			return;
 		}
 
@@ -1118,6 +1161,8 @@ static void mana_poll_tx_cq(struct mana_cq *cq)
 		WARN_ON_ONCE(1);
 
 	cq->work_done = pkt_transmitted;
+
+	apc->eth_stats.tx_cqes -= pkt_transmitted;
 }
 
 static void mana_post_pkt_rxq(struct mana_rxq *rxq)
@@ -1252,12 +1297,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	struct gdma_context *gc = rxq->gdma_rq->gdma_dev->gdma_context;
 	struct net_device *ndev = rxq->ndev;
 	struct mana_recv_buf_oob *rxbuf_oob;
+	struct mana_port_context *apc;
 	struct device *dev = gc->dev;
 	void *new_buf, *old_buf;
 	struct page *new_page;
 	u32 curr, pktlen;
 	dma_addr_t da;
 
+	apc = netdev_priv(ndev);
+
 	switch (oob->cqe_hdr.cqe_type) {
 	case CQE_RX_OKAY:
 		break;
@@ -1270,6 +1318,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 
 	case CQE_RX_COALESCED_4:
 		netdev_err(ndev, "RX coalescing is unsupported\n");
+		apc->eth_stats.rx_coalesced_err++;
 		return;
 
 	case CQE_RX_OBJECT_FENCE:
@@ -1279,6 +1328,7 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	default:
 		netdev_err(ndev, "Unknown RX CQE type = %d\n",
 			   oob->cqe_hdr.cqe_type);
+		apc->eth_stats.rx_cqe_unknown_type++;
 		return;
 	}
 
@@ -1341,11 +1391,15 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
 {
 	struct gdma_comp *comp = cq->gdma_comp_buf;
 	struct mana_rxq *rxq = cq->rxq;
+	struct mana_port_context *apc;
 	int comp_read, i;
 
+	apc = netdev_priv(rxq->ndev);
+
 	comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER);
 	WARN_ON_ONCE(comp_read > CQE_POLLING_BUFFER);
 
+	apc->eth_stats.rx_cqes = comp_read;
 	rxq->xdp_flush = false;
 
 	for (i = 0; i < comp_read; i++) {
@@ -1357,6 +1411,8 @@ static void mana_poll_rx_cq(struct mana_cq *cq)
 			return;
 
 		mana_process_rx_cqe(rxq, cq, &comp[i]);
+
+		apc->eth_stats.rx_cqes--;
 	}
 
 	if (rxq->xdp_flush)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 5b776a33a817..a64c81410dc1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -13,6 +13,15 @@ static const struct {
 } mana_eth_stats[] = {
 	{"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)},
 	{"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)},
+	{"tx_cqes", offsetof(struct mana_ethtool_stats, tx_cqes)},
+	{"tx_cq_err", offsetof(struct mana_ethtool_stats, tx_cqe_err)},
+	{"tx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
+					tx_cqe_unknown_type)},
+	{"rx_cqes", offsetof(struct mana_ethtool_stats, rx_cqes)},
+	{"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
+					rx_coalesced_err)},
+	{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
+					rx_cqe_unknown_type)},
 };
 
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
@@ -23,7 +32,8 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	if (stringset != ETH_SS_STATS)
 		return -EINVAL;
 
-	return ARRAY_SIZE(mana_eth_stats) + num_queues * 8;
+	return ARRAY_SIZE(mana_eth_stats) + num_queues *
+				(MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
 }
 
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
@@ -61,6 +71,22 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 		p += ETH_GSTRING_LEN;
 		sprintf(p, "tx_%d_xdp_xmit", i);
 		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_packets", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_bytes", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_inner_packets", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_tso_inner_bytes", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_long_pkt_fmt", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_short_pkt_fmt", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_csum_partial", i);
+		p += ETH_GSTRING_LEN;
+		sprintf(p, "tx_%d_mana_map_err", i);
+		p += ETH_GSTRING_LEN;
 	}
 }
 
@@ -78,6 +104,14 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	u64 xdp_xmit;
 	u64 xdp_drop;
 	u64 xdp_tx;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 long_pkt_fmt;
+	u64 short_pkt_fmt;
+	u64 csum_partial;
+	u64 mana_map_err;
 	int q, i = 0;
 
 	if (!apc->port_is_up)
@@ -113,11 +147,27 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 			packets = tx_stats->packets;
 			bytes = tx_stats->bytes;
 			xdp_xmit = tx_stats->xdp_xmit;
+			tso_packets = tx_stats->tso_packets;
+			tso_bytes = tx_stats->tso_bytes;
+			tso_inner_packets = tx_stats->tso_inner_packets;
+			tso_inner_bytes = tx_stats->tso_inner_bytes;
+			long_pkt_fmt = tx_stats->long_pkt_fmt;
+			short_pkt_fmt = tx_stats->short_pkt_fmt;
+			csum_partial = tx_stats->csum_partial;
+			mana_map_err = tx_stats->mana_map_err;
 		} while (u64_stats_fetch_retry(&tx_stats->syncp, start));
 
 		data[i++] = packets;
 		data[i++] = bytes;
 		data[i++] = xdp_xmit;
+		data[i++] = tso_packets;
+		data[i++] = tso_bytes;
+		data[i++] = tso_inner_packets;
+		data[i++] = tso_inner_bytes;
+		data[i++] = long_pkt_fmt;
+		data[i++] = short_pkt_fmt;
+		data[i++] = csum_partial;
+		data[i++] = mana_map_err;
 	}
 }
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 3bb579962a14..bb11a6535d80 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -48,6 +48,10 @@ enum TRI_STATE {
 
 #define MAX_PORTS_IN_MANA_DEV 256
 
+/* Update this count whenever the respective structures are changed */
+#define MANA_STATS_RX_COUNT 5
+#define MANA_STATS_TX_COUNT 11
+
 struct mana_stats_rx {
 	u64 packets;
 	u64 bytes;
@@ -61,6 +65,14 @@ struct mana_stats_tx {
 	u64 packets;
 	u64 bytes;
 	u64 xdp_xmit;
+	u64 tso_packets;
+	u64 tso_bytes;
+	u64 tso_inner_packets;
+	u64 tso_inner_bytes;
+	u64 short_pkt_fmt;
+	u64 long_pkt_fmt;
+	u64 csum_partial;
+	u64 mana_map_err;
 	struct u64_stats_sync syncp;
 };
 
@@ -331,6 +343,12 @@ struct mana_tx_qp {
 struct mana_ethtool_stats {
 	u64 stop_queue;
 	u64 wake_queue;
+	u64 tx_cqes;
+	u64 tx_cqe_err;
+	u64 tx_cqe_unknown_type;
+	u64 rx_cqes;
+	u64 rx_coalesced_err;
+	u64 rx_cqe_unknown_type;
 };
 
 struct mana_context {
-- 
cgit v1.2.3


From 8c44fa12c8fa09c6c12f0dc25129a6d13ee0a1ea Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 15 Mar 2023 15:11:45 +0200
Subject: net: Add MDB net device operations

Add MDB net device operations that will be invoked by rtnetlink code in
response to received RTM_{NEW,DEL,GET}MDB messages. Subsequent patches
will implement these operations in the bridge and VXLAN drivers.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ee483071cf59..23b0d7eaaadd 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1307,6 +1307,17 @@ struct netdev_net_notifier {
  *	Used to add FDB entries to dump requests. Implementers should add
  *	entries to skb and update idx with the number of entries.
  *
+ * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[],
+ *		      u16 nlmsg_flags, struct netlink_ext_ack *extack);
+ *	Adds an MDB entry to dev.
+ * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
+ *		      struct netlink_ext_ack *extack);
+ *	Deletes the MDB entry from dev.
+ * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
+ *		       struct netlink_callback *cb);
+ *	Dumps MDB entries from dev. The first argument (marker) in the netlink
+ *	callback is used by core rtnetlink code.
+ *
  * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
  *			     u16 flags, struct netlink_ext_ack *extack)
  * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
@@ -1569,6 +1580,16 @@ struct net_device_ops {
 					       const unsigned char *addr,
 					       u16 vid, u32 portid, u32 seq,
 					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_add)(struct net_device *dev,
+					       struct nlattr *tb[],
+					       u16 nlmsg_flags,
+					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_del)(struct net_device *dev,
+					       struct nlattr *tb[],
+					       struct netlink_ext_ack *extack);
+	int			(*ndo_mdb_dump)(struct net_device *dev,
+						struct sk_buff *skb,
+						struct netlink_callback *cb);
 	int			(*ndo_bridge_setlink)(struct net_device *dev,
 						      struct nlmsghdr *nlh,
 						      u16 flags,
-- 
cgit v1.2.3


From a3a48de5eade770e911d35291217bdd69ce04ef1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 15 Mar 2023 15:11:51 +0200
Subject: vxlan: mdb: Add MDB control path support

Implement MDB control path support, enabling the creation, deletion,
replacement and dumping of MDB entries in a similar fashion to the
bridge driver. Unlike the bridge driver, each entry stores a list of
remote VTEPs to which matched packets need to be replicated, and not a
list of bridge ports.

The motivating use case is the installation of MDB entries by a user
space control plane in response to received EVPN routes. As such, only
allow permanent MDB entries to be installed and do not implement
snooping functionality, avoiding a lot of unnecessary complexity.

Since entries can only be modified by user space under RTNL, use RTNL as
the write lock. Use RCU to ensure that MDB entries and remotes are not
freed while being accessed from the data path during transmission.

In terms of uAPI, reuse the existing MDB netlink interface, but add a
few new attributes to request and response messages:

* IP address of the destination VXLAN tunnel endpoint where the
  multicast receivers reside.

* UDP destination port number to use to connect to the remote VXLAN
  tunnel endpoint.

* VXLAN VNI Network Identifier to use to connect to the remote VXLAN
  tunnel endpoint. Required when Ingress Replication (IR) is used and
  the remote VTEP is not a member of the originating broadcast domain
  (VLAN/VNI) [1].

* Source VNI Network Identifier the MDB entry belongs to. Used only when
  the VXLAN device is in external mode.

* Interface index of the outgoing interface to reach the remote VXLAN
  tunnel endpoint. This is required when the underlay destination IP is
  multicast (P2MP), as the multicast routing tables are not consulted.

All the new attributes are added under the 'MDBA_SET_ENTRY_ATTRS' nest
which is strictly validated by the bridge driver, thereby automatically
rejecting the new attributes.

[1] https://datatracker.ietf.org/doc/html/draft-ietf-bess-evpn-irb-mcast#section-3.2.2

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan/Makefile        |    2 +-
 drivers/net/vxlan/vxlan_core.c    |    8 +
 drivers/net/vxlan/vxlan_mdb.c     | 1341 +++++++++++++++++++++++++++++++++++++
 drivers/net/vxlan/vxlan_private.h |   31 +
 include/net/vxlan.h               |    5 +
 include/uapi/linux/if_bridge.h    |   10 +
 6 files changed, 1396 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/vxlan/vxlan_mdb.c

(limited to 'include')

diff --git a/drivers/net/vxlan/Makefile b/drivers/net/vxlan/Makefile
index d4c255499b72..91b8fec8b6cf 100644
--- a/drivers/net/vxlan/Makefile
+++ b/drivers/net/vxlan/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_VXLAN) += vxlan.o
 
-vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o
+vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o vxlan_mdb.o
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 5de1a20497a6..a8b26d4f76de 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2878,8 +2878,14 @@ static int vxlan_init(struct net_device *dev)
 	if (err)
 		goto err_free_percpu;
 
+	err = vxlan_mdb_init(vxlan);
+	if (err)
+		goto err_gro_cells_destroy;
+
 	return 0;
 
+err_gro_cells_destroy:
+	gro_cells_destroy(&vxlan->gro_cells);
 err_free_percpu:
 	free_percpu(dev->tstats);
 err_vnigroup_uninit:
@@ -2904,6 +2910,8 @@ static void vxlan_uninit(struct net_device *dev)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 
+	vxlan_mdb_fini(vxlan);
+
 	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
 		vxlan_vnigroup_uninit(vxlan);
 
diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
new file mode 100644
index 000000000000..129692b3663f
--- /dev/null
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -0,0 +1,1341 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/if_bridge.h>
+#include <linux/in.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/vxlan.h>
+
+#include "vxlan_private.h"
+
+struct vxlan_mdb_entry_key {
+	union vxlan_addr src;
+	union vxlan_addr dst;
+	__be32 vni;
+};
+
+struct vxlan_mdb_entry {
+	struct rhash_head rhnode;
+	struct list_head remotes;
+	struct vxlan_mdb_entry_key key;
+	struct hlist_node mdb_node;
+	struct rcu_head rcu;
+};
+
+#define VXLAN_MDB_REMOTE_F_BLOCKED	BIT(0)
+
+struct vxlan_mdb_remote {
+	struct list_head list;
+	struct vxlan_rdst __rcu *rd;
+	u8 flags;
+	u8 filter_mode;
+	u8 rt_protocol;
+	struct hlist_head src_list;
+	struct rcu_head rcu;
+};
+
+#define VXLAN_SGRP_F_DELETE	BIT(0)
+
+struct vxlan_mdb_src_entry {
+	struct hlist_node node;
+	union vxlan_addr addr;
+	u8 flags;
+};
+
+struct vxlan_mdb_dump_ctx {
+	long reserved;
+	long entry_idx;
+	long remote_idx;
+};
+
+struct vxlan_mdb_config_src_entry {
+	union vxlan_addr addr;
+	struct list_head node;
+};
+
+struct vxlan_mdb_config {
+	struct vxlan_dev *vxlan;
+	struct vxlan_mdb_entry_key group;
+	struct list_head src_list;
+	union vxlan_addr remote_ip;
+	u32 remote_ifindex;
+	__be32 remote_vni;
+	__be16 remote_port;
+	u16 nlflags;
+	u8 flags;
+	u8 filter_mode;
+	u8 rt_protocol;
+};
+
+static const struct rhashtable_params vxlan_mdb_rht_params = {
+	.head_offset = offsetof(struct vxlan_mdb_entry, rhnode),
+	.key_offset = offsetof(struct vxlan_mdb_entry, key),
+	.key_len = sizeof(struct vxlan_mdb_entry_key),
+	.automatic_shrinking = true,
+};
+
+static int __vxlan_mdb_add(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack);
+static int __vxlan_mdb_del(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack);
+
+static void vxlan_br_mdb_entry_fill(const struct vxlan_dev *vxlan,
+				    const struct vxlan_mdb_entry *mdb_entry,
+				    const struct vxlan_mdb_remote *remote,
+				    struct br_mdb_entry *e)
+{
+	const union vxlan_addr *dst = &mdb_entry->key.dst;
+
+	memset(e, 0, sizeof(*e));
+	e->ifindex = vxlan->dev->ifindex;
+	e->state = MDB_PERMANENT;
+
+	if (remote->flags & VXLAN_MDB_REMOTE_F_BLOCKED)
+		e->flags |= MDB_FLAGS_BLOCKED;
+
+	switch (dst->sa.sa_family) {
+	case AF_INET:
+		e->addr.u.ip4 = dst->sin.sin_addr.s_addr;
+		e->addr.proto = htons(ETH_P_IP);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		e->addr.u.ip6 = dst->sin6.sin6_addr;
+		e->addr.proto = htons(ETH_P_IPV6);
+		break;
+#endif
+	}
+}
+
+static int vxlan_mdb_entry_info_fill_srcs(struct sk_buff *skb,
+					  const struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_mdb_src_entry *ent;
+	struct nlattr *nest;
+
+	if (hlist_empty(&remote->src_list))
+		return 0;
+
+	nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST);
+	if (!nest)
+		return -EMSGSIZE;
+
+	hlist_for_each_entry(ent, &remote->src_list, node) {
+		struct nlattr *nest_ent;
+
+		nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY);
+		if (!nest_ent)
+			goto out_cancel_err;
+
+		if (vxlan_nla_put_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+				       &ent->addr) ||
+		    nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, 0))
+			goto out_cancel_err;
+
+		nla_nest_end(skb, nest_ent);
+	}
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_cancel_err:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int vxlan_mdb_entry_info_fill(const struct vxlan_dev *vxlan,
+				     struct sk_buff *skb,
+				     const struct vxlan_mdb_entry *mdb_entry,
+				     const struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+	struct br_mdb_entry e;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY_INFO);
+	if (!nest)
+		return -EMSGSIZE;
+
+	vxlan_br_mdb_entry_fill(vxlan, mdb_entry, remote, &e);
+
+	if (nla_put_nohdr(skb, sizeof(e), &e) ||
+	    nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, 0))
+		goto nest_err;
+
+	if (!vxlan_addr_any(&mdb_entry->key.src) &&
+	    vxlan_nla_put_addr(skb, MDBA_MDB_EATTR_SOURCE, &mdb_entry->key.src))
+		goto nest_err;
+
+	if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, remote->rt_protocol) ||
+	    nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, remote->filter_mode) ||
+	    vxlan_mdb_entry_info_fill_srcs(skb, remote) ||
+	    vxlan_nla_put_addr(skb, MDBA_MDB_EATTR_DST, &rd->remote_ip))
+		goto nest_err;
+
+	if (rd->remote_port && rd->remote_port != vxlan->cfg.dst_port &&
+	    nla_put_u16(skb, MDBA_MDB_EATTR_DST_PORT,
+			be16_to_cpu(rd->remote_port)))
+		goto nest_err;
+
+	if (rd->remote_vni != vxlan->default_dst.remote_vni &&
+	    nla_put_u32(skb, MDBA_MDB_EATTR_VNI, be32_to_cpu(rd->remote_vni)))
+		goto nest_err;
+
+	if (rd->remote_ifindex &&
+	    nla_put_u32(skb, MDBA_MDB_EATTR_IFINDEX, rd->remote_ifindex))
+		goto nest_err;
+
+	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) &&
+	    mdb_entry->key.vni && nla_put_u32(skb, MDBA_MDB_EATTR_SRC_VNI,
+					      be32_to_cpu(mdb_entry->key.vni)))
+		goto nest_err;
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+nest_err:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+static int vxlan_mdb_entry_fill(const struct vxlan_dev *vxlan,
+				struct sk_buff *skb,
+				struct vxlan_mdb_dump_ctx *ctx,
+				const struct vxlan_mdb_entry *mdb_entry)
+{
+	int remote_idx = 0, s_remote_idx = ctx->remote_idx;
+	struct vxlan_mdb_remote *remote;
+	struct nlattr *nest;
+	int err = 0;
+
+	nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+	if (!nest)
+		return -EMSGSIZE;
+
+	list_for_each_entry(remote, &mdb_entry->remotes, list) {
+		if (remote_idx < s_remote_idx)
+			goto skip;
+
+		err = vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote);
+		if (err)
+			break;
+skip:
+		remote_idx++;
+	}
+
+	ctx->remote_idx = err ? remote_idx : 0;
+	nla_nest_end(skb, nest);
+	return err;
+}
+
+static int vxlan_mdb_fill(const struct vxlan_dev *vxlan, struct sk_buff *skb,
+			  struct vxlan_mdb_dump_ctx *ctx)
+{
+	int entry_idx = 0, s_entry_idx = ctx->entry_idx;
+	struct vxlan_mdb_entry *mdb_entry;
+	struct nlattr *nest;
+	int err = 0;
+
+	nest = nla_nest_start_noflag(skb, MDBA_MDB);
+	if (!nest)
+		return -EMSGSIZE;
+
+	hlist_for_each_entry(mdb_entry, &vxlan->mdb_list, mdb_node) {
+		if (entry_idx < s_entry_idx)
+			goto skip;
+
+		err = vxlan_mdb_entry_fill(vxlan, skb, ctx, mdb_entry);
+		if (err)
+			break;
+skip:
+		entry_idx++;
+	}
+
+	ctx->entry_idx = err ? entry_idx : 0;
+	nla_nest_end(skb, nest);
+	return err;
+}
+
+int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+		   struct netlink_callback *cb)
+{
+	struct vxlan_mdb_dump_ctx *ctx = (void *)cb->ctx;
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct br_port_msg *bpm;
+	struct nlmsghdr *nlh;
+	int err;
+
+	ASSERT_RTNL();
+
+	NL_ASSERT_DUMP_CTX_FITS(struct vxlan_mdb_dump_ctx);
+
+	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, RTM_NEWMDB, sizeof(*bpm),
+			NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	bpm = nlmsg_data(nlh);
+	memset(bpm, 0, sizeof(*bpm));
+	bpm->family = AF_BRIDGE;
+	bpm->ifindex = dev->ifindex;
+
+	err = vxlan_mdb_fill(vxlan, skb, ctx);
+
+	nlmsg_end(skb, nlh);
+
+	cb->seq = vxlan->mdb_seq;
+	nl_dump_check_consistent(cb, nlh);
+
+	return err;
+}
+
+static const struct nla_policy
+vxlan_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = {
+	[MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY,
+						  sizeof(struct in_addr),
+						  sizeof(struct in6_addr)),
+};
+
+static const struct nla_policy
+vxlan_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = {
+	[MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_entry_pol),
+};
+
+static struct netlink_range_validation vni_range = {
+	.max = VXLAN_N_VID - 1,
+};
+
+static const struct nla_policy vxlan_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
+	[MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+					      sizeof(struct in_addr),
+					      sizeof(struct in6_addr)),
+	[MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE,
+						  MCAST_INCLUDE),
+	[MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(vxlan_mdbe_src_list_pol),
+	[MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+	[MDBE_ATTR_DST] = NLA_POLICY_RANGE(NLA_BINARY,
+					   sizeof(struct in_addr),
+					   sizeof(struct in6_addr)),
+	[MDBE_ATTR_DST_PORT] = { .type = NLA_U16 },
+	[MDBE_ATTR_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range),
+	[MDBE_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+	[MDBE_ATTR_SRC_VNI] = NLA_POLICY_FULL_RANGE(NLA_U32, &vni_range),
+};
+
+static bool vxlan_mdb_is_valid_source(const struct nlattr *attr, __be16 proto,
+				      struct netlink_ext_ack *extack)
+{
+	switch (proto) {
+	case htons(ETH_P_IP):
+		if (nla_len(attr) != sizeof(struct in_addr)) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length");
+			return false;
+		}
+		if (ipv4_is_multicast(nla_get_in_addr(attr))) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed");
+			return false;
+		}
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case htons(ETH_P_IPV6): {
+		struct in6_addr src;
+
+		if (nla_len(attr) != sizeof(struct in6_addr)) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length");
+			return false;
+		}
+		src = nla_get_in6_addr(attr);
+		if (ipv6_addr_is_multicast(&src)) {
+			NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed");
+			return false;
+		}
+		break;
+	}
+#endif
+	default:
+		NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address");
+		return false;
+	}
+
+	return true;
+}
+
+static void vxlan_mdb_config_group_set(struct vxlan_mdb_config *cfg,
+				       const struct br_mdb_entry *entry,
+				       const struct nlattr *source_attr)
+{
+	struct vxlan_mdb_entry_key *group = &cfg->group;
+
+	switch (entry->addr.proto) {
+	case htons(ETH_P_IP):
+		group->dst.sa.sa_family = AF_INET;
+		group->dst.sin.sin_addr.s_addr = entry->addr.u.ip4;
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case htons(ETH_P_IPV6):
+		group->dst.sa.sa_family = AF_INET6;
+		group->dst.sin6.sin6_addr = entry->addr.u.ip6;
+		break;
+#endif
+	}
+
+	if (source_attr)
+		vxlan_nla_get_addr(&group->src, source_attr);
+}
+
+static bool vxlan_mdb_is_star_g(const struct vxlan_mdb_entry_key *group)
+{
+	return !vxlan_addr_any(&group->dst) && vxlan_addr_any(&group->src);
+}
+
+static bool vxlan_mdb_is_sg(const struct vxlan_mdb_entry_key *group)
+{
+	return !vxlan_addr_any(&group->dst) && !vxlan_addr_any(&group->src);
+}
+
+static int vxlan_mdb_config_src_entry_init(struct vxlan_mdb_config *cfg,
+					   __be16 proto,
+					   const struct nlattr *src_entry,
+					   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[MDBE_SRCATTR_MAX + 1];
+	struct vxlan_mdb_config_src_entry *src;
+	int err;
+
+	err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry,
+			       vxlan_mdbe_src_list_entry_pol, extack);
+	if (err)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS))
+		return -EINVAL;
+
+	if (!vxlan_mdb_is_valid_source(tb[MDBE_SRCATTR_ADDRESS], proto,
+				       extack))
+		return -EINVAL;
+
+	src = kzalloc(sizeof(*src), GFP_KERNEL);
+	if (!src)
+		return -ENOMEM;
+
+	err = vxlan_nla_get_addr(&src->addr, tb[MDBE_SRCATTR_ADDRESS]);
+	if (err)
+		goto err_free_src;
+
+	list_add_tail(&src->node, &cfg->src_list);
+
+	return 0;
+
+err_free_src:
+	kfree(src);
+	return err;
+}
+
+static void
+vxlan_mdb_config_src_entry_fini(struct vxlan_mdb_config_src_entry *src)
+{
+	list_del(&src->node);
+	kfree(src);
+}
+
+static int vxlan_mdb_config_src_list_init(struct vxlan_mdb_config *cfg,
+					  __be16 proto,
+					  const struct nlattr *src_list,
+					  struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config_src_entry *src, *tmp;
+	struct nlattr *src_entry;
+	int rem, err;
+
+	nla_for_each_nested(src_entry, src_list, rem) {
+		err = vxlan_mdb_config_src_entry_init(cfg, proto, src_entry,
+						      extack);
+		if (err)
+			goto err_src_entry_init;
+	}
+
+	return 0;
+
+err_src_entry_init:
+	list_for_each_entry_safe_reverse(src, tmp, &cfg->src_list, node)
+		vxlan_mdb_config_src_entry_fini(src);
+	return err;
+}
+
+static void vxlan_mdb_config_src_list_fini(struct vxlan_mdb_config *cfg)
+{
+	struct vxlan_mdb_config_src_entry *src, *tmp;
+
+	list_for_each_entry_safe_reverse(src, tmp, &cfg->src_list, node)
+		vxlan_mdb_config_src_entry_fini(src);
+}
+
+static int vxlan_mdb_config_attrs_init(struct vxlan_mdb_config *cfg,
+				       const struct br_mdb_entry *entry,
+				       const struct nlattr *set_attrs,
+				       struct netlink_ext_ack *extack)
+{
+	struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1];
+	int err;
+
+	err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX, set_attrs,
+			       vxlan_mdbe_attrs_pol, extack);
+	if (err)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, set_attrs, mdbe_attrs, MDBE_ATTR_DST)) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing remote destination IP address");
+		return -EINVAL;
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_SOURCE] &&
+	    !vxlan_mdb_is_valid_source(mdbe_attrs[MDBE_ATTR_SOURCE],
+				       entry->addr.proto, extack))
+		return -EINVAL;
+
+	vxlan_mdb_config_group_set(cfg, entry, mdbe_attrs[MDBE_ATTR_SOURCE]);
+
+	/* rtnetlink code only validates that IPv4 group address is
+	 * multicast.
+	 */
+	if (!vxlan_addr_is_multicast(&cfg->group.dst) &&
+	    !vxlan_addr_any(&cfg->group.dst)) {
+		NL_SET_ERR_MSG_MOD(extack, "Group address is not multicast");
+		return -EINVAL;
+	}
+
+	if (vxlan_addr_any(&cfg->group.dst) &&
+	    mdbe_attrs[MDBE_ATTR_SOURCE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Source cannot be specified for the all-zeros entry");
+		return -EINVAL;
+	}
+
+	if (vxlan_mdb_is_sg(&cfg->group))
+		cfg->filter_mode = MCAST_INCLUDE;
+
+	if (mdbe_attrs[MDBE_ATTR_GROUP_MODE]) {
+		if (!vxlan_mdb_is_star_g(&cfg->group)) {
+			NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries");
+			return -EINVAL;
+		}
+		cfg->filter_mode = nla_get_u8(mdbe_attrs[MDBE_ATTR_GROUP_MODE]);
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_SRC_LIST]) {
+		if (!vxlan_mdb_is_star_g(&cfg->group)) {
+			NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries");
+			return -EINVAL;
+		}
+		if (!mdbe_attrs[MDBE_ATTR_GROUP_MODE]) {
+			NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode");
+			return -EINVAL;
+		}
+		err = vxlan_mdb_config_src_list_init(cfg, entry->addr.proto,
+						     mdbe_attrs[MDBE_ATTR_SRC_LIST],
+						     extack);
+		if (err)
+			return err;
+	}
+
+	if (vxlan_mdb_is_star_g(&cfg->group) && list_empty(&cfg->src_list) &&
+	    cfg->filter_mode == MCAST_INCLUDE) {
+		NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list");
+		return -EINVAL;
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_RTPROT])
+		cfg->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]);
+
+	err = vxlan_nla_get_addr(&cfg->remote_ip, mdbe_attrs[MDBE_ATTR_DST]);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid remote destination address");
+		goto err_src_list_fini;
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_DST_PORT])
+		cfg->remote_port =
+			cpu_to_be16(nla_get_u16(mdbe_attrs[MDBE_ATTR_DST_PORT]));
+
+	if (mdbe_attrs[MDBE_ATTR_VNI])
+		cfg->remote_vni =
+			cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_VNI]));
+
+	if (mdbe_attrs[MDBE_ATTR_IFINDEX]) {
+		cfg->remote_ifindex =
+			nla_get_s32(mdbe_attrs[MDBE_ATTR_IFINDEX]);
+		if (!__dev_get_by_index(cfg->vxlan->net, cfg->remote_ifindex)) {
+			NL_SET_ERR_MSG_MOD(extack, "Outgoing interface not found");
+			err = -EINVAL;
+			goto err_src_list_fini;
+		}
+	}
+
+	if (mdbe_attrs[MDBE_ATTR_SRC_VNI])
+		cfg->group.vni =
+			cpu_to_be32(nla_get_u32(mdbe_attrs[MDBE_ATTR_SRC_VNI]));
+
+	return 0;
+
+err_src_list_fini:
+	vxlan_mdb_config_src_list_fini(cfg);
+	return err;
+}
+
+static int vxlan_mdb_config_init(struct vxlan_mdb_config *cfg,
+				 struct net_device *dev, struct nlattr *tb[],
+				 u16 nlmsg_flags,
+				 struct netlink_ext_ack *extack)
+{
+	struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]);
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->vxlan = vxlan;
+	cfg->group.vni = vxlan->default_dst.remote_vni;
+	INIT_LIST_HEAD(&cfg->src_list);
+	cfg->nlflags = nlmsg_flags;
+	cfg->filter_mode = MCAST_EXCLUDE;
+	cfg->rt_protocol = RTPROT_STATIC;
+	cfg->remote_vni = vxlan->default_dst.remote_vni;
+	cfg->remote_port = vxlan->cfg.dst_port;
+
+	if (entry->ifindex != dev->ifindex) {
+		NL_SET_ERR_MSG_MOD(extack, "Port net device must be the VXLAN net device");
+		return -EINVAL;
+	}
+
+	/* State is not part of the entry key and can be ignored on deletion
+	 * requests.
+	 */
+	if ((nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE)) &&
+	    entry->state != MDB_PERMANENT) {
+		NL_SET_ERR_MSG_MOD(extack, "MDB entry must be permanent");
+		return -EINVAL;
+	}
+
+	if (entry->flags) {
+		NL_SET_ERR_MSG_MOD(extack, "Invalid MDB entry flags");
+		return -EINVAL;
+	}
+
+	if (entry->vid) {
+		NL_SET_ERR_MSG_MOD(extack, "VID must not be specified");
+		return -EINVAL;
+	}
+
+	if (entry->addr.proto != htons(ETH_P_IP) &&
+	    entry->addr.proto != htons(ETH_P_IPV6)) {
+		NL_SET_ERR_MSG_MOD(extack, "Group address must be an IPv4 / IPv6 address");
+		return -EINVAL;
+	}
+
+	if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY_ATTRS)) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY_ATTRS attribute");
+		return -EINVAL;
+	}
+
+	return vxlan_mdb_config_attrs_init(cfg, entry, tb[MDBA_SET_ENTRY_ATTRS],
+					   extack);
+}
+
+static void vxlan_mdb_config_fini(struct vxlan_mdb_config *cfg)
+{
+	vxlan_mdb_config_src_list_fini(cfg);
+}
+
+static struct vxlan_mdb_entry *
+vxlan_mdb_entry_lookup(struct vxlan_dev *vxlan,
+		       const struct vxlan_mdb_entry_key *group)
+{
+	return rhashtable_lookup_fast(&vxlan->mdb_tbl, group,
+				      vxlan_mdb_rht_params);
+}
+
+static struct vxlan_mdb_remote *
+vxlan_mdb_remote_lookup(const struct vxlan_mdb_entry *mdb_entry,
+			const union vxlan_addr *addr)
+{
+	struct vxlan_mdb_remote *remote;
+
+	list_for_each_entry(remote, &mdb_entry->remotes, list) {
+		struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+
+		if (vxlan_addr_equal(addr, &rd->remote_ip))
+			return remote;
+	}
+
+	return NULL;
+}
+
+static void vxlan_mdb_rdst_free(struct rcu_head *head)
+{
+	struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);
+
+	dst_cache_destroy(&rd->dst_cache);
+	kfree(rd);
+}
+
+/* Allocate a remote destination for @remote from the parameters in @cfg and
+ * publish it in @remote->rd with rcu_assign_pointer().
+ *
+ * Any pointer previously stored in @remote->rd is overwritten without being
+ * released here; vxlan_mdb_remote_replace() saves the old pointer before
+ * calling this function.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails, or the error
+ * returned by dst_cache_init(), in which case the destination is freed.
+ */
+static int vxlan_mdb_remote_rdst_init(const struct vxlan_mdb_config *cfg,
+				      struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_rdst *rd;
+	int err;
+
+	rd = kzalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return -ENOMEM;
+
+	err = dst_cache_init(&rd->dst_cache, GFP_KERNEL);
+	if (err)
+		goto err_free_rdst;
+
+	/* Fill in all fields before the pointer is published below. */
+	rd->remote_ip = cfg->remote_ip;
+	rd->remote_port = cfg->remote_port;
+	rd->remote_vni = cfg->remote_vni;
+	rd->remote_ifindex = cfg->remote_ifindex;
+	rcu_assign_pointer(remote->rd, rd);
+
+	return 0;
+
+err_free_rdst:
+	kfree(rd);
+	return err;
+}
+
+/* Schedule @rd to be released by vxlan_mdb_rdst_free() through call_rcu(),
+ * rather than freeing it immediately.
+ */
+static void vxlan_mdb_remote_rdst_fini(struct vxlan_rdst *rd)
+{
+	call_rcu(&rd->rcu, vxlan_mdb_rdst_free);
+}
+
+/* Initialize a newly allocated @remote from @cfg: create its remote
+ * destination, copy the flags, filter mode and routing protocol, and start
+ * with an empty source list.
+ *
+ * Returns 0 on success or the error from vxlan_mdb_remote_rdst_init().
+ */
+static int vxlan_mdb_remote_init(const struct vxlan_mdb_config *cfg,
+				 struct vxlan_mdb_remote *remote)
+{
+	int err;
+
+	err = vxlan_mdb_remote_rdst_init(cfg, remote);
+	if (err)
+		return err;
+
+	remote->flags = cfg->flags;
+	remote->filter_mode = cfg->filter_mode;
+	remote->rt_protocol = cfg->rt_protocol;
+	INIT_HLIST_HEAD(&remote->src_list);
+
+	return 0;
+}
+
+/* Undo vxlan_mdb_remote_init(): schedule the release of the remote
+ * destination. The source list is expected to be empty already; a one-time
+ * warning is emitted otherwise. @vxlan is not used by this function.
+ */
+static void vxlan_mdb_remote_fini(struct vxlan_dev *vxlan,
+				  struct vxlan_mdb_remote *remote)
+{
+	WARN_ON_ONCE(!hlist_empty(&remote->src_list));
+	vxlan_mdb_remote_rdst_fini(rtnl_dereference(remote->rd));
+}
+
+/* Find the source entry of @remote whose address equals @addr.
+ * Returns the first match on @remote->src_list, or NULL if there is none.
+ */
+static struct vxlan_mdb_src_entry *
+vxlan_mdb_remote_src_entry_lookup(const struct vxlan_mdb_remote *remote,
+				  const union vxlan_addr *addr)
+{
+	struct vxlan_mdb_src_entry *ent;
+
+	hlist_for_each_entry(ent, &remote->src_list, node) {
+		if (vxlan_addr_equal(&ent->addr, addr))
+			return ent;
+	}
+
+	return NULL;
+}
+
+/* Allocate a zeroed source entry for @addr and link it at the head of
+ * @remote->src_list. No check for an existing entry with the same address is
+ * made here. Returns the new entry, or NULL if the allocation fails.
+ */
+static struct vxlan_mdb_src_entry *
+vxlan_mdb_remote_src_entry_add(struct vxlan_mdb_remote *remote,
+			       const union vxlan_addr *addr)
+{
+	struct vxlan_mdb_src_entry *ent;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return NULL;
+
+	ent->addr = *addr;
+	hlist_add_head(&ent->node, &remote->src_list);
+
+	return ent;
+}
+
+/* Unlink @ent from the source list it is on and free it with kfree(). */
+static void
+vxlan_mdb_remote_src_entry_del(struct vxlan_mdb_src_entry *ent)
+{
+	hlist_del(&ent->node);
+	kfree(ent);
+}
+
+/* Install the (source, group) forwarding entry that backs the source @addr
+ * of the configuration @cfg.
+ *
+ * A temporary configuration is built with @addr as the group source and the
+ * group destination, group VNI, remote parameters, netlink flags and routing
+ * protocol copied from @cfg. Its filter mode is always MCAST_INCLUDE, and it
+ * is flagged VXLAN_MDB_REMOTE_F_BLOCKED when @cfg uses MCAST_EXCLUDE.
+ *
+ * Returns the result of __vxlan_mdb_add() for that configuration.
+ */
+static int
+vxlan_mdb_remote_src_fwd_add(const struct vxlan_mdb_config *cfg,
+			     const union vxlan_addr *addr,
+			     struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config sg_cfg;
+
+	memset(&sg_cfg, 0, sizeof(sg_cfg));
+	sg_cfg.vxlan = cfg->vxlan;
+	sg_cfg.group.src = *addr;
+	sg_cfg.group.dst = cfg->group.dst;
+	sg_cfg.group.vni = cfg->group.vni;
+	/* The derived entry carries no source list of its own. */
+	INIT_LIST_HEAD(&sg_cfg.src_list);
+	sg_cfg.remote_ip = cfg->remote_ip;
+	sg_cfg.remote_ifindex = cfg->remote_ifindex;
+	sg_cfg.remote_vni = cfg->remote_vni;
+	sg_cfg.remote_port = cfg->remote_port;
+	sg_cfg.nlflags = cfg->nlflags;
+	sg_cfg.filter_mode = MCAST_INCLUDE;
+	/* sg_cfg was zeroed above, so flags stay 0 for any other filter mode. */
+	if (cfg->filter_mode == MCAST_EXCLUDE)
+		sg_cfg.flags = VXLAN_MDB_REMOTE_F_BLOCKED;
+	sg_cfg.rt_protocol = cfg->rt_protocol;
+
+	return __vxlan_mdb_add(&sg_cfg, extack);
+}
+
+/* Remove the (source, group) forwarding entry for source @addr of @group
+ * towards the destination IP of @remote.
+ *
+ * Only the fields that __vxlan_mdb_del() reads (device, group key and remote
+ * IP) are set in the temporary configuration. No extack is passed and the
+ * return value of __vxlan_mdb_del() is ignored.
+ */
+static void
+vxlan_mdb_remote_src_fwd_del(struct vxlan_dev *vxlan,
+			     const struct vxlan_mdb_entry_key *group,
+			     const struct vxlan_mdb_remote *remote,
+			     const union vxlan_addr *addr)
+{
+	struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+	struct vxlan_mdb_config sg_cfg;
+
+	memset(&sg_cfg, 0, sizeof(sg_cfg));
+	sg_cfg.vxlan = vxlan;
+	sg_cfg.group.src = *addr;
+	sg_cfg.group.dst = group->dst;
+	sg_cfg.group.vni = group->vni;
+	INIT_LIST_HEAD(&sg_cfg.src_list);
+	sg_cfg.remote_ip = rd->remote_ip;
+
+	__vxlan_mdb_del(&sg_cfg, NULL);
+}
+
+/* Add the configured source @src to @remote and install its (source, group)
+ * forwarding entry.
+ *
+ * If the source is not yet on @remote->src_list a new entry is created. If
+ * it already exists, the request must carry NLM_F_REPLACE; otherwise -EEXIST
+ * is returned and an extack message is set.
+ *
+ * Returns 0 on success, -ENOMEM if the source entry cannot be allocated,
+ * -EEXIST as described above, or the error from
+ * vxlan_mdb_remote_src_fwd_add().
+ */
+static int
+vxlan_mdb_remote_src_add(const struct vxlan_mdb_config *cfg,
+			 struct vxlan_mdb_remote *remote,
+			 const struct vxlan_mdb_config_src_entry *src,
+			 struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_src_entry *ent;
+	int err;
+
+	ent = vxlan_mdb_remote_src_entry_lookup(remote, &src->addr);
+	if (!ent) {
+		ent = vxlan_mdb_remote_src_entry_add(remote, &src->addr);
+		if (!ent)
+			return -ENOMEM;
+	} else if (!(cfg->nlflags & NLM_F_REPLACE)) {
+		NL_SET_ERR_MSG_MOD(extack, "Source entry already exists");
+		return -EEXIST;
+	}
+
+	err = vxlan_mdb_remote_src_fwd_add(cfg, &ent->addr, extack);
+	if (err)
+		goto err_src_del;
+
+	/* Clear flags in case source entry was marked for deletion as part of
+	 * replace flow.
+	 */
+	ent->flags = 0;
+
+	return 0;
+
+err_src_del:
+	/* NOTE(review): this also removes an entry that existed before the
+	 * call (replace flow), not only one created above - confirm intended.
+	 */
+	vxlan_mdb_remote_src_entry_del(ent);
+	return err;
+}
+
+/* Delete source entry @ent of @remote: first remove its (source, group)
+ * forwarding entry, which still reads @ent->addr, then unlink and free @ent.
+ */
+static void vxlan_mdb_remote_src_del(struct vxlan_dev *vxlan,
+				     const struct vxlan_mdb_entry_key *group,
+				     const struct vxlan_mdb_remote *remote,
+				     struct vxlan_mdb_src_entry *ent)
+{
+	vxlan_mdb_remote_src_fwd_del(vxlan, group, remote, &ent->addr);
+	vxlan_mdb_remote_src_entry_del(ent);
+}
+
+/* Add every source listed in @cfg->src_list to @remote.
+ *
+ * Returns 0 on success. On the first failure, every source entry currently
+ * on @remote->src_list is deleted and the error is returned.
+ */
+static int vxlan_mdb_remote_srcs_add(const struct vxlan_mdb_config *cfg,
+				     struct vxlan_mdb_remote *remote,
+				     struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config_src_entry *src;
+	struct vxlan_mdb_src_entry *ent;
+	struct hlist_node *tmp;
+	int err;
+
+	list_for_each_entry(src, &cfg->src_list, node) {
+		err = vxlan_mdb_remote_src_add(cfg, remote, src, extack);
+		if (err)
+			goto err_src_del;
+	}
+
+	return 0;
+
+err_src_del:
+	/* NOTE(review): the rollback walks the whole source list of the
+	 * remote, so sources that existed before this call are removed too.
+	 */
+	hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node)
+		vxlan_mdb_remote_src_del(cfg->vxlan, &cfg->group, remote, ent);
+	return err;
+}
+
+/* Delete every source entry of @remote, together with the matching
+ * (source, group) forwarding entries. The _safe iterator is needed because
+ * each entry is freed while walking the list.
+ */
+static void vxlan_mdb_remote_srcs_del(struct vxlan_dev *vxlan,
+				      const struct vxlan_mdb_entry_key *group,
+				      struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_mdb_src_entry *ent;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node)
+		vxlan_mdb_remote_src_del(vxlan, group, remote, ent);
+}
+
+/* Compute the netlink space needed for the source list of @remote.
+ *
+ * Returns 0 when the remote has no sources. Otherwise the result is the size
+ * of the enclosing nest plus, for each source, one nested entry holding an
+ * address attribute and a timer attribute. The address attribute is sized
+ * from the group destination address (@group->dst), not from the individual
+ * source address, so every source contributes the same amount.
+ */
+static size_t
+vxlan_mdb_nlmsg_src_list_size(const struct vxlan_mdb_entry_key *group,
+			      const struct vxlan_mdb_remote *remote)
+{
+	struct vxlan_mdb_src_entry *ent;
+	size_t nlmsg_size;
+
+	if (hlist_empty(&remote->src_list))
+		return 0;
+
+	/* MDBA_MDB_EATTR_SRC_LIST */
+	nlmsg_size = nla_total_size(0);
+
+	hlist_for_each_entry(ent, &remote->src_list, node) {
+			      /* MDBA_MDB_SRCLIST_ENTRY */
+		nlmsg_size += nla_total_size(0) +
+			      /* MDBA_MDB_SRCATTR_ADDRESS */
+			      nla_total_size(vxlan_addr_size(&group->dst)) +
+			      /* MDBA_MDB_SRCATTR_TIMER */
+			      nla_total_size(sizeof(u8));
+	}
+
+	return nlmsg_size;
+}
+
+/* Compute the size of the netlink message that vxlan_mdb_remote_notify()
+ * allocates for @remote of @mdb_entry.
+ *
+ * The header, the two nests, the entry info, the timer, the routing
+ * protocol, the group mode and the destination address are always accounted
+ * for. The remaining attributes are added only when their condition below
+ * holds.
+ */
+static size_t vxlan_mdb_nlmsg_size(const struct vxlan_dev *vxlan,
+				   const struct vxlan_mdb_entry *mdb_entry,
+				   const struct vxlan_mdb_remote *remote)
+{
+	const struct vxlan_mdb_entry_key *group = &mdb_entry->key;
+	struct vxlan_rdst *rd = rtnl_dereference(remote->rd);
+	size_t nlmsg_size;
+
+	nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+		     /* MDBA_MDB */
+		     nla_total_size(0) +
+		     /* MDBA_MDB_ENTRY */
+		     nla_total_size(0) +
+		     /* MDBA_MDB_ENTRY_INFO */
+		     nla_total_size(sizeof(struct br_mdb_entry)) +
+		     /* MDBA_MDB_EATTR_TIMER */
+		     nla_total_size(sizeof(u32));
+	/* MDBA_MDB_EATTR_SOURCE */
+	if (vxlan_mdb_is_sg(group))
+		nlmsg_size += nla_total_size(vxlan_addr_size(&group->dst));
+	/* MDBA_MDB_EATTR_RTPROT */
+	nlmsg_size += nla_total_size(sizeof(u8));
+	/* MDBA_MDB_EATTR_SRC_LIST */
+	nlmsg_size += vxlan_mdb_nlmsg_src_list_size(group, remote);
+	/* MDBA_MDB_EATTR_GROUP_MODE */
+	nlmsg_size += nla_total_size(sizeof(u8));
+	/* MDBA_MDB_EATTR_DST */
+	nlmsg_size += nla_total_size(vxlan_addr_size(&rd->remote_ip));
+	/* MDBA_MDB_EATTR_DST_PORT */
+	if (rd->remote_port && rd->remote_port != vxlan->cfg.dst_port)
+		nlmsg_size += nla_total_size(sizeof(u16));
+	/* MDBA_MDB_EATTR_VNI */
+	if (rd->remote_vni != vxlan->default_dst.remote_vni)
+		nlmsg_size += nla_total_size(sizeof(u32));
+	/* MDBA_MDB_EATTR_IFINDEX */
+	if (rd->remote_ifindex)
+		nlmsg_size += nla_total_size(sizeof(u32));
+	/* MDBA_MDB_EATTR_SRC_VNI */
+	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && group->vni)
+		nlmsg_size += nla_total_size(sizeof(u32));
+
+	return nlmsg_size;
+}
+
+/* Build a netlink message of type @type in @skb describing @remote of
+ * @mdb_entry.
+ *
+ * The message starts with a br_port_msg header (family AF_BRIDGE, ifindex of
+ * the VXLAN device) followed by an MDBA_MDB nest containing an
+ * MDBA_MDB_ENTRY nest that is filled by vxlan_mdb_entry_info_fill().
+ *
+ * Returns 0 on success. Returns -EMSGSIZE if the header cannot be added, or
+ * after cancelling the partially built message when a nest cannot be opened
+ * or the entry info cannot be filled.
+ */
+static int vxlan_mdb_nlmsg_fill(const struct vxlan_dev *vxlan,
+				struct sk_buff *skb,
+				const struct vxlan_mdb_entry *mdb_entry,
+				const struct vxlan_mdb_remote *remote,
+				int type)
+{
+	struct nlattr *mdb_nest, *mdb_entry_nest;
+	struct br_port_msg *bpm;
+	struct nlmsghdr *nlh;
+
+	nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	bpm = nlmsg_data(nlh);
+	memset(bpm, 0, sizeof(*bpm));
+	bpm->family  = AF_BRIDGE;
+	bpm->ifindex = vxlan->dev->ifindex;
+
+	mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB);
+	if (!mdb_nest)
+		goto cancel;
+	mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+	if (!mdb_entry_nest)
+		goto cancel;
+
+	if (vxlan_mdb_entry_info_fill(vxlan, skb, mdb_entry, remote))
+		goto cancel;
+
+	/* Close the nests innermost first. */
+	nla_nest_end(skb, mdb_entry_nest);
+	nla_nest_end(skb, mdb_nest);
+	nlmsg_end(skb, nlh);
+
+	return 0;
+
+cancel:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Send a notification of type @type about @remote of @mdb_entry to the
+ * RTNLGRP_MDB group of the device's network namespace.
+ *
+ * The skb is sized with vxlan_mdb_nlmsg_size() and allocated with
+ * GFP_KERNEL. Nothing is returned to the caller; failures are reported
+ * through rtnl_set_sk_err(): -ENOBUFS when the skb cannot be allocated,
+ * otherwise the error from vxlan_mdb_nlmsg_fill().
+ */
+static void vxlan_mdb_remote_notify(const struct vxlan_dev *vxlan,
+				    const struct vxlan_mdb_entry *mdb_entry,
+				    const struct vxlan_mdb_remote *remote,
+				    int type)
+{
+	struct net *net = dev_net(vxlan->dev);
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+
+	skb = nlmsg_new(vxlan_mdb_nlmsg_size(vxlan, mdb_entry, remote),
+			GFP_KERNEL);
+	if (!skb)
+		goto errout;
+
+	err = vxlan_mdb_nlmsg_fill(vxlan, skb, mdb_entry, remote, type);
+	if (err) {
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_KERNEL);
+	return;
+errout:
+	rtnl_set_sk_err(net, RTNLGRP_MDB, err);
+}
+
+/* Replace the source list of @remote with the sources listed in @cfg.
+ *
+ * This is a mark-and-sweep: every existing source is first flagged
+ * VXLAN_SGRP_F_DELETE, then the configured sources are added (which clears
+ * the flags of sources that are re-added, see vxlan_mdb_remote_src_add()),
+ * and finally the sources that are still flagged are deleted.
+ *
+ * Returns 0 on success or the error from vxlan_mdb_remote_srcs_add().
+ */
+static int
+vxlan_mdb_remote_srcs_replace(const struct vxlan_mdb_config *cfg,
+			      const struct vxlan_mdb_entry *mdb_entry,
+			      struct vxlan_mdb_remote *remote,
+			      struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	struct vxlan_mdb_src_entry *ent;
+	struct hlist_node *tmp;
+	int err;
+
+	hlist_for_each_entry(ent, &remote->src_list, node)
+		ent->flags |= VXLAN_SGRP_F_DELETE;
+
+	err = vxlan_mdb_remote_srcs_add(cfg, remote, extack);
+	if (err)
+		goto err_clear_delete;
+
+	hlist_for_each_entry_safe(ent, tmp, &remote->src_list, node) {
+		if (ent->flags & VXLAN_SGRP_F_DELETE)
+			vxlan_mdb_remote_src_del(vxlan, &mdb_entry->key, remote,
+						 ent);
+	}
+
+	return 0;
+
+err_clear_delete:
+	/* Unmark whatever sources are still on the list after the failed add.
+	 * NOTE(review): the error path of vxlan_mdb_remote_srcs_add() deletes
+	 * the sources of the remote, so this list may already be empty.
+	 */
+	hlist_for_each_entry(ent, &remote->src_list, node)
+		ent->flags &= ~VXLAN_SGRP_F_DELETE;
+	return err;
+}
+
+/* Replace the parameters of the existing @remote with those in @cfg.
+ *
+ * A new remote destination is created and published in @remote->rd, then the
+ * source list is replaced. On success the flags, filter mode and routing
+ * protocol are updated, an RTM_NEWMDB notification is sent and the old
+ * destination is released. On failure the old destination is put back and
+ * the new one is released instead.
+ *
+ * Returns 0 on success, or the error from vxlan_mdb_remote_rdst_init() or
+ * vxlan_mdb_remote_srcs_replace().
+ */
+static int vxlan_mdb_remote_replace(const struct vxlan_mdb_config *cfg,
+				    const struct vxlan_mdb_entry *mdb_entry,
+				    struct vxlan_mdb_remote *remote,
+				    struct netlink_ext_ack *extack)
+{
+	/* Save the current destination: rdst_init() below overwrites it. */
+	struct vxlan_rdst *new_rd, *old_rd = rtnl_dereference(remote->rd);
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	int err;
+
+	err = vxlan_mdb_remote_rdst_init(cfg, remote);
+	if (err)
+		return err;
+	new_rd = rtnl_dereference(remote->rd);
+
+	err = vxlan_mdb_remote_srcs_replace(cfg, mdb_entry, remote, extack);
+	if (err)
+		goto err_rdst_reset;
+
+	/* WRITE_ONCE() suggests these two fields are read without RTNL
+	 * elsewhere - presumably by the data path; not visible here.
+	 */
+	WRITE_ONCE(remote->flags, cfg->flags);
+	WRITE_ONCE(remote->filter_mode, cfg->filter_mode);
+	remote->rt_protocol = cfg->rt_protocol;
+	vxlan_mdb_remote_notify(vxlan, mdb_entry, remote, RTM_NEWMDB);
+
+	vxlan_mdb_remote_rdst_fini(old_rd);
+
+	return 0;
+
+err_rdst_reset:
+	rcu_assign_pointer(remote->rd, old_rd);
+	vxlan_mdb_remote_rdst_fini(new_rd);
+	return err;
+}
+
+/* Add the remote described by @cfg to @mdb_entry, or replace it if a remote
+ * with the same destination IP already exists.
+ *
+ * An existing remote is only replaced when NLM_F_REPLACE is set (-EEXIST
+ * otherwise). A new remote is only created when NLM_F_CREATE is set (-ENOENT
+ * otherwise). A new remote is linked into @mdb_entry->remotes only after its
+ * sources were added successfully, and an RTM_NEWMDB notification is then
+ * sent.
+ *
+ * Returns 0 on success, -EEXIST or -ENOENT as described above, -ENOMEM if
+ * the remote cannot be allocated, or the error from the replace,
+ * initialization or source-add step.
+ */
+static int vxlan_mdb_remote_add(const struct vxlan_mdb_config *cfg,
+				struct vxlan_mdb_entry *mdb_entry,
+				struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_remote *remote;
+	int err;
+
+	remote = vxlan_mdb_remote_lookup(mdb_entry, &cfg->remote_ip);
+	if (remote) {
+		if (!(cfg->nlflags & NLM_F_REPLACE)) {
+			NL_SET_ERR_MSG_MOD(extack, "Replace not specified and MDB remote entry already exists");
+			return -EEXIST;
+		}
+		return vxlan_mdb_remote_replace(cfg, mdb_entry, remote, extack);
+	}
+
+	if (!(cfg->nlflags & NLM_F_CREATE)) {
+		NL_SET_ERR_MSG_MOD(extack, "Create not specified and entry does not exist");
+		return -ENOENT;
+	}
+
+	remote = kzalloc(sizeof(*remote), GFP_KERNEL);
+	if (!remote)
+		return -ENOMEM;
+
+	err = vxlan_mdb_remote_init(cfg, remote);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to initialize remote MDB entry");
+		goto err_free_remote;
+	}
+
+	err = vxlan_mdb_remote_srcs_add(cfg, remote, extack);
+	if (err)
+		goto err_remote_fini;
+
+	list_add_rcu(&remote->list, &mdb_entry->remotes);
+	vxlan_mdb_remote_notify(cfg->vxlan, mdb_entry, remote, RTM_NEWMDB);
+
+	return 0;
+
+err_remote_fini:
+	vxlan_mdb_remote_fini(cfg->vxlan, remote);
+err_free_remote:
+	/* The remote was never linked into the list, so plain kfree() is used. */
+	kfree(remote);
+	return err;
+}
+
+/* Remove @remote from @mdb_entry and release it.
+ *
+ * The RTM_DELMDB notification is sent first, while the remote still has its
+ * sources and destination. The remote is then unlinked with list_del_rcu(),
+ * its sources are deleted, its destination is scheduled for release and the
+ * remote itself is freed with kfree_rcu().
+ */
+static void vxlan_mdb_remote_del(struct vxlan_dev *vxlan,
+				 struct vxlan_mdb_entry *mdb_entry,
+				 struct vxlan_mdb_remote *remote)
+{
+	vxlan_mdb_remote_notify(vxlan, mdb_entry, remote, RTM_DELMDB);
+	list_del_rcu(&remote->list);
+	vxlan_mdb_remote_srcs_del(vxlan, &mdb_entry->key, remote);
+	vxlan_mdb_remote_fini(vxlan, remote);
+	kfree_rcu(remote, rcu);
+}
+
+/* Return the MDB entry keyed by @group, creating it if it does not exist.
+ *
+ * A new entry starts with an empty remotes list and is added both to the
+ * head of @vxlan->mdb_list and to the mdb_tbl hash table. No reference count
+ * is taken; vxlan_mdb_entry_put() decides on removal from the state of the
+ * remotes list.
+ *
+ * Returns the entry, ERR_PTR(-ENOMEM) if the allocation fails, or
+ * ERR_PTR() of the error from rhashtable_lookup_insert_fast(), in which case
+ * the new entry is unlinked and freed.
+ */
+static struct vxlan_mdb_entry *
+vxlan_mdb_entry_get(struct vxlan_dev *vxlan,
+		    const struct vxlan_mdb_entry_key *group)
+{
+	struct vxlan_mdb_entry *mdb_entry;
+	int err;
+
+	mdb_entry = vxlan_mdb_entry_lookup(vxlan, group);
+	if (mdb_entry)
+		return mdb_entry;
+
+	mdb_entry = kzalloc(sizeof(*mdb_entry), GFP_KERNEL);
+	if (!mdb_entry)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&mdb_entry->remotes);
+	memcpy(&mdb_entry->key, group, sizeof(mdb_entry->key));
+	hlist_add_head(&mdb_entry->mdb_node, &vxlan->mdb_list);
+
+	err = rhashtable_lookup_insert_fast(&vxlan->mdb_tbl,
+					    &mdb_entry->rhnode,
+					    vxlan_mdb_rht_params);
+	if (err)
+		goto err_free_entry;
+
+	return mdb_entry;
+
+err_free_entry:
+	hlist_del(&mdb_entry->mdb_node);
+	kfree(mdb_entry);
+	return ERR_PTR(err);
+}
+
+/* Release @mdb_entry if it no longer has any remotes.
+ *
+ * Does nothing while @mdb_entry->remotes is non-empty. Otherwise the entry is
+ * removed from the hash table and from the device's entry list and freed
+ * with kfree_rcu().
+ */
+static void vxlan_mdb_entry_put(struct vxlan_dev *vxlan,
+				struct vxlan_mdb_entry *mdb_entry)
+{
+	if (!list_empty(&mdb_entry->remotes))
+		return;
+
+	rhashtable_remove_fast(&vxlan->mdb_tbl, &mdb_entry->rhnode,
+			       vxlan_mdb_rht_params);
+	hlist_del(&mdb_entry->mdb_node);
+	kfree_rcu(mdb_entry, rcu);
+}
+
+/* Add or replace the remote described by @cfg.
+ *
+ * Gets (or creates) the MDB entry for @cfg->group and adds the remote to it.
+ * On success the device's mdb_seq counter is incremented. On failure the
+ * entry is put again, which removes it if it has no remotes.
+ *
+ * Returns 0 on success, or the error from vxlan_mdb_entry_get() or
+ * vxlan_mdb_remote_add().
+ */
+static int __vxlan_mdb_add(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	struct vxlan_mdb_entry *mdb_entry;
+	int err;
+
+	mdb_entry = vxlan_mdb_entry_get(vxlan, &cfg->group);
+	if (IS_ERR(mdb_entry))
+		return PTR_ERR(mdb_entry);
+
+	err = vxlan_mdb_remote_add(cfg, mdb_entry, extack);
+	if (err)
+		goto err_entry_put;
+
+	vxlan->mdb_seq++;
+
+	return 0;
+
+err_entry_put:
+	vxlan_mdb_entry_put(vxlan, mdb_entry);
+	return err;
+}
+
+/* Delete the remote identified by @cfg->group and @cfg->remote_ip.
+ *
+ * After the remote is deleted the MDB entry is put, which removes it if that
+ * was its last remote, and the device's mdb_seq counter is incremented.
+ *
+ * Returns 0 on success, or -ENOENT (with an extack message) if either the
+ * MDB entry or the remote is not found. @extack may be NULL, as passed by
+ * vxlan_mdb_remote_src_fwd_del().
+ */
+static int __vxlan_mdb_del(const struct vxlan_mdb_config *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct vxlan_dev *vxlan = cfg->vxlan;
+	struct vxlan_mdb_entry *mdb_entry;
+	struct vxlan_mdb_remote *remote;
+
+	mdb_entry = vxlan_mdb_entry_lookup(vxlan, &cfg->group);
+	if (!mdb_entry) {
+		NL_SET_ERR_MSG_MOD(extack, "Did not find MDB entry");
+		return -ENOENT;
+	}
+
+	remote = vxlan_mdb_remote_lookup(mdb_entry, &cfg->remote_ip);
+	if (!remote) {
+		NL_SET_ERR_MSG_MOD(extack, "Did not find MDB remote entry");
+		return -ENOENT;
+	}
+
+	vxlan_mdb_remote_del(vxlan, mdb_entry, remote);
+	vxlan_mdb_entry_put(vxlan, mdb_entry);
+
+	vxlan->mdb_seq++;
+
+	return 0;
+}
+
+/* Entry point for adding or replacing an MDB entry on VXLAN device @dev.
+ *
+ * Must be called with RTNL held (asserted). The netlink attributes in @tb
+ * and the flags in @nlmsg_flags are turned into a configuration, which is
+ * applied with __vxlan_mdb_add() and then finalized in all cases.
+ *
+ * Returns 0 on success, or the error from vxlan_mdb_config_init() or
+ * __vxlan_mdb_add().
+ */
+int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+		  struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config cfg;
+	int err;
+
+	ASSERT_RTNL();
+
+	err = vxlan_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack);
+	if (err)
+		return err;
+
+	err = __vxlan_mdb_add(&cfg, extack);
+
+	vxlan_mdb_config_fini(&cfg);
+	return err;
+}
+
+/* Entry point for deleting an MDB entry from VXLAN device @dev.
+ *
+ * Must be called with RTNL held (asserted). The configuration is built from
+ * @tb with netlink flags of 0, applied with __vxlan_mdb_del() and then
+ * finalized in all cases.
+ *
+ * Returns 0 on success, or the error from vxlan_mdb_config_init() or
+ * __vxlan_mdb_del().
+ */
+int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[],
+		  struct netlink_ext_ack *extack)
+{
+	struct vxlan_mdb_config cfg;
+	int err;
+
+	ASSERT_RTNL();
+
+	err = vxlan_mdb_config_init(&cfg, dev, tb, 0, extack);
+	if (err)
+		return err;
+
+	err = __vxlan_mdb_del(&cfg, extack);
+
+	vxlan_mdb_config_fini(&cfg);
+	return err;
+}
+
+/* Callback handed to rhashtable_free_and_destroy() by vxlan_mdb_fini().
+ * It unconditionally emits a one-time warning and ignores both arguments.
+ * vxlan_mdb_fini() flushes all entries first, so presumably this only runs
+ * if an entry was left behind in the table.
+ */
+static void vxlan_mdb_check_empty(void *ptr, void *arg)
+{
+	WARN_ON_ONCE(1);
+}
+
+/* Delete every remote of @mdb_entry. The _safe iterator is needed because
+ * vxlan_mdb_remote_del() unlinks the remote being visited.
+ */
+static void vxlan_mdb_remotes_flush(struct vxlan_dev *vxlan,
+				    struct vxlan_mdb_entry *mdb_entry)
+{
+	struct vxlan_mdb_remote *remote, *tmp;
+
+	list_for_each_entry_safe(remote, tmp, &mdb_entry->remotes, list)
+		vxlan_mdb_remote_del(vxlan, mdb_entry, remote);
+}
+
+/* Remove every MDB entry of @vxlan: for each entry on the device's entry
+ * list, delete all of its remotes and then put the entry, which frees it
+ * once its remotes list is empty.
+ */
+static void vxlan_mdb_entries_flush(struct vxlan_dev *vxlan)
+{
+	struct vxlan_mdb_entry *mdb_entry;
+	struct hlist_node *tmp;
+
+	/* The removal of an entry cannot trigger the removal of another entry
+	 * since entries are always added to the head of the list.
+	 */
+	hlist_for_each_entry_safe(mdb_entry, tmp, &vxlan->mdb_list, mdb_node) {
+		vxlan_mdb_remotes_flush(vxlan, mdb_entry);
+		vxlan_mdb_entry_put(vxlan, mdb_entry);
+	}
+}
+
+/* Initialize the MDB state of @vxlan: the mdb_tbl hash table and the empty
+ * list of entries. Returns 0 on success or the error from rhashtable_init(),
+ * in which case the list is left untouched.
+ */
+int vxlan_mdb_init(struct vxlan_dev *vxlan)
+{
+	int err;
+
+	err = rhashtable_init(&vxlan->mdb_tbl, &vxlan_mdb_rht_params);
+	if (err)
+		return err;
+
+	INIT_HLIST_HEAD(&vxlan->mdb_list);
+
+	return 0;
+}
+
+/* Tear down the MDB state of @vxlan: flush all entries, then destroy the
+ * hash table. vxlan_mdb_check_empty() is passed as the per-element callback
+ * and warns if it is ever invoked.
+ */
+void vxlan_mdb_fini(struct vxlan_dev *vxlan)
+{
+	vxlan_mdb_entries_flush(vxlan);
+	rhashtable_free_and_destroy(&vxlan->mdb_tbl, vxlan_mdb_check_empty,
+				    NULL);
+}
diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h
index f4977925cb8a..7bcc38faae27 100644
--- a/drivers/net/vxlan/vxlan_private.h
+++ b/drivers/net/vxlan/vxlan_private.h
@@ -110,6 +110,14 @@ static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
 }
 
+/* Return true if @ip is a multicast address. The address family stored in
+ * @ip selects the IPv6 check; any other family is treated as IPv4.
+ */
+static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip)
+{
+	if (ip->sa.sa_family == AF_INET6)
+		return ipv6_addr_is_multicast(&ip->sin6.sin6_addr);
+	else
+		return ipv4_is_multicast(ip->sin.sin_addr.s_addr);
+}
+
 #else /* !CONFIG_IPV6 */
 
 static inline
@@ -138,8 +146,21 @@ static inline int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
 }
 
+/* Variant for builds without CONFIG_IPV6: @ip is always checked as an IPv4
+ * address, without looking at its address family.
+ */
+static inline bool vxlan_addr_is_multicast(const union vxlan_addr *ip)
+{
+	return ipv4_is_multicast(ip->sin.sin_addr.s_addr);
+}
+
 #endif
 
+/* Return the size in bytes of the raw address held in @ip:
+ * sizeof(struct in6_addr) for AF_INET6, sizeof(__be32) for any other family.
+ */
+static inline size_t vxlan_addr_size(const union vxlan_addr *ip)
+{
+	if (ip->sa.sa_family == AF_INET6)
+		return sizeof(struct in6_addr);
+	else
+		return sizeof(__be32);
+}
+
 static inline struct vxlan_vni_node *
 vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni)
 {
@@ -206,4 +227,14 @@ int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
 		    int rifindex);
 int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
 		     int rifindex);
+
+/* vxlan_mdb.c */
+int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+		   struct netlink_callback *cb);
+int vxlan_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+		  struct netlink_ext_ack *extack);
+int vxlan_mdb_del(struct net_device *dev, struct nlattr *tb[],
+		  struct netlink_ext_ack *extack);
+int vxlan_mdb_init(struct vxlan_dev *vxlan);
+void vxlan_mdb_fini(struct vxlan_dev *vxlan);
 #endif
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index bca5b01af247..110b703d8978 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -3,6 +3,7 @@
 #define __NET_VXLAN_H 1
 
 #include <linux/if_vlan.h>
+#include <linux/rhashtable-types.h>
 #include <net/udp_tunnel.h>
 #include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
@@ -302,6 +303,10 @@ struct vxlan_dev {
 	struct vxlan_vni_group  __rcu *vnigrp;
 
 	struct hlist_head fdb_head[FDB_HASH_SIZE];
+
+	struct rhashtable mdb_tbl;
+	struct hlist_head mdb_list;
+	unsigned int mdb_seq;
 };
 
 #define VXLAN_F_LEARN			0x01
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index d60c456710b3..c9d624f528c5 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -633,6 +633,11 @@ enum {
 	MDBA_MDB_EATTR_GROUP_MODE,
 	MDBA_MDB_EATTR_SOURCE,
 	MDBA_MDB_EATTR_RTPROT,
+	MDBA_MDB_EATTR_DST,
+	MDBA_MDB_EATTR_DST_PORT,
+	MDBA_MDB_EATTR_VNI,
+	MDBA_MDB_EATTR_IFINDEX,
+	MDBA_MDB_EATTR_SRC_VNI,
 	__MDBA_MDB_EATTR_MAX
 };
 #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
@@ -728,6 +733,11 @@ enum {
 	MDBE_ATTR_SRC_LIST,
 	MDBE_ATTR_GROUP_MODE,
 	MDBE_ATTR_RTPROT,
+	MDBE_ATTR_DST,
+	MDBE_ATTR_DST_PORT,
+	MDBE_ATTR_VNI,
+	MDBE_ATTR_IFINDEX,
+	MDBE_ATTR_SRC_VNI,
 	__MDBE_ATTR_MAX,
 };
 #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1)
-- 
cgit v1.2.3


From bc6c6b013ffee36eb555cc0a68aa3d9608e1fad2 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 15 Mar 2023 15:11:52 +0200
Subject: vxlan: mdb: Add an internal flag to indicate MDB usage

Add an internal flag to indicate whether MDB entries are configured or
not. Set the flag after installing the first MDB entry and clear it
before deleting the last one.

The flag will be consulted by the data path which will only perform an
MDB lookup if the flag is set, thereby keeping the MDB overhead to a
minimum when the MDB is not used.

Another option would have been to use a static key, but it is global and
not per-device, unlike the current approach.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan/vxlan_mdb.c | 7 +++++++
 include/net/vxlan.h           | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c
index 129692b3663f..b32b1fb4a74a 100644
--- a/drivers/net/vxlan/vxlan_mdb.c
+++ b/drivers/net/vxlan/vxlan_mdb.c
@@ -1185,6 +1185,9 @@ vxlan_mdb_entry_get(struct vxlan_dev *vxlan,
 	if (err)
 		goto err_free_entry;
 
+	if (hlist_is_singular_node(&mdb_entry->mdb_node, &vxlan->mdb_list))
+		vxlan->cfg.flags |= VXLAN_F_MDB;
+
 	return mdb_entry;
 
 err_free_entry:
@@ -1199,6 +1202,9 @@ static void vxlan_mdb_entry_put(struct vxlan_dev *vxlan,
 	if (!list_empty(&mdb_entry->remotes))
 		return;
 
+	if (hlist_is_singular_node(&mdb_entry->mdb_node, &vxlan->mdb_list))
+		vxlan->cfg.flags &= ~VXLAN_F_MDB;
+
 	rhashtable_remove_fast(&vxlan->mdb_tbl, &mdb_entry->rhnode,
 			       vxlan_mdb_rht_params);
 	hlist_del(&mdb_entry->mdb_node);
@@ -1336,6 +1342,7 @@ int vxlan_mdb_init(struct vxlan_dev *vxlan)
 void vxlan_mdb_fini(struct vxlan_dev *vxlan)
 {
 	vxlan_mdb_entries_flush(vxlan);
+	WARN_ON_ONCE(vxlan->cfg.flags & VXLAN_F_MDB);
 	rhashtable_free_and_destroy(&vxlan->mdb_tbl, vxlan_mdb_check_empty,
 				    NULL);
 }
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 110b703d8978..b7b2e9abfb37 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -327,6 +327,7 @@ struct vxlan_dev {
 #define VXLAN_F_IPV6_LINKLOCAL		0x8000
 #define VXLAN_F_TTL_INHERIT		0x10000
 #define VXLAN_F_VNIFILTER               0x20000
+#define VXLAN_F_MDB			0x40000
 
 /* Flags that are used in the receive path. These flags must match in
  * order for a socket to be shareable
-- 
cgit v1.2.3


From abc17a11ed29b0471e428d86189acca8d1a213c6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:31:55 +0000
Subject: inet: preserve const qualifier in inet_sk()

We can change inet_sk() to propagate const qualifier of its argument.

This should avoid some potential errors caused by accidental
(const -> not_const) promotion.

Other helpers like tcp_sk(), udp_sk(), raw_sk() will be handled
in separate patch series.

v2: use container_of_const() as advised by Jakub and Linus

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/20230315142841.3a2ac99a@kernel.org/
Link: https://lore.kernel.org/netdev/CAHk-=wiOf12nrYEF2vJMcucKjWPN-Ns_SW9fA7LwST_2Dzp7rw@mail.gmail.com/
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h     | 5 +----
 include/trace/events/sock.h | 4 ++--
 include/trace/events/tcp.h  | 2 +-
 net/ipv4/ip_output.c        | 5 +++--
 net/ipv6/ping.c             | 2 +-
 net/ipv6/udp.c              | 2 +-
 net/mptcp/sockopt.c         | 2 +-
 security/lsm_audit.c        | 4 ++--
 8 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 51857117ac09..caa20a905531 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -305,10 +305,7 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
 	return sk_to_full_sk(skb->sk);
 }
 
-static inline struct inet_sock *inet_sk(const struct sock *sk)
-{
-	return (struct inet_sock *)sk;
-}
+#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)
 
 static inline void __inet_sk_copy_descendant(struct sock *sk_to,
 					     const struct sock *sk_from,
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 03d19fc562f8..fd206a6ab5b8 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -158,7 +158,7 @@ TRACE_EVENT(inet_sock_set_state,
 	),
 
 	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 		struct in6_addr *pin6;
 		__be32 *p32;
 
@@ -222,7 +222,7 @@ TRACE_EVENT(inet_sk_error_report,
 	),
 
 	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 		struct in6_addr *pin6;
 		__be32 *p32;
 
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 901b440238d5..bf06db8d2046 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -67,7 +67,7 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 	),
 
 	TP_fast_assign(
-		struct inet_sock *inet = inet_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 		__be32 *p32;
 
 		__entry->skbaddr = skb;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e7bef36ce26f..cb04dbad9ea4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -129,7 +129,8 @@ int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ip_local_out);
 
-static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+static inline int ip_select_ttl(const struct inet_sock *inet,
+				const struct dst_entry *dst)
 {
 	int ttl = inet->uc_ttl;
 
@@ -146,7 +147,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
 			  u8 tos)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 	struct rtable *rt = skb_rtable(skb);
 	struct net *net = sock_net(sk);
 	struct iphdr *iph;
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 808983bc2ec9..c4835dbdfcff 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -237,7 +237,7 @@ static int ping_v6_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
 	} else {
 		int bucket = ((struct ping_iter_state *) seq->private)->bucket;
-		struct inet_sock *inet = inet_sk(v);
+		struct inet_sock *inet = inet_sk((struct sock *)v);
 		__u16 srcp = ntohs(inet->inet_sport);
 		__u16 destp = ntohs(inet->inet_dport);
 		ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9fb2f33ee3a7..ab4ae886235a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1708,7 +1708,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
 	} else {
 		int bucket = ((struct udp_iter_state *)seq->private)->bucket;
-		struct inet_sock *inet = inet_sk(v);
+		const struct inet_sock *inet = inet_sk((const struct sock *)v);
 		__u16 srcp = ntohs(inet->inet_sport);
 		__u16 destp = ntohs(inet->inet_dport);
 		__ip6_dgram_sock_seq_show(seq, v, srcp, destp,
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 8a9656248b0f..5cef4d3d21ac 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1046,7 +1046,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
 
 static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 
 	memset(a, 0, sizeof(*a));
 
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index a7355b4b9bb8..00d3bdd386e2 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -317,7 +317,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 
 			switch (sk->sk_family) {
 			case AF_INET: {
-				struct inet_sock *inet = inet_sk(sk);
+				const struct inet_sock *inet = inet_sk(sk);
 
 				print_ipv4_addr(ab, inet->inet_rcv_saddr,
 						inet->inet_sport,
@@ -329,7 +329,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 			}
 #if IS_ENABLED(CONFIG_IPV6)
 			case AF_INET6: {
-				struct inet_sock *inet = inet_sk(sk);
+				const struct inet_sock *inet = inet_sk(sk);
 
 				print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr,
 						inet->inet_sport,
-- 
cgit v1.2.3


From 33e972bdf0b0aa208b67164c64eef3c307e4b303 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:31:56 +0000
Subject: ipv4: constify ip_mc_sf_allow() socket argument

This clarifies ip_mc_sf_allow() intent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h | 2 +-
 net/ipv4/igmp.c      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index b19d3284551f..ebf4349a53af 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -122,7 +122,7 @@ extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 			sockptr_t optval, sockptr_t optlen);
 extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 			sockptr_t optval, size_t offset);
-extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt,
+extern int ip_mc_sf_allow(const struct sock *sk, __be32 local, __be32 rmt,
 			  int dif, int sdif);
 extern void ip_mc_init_dev(struct in_device *);
 extern void ip_mc_destroy_dev(struct in_device *);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c920aa9a62a9..48ff5f13e797 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2638,10 +2638,10 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 /*
  * check if a multicast source filter allows delivery for a given <src,dst,intf>
  */
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
 		   int dif, int sdif)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 	struct ip_mc_socklist *pmc;
 	struct ip_sf_socklist *psl;
 	int i;
-- 
cgit v1.2.3


From 66eb554c6449cef8c1e1b814f74d13f264582591 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:31:58 +0000
Subject: ipv6: constify inet6_mc_check()

inet6_mc_check() is essentially a read-only function.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 2 +-
 net/ipv6/mcast.c       | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index c04f359655b8..82da55101b5a 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -223,7 +223,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 void __ipv6_sock_mc_close(struct sock *sk);
 void ipv6_sock_mc_close(struct sock *sk);
-bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr);
 
 int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 1c02160cf7a4..714cdc9e2b8e 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -627,12 +627,12 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
 	return 0;
 }
 
-bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr)
 {
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct ipv6_mc_socklist *mc;
-	struct ip6_sf_socklist *psl;
+	const struct ipv6_pinfo *np = inet6_sk(sk);
+	const struct ipv6_mc_socklist *mc;
+	const struct ip6_sf_socklist *psl;
 	bool rv = true;
 
 	rcu_read_lock();
-- 
cgit v1.2.3


From db6af4fdb150b45e1ba6b295ccfd3df482e022d2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:32:00 +0000
Subject: ipv6: raw: constify raw_v6_match() socket argument

This clarifies raw_v6_match() intent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rawv6.h | 2 +-
 net/ipv6/raw.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index bc70909625f6..82810cbe3798 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -6,7 +6,7 @@
 #include <net/raw.h>
 
 extern struct raw_hashinfo raw_v6_hashinfo;
-bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
 		  const struct in6_addr *loc_addr,
 		  const struct in6_addr *rmt_addr, int dif, int sdif);
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index bac9ba747bde..6ac2f2690c44 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -64,7 +64,7 @@
 struct raw_hashinfo raw_v6_hashinfo;
 EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
-bool raw_v6_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
 		  const struct in6_addr *loc_addr,
 		  const struct in6_addr *rmt_addr, int dif, int sdif)
 {
-- 
cgit v1.2.3


From 0a8c2568209ee0c3392593c7c5c7fe41c625a383 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 16 Mar 2023 15:32:01 +0000
Subject: ipv4: raw: constify raw_v4_match() socket argument

This clarifies raw_v4_match() intent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 2 +-
 net/ipv4/raw.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 2c004c20ed99..7ad15830cf38 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -22,7 +22,7 @@
 extern struct proto raw_prot;
 
 extern struct raw_hashinfo raw_v4_hashinfo;
-bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
 		  __be32 raddr, __be32 laddr, int dif, int sdif);
 
 int raw_abort(struct sock *sk, int err);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 94df935ee0c5..3cf68695b40d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -116,10 +116,10 @@ void raw_unhash_sk(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(raw_unhash_sk);
 
-bool raw_v4_match(struct net *net, struct sock *sk, unsigned short num,
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
 		  __be32 raddr, __be32 laddr, int dif, int sdif)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 
 	if (net_eq(sock_net(sk), net) && inet->inet_num == num	&&
 	    !(inet->inet_daddr && inet->inet_daddr != raddr) 	&&
-- 
cgit v1.2.3


From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Fri, 17 Mar 2023 10:56:01 +0100
Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header

Move find_kallsyms_symbol_value() from kernel/module/internal.h to
include/linux/module.h, because internal.h is not prepared to be
included when CONFIG_MODULES=n. find_kallsyms_symbol_value() is used by
kernel/bpf/verifier.c, and including internal.h from it (without modules)
leads to a compilation error:

  In file included from ../include/linux/container_of.h:5,
                   from ../include/linux/list.h:5,
                   from ../include/linux/timer.h:5,
                   from ../include/linux/workqueue.h:9,
                   from ../include/linux/bpf.h:10,
                   from ../include/linux/bpf-cgroup.h:5,
                   from ../kernel/bpf/verifier.c:7:
  ../kernel/bpf/../module/internal.h: In function 'mod_find':
  ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module'
     20 |         static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
        |                                                      ^~
  [...]

This patch fixes the above error.

Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Viktor Malik <vmalik@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/
Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com
---
 include/linux/module.h   | 8 ++++++++
 kernel/bpf/verifier.c    | 2 +-
 kernel/module/internal.h | 6 ------
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4435ad9439ab..41cfd3be57e5 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d62b7127ff2a..99394a2f7ee4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,7 +24,7 @@
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
 #include <linux/poison.h>
-#include "../module/internal.h"
+#include <linux/module.h>
 
 #include "disasm.h"
 
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 5c9170f9135c..1c877561a7d2 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -246,7 +246,6 @@ static inline void kmemleak_load_module(const struct module *mod,
 void init_build_id(struct module *mod, const struct load_info *info);
 void layout_symtab(struct module *mod, struct load_info *info);
 void add_kallsyms(struct module *mod, const struct load_info *info);
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
 
 static inline bool sect_empty(const Elf_Shdr *sect)
 {
@@ -256,11 +255,6 @@ static inline bool sect_empty(const Elf_Shdr *sect)
 static inline void init_build_id(struct module *mod, const struct load_info *info) { }
 static inline void layout_symtab(struct module *mod, struct load_info *info) { }
 static inline void add_kallsyms(struct module *mod, const struct load_info *info) { }
-static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
-						       const char *name)
-{
-	return 0;
-}
 #endif /* CONFIG_KALLSYMS */
 
 #ifdef CONFIG_SYSFS
-- 
cgit v1.2.3


From 6365ba64b4dbe8b59ddaeaa724b281f3787715d5 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
Date: Wed, 8 Mar 2023 15:05:31 +0000
Subject: ptp: kvm: Use decrypted memory in confidential guest on x86

KVM_HC_CLOCK_PAIRING currently fails inside SEV-SNP guests because the
guest passes an address to static data to the host. In confidential
computing the host can't access arbitrary guest memory so handling the
hypercall runs into an "rmpfault". To make the hypercall work, the guest
needs to explicitly mark the memory as decrypted. Do that in
kvm_arch_ptp_init(), but retain the previous behavior for
non-confidential guests to save us from having to allocate memory.

Add a new arch-specific function (kvm_arch_ptp_exit()) to free the
allocation and mark the memory as encrypted again.

Signed-off-by: Jeremi Piotrowski <jpiotrowski@linux.microsoft.com>
Link: https://lore.kernel.org/r/20230308150531.477741-1-jpiotrowski@linux.microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/ptp/ptp_kvm_arm.c    |  4 +++
 drivers/ptp/ptp_kvm_common.c |  1 +
 drivers/ptp/ptp_kvm_x86.c    | 59 +++++++++++++++++++++++++++++++++++---------
 include/linux/ptp_kvm.h      |  1 +
 4 files changed, 54 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_kvm_arm.c b/drivers/ptp/ptp_kvm_arm.c
index b7d28c8dfb84..e68e6943167b 100644
--- a/drivers/ptp/ptp_kvm_arm.c
+++ b/drivers/ptp/ptp_kvm_arm.c
@@ -22,6 +22,10 @@ int kvm_arch_ptp_init(void)
 	return 0;
 }
 
+void kvm_arch_ptp_exit(void)
+{
+}
+
 int kvm_arch_ptp_get_clock(struct timespec64 *ts)
 {
 	return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL);
diff --git a/drivers/ptp/ptp_kvm_common.c b/drivers/ptp/ptp_kvm_common.c
index 9141162c4237..2418977989be 100644
--- a/drivers/ptp/ptp_kvm_common.c
+++ b/drivers/ptp/ptp_kvm_common.c
@@ -130,6 +130,7 @@ static struct kvm_ptp_clock kvm_ptp_clock;
 static void __exit ptp_kvm_exit(void)
 {
 	ptp_clock_unregister(kvm_ptp_clock.ptp_clock);
+	kvm_arch_ptp_exit();
 }
 
 static int __init ptp_kvm_init(void)
diff --git a/drivers/ptp/ptp_kvm_x86.c b/drivers/ptp/ptp_kvm_x86.c
index 4991054a2135..902844cc1a17 100644
--- a/drivers/ptp/ptp_kvm_x86.c
+++ b/drivers/ptp/ptp_kvm_x86.c
@@ -14,27 +14,64 @@
 #include <uapi/linux/kvm_para.h>
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_kvm.h>
+#include <linux/set_memory.h>
 
 static phys_addr_t clock_pair_gpa;
-static struct kvm_clock_pairing clock_pair;
+static struct kvm_clock_pairing clock_pair_glbl;
+static struct kvm_clock_pairing *clock_pair;
 
 int kvm_arch_ptp_init(void)
 {
+	struct page *p;
 	long ret;
 
 	if (!kvm_para_available())
 		return -ENODEV;
 
-	clock_pair_gpa = slow_virt_to_phys(&clock_pair);
-	if (!pvclock_get_pvti_cpu0_va())
-		return -ENODEV;
+	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+		p = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!p)
+			return -ENOMEM;
+
+		clock_pair = page_address(p);
+		ret = set_memory_decrypted((unsigned long)clock_pair, 1);
+		if (ret) {
+			__free_page(p);
+			clock_pair = NULL;
+			goto nofree;
+		}
+	} else {
+		clock_pair = &clock_pair_glbl;
+	}
+
+	clock_pair_gpa = slow_virt_to_phys(clock_pair);
+	if (!pvclock_get_pvti_cpu0_va()) {
+		ret = -ENODEV;
+		goto err;
+	}
 
 	ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
 			     KVM_CLOCK_PAIRING_WALLCLOCK);
-	if (ret == -KVM_ENOSYS)
-		return -ENODEV;
+	if (ret == -KVM_ENOSYS) {
+		ret = -ENODEV;
+		goto err;
+	}
 
 	return ret;
+
+err:
+	kvm_arch_ptp_exit();
+nofree:
+	return ret;
+}
+
+void kvm_arch_ptp_exit(void)
+{
+	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+		WARN_ON(set_memory_encrypted((unsigned long)clock_pair, 1));
+		free_page((unsigned long)clock_pair);
+		clock_pair = NULL;
+	}
 }
 
 int kvm_arch_ptp_get_clock(struct timespec64 *ts)
@@ -49,8 +86,8 @@ int kvm_arch_ptp_get_clock(struct timespec64 *ts)
 		return -EOPNOTSUPP;
 	}
 
-	ts->tv_sec = clock_pair.sec;
-	ts->tv_nsec = clock_pair.nsec;
+	ts->tv_sec = clock_pair->sec;
+	ts->tv_nsec = clock_pair->nsec;
 
 	return 0;
 }
@@ -81,9 +118,9 @@ int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec,
 			pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret);
 			return -EOPNOTSUPP;
 		}
-		tspec->tv_sec = clock_pair.sec;
-		tspec->tv_nsec = clock_pair.nsec;
-		*cycle = __pvclock_read_cycles(src, clock_pair.tsc);
+		tspec->tv_sec = clock_pair->sec;
+		tspec->tv_nsec = clock_pair->nsec;
+		*cycle = __pvclock_read_cycles(src, clock_pair->tsc);
 	} while (pvclock_read_retry(src, version));
 
 	*cs = &kvm_clock;
diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h
index c2e28deef33a..746fd67c3480 100644
--- a/include/linux/ptp_kvm.h
+++ b/include/linux/ptp_kvm.h
@@ -14,6 +14,7 @@ struct timespec64;
 struct clocksource;
 
 int kvm_arch_ptp_init(void);
+void kvm_arch_ptp_exit(void);
 int kvm_arch_ptp_get_clock(struct timespec64 *ts);
 int kvm_arch_ptp_get_crosststamp(u64 *cycle,
 		struct timespec64 *tspec, struct clocksource **cs);
-- 
cgit v1.2.3


From 36bd28c1cb0dbf48645cfe43159907fb3253b33a Mon Sep 17 00:00:00 2001
From: haozhe chang <haozhe.chang@mediatek.com>
Date: Thu, 16 Mar 2023 17:58:20 +0800
Subject: wwan: core: Support slicing in port TX flow of WWAN subsystem

wwan_port_fops_write() passes the SKB to the TX callback of the WWAN
device driver. However, the WWAN device (e.g., t7xx) may have an MTU
smaller than the SKB, causing the TX buffer to be sliced and copied
once more in the WWAN device driver.

This patch implements the slicing in the WWAN subsystem and gives
WWAN device drivers the option to slice (by frag_len) or not. By
doing so, the additional memory copy is reduced.

Meanwhile, this patch gives WWAN device drivers the option to reserve
headroom in fragments for device-specific metadata.

Signed-off-by: haozhe chang <haozhe.chang@mediatek.com>
Reviewed-by: Loic Poulain <loic.poulain@linaro.org>
Link: https://lore.kernel.org/r/20230316095826.181904-1-haozhe.chang@mediatek.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wwan/iosm/iosm_ipc_port.c  |  3 +-
 drivers/net/wwan/mhi_wwan_ctrl.c       |  2 +-
 drivers/net/wwan/rpmsg_wwan_ctrl.c     |  2 +-
 drivers/net/wwan/t7xx/t7xx_port_wwan.c | 36 +++++++++++----------
 drivers/net/wwan/wwan_core.c           | 58 ++++++++++++++++++++++++++--------
 drivers/net/wwan/wwan_hwsim.c          |  2 +-
 drivers/usb/class/cdc-wdm.c            |  3 +-
 include/linux/wwan.h                   | 11 +++++++
 8 files changed, 81 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wwan/iosm/iosm_ipc_port.c b/drivers/net/wwan/iosm/iosm_ipc_port.c
index b6d81c627277..5d5b4183e14a 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_port.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_port.c
@@ -63,7 +63,8 @@ struct iosm_cdev *ipc_port_init(struct iosm_imem *ipc_imem,
 	ipc_port->ipc_imem = ipc_imem;
 
 	ipc_port->iosm_port = wwan_create_port(ipc_port->dev, port_type,
-					       &ipc_wwan_ctrl_ops, ipc_port);
+					       &ipc_wwan_ctrl_ops, NULL,
+					       ipc_port);
 
 	return ipc_port;
 }
diff --git a/drivers/net/wwan/mhi_wwan_ctrl.c b/drivers/net/wwan/mhi_wwan_ctrl.c
index f7ca52353f40..e9f979d2d851 100644
--- a/drivers/net/wwan/mhi_wwan_ctrl.c
+++ b/drivers/net/wwan/mhi_wwan_ctrl.c
@@ -237,7 +237,7 @@ static int mhi_wwan_ctrl_probe(struct mhi_device *mhi_dev,
 
 	/* Register as a wwan port, id->driver_data contains wwan port type */
 	port = wwan_create_port(&cntrl->mhi_dev->dev, id->driver_data,
-				&wwan_pops, mhiwwan);
+				&wwan_pops, NULL, mhiwwan);
 	if (IS_ERR(port)) {
 		kfree(mhiwwan);
 		return PTR_ERR(port);
diff --git a/drivers/net/wwan/rpmsg_wwan_ctrl.c b/drivers/net/wwan/rpmsg_wwan_ctrl.c
index 31c24420ab2e..06f4b02f1552 100644
--- a/drivers/net/wwan/rpmsg_wwan_ctrl.c
+++ b/drivers/net/wwan/rpmsg_wwan_ctrl.c
@@ -129,7 +129,7 @@ static int rpmsg_wwan_ctrl_probe(struct rpmsg_device *rpdev)
 
 	/* Register as a wwan port, id.driver_data contains wwan port type */
 	port = wwan_create_port(parent, rpdev->id.driver_data,
-				&rpmsg_wwan_pops, rpwwan);
+				&rpmsg_wwan_pops, NULL, rpwwan);
 	if (IS_ERR(port))
 		return PTR_ERR(port);
 
diff --git a/drivers/net/wwan/t7xx/t7xx_port_wwan.c b/drivers/net/wwan/t7xx/t7xx_port_wwan.c
index 24bd21942403..17389c8f6600 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_wwan.c
+++ b/drivers/net/wwan/t7xx/t7xx_port_wwan.c
@@ -54,13 +54,13 @@ static void t7xx_port_ctrl_stop(struct wwan_port *port)
 static int t7xx_port_ctrl_tx(struct wwan_port *port, struct sk_buff *skb)
 {
 	struct t7xx_port *port_private = wwan_port_get_drvdata(port);
-	size_t len, offset, chunk_len = 0, txq_mtu = CLDMA_MTU;
 	const struct t7xx_port_conf *port_conf;
+	struct sk_buff *cur = skb, *cloned;
 	struct t7xx_fsm_ctl *ctl;
 	enum md_state md_state;
+	int cnt = 0, ret;
 
-	len = skb->len;
-	if (!len || !port_private->chan_enable)
+	if (!port_private->chan_enable)
 		return -EINVAL;
 
 	port_conf = port_private->port_conf;
@@ -72,23 +72,21 @@ static int t7xx_port_ctrl_tx(struct wwan_port *port, struct sk_buff *skb)
 		return -ENODEV;
 	}
 
-	for (offset = 0; offset < len; offset += chunk_len) {
-		struct sk_buff *skb_ccci;
-		int ret;
-
-		chunk_len = min(len - offset, txq_mtu - sizeof(struct ccci_header));
-		skb_ccci = t7xx_port_alloc_skb(chunk_len);
-		if (!skb_ccci)
-			return -ENOMEM;
-
-		skb_put_data(skb_ccci, skb->data + offset, chunk_len);
-		ret = t7xx_port_send_skb(port_private, skb_ccci, 0, 0);
+	while (cur) {
+		cloned = skb_clone(cur, GFP_KERNEL);
+		cloned->len = skb_headlen(cur);
+		ret = t7xx_port_send_skb(port_private, cloned, 0, 0);
 		if (ret) {
-			dev_kfree_skb_any(skb_ccci);
+			dev_kfree_skb(cloned);
 			dev_err(port_private->dev, "Write error on %s port, %d\n",
 				port_conf->name, ret);
-			return ret;
+			return cnt ? cnt + ret : ret;
 		}
+		cnt += cur->len;
+		if (cur == skb)
+			cur = skb_shinfo(skb)->frag_list;
+		else
+			cur = cur->next;
 	}
 
 	dev_kfree_skb(skb);
@@ -154,13 +152,17 @@ static int t7xx_port_wwan_disable_chl(struct t7xx_port *port)
 static void t7xx_port_wwan_md_state_notify(struct t7xx_port *port, unsigned int state)
 {
 	const struct t7xx_port_conf *port_conf = port->port_conf;
+	unsigned int header_len = sizeof(struct ccci_header);
+	struct wwan_port_caps caps;
 
 	if (state != MD_STATE_READY)
 		return;
 
 	if (!port->wwan.wwan_port) {
+		caps.frag_len = CLDMA_MTU - header_len;
+		caps.headroom_len = header_len;
 		port->wwan.wwan_port = wwan_create_port(port->dev, port_conf->port_type,
-							&wwan_ops, port);
+							&wwan_ops, &caps, port);
 		if (IS_ERR(port->wwan.wwan_port))
 			dev_err(port->dev, "Unable to create WWWAN port %s", port_conf->name);
 	}
diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c
index 966d0ccd2276..2e1c01cf00a9 100644
--- a/drivers/net/wwan/wwan_core.c
+++ b/drivers/net/wwan/wwan_core.c
@@ -67,6 +67,8 @@ struct wwan_device {
  * @rxq: Buffer inbound queue
  * @waitqueue: The waitqueue for port fops (read/write/poll)
  * @data_lock: Port specific data access serialization
+ * @headroom_len: SKB reserved headroom size
+ * @frag_len: Length to fragment packet
  * @at_data: AT port specific data
  */
 struct wwan_port {
@@ -79,6 +81,8 @@ struct wwan_port {
 	struct sk_buff_head rxq;
 	wait_queue_head_t waitqueue;
 	struct mutex data_lock;	/* Port specific data access serialization */
+	size_t headroom_len;
+	size_t frag_len;
 	union {
 		struct {
 			struct ktermios termios;
@@ -426,6 +430,7 @@ static int __wwan_port_dev_assign_name(struct wwan_port *port, const char *fmt)
 struct wwan_port *wwan_create_port(struct device *parent,
 				   enum wwan_port_type type,
 				   const struct wwan_port_ops *ops,
+				   struct wwan_port_caps *caps,
 				   void *drvdata)
 {
 	struct wwan_device *wwandev;
@@ -459,6 +464,8 @@ struct wwan_port *wwan_create_port(struct device *parent,
 
 	port->type = type;
 	port->ops = ops;
+	port->frag_len = caps ? caps->frag_len : SIZE_MAX;
+	port->headroom_len = caps ? caps->headroom_len : 0;
 	mutex_init(&port->ops_lock);
 	skb_queue_head_init(&port->rxq);
 	init_waitqueue_head(&port->waitqueue);
@@ -702,30 +709,53 @@ static ssize_t wwan_port_fops_read(struct file *filp, char __user *buf,
 static ssize_t wwan_port_fops_write(struct file *filp, const char __user *buf,
 				    size_t count, loff_t *offp)
 {
+	struct sk_buff *skb, *head = NULL, *tail = NULL;
 	struct wwan_port *port = filp->private_data;
-	struct sk_buff *skb;
+	size_t frag_len, remain = count;
 	int ret;
 
 	ret = wwan_wait_tx(port, !!(filp->f_flags & O_NONBLOCK));
 	if (ret)
 		return ret;
 
-	skb = alloc_skb(count, GFP_KERNEL);
-	if (!skb)
-		return -ENOMEM;
+	do {
+		frag_len = min(remain, port->frag_len);
+		skb = alloc_skb(frag_len + port->headroom_len, GFP_KERNEL);
+		if (!skb) {
+			ret = -ENOMEM;
+			goto freeskb;
+		}
+		skb_reserve(skb, port->headroom_len);
+
+		if (!head) {
+			head = skb;
+		} else if (!tail) {
+			skb_shinfo(head)->frag_list = skb;
+			tail = skb;
+		} else {
+			tail->next = skb;
+			tail = skb;
+		}
 
-	if (copy_from_user(skb_put(skb, count), buf, count)) {
-		kfree_skb(skb);
-		return -EFAULT;
-	}
+		if (copy_from_user(skb_put(skb, frag_len), buf + count - remain, frag_len)) {
+			ret = -EFAULT;
+			goto freeskb;
+		}
 
-	ret = wwan_port_op_tx(port, skb, !!(filp->f_flags & O_NONBLOCK));
-	if (ret) {
-		kfree_skb(skb);
-		return ret;
-	}
+		if (skb != head) {
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+	} while (remain -= frag_len);
+
+	ret = wwan_port_op_tx(port, head, !!(filp->f_flags & O_NONBLOCK));
+	if (!ret)
+		return count;
 
-	return count;
+freeskb:
+	kfree_skb(head);
+	return ret;
 }
 
 static __poll_t wwan_port_fops_poll(struct file *filp, poll_table *wait)
diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c
index 2397a903d8f5..dfbdaa259a3f 100644
--- a/drivers/net/wwan/wwan_hwsim.c
+++ b/drivers/net/wwan/wwan_hwsim.c
@@ -205,7 +205,7 @@ static struct wwan_hwsim_port *wwan_hwsim_port_new(struct wwan_hwsim_dev *dev)
 
 	port->wwan = wwan_create_port(&dev->dev, WWAN_PORT_AT,
 				      &wwan_hwsim_port_ops,
-				      port);
+				      NULL, port);
 	if (IS_ERR(port->wwan)) {
 		err = PTR_ERR(port->wwan);
 		goto err_free_port;
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 1f0951be15ab..c553decb5461 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -929,7 +929,8 @@ static void wdm_wwan_init(struct wdm_device *desc)
 		return;
 	}
 
-	port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops, desc);
+	port = wwan_create_port(&intf->dev, desc->wwanp_type, &wdm_wwan_port_ops,
+				NULL, desc);
 	if (IS_ERR(port)) {
 		dev_err(&intf->dev, "%s: Unable to create WWAN port\n",
 			dev_name(intf->usb_dev));
diff --git a/include/linux/wwan.h b/include/linux/wwan.h
index 24d76500b1cc..01fa15506286 100644
--- a/include/linux/wwan.h
+++ b/include/linux/wwan.h
@@ -64,11 +64,21 @@ struct wwan_port_ops {
 			    poll_table *wait);
 };
 
+/** struct wwan_port_caps - The WWAN port capabilities
+ * @frag_len: WWAN port TX fragments length
+ * @headroom_len: WWAN port TX fragments reserved headroom length
+ */
+struct wwan_port_caps {
+	size_t frag_len;
+	unsigned int headroom_len;
+};
+
 /**
  * wwan_create_port - Add a new WWAN port
  * @parent: Device to use as parent and shared by all WWAN ports
  * @type: WWAN port type
  * @ops: WWAN port operations
+ * @caps: WWAN port capabilities
  * @drvdata: Pointer to caller driver data
  *
  * Allocate and register a new WWAN port. The port will be automatically exposed
@@ -86,6 +96,7 @@ struct wwan_port_ops {
 struct wwan_port *wwan_create_port(struct device *parent,
 				   enum wwan_port_type type,
 				   const struct wwan_port_ops *ops,
+				   struct wwan_port_caps *caps,
 				   void *drvdata);
 
 /**
-- 
cgit v1.2.3


From c641e9279f3530aa2fe4bcb250477b555b75104a Mon Sep 17 00:00:00 2001
From: Gavin Li <gavinl@nvidia.com>
Date: Thu, 16 Mar 2023 09:07:55 +0200
Subject: vxlan: Expose helper vxlan_build_gbp_hdr

The function vxlan_build_gbp_hdr will be used by other modules to build
gbp option in vxlan header according to gbp flags.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Gavi Teitz <gavi@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/vxlan/vxlan_core.c | 19 -------------------
 include/net/vxlan.h            | 19 +++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 8cc147c0d069..561fe1b314f5 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2093,25 +2093,6 @@ static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
 	return false;
 }
 
-static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, struct vxlan_metadata *md)
-{
-	struct vxlanhdr_gbp *gbp;
-
-	if (!md->gbp)
-		return;
-
-	gbp = (struct vxlanhdr_gbp *)vxh;
-	vxh->vx_flags |= VXLAN_HF_GBP;
-
-	if (md->gbp & VXLAN_GBP_DONT_LEARN)
-		gbp->dont_learn = 1;
-
-	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
-		gbp->policy_applied = 1;
-
-	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
-}
-
 static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol)
 {
 	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index b7b2e9abfb37..20bd7d893e10 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -572,4 +572,23 @@ static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh,
 	return true;
 }
 
+static inline void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, const struct vxlan_metadata *md)
+{
+	struct vxlanhdr_gbp *gbp;
+
+	if (!md->gbp)
+		return;
+
+	gbp = (struct vxlanhdr_gbp *)vxh;
+	vxh->vx_flags |= VXLAN_HF_GBP;
+
+	if (md->gbp & VXLAN_GBP_DONT_LEARN)
+		gbp->dont_learn = 1;
+
+	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
+		gbp->policy_applied = 1;
+
+	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
+}
+
 #endif
-- 
cgit v1.2.3


From bc9d003dc48c381763c3a6309bfc5eecf9962b9c Mon Sep 17 00:00:00 2001
From: Gavin Li <gavinl@nvidia.com>
Date: Thu, 16 Mar 2023 09:07:57 +0200
Subject: ip_tunnel: Preserve pointer const in ip_tunnel_info_opts

Change ip_tunnel_info_opts() from a static inline function to a macro so
that the returned pointer preserves the const-ness of its argument.

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_tunnels.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index fca357679816..255b32a90850 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -67,6 +67,12 @@ struct ip_tunnel_key {
 	GENMASK((sizeof_field(struct ip_tunnel_info,		\
 			      options_len) * BITS_PER_BYTE) - 1, 0)
 
+#define ip_tunnel_info_opts(info)				\
+	_Generic(info,						\
+		 const struct ip_tunnel_info * : ((const void *)((info) + 1)),\
+		 struct ip_tunnel_info * : ((void *)((info) + 1))\
+	)
+
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
 #ifdef CONFIG_DST_CACHE
@@ -485,11 +491,6 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
 	}
 }
 
-static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
-{
-	return info + 1;
-}
-
 static inline void ip_tunnel_info_opts_get(void *to,
 					   const struct ip_tunnel_info *info)
 {
-- 
cgit v1.2.3


From 6ee44c518159c364e5d30ed85d357fe1d8e2c141 Mon Sep 17 00:00:00 2001
From: Gavin Li <gavinl@nvidia.com>
Date: Thu, 16 Mar 2023 09:07:58 +0200
Subject: net/mlx5e: TC, Add support for VxLAN GBP encap/decap flows offload

Add HW offloading support for TC flows with VxLAN GBP encap/decap.

Example of encap rule:
tc filter add dev eth0 protocol ip ingress flower \
    action tunnel_key set id 42 vxlan_opts 512 \
    action mirred egress redirect dev vxlan1

Example of decap rule:
tc filter add dev vxlan1 protocol ip ingress flower \
    enc_key_id 42 enc_dst_port 4789 vxlan_opts 1024 \
    action tunnel_key unset action mirred egress redirect dev eth0

Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Gavi Teitz <gavi@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Acked-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c  | 72 +++++++++++++++++++++-
 include/linux/mlx5/device.h                        |  6 ++
 include/linux/mlx5/mlx5_ifc.h                      | 13 +++-
 3 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
index 1f62c702b625..a184d739d5f8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /* Copyright (c) 2018 Mellanox Technologies. */
 
+#include <net/ip_tunnels.h>
 #include <net/vxlan.h>
 #include "lib/vxlan.h"
 #include "en/tc_tun.h"
@@ -86,9 +87,11 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 	const struct ip_tunnel_key *tun_key = &e->tun_info->key;
 	__be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
 	struct udphdr *udp = (struct udphdr *)(buf);
+	const struct vxlan_metadata *md;
 	struct vxlanhdr *vxh;
 
-	if (tun_key->tun_flags & TUNNEL_VXLAN_OPT)
+	if ((tun_key->tun_flags & TUNNEL_VXLAN_OPT) &&
+	    e->tun_info->options_len != sizeof(*md))
 		return -EOPNOTSUPP;
 	vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr));
 	*ip_proto = IPPROTO_UDP;
@@ -96,6 +99,57 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 	udp->dest = tun_key->tp_dst;
 	vxh->vx_flags = VXLAN_HF_VNI;
 	vxh->vx_vni = vxlan_vni_field(tun_id);
+	if (tun_key->tun_flags & TUNNEL_VXLAN_OPT) {
+		md = ip_tunnel_info_opts(e->tun_info);
+		vxlan_build_gbp_hdr(vxh, md);
+	}
+
+	return 0;
+}
+
+static int mlx5e_tc_tun_parse_vxlan_gbp_option(struct mlx5e_priv *priv,
+					       struct mlx5_flow_spec *spec,
+					       struct flow_cls_offload *f)
+{
+	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
+	struct netlink_ext_ack *extack = f->common.extack;
+	struct flow_match_enc_opts enc_opts;
+	void *misc5_c, *misc5_v;
+	u32 *gbp, *gbp_mask;
+
+	flow_rule_match_enc_opts(rule, &enc_opts);
+
+	if (memchr_inv(&enc_opts.mask->data, 0, sizeof(enc_opts.mask->data)) &&
+	    !MLX5_CAP_ESW_FT_FIELD_SUPPORT_2(priv->mdev, tunnel_header_0_1)) {
+		NL_SET_ERR_MSG_MOD(extack, "Matching on VxLAN GBP is not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (enc_opts.key->dst_opt_type != TUNNEL_VXLAN_OPT) {
+		NL_SET_ERR_MSG_MOD(extack, "Wrong VxLAN option type: not GBP");
+		return -EOPNOTSUPP;
+	}
+
+	if (enc_opts.key->len != sizeof(*gbp) ||
+	    enc_opts.mask->len != sizeof(*gbp_mask)) {
+		NL_SET_ERR_MSG_MOD(extack, "VxLAN GBP option/mask len is not 32 bits");
+		return -EINVAL;
+	}
+
+	gbp = (u32 *)&enc_opts.key->data[0];
+	gbp_mask = (u32 *)&enc_opts.mask->data[0];
+
+	if (*gbp_mask & ~VXLAN_GBP_MASK) {
+		NL_SET_ERR_MSG_FMT_MOD(extack, "Wrong VxLAN GBP mask(0x%08X)\n", *gbp_mask);
+		return -EINVAL;
+	}
+
+	misc5_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_5);
+	misc5_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_5);
+	MLX5_SET(fte_match_set_misc5, misc5_c, tunnel_header_0, *gbp_mask);
+	MLX5_SET(fte_match_set_misc5, misc5_v, tunnel_header_0, *gbp);
+
+	spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_5;
 
 	return 0;
 }
@@ -122,6 +176,14 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
 	if (!enc_keyid.mask->keyid)
 		return 0;
 
+	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
+		int err;
+
+		err = mlx5e_tc_tun_parse_vxlan_gbp_option(priv, spec, f);
+		if (err)
+			return err;
+	}
+
 	/* match on VNI is required */
 
 	if (!MLX5_CAP_ESW_FLOWTABLE_FDB(priv->mdev,
@@ -143,6 +205,12 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
 	return 0;
 }
 
+static bool mlx5e_tc_tun_encap_info_equal_vxlan(struct mlx5e_encap_key *a,
+						struct mlx5e_encap_key *b)
+{
+	return mlx5e_tc_tun_encap_info_equal_options(a, b, TUNNEL_VXLAN_OPT);
+}
+
 static int mlx5e_tc_tun_get_remote_ifindex(struct net_device *mirred_dev)
 {
 	const struct vxlan_dev *vxlan = netdev_priv(mirred_dev);
@@ -160,6 +228,6 @@ struct mlx5e_tc_tunnel vxlan_tunnel = {
 	.generate_ip_tun_hdr  = mlx5e_gen_ip_tunnel_header_vxlan,
 	.parse_udp_ports      = mlx5e_tc_tun_parse_udp_ports_vxlan,
 	.parse_tunnel         = mlx5e_tc_tun_parse_vxlan,
-	.encap_info_equal     = mlx5e_tc_tun_encap_info_equal_generic,
+	.encap_info_equal     = mlx5e_tc_tun_encap_info_equal_vxlan,
 	.get_remote_ifindex   = mlx5e_tc_tun_get_remote_ifindex,
 };
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 71b06ebad402..af4dd536a52c 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1357,6 +1357,12 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_ESW_INGRESS_ACL_MAX(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_ingress.cap)
 
+#define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE(mdev, ft_field_support_2_esw_fdb.cap)
+
+#define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2_MAX(mdev, cap) \
+	MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, ft_field_support_2_esw_fdb.cap)
+
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
 		 mdev->caps.hca[MLX5_CAP_ESWITCH]->cur, cap)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d2c164f0778c..e47d6c58da35 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -404,10 +404,13 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         metadata_reg_c_0[0x1];
 };
 
+/* Table 2170 - Flow Table Fields Supported 2 Format */
 struct mlx5_ifc_flow_table_fields_supported_2_bits {
 	u8         reserved_at_0[0xe];
 	u8         bth_opcode[0x1];
-	u8         reserved_at_f[0x11];
+	u8         reserved_at_f[0x1];
+	u8         tunnel_header_0_1[0x1];
+	u8         reserved_at_11[0xf];
 
 	u8         reserved_at_20[0x60];
 };
@@ -895,7 +898,13 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress;
 
-	u8      reserved_at_800[0x1000];
+	u8      reserved_at_800[0xC00];
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_esw_fdb;
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_esw_fdb;
+
+	u8      reserved_at_1500[0x300];
 
 	u8      sw_steering_fdb_action_drop_icm_address_rx[0x40];
 
-- 
cgit v1.2.3


From 94c540fbfc80ef95ec398fc77da1920de2946edb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:30 +0000
Subject: udp: preserve const qualifier in udp_sk()

We can change udp_sk() to propagate const qualifier of its argument,
thanks to container_of_const()

This should avoid some potential errors caused by accidental
(const -> not_const) promotion.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index a2892e151644..43c1fb2d2c21 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -97,10 +97,7 @@ struct udp_sock {
 
 #define UDP_MAX_SEGMENTS	(1 << 6UL)
 
-static inline struct udp_sock *udp_sk(const struct sock *sk)
-{
-	return (struct udp_sock *)sk;
-}
+#define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk)
 
 static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
 {
-- 
cgit v1.2.3


From 0a2db4630b72486ec2f207ae433c2156b7fd3837 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:32 +0000
Subject: raw: preserve const qualifier in raw_sk()

We can change raw_sk() to propagate const qualifier of its argument,
thanks to container_of_const()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 7ad15830cf38..c215af02f758 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -83,10 +83,7 @@ struct raw_sock {
 	u32		   ipmr_table;
 };
 
-static inline struct raw_sock *raw_sk(const struct sock *sk)
-{
-	return (struct raw_sock *)sk;
-}
+#define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk)
 
 static inline bool raw_sk_bound_dev_eq(struct net *net, int bound_dev_if,
 				       int dif, int sdif)
-- 
cgit v1.2.3


From 47fcae28b9ec409423ba7f67f93e8345acce8a36 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:33 +0000
Subject: ipv6: raw: preserve const qualifier in raw6_sk()

We can change raw6_sk() to propagate its argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 37dfdcfcdd54..839247a4f48e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -336,10 +336,7 @@ static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk)
 	return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL;
 }
 
-static inline struct raw6_sock *raw6_sk(const struct sock *sk)
-{
-	return (struct raw6_sock *)sk;
-}
+#define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk)
 
 #define ipv6_only_sock(sk)	(sk->sk_ipv6only)
 #define ipv6_sk_rxinfo(sk)	((sk)->sk_family == PF_INET6 && \
-- 
cgit v1.2.3


From ae6084b739925d84ef4a481d8eaa2187455f2e2f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:34 +0000
Subject: dccp: preserve const qualifier in dccp_sk()

We can change dccp_sk() to propagate its argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dccp.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/dccp.h b/include/linux/dccp.h
index 07e547c02fd8..325af611909f 100644
--- a/include/linux/dccp.h
+++ b/include/linux/dccp.h
@@ -305,10 +305,8 @@ struct dccp_sock {
 	struct timer_list		dccps_xmit_timer;
 };
 
-static inline struct dccp_sock *dccp_sk(const struct sock *sk)
-{
-	return (struct dccp_sock *)sk;
-}
+#define dccp_sk(ptr)	container_of_const(ptr, struct dccp_sock, \
+					   dccps_inet_connection.icsk_inet.sk)
 
 static inline const char *dccp_role(const struct sock *sk)
 {
-- 
cgit v1.2.3


From b064ba9c3cfaf3d22d6153ec3c245eaa4d913674 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:35 +0000
Subject: af_unix: preserve const qualifier in unix_sk()

We can change unix_sk() to propagate its argument const qualifier,
thanks to container_of_const().

We need to change dump_common_audit_data() 'struct unix_sock *u'
local var to get a const attribute.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/af_unix.h | 5 +----
 security/lsm_audit.c  | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 45ebde587138..824c258143a3 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -74,10 +74,7 @@ struct unix_sock {
 #endif
 };
 
-static inline struct unix_sock *unix_sk(const struct sock *sk)
-{
-	return (struct unix_sock *)sk;
-}
+#define unix_sk(ptr) container_of_const(ptr, struct unix_sock, sk)
 
 #define peer_wait peer_wq.wait
 
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 00d3bdd386e2..368e77ca43c4 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -310,7 +310,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
 	case LSM_AUDIT_DATA_NET:
 		if (a->u.net->sk) {
 			const struct sock *sk = a->u.net->sk;
-			struct unix_sock *u;
+			const struct unix_sock *u;
 			struct unix_address *addr;
 			int len = 0;
 			char *p = NULL;
-- 
cgit v1.2.3


From c7154ca8e075cc456fe773879263b33ae307a59e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:37 +0000
Subject: x25: preserve const qualifier in [a]x25_sk()

We can change [a]x25_sk() to propagate their argument const qualifier,
thanks to container_of_const().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ax25.h | 5 +----
 include/net/x25.h  | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index f8cf3629a419..0d939e5aee4e 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -260,10 +260,7 @@ struct ax25_sock {
 	struct ax25_cb		*cb;
 };
 
-static inline struct ax25_sock *ax25_sk(const struct sock *sk)
-{
-	return (struct ax25_sock *) sk;
-}
+#define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk)
 
 static inline struct ax25_cb *sk_to_ax25(const struct sock *sk)
 {
diff --git a/include/net/x25.h b/include/net/x25.h
index d7d6c2b4ffa7..597eb53c471e 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -177,10 +177,7 @@ struct x25_forward {
 	atomic_t		refcnt;
 };
 
-static inline struct x25_sock *x25_sk(const struct sock *sk)
-{
-	return (struct x25_sock *)sk;
-}
+#define x25_sk(ptr) container_of_const(ptr, struct x25_sock, sk)
 
 /* af_x25.c */
 extern int  sysctl_x25_restart_request_timeout;
-- 
cgit v1.2.3


From e9d9da91548b21e189fcd0259a0f2d26d1afc509 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 17 Mar 2023 15:55:39 +0000
Subject: tcp: preserve const qualifier in tcp_sk()

We can change tcp_sk() to propagate its argument const qualifier,
thanks to container_of_const().

We have two places where a const sock pointer has to be upgraded
to a write one. We have been using const qualifier for lockless
listeners to clearly identify points where writes could happen.

Add tcp_sk_rw() helper to better document these.

tcp_inbound_md5_hash(), __tcp_grow_window(), tcp_reset_check()
and tcp_rack_reo_wnd() get an additional const qualifier
for their @tp local variables.

smc_check_reset_syn_req() also needs a similar change.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 10 ++++++----
 include/net/tcp.h        |  2 +-
 net/ipv4/tcp.c           |  2 +-
 net/ipv4/tcp_input.c     |  4 ++--
 net/ipv4/tcp_minisocks.c |  5 +++--
 net/ipv4/tcp_output.c    |  9 +++++++--
 net/ipv4/tcp_recovery.c  |  2 +-
 7 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ca7f05a130d2..b4c08ac86983 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -472,10 +472,12 @@ enum tsq_flags {
 	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
 };
 
-static inline struct tcp_sock *tcp_sk(const struct sock *sk)
-{
-	return (struct tcp_sock *)sk;
-}
+#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
+
+/* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket.
+ * Used in context of (lockless) tcp listeners.
+ */
+#define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
 
 struct tcp_timewait_sock {
 	struct inet_timewait_sock tw_sk;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..a0a91a988272 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -529,7 +529,7 @@ static inline void tcp_synq_overflow(const struct sock *sk)
 
 	last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
 	if (!time_between32(now, last_overflow, last_overflow + HZ))
-		WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now);
+		WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now);
 }
 
 /* syncookies: no recent synqueue overflow on this listening socket? */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 01569de651b6..fd68d49490f2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4570,7 +4570,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
 	const __u8 *hash_location = NULL;
 	struct tcp_md5sig_key *hash_expected;
 	const struct tcphdr *th = tcp_hdr(skb);
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 	int genhash, l3index;
 	u8 newhash[16];
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 754ddbe0577f..2b75cd9e2e92 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -458,7 +458,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
 static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
 			     unsigned int skbtruesize)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
 	int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
@@ -5693,7 +5693,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
  */
 static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
 			(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9a7ef7732c24..dac0d62120e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -463,7 +463,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 }
 EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
 
-static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
+static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
 				    struct request_sock *req,
 				    struct tcp_sock *newtp)
 {
@@ -492,7 +492,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct tcp_request_sock *treq = tcp_rsk(req);
 	struct inet_connection_sock *newicsk;
-	struct tcp_sock *oldtp, *newtp;
+	const struct tcp_sock *oldtp;
+	struct tcp_sock *newtp;
 	u32 seq;
 
 	if (!newsk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b1e743b39a83..cfe128b81a01 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4127,8 +4127,13 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	if (!res) {
 		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		if (unlikely(tcp_passive_fastopen(sk)))
-			tcp_sk(sk)->total_retrans++;
+		if (unlikely(tcp_passive_fastopen(sk))) {
+			/* sk has const attribute because listeners are lockless.
+			 * However in this case, we are dealing with a passive fastopen
+			 * socket thus we can change total_retrans value.
+			 */
+			tcp_sk_rw(sk)->total_retrans++;
+		}
 		trace_tcp_retransmit_synack(sk, req);
 	}
 	return res;
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 50abaa941387..acf4869c5d3b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -4,7 +4,7 @@
 
 static u32 tcp_rack_reo_wnd(const struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!tp->reord_seen) {
 		/* If reordering has not been observed, be aggressive during
-- 
cgit v1.2.3


From fec53f44945877c8627da4d3ad70e3ac7e204f38 Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Fri, 17 Mar 2023 11:54:09 -0700
Subject: net: mscc: ocelot: expose ocelot_pll5_init routine

Ocelot chips have an internal PLL that must be used when communicating
through external phys. Expose the init routine, so it can be used by other
drivers.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c         | 31 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_vsc7514.c | 30 -----------------------------
 include/soc/mscc/ocelot.h                  |  2 ++
 3 files changed, 33 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 08acb7b89086..9b8403e29445 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -7,6 +7,7 @@
 #include <linux/dsa/ocelot.h>
 #include <linux/if_bridge.h>
 #include <linux/iopoll.h>
+#include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
 #include "ocelot_vcap.h"
@@ -211,6 +212,36 @@ static void ocelot_mact_init(struct ocelot *ocelot)
 	ocelot_write(ocelot, MACACCESS_CMD_INIT, ANA_TABLES_MACACCESS);
 }
 
+void ocelot_pll5_init(struct ocelot *ocelot)
+{
+	/* Configure PLL5. This will need a proper CCF driver
+	 * The values are coming from the VTSS API for Ocelot
+	 */
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG4,
+		     HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
+		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8));
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG0,
+		     HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
+		     HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
+		     HSIO_PLL5G_CFG0_ENA_BIAS |
+		     HSIO_PLL5G_CFG0_ENA_VCO_BUF |
+		     HSIO_PLL5G_CFG0_ENA_CP1 |
+		     HSIO_PLL5G_CFG0_SELCPI(2) |
+		     HSIO_PLL5G_CFG0_LOOP_BW_RES(0xe) |
+		     HSIO_PLL5G_CFG0_SELBGV820(4) |
+		     HSIO_PLL5G_CFG0_DIV4 |
+		     HSIO_PLL5G_CFG0_ENA_CLKTREE |
+		     HSIO_PLL5G_CFG0_ENA_LANE);
+	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG2,
+		     HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
+		     HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
+		     HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
+		     HSIO_PLL5G_CFG2_ENA_AMPCTRL |
+		     HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
+		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10));
+}
+EXPORT_SYMBOL(ocelot_pll5_init);
+
 static void ocelot_vcap_enable(struct ocelot *ocelot, int port)
 {
 	ocelot_write_gix(ocelot, ANA_PORT_VCAP_S2_CFG_S2_ENA |
diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
index 7388c3b0535c..97e90e2869d4 100644
--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
@@ -18,7 +18,6 @@
 
 #include <soc/mscc/ocelot.h>
 #include <soc/mscc/ocelot_vcap.h>
-#include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/vsc7514_regs.h>
 #include "ocelot_fdma.h"
 #include "ocelot.h"
@@ -26,35 +25,6 @@
 #define VSC7514_VCAP_POLICER_BASE			128
 #define VSC7514_VCAP_POLICER_MAX			191
 
-static void ocelot_pll5_init(struct ocelot *ocelot)
-{
-	/* Configure PLL5. This will need a proper CCF driver
-	 * The values are coming from the VTSS API for Ocelot
-	 */
-	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG4,
-		     HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
-		     HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8));
-	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG0,
-		     HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
-		     HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
-		     HSIO_PLL5G_CFG0_ENA_BIAS |
-		     HSIO_PLL5G_CFG0_ENA_VCO_BUF |
-		     HSIO_PLL5G_CFG0_ENA_CP1 |
-		     HSIO_PLL5G_CFG0_SELCPI(2) |
-		     HSIO_PLL5G_CFG0_LOOP_BW_RES(0xe) |
-		     HSIO_PLL5G_CFG0_SELBGV820(4) |
-		     HSIO_PLL5G_CFG0_DIV4 |
-		     HSIO_PLL5G_CFG0_ENA_CLKTREE |
-		     HSIO_PLL5G_CFG0_ENA_LANE);
-	regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG2,
-		     HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
-		     HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
-		     HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
-		     HSIO_PLL5G_CFG2_ENA_AMPCTRL |
-		     HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
-		     HSIO_PLL5G_CFG2_AMPC_SEL(0x10));
-}
-
 static int ocelot_chip_init(struct ocelot *ocelot, const struct ocelot_ops *ops)
 {
 	int ret;
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 2080879e4134..751d9b250615 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1183,4 +1183,6 @@ ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port,
 }
 #endif
 
+void ocelot_pll5_init(struct ocelot *ocelot);
+
 #endif
-- 
cgit v1.2.3


From 69f7f89c0db52c5a3fe1bc9ba69d8248b5ee0bca Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Fri, 17 Mar 2023 11:54:10 -0700
Subject: net: mscc: ocelot: expose generic phylink_mac_config routine

The ocelot-switch driver can utilize the phylink_mac_config routine. Move
this to the ocelot library location and export the symbol to make this
possible.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c     | 26 ++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_net.c | 21 +++------------------
 include/soc/mscc/ocelot.h              |  3 +++
 3 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 9b8403e29445..8292e93a3782 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -809,6 +809,32 @@ static int ocelot_port_flush(struct ocelot *ocelot, int port)
 	return err;
 }
 
+void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
+			       unsigned int link_an_mode,
+			       const struct phylink_link_state *state)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+
+	/* Disable HDX fast control */
+	ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
+			   DEV_PORT_MISC);
+
+	/* SGMII only for now */
+	ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
+			   PCS1G_MODE_CFG);
+	ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
+
+	/* Enable PCS */
+	ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
+
+	/* No aneg on SGMII */
+	ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
+
+	/* No loopback */
+	ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
+}
+EXPORT_SYMBOL_GPL(ocelot_phylink_mac_config);
+
 void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
 				  unsigned int link_an_mode,
 				  phy_interface_t interface,
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index ca4bde861397..590a2b2816ad 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1675,25 +1675,10 @@ static void vsc7514_phylink_mac_config(struct phylink_config *config,
 {
 	struct net_device *ndev = to_net_dev(config->dev);
 	struct ocelot_port_private *priv = netdev_priv(ndev);
-	struct ocelot_port *ocelot_port = &priv->port;
-
-	/* Disable HDX fast control */
-	ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
-			   DEV_PORT_MISC);
-
-	/* SGMII only for now */
-	ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
-			   PCS1G_MODE_CFG);
-	ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
-
-	/* Enable PCS */
-	ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
-
-	/* No aneg on SGMII */
-	ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
+	struct ocelot *ocelot = priv->port.ocelot;
+	int port = priv->port.index;
 
-	/* No loopback */
-	ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
+	ocelot_phylink_mac_config(ocelot, port, link_an_mode, state);
 }
 
 static void vsc7514_phylink_mac_link_down(struct phylink_config *config,
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 751d9b250615..87ade87d3540 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1111,6 +1111,9 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port,
 				   enum devlink_sb_pool_type pool_type,
 				   u32 *p_cur, u32 *p_max);
 
+void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
+			       unsigned int link_an_mode,
+			       const struct phylink_link_state *state);
 void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
 				  unsigned int link_an_mode,
 				  phy_interface_t interface,
-- 
cgit v1.2.3


From dfca93ed51a7cf8bfda876705816a5e55381ac4a Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Fri, 17 Mar 2023 11:54:11 -0700
Subject: net: mscc: ocelot: expose serdes configuration function

During chip initialization, ports that use SGMII / QSGMII to interface to
external phys need to be configured on the VSC7513 and VSC7514. Expose this
configuration routine, so it can be used by DSA drivers.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mscc/ocelot.c     | 40 ++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/mscc/ocelot_net.c | 29 +++---------------------
 include/soc/mscc/ocelot.h              |  4 ++++
 3 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8292e93a3782..1502bb2c8ea7 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -7,6 +7,7 @@
 #include <linux/dsa/ocelot.h>
 #include <linux/if_bridge.h>
 #include <linux/iopoll.h>
+#include <linux/phy/phy.h>
 #include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
@@ -809,6 +810,45 @@ static int ocelot_port_flush(struct ocelot *ocelot, int port)
 	return err;
 }
 
+int ocelot_port_configure_serdes(struct ocelot *ocelot, int port,
+				 struct device_node *portnp)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct device *dev = ocelot->dev;
+	int err;
+
+	/* Ensure clock signals and speed are set on all QSGMII links */
+	if (ocelot_port->phy_mode == PHY_INTERFACE_MODE_QSGMII)
+		ocelot_port_rmwl(ocelot_port, 0,
+				 DEV_CLOCK_CFG_MAC_TX_RST |
+				 DEV_CLOCK_CFG_MAC_RX_RST,
+				 DEV_CLOCK_CFG);
+
+	if (ocelot_port->phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
+		struct phy *serdes = of_phy_get(portnp, NULL);
+
+		if (IS_ERR(serdes)) {
+			err = PTR_ERR(serdes);
+			dev_err_probe(dev, err,
+				      "missing SerDes phys for port %d\n",
+				      port);
+			return err;
+		}
+
+		err = phy_set_mode_ext(serdes, PHY_MODE_ETHERNET,
+				       ocelot_port->phy_mode);
+		of_phy_put(serdes);
+		if (err) {
+			dev_err(dev, "Could not SerDes mode on port %d: %pe\n",
+				port, ERR_PTR(err));
+			return err;
+		}
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ocelot_port_configure_serdes);
+
 void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
 			       unsigned int link_an_mode,
 			       const struct phylink_link_state *state)
diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c
index 590a2b2816ad..21a87a3fc556 100644
--- a/drivers/net/ethernet/mscc/ocelot_net.c
+++ b/drivers/net/ethernet/mscc/ocelot_net.c
@@ -1742,34 +1742,11 @@ static int ocelot_port_phylink_create(struct ocelot *ocelot, int port,
 		return -EINVAL;
 	}
 
-	/* Ensure clock signals and speed are set on all QSGMII links */
-	if (phy_mode == PHY_INTERFACE_MODE_QSGMII)
-		ocelot_port_rmwl(ocelot_port, 0,
-				 DEV_CLOCK_CFG_MAC_TX_RST |
-				 DEV_CLOCK_CFG_MAC_RX_RST,
-				 DEV_CLOCK_CFG);
-
 	ocelot_port->phy_mode = phy_mode;
 
-	if (phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
-		struct phy *serdes = of_phy_get(portnp, NULL);
-
-		if (IS_ERR(serdes)) {
-			err = PTR_ERR(serdes);
-			dev_err_probe(dev, err,
-				      "missing SerDes phys for port %d\n",
-				      port);
-			return err;
-		}
-
-		err = phy_set_mode_ext(serdes, PHY_MODE_ETHERNET, phy_mode);
-		of_phy_put(serdes);
-		if (err) {
-			dev_err(dev, "Could not SerDes mode on port %d: %pe\n",
-				port, ERR_PTR(err));
-			return err;
-		}
-	}
+	err = ocelot_port_configure_serdes(ocelot, port, portnp);
+	if (err)
+		return err;
 
 	priv = container_of(ocelot_port, struct ocelot_port_private, port);
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 87ade87d3540..d757b5e26d26 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -644,6 +644,7 @@ enum ocelot_tag_prefix {
 };
 
 struct ocelot;
+struct device_node;
 
 struct ocelot_ops {
 	struct net_device *(*port_to_netdev)(struct ocelot *ocelot, int port);
@@ -1111,6 +1112,9 @@ int ocelot_sb_occ_tc_port_bind_get(struct ocelot *ocelot, int port,
 				   enum devlink_sb_pool_type pool_type,
 				   u32 *p_cur, u32 *p_max);
 
+int ocelot_port_configure_serdes(struct ocelot *ocelot, int port,
+				 struct device_node *portnp);
+
 void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
 			       unsigned int link_an_mode,
 			       const struct phylink_link_state *state);
-- 
cgit v1.2.3


From e0aeb9b90acf6ee7c2d11141522ffbb5481734d3 Mon Sep 17 00:00:00 2001
From: Raed Salem <raeds@nvidia.com>
Date: Tue, 14 Mar 2023 10:58:39 +0200
Subject: xfrm: add new device offload acquire flag

During the XFRM acquire flow, a default SA is created, to be updated
later once the acquire netlink message is handled in user space. When
the relevant policy is offloaded, this default SA is also offloaded to
the driver supporting IPsec offload. However, this SA has no context
suitable for offloading to HW, nor is it interesting to offload to HW;
consequently, it needs special driver handling, apart from other
offloaded SA(s).
Add a special flag that marks such an SA so the driver can handle it
correctly.

Signed-off-by: Raed Salem <raeds@nvidia.com>
Link: https://lore.kernel.org/r/f5da0834d8c6b82ab9ba38bd4a0c55e71f0e3dab.1678714336.git.leon@kernel.org
Acked-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/net/xfrm.h    | 5 +++++
 net/xfrm/xfrm_state.c | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 3e1f70e8e424..33ee3f5936e6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -138,6 +138,10 @@ enum {
 	XFRM_DEV_OFFLOAD_PACKET,
 };
 
+enum {
+	XFRM_DEV_OFFLOAD_FLAG_ACQ = 1,
+};
+
 struct xfrm_dev_offload {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
@@ -145,6 +149,7 @@ struct xfrm_dev_offload {
 	unsigned long		offload_handle;
 	u8			dir : 2;
 	u8			type : 2;
+	u8			flags : 2;
 };
 
 struct xfrm_mode {
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 2ab3e09e2227..7cca0a1fa5ff 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1272,6 +1272,7 @@ found:
 			xso->dir = xdo->dir;
 			xso->dev = xdo->dev;
 			xso->real_dev = xdo->real_dev;
+			xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ;
 			netdev_tracker_alloc(xso->dev, &xso->dev_tracker,
 					     GFP_ATOMIC);
 			error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL);
-- 
cgit v1.2.3


From a69e332b4ef9f697a4558bab9c442a02b3659fcb Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 18 Mar 2023 21:32:41 +0100
Subject: net: phy: smsc: export functions for use by meson-gxl PHY driver

The Amlogic Meson internal PHYs have the same register layout as
certain SMSC PHYs (also for non-c22-standard registers). This seems
to be more than just coincidence. Apparently they also need the same
workaround for EDPD mode (energy detect power down). Therefore let's
export SMSC PHY driver functionality for use by the meson-gxl PHY
driver.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Chris Healy <healych@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/smsc.c  | 20 +++++++++++++-------
 include/linux/smscphy.h |  6 ++++++
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 721871184205..730964b856ab 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -54,7 +54,7 @@ static int smsc_phy_ack_interrupt(struct phy_device *phydev)
 	return rc < 0 ? rc : 0;
 }
 
-static int smsc_phy_config_intr(struct phy_device *phydev)
+int smsc_phy_config_intr(struct phy_device *phydev)
 {
 	int rc;
 
@@ -75,8 +75,9 @@ static int smsc_phy_config_intr(struct phy_device *phydev)
 
 	return rc < 0 ? rc : 0;
 }
+EXPORT_SYMBOL_GPL(smsc_phy_config_intr);
 
-static irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev)
+irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev)
 {
 	int irq_status;
 
@@ -95,18 +96,20 @@ static irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev)
 
 	return IRQ_HANDLED;
 }
+EXPORT_SYMBOL_GPL(smsc_phy_handle_interrupt);
 
-static int smsc_phy_config_init(struct phy_device *phydev)
+int smsc_phy_config_init(struct phy_device *phydev)
 {
 	struct smsc_phy_priv *priv = phydev->priv;
 
-	if (!priv->energy_enable || phydev->irq != PHY_POLL)
+	if (!priv || !priv->energy_enable || phydev->irq != PHY_POLL)
 		return 0;
 
 	/* Enable energy detect power down mode */
 	return phy_set_bits(phydev, MII_LAN83C185_CTRL_STATUS,
 			    MII_LAN83C185_EDPWRDOWN);
 }
+EXPORT_SYMBOL_GPL(smsc_phy_config_init);
 
 static int smsc_phy_reset(struct phy_device *phydev)
 {
@@ -186,7 +189,7 @@ static int lan95xx_config_aneg_ext(struct phy_device *phydev)
  * The workaround is only applicable to poll mode. Energy Detect Power-Down may
  * not be used in interrupt mode lest link change detection becomes unreliable.
  */
-static int lan87xx_read_status(struct phy_device *phydev)
+int lan87xx_read_status(struct phy_device *phydev)
 {
 	struct smsc_phy_priv *priv = phydev->priv;
 	int err;
@@ -195,7 +198,8 @@ static int lan87xx_read_status(struct phy_device *phydev)
 	if (err)
 		return err;
 
-	if (!phydev->link && priv->energy_enable && phydev->irq == PHY_POLL) {
+	if (!phydev->link && priv && priv->energy_enable &&
+	    phydev->irq == PHY_POLL) {
 		/* Disable EDPD to wake up PHY */
 		int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
 		if (rc < 0)
@@ -229,6 +233,7 @@ static int lan87xx_read_status(struct phy_device *phydev)
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(lan87xx_read_status);
 
 static int smsc_get_sset_count(struct phy_device *phydev)
 {
@@ -269,7 +274,7 @@ static void smsc_get_stats(struct phy_device *phydev,
 		data[i] = smsc_get_stat(phydev, i);
 }
 
-static int smsc_phy_probe(struct phy_device *phydev)
+int smsc_phy_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
 	struct smsc_phy_priv *priv;
@@ -294,6 +299,7 @@ static int smsc_phy_probe(struct phy_device *phydev)
 
 	return clk_set_rate(refclk, 50 * 1000 * 1000);
 }
+EXPORT_SYMBOL_GPL(smsc_phy_probe);
 
 static struct phy_driver smsc_phy_driver[] = {
 {
diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h
index 1a136271ba6a..80f37c1dba58 100644
--- a/include/linux/smscphy.h
+++ b/include/linux/smscphy.h
@@ -28,4 +28,10 @@
 #define MII_LAN83C185_MODE_POWERDOWN 0xC0 /* Power Down mode */
 #define MII_LAN83C185_MODE_ALL       0xE0 /* All capable mode */
 
+int smsc_phy_config_intr(struct phy_device *phydev);
+irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev);
+int smsc_phy_config_init(struct phy_device *phydev);
+int lan87xx_read_status(struct phy_device *phydev);
+int smsc_phy_probe(struct phy_device *phydev);
+
 #endif /* __LINUX_SMSCPHY_H__ */
-- 
cgit v1.2.3


From 4765a9722e09765866e131ec31f7b9cf4c1f4854 Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Sun, 19 Mar 2023 12:57:50 +0000
Subject: net: pcs: add driver for MediaTek SGMII PCS

The SGMII core found in several MediaTek SoCs is identical to what can
also be found in MediaTek's MT7531 Ethernet switch IC.
As this has not always been clear, both drivers developed different
implementations to deal with the PCS.
Recently Alexander Couzens pointed out this fact which led to the
development of this shared driver.

Add a dedicated driver, mostly by copying the code now found in the
Ethernet driver. The now redundant code will be removed by a follow-up
commit.

Suggested-by: Alexander Couzens <lynxis@fe80.eu>
Suggested-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Frank Wunderlich <frank-w@public-files.de>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 MAINTAINERS                       |   8 +
 drivers/net/pcs/Kconfig           |   7 +
 drivers/net/pcs/Makefile          |   1 +
 drivers/net/pcs/pcs-mtk-lynxi.c   | 305 ++++++++++++++++++++++++++++++++++++++
 include/linux/pcs/pcs-mtk-lynxi.h |  13 ++
 5 files changed, 334 insertions(+)
 create mode 100644 drivers/net/pcs/pcs-mtk-lynxi.c
 create mode 100644 include/linux/pcs/pcs-mtk-lynxi.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 30ca644d704f..54a2a8122a97 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13042,6 +13042,14 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/mediatek/
 
+MEDIATEK ETHERNET PCS DRIVER
+M:	Alexander Couzens <lynxis@fe80.eu>
+M:	Daniel Golle <daniel@makrotopia.org>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/pcs/pcs-mtk-lynxi.c
+F:	include/linux/pcs/pcs-mtk-lynxi.h
+
 MEDIATEK I2C CONTROLLER DRIVER
 M:	Qii Wang <qii.wang@mediatek.com>
 L:	linux-i2c@vger.kernel.org
diff --git a/drivers/net/pcs/Kconfig b/drivers/net/pcs/Kconfig
index 6e7e6c346a3e..7c34fb7cbf7b 100644
--- a/drivers/net/pcs/Kconfig
+++ b/drivers/net/pcs/Kconfig
@@ -18,6 +18,13 @@ config PCS_LYNX
 	  This module provides helpers to phylink for managing the Lynx PCS
 	  which is part of the Layerscape and QorIQ Ethernet SERDES.
 
+config PCS_MTK_LYNXI
+	tristate
+	select REGMAP
+	help
+	  This module provides helpers to phylink for managing the LynxI PCS
+	  which is part of MediaTek's SoC and Ethernet switch ICs.
+
 config PCS_RZN1_MIIC
 	tristate "Renesas RZ/N1 MII converter"
 	depends on OF && (ARCH_RZN1 || COMPILE_TEST)
diff --git a/drivers/net/pcs/Makefile b/drivers/net/pcs/Makefile
index 4c780d8f2e98..9b9afd6b1c22 100644
--- a/drivers/net/pcs/Makefile
+++ b/drivers/net/pcs/Makefile
@@ -5,5 +5,6 @@ pcs_xpcs-$(CONFIG_PCS_XPCS)	:= pcs-xpcs.o pcs-xpcs-nxp.o
 
 obj-$(CONFIG_PCS_XPCS)		+= pcs_xpcs.o
 obj-$(CONFIG_PCS_LYNX)		+= pcs-lynx.o
+obj-$(CONFIG_PCS_MTK_LYNXI)	+= pcs-mtk-lynxi.o
 obj-$(CONFIG_PCS_RZN1_MIIC)	+= pcs-rzn1-miic.o
 obj-$(CONFIG_PCS_ALTERA_TSE)	+= pcs-altera-tse.o
diff --git a/drivers/net/pcs/pcs-mtk-lynxi.c b/drivers/net/pcs/pcs-mtk-lynxi.c
new file mode 100644
index 000000000000..888452325edc
--- /dev/null
+++ b/drivers/net/pcs/pcs-mtk-lynxi.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018-2019 MediaTek Inc.
+/* A library for MediaTek SGMII circuit
+ *
+ * Author: Sean Wang <sean.wang@mediatek.com>
+ * Author: Alexander Couzens <lynxis@fe80.eu>
+ * Author: Daniel Golle <daniel@makrotopia.org>
+ *
+ */
+
+#include <linux/mdio.h>
+#include <linux/of.h>
+#include <linux/pcs/pcs-mtk-lynxi.h>
+#include <linux/phylink.h>
+#include <linux/regmap.h>
+
+/* SGMII subsystem config registers */
+/* BMCR (low 16) BMSR (high 16) */
+#define SGMSYS_PCS_CONTROL_1		0x0
+#define SGMII_BMCR			GENMASK(15, 0)
+#define SGMII_BMSR			GENMASK(31, 16)
+
+#define SGMSYS_PCS_DEVICE_ID		0x4
+#define SGMII_LYNXI_DEV_ID		0x4d544950
+
+#define SGMSYS_PCS_ADVERTISE		0x8
+#define SGMII_ADVERTISE			GENMASK(15, 0)
+#define SGMII_LPA			GENMASK(31, 16)
+
+#define SGMSYS_PCS_SCRATCH		0x14
+#define SGMII_DEV_VERSION		GENMASK(31, 16)
+
+/* Register to programmable link timer, the unit in 2 * 8ns */
+#define SGMSYS_PCS_LINK_TIMER		0x18
+#define SGMII_LINK_TIMER_MASK		GENMASK(19, 0)
+#define SGMII_LINK_TIMER_VAL(ns)	FIELD_PREP(SGMII_LINK_TIMER_MASK, \
+						   ((ns) / 2 / 8))
+
+/* Register to control remote fault */
+#define SGMSYS_SGMII_MODE		0x20
+#define SGMII_IF_MODE_SGMII		BIT(0)
+#define SGMII_SPEED_DUPLEX_AN		BIT(1)
+#define SGMII_SPEED_MASK		GENMASK(3, 2)
+#define SGMII_SPEED_10			FIELD_PREP(SGMII_SPEED_MASK, 0)
+#define SGMII_SPEED_100			FIELD_PREP(SGMII_SPEED_MASK, 1)
+#define SGMII_SPEED_1000		FIELD_PREP(SGMII_SPEED_MASK, 2)
+#define SGMII_DUPLEX_HALF		BIT(4)
+#define SGMII_REMOTE_FAULT_DIS		BIT(8)
+
+/* Register to reset SGMII design */
+#define SGMSYS_RESERVED_0		0x34
+#define SGMII_SW_RESET			BIT(0)
+
+/* Register to set SGMII speed, ANA RG_ Control Signals III */
+#define SGMII_PHY_SPEED_MASK		GENMASK(3, 2)
+#define SGMII_PHY_SPEED_1_25G		FIELD_PREP(SGMII_PHY_SPEED_MASK, 0)
+#define SGMII_PHY_SPEED_3_125G		FIELD_PREP(SGMII_PHY_SPEED_MASK, 1)
+
+/* Register to power up QPHY */
+#define SGMSYS_QPHY_PWR_STATE_CTRL	0xe8
+#define	SGMII_PHYA_PWD			BIT(4)
+
+/* Register to QPHY wrapper control */
+#define SGMSYS_QPHY_WRAP_CTRL		0xec
+#define SGMII_PN_SWAP_MASK		GENMASK(1, 0)
+#define SGMII_PN_SWAP_TX_RX		(BIT(0) | BIT(1))
+
+/* struct mtk_pcs_lynxi -  This structure holds each sgmii regmap and associated
+ *                         data
+ * @regmap:                The register map pointing at the range used to setup
+ *                         SGMII modes
+ * @dev:                   Pointer to device owning the PCS
+ * @ana_rgc3:              The offset of register ANA_RGC3 relative to regmap
+ * @interface:             Currently configured interface mode
+ * @pcs:                   Phylink PCS structure
+ * @flags:                 Flags indicating hardware properties
+ */
+struct mtk_pcs_lynxi {
+	struct regmap		*regmap;
+	u32			ana_rgc3;
+	phy_interface_t		interface;
+	struct			phylink_pcs pcs;
+	u32			flags;
+};
+
+static struct mtk_pcs_lynxi *pcs_to_mtk_pcs_lynxi(struct phylink_pcs *pcs)
+{
+	return container_of(pcs, struct mtk_pcs_lynxi, pcs);
+}
+
+static void mtk_pcs_lynxi_get_state(struct phylink_pcs *pcs,
+				    struct phylink_link_state *state)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+	unsigned int bm, adv;
+
+	/* Read the BMSR and LPA */
+	regmap_read(mpcs->regmap, SGMSYS_PCS_CONTROL_1, &bm);
+	regmap_read(mpcs->regmap, SGMSYS_PCS_ADVERTISE, &adv);
+
+	phylink_mii_c22_pcs_decode_state(state, FIELD_GET(SGMII_BMSR, bm),
+					 FIELD_GET(SGMII_LPA, adv));
+}
+
+static int mtk_pcs_lynxi_config(struct phylink_pcs *pcs, unsigned int mode,
+				phy_interface_t interface,
+				const unsigned long *advertising,
+				bool permit_pause_to_mac)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+	bool mode_changed = false, changed, use_an;
+	unsigned int rgc3, sgm_mode, bmcr;
+	int advertise, link_timer;
+
+	advertise = phylink_mii_c22_pcs_encode_advertisement(interface,
+							     advertising);
+	if (advertise < 0)
+		return advertise;
+
+	/* Clearing IF_MODE_BIT0 switches the PCS to BASE-X mode, and
+	 * we assume that fixes its speed at bitrate = line rate (in
+	 * other words, 1000Mbps or 2500Mbps).
+	 */
+	if (interface == PHY_INTERFACE_MODE_SGMII) {
+		sgm_mode = SGMII_IF_MODE_SGMII;
+		if (phylink_autoneg_inband(mode)) {
+			sgm_mode |= SGMII_REMOTE_FAULT_DIS |
+				    SGMII_SPEED_DUPLEX_AN;
+			use_an = true;
+		} else {
+			use_an = false;
+		}
+	} else if (phylink_autoneg_inband(mode)) {
+		/* 1000base-X or 2500base-X autoneg */
+		sgm_mode = SGMII_REMOTE_FAULT_DIS;
+		use_an = linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					   advertising);
+	} else {
+		/* 1000base-X or 2500base-X without autoneg */
+		sgm_mode = 0;
+		use_an = false;
+	}
+
+	if (use_an)
+		bmcr = BMCR_ANENABLE;
+	else
+		bmcr = 0;
+
+	if (mpcs->interface != interface) {
+		link_timer = phylink_get_link_timer_ns(interface);
+		if (link_timer < 0)
+			return link_timer;
+
+		/* PHYA power down */
+		regmap_set_bits(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL,
+				SGMII_PHYA_PWD);
+
+		/* Reset SGMII PCS state */
+		regmap_set_bits(mpcs->regmap, SGMSYS_RESERVED_0,
+				SGMII_SW_RESET);
+
+		if (mpcs->flags & MTK_SGMII_FLAG_PN_SWAP)
+			regmap_update_bits(mpcs->regmap, SGMSYS_QPHY_WRAP_CTRL,
+					   SGMII_PN_SWAP_MASK,
+					   SGMII_PN_SWAP_TX_RX);
+
+		if (interface == PHY_INTERFACE_MODE_2500BASEX)
+			rgc3 = SGMII_PHY_SPEED_3_125G;
+		else
+			rgc3 = SGMII_PHY_SPEED_1_25G;
+
+		/* Configure the underlying interface speed */
+		regmap_update_bits(mpcs->regmap, mpcs->ana_rgc3,
+				   SGMII_PHY_SPEED_MASK, rgc3);
+
+		/* Setup the link timer */
+		regmap_write(mpcs->regmap, SGMSYS_PCS_LINK_TIMER,
+			     SGMII_LINK_TIMER_VAL(link_timer));
+
+		mpcs->interface = interface;
+		mode_changed = true;
+	}
+
+	/* Update the advertisement, noting whether it has changed */
+	regmap_update_bits_check(mpcs->regmap, SGMSYS_PCS_ADVERTISE,
+				 SGMII_ADVERTISE, advertise, &changed);
+
+	/* Update the sgmsys mode register */
+	regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE,
+			   SGMII_REMOTE_FAULT_DIS | SGMII_SPEED_DUPLEX_AN |
+			   SGMII_IF_MODE_SGMII, sgm_mode);
+
+	/* Update the BMCR */
+	regmap_update_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1,
+			   BMCR_ANENABLE, bmcr);
+
+	/* Release PHYA power down state
+	 * Only removing bit SGMII_PHYA_PWD isn't enough.
+	 * There are cases when the SGMII_PHYA_PWD register contains 0x9 which
+	 * prevents SGMII from working. The SGMII still shows link but no traffic
+	 * can flow. Writing 0x0 to the PHYA_PWD register fix the issue. 0x0 was
+	 * taken from a good working state of the SGMII interface.
+	 * Unknown how much the QPHY needs but it is racy without a sleep.
+	 * Tested on mt7622 & mt7986.
+	 */
+	usleep_range(50, 100);
+	regmap_write(mpcs->regmap, SGMSYS_QPHY_PWR_STATE_CTRL, 0);
+
+	return changed || mode_changed;
+}
+
+static void mtk_pcs_lynxi_restart_an(struct phylink_pcs *pcs)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+
+	regmap_set_bits(mpcs->regmap, SGMSYS_PCS_CONTROL_1, BMCR_ANRESTART);
+}
+
+static void mtk_pcs_lynxi_link_up(struct phylink_pcs *pcs, unsigned int mode,
+				  phy_interface_t interface, int speed,
+				  int duplex)
+{
+	struct mtk_pcs_lynxi *mpcs = pcs_to_mtk_pcs_lynxi(pcs);
+	unsigned int sgm_mode;
+
+	if (!phylink_autoneg_inband(mode)) {
+		/* Force the speed and duplex setting */
+		if (speed == SPEED_10)
+			sgm_mode = SGMII_SPEED_10;
+		else if (speed == SPEED_100)
+			sgm_mode = SGMII_SPEED_100;
+		else
+			sgm_mode = SGMII_SPEED_1000;
+
+		if (duplex != DUPLEX_FULL)
+			sgm_mode |= SGMII_DUPLEX_HALF;
+
+		regmap_update_bits(mpcs->regmap, SGMSYS_SGMII_MODE,
+				   SGMII_DUPLEX_HALF | SGMII_SPEED_MASK,
+				   sgm_mode);
+	}
+}
+
+static const struct phylink_pcs_ops mtk_pcs_lynxi_ops = {
+	.pcs_get_state = mtk_pcs_lynxi_get_state,
+	.pcs_config = mtk_pcs_lynxi_config,
+	.pcs_an_restart = mtk_pcs_lynxi_restart_an,
+	.pcs_link_up = mtk_pcs_lynxi_link_up,
+};
+
+struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev,
+					 struct regmap *regmap, u32 ana_rgc3,
+					 u32 flags)
+{
+	struct mtk_pcs_lynxi *mpcs;
+	u32 id, ver;
+	int ret;
+
+	ret = regmap_read(regmap, SGMSYS_PCS_DEVICE_ID, &id);
+	if (ret < 0)
+		return NULL;
+
+	if (id != SGMII_LYNXI_DEV_ID) {
+		dev_err(dev, "unknown PCS device id %08x\n", id);
+		return NULL;
+	}
+
+	ret = regmap_read(regmap, SGMSYS_PCS_SCRATCH, &ver);
+	if (ret < 0)
+		return NULL;
+
+	ver = FIELD_GET(SGMII_DEV_VERSION, ver);
+	if (ver != 0x1) {
+		dev_err(dev, "unknown PCS device version %04x\n", ver);
+		return NULL;
+	}
+
+	dev_dbg(dev, "MediaTek LynxI SGMII PCS (id 0x%08x, ver 0x%04x)\n", id,
+		ver);
+
+	mpcs = kzalloc(sizeof(*mpcs), GFP_KERNEL);
+	if (!mpcs)
+		return NULL;
+
+	mpcs->ana_rgc3 = ana_rgc3;
+	mpcs->regmap = regmap;
+	mpcs->flags = flags;
+	mpcs->pcs.ops = &mtk_pcs_lynxi_ops;
+	mpcs->pcs.poll = true;
+	mpcs->interface = PHY_INTERFACE_MODE_NA;
+
+	return &mpcs->pcs;
+}
+EXPORT_SYMBOL(mtk_pcs_lynxi_create);
+
+void mtk_pcs_lynxi_destroy(struct phylink_pcs *pcs)
+{
+	if (!pcs)
+		return;
+
+	kfree(pcs_to_mtk_pcs_lynxi(pcs));
+}
+EXPORT_SYMBOL(mtk_pcs_lynxi_destroy);
+
+MODULE_LICENSE("GPL");
diff --git a/include/linux/pcs/pcs-mtk-lynxi.h b/include/linux/pcs/pcs-mtk-lynxi.h
new file mode 100644
index 000000000000..be3b4ab32f4a
--- /dev/null
+++ b/include/linux/pcs/pcs-mtk-lynxi.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_PCS_MTK_LYNXI_H
+#define __LINUX_PCS_MTK_LYNXI_H
+
+#include <linux/phylink.h>
+#include <linux/regmap.h>
+
+#define MTK_SGMII_FLAG_PN_SWAP BIT(0)
+struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev,
+					 struct regmap *regmap,
+					 u32 ana_rgc3, u32 flags);
+void mtk_pcs_lynxi_destroy(struct phylink_pcs *pcs);
+#endif
-- 
cgit v1.2.3


From 04aae213e719ec2bb310158c4025316ace50589b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Mar 2023 18:41:13 -0700
Subject: net: skbuff: rename __pkt_vlan_present_offset to __mono_tc_offset

vlan_present is gone since
commit 354259fa73e2 ("net: remove skb->vlan_present").
Rename the offset field to what BPF is currently looking
for in this byte - mono_delivery_time and tc_at_ingress.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20230321014115.997841-2-kuba@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/skbuff.h                               | 4 ++--
 net/core/filter.c                                    | 8 ++++----
 tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3f3a2a82a86b..5a63878a4550 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -955,7 +955,7 @@ struct sk_buff {
 	__u8			csum_valid:1;
 
 	/* private: */
-	__u8			__pkt_vlan_present_offset[0];
+	__u8			__mono_tc_offset[0];
 	/* public: */
 	__u8			remcsum_offload:1;
 	__u8			csum_complete_sw:1;
@@ -1078,7 +1078,7 @@ struct sk_buff {
 #define TC_AT_INGRESS_MASK		(1 << 7)
 #define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
 #endif
-#define PKT_VLAN_PRESENT_OFFSET	offsetof(struct sk_buff, __pkt_vlan_present_offset)
+#define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
 
 #ifdef __KERNEL__
 /*
diff --git a/net/core/filter.c b/net/core/filter.c
index 50f649f1b4a9..3370efad1dda 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9185,7 +9185,7 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
 	__u8 tmp_reg = BPF_REG_AX;
 
 	*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
-			      PKT_VLAN_PRESENT_OFFSET);
+			      SKB_BF_MONO_TC_OFFSET);
 	*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
 				SKB_MONO_DELIVERY_TIME_MASK, 2);
 	*insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
@@ -9232,7 +9232,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
 		/* AX is needed because src_reg and dst_reg could be the same */
 		__u8 tmp_reg = BPF_REG_AX;
 
-		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
 		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
 					TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
 		*insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
@@ -9267,14 +9267,14 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
 	if (!prog->tstamp_type_access) {
 		__u8 tmp_reg = BPF_REG_AX;
 
-		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET);
+		*insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
 		/* Writing __sk_buff->tstamp as ingress, goto <clear> */
 		*insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
 		/* goto <store> */
 		*insn++ = BPF_JMP_A(2);
 		/* <clear>: mono_delivery_time */
 		*insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
-		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, PKT_VLAN_PRESENT_OFFSET);
+		*insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
 	}
 #endif
 
diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
index d5fe3d4b936c..ae7b6e50e405 100644
--- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
@@ -68,17 +68,17 @@ static struct test_case test_cases[] = {
 #if defined(__x86_64__) || defined(__aarch64__)
 	{
 		N(SCHED_CLS, struct __sk_buff, tstamp),
-		.read  = "r11 = *(u8 *)($ctx + sk_buff::__pkt_vlan_present_offset);"
+		.read  = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
 			 "w11 &= 160;"
 			 "if w11 != 0xa0 goto pc+2;"
 			 "$dst = 0;"
 			 "goto pc+1;"
 			 "$dst = *(u64 *)($ctx + sk_buff::tstamp);",
-		.write = "r11 = *(u8 *)($ctx + sk_buff::__pkt_vlan_present_offset);"
+		.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
 			 "if w11 & 0x80 goto pc+1;"
 			 "goto pc+2;"
 			 "w11 &= -33;"
-			 "*(u8 *)($ctx + sk_buff::__pkt_vlan_present_offset) = r11;"
+			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
 			 "*(u64 *)($ctx + sk_buff::tstamp) = $src;",
 	},
 #endif
-- 
cgit v1.2.3


From b94e032b7ad6318b36615f0e0cc3b0d61a5531e8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Mar 2023 18:41:14 -0700
Subject: net: skbuff: reorder bytes 2 and 3 of the bitfield

BPF needs to know the offsets of fields it tries to access.
Zero-length fields are added to make offsetof() work.
This unfortunately partitions the bitfield (fields across
the zero-length members can't be coalesced).

Reorder bytes 2 and 3: BPF needs to know the offset of fields
previously in byte 3, and some fields in byte 2 should really
be optional.

The two bytes are always in the same cacheline so it should
not matter.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20230321014115.997841-3-kuba@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/skbuff.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5a63878a4550..36d31e74db37 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -944,16 +944,6 @@ struct sk_buff {
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;
 
-	__u8			l4_hash:1;
-	__u8			sw_hash:1;
-	__u8			wifi_acked_valid:1;
-	__u8			wifi_acked:1;
-	__u8			no_fcs:1;
-	/* Indicates the inner headers are valid in the skbuff. */
-	__u8			encapsulation:1;
-	__u8			encap_hdr_csum:1;
-	__u8			csum_valid:1;
-
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
@@ -966,6 +956,16 @@ struct sk_buff {
 	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
 #endif
+
+	__u8			l4_hash:1;
+	__u8			sw_hash:1;
+	__u8			wifi_acked_valid:1;
+	__u8			wifi_acked:1;
+	__u8			no_fcs:1;
+	/* Indicates the inner headers are valid in the skbuff. */
+	__u8			encapsulation:1;
+	__u8			encap_hdr_csum:1;
+	__u8			csum_valid:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
 #endif
-- 
cgit v1.2.3


From c0ba861117c3e8deb03855d7dc5a7717958bbb18 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 20 Mar 2023 18:41:15 -0700
Subject: net: skbuff: move the fields BPF cares about directly next to the
 offset marker

To avoid more possible BPF dependencies with moving bitfields
around, keep the fields BPF cares about right next to the offset
marker.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20230321014115.997841-4-kuba@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/skbuff.h                               | 18 +++++++++---------
 tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 36d31e74db37..6aeb0e7b9511 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -947,15 +947,15 @@ struct sk_buff {
 	/* private: */
 	__u8			__mono_tc_offset[0];
 	/* public: */
-	__u8			remcsum_offload:1;
-	__u8			csum_complete_sw:1;
-	__u8			csum_level:2;
-	__u8			dst_pending_confirm:1;
 	__u8			mono_delivery_time:1;	/* See SKB_MONO_DELIVERY_TIME_MASK */
 #ifdef CONFIG_NET_CLS_ACT
-	__u8			tc_skip_classify:1;
 	__u8			tc_at_ingress:1;	/* See TC_AT_INGRESS_MASK */
+	__u8			tc_skip_classify:1;
 #endif
+	__u8			remcsum_offload:1;
+	__u8			csum_complete_sw:1;
+	__u8			csum_level:2;
+	__u8			dst_pending_confirm:1;
 
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
@@ -1072,11 +1072,11 @@ struct sk_buff {
  * around, you also must adapt these constants.
  */
 #ifdef __BIG_ENDIAN_BITFIELD
-#define TC_AT_INGRESS_MASK		(1 << 0)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 2)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 7)
+#define TC_AT_INGRESS_MASK		(1 << 6)
 #else
-#define TC_AT_INGRESS_MASK		(1 << 7)
-#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 5)
+#define SKB_MONO_DELIVERY_TIME_MASK	(1 << 0)
+#define TC_AT_INGRESS_MASK		(1 << 1)
 #endif
 #define SKB_BF_MONO_TC_OFFSET		offsetof(struct sk_buff, __mono_tc_offset)
 
diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
index ae7b6e50e405..4951aa978f33 100644
--- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c
@@ -69,15 +69,15 @@ static struct test_case test_cases[] = {
 	{
 		N(SCHED_CLS, struct __sk_buff, tstamp),
 		.read  = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
-			 "w11 &= 160;"
-			 "if w11 != 0xa0 goto pc+2;"
+			 "w11 &= 3;"
+			 "if w11 != 0x3 goto pc+2;"
 			 "$dst = 0;"
 			 "goto pc+1;"
 			 "$dst = *(u64 *)($ctx + sk_buff::tstamp);",
 		.write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);"
-			 "if w11 & 0x80 goto pc+1;"
+			 "if w11 & 0x2 goto pc+1;"
 			 "goto pc+2;"
-			 "w11 &= -33;"
+			 "w11 &= -2;"
 			 "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;"
 			 "*(u64 *)($ctx + sk_buff::tstamp) = $src;",
 	},
-- 
cgit v1.2.3


From 09eed1192cec1755967f2af8394207acdde579a1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Mar 2023 04:01:14 +0000
Subject: neighbour: switch to standard rcu, instead of rcu_bh

rcu_bh is no longer a win, especially for objects freed
with standard call_rcu().

Switch neighbour code to no longer disable BH when not necessary.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/arp.h        |  8 +++---
 include/net/ndisc.h      | 12 ++++-----
 include/net/neighbour.h  |  6 ++---
 include/net/nexthop.h    |  6 ++---
 net/core/filter.c        | 16 +++++++-----
 net/core/neighbour.c     | 64 ++++++++++++++++++++++++------------------------
 net/ipv4/fib_semantics.c |  4 +--
 net/ipv4/ip_output.c     |  6 ++---
 net/ipv4/nexthop.c       |  8 +++---
 net/ipv4/route.c         |  4 +--
 net/ipv6/addrconf.c      | 14 +++++------
 net/ipv6/ip6_output.c    | 10 ++++----
 net/ipv6/route.c         | 12 ++++-----
 13 files changed, 87 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/include/net/arp.h b/include/net/arp.h
index d7ef4ec71dfe..e8747e0713c7 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -38,11 +38,11 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv4_neigh_lookup_noref(dev, key);
 	if (n && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
@@ -51,10 +51,10 @@ static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key)
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv4_neigh_lookup_noref(dev, key);
 	neigh_confirm(n);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 
 void arp_init(void);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 07e5168cdaf9..52eae0943433 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -395,11 +395,11 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(dev, pkey);
 	if (n && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
@@ -409,10 +409,10 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev,
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(dev, pkey);
 	neigh_confirm(n);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 
 static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
@@ -420,10 +420,10 @@ static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
 {
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref_stub(dev, pkey);
 	neigh_confirm(n);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 
 /* uses ipv6_stub and is meant for use outside of IPv6 core */
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index c8d39bba2a0d..3fa5774bddac 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -299,14 +299,14 @@ static inline struct neighbour *___neigh_lookup_noref(
 	const void *pkey,
 	struct net_device *dev)
 {
-	struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht);
+	struct neigh_hash_table *nht = rcu_dereference(tbl->nht);
 	struct neighbour *n;
 	u32 hash_val;
 
 	hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
-	for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
+	for (n = rcu_dereference(nht->hash_buckets[hash_val]);
 	     n != NULL;
-	     n = rcu_dereference_bh(n->next)) {
+	     n = rcu_dereference(n->next)) {
 		if (n->dev == dev && key_eq(n, pkey))
 			return n;
 	}
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 28085b995ddc..9fa291a04621 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -498,7 +498,7 @@ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 }
 
 /* Variant of nexthop_fib6_nh().
- * Caller should either hold rcu_read_lock_bh(), or RTNL.
+ * Caller should either hold rcu_read_lock(), or RTNL.
  */
 static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
 {
@@ -507,13 +507,13 @@ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
 	if (nh->is_group) {
 		struct nh_group *nh_grp;
 
-		nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp);
+		nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 		nh = nexthop_mpath_select(nh_grp, 0);
 		if (!nh)
 			return NULL;
 	}
 
-	nhi = rcu_dereference_bh_rtnl(nh->nh_info);
+	nhi = rcu_dereference_rtnl(nh->nh_info);
 	if (nhi->family == AF_INET6)
 		return &nhi->fib6_nh;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index d052fac28d02..a8c8fd96c822 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2204,7 +2204,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 			return -ENOMEM;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	if (!nh) {
 		dst = skb_dst(skb);
 		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
@@ -2217,10 +2217,12 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 		int ret;
 
 		sock_confirm_neigh(skb, neigh);
+		local_bh_disable();
 		dev_xmit_recursion_inc();
 		ret = neigh_output(neigh, skb, false);
 		dev_xmit_recursion_dec();
-		rcu_read_unlock_bh();
+		local_bh_enable();
+		rcu_read_unlock();
 		return ret;
 	}
 	rcu_read_unlock_bh();
@@ -2302,7 +2304,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 			return -ENOMEM;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	if (!nh) {
 		struct dst_entry *dst = skb_dst(skb);
 		struct rtable *rt = container_of(dst, struct rtable, dst);
@@ -2314,7 +2316,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 	} else if (nh->nh_family == AF_INET) {
 		neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
 	} else {
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 		goto out_drop;
 	}
 
@@ -2322,13 +2324,15 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 		int ret;
 
 		sock_confirm_neigh(skb, neigh);
+		local_bh_disable();
 		dev_xmit_recursion_inc();
 		ret = neigh_output(neigh, skb, is_v6gw);
 		dev_xmit_recursion_dec();
-		rcu_read_unlock_bh();
+		local_bh_enable();
+		rcu_read_unlock();
 		return ret;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 out_drop:
 	kfree_skb(skb);
 	return -ENETDOWN;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 90d399b3f980..ddd0f32de20e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -614,7 +614,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __neigh_lookup_noref(tbl, pkey, dev);
 	if (n) {
 		if (!refcount_inc_not_zero(&n->refcnt))
@@ -622,7 +622,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 		NEIGH_CACHE_STAT_INC(tbl, hits);
 	}
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	return n;
 }
 EXPORT_SYMBOL(neigh_lookup);
@@ -2184,11 +2184,11 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
 		};
 
-		rcu_read_lock_bh();
-		nht = rcu_dereference_bh(tbl->nht);
+		rcu_read_lock();
+		nht = rcu_dereference(tbl->nht);
 		ndc.ndtc_hash_rnd = nht->hash_rnd[0];
 		ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 
 		if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
 			goto nla_put_failure;
@@ -2703,15 +2703,15 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 	if (filter->dev_idx || filter->master_idx)
 		flags |= NLM_F_DUMP_FILTERED;
 
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(tbl->nht);
+	rcu_read_lock();
+	nht = rcu_dereference(tbl->nht);
 
 	for (h = s_h; h < (1 << nht->hash_shift); h++) {
 		if (h > s_h)
 			s_idx = 0;
-		for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
+		for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0;
 		     n != NULL;
-		     n = rcu_dereference_bh(n->next)) {
+		     n = rcu_dereference(n->next)) {
 			if (idx < s_idx || !net_eq(dev_net(n->dev), net))
 				goto next;
 			if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
@@ -2730,7 +2730,7 @@ next:
 	}
 	rc = skb->len;
 out:
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	cb->args[1] = h;
 	cb->args[2] = idx;
 	return rc;
@@ -3075,20 +3075,20 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
 	int chain;
 	struct neigh_hash_table *nht;
 
-	rcu_read_lock_bh();
-	nht = rcu_dereference_bh(tbl->nht);
+	rcu_read_lock();
+	nht = rcu_dereference(tbl->nht);
 
-	read_lock(&tbl->lock); /* avoid resizes */
+	read_lock_bh(&tbl->lock); /* avoid resizes */
 	for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
 		struct neighbour *n;
 
-		for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
+		for (n = rcu_dereference(nht->hash_buckets[chain]);
 		     n != NULL;
-		     n = rcu_dereference_bh(n->next))
+		     n = rcu_dereference(n->next))
 			cb(n, cookie);
 	}
-	read_unlock(&tbl->lock);
-	rcu_read_unlock_bh();
+	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_for_each);
 
@@ -3138,7 +3138,7 @@ int neigh_xmit(int index, struct net_device *dev,
 		tbl = neigh_tables[index];
 		if (!tbl)
 			goto out;
-		rcu_read_lock_bh();
+		rcu_read_lock();
 		if (index == NEIGH_ARP_TABLE) {
 			u32 key = *((u32 *)addr);
 
@@ -3150,11 +3150,11 @@ int neigh_xmit(int index, struct net_device *dev,
 			neigh = __neigh_create(tbl, addr, dev, false);
 		err = PTR_ERR(neigh);
 		if (IS_ERR(neigh)) {
-			rcu_read_unlock_bh();
+			rcu_read_unlock();
 			goto out_kfree_skb;
 		}
 		err = neigh->output(neigh, skb);
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 	}
 	else if (index == NEIGH_LINK_TABLE) {
 		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
@@ -3183,7 +3183,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 
 	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
 	for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
-		n = rcu_dereference_bh(nht->hash_buckets[bucket]);
+		n = rcu_dereference(nht->hash_buckets[bucket]);
 
 		while (n) {
 			if (!net_eq(dev_net(n->dev), net))
@@ -3201,7 +3201,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
-			n = rcu_dereference_bh(n->next);
+			n = rcu_dereference(n->next);
 		}
 
 		if (n)
@@ -3225,7 +3225,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 		if (v)
 			return n;
 	}
-	n = rcu_dereference_bh(n->next);
+	n = rcu_dereference(n->next);
 
 	while (1) {
 		while (n) {
@@ -3243,7 +3243,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 			if (READ_ONCE(n->nud_state) & ~NUD_NOARP)
 				break;
 next:
-			n = rcu_dereference_bh(n->next);
+			n = rcu_dereference(n->next);
 		}
 
 		if (n)
@@ -3252,7 +3252,7 @@ next:
 		if (++state->bucket >= (1 << nht->hash_shift))
 			break;
 
-		n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
+		n = rcu_dereference(nht->hash_buckets[state->bucket]);
 	}
 
 	if (n && pos)
@@ -3354,7 +3354,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
 
 void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
 	__acquires(tbl->lock)
-	__acquires(rcu_bh)
+	__acquires(rcu)
 {
 	struct neigh_seq_state *state = seq->private;
 
@@ -3362,9 +3362,9 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
 	state->bucket = 0;
 	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
 
-	rcu_read_lock_bh();
-	state->nht = rcu_dereference_bh(tbl->nht);
-	read_lock(&tbl->lock);
+	rcu_read_lock();
+	state->nht = rcu_dereference(tbl->nht);
+	read_lock_bh(&tbl->lock);
 
 	return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
 }
@@ -3399,13 +3399,13 @@ EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
 	__releases(tbl->lock)
-	__releases(rcu_bh)
+	__releases(rcu)
 {
 	struct neigh_seq_state *state = seq->private;
 	struct neigh_table *tbl = state->tbl;
 
-	read_unlock(&tbl->lock);
-	rcu_read_unlock_bh();
+	read_unlock_bh(&tbl->lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 574ff450c4d2..65ba18a91865 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2191,7 +2191,7 @@ static bool fib_good_nh(const struct fib_nh *nh)
 	if (nh->fib_nh_scope == RT_SCOPE_LINK) {
 		struct neighbour *n;
 
-		rcu_read_lock_bh();
+		rcu_read_lock();
 
 		if (likely(nh->fib_nh_gw_family == AF_INET))
 			n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
@@ -2204,7 +2204,7 @@ static bool fib_good_nh(const struct fib_nh *nh)
 		if (n)
 			state = READ_ONCE(n->nud_state);
 
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 	}
 
 	return !!(state & NUD_VALID);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index cb04dbad9ea4..22a90a9392eb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -219,7 +219,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 			return res;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
 	if (!IS_ERR(neigh)) {
 		int res;
@@ -227,10 +227,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 		sock_confirm_neigh(skb, neigh);
 		/* if crossing protocols, can not use the cached header */
 		res = neigh_output(neigh, skb, is_v6gw);
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 		return res;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
 			    __func__);
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index e28a99f1996b..f95142e56da0 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1124,13 +1124,13 @@ static bool ipv6_good_nh(const struct fib6_nh *nh)
 	int state = NUD_REACHABLE;
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6);
 	if (n)
 		state = READ_ONCE(n->nud_state);
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return !!(state & NUD_VALID);
 }
@@ -1140,14 +1140,14 @@ static bool ipv4_good_nh(const struct fib_nh *nh)
 	int state = NUD_REACHABLE;
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
 				      (__force u32)nh->fib_nh_gw4);
 	if (n)
 		state = READ_ONCE(n->nud_state);
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return !!(state & NUD_VALID);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 232009d216c4..6a0a0bb452e9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -408,7 +408,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	struct net_device *dev = dst->dev;
 	struct neighbour *n;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	if (likely(rt->rt_gw_family == AF_INET)) {
 		n = ip_neigh_gw4(dev, rt->rt_gw4);
@@ -424,7 +424,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 		n = NULL;
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return n;
 }
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index faa47f9ea73a..31e0097878c5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1034,7 +1034,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 	unsigned int hash = inet6_addr_hash(net, &ifa->addr);
 	int err = 0;
 
-	spin_lock(&net->ipv6.addrconf_hash_lock);
+	spin_lock_bh(&net->ipv6.addrconf_hash_lock);
 
 	/* Ignore adding duplicate addresses on an interface */
 	if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
@@ -1044,7 +1044,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 		hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
 	}
 
-	spin_unlock(&net->ipv6.addrconf_hash_lock);
+	spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
 
 	return err;
 }
@@ -1139,15 +1139,15 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	/* For caller */
 	refcount_set(&ifa->refcnt, 1);
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	err = ipv6_add_addr_hash(idev->dev, ifa);
 	if (err < 0) {
-		rcu_read_unlock_bh();
+		rcu_read_unlock();
 		goto out;
 	}
 
-	write_lock(&idev->lock);
+	write_lock_bh(&idev->lock);
 
 	/* Add to inet6_dev unicast addr list. */
 	ipv6_link_dev_addr(idev, ifa);
@@ -1158,9 +1158,9 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
 	}
 
 	in6_ifa_hold(ifa);
-	write_unlock(&idev->lock);
+	write_unlock_bh(&idev->lock);
 
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	inet6addr_notifier_call_chain(NETDEV_UP, ifa);
 out:
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e5ed39a3c65f..0b6140f0179d 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -116,7 +116,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 			return res;
 	}
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 
@@ -124,7 +124,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 		if (unlikely(!neigh))
 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 		if (IS_ERR(neigh)) {
-			rcu_read_unlock_bh();
+			rcu_read_unlock();
 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
 			return -EINVAL;
@@ -132,7 +132,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	}
 	sock_confirm_neigh(skb, neigh);
 	ret = neigh_output(neigh, skb, false);
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -1150,11 +1150,11 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	 * dst entry of the nexthop router
 	 */
 	rt = (struct rt6_info *) *dst;
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 				      rt6_nexthop(rt, &fl6->daddr));
 	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	if (err) {
 		struct inet6_ifaddr *ifp;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e829bd880384..244df77fac87 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -633,7 +633,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 
 	nh_gw = &fib6_nh->fib_nh_gw6;
 	dev = fib6_nh->fib_nh_dev;
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	last_probe = READ_ONCE(fib6_nh->last_probe);
 	idev = __in6_dev_get(dev);
 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
@@ -641,7 +641,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
 			goto out;
 
-		write_lock(&neigh->lock);
+		write_lock_bh(&neigh->lock);
 		if (!(neigh->nud_state & NUD_VALID) &&
 		    time_after(jiffies,
 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
@@ -649,7 +649,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 			if (work)
 				__neigh_set_probe_once(neigh);
 		}
-		write_unlock(&neigh->lock);
+		write_unlock_bh(&neigh->lock);
 	} else if (time_after(jiffies, last_probe +
 				       idev->cnf.rtr_probe_interval)) {
 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
@@ -667,7 +667,7 @@ static void rt6_probe(struct fib6_nh *fib6_nh)
 	}
 
 out:
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 #else
 static inline void rt6_probe(struct fib6_nh *fib6_nh)
@@ -683,7 +683,7 @@ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 	struct neighbour *neigh;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
 					  &fib6_nh->fib_nh_gw6);
 	if (neigh) {
@@ -701,7 +701,7 @@ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return ret;
 }
-- 
cgit v1.2.3


From fe602c87df1b6927562f4ee61edd851bb9578a49 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 21 Mar 2023 04:01:15 +0000
Subject: net: remove rcu_dereference_bh_rtnl()

This helper is no longer used in the tree.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rtnetlink.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 92ad75549e9c..f0c87baaf6c0 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -61,16 +61,6 @@ static inline bool lockdep_rtnl_is_held(void)
 #define rcu_dereference_rtnl(p)					\
 	rcu_dereference_check(p, lockdep_rtnl_is_held())
 
-/**
- * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
- * @p: The pointer to read, prior to dereference
- *
- * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
- * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
- */
-#define rcu_dereference_bh_rtnl(p)				\
-	rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
-
 /**
  * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
  * @p: The pointer to read, prior to dereferencing
-- 
cgit v1.2.3


From 61587f1556fec39e8bafc40c8715f560639a4cf2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 21 Mar 2023 10:12:48 +0100
Subject: wifi: mac80211: add support for letting drivers register tc offload
 support

On newer MediaTek SoCs (e.g. MT7986), WLAN->WLAN or WLAN->Ethernet flows can
be offloaded by the SoC. In order to support that, the .ndo_setup_tc op is
needed.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20230321091248.30947-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  9 +++++++++
 net/mac80211/driver-ops.h  | 17 +++++++++++++++++
 net/mac80211/ieee80211_i.h |  3 ++-
 net/mac80211/iface.c       | 11 +++++++++++
 net/mac80211/trace.h       | 25 +++++++++++++++++++++++++
 5 files changed, 64 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f12edca660ba..fcfe3e9aff3d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4227,6 +4227,10 @@ struct ieee80211_prep_tx_info {
  * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. This is
  *	not restored at HW reset by mac80211 so drivers need to take care of
  *	that.
+ * @net_setup_tc: Called from .ndo_setup_tc in order to prepare hardware
+ *	flow offloading for flows originating from the vif.
+ *	Note that the driver must not assume that the vif driver_data is valid
+ *	at this point, since the callback can be called during netdev teardown.
  */
 struct ieee80211_ops {
 	void (*tx)(struct ieee80211_hw *hw,
@@ -4593,6 +4597,11 @@ struct ieee80211_ops {
 	int (*set_hw_timestamp)(struct ieee80211_hw *hw,
 				struct ieee80211_vif *vif,
 				struct cfg80211_set_hw_timestamp *hwts);
+	int (*net_setup_tc)(struct ieee80211_hw *hw,
+			    struct ieee80211_vif *vif,
+			    struct net_device *dev,
+			    enum tc_setup_type type,
+			    void *type_data);
 };
 
 /**
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index a68d606e6987..0bf208f5bbc5 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1502,6 +1502,23 @@ static inline int drv_net_fill_forward_path(struct ieee80211_local *local,
 	return ret;
 }
 
+static inline int drv_net_setup_tc(struct ieee80211_local *local,
+				   struct ieee80211_sub_if_data *sdata,
+				   struct net_device *dev,
+				   enum tc_setup_type type, void *type_data)
+{
+	int ret = -EOPNOTSUPP;
+
+	sdata = get_bss_sdata(sdata);
+	trace_drv_net_setup_tc(local, sdata, type);
+	if (local->ops->net_setup_tc)
+		ret = local->ops->net_setup_tc(&local->hw, &sdata->vif, dev,
+					       type, type_data);
+	trace_drv_return_int(local, ret);
+
+	return ret;
+}
+
 int drv_change_vif_links(struct ieee80211_local *local,
 			 struct ieee80211_sub_if_data *sdata,
 			 u16 old_links, u16 new_links,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 3d4edc25a69e..b2535614483e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1939,7 +1939,8 @@ void ieee80211_color_collision_detection_work(struct work_struct *work);
 /* interface handling */
 #define MAC80211_SUPPORTED_FEATURES_TX	(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \
 					 NETIF_F_HW_CSUM | NETIF_F_SG | \
-					 NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE)
+					 NETIF_F_HIGHDMA | NETIF_F_GSO_SOFTWARE | \
+					 NETIF_F_HW_TC)
 #define MAC80211_SUPPORTED_FEATURES_RX	(NETIF_F_RXCSUM)
 #define MAC80211_SUPPORTED_FEATURES	(MAC80211_SUPPORTED_FEATURES_TX | \
 					 MAC80211_SUPPORTED_FEATURES_RX)
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 23ed13f15067..bd2c48870add 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -813,6 +813,15 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	dev_fetch_sw_netstats(stats, dev->tstats);
 }
 
+static int ieee80211_netdev_setup_tc(struct net_device *dev,
+				     enum tc_setup_type type, void *type_data)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+
+	return drv_net_setup_tc(local, sdata, dev, type, type_data);
+}
+
 static const struct net_device_ops ieee80211_dataif_ops = {
 	.ndo_open		= ieee80211_open,
 	.ndo_stop		= ieee80211_stop,
@@ -821,6 +830,7 @@ static const struct net_device_ops ieee80211_dataif_ops = {
 	.ndo_set_rx_mode	= ieee80211_set_multicast_list,
 	.ndo_set_mac_address 	= ieee80211_change_mac,
 	.ndo_get_stats64	= ieee80211_get_stats64,
+	.ndo_setup_tc		= ieee80211_netdev_setup_tc,
 };
 
 static u16 ieee80211_monitor_select_queue(struct net_device *dev,
@@ -929,6 +939,7 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = {
 	.ndo_set_mac_address	= ieee80211_change_mac,
 	.ndo_get_stats64	= ieee80211_get_stats64,
 	.ndo_fill_forward_path	= ieee80211_netdev_fill_forward_path,
+	.ndo_setup_tc		= ieee80211_netdev_setup_tc,
 };
 
 static bool ieee80211_iftype_supports_hdr_offload(enum nl80211_iftype iftype)
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 9f4377566c42..e0ccf5fe708a 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2478,6 +2478,31 @@ DEFINE_EVENT(sta_event, drv_net_fill_forward_path,
 	TP_ARGS(local, sdata, sta)
 );
 
+TRACE_EVENT(drv_net_setup_tc,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 u8 type),
+
+	TP_ARGS(local, sdata, type),
+
+	TP_STRUCT__entry(
+		LOCAL_ENTRY
+		VIF_ENTRY
+		__field(u8, type)
+	),
+
+	TP_fast_assign(
+		LOCAL_ASSIGN;
+		VIF_ASSIGN;
+		__entry->type = type;
+	),
+
+	TP_printk(
+		LOCAL_PR_FMT VIF_PR_FMT " type:%d\n",
+		LOCAL_PR_ARG, VIF_PR_ARG, __entry->type
+	)
+);
+
 TRACE_EVENT(drv_change_vif_links,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From fe4a6d2db3bad41e9f22c860596f355af8493ebb Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 14 Mar 2023 10:59:56 +0100
Subject: wifi: mac80211: implement support for yet another mesh A-MSDU format

MT7996 supports mesh A-MSDU subframes in hardware, but uses a
big-endian length field.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ryder Lee <ryder.lee@mediatek.com>
Link: https://lore.kernel.org/r/20230314095956.62085-7-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h  | 11 +++++++----
 net/mac80211/rx.c       | 22 ++++++++++++++++------
 net/mac80211/sta_info.h |  5 ++++-
 net/wireless/util.c     | 36 +++++++++++++++++++++++-------------
 4 files changed, 50 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 7cebba1c4135..86cb048dc924 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6274,10 +6274,13 @@ static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
  * mesh control field.
  *
  * @skb: The input A-MSDU frame without any headers.
- * @mesh_hdr: use standard compliant mesh A-MSDU subframe header
+ * @mesh_hdr: the type of mesh header to test
+ *	0: non-mesh A-MSDU length field
+ *	1: big-endian mesh A-MSDU length field
+ *	2: little-endian mesh A-MSDU length field
  * Returns: true if subframe header lengths are valid for the @mesh_hdr mode
  */
-bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr);
+bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr);
 
 /**
  * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame
@@ -6294,13 +6297,13 @@ bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr);
  * @extra_headroom: The hardware extra headroom for SKBs in the @list.
  * @check_da: DA to check in the inner ethernet header, or NULL
  * @check_sa: SA to check in the inner ethernet header, or NULL
- * @mesh_control: A-MSDU subframe header includes the mesh control field
+ * @mesh_control: see mesh_hdr in ieee80211_is_valid_amsdu
  */
 void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 			      const u8 *addr, enum nl80211_iftype iftype,
 			      const unsigned int extra_headroom,
 			      const u8 *check_da, const u8 *check_sa,
-			      bool mesh_control);
+			      u8 mesh_control);
 
 /**
  * ieee80211_get_8023_tunnel_proto - get RFC1042 or bridge tunnel encap protocol
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 85fb1d3eeb2f..1c957194554b 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2983,13 +2983,23 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset)
 		return RX_DROP_UNUSABLE;
 
 	if (rx->sta && rx->sta->amsdu_mesh_control < 0) {
-		bool valid_std = ieee80211_is_valid_amsdu(skb, true);
-		bool valid_nonstd = ieee80211_is_valid_amsdu(skb, false);
+		s8 valid = -1;
+		int i;
+
+		for (i = 0; i <= 2; i++) {
+			if (!ieee80211_is_valid_amsdu(skb, i))
+				continue;
+
+			if (valid >= 0) {
+				/* ambiguous */
+				valid = -1;
+				break;
+			}
+
+			valid = i;
+		}
 
-		if (valid_std && !valid_nonstd)
-			rx->sta->amsdu_mesh_control = 1;
-		else if (valid_nonstd && !valid_std)
-			rx->sta->amsdu_mesh_control = 0;
+		rx->sta->amsdu_mesh_control = valid;
 	}
 
 	ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index e8e482a82d77..f354d470e174 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -623,7 +623,10 @@ struct link_sta_info {
  * @cparams: CoDel parameters for this station.
  * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED)
  * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer
- *	(-1: not yet known, 0: non-standard [without mesh header], 1: standard)
+ *	(-1: not yet known,
+ *	  0: non-mesh A-MSDU length field
+ *	  1: big-endian mesh A-MSDU length field
+ *	  2: little-endian mesh A-MSDU length field)
  * @fast_tx: TX fastpath information
  * @fast_rx: RX fastpath information
  * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to
diff --git a/net/wireless/util.c b/net/wireless/util.c
index d1a89e82ead0..3bc0c3072e78 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -776,7 +776,24 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
 	return frame;
 }
 
-bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr)
+static u16
+ieee80211_amsdu_subframe_length(void *field, u8 mesh_flags, u8 hdr_type)
+{
+	__le16 *field_le = field;
+	__be16 *field_be = field;
+	u16 len;
+
+	if (hdr_type >= 2)
+		len = le16_to_cpu(*field_le);
+	else
+		len = be16_to_cpu(*field_be);
+	if (hdr_type)
+		len += __ieee80211_get_mesh_hdrlen(mesh_flags);
+
+	return len;
+}
+
+bool ieee80211_is_valid_amsdu(struct sk_buff *skb, u8 mesh_hdr)
 {
 	int offset = 0, remaining, subframe_len, padding;
 
@@ -790,12 +807,8 @@ bool ieee80211_is_valid_amsdu(struct sk_buff *skb, bool mesh_hdr)
 		if (skb_copy_bits(skb, offset + 2 * ETH_ALEN, &hdr, sizeof(hdr)) < 0)
 			return false;
 
-		if (mesh_hdr)
-			len = le16_to_cpu(*(__le16 *)&hdr.len) +
-			      __ieee80211_get_mesh_hdrlen(hdr.mesh_flags);
-		else
-			len = ntohs(hdr.len);
-
+		len = ieee80211_amsdu_subframe_length(&hdr.len, hdr.mesh_flags,
+						      mesh_hdr);
 		subframe_len = sizeof(struct ethhdr) + len;
 		padding = (4 - subframe_len) & 0x3;
 		remaining = skb->len - offset;
@@ -812,7 +825,7 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 			      const u8 *addr, enum nl80211_iftype iftype,
 			      const unsigned int extra_headroom,
 			      const u8 *check_da, const u8 *check_sa,
-			      bool mesh_control)
+			      u8 mesh_control)
 {
 	unsigned int hlen = ALIGN(extra_headroom, 4);
 	struct sk_buff *frame = NULL;
@@ -837,11 +850,8 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
 		skb_copy_bits(skb, offset, &hdr, copy_len);
 		if (iftype == NL80211_IFTYPE_MESH_POINT)
 			mesh_len = __ieee80211_get_mesh_hdrlen(hdr.flags);
-		if (mesh_control)
-			len = le16_to_cpu(*(__le16 *)&hdr.eth.h_proto) + mesh_len;
-		else
-			len = ntohs(hdr.eth.h_proto);
-
+		len = ieee80211_amsdu_subframe_length(&hdr.eth.h_proto, hdr.flags,
+						      mesh_control);
 		subframe_len = sizeof(struct ethhdr) + len;
 		padding = (4 - subframe_len) & 0x3;
 
-- 
cgit v1.2.3


From 8e40c3b6e1538401d2cf8d43087ebe1db8026af9 Mon Sep 17 00:00:00 2001
From: Manikanta Pubbisetty <quic_mpubbise@quicinc.com>
Date: Wed, 8 Mar 2023 16:15:56 +0530
Subject: wifi: nl80211: Update the documentation of
 NL80211_SCAN_FLAG_COLOCATED_6GHZ

Currently when NL80211_SCAN_FLAG_COLOCATED_6GHZ is set in the scan flags,
in addition to the co-located APs, PSC channels in the 6 GHz band would
also be scanned if the user space has asked for it. In other words, the
scan would happen on PSC channels & co-located 6 GHz channels that were
reported in the RNR IE.

Update the documentation of the NL80211_SCAN_FLAG_COLOCATED_6GHZ flag to
reflect the behavior described above.

Signed-off-by: Manikanta Pubbisetty <quic_mpubbise@quicinc.com>
Link: https://lore.kernel.org/r/20230308104556.9399-1-quic_mpubbise@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 9a0ac0363f1f..14e958a32b84 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6544,7 +6544,9 @@ enum nl80211_timeout_reason {
  *	channels on which APs are expected to be found. Note that when not set,
  *	the scan logic would scan all 6GHz channels, but since transmission of
  *	probe requests on non PSC channels is limited, it is highly likely that
- *	these channels would passively be scanned.
+ *	these channels would passively be scanned. Also note that when the flag
+ *	is set, in addition to the colocated APs, PSC channels would also be
+ *	scanned if the user space has asked for it.
  */
 enum nl80211_scan_flags {
 	NL80211_SCAN_FLAG_LOW_PRIORITY				= 1<<0,
-- 
cgit v1.2.3


From bd54f3c29077f23dad92ef82a78061b40be30c65 Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Mon, 5 Dec 2022 16:50:37 -0800
Subject: wifi: mac80211: generate EMA beacons in AP mode

Add APIs to generate an array of beacons for an EMA AP (enhanced
multiple BSSID advertisements), each including a single MBSSID element.
EMA profile periodicity equals the count of elements.

- ieee80211_beacon_get_template_ema_list() - Generate and return all
EMA beacon templates. Drivers must call ieee80211_beacon_free_ema_list()
to free the memory. No change in the prototype for the existing API,
ieee80211_beacon_get_template(), which should be used for non-EMA AP.

- ieee80211_beacon_get_template_ema_index() - Generate a beacon which
includes the multiple BSSID element at the given index. Drivers can use
this function in a loop until NULL is returned which indicates end of
available MBSSID elements.

- ieee80211_beacon_free_ema_list() - free the memory allocated for the
list of EMA beacon templates.

Modify existing functions ieee80211_beacon_get_ap(),
ieee80211_get_mbssid_beacon_len() and ieee80211_beacon_add_mbssid()
to accept a new parameter for EMA index.

Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Co-developed-by: John Crispin <john@phrozen.org>
Signed-off-by: John Crispin <john@phrozen.org>
Link: https://lore.kernel.org/r/20221206005040.3177-2-quic_alokad@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  68 +++++++++++++++++++++++
 net/mac80211/cfg.c         |  11 ++--
 net/mac80211/ieee80211_i.h |  10 +++-
 net/mac80211/tx.c          | 134 +++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 205 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index fcfe3e9aff3d..679421d37a42 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5285,6 +5285,74 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw,
 			      struct ieee80211_mutable_offsets *offs,
 			      unsigned int link_id);
 
+/**
+ * ieee80211_beacon_get_template_ema_index - EMA beacon template generation
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @offs: &struct ieee80211_mutable_offsets pointer to struct that will
+ *	receive the offsets that may be updated by the driver.
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP).
+ * @ema_index: index of the beacon in the EMA set.
+ *
+ * This function follows the same rules as ieee80211_beacon_get_template()
+ * but returns a beacon template which includes multiple BSSID element at the
+ * requested index.
+ *
+ * Return: The beacon template. %NULL indicates the end of EMA templates.
+ */
+struct sk_buff *
+ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw,
+					struct ieee80211_vif *vif,
+					struct ieee80211_mutable_offsets *offs,
+					unsigned int link_id, u8 ema_index);
+
+/**
+ * struct ieee80211_ema_beacons - List of EMA beacons
+ * @cnt: count of EMA beacons.
+ *
+ * @bcn: array of EMA beacons.
+ * @bcn.skb: the skb containing this specific beacon
+ * @bcn.offs: &struct ieee80211_mutable_offsets pointer to struct that will
+ *	receive the offsets that may be updated by the driver.
+ */
+struct ieee80211_ema_beacons {
+	u8 cnt;
+	struct {
+		struct sk_buff *skb;
+		struct ieee80211_mutable_offsets offs;
+	} bcn[];
+};
+
+/**
+ * ieee80211_beacon_get_template_ema_list - EMA beacon template generation
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: the link id to which the beacon belongs (or 0 for a non-MLD AP)
+ *
+ * This function follows the same rules as ieee80211_beacon_get_template()
+ * but allocates and returns a pointer to list of all beacon templates required
+ * to cover all profiles in the multiple BSSID set. Each template includes only
+ * one multiple BSSID element.
+ *
+ * Driver must call ieee80211_beacon_free_ema_list() to free the memory.
+ *
+ * Return: EMA beacon templates of type struct ieee80211_ema_beacons *.
+ *	%NULL on error.
+ */
+struct ieee80211_ema_beacons *
+ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       unsigned int link_id);
+
+/**
+ * ieee80211_beacon_free_ema_list - free an EMA beacon template list
+ * @ema_beacons: list of EMA beacons of type &struct ieee80211_ema_beacons pointers.
+ *
+ * This function will free a list previously acquired by calling
+ * ieee80211_beacon_get_template_ema_list()
+ */
+void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons);
+
 /**
  * ieee80211_beacon_get_tim - beacon generation function
  * @hw: pointer obtained from ieee80211_alloc_hw().
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 760ad934f9e1..db5fa334b801 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1122,11 +1122,11 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
 	if (params->mbssid_ies) {
 		mbssid = params->mbssid_ies;
 		size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
-		size += ieee80211_get_mbssid_beacon_len(mbssid);
+		size += ieee80211_get_mbssid_beacon_len(mbssid, mbssid->cnt);
 	} else if (old && old->mbssid_ies) {
 		mbssid = old->mbssid_ies;
 		size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
-		size += ieee80211_get_mbssid_beacon_len(mbssid);
+		size += ieee80211_get_mbssid_beacon_len(mbssid, mbssid->cnt);
 	}
 
 	new = kzalloc(size, GFP_KERNEL);
@@ -3406,8 +3406,11 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
 
 	len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
 	      beacon->proberesp_ies_len + beacon->assocresp_ies_len +
-	      beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len +
-	      ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies);
+	      beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len;
+
+	if (beacon->mbssid_ies)
+		len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies,
+						       beacon->mbssid_ies->cnt);
 
 	new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
 	if (!new_beacon)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c8c037a9e4e2..84d10e993eca 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1186,13 +1186,17 @@ ieee80211_vif_get_shift(struct ieee80211_vif *vif)
 }
 
 static inline int
-ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems)
+ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, u8 i)
 {
-	int i, len = 0;
+	int len = 0;
 
-	if (!elems)
+	if (!elems || !elems->cnt || i > elems->cnt)
 		return 0;
 
+	if (i < elems->cnt)
+		return elems->elem[i].len;
+
+	/* i == elems->cnt, calculate total length of all MBSSID elements */
 	for (i = 0; i < elems->cnt; i++)
 		len += elems->elem[i].len;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index de17926484bd..139eec6c64da 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -5212,13 +5212,20 @@ ieee80211_beacon_get_finish(struct ieee80211_hw *hw,
 }
 
 static void
-ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon)
+ieee80211_beacon_add_mbssid(struct sk_buff *skb, struct beacon_data *beacon,
+			    u8 i)
 {
-	int i;
+	if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt ||
+	    i > beacon->mbssid_ies->cnt)
+		return;
 
-	if (!beacon->mbssid_ies)
+	if (i < beacon->mbssid_ies->cnt) {
+		skb_put_data(skb, beacon->mbssid_ies->elem[i].data,
+			     beacon->mbssid_ies->elem[i].len);
 		return;
+	}
 
+	/* i == beacon->mbssid_ies->cnt, include all MBSSID elements */
 	for (i = 0; i < beacon->mbssid_ies->cnt; i++)
 		skb_put_data(skb, beacon->mbssid_ies->elem[i].data,
 			     beacon->mbssid_ies->elem[i].len);
@@ -5231,7 +5238,8 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 			struct ieee80211_mutable_offsets *offs,
 			bool is_template,
 			struct beacon_data *beacon,
-			struct ieee80211_chanctx_conf *chanctx_conf)
+			struct ieee80211_chanctx_conf *chanctx_conf,
+			u8 ema_index)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
@@ -5250,7 +5258,9 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	/* headroom, head length,
 	 * tail length, maximum TIM length and multiple BSSID length
 	 */
-	mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies);
+	mbssid_len = ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies,
+						     ema_index);
+
 	skb = dev_alloc_skb(local->tx_headroom + beacon->head_len +
 			    beacon->tail_len + 256 +
 			    local->hw.extra_beacon_tailroom + mbssid_len);
@@ -5268,7 +5278,7 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 		offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0];
 
 		if (mbssid_len) {
-			ieee80211_beacon_add_mbssid(skb, beacon);
+			ieee80211_beacon_add_mbssid(skb, beacon, ema_index);
 			offs->mbssid_off = skb->len - mbssid_len;
 		}
 
@@ -5287,12 +5297,51 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw,
 	return skb;
 }
 
+static struct ieee80211_ema_beacons *
+ieee80211_beacon_get_ap_ema_list(struct ieee80211_hw *hw,
+				 struct ieee80211_vif *vif,
+				 struct ieee80211_link_data *link,
+				 struct ieee80211_mutable_offsets *offs,
+				 bool is_template, struct beacon_data *beacon,
+				 struct ieee80211_chanctx_conf *chanctx_conf)
+{
+	struct ieee80211_ema_beacons *ema = NULL;
+
+	if (!beacon->mbssid_ies || !beacon->mbssid_ies->cnt)
+		return NULL;
+
+	ema = kzalloc(struct_size(ema, bcn, beacon->mbssid_ies->cnt),
+		      GFP_ATOMIC);
+	if (!ema)
+		return NULL;
+
+	for (ema->cnt = 0; ema->cnt < beacon->mbssid_ies->cnt; ema->cnt++) {
+		ema->bcn[ema->cnt].skb =
+			ieee80211_beacon_get_ap(hw, vif, link,
+						&ema->bcn[ema->cnt].offs,
+						is_template, beacon,
+						chanctx_conf, ema->cnt);
+		if (!ema->bcn[ema->cnt].skb)
+			break;
+	}
+
+	if (ema->cnt == beacon->mbssid_ies->cnt)
+		return ema;
+
+	ieee80211_beacon_free_ema_list(ema);
+	return NULL;
+}
+
+#define IEEE80211_INCLUDE_ALL_MBSSID_ELEMS -1
+
 static struct sk_buff *
 __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		       struct ieee80211_vif *vif,
 		       struct ieee80211_mutable_offsets *offs,
 		       bool is_template,
-		       unsigned int link_id)
+		       unsigned int link_id,
+		       int ema_index,
+		       struct ieee80211_ema_beacons **ema_beacons)
 {
 	struct ieee80211_local *local = hw_to_local(hw);
 	struct beacon_data *beacon = NULL;
@@ -5321,8 +5370,29 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw,
 		if (!beacon)
 			goto out;
 
-		skb = ieee80211_beacon_get_ap(hw, vif, link, offs, is_template,
-					      beacon, chanctx_conf);
+		if (ema_beacons) {
+			*ema_beacons =
+				ieee80211_beacon_get_ap_ema_list(hw, vif, link,
+								 offs,
+								 is_template,
+								 beacon,
+								 chanctx_conf);
+		} else {
+			if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) {
+				if (ema_index >= beacon->mbssid_ies->cnt)
+					goto out; /* End of MBSSID elements */
+
+				if (ema_index <= IEEE80211_INCLUDE_ALL_MBSSID_ELEMS)
+					ema_index = beacon->mbssid_ies->cnt;
+			} else {
+				ema_index = 0;
+			}
+
+			skb = ieee80211_beacon_get_ap(hw, vif, link, offs,
+						      is_template, beacon,
+						      chanctx_conf,
+						      ema_index);
+		}
 	} else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
 		struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
 		struct ieee80211_hdr *hdr;
@@ -5410,10 +5480,50 @@ ieee80211_beacon_get_template(struct ieee80211_hw *hw,
 			      struct ieee80211_mutable_offsets *offs,
 			      unsigned int link_id)
 {
-	return __ieee80211_beacon_get(hw, vif, offs, true, link_id);
+	return __ieee80211_beacon_get(hw, vif, offs, true, link_id,
+				      IEEE80211_INCLUDE_ALL_MBSSID_ELEMS, NULL);
 }
 EXPORT_SYMBOL(ieee80211_beacon_get_template);
 
+struct sk_buff *
+ieee80211_beacon_get_template_ema_index(struct ieee80211_hw *hw,
+					struct ieee80211_vif *vif,
+					struct ieee80211_mutable_offsets *offs,
+					unsigned int link_id, u8 ema_index)
+{
+	return __ieee80211_beacon_get(hw, vif, offs, true, link_id, ema_index,
+				      NULL);
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_index);
+
+void ieee80211_beacon_free_ema_list(struct ieee80211_ema_beacons *ema_beacons)
+{
+	u8 i;
+
+	if (!ema_beacons)
+		return;
+
+	for (i = 0; i < ema_beacons->cnt; i++)
+		kfree_skb(ema_beacons->bcn[i].skb);
+
+	kfree(ema_beacons);
+}
+EXPORT_SYMBOL(ieee80211_beacon_free_ema_list);
+
+struct ieee80211_ema_beacons *
+ieee80211_beacon_get_template_ema_list(struct ieee80211_hw *hw,
+				       struct ieee80211_vif *vif,
+				       unsigned int link_id)
+{
+	struct ieee80211_ema_beacons *ema_beacons = NULL;
+
+	WARN_ON(__ieee80211_beacon_get(hw, vif, NULL, false, link_id, 0,
+				       &ema_beacons));
+
+	return ema_beacons;
+}
+EXPORT_SYMBOL(ieee80211_beacon_get_template_ema_list);
+
 struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 					 struct ieee80211_vif *vif,
 					 u16 *tim_offset, u16 *tim_length,
@@ -5421,7 +5531,9 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw,
 {
 	struct ieee80211_mutable_offsets offs = {};
 	struct sk_buff *bcn = __ieee80211_beacon_get(hw, vif, &offs, false,
-						     link_id);
+						     link_id,
+						     IEEE80211_INCLUDE_ALL_MBSSID_ELEMS,
+						     NULL);
 	struct sk_buff *copy;
 	int shift;
 
-- 
cgit v1.2.3


From 6f56ad1b92328997e1b1792047099df6f8d7acb5 Mon Sep 17 00:00:00 2001
From: Jeremy Sowden <jeremy@azazel.net>
Date: Wed, 15 Mar 2023 21:48:01 +0000
Subject: netfilter: nft_redir: use `struct nf_nat_range2` throughout and
 deduplicate eval call-backs

`nf_nat_redirect_ipv4` takes a `struct nf_nat_ipv4_multi_range_compat`,
but converts it internally to a `struct nf_nat_range2`.  Change the
function to take the latter, factor out the code now shared with
`nf_nat_redirect_ipv6`, move the conversion to the xt_REDIRECT module,
and update the ipv4 range initialization in the nft_redir module.

Replace a bare hex constant for 127.0.0.1 with a macro.

Remove `WARN_ON`.  `nf_nat_setup_info` calls `nf_ct_is_confirmed`:

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

This means that `ct` cannot be null or the kernel will crash, and
implies that `ctinfo` is `IP_CT_NEW` or `IP_CT_RELATED`.

nft_redir has separate ipv4 and ipv6 call-backs which share much of
their code, and an inet one containing a switch that calls one of
the others based on the family of the packet.  Merge the ipv4 and ipv6
ones into the inet one in order to get rid of the duplicate code.

Const-qualify the `priv` pointer since we don't need to write through
it.

Assign `priv->flags` to the range instead of OR-ing it in.

Set the `NF_NAT_RANGE_PROTO_SPECIFIED` flag once during init, rather
than on every eval.

Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_nat_redirect.h |  3 +-
 net/netfilter/nf_nat_redirect.c         | 71 +++++++++++++---------------
 net/netfilter/nft_redir.c               | 84 ++++++++++++---------------------
 net/netfilter/xt_REDIRECT.c             | 10 +++-
 4 files changed, 72 insertions(+), 96 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h
index 2418653a66db..279380de904c 100644
--- a/include/net/netfilter/nf_nat_redirect.h
+++ b/include/net/netfilter/nf_nat_redirect.h
@@ -6,8 +6,7 @@
 #include <uapi/linux/netfilter/nf_nat.h>
 
 unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
-		     const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum);
 unsigned int
 nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index f91579c821e9..6616ba5d0b04 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -10,6 +10,7 @@
 
 #include <linux/if.h>
 #include <linux/inetdevice.h>
+#include <linux/in.h>
 #include <linux/ip.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
@@ -24,54 +25,56 @@
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_redirect.h>
 
+static unsigned int
+nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range,
+		const union nf_inet_addr *newdst)
+{
+	struct nf_nat_range2 newrange;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+	memset(&newrange, 0, sizeof(newrange));
+
+	newrange.flags		= range->flags | NF_NAT_RANGE_MAP_IPS;
+	newrange.min_addr	= *newdst;
+	newrange.max_addr	= *newdst;
+	newrange.min_proto	= range->min_proto;
+	newrange.max_proto	= range->max_proto;
+
+	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+}
+
 unsigned int
-nf_nat_redirect_ipv4(struct sk_buff *skb,
-		     const struct nf_nat_ipv4_multi_range_compat *mr,
+nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum)
 {
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	__be32 newdst;
-	struct nf_nat_range2 newrange;
+	union nf_inet_addr newdst = {};
 
 	WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
 		hooknum != NF_INET_LOCAL_OUT);
 
-	ct = nf_ct_get(skb, &ctinfo);
-	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
-
 	/* Local packets: make them go to loopback */
 	if (hooknum == NF_INET_LOCAL_OUT) {
-		newdst = htonl(0x7F000001);
+		newdst.ip = htonl(INADDR_LOOPBACK);
 	} else {
 		const struct in_device *indev;
 
-		newdst = 0;
-
 		indev = __in_dev_get_rcu(skb->dev);
 		if (indev) {
 			const struct in_ifaddr *ifa;
 
 			ifa = rcu_dereference(indev->ifa_list);
 			if (ifa)
-				newdst = ifa->ifa_local;
+				newdst.ip = ifa->ifa_local;
 		}
 
-		if (!newdst)
+		if (!newdst.ip)
 			return NF_DROP;
 	}
 
-	/* Transfer from original range. */
-	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
-	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
-	newrange.flags	     = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
-	newrange.min_addr.ip = newdst;
-	newrange.max_addr.ip = newdst;
-	newrange.min_proto   = mr->range[0].min;
-	newrange.max_proto   = mr->range[0].max;
-
-	/* Hand modified range to generic setup. */
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+	return nf_nat_redirect(skb, range, &newdst);
 }
 EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
 
@@ -81,14 +84,10 @@ unsigned int
 nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		     unsigned int hooknum)
 {
-	struct nf_nat_range2 newrange;
-	struct in6_addr newdst;
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
+	union nf_inet_addr newdst = {};
 
-	ct = nf_ct_get(skb, &ctinfo);
 	if (hooknum == NF_INET_LOCAL_OUT) {
-		newdst = loopback_addr;
+		newdst.in6 = loopback_addr;
 	} else {
 		struct inet6_dev *idev;
 		struct inet6_ifaddr *ifa;
@@ -98,7 +97,7 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		if (idev != NULL) {
 			read_lock_bh(&idev->lock);
 			list_for_each_entry(ifa, &idev->addr_list, if_list) {
-				newdst = ifa->addr;
+				newdst.in6 = ifa->addr;
 				addr = true;
 				break;
 			}
@@ -109,12 +108,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 			return NF_DROP;
 	}
 
-	newrange.flags		= range->flags | NF_NAT_RANGE_MAP_IPS;
-	newrange.min_addr.in6	= newdst;
-	newrange.max_addr.in6	= newdst;
-	newrange.min_proto	= range->min_proto;
-	newrange.max_proto	= range->max_proto;
-
-	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
+	return nf_nat_redirect(skb, range, &newdst);
 }
 EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 67cec56bc84a..a70196ffcb1e 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -64,6 +64,8 @@ static int nft_redir_init(const struct nft_ctx *ctx,
 		} else {
 			priv->sreg_proto_max = priv->sreg_proto_min;
 		}
+
+		priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
 	}
 
 	if (tb[NFTA_REDIR_FLAGS]) {
@@ -99,25 +101,37 @@ nla_put_failure:
 	return -1;
 }
 
-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
+static void nft_redir_eval(const struct nft_expr *expr,
+			   struct nft_regs *regs,
+			   const struct nft_pktinfo *pkt)
 {
-	struct nft_redir *priv = nft_expr_priv(expr);
-	struct nf_nat_ipv4_multi_range_compat mr;
+	const struct nft_redir *priv = nft_expr_priv(expr);
+	struct nf_nat_range2 range;
 
-	memset(&mr, 0, sizeof(mr));
+	memset(&range, 0, sizeof(range));
+	range.flags = priv->flags;
 	if (priv->sreg_proto_min) {
-		mr.range[0].min.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_min]);
-		mr.range[0].max.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_max]);
-		mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto.all = (__force __be16)
+			nft_reg_load16(&regs->data[priv->sreg_proto_min]);
+		range.max_proto.all = (__force __be16)
+			nft_reg_load16(&regs->data[priv->sreg_proto_max]);
 	}
 
-	mr.range[0].flags |= priv->flags;
-
-	regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range,
+							  nft_hook(pkt));
+		break;
+#ifdef CONFIG_NF_TABLES_IPV6
+	case NFPROTO_IPV6:
+		regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
+							  nft_hook(pkt));
+		break;
+#endif
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
 }
 
 static void
@@ -130,7 +144,7 @@ static struct nft_expr_type nft_redir_ipv4_type;
 static const struct nft_expr_ops nft_redir_ipv4_ops = {
 	.type		= &nft_redir_ipv4_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_redir)),
-	.eval		= nft_redir_ipv4_eval,
+	.eval		= nft_redir_eval,
 	.init		= nft_redir_init,
 	.destroy	= nft_redir_ipv4_destroy,
 	.dump		= nft_redir_dump,
@@ -148,28 +162,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
 };
 
 #ifdef CONFIG_NF_TABLES_IPV6
-static void nft_redir_ipv6_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
-{
-	struct nft_redir *priv = nft_expr_priv(expr);
-	struct nf_nat_range2 range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_min]);
-		range.max_proto.all = (__force __be16)nft_reg_load16(
-			&regs->data[priv->sreg_proto_max]);
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	range.flags |= priv->flags;
-
-	regs->verdict.code =
-		nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
-}
-
 static void
 nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
@@ -180,7 +172,7 @@ static struct nft_expr_type nft_redir_ipv6_type;
 static const struct nft_expr_ops nft_redir_ipv6_ops = {
 	.type		= &nft_redir_ipv6_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_redir)),
-	.eval		= nft_redir_ipv6_eval,
+	.eval		= nft_redir_eval,
 	.init		= nft_redir_init,
 	.destroy	= nft_redir_ipv6_destroy,
 	.dump		= nft_redir_dump,
@@ -199,20 +191,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
 #endif
 
 #ifdef CONFIG_NF_TABLES_INET
-static void nft_redir_inet_eval(const struct nft_expr *expr,
-				struct nft_regs *regs,
-				const struct nft_pktinfo *pkt)
-{
-	switch (nft_pf(pkt)) {
-	case NFPROTO_IPV4:
-		return nft_redir_ipv4_eval(expr, regs, pkt);
-	case NFPROTO_IPV6:
-		return nft_redir_ipv6_eval(expr, regs, pkt);
-	}
-
-	WARN_ON_ONCE(1);
-}
-
 static void
 nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
@@ -223,7 +201,7 @@ static struct nft_expr_type nft_redir_inet_type;
 static const struct nft_expr_ops nft_redir_inet_ops = {
 	.type		= &nft_redir_inet_type,
 	.size		= NFT_EXPR_SIZE(sizeof(struct nft_redir)),
-	.eval		= nft_redir_inet_eval,
+	.eval		= nft_redir_eval,
 	.init		= nft_redir_init,
 	.destroy	= nft_redir_inet_destroy,
 	.dump		= nft_redir_dump,
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 353ca7801251..ff66b56a3f97 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
 	nf_ct_netns_put(par->net, par->family);
 }
 
-/* FIXME: Take multiple ranges --RR */
 static int redirect_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
@@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
 static unsigned int
 redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
-	return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
+	const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+	struct nf_nat_range2 range = {
+		.flags       = mr->range[0].flags,
+		.min_proto   = mr->range[0].min,
+		.max_proto   = mr->range[0].max,
+	};
+
+	return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
 }
 
 static struct xt_target redirect_tg_reg[] __read_mostly = {
-- 
cgit v1.2.3


From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001
From: JP Kobryn <inwardvessel@gmail.com>
Date: Wed, 22 Mar 2023 12:47:54 -0700
Subject: bpf: return long from bpf_map_ops funcs

This patch changes the return types of bpf_map_ops functions to long, where
previously int was returned. Using long allows for bpf programs to maintain
the sign bit in the absence of sign extension during situations where
inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative
error is returned.

The definitions of the helper funcs are generated from comments in the bpf
uapi header at `include/uapi/linux/bpf.h`. The return type of these
helpers was previously changed from int to long in commit bdb7b79b4ce8. For
any case where one of the map helpers call the bpf_map_ops funcs that are
still returning 32-bit int, a compiler might not include sign extension
instructions to properly convert the 32-bit negative value to a 64-bit
negative value.

For example:
bpf assembly excerpt of an inlined helper calling a kernel function and
checking for a specific error:

; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST);
  ...
  46:	call   0xffffffffe103291c	; htab_map_update_elem
; if (err && err != -EEXIST) {
  4b:	cmp    $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax

kernel function assembly excerpt of return value from
`htab_map_update_elem` returning 32-bit int:

movl $0xffffffef, %r9d
...
movl %r9d, %eax

...results in the comparison:
cmp $0xffffffffffffffef, $0x00000000ffffffef

Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long")
Tested-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: JP Kobryn <inwardvessel@gmail.com>
Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            | 14 +++++++-------
 include/linux/filter.h         |  6 +++---
 kernel/bpf/arraymap.c          | 12 ++++++------
 kernel/bpf/bloom_filter.c      | 12 ++++++------
 kernel/bpf/bpf_cgrp_storage.c  |  6 +++---
 kernel/bpf/bpf_inode_storage.c |  6 +++---
 kernel/bpf/bpf_struct_ops.c    |  6 +++---
 kernel/bpf/bpf_task_storage.c  |  6 +++---
 kernel/bpf/cpumap.c            |  8 ++++----
 kernel/bpf/devmap.c            | 24 ++++++++++++------------
 kernel/bpf/hashtab.c           | 36 ++++++++++++++++++------------------
 kernel/bpf/local_storage.c     |  6 +++---
 kernel/bpf/lpm_trie.c          |  6 +++---
 kernel/bpf/queue_stack_maps.c  | 22 +++++++++++-----------
 kernel/bpf/reuseport_array.c   |  2 +-
 kernel/bpf/ringbuf.c           |  6 +++---
 kernel/bpf/stackmap.c          |  6 +++---
 kernel/bpf/verifier.c          | 14 +++++++-------
 net/core/bpf_sk_storage.c      |  6 +++---
 net/core/sock_map.c            |  8 ++++----
 net/xdp/xskmap.c               |  8 ++++----
 21 files changed, 110 insertions(+), 110 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3ef98fb92987..ec0df059f562 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -96,11 +96,11 @@ struct bpf_map_ops {
 
 	/* funcs callable from userspace and from eBPF programs */
 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
-	int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
-	int (*map_delete_elem)(struct bpf_map *map, void *key);
-	int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
-	int (*map_pop_elem)(struct bpf_map *map, void *value);
-	int (*map_peek_elem)(struct bpf_map *map, void *value);
+	long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
+	long (*map_delete_elem)(struct bpf_map *map, void *key);
+	long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
+	long (*map_pop_elem)(struct bpf_map *map, void *value);
+	long (*map_peek_elem)(struct bpf_map *map, void *value);
 	void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu);
 
 	/* funcs called by prog_array and perf_event_array map */
@@ -139,7 +139,7 @@ struct bpf_map_ops {
 	struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
 
 	/* Misc helpers.*/
-	int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
+	long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags);
 
 	/* map_meta_equal must be implemented for maps that can be
 	 * used as an inner map.  It is a runtime check to ensure
@@ -157,7 +157,7 @@ struct bpf_map_ops {
 	int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 					      struct bpf_func_state *caller,
 					      struct bpf_func_state *callee);
-	int (*map_for_each_callback)(struct bpf_map *map,
+	long (*map_for_each_callback)(struct bpf_map *map,
 				     bpf_callback_t callback_fn,
 				     void *callback_ctx, u64 flags);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index efa5d4a1677e..23c08c31bea9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
-static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
-						  u64 flags, const u64 flag_mask,
-						  void *lookup_elem(struct bpf_map *map, u32 key))
+static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
+						   u64 flags, const u64 flag_mask,
+						   void *lookup_elem(struct bpf_map *map, u32 key))
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1588c793a715..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
 }
 
 /* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
-				 u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
@@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 }
 
 /* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
@@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_array_map_info),
 };
 
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
-				   void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				    void *callback_ctx, u64 flags)
 {
 	u32 i, key, num_elems = 0;
 	struct bpf_array *array;
@@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 	return 0;
 }
 
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	void *old_ptr;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 6350c5d35a9b..db19784601a7 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -41,7 +41,7 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
 	return h & bloom->bitset_mask;
 }
 
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
 {
 	struct bpf_bloom_filter *bloom =
 		container_of(map, struct bpf_bloom_filter, map);
@@ -56,7 +56,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
 	return 0;
 }
 
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
 {
 	struct bpf_bloom_filter *bloom =
 		container_of(map, struct bpf_bloom_filter, map);
@@ -73,12 +73,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
 	return 0;
 }
 
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
 }
 
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
 }
@@ -177,8 +177,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(-EINVAL);
 }
 
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
-				 void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+				  void *value, u64 flags)
 {
 	/* The eBPF program should use map_push_elem instead */
 	return -EINVAL;
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c975cacdd16b..f5b016a5484d 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -93,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
 	return sdata ? sdata->data : NULL;
 }
 
-static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
-					  void *value, u64 map_flags)
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+					 void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct cgroup *cgroup;
@@ -125,7 +125,7 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
 	return 0;
 }
 
-static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct cgroup *cgroup;
 	int err, fd;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index ad2ab0187e45..9a5f05151898 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -91,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
 	return sdata ? sdata->data : NULL;
 }
 
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct file *f;
@@ -127,7 +127,7 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
 	return 0;
 }
 
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct file *f;
 	int fd, err;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 38903fb52f98..ba7a94276e3b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -349,8 +349,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
 					   model, flags, tlinks, NULL);
 }
 
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
-					  void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+					   void *value, u64 flags)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	const struct bpf_struct_ops *st_ops = st_map->st_ops;
@@ -524,7 +524,7 @@ unlock:
 	return err;
 }
 
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 {
 	enum bpf_struct_ops_state prev_state;
 	struct bpf_struct_ops_map *st_map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index c88cc04c17c1..ab5bd1ef58c4 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -120,8 +120,8 @@ out:
 	return ERR_PTR(err);
 }
 
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
-					    void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+					     void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct task_struct *task;
@@ -173,7 +173,7 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
 	return 0;
 }
 
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct task_struct *task;
 	unsigned int f_flags;
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 871809e71b4e..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
 	}
 }
 
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	u32 key_cpu = *(u32 *)key;
@@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
 	struct bpf_cpumap_val cpumap_value = {};
@@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
-static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __cpu_map_lookup_elem);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 19b036a228f7..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
 	kfree(dev);
 }
 
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
@@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *old_dev;
@@ -897,8 +897,8 @@ err_out:
 	return ERR_PTR(-EINVAL);
 }
 
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
-				 void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+				  void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
@@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 	return 0;
 }
 
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	return __dev_map_update_elem(current->nsproxy->net_ns,
 				     map, key, value, map_flags);
 }
 
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
-				     void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+				       void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
@@ -999,21 +999,21 @@ out_err:
 	return err;
 }
 
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
-				   u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+				     u64 map_flags)
 {
 	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
 					 map, key, value, map_flags);
 }
 
-static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, ifindex, flags,
 				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
 				      __dev_map_lookup_elem);
 }
 
-static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, ifindex, flags,
 				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0df4b0c10f59..96b645bba3a4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1073,8 +1073,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 }
 
 /* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
-				u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1175,8 +1175,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
 	bpf_lru_push_free(&htab->lru, &elem->lru_node);
 }
 
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
-				    u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+				     u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old = NULL;
@@ -1242,9 +1242,9 @@ err:
 	return ret;
 }
 
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags,
-					 bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 map_flags,
+					  bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1297,9 +1297,9 @@ err:
 	return ret;
 }
 
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
-					     void *value, u64 map_flags,
-					     bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					      void *value, u64 map_flags,
+					      bool onallcpus)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new = NULL, *l_old;
@@ -1364,21 +1364,21 @@ err:
 	return ret;
 }
 
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+					void *value, u64 map_flags)
 {
 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
 }
 
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
-					   void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+					    void *value, u64 map_flags)
 {
 	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
 						 false);
 }
 
 /* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
@@ -1414,7 +1414,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
@@ -2134,8 +2134,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_hash_map_info),
 };
 
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
-				  void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+				   void *callback_ctx, u64 flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_nulls_head *head;
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a993560f200a..4c7bbec4a9e4 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
 	return &READ_ONCE(storage->buf)->data[0];
 }
 
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
-				      void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 flags)
 {
 	struct bpf_cgroup_storage *storage;
 	struct bpf_storage_buffer *new;
@@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
 	bpf_map_area_free(map);
 }
 
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index dc23f2ac9cde..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 }
 
 /* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
-			    void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+			     void *_key, void *value, u64 flags)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -431,7 +431,7 @@ out:
 }
 
 /* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key;
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 63ecbbcb349d..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
 	bpf_map_area_free(qs);
 }
 
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
@@ -124,7 +124,7 @@ out:
 }
 
 
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
@@ -156,32 +156,32 @@ out:
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return __queue_map_get(map, value, false);
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return __stack_map_get(map, value, false);
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return __queue_map_get(map, value, true);
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
 {
 	return __stack_map_get(map, value, true);
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
-				     u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+				      u64 flags)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long irq_flags;
@@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
-				       void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+					void *value, u64 flags)
 {
 	return -EINVAL;
 }
 
 /* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 71cb72f5b733..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
 }
 
 /* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
 {
 	struct reuseport_array *array = reuseport_array(map);
 	u32 index = *(u32 *)key;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0d2a45ff83f1..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
-				   u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+				    u64 flags)
 {
 	return -ENOTSUPP;
 }
 
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
 {
 	return -ENOTSUPP;
 }
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 0f1d8dced933..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
 	return 0;
 }
 
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
-				 u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+				  u64 map_flags)
 {
 	return -EINVAL;
 }
 
 /* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct stack_map_bucket *old_bucket;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5693e4a92752..50c995697f0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -17692,21 +17692,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
 				     (void *(*)(struct bpf_map *map, void *key))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
-				     (int (*)(struct bpf_map *map, void *key))NULL));
+				     (long (*)(struct bpf_map *map, void *key))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_update_elem,
-				     (int (*)(struct bpf_map *map, void *key, void *value,
+				     (long (*)(struct bpf_map *map, void *key, void *value,
 					      u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_push_elem,
-				     (int (*)(struct bpf_map *map, void *value,
+				     (long (*)(struct bpf_map *map, void *value,
 					      u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
-				     (int (*)(struct bpf_map *map, void *value))NULL));
+				     (long (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
-				     (int (*)(struct bpf_map *map, void *value))NULL));
+				     (long (*)(struct bpf_map *map, void *value))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_redirect,
-				     (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+				     (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
 			BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
-				     (int (*)(struct bpf_map *map,
+				     (long (*)(struct bpf_map *map,
 					      bpf_callback_t callback_fn,
 					      void *callback_ctx,
 					      u64 flags))NULL));
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 24c3dc0d62e5..cb0f5a105b89 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -94,8 +94,8 @@ static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
 	return ERR_PTR(err);
 }
 
-static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
-					 void *value, u64 map_flags)
+static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
+					  void *value, u64 map_flags)
 {
 	struct bpf_local_storage_data *sdata;
 	struct socket *sock;
@@ -114,7 +114,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
 	return err;
 }
 
-static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
 {
 	struct socket *sock;
 	int fd, err;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 9b854e236d23..7c189c2e2fbf 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -437,7 +437,7 @@ static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
 	__sock_map_delete(stab, sk, link_raw);
 }
 
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
+static long sock_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	u32 i = *(u32 *)key;
@@ -587,8 +587,8 @@ out:
 	return ret;
 }
 
-static int sock_map_update_elem(struct bpf_map *map, void *key,
-				void *value, u64 flags)
+static long sock_map_update_elem(struct bpf_map *map, void *key,
+				 void *value, u64 flags)
 {
 	struct sock *sk = (struct sock *)value;
 	int ret;
@@ -925,7 +925,7 @@ static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
 	raw_spin_unlock_bh(&bucket->lock);
 }
 
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+static long sock_hash_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
 	u32 hash, key_size = map->key_size;
diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c
index 0c38d7175922..2c1427074a3b 100644
--- a/net/xdp/xskmap.c
+++ b/net/xdp/xskmap.c
@@ -162,8 +162,8 @@ static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
-			       u64 map_flags)
+static long xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	struct xdp_sock __rcu **map_entry;
@@ -223,7 +223,7 @@ out:
 	return err;
 }
 
-static int xsk_map_delete_elem(struct bpf_map *map, void *key)
+static long xsk_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	struct xdp_sock __rcu **map_entry;
@@ -243,7 +243,7 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
-static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags)
 {
 	return __bpf_xdp_redirect_map(map, index, flags, 0,
 				      __xsk_map_lookup_elem);
-- 
cgit v1.2.3


From 1cc6571f562774f1d928dc8b3cff50829b86e970 Mon Sep 17 00:00:00 2001
From: Nick Child <nnac123@linux.ibm.com>
Date: Tue, 21 Mar 2023 10:07:25 -0500
Subject: netdev: Enforce index cap in netdev_get_tx_queue

When requesting a TX queue at a given index, warn on out-of-bounds
referencing if the index is greater than or equal to the allocated
number of queues.

Specifically, since this function is used heavily in the networking
stack use DEBUG_NET_WARN_ON_ONCE to avoid executing a new branch on
every packet.

Signed-off-by: Nick Child <nnac123@linux.ibm.com>
Link: https://lore.kernel.org/r/20230321150725.127229-2-nnac123@linux.ibm.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7621c512765f..674ee5daa7b1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2484,6 +2484,7 @@ static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
 {
+	DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues);
 	return &dev->_tx[index];
 }
 
-- 
cgit v1.2.3


From 4ee9b0dcf09f426fbad7ed132d73ea2ba379d8ee Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 21 Mar 2023 15:58:54 +0000
Subject: net: phylink: remove an_enabled

The Autoneg bit in the advertising bitmap and state->an_enabled are
always identical. state->an_enabled is now no longer used by any
drivers, so let's kill this duplication.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 37 +++++++++++++++++--------------------
 include/linux/phylink.h   |  2 --
 2 files changed, 17 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 1a2f074685fa..f7da96f0c75b 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -843,7 +843,6 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode)
 		phylink_set(pl->supported, Autoneg);
 		phylink_set(pl->supported, Asym_Pause);
 		phylink_set(pl->supported, Pause);
-		pl->link_config.an_enabled = true;
 		pl->cfg_link_an_mode = MLO_AN_INBAND;
 
 		switch (pl->link_config.interface) {
@@ -945,9 +944,6 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode)
 				    "failed to validate link configuration for in-band status\n");
 			return -EINVAL;
 		}
-
-		/* Check if MAC/PCS also supports Autoneg. */
-		pl->link_config.an_enabled = phylink_test(pl->supported, Autoneg);
 	}
 
 	return 0;
@@ -957,7 +953,8 @@ static void phylink_apply_manual_flow(struct phylink *pl,
 				      struct phylink_link_state *state)
 {
 	/* If autoneg is disabled, pause AN is also disabled */
-	if (!state->an_enabled)
+	if (!linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			       state->advertising))
 		state->pause &= ~MLO_PAUSE_AN;
 
 	/* Manual configuration of pause modes */
@@ -997,21 +994,22 @@ static void phylink_mac_config(struct phylink *pl,
 			       const struct phylink_link_state *state)
 {
 	phylink_dbg(pl,
-		    "%s: mode=%s/%s/%s/%s/%s adv=%*pb pause=%02x link=%u an=%u\n",
+		    "%s: mode=%s/%s/%s/%s/%s adv=%*pb pause=%02x link=%u\n",
 		    __func__, phylink_an_mode_str(pl->cur_link_an_mode),
 		    phy_modes(state->interface),
 		    phy_speed_to_str(state->speed),
 		    phy_duplex_to_str(state->duplex),
 		    phy_rate_matching_to_str(state->rate_matching),
 		    __ETHTOOL_LINK_MODE_MASK_NBITS, state->advertising,
-		    state->pause, state->link, state->an_enabled);
+		    state->pause, state->link);
 
 	pl->mac_ops->mac_config(pl->config, pl->cur_link_an_mode, state);
 }
 
 static void phylink_mac_pcs_an_restart(struct phylink *pl)
 {
-	if (pl->link_config.an_enabled &&
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			      pl->link_config.advertising) &&
 	    phy_interface_mode_is_8023z(pl->link_config.interface) &&
 	    phylink_autoneg_inband(pl->cur_link_an_mode)) {
 		if (pl->pcs)
@@ -1138,9 +1136,9 @@ static void phylink_mac_pcs_get_state(struct phylink *pl,
 	linkmode_copy(state->advertising, pl->link_config.advertising);
 	linkmode_zero(state->lp_advertising);
 	state->interface = pl->link_config.interface;
-	state->an_enabled = pl->link_config.an_enabled;
 	state->rate_matching = pl->link_config.rate_matching;
-	if (state->an_enabled) {
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			      state->advertising)) {
 		state->speed = SPEED_UNKNOWN;
 		state->duplex = DUPLEX_UNKNOWN;
 		state->pause = MLO_PAUSE_NONE;
@@ -1531,7 +1529,6 @@ struct phylink *phylink_create(struct phylink_config *config,
 	pl->link_config.pause = MLO_PAUSE_AN;
 	pl->link_config.speed = SPEED_UNKNOWN;
 	pl->link_config.duplex = DUPLEX_UNKNOWN;
-	pl->link_config.an_enabled = true;
 	pl->mac_ops = mac_ops;
 	__set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
 	timer_setup(&pl->link_poll, phylink_fixed_poll, 0);
@@ -2136,8 +2133,9 @@ static void phylink_get_ksettings(const struct phylink_link_state *state,
 		kset->base.speed = state->speed;
 		kset->base.duplex = state->duplex;
 	}
-	kset->base.autoneg = state->an_enabled ? AUTONEG_ENABLE :
-				AUTONEG_DISABLE;
+	kset->base.autoneg = linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					       state->advertising) ?
+				AUTONEG_ENABLE : AUTONEG_DISABLE;
 }
 
 /**
@@ -2284,9 +2282,8 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
 	/* We have ruled out the case with a PHY attached, and the
 	 * fixed-link cases.  All that is left are in-band links.
 	 */
-	config.an_enabled = kset->base.autoneg == AUTONEG_ENABLE;
 	linkmode_mod_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, config.advertising,
-			 config.an_enabled);
+			 kset->base.autoneg == AUTONEG_ENABLE);
 
 	/* If this link is with an SFP, ensure that changes to advertised modes
 	 * also cause the associated interface to be selected such that the
@@ -2320,13 +2317,14 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
 	}
 
 	/* If autonegotiation is enabled, we must have an advertisement */
-	if (config.an_enabled && phylink_is_empty_linkmode(config.advertising))
+	if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+			      config.advertising) &&
+	    phylink_is_empty_linkmode(config.advertising))
 		return -EINVAL;
 
 	mutex_lock(&pl->state_mutex);
 	pl->link_config.speed = config.speed;
 	pl->link_config.duplex = config.duplex;
-	pl->link_config.an_enabled = config.an_enabled;
 
 	if (pl->link_config.interface != config.interface) {
 		/* The interface changed, e.g. 1000base-X <-> 2500base-X */
@@ -2932,7 +2930,6 @@ static int phylink_sfp_config_phy(struct phylink *pl, u8 mode,
 	config.speed = SPEED_UNKNOWN;
 	config.duplex = DUPLEX_UNKNOWN;
 	config.pause = MLO_PAUSE_AN;
-	config.an_enabled = pl->link_config.an_enabled;
 
 	/* Ignore errors if we're expecting a PHY to attach later */
 	ret = phylink_validate(pl, support, &config);
@@ -3001,7 +2998,6 @@ static int phylink_sfp_config_optical(struct phylink *pl)
 	config.speed = SPEED_UNKNOWN;
 	config.duplex = DUPLEX_UNKNOWN;
 	config.pause = MLO_PAUSE_AN;
-	config.an_enabled = true;
 
 	/* For all the interfaces that are supported, reduce the sfp_support
 	 * mask to only those link modes that can be supported.
@@ -3300,7 +3296,8 @@ void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state,
 	/* If there is no link or autonegotiation is disabled, the LP advertisement
 	 * data is not meaningful, so don't go any further.
 	 */
-	if (!state->link || !state->an_enabled)
+	if (!state->link || !linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT,
+					       state->advertising))
 		return;
 
 	switch (state->interface) {
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index c492c26202b5..9ff56b050584 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -93,7 +93,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode)
  *   the medium link mode (@speed and @duplex) and the speed/duplex of the phy
  *   interface mode (@interface) are different.
  * @link: true if the link is up.
- * @an_enabled: true if autonegotiation is enabled/desired.
  * @an_complete: true if autonegotiation has completed.
  */
 struct phylink_link_state {
@@ -105,7 +104,6 @@ struct phylink_link_state {
 	int pause;
 	int rate_matching;
 	unsigned int link:1;
-	unsigned int an_enabled:1;
 	unsigned int an_complete:1;
 };
 
-- 
cgit v1.2.3


From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:23:58 -0700
Subject: bpf: Retire the struct_ops map kvalue->refcnt.

We have replaced kvalue->refcnt with synchronize_rcu() to wait for an
RCU grace period.

Maintenance of kvalue->refcnt was a complicated task, as we had to
simultaneously keep track of two reference counts: kvalue->refcnt
itself and the reference count of bpf_map. When kvalue->refcnt
reaches zero, we also have to reduce the reference count on bpf_map -
yet these steps are not performed in an atomic manner and require us
to be vigilant when managing them. By eliminating kvalue->refcnt, we
can make our maintenance more straightforward, as the refcount of
bpf_map is now the only one that has to be managed!

To prevent the trampoline image of a struct_ops from being released
while it is still in use, we wait for an RCU grace period. The
setsockopt(TCP_CONGESTION, "...") command allows you to change your
socket's congestion control algorithm and can result in releasing the
old struct_ops implementation. That is fine. However, because this
function is also exposed through bpf_setsockopt(), it may be called by
BPF programs as well. To ensure that the trampoline image belonging to
a struct_ops can be safely called while its method is in use, the
trampoline safeguards the BPF program with rcu_read_lock(). Doing so
prevents any destruction of the associated images before returning
from a trampoline and requires us to wait for an RCU grace period.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h         |  1 +
 kernel/bpf/bpf_struct_ops.c | 77 ++++++++++++++++++++++++++-------------------
 kernel/bpf/syscall.c        |  6 ++--
 3 files changed, 49 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ec0df059f562..f04098468d7a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
 struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ba7a94276e3b..2f3c4a0e03ee 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,6 +11,7 @@
 #include <linux/refcount.h>
 #include <linux/mutex.h>
 #include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
 
 enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
@@ -249,6 +250,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	enum bpf_struct_ops_state state;
+	s64 refcnt;
 
 	if (unlikely(*(u32 *)key != 0))
 		return -ENOENT;
@@ -267,7 +269,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
 	uvalue = value;
 	memcpy(uvalue, st_map->uvalue, map->value_size);
 	uvalue->state = state;
-	refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+	/* This value offers the user space a general estimate of how
+	 * many sockets are still utilizing this struct_ops for TCP
+	 * congestion control. The number might not be exact, but it
+	 * should sufficiently meet our present goals.
+	 */
+	refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+	refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
 
 	return 0;
 }
@@ -491,7 +500,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	refcount_set(&kvalue->refcnt, 1);
 	bpf_map_inc(map);
 
 	set_memory_rox((long)st_map->image, 1);
@@ -536,8 +544,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
 		st_map->st_ops->unreg(&st_map->kvalue.data);
-		if (refcount_dec_and_test(&st_map->kvalue.refcnt))
-			bpf_map_put(map);
+		bpf_map_put(map);
 		return 0;
 	case BPF_STRUCT_OPS_STATE_TOBEFREE:
 		return -EINPROGRESS;
@@ -570,7 +577,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
 	kfree(value);
 }
 
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 
@@ -582,6 +589,28 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 	bpf_map_area_free(st_map);
 }
 
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+	/* The struct_ops's function may switch to another struct_ops.
+	 *
+	 * For example, bpf_tcp_cc_x->init() may switch to
+	 * another tcp_cc_y by calling
+	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
+	 * and its refcount may reach 0 which then free its
+	 * trampoline image while tcp_cc_x is still running.
+	 *
+	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+	 * to finish. bpf-tcp-cc prog is non sleepable.
+	 * A rcu_tasks gp is to wait for the last few insn
+	 * in the trampoline image to finish before releasing
+	 * the trampoline image.
+	 */
+	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+	__bpf_struct_ops_map_free(map);
+}
+
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
@@ -630,7 +659,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 				   NUMA_NO_NODE);
 	st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
 	if (!st_map->uvalue || !st_map->links || !st_map->image) {
-		bpf_struct_ops_map_free(map);
+		__bpf_struct_ops_map_free(map);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -676,41 +705,23 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 bool bpf_struct_ops_get(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
 
-	return refcount_inc_not_zero(&kvalue->refcnt);
-}
-
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
-{
-	struct bpf_struct_ops_map *st_map;
-
-	st_map = container_of(head, struct bpf_struct_ops_map, rcu);
-	bpf_map_put(&st_map->map);
+	map = __bpf_map_inc_not_zero(&st_map->map, false);
+	return !IS_ERR(map);
 }
 
 void bpf_struct_ops_put(const void *kdata)
 {
 	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
 
 	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
-	if (refcount_dec_and_test(&kvalue->refcnt)) {
-		struct bpf_struct_ops_map *st_map;
-
-		st_map = container_of(kvalue, struct bpf_struct_ops_map,
-				      kvalue);
-		/* The struct_ops's function may switch to another struct_ops.
-		 *
-		 * For example, bpf_tcp_cc_x->init() may switch to
-		 * another tcp_cc_y by calling
-		 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-		 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
-		 * and its map->refcnt may reach 0 which then free its
-		 * trampoline image while tcp_cc_x is still running.
-		 *
-		 * Thus, a rcu grace period is needed here.
-		 */
-		call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
-	}
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	bpf_map_put(&st_map->map);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 099e9068bcdd..cff0348a2871 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1303,8 +1303,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	return map;
 }
 
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
 {
 	int refold;
 
-- 
cgit v1.2.3


From 8fb1a76a0f35c45a424c9eb84b0f97ffd51e6052 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:23:59 -0700
Subject: net: Update an existing TCP congestion control algorithm.

This feature lets you immediately transition to another congestion
control algorithm or implementation with the same name.  Once a name
is updated, new connections will apply this new algorithm.

The purpose is to update a customized algorithm implemented in BPF
struct_ops with a new version on the fly.  The following is an
example of using the userspace API implemented in later BPF patches.

   link = bpf_map__attach_struct_ops(skel->maps.ca_update_1);
   .......
   err = bpf_link__update_map(link, skel->maps.ca_update_2);

We first load and register an algorithm implemented in BPF struct_ops,
then swap it out with a new one using the same name. After that, newly
created connections will apply the updated algorithm, while older ones
retain the previous version already applied.

This patch also takes this chance to refactor the ca validation into
the new tcp_validate_congestion_control() function.

Cc: netdev@vger.kernel.org, Eric Dumazet <edumazet@google.com>
Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-3-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/tcp.h   |  3 +++
 net/ipv4/tcp_cong.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..2abb755e6a3a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1117,6 +1117,9 @@ struct tcp_congestion_ops {
 
 int tcp_register_congestion_control(struct tcp_congestion_ops *type);
 void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+int tcp_update_congestion_control(struct tcp_congestion_ops *type,
+				  struct tcp_congestion_ops *old_type);
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca);
 
 void tcp_assign_congestion_control(struct sock *sk);
 void tcp_init_congestion_control(struct sock *sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index db8b4b488c31..1b34050a7538 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -75,14 +75,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
 	return NULL;
 }
 
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
 {
-	int ret = 0;
-
 	/* all algorithms must implement these */
 	if (!ca->ssthresh || !ca->undo_cwnd ||
 	    !(ca->cong_avoid || ca->cong_control)) {
@@ -90,6 +84,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret;
+
+	ret = tcp_validate_congestion_control(ca);
+	if (ret)
+		return ret;
+
 	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
 
 	spin_lock(&tcp_cong_list_lock);
@@ -130,6 +138,50 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
 }
 EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
 
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, that has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+	struct tcp_congestion_ops *existing;
+	int ret;
+
+	ret = tcp_validate_congestion_control(ca);
+	if (ret)
+		return ret;
+
+	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+	spin_lock(&tcp_cong_list_lock);
+	existing = tcp_ca_find_key(old_ca->key);
+	if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+		pr_notice("%s not registered or non-unique key\n",
+			  ca->name);
+		ret = -EINVAL;
+	} else if (existing != old_ca) {
+		pr_notice("invalid old congestion control algorithm to replace\n");
+		ret = -EINVAL;
+	} else {
+		/* Add the new one before removing the old one to keep
+		 * one implementation available all the time.
+		 */
+		list_add_tail_rcu(&ca->list, &tcp_cong_list);
+		list_del_rcu(&existing->list);
+		pr_debug("%s updated\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	/* Wait for outstanding readers to complete before the
+	 * module or struct_ops gets removed entirely.
+	 */
+	if (!ret)
+		synchronize_rcu();
+
+	return ret;
+}
+
 u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
 {
 	const struct tcp_congestion_ops *ca;
-- 
cgit v1.2.3


From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:24:00 -0700
Subject: bpf: Create links for BPF struct_ops maps.

Make bpf_link support struct_ops.  Previously, struct_ops were always
used alone without any associated links. Upon updating its value, a
struct_ops would be activated automatically. Yet other BPF program
types are required to create a bpf_link for their instances before
they can become active. Now, however, you can create an inactive
struct_ops, and create a link to activate it later.

With bpf_links, struct_ops has a behavior similar to other BPF program
types. You can pin/unpin them from their links and the struct_ops will
be deactivated when its link is removed, whereas previously someone
needed to delete the value for it to be deactivated.

bpf_links are responsible for registering their associated
struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag
set to create a bpf_link, while a struct_ops without this flag behaves
in the same manner as before and is registered upon updating its value.

The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it
used to craft the links for BPF struct_ops programs, but also to
create links for BPF struct_ops themselves.  Since the links of BPF
struct_ops programs are only used to create trampolines internally,
they are never seen in other contexts. Thus, they can be reused for
struct_ops themselves.

To maintain a reference to the map supporting this link, we add
bpf_struct_ops_link as an additional type. The pointer of the map is
RCU and won't be necessary until later in the patchset.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |   7 ++
 include/uapi/linux/bpf.h       |  12 +++-
 kernel/bpf/bpf_struct_ops.c    | 143 ++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           |  23 ++++---
 net/ipv4/bpf_tcp_ca.c          |   8 ++-
 tools/include/uapi/linux/bpf.h |  12 +++-
 6 files changed, 190 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f04098468d7a..8552279efe46 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1518,6 +1518,7 @@ struct bpf_struct_ops {
 			   void *kdata, const void *udata);
 	int (*reg)(void *kdata);
 	void (*unreg)(void *kdata);
+	int (*validate)(void *kdata);
 	const struct btf_type *type;
 	const struct btf_type *value_type;
 	const char *name;
@@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner)
 	else
 		module_put(owner);
 }
+int bpf_struct_ops_link_create(union bpf_attr *attr);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
@@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
 {
 	return -EINVAL;
 }
+static inline int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 13129df937cd..42f40ee083bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6379,6 +6386,9 @@ struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 2f3c4a0e03ee..3d6b5240c25a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -17,6 +17,7 @@ enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
 	BPF_STRUCT_OPS_STATE_INUSE,
 	BPF_STRUCT_OPS_STATE_TOBEFREE,
+	BPF_STRUCT_OPS_STATE_READY,
 };
 
 #define BPF_STRUCT_OPS_COMMON_VALUE			\
@@ -59,6 +60,11 @@ struct bpf_struct_ops_map {
 	struct bpf_struct_ops_value kvalue;
 };
 
+struct bpf_struct_ops_link {
+	struct bpf_link link;
+	struct bpf_map __rcu *map;
+};
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -500,11 +506,29 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		*(unsigned long *)(udata + moff) = prog->aux->id;
 	}
 
-	bpf_map_inc(map);
+	if (st_map->map.map_flags & BPF_F_LINK) {
+		err = st_ops->validate(kdata);
+		if (err)
+			goto reset_unlock;
+		set_memory_rox((long)st_map->image, 1);
+		/* Let bpf_link handle registration & unregistration.
+		 *
+		 * Pair with smp_load_acquire() during lookup_elem().
+		 */
+		smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+		goto unlock;
+	}
 
 	set_memory_rox((long)st_map->image, 1);
 	err = st_ops->reg(kdata);
 	if (likely(!err)) {
+		/* This refcnt increment on the map here after
+		 * 'st_ops->reg()' is secure since the state of the
+		 * map must be set to INIT at this moment, and thus
+		 * bpf_struct_ops_map_delete_elem() can't unregister
+		 * or transition it to TOBEFREE concurrently.
+		 */
+		bpf_map_inc(map);
 		/* Pair with smp_load_acquire() during lookup_elem().
 		 * It ensures the above udata updates (e.g. prog->aux->id)
 		 * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -520,7 +544,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	 */
 	set_memory_nx((long)st_map->image, 1);
 	set_memory_rw((long)st_map->image, 1);
-	bpf_map_put(map);
 
 reset_unlock:
 	bpf_struct_ops_map_put_progs(st_map);
@@ -538,6 +561,9 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 	struct bpf_struct_ops_map *st_map;
 
 	st_map = (struct bpf_struct_ops_map *)map;
+	if (st_map->map.map_flags & BPF_F_LINK)
+		return -EOPNOTSUPP;
+
 	prev_state = cmpxchg(&st_map->kvalue.state,
 			     BPF_STRUCT_OPS_STATE_INUSE,
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
@@ -614,7 +640,7 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 {
 	if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
-	    attr->map_flags || !attr->btf_vmlinux_value_type_id)
+	    (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
 		return -EINVAL;
 	return 0;
 }
@@ -638,6 +664,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
+	if (attr->map_flags & BPF_F_LINK && !st_ops->validate)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	t = st_ops->type;
 
 	st_map_size = sizeof(*st_map) +
@@ -725,3 +754,111 @@ void bpf_struct_ops_put(const void *kdata)
 
 	bpf_map_put(&st_map->map);
 }
+
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+		map->map_flags & BPF_F_LINK &&
+		/* Pair with smp_store_release() during map_update */
+		smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
+
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_struct_ops_map *st_map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	st_map = (struct bpf_struct_ops_map *)
+		rcu_dereference_protected(st_link->map, true);
+	if (st_map) {
+		/* st_link->map can be NULL if
+		 * bpf_struct_ops_link_create() fails to register.
+		 */
+		st_map->st_ops->unreg(&st_map->kvalue.data);
+		bpf_map_put(&st_map->map);
+	}
+	kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+					    struct seq_file *seq)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock();
+	map = rcu_dereference(st_link->map);
+	seq_printf(seq, "map_id:\t%d\n", map->id);
+	rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+					       struct bpf_link_info *info)
+{
+	struct bpf_struct_ops_link *st_link;
+	struct bpf_map *map;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	rcu_read_lock();
+	map = rcu_dereference(st_link->map);
+	info->struct_ops.map_id = map->id;
+	rcu_read_unlock();
+	return 0;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+	.dealloc = bpf_struct_ops_map_link_dealloc,
+	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+	struct bpf_struct_ops_link *link = NULL;
+	struct bpf_link_primer link_primer;
+	struct bpf_struct_ops_map *st_map;
+	struct bpf_map *map;
+	int err;
+
+	map = bpf_map_get(attr->link_create.map_fd);
+	if (!map)
+		return -EINVAL;
+
+	st_map = (struct bpf_struct_ops_map *)map;
+
+	if (!bpf_struct_ops_valid_to_reg(map)) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto err_out;
+
+	err = st_map->st_ops->reg(st_map->kvalue.data);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		link = NULL;
+		goto err_out;
+	}
+	RCU_INIT_POINTER(link->map, map);
+
+	return bpf_link_settle(&link_primer);
+
+err_out:
+	bpf_map_put(map);
+	kfree(link);
+	return err;
+}
+
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cff0348a2871..21f76698875c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2825,16 +2825,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
 	const struct bpf_prog *prog = link->prog;
 	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
 
-	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
 	seq_printf(m,
 		   "link_type:\t%s\n"
-		   "link_id:\t%u\n"
-		   "prog_tag:\t%s\n"
-		   "prog_id:\t%u\n",
+		   "link_id:\t%u\n",
 		   bpf_link_type_strs[link->type],
-		   link->id,
-		   prog_tag,
-		   prog->aux->id);
+		   link->id);
+	if (prog) {
+		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+		seq_printf(m,
+			   "prog_tag:\t%s\n"
+			   "prog_id:\t%u\n",
+			   prog_tag,
+			   prog->aux->id);
+	}
 	if (link->ops->show_fdinfo)
 		link->ops->show_fdinfo(link, m);
 }
@@ -4314,7 +4317,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
 
 	info.type = link->type;
 	info.id = link->id;
-	info.prog_id = link->prog->aux->id;
+	if (link->prog)
+		info.prog_id = link->prog->aux->id;
 
 	if (link->ops->fill_link_info) {
 		err = link->ops->fill_link_info(link, &info);
@@ -4577,6 +4581,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	if (CHECK_ATTR(BPF_LINK_CREATE))
 		return -EINVAL;
 
+	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+		return bpf_struct_ops_link_create(attr);
+
 	prog = bpf_prog_get(attr->link_create.prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 13fc0c185cd9..bbbd5eb94db2 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -239,8 +239,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
 		if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
 				     sizeof(tcp_ca->name)) <= 0)
 			return -EINVAL;
-		if (tcp_ca_find(utcp_ca->name))
-			return -EEXIST;
 		return 1;
 	}
 
@@ -266,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
+static int bpf_tcp_ca_validate(void *kdata)
+{
+	return tcp_validate_congestion_control(kdata);
+}
+
 struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
@@ -273,6 +276,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.check_member = bpf_tcp_ca_check_member,
 	.init_member = bpf_tcp_ca_init_member,
 	.init = bpf_tcp_ca_init,
+	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 13129df937cd..9cf1deaf21f2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1033,6 +1033,7 @@ enum bpf_attach_type {
 	BPF_PERF_EVENT,
 	BPF_TRACE_KPROBE_MULTI,
 	BPF_LSM_CGROUP,
+	BPF_STRUCT_OPS,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1266,6 +1267,9 @@ enum {
 
 /* Create a map that is suitable to be an inner map with dynamic max entries */
 	BPF_F_INNER_MAP		= (1U << 12),
+
+/* Create a map that will be registered/unregesitered by the backed bpf_link */
+	BPF_F_LINK		= (1U << 13),
 };
 
 /* Flags for BPF_PROG_QUERY. */
@@ -1507,7 +1511,10 @@ union bpf_attr {
 	} task_fd_query;
 
 	struct { /* struct used by BPF_LINK_CREATE command */
-		__u32		prog_fd;	/* eBPF program to attach */
+		union {
+			__u32		prog_fd;	/* eBPF program to attach */
+			__u32		map_fd;		/* eBPF struct_ops to attach */
+		};
 		union {
 			__u32		target_fd;	/* object to attach to */
 			__u32		target_ifindex; /* target ifindex */
@@ -6379,6 +6386,9 @@ struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 map_id;
+		} struct_ops;
 	};
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3


From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@meta.com>
Date: Wed, 22 Mar 2023 20:24:02 -0700
Subject: bpf: Update the struct_ops of a bpf_link.

Improve the BPF_LINK_UPDATE command of bpf() so that it allows you
to conveniently switch between different struct_ops on a single
bpf_link. This enables smoother transitions from one struct_ops
to another.

The struct_ops maps passed along with BPF_LINK_UPDATE should have the
BPF_F_LINK flag.

Signed-off-by: Kui-Feng Lee <kuifeng@meta.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h            |  3 +++
 include/uapi/linux/bpf.h       | 21 +++++++++++++-----
 kernel/bpf/bpf_struct_ops.c    | 48 +++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/syscall.c           | 34 ++++++++++++++++++++++++++++++
 net/ipv4/bpf_tcp_ca.c          |  6 ++++++
 tools/include/uapi/linux/bpf.h | 21 +++++++++++++-----
 6 files changed, 122 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8552279efe46..2d8f3f639e68 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1476,6 +1476,8 @@ struct bpf_link_ops {
 	void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq);
 	int (*fill_link_info)(const struct bpf_link *link,
 			      struct bpf_link_info *info);
+	int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
+			  struct bpf_map *old_map);
 };
 
 struct bpf_tramp_link {
@@ -1518,6 +1520,7 @@ struct bpf_struct_ops {
 			   void *kdata, const void *udata);
 	int (*reg)(void *kdata);
 	void (*unreg)(void *kdata);
+	int (*update)(void *kdata, void *old_kdata);
 	int (*validate)(void *kdata);
 	const struct btf_type *type;
 	const struct btf_type *value_type;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 42f40ee083bf..e3d3b5160d26 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1555,12 +1555,23 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
 		__u32		link_fd;	/* link fd */
-		/* new program fd to update link with */
-		__u32		new_prog_fd;
+		union {
+			/* new program fd to update link with */
+			__u32		new_prog_fd;
+			/* new struct_ops map fd to update link with */
+			__u32           new_map_fd;
+		};
 		__u32		flags;		/* extra flags */
-		/* expected link's program fd; is specified only if
-		 * BPF_F_REPLACE flag is set in flags */
-		__u32		old_prog_fd;
+		union {
+			/* expected link's program fd; is specified only if
+			 * BPF_F_REPLACE flag is set in flags.
+			 */
+			__u32		old_prog_fd;
+			/* expected link's map fd; is specified only
+			 * if BPF_F_REPLACE flag is set.
+			 */
+			__u32           old_map_fd;
+		};
 	} link_update;
 
 	struct {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 3d6b5240c25a..6401deca3b56 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -65,6 +65,8 @@ struct bpf_struct_ops_link {
 	struct bpf_map __rcu *map;
 };
 
+static DEFINE_MUTEX(update_mutex);
+
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
@@ -664,7 +666,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
-	if (attr->map_flags & BPF_F_LINK && !st_ops->validate)
+	if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
 		return ERR_PTR(-EOPNOTSUPP);
 
 	t = st_ops->type;
@@ -810,10 +812,54 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
 	return 0;
 }
 
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+					  struct bpf_map *expected_old_map)
+{
+	struct bpf_struct_ops_map *st_map, *old_st_map;
+	struct bpf_map *old_map;
+	struct bpf_struct_ops_link *st_link;
+	int err = 0;
+
+	st_link = container_of(link, struct bpf_struct_ops_link, link);
+	st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+	if (!bpf_struct_ops_valid_to_reg(new_map))
+		return -EINVAL;
+
+	mutex_lock(&update_mutex);
+
+	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+	if (expected_old_map && old_map != expected_old_map) {
+		err = -EPERM;
+		goto err_out;
+	}
+
+	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+	/* The new and old struct_ops must be the same type. */
+	if (st_map->st_ops != old_st_map->st_ops) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+	if (err)
+		goto err_out;
+
+	bpf_map_inc(new_map);
+	rcu_assign_pointer(st_link->map, new_map);
+	bpf_map_put(old_map);
+
+err_out:
+	mutex_unlock(&update_mutex);
+
+	return err;
+}
+
 static const struct bpf_link_ops bpf_struct_ops_map_lops = {
 	.dealloc = bpf_struct_ops_map_link_dealloc,
 	.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
 	.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+	.update_map = bpf_struct_ops_map_link_update,
 };
 
 int bpf_struct_ops_link_create(union bpf_attr *attr)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 21f76698875c..b4d758fa5981 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4682,6 +4682,35 @@ out:
 	return ret;
 }
 
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+	struct bpf_map *new_map, *old_map = NULL;
+	int ret;
+
+	new_map = bpf_map_get(attr->link_update.new_map_fd);
+	if (IS_ERR(new_map))
+		return -EINVAL;
+
+	if (attr->link_update.flags & BPF_F_REPLACE) {
+		old_map = bpf_map_get(attr->link_update.old_map_fd);
+		if (IS_ERR(old_map)) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+	} else if (attr->link_update.old_map_fd) {
+		ret = -EINVAL;
+		goto out_put;
+	}
+
+	ret = link->ops->update_map(link, new_map, old_map);
+
+	if (old_map)
+		bpf_map_put(old_map);
+out_put:
+	bpf_map_put(new_map);
+	return ret;
+}
+
 #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
 
 static int link_update(union bpf_attr *attr)
@@ -4702,6 +4731,11 @@ static int link_update(union bpf_attr *attr)
 	if (IS_ERR(link))
 		return PTR_ERR(link);
 
+	if (link->ops->update_map) {
+		ret = link_update_map(link, attr);
+		goto out_put_link;
+	}
+
 	new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
 	if (IS_ERR(new_prog)) {
 		ret = PTR_ERR(new_prog);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index bbbd5eb94db2..e8b27826283e 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -264,6 +264,11 @@ static void bpf_tcp_ca_unreg(void *kdata)
 	tcp_unregister_congestion_control(kdata);
 }
 
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
+{
+	return tcp_update_congestion_control(kdata, old_kdata);
+}
+
 static int bpf_tcp_ca_validate(void *kdata)
 {
 	return tcp_validate_congestion_control(kdata);
@@ -273,6 +278,7 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
 	.unreg = bpf_tcp_ca_unreg,
+	.update = bpf_tcp_ca_update,
 	.check_member = bpf_tcp_ca_check_member,
 	.init_member = bpf_tcp_ca_init_member,
 	.init = bpf_tcp_ca_init,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9cf1deaf21f2..d6c5a022ae28 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1555,12 +1555,23 @@ union bpf_attr {
 
 	struct { /* struct used by BPF_LINK_UPDATE command */
 		__u32		link_fd;	/* link fd */
-		/* new program fd to update link with */
-		__u32		new_prog_fd;
+		union {
+			/* new program fd to update link with */
+			__u32		new_prog_fd;
+			/* new struct_ops map fd to update link with */
+			__u32           new_map_fd;
+		};
 		__u32		flags;		/* extra flags */
-		/* expected link's program fd; is specified only if
-		 * BPF_F_REPLACE flag is set in flags */
-		__u32		old_prog_fd;
+		union {
+			/* expected link's program fd; is specified only if
+			 * BPF_F_REPLACE flag is set in flags.
+			 */
+			__u32		old_prog_fd;
+			/* expected link's map fd; is specified only
+			 * if BPF_F_REPLACE flag is set.
+			 */
+			__u32           old_map_fd;
+		};
 	} link_update;
 
 	struct {
-- 
cgit v1.2.3


From 3eb8eea2a453463f5606ce3e46cf225f88671440 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 21 Mar 2023 22:38:48 -0700
Subject: docs: networking: document NAPI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add basic documentation about NAPI. We can stop linking to the ancient
doc on the LF wiki.

Link: https://lore.kernel.org/all/20230315223044.471002-1-kuba@kernel.org/
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: Pavel Pisa <pisa@cmp.felk.cvut.cz> # for ctucanfd-driver.rst
Reviewed-by: Tony Nguyen <anthony.l.nguyen@intel.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20230322053848.198452-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../device_drivers/can/ctu/ctucanfd-driver.rst     |   3 +-
 .../device_drivers/ethernet/intel/e100.rst         |   3 +-
 .../device_drivers/ethernet/intel/i40e.rst         |   4 +-
 .../device_drivers/ethernet/intel/ice.rst          |   4 +-
 Documentation/networking/index.rst                 |   1 +
 Documentation/networking/napi.rst                  | 254 +++++++++++++++++++++
 include/linux/netdevice.h                          |  13 +-
 7 files changed, 269 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/networking/napi.rst

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst b/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst
index 1a4fc6607582..1661d13174d5 100644
--- a/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst
+++ b/Documentation/networking/device_drivers/can/ctu/ctucanfd-driver.rst
@@ -229,8 +229,7 @@ frames for a while. This has a potential to avoid the costly round of
 enabling interrupts, handling an incoming IRQ in ISR, re-enabling the
 softirq and switching context back to softirq.
 
-More detailed documentation of NAPI may be found on the pages of Linux
-Foundation `<https://wiki.linuxfoundation.org/networking/napi>`_.
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 Integrating the core to Xilinx Zynq
 -----------------------------------
diff --git a/Documentation/networking/device_drivers/ethernet/intel/e100.rst b/Documentation/networking/device_drivers/ethernet/intel/e100.rst
index 3d4a9ba21946..371b7e5c3293 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/e100.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/e100.rst
@@ -151,8 +151,7 @@ NAPI
 
 NAPI (Rx polling mode) is supported in the e100 driver.
 
-See https://wiki.linuxfoundation.org/networking/napi for more
-information on NAPI.
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 Multiple Interfaces on Same Ethernet Broadcast Network
 ------------------------------------------------------
diff --git a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
index ac35bd472bdc..c495c4e16b3b 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/i40e.rst
@@ -399,8 +399,8 @@ operate only in full duplex and only at their native speed.
 NAPI
 ----
 NAPI (Rx polling mode) is supported in the i40e driver.
-For more information on NAPI, see
-https://wiki.linuxfoundation.org/networking/napi
+
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 Flow Control
 ------------
diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
index 5efea4dd1251..2b6dc7880d7b 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
@@ -817,10 +817,10 @@ NOTE:
 
 NAPI
 ----
+
 This driver supports NAPI (Rx polling mode).
-For more information on NAPI, see
-https://wiki.linuxfoundation.org/networking/napi
 
+See :ref:`Documentation/networking/napi.rst <napi>` for more information.
 
 MACVLAN
 -------
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 4ddcae33c336..24bb256d6d53 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -73,6 +73,7 @@ Contents:
    mpls-sysctl
    mptcp-sysctl
    multiqueue
+   napi
    netconsole
    netdev-features
    netdevices
diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst
new file mode 100644
index 000000000000..a7a047742e93
--- /dev/null
+++ b/Documentation/networking/napi.rst
@@ -0,0 +1,254 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+.. _napi:
+
+====
+NAPI
+====
+
+NAPI is the event handling mechanism used by the Linux networking stack.
+The name NAPI no longer stands for anything in particular [#]_.
+
+In basic operation the device notifies the host about new events
+via an interrupt.
+The host then schedules a NAPI instance to process the events.
+The device may also be polled for events via NAPI without receiving
+interrupts first (:ref:`busy polling<poll>`).
+
+NAPI processing usually happens in the software interrupt context,
+but there is an option to use :ref:`separate kernel threads<threaded>`
+for NAPI processing.
+
+All in all NAPI abstracts away from the drivers the context and configuration
+of event (packet Rx and Tx) processing.
+
+Driver API
+==========
+
+The two most important elements of NAPI are the struct napi_struct
+and the associated poll method. struct napi_struct holds the state
+of the NAPI instance while the method is the driver-specific event
+handler. The method will typically free Tx packets that have been
+transmitted and process newly received packets.
+
+.. _drv_ctrl:
+
+Control API
+-----------
+
+netif_napi_add() and netif_napi_del() add/remove a NAPI instance
+from the system. The instances are attached to the netdevice passed
+as argument (and will be deleted automatically when netdevice is
+unregistered). Instances are added in a disabled state.
+
+napi_enable() and napi_disable() manage the disabled state.
+A disabled NAPI can't be scheduled and its poll method is guaranteed
+to not be invoked. napi_disable() waits for ownership of the NAPI
+instance to be released.
+
+The control APIs are not idempotent. Control API calls are safe against
+concurrent use of datapath APIs but an incorrect sequence of control API
+calls may result in crashes, deadlocks, or race conditions. For example,
+calling napi_disable() multiple times in a row will deadlock.
+
+Datapath API
+------------
+
+napi_schedule() is the basic method of scheduling a NAPI poll.
+Drivers should call this function in their interrupt handler
+(see :ref:`drv_sched` for more info). A successful call to napi_schedule()
+will take ownership of the NAPI instance.
+
+Later, after NAPI is scheduled, the driver's poll method will be
+called to process the events/packets. The method takes a ``budget``
+argument - drivers can process completions for any number of Tx
+packets but should only process up to ``budget`` number of
+Rx packets. Rx processing is usually much more expensive.
+
+In other words, it is recommended to ignore the budget argument when
+performing TX buffer reclamation to ensure that the reclamation is not
+arbitrarily bounded; however, it is required to honor the budget argument
+for RX processing.
+
+.. warning::
+
+   The ``budget`` argument may be 0 if core tries to only process Tx completions
+   and no Rx packets.
+
+The poll method returns the amount of work done. If the driver still
+has outstanding work to do (e.g. ``budget`` was exhausted)
+the poll method should return exactly ``budget``. In that case,
+the NAPI instance will be serviced/polled again (without the
+need to be scheduled).
+
+If event processing has been completed (all outstanding packets
+processed) the poll method should call napi_complete_done()
+before returning. napi_complete_done() releases the ownership
+of the instance.
+
+.. warning::
+
+   The case of finishing all events and using exactly ``budget``
+   must be handled carefully. There is no way to report this
+   (rare) condition to the stack, so the driver must either
+   not call napi_complete_done() and wait to be called again,
+   or return ``budget - 1``.
+
+   If the ``budget`` is 0 napi_complete_done() should never be called.
+
+Call sequence
+-------------
+
+Drivers should not make assumptions about the exact sequencing
+of calls. The poll method may be called without the driver scheduling
+the instance (unless the instance is disabled). Similarly,
+it's not guaranteed that the poll method will be called, even
+if napi_schedule() succeeded (e.g. if the instance gets disabled).
+
+As mentioned in the :ref:`drv_ctrl` section - napi_disable() and subsequent
+calls to the poll method only wait for the ownership of the instance
+to be released, not for the poll method to exit. This means that
+drivers should avoid accessing any data structures after calling
+napi_complete_done().
+
+.. _drv_sched:
+
+Scheduling and IRQ masking
+--------------------------
+
+Drivers should keep the interrupts masked after scheduling
+the NAPI instance - until NAPI polling finishes any further
+interrupts are unnecessary.
+
+Drivers which have to mask the interrupts explicitly (as opposed
+to IRQ being auto-masked by the device) should use the napi_schedule_prep()
+and __napi_schedule() calls:
+
+.. code-block:: c
+
+  if (napi_schedule_prep(&v->napi)) {
+      mydrv_mask_rxtx_irq(v->idx);
+      /* schedule after masking to avoid races */
+      __napi_schedule(&v->napi);
+  }
+
+IRQ should only be unmasked after a successful call to napi_complete_done():
+
+.. code-block:: c
+
+  if (budget && napi_complete_done(&v->napi, work_done)) {
+    mydrv_unmask_rxtx_irq(v->idx);
+    return min(work_done, budget - 1);
+  }
+
+napi_schedule_irqoff() is a variant of napi_schedule() which takes advantage
+of guarantees given by being invoked in IRQ context (no need to
+mask interrupts). Note that PREEMPT_RT forces all interrupts
+to be threaded so the interrupt may need to be marked ``IRQF_NO_THREAD``
+to avoid issues on real-time kernel configurations.
+
+Instance to queue mapping
+-------------------------
+
+Modern devices have multiple NAPI instances (struct napi_struct) per
+interface. There is no strong requirement on how the instances are
+mapped to queues and interrupts. NAPI is primarily a polling/processing
+abstraction without specific user-facing semantics. That said, most networking
+devices end up using NAPI in fairly similar ways.
+
+NAPI instances most often correspond 1:1:1 to interrupts and queue pairs
+(queue pair is a set of a single Rx and single Tx queue).
+
+In less common cases a NAPI instance may be used for multiple queues
+or Rx and Tx queues can be serviced by separate NAPI instances on a single
+core. Regardless of the queue assignment, however, there is usually still
+a 1:1 mapping between NAPI instances and interrupts.
+
+It's worth noting that the ethtool API uses a "channel" terminology where
+each channel can be either ``rx``, ``tx`` or ``combined``. It's not clear
+what constitutes a channel; the recommended interpretation is to understand
+a channel as an IRQ/NAPI which services queues of a given type. For example,
+a configuration of 1 ``rx``, 1 ``tx`` and 1 ``combined`` channel is expected
+to utilize 3 interrupts, 2 Rx and 2 Tx queues.
+
+User API
+========
+
+User interactions with NAPI depend on NAPI instance ID. The instance IDs
+are only visible to the user through the ``SO_INCOMING_NAPI_ID`` socket option.
+It's not currently possible to query IDs used by a given device.
+
+Software IRQ coalescing
+-----------------------
+
+NAPI does not perform any explicit event coalescing by default.
+In most scenarios batching happens due to IRQ coalescing which is done
+by the device. There are cases where software coalescing is helpful.
+
+NAPI can be configured to arm a repoll timer instead of unmasking
+the hardware interrupts as soon as all packets are processed.
+The ``gro_flush_timeout`` sysfs configuration of the netdevice
+is reused to control the delay of the timer, while
+``napi_defer_hard_irqs`` controls the number of consecutive empty polls
+before NAPI gives up and goes back to using hardware IRQs.
+
+.. _poll:
+
+Busy polling
+------------
+
+Busy polling allows a user process to check for incoming packets before
+the device interrupt fires. As is the case with any busy polling it trades
+off CPU cycles for lower latency (production uses of NAPI busy polling
+are not well known).
+
+Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
+selected sockets or using the global ``net.core.busy_poll`` and
+``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
+also exists.
+
+IRQ mitigation
+---------------
+
+While busy polling is supposed to be used by low latency applications,
+a similar mechanism can be used for IRQ mitigation.
+
+Very high request-per-second applications (especially routing/forwarding
+applications and especially applications using AF_XDP sockets) may not
+want to be interrupted until they finish processing a request or a batch
+of packets.
+
+Such applications can pledge to the kernel that they will perform a busy
+polling operation periodically, and the driver should keep the device IRQs
+permanently masked. This mode is enabled by using the ``SO_PREFER_BUSY_POLL``
+socket option. To avoid system misbehavior the pledge is revoked
+if ``gro_flush_timeout`` passes without any busy poll call.
+
+The NAPI budget for busy polling is lower than the default (which makes
+sense given the low latency intention of normal busy polling). This is
+not the case with IRQ mitigation, however, so the budget can be adjusted
+with the ``SO_BUSY_POLL_BUDGET`` socket option.
+
+.. _threaded:
+
+Threaded NAPI
+-------------
+
+Threaded NAPI is an operating mode that uses dedicated kernel
+threads rather than software IRQ context for NAPI processing.
+The configuration is per netdevice and will affect all
+NAPI instances of that device. Each NAPI instance will spawn a separate
+thread (called ``napi/${ifc-name}-${napi-id}``).
+
+It is recommended to pin each kernel thread to a single CPU, the same
+CPU as the CPU which services the interrupt. Note that the mapping
+between IRQs and NAPI instances may not be trivial (and is driver
+dependent). The NAPI instance IDs will be assigned in the opposite
+order to the process IDs of the kernel threads.
+
+Threaded NAPI is controlled by writing 0/1 to the ``threaded`` file in
+netdev's sysfs directory.
+
+.. rubric:: Footnotes
+
+.. [#] NAPI was originally referred to as New API in 2.4 Linux.
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 674ee5daa7b1..18a5be6ddd0f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -509,15 +509,18 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
- *	napi_complete - NAPI processing complete
- *	@n: NAPI context
+ * napi_complete_done - NAPI processing complete
+ * @n: NAPI context
+ * @work_done: number of packets processed
  *
- * Mark NAPI processing as complete.
- * Consider using napi_complete_done() instead.
+ * Mark NAPI processing as complete. Should only be called if poll budget
+ * has not been completely consumed.
+ * Prefer over napi_complete().
  * Return false if device should avoid rearming interrupts.
  */
+bool napi_complete_done(struct napi_struct *n, int work_done);
+
 static inline bool napi_complete(struct napi_struct *n)
 {
 	return napi_complete_done(n, 0);
-- 
cgit v1.2.3


From dbbb27e183b1568d5a907ace1cd144b0709ea52a Mon Sep 17 00:00:00 2001
From: Aloka Dixit <quic_alokad@quicinc.com>
Date: Thu, 23 Mar 2023 04:38:00 -0700
Subject: cfg80211: support RNR for EMA AP

As per IEEE Std 802.11ax-2021, 11.1.3.8.3 Discovery of a nontransmitted
BSSID profile, an EMA AP that transmits a Beacon frame carrying a partial
list of nontransmitted BSSID profiles should include in the frame
a Reduced Neighbor Report element carrying information for at least the
nontransmitted BSSIDs that are not present in the Multiple BSSID element
carried in that frame.
Add new nested attribute NL80211_ATTR_EMA_RNR_ELEMS to support the above.
Number of RNR elements must be more than or equal to the number of
MBSSID elements. This attribute can be used only when EMA is enabled.
Userspace is responsible for splitting the RNR into multiple elements such
that each element excludes the non-transmitting profiles already included
in the MBSSID element (%NL80211_ATTR_MBSSID_ELEMS) at the same index.
Each EMA beacon will be generated by adding MBSSID and RNR elements
at the same index. If the userspace provides more RNR elements than the
number of MBSSID elements then these will be added in every EMA beacon.

Signed-off-by: Aloka Dixit <quic_alokad@quicinc.com>
Link: https://lore.kernel.org/r/20230323113801.6903-2-quic_alokad@quicinc.com
[Johannes: validate elements]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 19 +++++++++++
 include/uapi/linux/nl80211.h | 13 ++++++++
 net/wireless/nl80211.c       | 79 ++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 104 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 86cb048dc924..3cf236520288 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1178,6 +1178,23 @@ struct cfg80211_mbssid_elems {
 	} elem[];
 };
 
+/**
+ * struct cfg80211_rnr_elems - Reduced neighbor report (RNR) elements
+ *
+ * @cnt: Number of elements in array @elem.
+ *
+ * @elem: Array of RNR element(s) to be added into Beacon frames.
+ * @elem.data: Data for RNR elements.
+ * @elem.len: Length of data.
+ */
+struct cfg80211_rnr_elems {
+	u8 cnt;
+	struct {
+		const u8 *data;
+		size_t len;
+	} elem[];
+};
+
 /**
  * struct cfg80211_beacon_data - beacon data
  * @link_id: the link ID for the AP MLD link sending this beacon
@@ -1198,6 +1215,7 @@ struct cfg80211_mbssid_elems {
  * @probe_resp_len: length of probe response template (@probe_resp)
  * @probe_resp: probe response template (AP mode only)
  * @mbssid_ies: multiple BSSID elements
+ * @rnr_ies: reduced neighbor report elements
  * @ftm_responder: enable FTM responder functionality; -1 for no change
  *	(which also implies no change in LCI/civic location data)
  * @lci: Measurement Report element content, starting with Measurement Token
@@ -1221,6 +1239,7 @@ struct cfg80211_beacon_data {
 	const u8 *lci;
 	const u8 *civicloc;
 	struct cfg80211_mbssid_elems *mbssid_ies;
+	struct cfg80211_rnr_elems *rnr_ies;
 	s8 ftm_responder;
 
 	size_t head_len, tail_len;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 14e958a32b84..cf4fb981e131 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2794,6 +2794,17 @@ enum nl80211_commands {
  * @NL80211_ATTR_HW_TIMESTAMP_ENABLED: Indicates whether HW timestamping should
  *	be enabled or not (flag attribute).
  *
+ * @NL80211_ATTR_EMA_RNR_ELEMS: Optional nested attribute for
+ *	reduced neighbor report (RNR) elements. This attribute can be used
+ *	only when NL80211_MBSSID_CONFIG_ATTR_EMA is enabled.
+ *	Userspace is responsible for splitting the RNR into multiple
+ *	elements such that each element excludes the non-transmitting
+ *	profiles already included in the MBSSID element
+ *	(%NL80211_ATTR_MBSSID_ELEMS) at the same index. Each EMA beacon
+ *	will be generated by adding MBSSID and RNR elements at the same
+ *	index. If the userspace includes more RNR elements than number of
+ *	MBSSID elements then these will be added in every EMA beacon.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3328,6 +3339,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS,
 	NL80211_ATTR_HW_TIMESTAMP_ENABLED,
 
+	NL80211_ATTR_EMA_RNR_ELEMS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0a31b1d2845d..80a20d69f285 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -809,6 +809,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 
 	[NL80211_ATTR_MAX_HW_TIMESTAMP_PEERS] = { .type = NLA_U16 },
 	[NL80211_ATTR_HW_TIMESTAMP_ENABLED] = { .type = NLA_FLAG },
+	[NL80211_ATTR_EMA_RNR_ELEMS] = { .type = NLA_NESTED },
 };
 
 /* policy for the key attributes */
@@ -5425,6 +5426,38 @@ nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs)
 	return elems;
 }
 
+static struct cfg80211_rnr_elems *
+nl80211_parse_rnr_elems(struct wiphy *wiphy, struct nlattr *attrs,
+			struct netlink_ext_ack *extack)
+{
+	struct nlattr *nl_elems;
+	struct cfg80211_rnr_elems *elems;
+	int rem_elems;
+	u8 i = 0, num_elems = 0;
+
+	nla_for_each_nested(nl_elems, attrs, rem_elems) {
+		int ret;
+
+		ret = validate_ie_attr(nl_elems, extack);
+		if (ret)
+			return ERR_PTR(ret);
+
+		num_elems++;
+	}
+
+	elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL);
+	if (!elems)
+		return ERR_PTR(-ENOMEM);
+
+	nla_for_each_nested(nl_elems, attrs, rem_elems) {
+		elems->elem[i].data = nla_data(nl_elems);
+		elems->elem[i].len = nla_len(nl_elems);
+		i++;
+	}
+	elems->cnt = num_elems;
+	return elems;
+}
+
 static int nl80211_parse_he_bss_color(struct nlattr *attrs,
 				      struct cfg80211_he_bss_color *he_bss_color)
 {
@@ -5451,7 +5484,8 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs,
 
 static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
 				struct nlattr *attrs[],
-				struct cfg80211_beacon_data *bcn)
+				struct cfg80211_beacon_data *bcn,
+				struct netlink_ext_ack *extack)
 {
 	bool haveinfo = false;
 	int err;
@@ -5548,6 +5582,21 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev,
 			return PTR_ERR(mbssid);
 
 		bcn->mbssid_ies = mbssid;
+
+		if (bcn->mbssid_ies && attrs[NL80211_ATTR_EMA_RNR_ELEMS]) {
+			struct cfg80211_rnr_elems *rnr =
+				nl80211_parse_rnr_elems(&rdev->wiphy,
+							attrs[NL80211_ATTR_EMA_RNR_ELEMS],
+							extack);
+
+			if (IS_ERR(rnr))
+				return PTR_ERR(rnr);
+
+			if (rnr && rnr->cnt < bcn->mbssid_ies->cnt)
+				return -EINVAL;
+
+			bcn->rnr_ies = rnr;
+		}
 	}
 
 	return 0;
@@ -5866,7 +5915,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	if (!params)
 		return -ENOMEM;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params->beacon,
+				   info->extack);
 	if (err)
 		goto out;
 
@@ -6096,6 +6146,11 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			goto out_unlock;
 	}
 
+	if (!params->mbssid_config.ema && params->beacon.rnr_ies) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
 	err = nl80211_calculate_ap_params(params);
 	if (err)
 		goto out_unlock;
@@ -6137,6 +6192,7 @@ out:
 	    params->mbssid_config.tx_wdev->netdev &&
 	    params->mbssid_config.tx_wdev->netdev != dev)
 		dev_put(params->mbssid_config.tx_wdev->netdev);
+	kfree(params->beacon.rnr_ies);
 	kfree(params);
 
 	return err;
@@ -6161,7 +6217,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 	if (!wdev->links[link_id].ap.beacon_interval)
 		return -EINVAL;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params, info->extack);
 	if (err)
 		goto out;
 
@@ -6171,6 +6227,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info)
 
 out:
 	kfree(params.mbssid_ies);
+	kfree(params.rnr_ies);
 	return err;
 }
 
@@ -10030,7 +10087,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 	if (!need_new_beacon)
 		goto skip_beacons;
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after,
+				   info->extack);
 	if (err)
 		goto free;
 
@@ -10047,7 +10105,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto free;
 
-	err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa);
+	err = nl80211_parse_beacon(rdev, csa_attrs, &params.beacon_csa,
+				   info->extack);
 	if (err)
 		goto free;
 
@@ -10167,6 +10226,8 @@ skip_beacons:
 free:
 	kfree(params.beacon_after.mbssid_ies);
 	kfree(params.beacon_csa.mbssid_ies);
+	kfree(params.beacon_after.rnr_ies);
+	kfree(params.beacon_csa.rnr_ies);
 	kfree(csa_attrs);
 	return err;
 }
@@ -15882,7 +15943,8 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 	params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]);
 	params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]);
 
-	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_next);
+	err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_next,
+				   info->extack);
 	if (err)
 		return err;
 
@@ -15896,7 +15958,8 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto out;
 
-	err = nl80211_parse_beacon(rdev, tb, &params.beacon_color_change);
+	err = nl80211_parse_beacon(rdev, tb, &params.beacon_color_change,
+				   info->extack);
 	if (err)
 		goto out;
 
@@ -15952,6 +16015,8 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 out:
 	kfree(params.beacon_next.mbssid_ies);
 	kfree(params.beacon_color_change.mbssid_ies);
+	kfree(params.beacon_next.rnr_ies);
+	kfree(params.beacon_color_change.rnr_ies);
 	kfree(tb);
 	return err;
 }
-- 
cgit v1.2.3


From 5097f84437c9bd50b2c65b5f85395c34b2d545db Mon Sep 17 00:00:00 2001
From: Jaewan Kim <jaewan@google.com>
Date: Wed, 22 Mar 2023 13:16:34 +0000
Subject: wifi: nl80211: make nl80211_send_chandef non-static

Expose nl80211_send_chandef functionality so that mac80211_hwsim or vendor
netlink can use it where needed.

Signed-off-by: Jaewan Kim <jaewan@google.com>
Reviewed-by: Michal Kubiak <michal.kubiak@intel.com>
Link: https://lore.kernel.org/r/20230322131637.2633968-3-jaewan@google.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 9 +++++++++
 net/wireless/nl80211.c | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 3cf236520288..9e04f69712b1 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -950,6 +950,15 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 				  const struct cfg80211_chan_def *chandef,
 				  enum nl80211_iftype iftype);
 
+/**
+ * nl80211_send_chandef - sends the channel definition.
+ * @msg: the msg to put the channel definition into
+ * @chandef: the channel definition to send
+ *
+ * Returns: 0 if sent the channel definition to msg, < 0 on error
+ **/
+int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef);
+
 /**
  * ieee80211_chanwidth_rate_flags - return rate flags for channel width
  * @width: the channel width of the channel
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 80a20d69f285..f1cd3d9130dd 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3765,8 +3765,7 @@ out:
 	return result;
 }
 
-static int nl80211_send_chandef(struct sk_buff *msg,
-				const struct cfg80211_chan_def *chandef)
+int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef)
 {
 	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
 		return -EINVAL;
@@ -3797,6 +3796,7 @@ static int nl80211_send_chandef(struct sk_buff *msg,
 		return -ENOBUFS;
 	return 0;
 }
+EXPORT_SYMBOL(nl80211_send_chandef);
 
 static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
 			      struct cfg80211_registered_device *rdev,
-- 
cgit v1.2.3


From 9821d8d4628e630ab56f47a8e6b878a2576e069b Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 14 Feb 2023 09:29:46 +0200
Subject: lib: cpu_rmap: Use allocator for rmap entries

Use a proper allocator for rmap entries using a naive for loop. The
allocator relies on whether an entry is NULL to be considered free.
Remove the used field of rmap which is not needed.

Also, avoid crashing the kernel if an entry is not available.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
---
 include/linux/cpu_rmap.h |  3 +--
 lib/cpu_rmap.c           | 43 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h
index be8aea04d023..0ec745e6cd36 100644
--- a/include/linux/cpu_rmap.h
+++ b/include/linux/cpu_rmap.h
@@ -16,14 +16,13 @@
  * struct cpu_rmap - CPU affinity reverse-map
  * @refcount: kref for object
  * @size: Number of objects to be reverse-mapped
- * @used: Number of objects added
  * @obj: Pointer to array of object pointers
  * @near: For each CPU, the index and distance to the nearest object,
  *      based on affinity masks
  */
 struct cpu_rmap {
 	struct kref	refcount;
-	u16		size, used;
+	u16		size;
 	void		**obj;
 	struct {
 		u16	index;
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index e77f12bb3c77..5d4bf7a8b926 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -128,19 +128,31 @@ debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
 }
 #endif
 
+static int get_free_index(struct cpu_rmap *rmap)
+{
+	int i;
+
+	for (i = 0; i < rmap->size; i++)
+		if (!rmap->obj[i])
+			return i;
+
+	return -ENOSPC;
+}
+
 /**
  * cpu_rmap_add - add object to a rmap
  * @rmap: CPU rmap allocated with alloc_cpu_rmap()
  * @obj: Object to add to rmap
  *
- * Return index of object.
+ * Return index of object or -ENOSPC if no free entry was found
  */
 int cpu_rmap_add(struct cpu_rmap *rmap, void *obj)
 {
-	u16 index;
+	int index = get_free_index(rmap);
+
+	if (index < 0)
+		return index;
 
-	BUG_ON(rmap->used >= rmap->size);
-	index = rmap->used++;
 	rmap->obj[index] = obj;
 	return index;
 }
@@ -230,7 +242,7 @@ void free_irq_cpu_rmap(struct cpu_rmap *rmap)
 	if (!rmap)
 		return;
 
-	for (index = 0; index < rmap->used; index++) {
+	for (index = 0; index < rmap->size; index++) {
 		glue = rmap->obj[index];
 		if (glue)
 			irq_set_affinity_notifier(glue->notify.irq, NULL);
@@ -295,13 +307,22 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
 	glue->notify.release = irq_cpu_rmap_release;
 	glue->rmap = rmap;
 	cpu_rmap_get(rmap);
-	glue->index = cpu_rmap_add(rmap, glue);
+	rc = cpu_rmap_add(rmap, glue);
+	if (rc < 0)
+		goto err_add;
+
+	glue->index = rc;
 	rc = irq_set_affinity_notifier(irq, &glue->notify);
-	if (rc) {
-		cpu_rmap_put(glue->rmap);
-		rmap->obj[glue->index] = NULL;
-		kfree(glue);
-	}
+	if (rc)
+		goto err_set;
+
+	return rc;
+
+err_set:
+	rmap->obj[glue->index] = NULL;
+err_add:
+	cpu_rmap_put(glue->rmap);
+	kfree(glue);
 	return rc;
 }
 EXPORT_SYMBOL(irq_cpu_rmap_add);
-- 
cgit v1.2.3


From 71f0a2478605c100358a9f9e174849fa643bf8a7 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Tue, 14 Feb 2023 11:05:46 +0200
Subject: lib: cpu_rmap: Add irq_cpu_rmap_remove to complement irq_cpu_rmap_add

Add a function to complement irq_cpu_rmap_add(). It removes the irq from
the reverse mapping by setting the notifier to NULL. The function calls
irq_set_affinity_notifier() with NULL as the notify argument, which then
cancels any pending notifier work and decrements the reference on the
notifier. When the ref count reaches zero, the glue pointer is kfree'd
and the rmap entry is set to NULL, which serves both to avoid a second
attempt to release it and to make the rmap entry available for a
subsequent mapping.

It should be noted that drivers usually create the reverse mapping at
initialization time and remove it at unload time, so we do not expect
failures in allocating an rmap entry due to a kref holding the glue entry.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
---
 include/linux/cpu_rmap.h |  1 +
 lib/cpu_rmap.c           | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h
index 0ec745e6cd36..cae324d10965 100644
--- a/include/linux/cpu_rmap.h
+++ b/include/linux/cpu_rmap.h
@@ -60,6 +60,7 @@ static inline struct cpu_rmap *alloc_irq_cpu_rmap(unsigned int size)
 }
 extern void free_irq_cpu_rmap(struct cpu_rmap *rmap);
 
+int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq);
 extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq);
 
 #endif /* __LINUX_CPU_RMAP_H */
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 5d4bf7a8b926..73c1636b927b 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -285,6 +285,17 @@ static void irq_cpu_rmap_release(struct kref *ref)
 	kfree(glue);
 }
 
+/**
+ * irq_cpu_rmap_remove - remove an IRQ from a CPU affinity reverse-map
+ * @rmap: The reverse-map
+ * @irq: The IRQ number
+ */
+int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq)
+{
+	return irq_set_affinity_notifier(irq, NULL);
+}
+EXPORT_SYMBOL(irq_cpu_rmap_remove);
+
 /**
  * irq_cpu_rmap_add - add an IRQ to a CPU affinity reverse-map
  * @rmap: The reverse-map
-- 
cgit v1.2.3


From fb0a6a268dcd6fe144c99d60a1166e34c6991d5f Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Thu, 5 Jan 2023 11:31:46 +0200
Subject: net/mlx5: Provide external API for allocating vectors

Provide external API to be used by other drivers relying on mlx5_core,
for allocating MSIX vectors. An example for such a driver would be
mlx5_vdpa.

Signed-off-by: Eli Cohen <elic@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 52 +++++++++++++++++++++++
 include/linux/mlx5/driver.h                       |  6 +++
 2 files changed, 58 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 7fa63d31ae5b..e12e528c09f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -483,6 +483,58 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
 	return irq;
 }
 
+/**
+ * mlx5_msix_alloc - allocate msix interrupt
+ * @dev: mlx5 device from which to request
+ * @handler: interrupt handler
+ * @affdesc: affinity descriptor
+ * @name: interrupt name
+ *
+ * Returns: struct msi_map with result encoded.
+ * Note: the caller must make sure to release the irq by calling
+ *       mlx5_msix_free() if shutdown was initiated.
+ */
+struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
+			       irqreturn_t (*handler)(int, void *),
+			       const struct irq_affinity_desc *affdesc,
+			       const char *name)
+{
+	struct msi_map map;
+	int err;
+
+	if (!dev->pdev) {
+		map.virq = 0;
+		map.index = -EINVAL;
+		return map;
+	}
+
+	map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, affdesc);
+	if (!map.virq)
+		return map;
+
+	err = request_irq(map.virq, handler, 0, name, NULL);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		pci_msix_free_irq(dev->pdev, map);
+		map.virq = 0;
+		map.index = -ENOMEM;
+	}
+	return map;
+}
+EXPORT_SYMBOL(mlx5_msix_alloc);
+
+/**
+ * mlx5_msix_free - free a previously allocated msix interrupt
+ * @dev: mlx5 device associated with interrupt
+ * @map: map previously returned by mlx5_msix_alloc()
+ */
+void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map)
+{
+	free_irq(map.virq, NULL);
+	pci_msix_free_irq(dev->pdev, map);
+}
+EXPORT_SYMBOL(mlx5_msix_free);
+
 /**
  * mlx5_irqs_release_vectors - release one or more IRQs back to the system.
  * @irqs: IRQs to be released.
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f33389b42209..df0f82110249 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1308,4 +1308,10 @@ enum {
 	MLX5_OCTWORD = 16,
 };
 
+struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
+			       irqreturn_t (*handler)(int, void *),
+			       const struct irq_affinity_desc *affdesc,
+			       const char *name);
+void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map);
+
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:42 -0700
Subject: bpf: Add a few bpf mem allocator functions

This patch adds a few bpf mem allocator functions which will
be used in the bpf_local_storage in a later patch.

bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When the
flags == GFP_KERNEL, it will fallback to __alloc(..., GFP_KERNEL).
bpf_local_storage knows its running context is sleepable (GFP_KERNEL)
and provides a better guarantee on memory allocation.

bpf_local_storage has some uncommon cases in which its selem
cannot be reused immediately. It handles its own
rcu_head, goes through a rcu_trace gp, and then frees it.
bpf_mem_cache_raw_free() is added for direct free purpose
without leaking the LLIST_NODE_SZ internal knowledge.
During free time, the 'struct bpf_mem_alloc *ma' is no longer
available. However, the caller should know if it is
percpu memory or not and it can call different raw_free functions.
bpf_local_storage does not support percpu value, so only
the non-percpu 'bpf_mem_cache_raw_free()' is added in
this patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  2 ++
 kernel/bpf/memalloc.c         | 59 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index a7104af61ab4..3929be5743f4 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_raw_free(void *ptr);
+void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
 #endif /* _BPF_MEM_ALLOC_H */
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5fcdacbb8439..410637c225fb 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
 	return entry;
 }
 
-static void *__alloc(struct bpf_mem_cache *c, int node)
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
 {
-	/* Allocate, but don't deplete atomic reserves that typical
-	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
-	 * will allocate from the current numa node which is what we
-	 * want here.
-	 */
-	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
-
 	if (c->percpu_size) {
 		void **obj = kmalloc_node(c->percpu_size, flags, node);
 		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 		 */
 		obj = __llist_del_first(&c->free_by_rcu);
 		if (!obj) {
-			obj = __alloc(c, node);
+			/* Allocate, but don't deplete atomic reserves that typical
+			 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+			 * will allocate from the current numa node which is what we
+			 * want here.
+			 */
+			obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
 			if (!obj)
 				break;
 		}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+	if (!ptr)
+		return;
+
+	kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+	struct bpf_mem_cache *c;
+	void *ret;
+
+	c = this_cpu_ptr(ma->cache);
+
+	ret = unit_alloc(c);
+	if (!ret && flags == GFP_KERNEL) {
+		struct mem_cgroup *memcg, *old_memcg;
+
+		memcg = get_memcg(c);
+		old_memcg = set_active_memcg(memcg);
+		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+		set_active_memcg(old_memcg);
+		mem_cgroup_put(memcg);
+	}
+
+	return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
-- 
cgit v1.2.3


From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:43 -0700
Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem

This patch uses bpf_mem_alloc for the task and cgroup local storage, for
which the bpf prog can easily get a hold of the storage owner's
PTR_TO_BTF_ID. E.g. bpf_get_current_task_btf() can be used in some of the
kmalloc code paths, which will cause deadlock/recursion.
bpf_mem_cache_alloc is deadlock free and will solve a legit use case in [1].

For sk storage, its batch creation benchmark shows a few percent
regression when the sk create/destroy batch size is larger than 32.
The sk creation/destruction happens much more often and
depends on external traffic. Considering it is hypothetical
to be able to cause deadlock with sk storage, it can cross
the bridge to use bpf_mem_alloc till a legit (ie. useful)
use case comes up.

For inode storage, bpf_local_storage_destroy() is called before
waiting for a rcu gp and its memory cannot be reused immediately.
inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp.

A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc().
Only task and cgroup storage have 'bpf_ma == true' which
means to use bpf_mem_cache_alloc/free(). This patch only changes
selem to use bpf_mem_alloc for task and cgroup. The next patch
will change the local_storage to use bpf_mem_alloc also for
task and cgroup.

Here is some more details on the changes:

* memory allocation:
After bpf_mem_cache_alloc(), the SDATA(selem)->data is zero-ed because
bpf_mem_cache_alloc() could return a reused selem. It is to keep
the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data
is zero-ed. SDATA(selem)->data is the visible part to the bpf prog.
No need to use zero_map_value() to do the zeroing because
bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using
the selem before returning the selem through bpf_mem_cache_free().
For the internal fields of selem, they will be initialized when
linking to the new smap and the new local_storage.

When 'bpf_ma == false', nothing changes in this patch. It will
stay with the bpf_map_kzalloc().

* memory free:
The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle
the bpf_ma == true case.

For the common selem free path where its owner is also being destroyed,
the mem is freed in bpf_local_storage_destroy(), the owner (task
and cgroup) has gone through a rcu gp. The memory can be reused
immediately, so bpf_local_storage_destroy() will call
bpf_selem_free(..., reuse_now = true) which will do
bpf_mem_cache_free() for immediate reuse consideration.

An exception is the delete elem code path. The delete elem code path
is called from the helper bpf_*_storage_delete() and the syscall
bpf_map_delete_elem(). This path is an unusual case for local
storage because the common use case is to have the local storage
staying with its owner life time so that the bpf prog and the user
space does not have to monitor the owner's destruction. For the delete
elem path, the selem cannot be reused immediately because there could
be bpf prog using it. It will call bpf_selem_free(..., reuse_now = false)
and it will wait for a rcu tasks trace gp before freeing the elem. The
rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree().

When 'bpf_ma == false', it should be the same as before.
__bpf_selem_free() is added to do the kfree_rcu and call_tasks_trace_rcu().
A few words on the 'reuse_now == true'. When 'reuse_now == true',
it is still racing with bpf_local_storage_map_free which is under rcu
protection, so it still needs to wait for a rcu gp instead of kfree().
Otherwise, the selem may be reused by slab for a totally different struct
while the bpf_local_storage_map_free() is still using it (as a
rcu reader). For the inode case, there may be other rcu readers also.
In short, when bpf_ma == false and reuse_now == true => vanilla rcu.

[1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/

Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |  6 ++-
 kernel/bpf/bpf_cgrp_storage.c     |  2 +-
 kernel/bpf/bpf_inode_storage.c    |  2 +-
 kernel/bpf/bpf_local_storage.c    | 95 +++++++++++++++++++++++++++++++++++----
 kernel/bpf/bpf_task_storage.c     |  2 +-
 net/core/bpf_sk_storage.c         |  2 +-
 6 files changed, 95 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index a34f61467a2f..30efbcab2798 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/types.h>
+#include <linux/bpf_mem_alloc.h>
 #include <uapi/linux/btf.h>
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE	16
@@ -55,6 +56,8 @@ struct bpf_local_storage_map {
 	u32 bucket_log;
 	u16 elem_size;
 	u16 cache_idx;
+	struct bpf_mem_alloc selem_ma;
+	bool bpf_ma;
 };
 
 struct bpf_local_storage_data {
@@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
 
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
-			    struct bpf_local_storage_cache *cache);
+			    struct bpf_local_storage_cache *cache,
+			    bool bpf_ma);
 
 struct bpf_local_storage_data *
 bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index f5b016a5484d..d17d5b694668 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+	return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
 }
 
 static void cgroup_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 9a5f05151898..e17ad581b9be 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
 
 static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &inode_cache);
+	return bpf_local_storage_map_alloc(attr, &inode_cache, false);
 }
 
 static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 351d991694cb..309ea727a5cb 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	if (charge_mem && mem_charge(smap, owner, smap->elem_size))
 		return NULL;
 
-	selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-				gfp_flags | __GFP_NOWARN);
+	if (smap->bpf_ma) {
+		migrate_disable();
+		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+		migrate_enable();
+		if (selem)
+			/* Keep the original bpf_map_kzalloc behavior
+			 * before started using the bpf_mem_cache_alloc.
+			 *
+			 * No need to use zero_map_value. The bpf_selem_free()
+			 * only does bpf_mem_cache_free when there is
+			 * no other bpf prog is using the selem.
+			 */
+			memset(SDATA(selem)->data, 0, smap->map.value_size);
+	} else {
+		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+					gfp_flags | __GFP_NOWARN);
+	}
+
 	if (selem) {
 		if (value)
 			copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -124,12 +140,34 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
 }
 
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage_elem *selem;
+
+	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(selem);
+	else
+		kfree_rcu(selem, rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+			     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(selem, rcu);
+	else
+		call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
 static void bpf_selem_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage_elem *selem;
 
 	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-	kfree(selem);
+	bpf_mem_cache_raw_free(selem);
 }
 
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -145,10 +183,23 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
 		    bool reuse_now)
 {
 	bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-	if (!reuse_now)
+
+	if (!smap->bpf_ma) {
+		__bpf_selem_free(selem, reuse_now);
+		return;
+	}
+
+	if (!reuse_now) {
 		call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-	else
-		call_rcu(&selem->rcu, bpf_selem_free_rcu);
+	} else {
+		/* Instead of using the vanilla call_rcu(),
+		 * bpf_mem_cache_free will be able to reuse selem
+		 * immediately.
+		 */
+		migrate_disable();
+		bpf_mem_cache_free(&smap->selem_ma, selem);
+		migrate_enable();
+	}
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -654,13 +705,25 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
 	return usage;
 }
 
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
 struct bpf_map *
 bpf_local_storage_map_alloc(union bpf_attr *attr,
-			    struct bpf_local_storage_cache *cache)
+			    struct bpf_local_storage_cache *cache,
+			    bool bpf_ma)
 {
 	struct bpf_local_storage_map *smap;
 	unsigned int i;
 	u32 nbuckets;
+	int err;
 
 	smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
 	if (!smap)
@@ -675,8 +738,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
 					 nbuckets, GFP_USER | __GFP_NOWARN);
 	if (!smap->buckets) {
-		bpf_map_area_free(smap);
-		return ERR_PTR(-ENOMEM);
+		err = -ENOMEM;
+		goto free_smap;
 	}
 
 	for (i = 0; i < nbuckets; i++) {
@@ -687,8 +750,20 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 	smap->elem_size = offsetof(struct bpf_local_storage_elem,
 				   sdata.data[attr->value_size]);
 
+	smap->bpf_ma = bpf_ma;
+	if (bpf_ma) {
+		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+		if (err)
+			goto free_smap;
+	}
+
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
 	return &smap->map;
+
+free_smap:
+	kvfree(smap->buckets);
+	bpf_map_area_free(smap);
+	return ERR_PTR(err);
 }
 
 void bpf_local_storage_map_free(struct bpf_map *map,
@@ -754,6 +829,8 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
+	if (smap->bpf_ma)
+		bpf_mem_alloc_destroy(&smap->selem_ma);
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ab5bd1ef58c4..d1af0c8f9ce4 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
 
 static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &task_cache);
+	return bpf_local_storage_map_alloc(attr, &task_cache, true);
 }
 
 static void task_storage_map_free(struct bpf_map *map)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index cb0f5a105b89..085025c7130a 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
 
 static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 {
-	return bpf_local_storage_map_alloc(attr, &sk_cache);
+	return bpf_local_storage_map_alloc(attr, &sk_cache, false);
 }
 
 static int notsupp_get_next_key(struct bpf_map *map, void *key,
-- 
cgit v1.2.3


From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Wed, 22 Mar 2023 14:52:44 -0700
Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage

This patch uses bpf_mem_cache_alloc/free for allocating and freeing
bpf_local_storage for task and cgroup storage.

The changes are similar to the previous patch. A few things are
worth mentioning for bpf_local_storage:

The local_storage is freed when the last selem is deleted.
Before deleting a selem from local_storage, it needs to retrieve the
local_storage->smap because the bpf_selem_unlink_storage_nolock()
may have set it to NULL. Note that local_storage->smap may
already be NULL when the selem that created this local_storage has
been removed. In this case, call_rcu will be used to free the
local_storage.
Also, the bpf_ma (true or false) value is needed before calling
bpf_local_storage_free(). The bpf_ma can either be obtained from
the local_storage->smap (if available) or any of its selem's smap.
A new helper check_storage_bpf_ma() is added to obtain
bpf_ma for a deleting bpf_local_storage.

When bpf_local_storage_alloc gets reused memory, all
fields either already hold the correct values or will be initialized.
'cache[]' must already be all NULLs. 'list' must be empty.
Others will be initialized.

Cc: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_local_storage.h |   1 +
 kernel/bpf/bpf_local_storage.c    | 130 +++++++++++++++++++++++++++++++++-----
 2 files changed, 116 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 30efbcab2798..173ec7f43ed1 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -57,6 +57,7 @@ struct bpf_local_storage_map {
 	u16 elem_size;
 	u16 cache_idx;
 	struct bpf_mem_alloc selem_ma;
+	struct bpf_mem_alloc storage_ma;
 	bool bpf_ma;
 };
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 309ea727a5cb..dab2ff4c99d9 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -111,33 +111,74 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
 	return NULL;
 }
 
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+	struct bpf_local_storage *local_storage;
+
+	/* If RCU Tasks Trace grace period implies RCU grace period, do
+	 * kfree(), else do kfree_rcu().
+	 */
+	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+	if (rcu_trace_implies_rcu_gp())
+		kfree(local_storage);
+	else
+		kfree_rcu(local_storage, rcu);
+}
+
 static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
 {
 	struct bpf_local_storage *local_storage;
 
 	local_storage = container_of(rcu, struct bpf_local_storage, rcu);
-	kfree(local_storage);
+	bpf_mem_cache_raw_free(local_storage);
 }
 
 static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
 {
-	/* If RCU Tasks Trace grace period implies RCU grace period, do
-	 * kfree(), else do kfree_rcu().
-	 */
 	if (rcu_trace_implies_rcu_gp())
 		bpf_local_storage_free_rcu(rcu);
 	else
 		call_rcu(rcu, bpf_local_storage_free_rcu);
 }
 
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+				     bool vanilla_rcu)
+{
+	if (vanilla_rcu)
+		kfree_rcu(local_storage, rcu);
+	else
+		call_rcu_tasks_trace(&local_storage->rcu,
+				     __bpf_local_storage_free_trace_rcu);
+}
+
 static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
-				   bool reuse_now)
+				   struct bpf_local_storage_map *smap,
+				   bool bpf_ma, bool reuse_now)
 {
-	if (!reuse_now)
+	if (!bpf_ma) {
+		__bpf_local_storage_free(local_storage, reuse_now);
+		return;
+	}
+
+	if (!reuse_now) {
 		call_rcu_tasks_trace(&local_storage->rcu,
 				     bpf_local_storage_free_trace_rcu);
-	else
+		return;
+	}
+
+	if (smap) {
+		migrate_disable();
+		bpf_mem_cache_free(&smap->storage_ma, local_storage);
+		migrate_enable();
+	} else {
+		/* smap could be NULL if the selem that triggered
+		 * this 'local_storage' creation had been long gone.
+		 * In this case, directly do call_rcu().
+		 */
 		call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+	}
 }
 
 /* rcu tasks trace callback for bpf_ma == false */
@@ -260,11 +301,47 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
 	return free_local_storage;
 }
 
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+				 struct bpf_local_storage_map *storage_smap,
+				 struct bpf_local_storage_elem *selem)
+{
+
+	struct bpf_local_storage_map *selem_smap;
+
+	/* local_storage->smap may be NULL. If it is, get the bpf_ma
+	 * from any selem in the local_storage->list. The bpf_ma of all
+	 * local_storage and selem should have the same value
+	 * for the same map type.
+	 *
+	 * If the local_storage->list is already empty, the caller will not
+	 * care about the bpf_ma value also because the caller is not
+	 * responsible for freeing the local_storage.
+	 */
+
+	if (storage_smap)
+		return storage_smap->bpf_ma;
+
+	if (!selem) {
+		struct hlist_node *n;
+
+		n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+					  bpf_rcu_lock_held());
+		if (!n)
+			return false;
+
+		selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+	}
+	selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+	return selem_smap->bpf_ma;
+}
+
 static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 				     bool reuse_now)
 {
+	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage *local_storage;
-	bool free_local_storage = false;
+	bool bpf_ma, free_local_storage = false;
 	unsigned long flags;
 
 	if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -273,6 +350,10 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 
 	local_storage = rcu_dereference_check(selem->local_storage,
 					      bpf_rcu_lock_held());
+	storage_smap = rcu_dereference_check(local_storage->smap,
+					     bpf_rcu_lock_held());
+	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
 	raw_spin_lock_irqsave(&local_storage->lock, flags);
 	if (likely(selem_linked_to_storage(selem)))
 		free_local_storage = bpf_selem_unlink_storage_nolock(
@@ -280,7 +361,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_local_storage)
-		bpf_local_storage_free(local_storage, reuse_now);
+		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
 }
 
 void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -400,8 +481,15 @@ int bpf_local_storage_alloc(void *owner,
 	if (err)
 		return err;
 
-	storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
-				  gfp_flags | __GFP_NOWARN);
+	if (smap->bpf_ma) {
+		migrate_disable();
+		storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+		migrate_enable();
+	} else {
+		storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+					  gfp_flags | __GFP_NOWARN);
+	}
+
 	if (!storage) {
 		err = -ENOMEM;
 		goto uncharge;
@@ -447,7 +535,7 @@ int bpf_local_storage_alloc(void *owner,
 	return 0;
 
 uncharge:
-	bpf_local_storage_free(storage, true);
+	bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
 	mem_uncharge(smap, owner, sizeof(*storage));
 	return err;
 }
@@ -660,11 +748,15 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
 
 void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 {
+	struct bpf_local_storage_map *storage_smap;
 	struct bpf_local_storage_elem *selem;
-	bool free_storage = false;
+	bool bpf_ma, free_storage = false;
 	struct hlist_node *n;
 	unsigned long flags;
 
+	storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+	bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
+
 	/* Neither the bpf_prog nor the bpf_map's syscall
 	 * could be modifying the local_storage->list now.
 	 * Thus, no elem can be added to or deleted from the
@@ -692,7 +784,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
 	raw_spin_unlock_irqrestore(&local_storage->lock, flags);
 
 	if (free_storage)
-		bpf_local_storage_free(local_storage, true);
+		bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
 }
 
 u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -755,6 +847,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
 		err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
 		if (err)
 			goto free_smap;
+
+		err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+		if (err) {
+			bpf_mem_alloc_destroy(&smap->selem_ma);
+			goto free_smap;
+		}
 	}
 
 	smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
@@ -829,8 +927,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
 	 */
 	synchronize_rcu();
 
-	if (smap->bpf_ma)
+	if (smap->bpf_ma) {
 		bpf_mem_alloc_destroy(&smap->selem_ma);
+		bpf_mem_alloc_destroy(&smap->storage_ma);
+	}
 	kvfree(smap->buckets);
 	bpf_map_area_free(smap);
 }
-- 
cgit v1.2.3


From 3948b05950fdd64002a5f182c65ba5cf2d53cf71 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Mar 2023 09:28:42 -0700
Subject: net: introduce a config option to tweak MAX_SKB_FRAGS

Currently, MAX_SKB_FRAGS value is 17.

For standard tcp sendmsg() traffic, no big deal because tcp_sendmsg()
attempts order-3 allocations, stuffing 32768 bytes per frag.

But with zero copy, we use order-0 pages.

For BIG TCP to show its full potential, we add a config option
to be able to fit up to 45 segments per skb.

This is also needed for BIG TCP rx zerocopy, as zerocopy currently
does not support skbs with frag list.

We have used MAX_SKB_FRAGS=45 value for years at Google before
we deployed 4K MTU, with no adverse effect, other than
a recent issue in mlx4, fixed in commit 26782aad00cc
("net/mlx4: MLX4_TX_BOUNCE_BUFFER_SIZE depends on MAX_SKB_FRAGS")

Back then, the goal was to be able to receive full size (64KB) GRO
packets without the frag_list overhead.

Note that /proc/sys/net/core/max_skb_frags can also be used to limit
the number of fragments TCP can use in tx packets.

By default we keep the old/legacy value of 17 until we get
more coverage for the updated values.

Sizes of struct skb_shared_info on 64bit arches

MAX_SKB_FRAGS | sizeof(struct skb_shared_info):
==============================================
         17     320
         21     320+64  = 384
         25     320+128 = 448
         29     320+192 = 512
         33     320+256 = 576
         37     320+320 = 640
         41     320+384 = 704
         45     320+448 = 768

This inflation might cause problems for drivers assuming they could pack
both the incoming packet (for MTU=1500) and skb_shared_info in half a page,
using build_skb().

v3: fix build error when CONFIG_NET=n
v2: fix two build errors assuming MAX_SKB_FRAGS was "unsigned long"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/r/20230323162842.1935061-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/scsi/cxgbi/libcxgbi.c |  4 ++--
 include/linux/skbuff.h        | 16 +++++-----------
 net/Kconfig                   | 12 ++++++++++++
 net/packet/af_packet.c        |  4 ++--
 4 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index af281e271f88..3e1de4c842cc 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2314,9 +2314,9 @@ static int cxgbi_sock_tx_queue_up(struct cxgbi_sock *csk, struct sk_buff *skb)
 		frags++;
 
 	if (frags >= SKB_WR_LIST_SIZE) {
-		pr_err("csk 0x%p, frags %u, %u,%u >%lu.\n",
+		pr_err("csk 0x%p, frags %u, %u,%u >%u.\n",
 		       csk, skb_shinfo(skb)->nr_frags, skb->len,
-		       skb->data_len, SKB_WR_LIST_SIZE);
+		       skb->data_len, (unsigned int)SKB_WR_LIST_SIZE);
 		return -EINVAL;
 	}
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe661011644b..82511b2f61ea 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -345,18 +345,12 @@ struct sk_buff_head {
 
 struct sk_buff;
 
-/* To allow 64K frame to be packed as single skb without frag_list we
- * require 64K/PAGE_SIZE pages plus 1 additional page to allow for
- * buffers which do not start on a page boundary.
- *
- * Since GRO uses frags we allocate at least 16 regardless of page
- * size.
- */
-#if (65536/PAGE_SIZE + 1) < 16
-#define MAX_SKB_FRAGS 16UL
-#else
-#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1)
+#ifndef CONFIG_MAX_SKB_FRAGS
+# define CONFIG_MAX_SKB_FRAGS 17
 #endif
+
+#define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS
+
 extern int sysctl_max_skb_frags;
 
 /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
diff --git a/net/Kconfig b/net/Kconfig
index 48c33c222199..f806722bccf4 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -251,6 +251,18 @@ config PCPU_DEV_REFCNT
 	  network device refcount are using per cpu variables if this option is set.
 	  This can be forced to N to detect underflows (with a performance drop).
 
+config MAX_SKB_FRAGS
+	int "Maximum number of fragments per skb_shared_info"
+	range 17 45
+	default 17
+	help
+	  Having more fragments per skb_shared_info can help GRO efficiency.
+	  This helps BIG TCP workloads, but might expose bugs in some
+	  legacy drivers.
+	  This also increases memory overhead of small packets,
+	  and in drivers using build_skb().
+	  If unsure, say 17.
+
 config RPS
 	bool
 	depends on SMP && SYSFS
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 497193f73030..568f8d76e3c1 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2622,8 +2622,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		nr_frags = skb_shinfo(skb)->nr_frags;
 
 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
-			pr_err("Packet exceed the number of skb frags(%lu)\n",
-			       MAX_SKB_FRAGS);
+			pr_err("Packet exceed the number of skb frags(%u)\n",
+			       (unsigned int)MAX_SKB_FRAGS);
 			return -EFAULT;
 		}
 
-- 
cgit v1.2.3


From 3e4d5ba9a3f85f21f1ebdee5a5901bb43389abc5 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Thu, 23 Mar 2023 18:36:04 +0200
Subject: netlink: Add a macro to set policy message with format string

Similar to NL_SET_ERR_MSG_FMT, add a macro which sets netlink policy
error message with a format string.

Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netlink.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 3e8743252167..19c0791ed9d5 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -161,9 +161,31 @@ struct netlink_ext_ack {
 	}							\
 } while (0)
 
+#define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do {	\
+	struct netlink_ext_ack *__extack = (extack);				\
+										\
+	if (!__extack)								\
+		break;								\
+										\
+	if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN,		\
+		     "%s" fmt "%s", "", ##args, "") >=				\
+	    NETLINK_MAX_FMTMSG_LEN)						\
+		net_warn_ratelimited("%s" fmt "%s", "truncated extack: ",       \
+				     ##args, "\n");				\
+										\
+	do_trace_netlink_extack(__extack->_msg_buf);				\
+										\
+	__extack->_msg = __extack->_msg_buf;					\
+	__extack->bad_attr = (attr);						\
+	__extack->policy = (pol);						\
+} while (0)
+
 #define NL_SET_ERR_MSG_ATTR(extack, attr, msg)		\
 	NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg)
 
+#define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \
+	NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args)
+
 #define NL_SET_ERR_ATTR_MISS(extack, nest, type)  do {	\
 	struct netlink_ext_ack *__extack = (extack);	\
 							\
-- 
cgit v1.2.3


From 233eb4e786b57ea686b51c13a04cc2839fd682fc Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayagr@amazon.com>
Date: Thu, 23 Mar 2023 18:36:05 +0200
Subject: ethtool: Add support for configuring tx_push_buf_len

This attribute, which is part of ethtool's ring param configuration,
allows the user to specify the maximum number of bytes of the packet's
payload that can be written directly to the device.

Example usage:
    # ethtool -G [interface] tx-push-buf-len [number of bytes]

Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Shay Agroskin <shayagr@amazon.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml     |  8 +++++
 Documentation/networking/ethtool-netlink.rst | 47 ++++++++++++++++++----------
 include/linux/ethtool.h                      | 14 ++++++---
 include/uapi/linux/ethtool_netlink.h         |  2 ++
 net/ethtool/netlink.h                        |  2 +-
 net/ethtool/rings.c                          | 34 ++++++++++++++++++--
 6 files changed, 84 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index 4727c067e2ba..6d8ae3d9a680 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -165,6 +165,12 @@ attribute-sets:
       -
         name: rx-push
         type: u8
+      -
+        name: tx-push-buf-len
+        type: u32
+      -
+        name: tx-push-buf-len-max
+        type: u32
 
   -
     name: mm-stat
@@ -311,6 +317,8 @@ operations:
             - cqe-size
             - tx-push
             - rx-push
+            - tx-push-buf-len
+            - tx-push-buf-len-max
       dump: *ring-get-op
     -
       name: rings-set
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index e1bc6186d7ea..cd0973d4ba01 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -860,22 +860,24 @@ Request contents:
 
 Kernel response contents:
 
-  ====================================  ======  ===========================
-  ``ETHTOOL_A_RINGS_HEADER``            nested  reply header
-  ``ETHTOOL_A_RINGS_RX_MAX``            u32     max size of RX ring
-  ``ETHTOOL_A_RINGS_RX_MINI_MAX``       u32     max size of RX mini ring
-  ``ETHTOOL_A_RINGS_RX_JUMBO_MAX``      u32     max size of RX jumbo ring
-  ``ETHTOOL_A_RINGS_TX_MAX``            u32     max size of TX ring
-  ``ETHTOOL_A_RINGS_RX``                u32     size of RX ring
-  ``ETHTOOL_A_RINGS_RX_MINI``           u32     size of RX mini ring
-  ``ETHTOOL_A_RINGS_RX_JUMBO``          u32     size of RX jumbo ring
-  ``ETHTOOL_A_RINGS_TX``                u32     size of TX ring
-  ``ETHTOOL_A_RINGS_RX_BUF_LEN``        u32     size of buffers on the ring
-  ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT``    u8      TCP header / data split
-  ``ETHTOOL_A_RINGS_CQE_SIZE``          u32     Size of TX/RX CQE
-  ``ETHTOOL_A_RINGS_TX_PUSH``           u8      flag of TX Push mode
-  ``ETHTOOL_A_RINGS_RX_PUSH``           u8      flag of RX Push mode
-  ====================================  ======  ===========================
+  =======================================   ======  ===========================
+  ``ETHTOOL_A_RINGS_HEADER``                nested  reply header
+  ``ETHTOOL_A_RINGS_RX_MAX``                u32     max size of RX ring
+  ``ETHTOOL_A_RINGS_RX_MINI_MAX``           u32     max size of RX mini ring
+  ``ETHTOOL_A_RINGS_RX_JUMBO_MAX``          u32     max size of RX jumbo ring
+  ``ETHTOOL_A_RINGS_TX_MAX``                u32     max size of TX ring
+  ``ETHTOOL_A_RINGS_RX``                    u32     size of RX ring
+  ``ETHTOOL_A_RINGS_RX_MINI``               u32     size of RX mini ring
+  ``ETHTOOL_A_RINGS_RX_JUMBO``              u32     size of RX jumbo ring
+  ``ETHTOOL_A_RINGS_TX``                    u32     size of TX ring
+  ``ETHTOOL_A_RINGS_RX_BUF_LEN``            u32     size of buffers on the ring
+  ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT``        u8      TCP header / data split
+  ``ETHTOOL_A_RINGS_CQE_SIZE``              u32     Size of TX/RX CQE
+  ``ETHTOOL_A_RINGS_TX_PUSH``               u8      flag of TX Push mode
+  ``ETHTOOL_A_RINGS_RX_PUSH``               u8      flag of RX Push mode
+  ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN``       u32     size of TX push buffer
+  ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX``   u32     max size of TX push buffer
+  =======================================   ======  ===========================
 
 ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with
 page-flipping TCP zero-copy receive (``getsockopt(TCP_ZEROCOPY_RECEIVE)``).
@@ -891,6 +893,18 @@ through MMIO writes, thus reducing the latency. However, enabling this feature
 may increase the CPU cost. Drivers may enforce additional per-packet
 eligibility checks (e.g. on packet size).
 
+``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN`` specifies the maximum number of bytes of a
+transmitted packet a driver can push directly to the underlying device
+('push' mode). Pushing some of the payload bytes to the device has the
+advantages of reducing latency for small packets by avoiding DMA mapping (same
+as ``ETHTOOL_A_RINGS_TX_PUSH`` parameter) as well as allowing the underlying
+device to process packet headers ahead of fetching its payload.
+This can help the device to make fast actions based on the packet's headers.
+This is similar to the "tx-copybreak" parameter, which copies the packet to a
+preallocated DMA memory area instead of mapping new memory. However,
+tx-push-buf-len parameter copies the packet directly to the device to allow the
+device to take faster actions on the packet.
+
 RINGS_SET
 =========
 
@@ -908,6 +922,7 @@ Request contents:
   ``ETHTOOL_A_RINGS_CQE_SIZE``          u32     Size of TX/RX CQE
   ``ETHTOOL_A_RINGS_TX_PUSH``           u8      flag of TX Push mode
   ``ETHTOOL_A_RINGS_RX_PUSH``           u8      flag of RX Push mode
+  ``ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN``   u32     size of TX push buffer
   ====================================  ======  ===========================
 
 Kernel checks that requested ring sizes do not exceed limits reported by
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 2792185dda22..798d35890118 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -75,6 +75,8 @@ enum {
  * @tx_push: The flag of tx push mode
  * @rx_push: The flag of rx push mode
  * @cqe_size: Size of TX/RX completion queue event
+ * @tx_push_buf_len: Size of TX push buffer
+ * @tx_push_buf_max_len: Maximum allowed size of TX push buffer
  */
 struct kernel_ethtool_ringparam {
 	u32	rx_buf_len;
@@ -82,6 +84,8 @@ struct kernel_ethtool_ringparam {
 	u8	tx_push;
 	u8	rx_push;
 	u32	cqe_size;
+	u32	tx_push_buf_len;
+	u32	tx_push_buf_max_len;
 };
 
 /**
@@ -90,12 +94,14 @@ struct kernel_ethtool_ringparam {
  * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size
  * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push
  * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push
+ * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len
  */
 enum ethtool_supported_ring_param {
-	ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0),
-	ETHTOOL_RING_USE_CQE_SIZE   = BIT(1),
-	ETHTOOL_RING_USE_TX_PUSH    = BIT(2),
-	ETHTOOL_RING_USE_RX_PUSH    = BIT(3),
+	ETHTOOL_RING_USE_RX_BUF_LEN		= BIT(0),
+	ETHTOOL_RING_USE_CQE_SIZE		= BIT(1),
+	ETHTOOL_RING_USE_TX_PUSH		= BIT(2),
+	ETHTOOL_RING_USE_RX_PUSH		= BIT(3),
+	ETHTOOL_RING_USE_TX_PUSH_BUF_LEN	= BIT(4),
 };
 
 #define __ETH_RSS_HASH_BIT(bit)	((u32)1 << (bit))
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index d39ce21381c5..1ebf8d455f07 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -357,6 +357,8 @@ enum {
 	ETHTOOL_A_RINGS_CQE_SIZE,			/* u32 */
 	ETHTOOL_A_RINGS_TX_PUSH,			/* u8 */
 	ETHTOOL_A_RINGS_RX_PUSH,			/* u8 */
+	ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,		/* u32 */
+	ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_RINGS_CNT,
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index f7b189ed96b2..79424b34b553 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -413,7 +413,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT
 extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
 extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
 extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
-extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_PUSH + 1];
+extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX + 1];
 extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
 extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
 extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
index f358cd57d094..1c4972526142 100644
--- a/net/ethtool/rings.c
+++ b/net/ethtool/rings.c
@@ -11,6 +11,7 @@ struct rings_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_ringparam	ringparam;
 	struct kernel_ethtool_ringparam	kernel_ringparam;
+	u32				supported_ring_params;
 };
 
 #define RINGS_REPDATA(__reply_base) \
@@ -32,6 +33,8 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base,
 
 	if (!dev->ethtool_ops->get_ringparam)
 		return -EOPNOTSUPP;
+
+	data->supported_ring_params = dev->ethtool_ops->supported_ring_params;
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
@@ -57,7 +60,9 @@ static int rings_reply_size(const struct ethnl_req_info *req_base,
 	       nla_total_size(sizeof(u8))  +	/* _RINGS_TCP_DATA_SPLIT */
 	       nla_total_size(sizeof(u32)  +	/* _RINGS_CQE_SIZE */
 	       nla_total_size(sizeof(u8))  +	/* _RINGS_TX_PUSH */
-	       nla_total_size(sizeof(u8)));	/* _RINGS_RX_PUSH */
+	       nla_total_size(sizeof(u8))) +	/* _RINGS_RX_PUSH */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_TX_PUSH_BUF_LEN */
+	       nla_total_size(sizeof(u32));	/* _RINGS_TX_PUSH_BUF_LEN_MAX */
 }
 
 static int rings_fill_reply(struct sk_buff *skb,
@@ -67,6 +72,7 @@ static int rings_fill_reply(struct sk_buff *skb,
 	const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
 	const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
 	const struct ethtool_ringparam *ringparam = &data->ringparam;
+	u32 supported_ring_params = data->supported_ring_params;
 
 	WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED);
 
@@ -98,7 +104,12 @@ static int rings_fill_reply(struct sk_buff *skb,
 	    (kr->cqe_size &&
 	     (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) ||
 	    nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) ||
-	    nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push))
+	    nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) ||
+	    ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) &&
+	     (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,
+			  kr->tx_push_buf_max_len) ||
+	      nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,
+			  kr->tx_push_buf_len))))
 		return -EMSGSIZE;
 
 	return 0;
@@ -117,6 +128,7 @@ const struct nla_policy ethnl_rings_set_policy[] = {
 	[ETHTOOL_A_RINGS_CQE_SIZE]		= NLA_POLICY_MIN(NLA_U32, 1),
 	[ETHTOOL_A_RINGS_TX_PUSH]		= NLA_POLICY_MAX(NLA_U8, 1),
 	[ETHTOOL_A_RINGS_RX_PUSH]		= NLA_POLICY_MAX(NLA_U8, 1),
+	[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN]	= { .type = NLA_U32 },
 };
 
 static int
@@ -158,6 +170,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info,
 		return -EOPNOTSUPP;
 	}
 
+	if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] &&
+	    !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) {
+		NL_SET_ERR_MSG_ATTR(info->extack,
+				    tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+				    "setting tx push buf len is not supported");
+		return -EOPNOTSUPP;
+	}
+
 	return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP;
 }
 
@@ -189,6 +209,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
 			tb[ETHTOOL_A_RINGS_TX_PUSH], &mod);
 	ethnl_update_u8(&kernel_ringparam.rx_push,
 			tb[ETHTOOL_A_RINGS_RX_PUSH], &mod);
+	ethnl_update_u32(&kernel_ringparam.tx_push_buf_len,
+			 tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod);
 	if (!mod)
 		return 0;
 
@@ -209,6 +231,14 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
 		return -EINVAL;
 	}
 
+	if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) {
+		NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+					"Requested TX push buffer exceeds the maximum of %u",
+					kernel_ringparam.tx_push_buf_max_len);
+
+		return -EINVAL;
+	}
+
 	ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
 					      &kernel_ringparam, info->extack);
 	return ret < 0 ? ret : 1;
-- 
cgit v1.2.3


From e5ab9eff46b04c5a04778e40d7092fed3fda52ca Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Mar 2023 21:55:30 +0100
Subject: atomics: Provide atomic_add_negative() variants

atomic_add_negative() does not provide the relaxed/acquire/release
variants.

Provide them in preparation for a new scalable reference count algorithm.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de
---
 include/linux/atomic/atomic-arch-fallback.h | 208 ++++++++++++++++++++++++++--
 include/linux/atomic/atomic-instrumented.h  |  68 ++++++++-
 include/linux/atomic/atomic-long.h          |  38 ++++-
 scripts/atomic/atomics.tbl                  |   2 +-
 scripts/atomic/fallbacks/add_negative       |  11 +-
 5 files changed, 309 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h
index 77bc5522e61c..4226379a232d 100644
--- a/include/linux/atomic/atomic-arch-fallback.h
+++ b/include/linux/atomic/atomic-arch-fallback.h
@@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v)
 #define arch_atomic_inc_and_test arch_atomic_inc_and_test
 #endif
 
+#ifndef arch_atomic_add_negative_relaxed
+#ifdef arch_atomic_add_negative
+#define arch_atomic_add_negative_acquire arch_atomic_add_negative
+#define arch_atomic_add_negative_release arch_atomic_add_negative
+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative
+#endif /* arch_atomic_add_negative */
+
 #ifndef arch_atomic_add_negative
 /**
- * arch_atomic_add_negative - add and test if negative
+ * arch_atomic_add_negative - Add and test if negative
  * @i: integer value to add
  * @v: pointer of type atomic_t
  *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
  */
 static __always_inline bool
 arch_atomic_add_negative(int i, atomic_t *v)
@@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v)
 #define arch_atomic_add_negative arch_atomic_add_negative
 #endif
 
+#ifndef arch_atomic_add_negative_acquire
+/**
+ * arch_atomic_add_negative_acquire - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic_add_negative_acquire(int i, atomic_t *v)
+{
+	return arch_atomic_add_return_acquire(i, v) < 0;
+}
+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
+#endif
+
+#ifndef arch_atomic_add_negative_release
+/**
+ * arch_atomic_add_negative_release - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic_add_negative_release(int i, atomic_t *v)
+{
+	return arch_atomic_add_return_release(i, v) < 0;
+}
+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
+#endif
+
+#ifndef arch_atomic_add_negative_relaxed
+/**
+ * arch_atomic_add_negative_relaxed - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic_add_negative_relaxed(int i, atomic_t *v)
+{
+	return arch_atomic_add_return_relaxed(i, v) < 0;
+}
+#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed
+#endif
+
+#else /* arch_atomic_add_negative_relaxed */
+
+#ifndef arch_atomic_add_negative_acquire
+static __always_inline bool
+arch_atomic_add_negative_acquire(int i, atomic_t *v)
+{
+	bool ret = arch_atomic_add_negative_relaxed(i, v);
+	__atomic_acquire_fence();
+	return ret;
+}
+#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire
+#endif
+
+#ifndef arch_atomic_add_negative_release
+static __always_inline bool
+arch_atomic_add_negative_release(int i, atomic_t *v)
+{
+	__atomic_release_fence();
+	return arch_atomic_add_negative_relaxed(i, v);
+}
+#define arch_atomic_add_negative_release arch_atomic_add_negative_release
+#endif
+
+#ifndef arch_atomic_add_negative
+static __always_inline bool
+arch_atomic_add_negative(int i, atomic_t *v)
+{
+	bool ret;
+	__atomic_pre_full_fence();
+	ret = arch_atomic_add_negative_relaxed(i, v);
+	__atomic_post_full_fence();
+	return ret;
+}
+#define arch_atomic_add_negative arch_atomic_add_negative
+#endif
+
+#endif /* arch_atomic_add_negative_relaxed */
+
 #ifndef arch_atomic_fetch_add_unless
 /**
  * arch_atomic_fetch_add_unless - add unless the number is already a given value
@@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v)
 #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
 #endif
 
+#ifndef arch_atomic64_add_negative_relaxed
+#ifdef arch_atomic64_add_negative
+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative
+#define arch_atomic64_add_negative_release arch_atomic64_add_negative
+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative
+#endif /* arch_atomic64_add_negative */
+
 #ifndef arch_atomic64_add_negative
 /**
- * arch_atomic64_add_negative - add and test if negative
+ * arch_atomic64_add_negative - Add and test if negative
  * @i: integer value to add
  * @v: pointer of type atomic64_t
  *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
  */
 static __always_inline bool
 arch_atomic64_add_negative(s64 i, atomic64_t *v)
@@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v)
 #define arch_atomic64_add_negative arch_atomic64_add_negative
 #endif
 
+#ifndef arch_atomic64_add_negative_acquire
+/**
+ * arch_atomic64_add_negative_acquire - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+	return arch_atomic64_add_return_acquire(i, v) < 0;
+}
+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
+#endif
+
+#ifndef arch_atomic64_add_negative_release
+/**
+ * arch_atomic64_add_negative_release - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+	return arch_atomic64_add_return_release(i, v) < 0;
+}
+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
+#endif
+
+#ifndef arch_atomic64_add_negative_relaxed
+/**
+ * arch_atomic64_add_negative_relaxed - Add and test if negative
+ * @i: integer value to add
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
+ */
+static __always_inline bool
+arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
+{
+	return arch_atomic64_add_return_relaxed(i, v) < 0;
+}
+#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed
+#endif
+
+#else /* arch_atomic64_add_negative_relaxed */
+
+#ifndef arch_atomic64_add_negative_acquire
+static __always_inline bool
+arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+	bool ret = arch_atomic64_add_negative_relaxed(i, v);
+	__atomic_acquire_fence();
+	return ret;
+}
+#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire
+#endif
+
+#ifndef arch_atomic64_add_negative_release
+static __always_inline bool
+arch_atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+	__atomic_release_fence();
+	return arch_atomic64_add_negative_relaxed(i, v);
+}
+#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release
+#endif
+
+#ifndef arch_atomic64_add_negative
+static __always_inline bool
+arch_atomic64_add_negative(s64 i, atomic64_t *v)
+{
+	bool ret;
+	__atomic_pre_full_fence();
+	ret = arch_atomic64_add_negative_relaxed(i, v);
+	__atomic_post_full_fence();
+	return ret;
+}
+#define arch_atomic64_add_negative arch_atomic64_add_negative
+#endif
+
+#endif /* arch_atomic64_add_negative_relaxed */
+
 #ifndef arch_atomic64_fetch_add_unless
 /**
  * arch_atomic64_fetch_add_unless - add unless the number is already a given value
@@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v)
 #endif
 
 #endif /* _LINUX_ATOMIC_FALLBACK_H */
-// b5e87bdd5ede61470c29f7a7e4de781af3770f09
+// 00071fffa021cec66f6290d706d69c91df87bade
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 7a139ec030b0..0496816738ca 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v)
 	return arch_atomic_add_negative(i, v);
 }
 
+static __always_inline bool
+atomic_add_negative_acquire(int i, atomic_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+atomic_add_negative_release(int i, atomic_t *v)
+{
+	kcsan_release();
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_add_negative_release(i, v);
+}
+
+static __always_inline bool
+atomic_add_negative_relaxed(int i, atomic_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_add_negative_relaxed(i, v);
+}
+
 static __always_inline int
 atomic_fetch_add_unless(atomic_t *v, int a, int u)
 {
@@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v)
 	return arch_atomic64_add_negative(i, v);
 }
 
+static __always_inline bool
+atomic64_add_negative_acquire(s64 i, atomic64_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic64_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+atomic64_add_negative_release(s64 i, atomic64_t *v)
+{
+	kcsan_release();
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic64_add_negative_release(i, v);
+}
+
+static __always_inline bool
+atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic64_add_negative_relaxed(i, v);
+}
+
 static __always_inline s64
 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
 {
@@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v)
 	return arch_atomic_long_add_negative(i, v);
 }
 
+static __always_inline bool
+atomic_long_add_negative_acquire(long i, atomic_long_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_long_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+atomic_long_add_negative_release(long i, atomic_long_t *v)
+{
+	kcsan_release();
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_long_add_negative_release(i, v);
+}
+
+static __always_inline bool
+atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
+{
+	instrument_atomic_read_write(v, sizeof(*v));
+	return arch_atomic_long_add_negative_relaxed(i, v);
+}
+
 static __always_inline long
 atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
@@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
 })
 
 #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
+// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a
diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h
index 800b8c35992d..2fc51ba66beb 100644
--- a/include/linux/atomic/atomic-long.h
+++ b/include/linux/atomic/atomic-long.h
@@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
 	return arch_atomic64_add_negative(i, v);
 }
 
+static __always_inline bool
+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
+{
+	return arch_atomic64_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
+{
+	return arch_atomic64_add_negative_release(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
+{
+	return arch_atomic64_add_negative_relaxed(i, v);
+}
+
 static __always_inline long
 arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
@@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v)
 	return arch_atomic_add_negative(i, v);
 }
 
+static __always_inline bool
+arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
+{
+	return arch_atomic_add_negative_acquire(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_release(long i, atomic_long_t *v)
+{
+	return arch_atomic_add_negative_release(i, v);
+}
+
+static __always_inline bool
+arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
+{
+	return arch_atomic_add_negative_relaxed(i, v);
+}
+
 static __always_inline long
 arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
 {
@@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v)
 
 #endif /* CONFIG_64BIT */
 #endif /* _LINUX_ATOMIC_LONG_H */
-// e8f0e08ff072b74d180eabe2ad001282b38c2c88
+// a194c07d7d2f4b0e178d3c118c919775d5d65f50
diff --git a/scripts/atomic/atomics.tbl b/scripts/atomic/atomics.tbl
index fbee2f6190d9..85ca8d9b5c27 100644
--- a/scripts/atomic/atomics.tbl
+++ b/scripts/atomic/atomics.tbl
@@ -33,7 +33,7 @@ try_cmpxchg		B	v	p:old	i:new
 sub_and_test		b	i	v
 dec_and_test		b	v
 inc_and_test		b	v
-add_negative		b	i	v
+add_negative		B	i	v
 add_unless		fb	v	i:a	i:u
 inc_not_zero		b	v
 inc_unless_negative	b	v
diff --git a/scripts/atomic/fallbacks/add_negative b/scripts/atomic/fallbacks/add_negative
index 15caa2eb2371..e5980abf5904 100755
--- a/scripts/atomic/fallbacks/add_negative
+++ b/scripts/atomic/fallbacks/add_negative
@@ -1,16 +1,15 @@
 cat <<EOF
 /**
- * arch_${atomic}_add_negative - add and test if negative
+ * arch_${atomic}_add_negative${order} - Add and test if negative
  * @i: integer value to add
  * @v: pointer of type ${atomic}_t
  *
- * Atomically adds @i to @v and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
+ * Atomically adds @i to @v and returns true if the result is negative,
+ * or false when the result is greater than or equal to zero.
  */
 static __always_inline bool
-arch_${atomic}_add_negative(${int} i, ${atomic}_t *v)
+arch_${atomic}_add_negative${order}(${int} i, ${atomic}_t *v)
 {
-	return arch_${atomic}_add_return(i, v) < 0;
+	return arch_${atomic}_add_return${order}(i, v) < 0;
 }
 EOF
-- 
cgit v1.2.3


From ee1ee6db07795d9637bc5e8993a8ddcf886541ef Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Mar 2023 21:55:31 +0100
Subject: atomics: Provide rcuref - scalable reference counting

atomic_t based reference counting, including refcount_t, uses
atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is
implemented with an atomic_try_cmpxchg() loop. High contention of the
reference count leads to retry loops and scales badly. There is nothing to
improve on this implementation as the semantics have to be preserved.

Provide rcuref as a scalable alternative solution which is suitable for RCU
managed objects. Similar to refcount_t it comes with overflow and underflow
detection and mitigation.

rcuref treats the underlying atomic_t as an unsigned integer and partitions
this space into zones:

  0x00000000 - 0x7FFFFFFF	valid zone (1 .. (INT_MAX + 1) references)
  0x80000000 - 0xBFFFFFFF	saturation zone
  0xC0000000 - 0xFFFFFFFE	dead zone
  0xFFFFFFFF   			no reference

rcuref_get() unconditionally increments the reference count with
atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the
reference count with atomic_add_negative_release().

This unconditional increment avoids the inc_not_zero() problem, but
requires a more complex implementation on the put() side when the count
drops from 0 to -1.

When this transition is detected then it is attempted to mark the reference
count dead, by setting it to the midpoint of the dead zone with a single
atomic_cmpxchg_release() operation. This operation can fail due to a
concurrent rcuref_get() elevating the reference count from -1 to 0 again.

If the unconditional increment in rcuref_get() hits a reference count which
is marked dead (or saturated) it will detect it after the fact and bring
back the reference count to the midpoint of the respective zone. The zones
provide enough tolerance which makes it practically impossible to escape
from a zone.

The racy implementation of rcuref_put() requires to protect rcuref_put()
against a grace period ending in order to prevent a subtle use after
free. As RCU is the only mechanism which allows to protect against that, it
is not possible to fully replace the atomic_inc_not_zero() based
implementation of refcount_t with this scheme.

The final drop is slightly more expensive than the atomic_dec_return()
counterpart, but that's not the case which this is optimized for. The
optimization is on the high-frequency get()/put() pairs and their
scalability.

The performance of an uncontended rcuref_get()/put() pair where the put()
is not dropping the last reference is still on par with the plain atomic
operations, while at the same time providing overflow and underflow
detection and mitigation.

The performance of rcuref compared to plain atomic_inc_not_zero() and
atomic_dec_return() based reference counting under contention:

 - Micro benchmark: All CPUs running an increment/decrement loop on an
    elevated reference count, which means the 0 to -1 transition never
    happens.

    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.3X to 4.7X

 - Conversion of dst_entry::__refcnt to rcuref and testing with the
    localhost memtier/memcached benchmark. That benchmark shows the
    reference count contention prominently.

    The performance gain depends on microarchitecture and the number of
    CPUs and has been observed in the range of 1.1X to 2.6X over the
    previous fix for the false sharing issue vs. struct
    dst_entry::__refcnt.

    When memtier is run over a real 1Gb network connection, there is a
    small gain on top of the false sharing fix. The two changes combined
    result in a 2%-5% total gain for that networked test.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de
---
 include/linux/rcuref.h | 155 +++++++++++++++++++++++++++
 include/linux/types.h  |   6 ++
 lib/Makefile           |   2 +-
 lib/rcuref.c           | 281 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 443 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/rcuref.h
 create mode 100644 lib/rcuref.c

(limited to 'include')

diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h
new file mode 100644
index 000000000000..2c8bfd0f1b6b
--- /dev/null
+++ b/include/linux/rcuref.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_RCUREF_H
+#define _LINUX_RCUREF_H
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/limits.h>
+#include <linux/lockdep.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+
+#define RCUREF_ONEREF		0x00000000U
+#define RCUREF_MAXREF		0x7FFFFFFFU
+#define RCUREF_SATURATED	0xA0000000U
+#define RCUREF_RELEASED		0xC0000000U
+#define RCUREF_DEAD		0xE0000000U
+#define RCUREF_NOREF		0xFFFFFFFFU
+
+/**
+ * rcuref_init - Initialize a rcuref reference count with the given reference count
+ * @ref:	Pointer to the reference count
+ * @cnt:	The initial reference count typically '1'
+ */
+static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
+{
+	atomic_set(&ref->refcnt, cnt - 1);
+}
+
+/**
+ * rcuref_read - Read the number of held reference counts of a rcuref
+ * @ref:	Pointer to the reference count
+ *
+ * Return: The number of held references (0 ... N)
+ */
+static inline unsigned int rcuref_read(rcuref_t *ref)
+{
+	unsigned int c = atomic_read(&ref->refcnt);
+
+	/* Return 0 if within the DEAD zone. */
+	return c >= RCUREF_RELEASED ? 0 : c + 1;
+}
+
+extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);
+
+/**
+ * rcuref_get - Acquire one reference on a rcuref reference count
+ * @ref:	Pointer to the reference count
+ *
+ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See documentation in lib/rcuref.c
+ *
+ * Return:
+ *	False if the attempt to acquire a reference failed. This happens
+ *	when the last reference has been put already
+ *
+ *	True if a reference was successfully acquired
+ */
+static inline __must_check bool rcuref_get(rcuref_t *ref)
+{
+	/*
+	 * Unconditionally increase the reference count. The saturation and
+	 * dead zones provide enough tolerance for this.
+	 */
+	if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
+		return true;
+
+	/* Handle the cases inside the saturation and dead zones */
+	return rcuref_get_slowpath(ref);
+}
+
+extern __must_check bool rcuref_put_slowpath(rcuref_t *ref);
+
+/*
+ * Internal helper. Do not invoke directly.
+ */
+static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
+{
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
+			 "suspicious rcuref_put_rcusafe() usage");
+	/*
+	 * Unconditionally decrease the reference count. The saturation and
+	 * dead zones provide enough tolerance for this.
+	 */
+	if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
+		return false;
+
+	/*
+	 * Handle the last reference drop and cases inside the saturation
+	 * and dead zones.
+	 */
+	return rcuref_put_slowpath(ref);
+}
+
+/**
+ * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe
+ * @ref:	Pointer to the reference count
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Can be invoked from contexts, which guarantee that no grace period can
+ * happen which would free the object concurrently if the decrement drops
+ * the last reference and the slowpath races against a concurrent get() and
+ * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
+ *
+ * Return:
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely release the
+ *	object which is protected by the reference counter.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	release the protected object.
+ */
+static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
+{
+	return __rcuref_put(ref);
+}
+
+/**
+ * rcuref_put -- Release one reference for a rcuref reference count
+ * @ref:	Pointer to the reference count
+ *
+ * Can be invoked from any context.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides an acquire ordering on success such that free()
+ * must come after.
+ *
+ * Return:
+ *
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely schedule the
+ *	object, which is protected by the reference counter, for
+ *	deconstruction.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	deconstruct the protected object.
+ */
+static inline __must_check bool rcuref_put(rcuref_t *ref)
+{
+	bool released;
+
+	preempt_disable();
+	released = __rcuref_put(ref);
+	preempt_enable();
+	return released;
+}
+
+#endif
diff --git a/include/linux/types.h b/include/linux/types.h
index ea8cf60a8a79..688fb943556a 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -175,6 +175,12 @@ typedef struct {
 } atomic64_t;
 #endif
 
+typedef struct {
+	atomic_t refcnt;
+} rcuref_t;
+
+#define RCUREF_INIT(i)	{ .refcnt = ATOMIC_INIT((i) - 1) } /* stored as @i - 1 */
+
 struct list_head {
 	struct list_head *next, *prev;
 };
diff --git a/lib/Makefile b/lib/Makefile
index baf2821f7a00..31a3a257fd49 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o rhashtable.o base64.o \
-	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
+	 once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
 obj-y += string_helpers.o
diff --git a/lib/rcuref.c b/lib/rcuref.c
new file mode 100644
index 000000000000..5ec00a4a64d1
--- /dev/null
+++ b/lib/rcuref.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * rcuref - A scalable reference count implementation for RCU managed objects
+ *
+ * rcuref is provided to replace open coded reference count implementations
+ * based on atomic_t. It protects explicitly RCU managed objects which can
+ * be visible even after the last reference has been dropped and the object
+ * is heading towards destruction.
+ *
+ * A common usage pattern is:
+ *
+ * get()
+ *	rcu_read_lock();
+ *	p = get_ptr();
+ *	if (p && !atomic_inc_not_zero(&p->refcnt))
+ *		p = NULL;
+ *	rcu_read_unlock();
+ *	return p;
+ *
+ * put()
+ *	if (!atomic_dec_return(&p->refcnt)) {
+ *		remove_ptr(p);
+ *		kfree_rcu(p, rcu);
+ *	}
+ *
+ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
+ * O(N^2) behaviour under contention with N concurrent operations.
+ *
+ * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
+ * better under contention.
+ *
+ * Why not refcount?
+ * =================
+ *
+ * In principle it should be possible to make refcount use the rcuref
+ * scheme, but the destruction race described below cannot be prevented
+ * unless the protected object is RCU managed.
+ *
+ * Theory of operation
+ * ===================
+ *
+ * rcuref uses an unsigned integer reference counter. As long as the
+ * counter value is greater than or equal to RCUREF_ONEREF and not larger
+ * than RCUREF_MAXREF the reference is alive:
+ *
+ * ONEREF   MAXREF                SATURATED             RELEASED      DEAD    NOREF
+ * 0        0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
+ * <---valid --------> <-------saturation zone--------> <-----dead zone----->
+ *
+ * The get() and put() operations do unconditional increments and
+ * decrements. The result is checked after the operation. This optimizes
+ * for the fast path.
+ *
+ * If the reference count is saturated or dead, then the increments and
+ * decrements are not harmful as the reference count still stays in the
+ * respective zones and is always set back to SATURATED resp. DEAD. The
+ * zones have room for 2^28 racing operations in each direction, which
+ * makes it practically impossible to escape the zones.
+ *
+ * Once the last reference is dropped the reference count becomes
+ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
+ * slowpath then tries to set the reference count from RCUREF_NOREF to
+ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
+ * concurrent rcuref_get() can acquire the reference count and bring it
+ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
+ *
+ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
+ * DEAD + 1, which is inside the dead zone. If that happens the reference
+ * count is put back to DEAD.
+ *
+ * The actual race is possible due to the unconditional increment and
+ * decrements in rcuref_get() and rcuref_put():
+ *
+ *	T1				T2
+ *	get()				put()
+ *					if (atomic_add_negative(-1, &ref->refcnt))
+ *		succeeds->			atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
+ *
+ *	atomic_add_negative(1, &ref->refcnt);	<- Elevates refcount to DEAD + 1
+ *
+ * As the result of T1's add is negative, the get() goes into the slow path
+ * and observes refcnt being in the dead zone which makes the operation fail.
+ *
+ * Possible critical states:
+ *
+ *	Context Counter	References	Operation
+ *	T1	0	1		init()
+ *	T2	1	2		get()
+ *	T1	0	1		put()
+ *	T2     -1	0		put() tries to mark dead
+ *	T1	0	1		get()
+ *	T2	0	1		put() mark dead fails
+ *	T1     -1	0		put() tries to mark dead
+ *	T1    DEAD	0		put() mark dead succeeds
+ *	T2    DEAD+1	0		get() fails and puts it back to DEAD
+ *
+ * Of course there are more complex scenarios, but the above illustrates
+ * the working principle. The rest is left to the imagination of the
+ * reader.
+ *
+ * Deconstruction race
+ * ===================
+ *
+ * The release operation must be protected by prohibiting a grace period in
+ * order to prevent a possible use after free:
+ *
+ *	T1				T2
+ *	put()				get()
+ *	// ref->refcnt = ONEREF
+ *	if (!atomic_add_negative(-1, &ref->refcnt))
+ *		return false;				<- Not taken
+ *
+ *	// ref->refcnt == NOREF
+ *	--> preemption
+ *					// Elevates ref->refcnt to ONEREF
+ *					if (!atomic_add_negative(1, &ref->refcnt))
+ *						return true;			<- taken
+ *
+ *					if (put(&p->ref)) { <-- Succeeds
+ *						remove_pointer(p);
+ *						kfree_rcu(p, rcu);
+ *					}
+ *
+ *		RCU grace period ends, object is freed
+ *
+ *	atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);	<- UAF
+ *
+ * This is prevented by disabling preemption around the put() operation as
+ * that's in most kernel configurations cheaper than a rcu_read_lock() /
+ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it
+ * prevents the grace period which keeps the object alive until all put()
+ * operations complete.
+ *
+ * Saturation protection
+ * =====================
+ *
+ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
+ * Once this is exceeded the reference count becomes stale by setting it
+ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents
+ * wrap arounds which obviously cause worse problems than a memory
+ * leak. When saturation is reached a warning is emitted.
+ *
+ * Race conditions
+ * ===============
+ *
+ * All reference count increment/decrement operations are unconditional and
+ * only verified after the fact. This optimizes for the good case and takes
+ * the occasional race vs. a dead or already saturated refcount into
+ * account. The saturation and dead zones are large enough to accommodate
+ * for that.
+ *
+ * Memory ordering
+ * ===============
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object to increase the
+ * reference count on will provide the ordering. For locked data
+ * structures, it's the lock acquire, for RCU/lockless data structures it's
+ * the dependent load.
+ *
+ * rcuref_get() provides a control dependency ordering future stores which
+ * ensures that the object is not modified when acquiring a reference
+ * fails.
+ *
+ * rcuref_put() provides release order, i.e. all prior loads and stores
+ * will be issued before. It also provides a control dependency ordering
+ * against the subsequent destruction of the object.
+ *
+ * If rcuref_put() successfully dropped the last reference and marked the
+ * object DEAD it also provides acquire ordering.
+ */
+
+#include <linux/export.h>
+#include <linux/rcuref.h>
+
+/**
+ * rcuref_get_slowpath - Slowpath of rcuref_get()
+ * @ref:	Pointer to the reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ *	False if the reference count was already marked dead
+ *
+ *	True if the reference count is saturated, which prevents the
+ *	object from being deconstructed ever.
+ */
+bool rcuref_get_slowpath(rcuref_t *ref)
+{
+	unsigned int cnt = atomic_read(&ref->refcnt);
+
+	/*
+	 * If the reference count was already marked dead, undo the
+	 * increment so it stays in the middle of the dead zone and return
+	 * fail.
+	 */
+	if (cnt >= RCUREF_RELEASED) {
+		atomic_set(&ref->refcnt, RCUREF_DEAD);
+		return false;
+	}
+
+	/*
+	 * If it was saturated, warn and mark it so. In case the increment
+	 * was already on a saturated value restore the saturation
+	 * marker. This keeps it in the middle of the saturation zone and
+	 * prevents the reference count from overflowing. This leaks the
+	 * object memory, but prevents the obvious reference count overflow
+	 * damage.
+	 */
+	if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
+		atomic_set(&ref->refcnt, RCUREF_SATURATED);
+	return true;
+}
+EXPORT_SYMBOL_GPL(rcuref_get_slowpath);
+
+/**
+ * rcuref_put_slowpath - Slowpath of __rcuref_put()
+ * @ref:	Pointer to the reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely schedule the
+ *	object, which is protected by the reference counter, for
+ *	deconstruction.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	deconstruct the protected object.
+ */
+bool rcuref_put_slowpath(rcuref_t *ref)
+{
+	unsigned int cnt = atomic_read(&ref->refcnt);
+
+	/* Did this drop the last reference? */
+	if (likely(cnt == RCUREF_NOREF)) {
+		/*
+		 * Carefully try to set the reference count to RCUREF_DEAD.
+		 *
+		 * This can fail if a concurrent get() operation has
+		 * elevated it again or the corresponding put() even marked
+		 * it dead already. Both are valid situations and do not
+		 * require a retry. If this fails the caller is not
+		 * allowed to deconstruct the object.
+		 */
+		if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF)
+			return false;
+
+		/*
+		 * The caller can safely schedule the object for
+		 * deconstruction. Provide acquire ordering.
+		 */
+		smp_acquire__after_ctrl_dep();
+		return true;
+	}
+
+	/*
+	 * If the reference count was already in the dead zone, then this
+	 * put() operation is imbalanced. Warn, put the reference count back to
+	 * DEAD and tell the caller to not deconstruct the object.
+	 */
+	if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
+		atomic_set(&ref->refcnt, RCUREF_DEAD);
+		return false;
+	}
+
+	/*
+	 * This is a put() operation on a saturated refcount. Restore the
+	 * mean saturation value and tell the caller to not deconstruct the
+	 * object.
+	 */
+	if (cnt > RCUREF_MAXREF)
+		atomic_set(&ref->refcnt, RCUREF_SATURATED);
+	return false;
+}
+EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
-- 
cgit v1.2.3


From d288a162dd1c73507da582966f17dd226e34a0c0 Mon Sep 17 00:00:00 2001
From: Wangyang Guo <wangyang.guo@intel.com>
Date: Thu, 23 Mar 2023 21:55:29 +0100
Subject: net: dst: Prevent false sharing vs. dst_entry:: __refcnt

dst_entry::__refcnt is highly contended in scenarios where many connections
happen from and to the same IP. The reference count is an atomic_t, so the
reference count operations have to take the cache-line exclusive.

Aside of the unavoidable reference count contention there is another
significant problem which is caused by that: False sharing.

perf top identified two affected read accesses. dst_entry::lwtstate and
rtable::rt_genid.

dst_entry:__refcnt is located at offset 64 of dst_entry, which puts it into
a separate cacheline vs. the read mostly members located at the beginning
of the struct.

That prevents false sharing vs. the struct members in the first 64
bytes of the structure, but there is also

  dst_entry::lwtstate

which is located after the reference count and in the same cache line. This
member is read after a reference count has been acquired.

struct rtable embeds a struct dst_entry at offset 0. struct dst_entry has a
size of 112 bytes, which means that the struct members of rtable which
follow the dst member share the same cache line as dst_entry::__refcnt.
Especially

  rtable::rt_genid

is also read by the contexts which have a reference count acquired
already.

When dst_entry:__refcnt is incremented or decremented via an atomic
operation these read accesses stall. This was found when analysing the
memtier benchmark in 1:100 mode, which amplifies the problem extremely.

Move the rt[6i]_uncached[_list] members out of struct rtable and struct
rt6_info into struct dst_entry to provide padding and move the lwtstate
member after that so it ends up in the same cache line.

The resulting improvement depends on the micro-architecture and the number
of CPUs. It ranges from +20% to +120% with a localhost memtier/memcached
benchmark.

[ tglx: Rearrange struct ]

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20230323102800.042297517@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h       | 15 ++++++++++++++-
 include/net/ip6_fib.h   |  3 ---
 include/net/ip6_route.h |  2 +-
 include/net/route.h     |  3 ---
 net/ipv4/route.c        | 20 ++++++++++----------
 net/ipv4/xfrm4_policy.c |  4 ++--
 net/ipv6/route.c        | 26 +++++++++++++-------------
 net/ipv6/xfrm6_policy.c |  4 ++--
 8 files changed, 42 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index d67fda89cd0f..81f2279ea911 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -69,15 +69,28 @@ struct dst_entry {
 #endif
 	int			__use;
 	unsigned long		lastuse;
-	struct lwtunnel_state   *lwtstate;
 	struct rcu_head		rcu_head;
 	short			error;
 	short			__pad;
 	__u32			tclassid;
 #ifndef CONFIG_64BIT
+	struct lwtunnel_state   *lwtstate;
 	atomic_t		__refcnt;	/* 32-bit offset 64 */
 #endif
 	netdevice_tracker	dev_tracker;
+
+	/*
+	 * Used by rtable and rt6_info. Moves lwtstate into the next cache
+	 * line on 64bit so that lwtstate does not cause false sharing with
+	 * __refcnt under contention of __refcnt. This also puts the
+	 * frequently accessed members of rtable and rt6_info out of the
+	 * __refcnt cache line.
+	 */
+	struct list_head	rt_uncached;
+	struct uncached_list	*rt_uncached_list;
+#ifdef CONFIG_64BIT
+	struct lwtunnel_state   *lwtstate;
+#endif
 };
 
 struct dst_metrics {
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6268963d9599..79570cb4ea9c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -217,9 +217,6 @@ struct rt6_info {
 	struct inet6_dev		*rt6i_idev;
 	u32				rt6i_flags;
 
-	struct list_head		rt6i_uncached;
-	struct uncached_list		*rt6i_uncached_list;
-
 	/* more non-fragment space at head required */
 	unsigned short			rt6i_nfheader_len;
 };
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 81ee387a1fc4..3556595ce59a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -100,7 +100,7 @@ static inline struct dst_entry *ip6_route_output(struct net *net,
 static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
 {
 	if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
-	    !list_empty(&rt->rt6i_uncached))
+	    !list_empty(&rt->dst.rt_uncached))
 		ip6_rt_put(rt);
 }
 
diff --git a/include/net/route.h b/include/net/route.h
index fe00b0a2e475..bcc367cf3aa2 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -78,9 +78,6 @@ struct rtable {
 	/* Miscellaneous cached information */
 	u32			rt_mtu_locked:1,
 				rt_pmtu:31;
-
-	struct list_head	rt_uncached;
-	struct uncached_list	*rt_uncached_list;
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6a0a0bb452e9..2a3d14d95ada 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1508,20 +1508,20 @@ void rt_add_uncached_list(struct rtable *rt)
 {
 	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
 
-	rt->rt_uncached_list = ul;
+	rt->dst.rt_uncached_list = ul;
 
 	spin_lock_bh(&ul->lock);
-	list_add_tail(&rt->rt_uncached, &ul->head);
+	list_add_tail(&rt->dst.rt_uncached, &ul->head);
 	spin_unlock_bh(&ul->lock);
 }
 
 void rt_del_uncached_list(struct rtable *rt)
 {
-	if (!list_empty(&rt->rt_uncached)) {
-		struct uncached_list *ul = rt->rt_uncached_list;
+	if (!list_empty(&rt->dst.rt_uncached)) {
+		struct uncached_list *ul = rt->dst.rt_uncached_list;
 
 		spin_lock_bh(&ul->lock);
-		list_del_init(&rt->rt_uncached);
+		list_del_init(&rt->dst.rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
 }
@@ -1546,13 +1546,13 @@ void rt_flush_dev(struct net_device *dev)
 			continue;
 
 		spin_lock_bh(&ul->lock);
-		list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
+		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 			if (rt->dst.dev != dev)
 				continue;
 			rt->dst.dev = blackhole_netdev;
 			netdev_ref_replace(dev, blackhole_netdev,
 					   &rt->dst.dev_tracker, GFP_ATOMIC);
-			list_move(&rt->rt_uncached, &ul->quarantine);
+			list_move(&rt->dst.rt_uncached, &ul->quarantine);
 		}
 		spin_unlock_bh(&ul->lock);
 	}
@@ -1644,7 +1644,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_uses_gateway = 0;
 		rt->rt_gw_family = 0;
 		rt->rt_gw4 = 0;
-		INIT_LIST_HEAD(&rt->rt_uncached);
+		INIT_LIST_HEAD(&rt->dst.rt_uncached);
 
 		rt->dst.output = ip_output;
 		if (flags & RTCF_LOCAL)
@@ -1675,7 +1675,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
 			new_rt->rt_gw4 = rt->rt_gw4;
 		else if (rt->rt_gw_family == AF_INET6)
 			new_rt->rt_gw6 = rt->rt_gw6;
-		INIT_LIST_HEAD(&new_rt->rt_uncached);
+		INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
 
 		new_rt->dst.input = rt->dst.input;
 		new_rt->dst.output = rt->dst.output;
@@ -2859,7 +2859,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		else if (rt->rt_gw_family == AF_INET6)
 			rt->rt_gw6 = ort->rt_gw6;
 
-		INIT_LIST_HEAD(&rt->rt_uncached);
+		INIT_LIST_HEAD(&rt->dst.rt_uncached);
 	}
 
 	dst_release(dst_orig);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 3d0dfa6cf9f9..47861c8b7340 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -91,7 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 		xdst->u.rt.rt_gw6 = rt->rt_gw6;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
 	xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
-	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
+	INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
 	rt_add_uncached_list(&xdst->u.rt);
 
 	return 0;
@@ -121,7 +121,7 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
 	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
 
 	dst_destroy_metrics_generic(dst);
-	if (xdst->u.rt.rt_uncached_list)
+	if (xdst->u.rt.dst.rt_uncached_list)
 		rt_del_uncached_list(&xdst->u.rt);
 	xfrm_dst_destroy(xdst);
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 244df77fac87..12e163dec34e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_info *rt)
 {
 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 
-	rt->rt6i_uncached_list = ul;
+	rt->dst.rt_uncached_list = ul;
 
 	spin_lock_bh(&ul->lock);
-	list_add_tail(&rt->rt6i_uncached, &ul->head);
+	list_add_tail(&rt->dst.rt_uncached, &ul->head);
 	spin_unlock_bh(&ul->lock);
 }
 
 void rt6_uncached_list_del(struct rt6_info *rt)
 {
-	if (!list_empty(&rt->rt6i_uncached)) {
-		struct uncached_list *ul = rt->rt6i_uncached_list;
+	if (!list_empty(&rt->dst.rt_uncached)) {
+		struct uncached_list *ul = rt->dst.rt_uncached_list;
 
 		spin_lock_bh(&ul->lock);
-		list_del_init(&rt->rt6i_uncached);
+		list_del_init(&rt->dst.rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
 }
@@ -169,7 +169,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
 			continue;
 
 		spin_lock_bh(&ul->lock);
-		list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
+		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 			struct inet6_dev *rt_idev = rt->rt6i_idev;
 			struct net_device *rt_dev = rt->dst.dev;
 			bool handled = false;
@@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev)
 				handled = true;
 			}
 			if (handled)
-				list_move(&rt->rt6i_uncached,
+				list_move(&rt->dst.rt_uncached,
 					  &ul->quarantine);
 		}
 		spin_unlock_bh(&ul->lock);
@@ -334,7 +334,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
 static void rt6_info_init(struct rt6_info *rt)
 {
 	memset_after(rt, 0, dst);
-	INIT_LIST_HEAD(&rt->rt6i_uncached);
+	INIT_LIST_HEAD(&rt->dst.rt_uncached);
 }
 
 /* allocate dst with ip6_dst_ops */
@@ -2638,7 +2638,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
 	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
 	rt6 = (struct rt6_info *)dst;
 	/* For dst cached in uncached_list, refcnt is already taken. */
-	if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+	if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
 		dst = &net->ipv6.ip6_null_entry->dst;
 		dst_hold(dst);
 	}
@@ -2748,7 +2748,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
 	from = rcu_dereference(rt->from);
 
 	if (from && (rt->rt6i_flags & RTF_PCPU ||
-	    unlikely(!list_empty(&rt->rt6i_uncached))))
+	    unlikely(!list_empty(&rt->dst.rt_uncached))))
 		dst_ret = rt6_dst_from_check(rt, from, cookie);
 	else
 		dst_ret = rt6_check(rt, from, cookie);
@@ -6477,7 +6477,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
 			 ip6_template_metrics, true);
-	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
+	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 	net->ipv6.fib6_has_custom_rules = false;
@@ -6489,7 +6489,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
 			 ip6_template_metrics, true);
-	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
+	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
 
 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -6499,7 +6499,7 @@ static int __net_init ip6_route_net_init(struct net *net)
 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
 			 ip6_template_metrics, true);
-	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
+	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
 #ifdef CONFIG_IPV6_SUBTREES
 	net->ipv6.fib6_routes_require_src = 0;
 #endif
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ea435eba3053..2b493f8d0091 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -89,7 +89,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
 	xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
 	xdst->u.rt6.rt6i_src = rt->rt6i_src;
-	INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
+	INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
 	rt6_uncached_list_add(&xdst->u.rt6);
 
 	return 0;
@@ -121,7 +121,7 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
 	if (likely(xdst->u.rt6.rt6i_idev))
 		in6_dev_put(xdst->u.rt6.rt6i_idev);
 	dst_destroy_metrics_generic(dst);
-	if (xdst->u.rt6.rt6i_uncached_list)
+	if (xdst->u.rt6.dst.rt_uncached_list)
 		rt6_uncached_list_del(&xdst->u.rt6);
 	xfrm_dst_destroy(xdst);
 }
-- 
cgit v1.2.3


From bc9d3a9f2afca189a6ae40225b6985e3c775375e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 23 Mar 2023 21:55:32 +0100
Subject: net: dst: Switch to rcuref_t reference counting

Under high contention dst_entry::__refcnt becomes a significant bottleneck.

atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
high retry rates on contention.

Switch the reference count to rcuref_t which results in a significant
performance gain. Rename the reference count member to __rcuref to reflect
the change.

The gain depends on the micro-architecture and the number of concurrent
operations and has been measured in the range of +25% to +130% with a
localhost memtier/memcached benchmark which amplifies the problem
massively.

Running the memtier/memcached benchmark over a real (1Gb) network
connection, the conversion on top of the false sharing fix for struct
dst_entry::__refcnt results in a total gain in the 2%-5% range over the
upstream baseline.

Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20230307125538.989175656@linutronix.de
Link: https://lore.kernel.org/r/20230323102800.215027837@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst.h               | 19 ++++++++++---------
 include/net/sock.h              |  2 +-
 net/bridge/br_nf_core.c         |  2 +-
 net/core/dst.c                  | 26 +++++---------------------
 net/core/rtnetlink.c            |  2 +-
 net/ipv6/route.c                |  6 +++---
 net/netfilter/ipvs/ip_vs_xmit.c |  4 ++--
 7 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 81f2279ea911..78884429deed 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -16,6 +16,7 @@
 #include <linux/bug.h>
 #include <linux/jiffies.h>
 #include <linux/refcount.h>
+#include <linux/rcuref.h>
 #include <net/neighbour.h>
 #include <asm/processor.h>
 #include <linux/indirect_call_wrapper.h>
@@ -61,11 +62,11 @@ struct dst_entry {
 	unsigned short		trailer_len;	/* space to reserve at tail */
 
 	/*
-	 * __refcnt wants to be on a different cache line from
+	 * __rcuref wants to be on a different cache line from
 	 * input/output/ops or performance tanks badly
 	 */
 #ifdef CONFIG_64BIT
-	atomic_t		__refcnt;	/* 64-bit offset 64 */
+	rcuref_t		__rcuref;	/* 64-bit offset 64 */
 #endif
 	int			__use;
 	unsigned long		lastuse;
@@ -75,16 +76,16 @@ struct dst_entry {
 	__u32			tclassid;
 #ifndef CONFIG_64BIT
 	struct lwtunnel_state   *lwtstate;
-	atomic_t		__refcnt;	/* 32-bit offset 64 */
+	rcuref_t		__rcuref;	/* 32-bit offset 64 */
 #endif
 	netdevice_tracker	dev_tracker;
 
 	/*
 	 * Used by rtable and rt6_info. Moves lwtstate into the next cache
 	 * line on 64bit so that lwtstate does not cause false sharing with
-	 * __refcnt under contention of __refcnt. This also puts the
+	 * __rcuref under contention of __rcuref. This also puts the
 	 * frequently accessed members of rtable and rt6_info out of the
-	 * __refcnt cache line.
+	 * __rcuref cache line.
 	 */
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
@@ -238,10 +239,10 @@ static inline void dst_hold(struct dst_entry *dst)
 {
 	/*
 	 * If your kernel compilation stops here, please check
-	 * the placement of __refcnt in struct dst_entry
+	 * the placement of __rcuref in struct dst_entry
 	 */
-	BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
-	WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
+	BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);
+	WARN_ON(!rcuref_get(&dst->__rcuref));
 }
 
 static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb
  */
 static inline bool dst_hold_safe(struct dst_entry *dst)
 {
-	return atomic_inc_not_zero(&dst->__refcnt);
+	return rcuref_get(&dst->__rcuref);
 }
 
 /**
diff --git a/include/net/sock.h b/include/net/sock.h
index 573f2bf7e0de..5edf0038867c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2131,7 +2131,7 @@ sk_dst_get(struct sock *sk)
 
 	rcu_read_lock();
 	dst = rcu_dereference(sk->sk_dst_cache);
-	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+	if (dst && !rcuref_get(&dst->__rcuref))
 		dst = NULL;
 	rcu_read_unlock();
 	return dst;
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 8c69f0c95a8e..98aea5485aae 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 {
 	struct rtable *rt = &br->fake_rtable;
 
-	atomic_set(&rt->dst.__refcnt, 1);
+	rcuref_init(&rt->dst.__rcuref, 1);
 	rt->dst.dev = br->dev;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
 	rt->dst.flags	= DST_NOXFRM | DST_FAKE_RTABLE;
diff --git a/net/core/dst.c b/net/core/dst.c
index 31c08a3386d3..3247e84045ca 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
 	dst->tclassid = 0;
 #endif
 	dst->lwtstate = NULL;
-	atomic_set(&dst->__refcnt, initial_ref);
+	rcuref_init(&dst->__rcuref, initial_ref);
 	dst->__use = 0;
 	dst->lastuse = jiffies;
 	dst->flags = flags;
@@ -162,31 +162,15 @@ EXPORT_SYMBOL(dst_dev_put);
 
 void dst_release(struct dst_entry *dst)
 {
-	if (dst) {
-		int newrefcnt;
-
-		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
-			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-					     __func__, dst, newrefcnt);
-		if (!newrefcnt)
-			call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
-	}
+	if (dst && rcuref_put(&dst->__rcuref))
+		call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
 }
 EXPORT_SYMBOL(dst_release);
 
 void dst_release_immediate(struct dst_entry *dst)
 {
-	if (dst) {
-		int newrefcnt;
-
-		newrefcnt = atomic_dec_return(&dst->__refcnt);
-		if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
-			net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
-					     __func__, dst, newrefcnt);
-		if (!newrefcnt)
-			dst_destroy(dst);
-	}
+	if (dst && rcuref_put(&dst->__rcuref))
+		dst_destroy(dst);
 }
 EXPORT_SYMBOL(dst_release_immediate);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b7b1661d0d56..906aebdc566b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -843,7 +843,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 	if (dst) {
 		ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
 		ci.rta_used = dst->__use;
-		ci.rta_clntref = atomic_read(&dst->__refcnt);
+		ci.rta_clntref = rcuref_read(&dst->__rcuref);
 	}
 	if (expires) {
 		unsigned long clock;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 12e163dec34e..35085fc0cf15 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_entry_template = {
 
 static const struct rt6_info ip6_null_entry_template = {
 	.dst = {
-		.__refcnt	= ATOMIC_INIT(1),
+		.__rcuref	= RCUREF_INIT(1),
 		.__use		= 1,
 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 		.error		= -ENETUNREACH,
@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = {
 
 static const struct rt6_info ip6_prohibit_entry_template = {
 	.dst = {
-		.__refcnt	= ATOMIC_INIT(1),
+		.__rcuref	= RCUREF_INIT(1),
 		.__use		= 1,
 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 		.error		= -EACCES,
@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
 
 static const struct rt6_info ip6_blk_hole_entry_template = {
 	.dst = {
-		.__refcnt	= ATOMIC_INIT(1),
+		.__rcuref	= RCUREF_INIT(1),
 		.__use		= 1,
 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 		.error		= -EINVAL,
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 80448885c3d7..99c349c0d968 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
 				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
-				  atomic_read(&rt->dst.__refcnt));
+				  rcuref_read(&rt->dst.__rcuref));
 		}
 		if (ret_saddr)
 			*ret_saddr = dest_dst->dst_saddr.ip;
@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			spin_unlock_bh(&dest->dst_lock);
 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
 				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
-				  atomic_read(&rt->dst.__refcnt));
+				  rcuref_read(&rt->dst.__rcuref));
 		}
 		if (ret_saddr)
 			*ret_saddr = dest_dst->dst_saddr.in6;
-- 
cgit v1.2.3


From 634f1a7110b439c65fd8a809171c1d2d28bcea6f Mon Sep 17 00:00:00 2001
From: Bobby Eshleman <bobby.eshleman@bytedance.com>
Date: Mon, 27 Mar 2023 19:11:51 +0000
Subject: vsock: support sockmap

This patch adds sockmap support for vsock sockets. It is intended to be
usable by all transports, but only the virtio and loopback transports
are implemented.

SOCK_STREAM, SOCK_DGRAM, and SOCK_SEQPACKET are all supported.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/vhost/vsock.c                   |   1 +
 include/linux/virtio_vsock.h            |   1 +
 include/net/af_vsock.h                  |  17 ++++
 net/vmw_vsock/Makefile                  |   1 +
 net/vmw_vsock/af_vsock.c                |  64 ++++++++++--
 net/vmw_vsock/virtio_transport.c        |   2 +
 net/vmw_vsock/virtio_transport_common.c |  25 +++++
 net/vmw_vsock/vsock_bpf.c               | 174 ++++++++++++++++++++++++++++++++
 net/vmw_vsock/vsock_loopback.c          |   2 +
 9 files changed, 281 insertions(+), 6 deletions(-)
 create mode 100644 net/vmw_vsock/vsock_bpf.c

(limited to 'include')

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index c8e6087769a1..6578db78f0ae 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
 
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vhost_transport_send_pkt,
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 3f9c16611306..c58453699ee9 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
 void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
 void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
 int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor);
 #endif /* _LINUX_VIRTIO_VSOCK_H */
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 568a87c5e0d0..0e7504a42925 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -75,6 +75,7 @@ struct vsock_sock {
 	void *trans;
 };
 
+s64 vsock_connectible_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_data(struct vsock_sock *vsk);
 s64 vsock_stream_has_space(struct vsock_sock *vsk);
 struct sock *vsock_create_connected(struct sock *parent);
@@ -173,6 +174,9 @@ struct vsock_transport {
 
 	/* Addressing. */
 	u32 (*get_local_cid)(void);
+
+	/* Read a single skb */
+	int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
 };
 
 /**** CORE ****/
@@ -225,5 +229,18 @@ int vsock_init_tap(void);
 int vsock_add_tap(struct vsock_tap *vt);
 int vsock_remove_tap(struct vsock_tap *vt);
 void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
+int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+			      int flags);
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t len, int flags);
+
+#ifdef CONFIG_BPF_SYSCALL
+extern struct proto vsock_proto;
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
+void __init vsock_bpf_build_proto(void);
+#else
+static inline void __init vsock_bpf_build_proto(void)
+{}
+#endif
 
 #endif /* __AF_VSOCK_H__ */
diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile
index 6a943ec95c4a..5da74c4a9f1d 100644
--- a/net/vmw_vsock/Makefile
+++ b/net/vmw_vsock/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o
 obj-$(CONFIG_VSOCKETS_LOOPBACK) += vsock_loopback.o
 
 vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o
+vsock-$(CONFIG_BPF_SYSCALL) += vsock_bpf.o
 
 vsock_diag-y += diag.o
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 19aea7cba26e..5f2dda35c980 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -116,10 +116,13 @@ static void vsock_sk_destruct(struct sock *sk);
 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
 /* Protocol family. */
-static struct proto vsock_proto = {
+struct proto vsock_proto = {
 	.name = "AF_VSOCK",
 	.owner = THIS_MODULE,
 	.obj_size = sizeof(struct vsock_sock),
+#ifdef CONFIG_BPF_SYSCALL
+	.psock_update_sk_prot = vsock_bpf_update_proto,
+#endif
 };
 
 /* The default peer timeout indicates how long we will wait for a peer response
@@ -865,7 +868,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(vsock_stream_has_data);
 
-static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
+s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 {
 	struct sock *sk = sk_vsock(vsk);
 
@@ -874,6 +877,7 @@ static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 	else
 		return vsock_stream_has_data(vsk);
 }
+EXPORT_SYMBOL_GPL(vsock_connectible_has_data);
 
 s64 vsock_stream_has_space(struct vsock_sock *vsk)
 {
@@ -1131,6 +1135,13 @@ static __poll_t vsock_poll(struct file *file, struct socket *sock,
 	return mask;
 }
 
+static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+
+	return vsk->transport->read_skb(vsk, read_actor);
+}
+
 static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 			       size_t len)
 {
@@ -1242,18 +1253,42 @@ static int vsock_dgram_connect(struct socket *sock,
 	memcpy(&vsk->remote_addr, remote_addr, sizeof(vsk->remote_addr));
 	sock->state = SS_CONNECTED;
 
+	/* sock map disallows redirection of non-TCP sockets with sk_state !=
+	 * TCP_ESTABLISHED (see sock_map_redirect_allowed()), so we set
+	 * TCP_ESTABLISHED here to allow redirection of connected vsock dgrams.
+	 *
+	 * This doesn't seem to be abnormal state for datagram sockets, as the
+	 * same approach can be seen in other datagram socket types as well
+	 * (such as unix sockets).
+	 */
+	sk->sk_state = TCP_ESTABLISHED;
+
 out:
 	release_sock(sk);
 	return err;
 }
 
-static int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
-			       size_t len, int flags)
+int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+			size_t len, int flags)
 {
-	struct vsock_sock *vsk = vsock_sk(sock->sk);
+#ifdef CONFIG_BPF_SYSCALL
+	const struct proto *prot;
+#endif
+	struct vsock_sock *vsk;
+	struct sock *sk;
+
+	sk = sock->sk;
+	vsk = vsock_sk(sk);
+
+#ifdef CONFIG_BPF_SYSCALL
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot != &vsock_proto)
+		return prot->recvmsg(sk, msg, len, flags, NULL);
+#endif
 
 	return vsk->transport->dgram_dequeue(vsk, msg, len, flags);
 }
+EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
 
 static const struct proto_ops vsock_dgram_ops = {
 	.family = PF_VSOCK,
@@ -1272,6 +1307,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.recvmsg = vsock_dgram_recvmsg,
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
+	.read_skb = vsock_read_skb,
 };
 
 static int vsock_transport_cancel_pkt(struct vsock_sock *vsk)
@@ -2086,13 +2122,16 @@ out:
 	return err;
 }
 
-static int
+int
 vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			  int flags)
 {
 	struct sock *sk;
 	struct vsock_sock *vsk;
 	const struct vsock_transport *transport;
+#ifdef CONFIG_BPF_SYSCALL
+	const struct proto *prot;
+#endif
 	int err;
 
 	sk = sock->sk;
@@ -2139,6 +2178,14 @@ vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		goto out;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	prot = READ_ONCE(sk->sk_prot);
+	if (prot != &vsock_proto) {
+		release_sock(sk);
+		return prot->recvmsg(sk, msg, len, flags, NULL);
+	}
+#endif
+
 	if (sk->sk_type == SOCK_STREAM)
 		err = __vsock_stream_recvmsg(sk, msg, len, flags);
 	else
@@ -2148,6 +2195,7 @@ out:
 	release_sock(sk);
 	return err;
 }
+EXPORT_SYMBOL_GPL(vsock_connectible_recvmsg);
 
 static int vsock_set_rcvlowat(struct sock *sk, int val)
 {
@@ -2188,6 +2236,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
 	.set_rcvlowat = vsock_set_rcvlowat,
+	.read_skb = vsock_read_skb,
 };
 
 static const struct proto_ops vsock_seqpacket_ops = {
@@ -2209,6 +2258,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
 	.recvmsg = vsock_connectible_recvmsg,
 	.mmap = sock_no_mmap,
 	.sendpage = sock_no_sendpage,
+	.read_skb = vsock_read_skb,
 };
 
 static int vsock_create(struct net *net, struct socket *sock,
@@ -2348,6 +2398,8 @@ static int __init vsock_init(void)
 		goto err_unregister_proto;
 	}
 
+	vsock_bpf_build_proto();
+
 	return 0;
 
 err_unregister_proto:
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 28b5a8e8e094..e95df847176b 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -457,6 +457,8 @@ static struct virtio_transport virtio_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = virtio_transport_send_pkt,
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7fc178c3ee07..f39639dd6eb5 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1418,6 +1418,31 @@ int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
 }
 EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
 
+int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	struct sock *sk = sk_vsock(vsk);
+	struct sk_buff *skb;
+	int off = 0;
+	int copied;
+	int err;
+
+	spin_lock_bh(&vvs->rx_lock);
+	/* Use __skb_recv_datagram() for race-free handling of the receive. It
+	 * works for types other than dgrams.
+	 */
+	skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err);
+	spin_unlock_bh(&vvs->rx_lock);
+
+	if (!skb)
+		return err;
+
+	copied = recv_actor(sk, skb);
+	kfree_skb(skb);
+	return copied;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_read_skb);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Asias He");
 MODULE_DESCRIPTION("common code for virtio vsock");
diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
new file mode 100644
index 000000000000..a3c97546ab84
--- /dev/null
+++ b/net/vmw_vsock/vsock_bpf.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Bobby Eshleman <bobby.eshleman@bytedance.com>
+ *
+ * Based off of net/unix/unix_bpf.c
+ */
+
+#include <linux/bpf.h>
+#include <linux/module.h>
+#include <linux/skmsg.h>
+#include <linux/socket.h>
+#include <linux/wait.h>
+#include <net/af_vsock.h>
+#include <net/sock.h>
+
+#define vsock_sk_has_data(__sk, __psock)				\
+		({	!skb_queue_empty(&(__sk)->sk_receive_queue) ||	\
+			!skb_queue_empty(&(__psock)->ingress_skb) ||	\
+			!list_empty(&(__psock)->ingress_msg);		\
+		})
+
+static struct proto *vsock_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(vsock_prot_lock);
+static struct proto vsock_bpf_prot;
+
+static bool vsock_has_data(struct sock *sk, struct sk_psock *psock)
+{
+	struct vsock_sock *vsk = vsock_sk(sk);
+	s64 ret;
+
+	ret = vsock_connectible_has_data(vsk);
+	if (ret > 0)
+		return true;
+
+	return vsock_sk_has_data(sk, psock);
+}
+
+static bool vsock_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo)
+{
+	bool ret;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		return true;
+
+	if (!timeo)
+		return false;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = vsock_has_data(sk, psock);
+	if (!ret) {
+		wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		ret = vsock_has_data(sk, psock);
+	}
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+static int __vsock_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+	int err;
+
+	if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
+		err = vsock_connectible_recvmsg(sock, msg, len, flags);
+	else if (sk->sk_type == SOCK_DGRAM)
+		err = vsock_dgram_recvmsg(sock, msg, len, flags);
+	else
+		err = -EPROTOTYPE;
+
+	return err;
+}
+
+static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
+			     size_t len, int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return __vsock_recvmsg(sk, msg, len, flags);
+
+	lock_sock(sk);
+	if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) {
+		release_sock(sk);
+		sk_psock_put(sk, psock);
+		return __vsock_recvmsg(sk, msg, len, flags);
+	}
+
+	copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	while (copied == 0) {
+		long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+		if (!vsock_msg_wait_data(sk, psock, timeo)) {
+			copied = -EAGAIN;
+			break;
+		}
+
+		if (sk_psock_queue_empty(psock)) {
+			release_sock(sk);
+			sk_psock_put(sk, psock);
+			return __vsock_recvmsg(sk, msg, len, flags);
+		}
+
+		copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+	}
+
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+
+	return copied;
+}
+
+/* Copy of original proto with updated sock_map methods */
+static struct proto vsock_bpf_prot = {
+	.close = sock_map_close,
+	.recvmsg = vsock_bpf_recvmsg,
+	.sock_is_readable = sk_msg_is_readable,
+	.unhash = sock_map_unhash,
+};
+
+static void vsock_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+	*prot        = *base;
+	prot->close  = sock_map_close;
+	prot->recvmsg = vsock_bpf_recvmsg;
+	prot->sock_is_readable = sk_msg_is_readable;
+}
+
+static void vsock_bpf_check_needs_rebuild(struct proto *ops)
+{
+	/* Paired with the smp_store_release() below. */
+	if (unlikely(ops != smp_load_acquire(&vsock_prot_saved))) {
+		spin_lock_bh(&vsock_prot_lock);
+		if (likely(ops != vsock_prot_saved)) {
+			vsock_bpf_rebuild_protos(&vsock_bpf_prot, ops);
+			/* Make sure proto function pointers are updated before publishing the
+			 * pointer to the struct.
+			 */
+			smp_store_release(&vsock_prot_saved, ops);
+		}
+		spin_unlock_bh(&vsock_prot_lock);
+	}
+}
+
+int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+	struct vsock_sock *vsk;
+
+	if (restore) {
+		sk->sk_write_space = psock->saved_write_space;
+		sock_replace_proto(sk, psock->sk_proto);
+		return 0;
+	}
+
+	vsk = vsock_sk(sk);
+	if (!vsk->transport)
+		return -ENODEV;
+
+	if (!vsk->transport->read_skb)
+		return -EOPNOTSUPP;
+
+	vsock_bpf_check_needs_rebuild(psock->sk_proto);
+	sock_replace_proto(sk, &vsock_bpf_prot);
+	return 0;
+}
+
+void __init vsock_bpf_build_proto(void)
+{
+	vsock_bpf_rebuild_protos(&vsock_bpf_prot, &vsock_proto);
+}
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 671e03240fc5..40753b661c13 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -94,6 +94,8 @@ static struct virtio_transport loopback_transport = {
 		.notify_send_pre_enqueue  = virtio_transport_notify_send_pre_enqueue,
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vsock_loopback_send_pkt,
-- 
cgit v1.2.3


From 8cdc3223e78c43e1b60ea1c536a103e32fdca3c5 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 27 Mar 2023 16:54:54 -0700
Subject: ipv6: Remove in6addr_any alternatives.

Some code defines the IPv6 wildcard address as a local variable and
uses it with memcmp() or ipv6_addr_equal().

Let's use in6addr_any and ipv6_addr_any() instead.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c |  5 ++---
 include/net/ip6_fib.h                                     |  9 +++------
 include/trace/events/fib.h                                |  5 ++---
 include/trace/events/fib6.h                               |  5 +----
 net/ethtool/ioctl.c                                       | 10 +++++-----
 net/ipv4/inet_hashtables.c                                | 11 ++++-------
 6 files changed, 17 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
index a108e73c9f66..20c2d2ecaf93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
@@ -98,7 +98,6 @@ int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
 #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
 	else if (ip_version == 6) {
 		int ipv6_size = MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6);
-		struct in6_addr zerov6 = {};
 
 		daddr = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 				     outer_headers.dst_ipv4_dst_ipv6.ipv6_layout.ipv6);
@@ -106,8 +105,8 @@ int mlx5e_tc_set_attr_rx_tun(struct mlx5e_tc_flow *flow,
 				     outer_headers.src_ipv4_src_ipv6.ipv6_layout.ipv6);
 		memcpy(&tun_attr->dst_ip.v6, daddr, ipv6_size);
 		memcpy(&tun_attr->src_ip.v6, saddr, ipv6_size);
-		if (!memcmp(&tun_attr->dst_ip.v6, &zerov6, sizeof(zerov6)) ||
-		    !memcmp(&tun_attr->src_ip.v6, &zerov6, sizeof(zerov6)))
+		if (ipv6_addr_any(&tun_attr->dst_ip.v6) ||
+		    ipv6_addr_any(&tun_attr->src_ip.v6))
 			return 0;
 	}
 #endif
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 79570cb4ea9c..05e6f756feaf 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -469,13 +469,10 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 	rcu_read_lock();
 
 	from = rcu_dereference(rt->from);
-	if (from) {
+	if (from)
 		*addr = from->fib6_prefsrc.addr;
-	} else {
-		struct in6_addr in6_zero = {};
-
-		*addr = in6_zero;
-	}
+	else
+		*addr = in6addr_any;
 
 	rcu_read_unlock();
 }
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index c2300c407f58..76297ecd4935 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -36,7 +36,6 @@ TRACE_EVENT(fib_table_lookup,
 	),
 
 	TP_fast_assign(
-		struct in6_addr in6_zero = {};
 		struct net_device *dev;
 		struct in6_addr *in6;
 		__be32 *p32;
@@ -74,7 +73,7 @@ TRACE_EVENT(fib_table_lookup,
 				*p32 = nhc->nhc_gw.ipv4;
 
 				in6 = (struct in6_addr *)__entry->gw6;
-				*in6 = in6_zero;
+				*in6 = in6addr_any;
 			} else if (nhc->nhc_gw_family == AF_INET6) {
 				p32 = (__be32 *) __entry->gw4;
 				*p32 = 0;
@@ -87,7 +86,7 @@ TRACE_EVENT(fib_table_lookup,
 			*p32 = 0;
 
 			in6 = (struct in6_addr *)__entry->gw6;
-			*in6 = in6_zero;
+			*in6 = in6addr_any;
 		}
 	),
 
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 6e821eb79450..4d3e607b3cde 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -68,11 +68,8 @@ TRACE_EVENT(fib6_table_lookup,
 			strcpy(__entry->name, "-");
 		}
 		if (res->f6i == net->ipv6.fib6_null_entry) {
-			struct in6_addr in6_zero = {};
-
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = in6_zero;
-
+			*in6 = in6addr_any;
 		} else if (res->nh) {
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = res->nh->fib_nh_gw6;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 646b3e490c71..59adc4e6e9ee 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -27,6 +27,7 @@
 #include <linux/net.h>
 #include <linux/pm_runtime.h>
 #include <net/devlink.h>
+#include <net/ipv6.h>
 #include <net/xdp_sock_drv.h>
 #include <net/flow_offload.h>
 #include <linux/ethtool_netlink.h>
@@ -3127,7 +3128,6 @@ struct ethtool_rx_flow_rule *
 ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 {
 	const struct ethtool_rx_flow_spec *fs = input->fs;
-	static struct in6_addr zero_addr = {};
 	struct ethtool_rx_flow_match *match;
 	struct ethtool_rx_flow_rule *flow;
 	struct flow_action_entry *act;
@@ -3233,20 +3233,20 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 
 		v6_spec = &fs->h_u.tcp_ip6_spec;
 		v6_m_spec = &fs->m_u.tcp_ip6_spec;
-		if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) {
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) {
 			memcpy(&match->key.ipv6.src, v6_spec->ip6src,
 			       sizeof(match->key.ipv6.src));
 			memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src,
 			       sizeof(match->mask.ipv6.src));
 		}
-		if (memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) {
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
 			memcpy(&match->key.ipv6.dst, v6_spec->ip6dst,
 			       sizeof(match->key.ipv6.dst));
 			memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst,
 			       sizeof(match->mask.ipv6.dst));
 		}
-		if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) ||
-		    memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) {
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) ||
+		    !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
 			match->dissector.used_keys |=
 				BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS);
 			match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] =
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6edae3886885..e7391bf310a7 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -826,13 +826,11 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const
 				      unsigned short port, int l3mdev, const struct sock *sk)
 {
 #if IS_ENABLED(CONFIG_IPV6)
-	struct in6_addr addr_any = {};
-
 	if (sk->sk_family != tb->family) {
 		if (sk->sk_family == AF_INET)
 			return net_eq(ib2_net(tb), net) && tb->port == port &&
 				tb->l3mdev == l3mdev &&
-				ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any);
+				ipv6_addr_any(&tb->v6_rcv_saddr);
 
 		return false;
 	}
@@ -840,7 +838,7 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const
 	if (sk->sk_family == AF_INET6)
 		return net_eq(ib2_net(tb), net) && tb->port == port &&
 			tb->l3mdev == l3mdev &&
-			ipv6_addr_equal(&tb->v6_rcv_saddr, &addr_any);
+			ipv6_addr_any(&tb->v6_rcv_saddr);
 	else
 #endif
 		return net_eq(ib2_net(tb), net) && tb->port == port &&
@@ -866,11 +864,10 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in
 {
 	struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
 	u32 hash;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct in6_addr addr_any = {};
 
+#if IS_ENABLED(CONFIG_IPV6)
 	if (sk->sk_family == AF_INET6)
-		hash = ipv6_portaddr_hash(net, &addr_any, port);
+		hash = ipv6_portaddr_hash(net, &in6addr_any, port);
 	else
 #endif
 		hash = ipv4_portaddr_hash(net, 0, port);
-- 
cgit v1.2.3


From 954d1fa1ac93aa8a66f7d9a9ba545cf7f020d348 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 28 Mar 2023 10:57:59 +0800
Subject: macvlan: Add netlink attribute for broadcast cutoff

Make the broadcast cutoff configurable through netlink.  Note
that macvlan is weird because there is no central device for
us to configure (the lowerdev could be anything).  So all the
options are duplicated over what could be thousands of child
devices.

IFLA_MACVLAN_BC_QUEUE_LEN took the approach of taking the maximum
of all child device settings.  This is unnecessary as we could
simply store the option in the port device and take the last
child device that gets updated as the value to use.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvlan.c              | 31 +++++++++++++++++++++++++++++--
 include/uapi/linux/if_link.h       |  1 +
 tools/include/uapi/linux/if_link.h |  1 +
 3 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 62b4748d3836..4215106adc40 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -47,6 +47,7 @@ struct macvlan_port {
 	struct sk_buff_head	bc_queue;
 	struct work_struct	bc_work;
 	u32			bc_queue_len_used;
+	int			bc_cutoff;
 	u32			flags;
 	int			count;
 	struct hlist_head	vlan_source_hash[MACVLAN_HASH_SIZE];
@@ -814,6 +815,12 @@ static void macvlan_compute_filter(unsigned long *mc_filter,
 	}
 }
 
+static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan)
+{
+	macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL,
+			       vlan->port->bc_cutoff);
+}
+
 static void macvlan_set_mac_lists(struct net_device *dev)
 {
 	struct macvlan_dev *vlan = netdev_priv(dev);
@@ -838,8 +845,16 @@ static void macvlan_set_mac_lists(struct net_device *dev)
 	 */
 	macvlan_compute_filter(vlan->port->mc_filter, vlan->lowerdev, NULL,
 			       0);
-	macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL,
-			       1);
+	macvlan_recompute_bc_filter(vlan);
+}
+
+static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff)
+{
+	if (vlan->port->bc_cutoff == cutoff)
+		return;
+
+	vlan->port->bc_cutoff = cutoff;
+	macvlan_recompute_bc_filter(vlan);
 }
 
 static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
@@ -1254,6 +1269,7 @@ static int macvlan_port_create(struct net_device *dev)
 		INIT_HLIST_HEAD(&port->vlan_source_hash[i]);
 
 	port->bc_queue_len_used = 0;
+	port->bc_cutoff = 1;
 	skb_queue_head_init(&port->bc_queue);
 	INIT_WORK(&port->bc_work, macvlan_process_broadcast);
 
@@ -1527,6 +1543,10 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN])
 		vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]);
 
+	if (data && data[IFLA_MACVLAN_BC_CUTOFF])
+		update_port_bc_cutoff(
+			vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF]));
+
 	err = register_netdevice(dev);
 	if (err < 0)
 		goto destroy_macvlan_port;
@@ -1623,6 +1643,10 @@ static int macvlan_changelink(struct net_device *dev,
 		update_port_bc_queue_len(vlan->port);
 	}
 
+	if (data && data[IFLA_MACVLAN_BC_CUTOFF])
+		update_port_bc_cutoff(
+			vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF]));
+
 	if (set_mode)
 		vlan->mode = mode;
 	if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
@@ -1703,6 +1727,9 @@ static int macvlan_fill_info(struct sk_buff *skb,
 		goto nla_put_failure;
 	if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used))
 		goto nla_put_failure;
+	if (port->bc_cutoff != 1 &&
+	    nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff))
+		goto nla_put_failure;
 	return 0;
 
 nla_put_failure:
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 57ceb788250f..8d679688efe0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -635,6 +635,7 @@ enum {
 	IFLA_MACVLAN_MACADDR_COUNT,
 	IFLA_MACVLAN_BC_QUEUE_LEN,
 	IFLA_MACVLAN_BC_QUEUE_LEN_USED,
+	IFLA_MACVLAN_BC_CUTOFF,
 	__IFLA_MACVLAN_MAX,
 };
 
diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index 901d98b865a1..39e659c83cfd 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -605,6 +605,7 @@ enum {
 	IFLA_MACVLAN_MACADDR_COUNT,
 	IFLA_MACVLAN_BC_QUEUE_LEN,
 	IFLA_MACVLAN_BC_QUEUE_LEN_USED,
+	IFLA_MACVLAN_BC_CUTOFF,
 	__IFLA_MACVLAN_MAX,
 };
 
-- 
cgit v1.2.3


From 9a8aac92eba90b3b7c71d0531db535f5588388f5 Mon Sep 17 00:00:00 2001
From: Kieran Frewen <kieran.frewen@morsemicro.com>
Date: Fri, 24 Feb 2023 10:29:17 +1300
Subject: wifi: nl80211: support advertising S1G capabilities

Include S1G capabilities in netlink band info messages.

Signed-off-by: Kieran Frewen <kieran.frewen@morsemicro.com>
Co-developed-by: Gilad Itzkovitch <gilad.itzkovitch@morsemicro.com>
Signed-off-by: Gilad Itzkovitch <gilad.itzkovitch@morsemicro.com>
Link: https://lore.kernel.org/r/20230223212917.4010246-1-gilad.itzkovitch@virscient.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  7 +++++++
 net/wireless/nl80211.c       | 10 ++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index cf4fb981e131..c59fec406da5 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4061,6 +4061,10 @@ enum nl80211_band_iftype_attr {
  * @NL80211_BAND_ATTR_EDMG_BW_CONFIG: Channel BW Configuration subfield encodes
  *	the allowed channel bandwidth configurations.
  *	Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13.
+ * @NL80211_BAND_ATTR_S1G_MCS_NSS_SET: S1G capabilities, supported S1G-MCS and NSS
+ *	set subfield, as in the S1G information IE, 5 bytes
+ * @NL80211_BAND_ATTR_S1G_CAPA: S1G capabilities information subfield as in the
+ *	S1G information IE, 10 bytes
  * @NL80211_BAND_ATTR_MAX: highest band attribute currently defined
  * @__NL80211_BAND_ATTR_AFTER_LAST: internal use
  */
@@ -4081,6 +4085,9 @@ enum nl80211_band_attr {
 	NL80211_BAND_ATTR_EDMG_CHANNELS,
 	NL80211_BAND_ATTR_EDMG_BW_CONFIG,
 
+	NL80211_BAND_ATTR_S1G_MCS_NSS_SET,
+	NL80211_BAND_ATTR_S1G_CAPA,
+
 	/* keep last */
 	__NL80211_BAND_ATTR_AFTER_LAST,
 	NL80211_BAND_ATTR_MAX = __NL80211_BAND_ATTR_AFTER_LAST - 1
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f1cd3d9130dd..2c9edb015652 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -1961,6 +1961,16 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg,
 
 	nla_nest_end(msg, nl_rates);
 
+	/* S1G capabilities */
+	if (sband->band == NL80211_BAND_S1GHZ && sband->s1g_cap.s1g &&
+	    (nla_put(msg, NL80211_BAND_ATTR_S1G_CAPA,
+		     sizeof(sband->s1g_cap.cap),
+		     sband->s1g_cap.cap) ||
+	     nla_put(msg, NL80211_BAND_ATTR_S1G_MCS_NSS_SET,
+		     sizeof(sband->s1g_cap.nss_mcs),
+		     sband->s1g_cap.nss_mcs)))
+		return -ENOBUFS;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c59647c0dc679008886756a888368da1c6d4ccd3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Mar 2023 23:50:19 +0000
Subject: net: add softnet_data.in_net_rx_action

We want to make two optimizations in napi_schedule_rps() and
____napi_schedule() which require knowing whether these helpers are
called from net_rx_action() rather than from other contexts.

sd.in_net_rx_action is only read/written by the owning cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 18a5be6ddd0f..c8c634091a65 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3188,6 +3188,7 @@ struct softnet_data {
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
+	bool			in_net_rx_action;
 #ifdef CONFIG_NET_FLOW_LIMIT
 	struct sd_flow_limit __rcu *flow_limit;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index f7050b95d125..15331edbacf4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6646,6 +6646,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	LIST_HEAD(list);
 	LIST_HEAD(repoll);
 
+	sd->in_net_rx_action = true;
 	local_irq_disable();
 	list_splice_init(&sd->poll_list, &list);
 	local_irq_enable();
@@ -6656,6 +6657,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		skb_defer_free_flush(sd);
 
 		if (list_empty(&list)) {
+			sd->in_net_rx_action = false;
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 				goto end;
 			break;
@@ -6682,6 +6684,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	list_splice(&list, &sd->poll_list);
 	if (!list_empty(&sd->poll_list))
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	else
+		sd->in_net_rx_action = false;
 
 	net_rps_action_and_irq_enable(sd);
 end:;
-- 
cgit v1.2.3


From 28c1b6df436819a7ed8a781835766e45139771a3 Mon Sep 17 00:00:00 2001
From: Eric Sage <eric_sage@apple.com>
Date: Mon, 27 Mar 2023 13:44:49 -0400
Subject: netfilter: nfnetlink_queue: enable classid socket info retrieval

This enables associating a socket with a v1 net_cls cgroup. Useful for
applying a per-cgroup policy when processing packets in userspace.

Signed-off-by: Eric Sage <eric_sage@apple.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nfnetlink_queue.h |  1 +
 net/netfilter/nfnetlink_queue.c                | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index ef7c97f21a15..efcb7c044a74 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -62,6 +62,7 @@ enum nfqnl_attr_type {
 	NFQA_VLAN,			/* nested attribute: packet vlan info */
 	NFQA_L2HDR,			/* full L2 header */
 	NFQA_PRIORITY,			/* skb->priority */
+	NFQA_CGROUP_CLASSID,		/* __u32 cgroup classid */
 
 	__NFQA_MAX
 };
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 87a9009d5234..e311462f6d98 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -29,6 +29,7 @@
 #include <linux/netfilter/nfnetlink_queue.h>
 #include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/list.h>
+#include <linux/cgroup-defs.h>
 #include <net/sock.h>
 #include <net/tcp_states.h>
 #include <net/netfilter/nf_queue.h>
@@ -301,6 +302,19 @@ nla_put_failure:
 	return -1;
 }
 
+static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+	if (sk && sk_fullsock(sk)) {
+		u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+
+		if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
+			return -1;
+	}
+#endif
+	return 0;
+}
+
 static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
 {
 	u32 seclen = 0;
@@ -406,6 +420,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 		+ nla_total_size(sizeof(u_int32_t))	/* priority */
 		+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
 		+ nla_total_size(sizeof(u_int32_t))	/* skbinfo */
+#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
+		+ nla_total_size(sizeof(u_int32_t))	/* classid */
+#endif
 		+ nla_total_size(sizeof(u_int32_t));	/* cap_len */
 
 	tstamp = skb_tstamp_cond(entskb, false);
@@ -599,6 +616,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 	    nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
 		goto nla_put_failure;
 
+	if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
+		goto nla_put_failure;
+
 	if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From a25b8b7136ad43760bd876af62b6e59abd30496c Mon Sep 17 00:00:00 2001
From: Matthieu De Beule <matthieu.debeule@proton.ch>
Date: Wed, 29 Mar 2023 12:52:18 +0000
Subject: netfilter: Correct documentation errors in nf_tables.h

NFTA_RANGE_OP incorrectly says nft_cmp_ops instead of nft_range_ops.
NFTA_LOG_GROUP and NFTA_LOG_QTHRESHOLD claim NLA_U32 instead of NLA_U16.
NFTA_EXTHDR_SREG isn't documented as a register.

Signed-off-by: Matthieu De Beule <matthieu.debeule@proton.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/uapi/linux/netfilter/nf_tables.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9c6f02c26054..c4d4d8e42dc8 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -685,7 +685,7 @@ enum nft_range_ops {
  * enum nft_range_attributes - nf_tables range expression netlink attributes
  *
  * @NFTA_RANGE_SREG: source register of data to compare (NLA_U32: nft_registers)
- * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_cmp_ops)
+ * @NFTA_RANGE_OP: cmp operation (NLA_U32: nft_range_ops)
  * @NFTA_RANGE_FROM_DATA: data range from (NLA_NESTED: nft_data_attributes)
  * @NFTA_RANGE_TO_DATA: data range to (NLA_NESTED: nft_data_attributes)
  */
@@ -878,7 +878,7 @@ enum nft_exthdr_op {
  * @NFTA_EXTHDR_LEN: extension header length (NLA_U32)
  * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32)
  * @NFTA_EXTHDR_OP: option match type (NLA_U32)
- * @NFTA_EXTHDR_SREG: option match type (NLA_U32)
+ * @NFTA_EXTHDR_SREG: source register (NLA_U32: nft_registers)
  */
 enum nft_exthdr_attributes {
 	NFTA_EXTHDR_UNSPEC,
@@ -1262,10 +1262,10 @@ enum nft_last_attributes {
 /**
  * enum nft_log_attributes - nf_tables log expression netlink attributes
  *
- * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U32)
+ * @NFTA_LOG_GROUP: netlink group to send messages to (NLA_U16)
  * @NFTA_LOG_PREFIX: prefix to prepend to log messages (NLA_STRING)
  * @NFTA_LOG_SNAPLEN: length of payload to include in netlink message (NLA_U32)
- * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U32)
+ * @NFTA_LOG_QTHRESHOLD: queue threshold (NLA_U16)
  * @NFTA_LOG_LEVEL: log level (NLA_U32)
  * @NFTA_LOG_FLAGS: logging flags (NLA_U32)
  */
-- 
cgit v1.2.3


From 2384127e98db52a6ac2577924ad9cae25f3e7472 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Wed, 29 Mar 2023 11:54:52 +0200
Subject: net/sched: act_tunnel_key: add support for "don't fragment"

Extend "act_tunnel_key" to allow specifying TUNNEL_DONT_FRAGMENT.

Suggested-by: Ilya Maximets <i.maximets@ovn.org>
Reviewed-by: Pedro Tammela <pctammela@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h | 1 +
 net/sched/act_tunnel_key.c                | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index 49ad4033951b..37c6f612f161 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -34,6 +34,7 @@ enum {
 					 */
 	TCA_TUNNEL_KEY_ENC_TOS,		/* u8 */
 	TCA_TUNNEL_KEY_ENC_TTL,		/* u8 */
+	TCA_TUNNEL_KEY_NO_FRAG,		/* flag */
 	__TCA_TUNNEL_KEY_MAX,
 };
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 2d12d2626415..0c8aa7e686ea 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -420,6 +420,9 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 		    nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
 			flags &= ~TUNNEL_CSUM;
 
+		if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG]))
+			flags |= TUNNEL_DONT_FRAGMENT;
+
 		if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
 			dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
 
@@ -747,6 +750,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 				   key->tp_dst)) ||
 		    nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
 			       !(key->tun_flags & TUNNEL_CSUM)) ||
+		    ((key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+		     nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) ||
 		    tunnel_key_opts_dump(skb, info))
 			goto nla_put_failure;
 
-- 
cgit v1.2.3


From dd2d6604407da5b1b260faee409cd601fe914ce9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 30 Mar 2023 21:47:31 -0700
Subject: net: minor reshuffle of napi_struct

napi_id is read by GRO and drivers to mark skbs, and it currently
sits at the end of the structure, in a mostly unused cache line.
Move it up into a hole, and separate the clearly control path
fields from the important ones.

Before:

struct napi_struct {
	struct list_head           poll_list;            /*     0    16 */
	long unsigned int          state;                /*    16     8 */
	int                        weight;               /*    24     4 */
	int                        defer_hard_irqs_count; /*    28     4 */
	long unsigned int          gro_bitmask;          /*    32     8 */
	int                        (*poll)(struct napi_struct *, int); /*    40     8 */
	int                        poll_owner;           /*    48     4 */

	/* XXX 4 bytes hole, try to pack */

	struct net_device *        dev;                  /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct gro_list            gro_hash[8];          /*    64   192 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	struct sk_buff *           skb;                  /*   256     8 */
	struct list_head           rx_list;              /*   264    16 */
	int                        rx_count;             /*   280     4 */

	/* XXX 4 bytes hole, try to pack */

	struct hrtimer             timer;                /*   288    64 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */
	struct list_head           dev_list;             /*   352    16 */
	struct hlist_node          napi_hash_node;       /*   368    16 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	unsigned int               napi_id;              /*   384     4 */

	/* XXX 4 bytes hole, try to pack */

	struct task_struct *       thread;               /*   392     8 */

	/* size: 400, cachelines: 7, members: 17 */
	/* sum members: 388, holes: 3, sum holes: 12 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 16 bytes */
};

After:

struct napi_struct {
	struct list_head           poll_list;            /*     0    16 */
	long unsigned int          state;                /*    16     8 */
	int                        weight;               /*    24     4 */
	int                        defer_hard_irqs_count; /*    28     4 */
	long unsigned int          gro_bitmask;          /*    32     8 */
	int                        (*poll)(struct napi_struct *, int); /*    40     8 */
	int                        poll_owner;           /*    48     4 */

	/* XXX 4 bytes hole, try to pack */

	struct net_device *        dev;                  /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct gro_list            gro_hash[8];          /*    64   192 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	struct sk_buff *           skb;                  /*   256     8 */
	struct list_head           rx_list;              /*   264    16 */
	int                        rx_count;             /*   280     4 */
	unsigned int               napi_id;              /*   284     4 */
	struct hrtimer             timer;                /*   288    64 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */
	struct task_struct *       thread;               /*   352     8 */
	struct list_head           dev_list;             /*   360    16 */
	struct hlist_node          napi_hash_node;       /*   376    16 */

	/* size: 392, cachelines: 7, members: 17 */
	/* sum members: 388, holes: 1, sum holes: 4 */
	/* paddings: 1, sum paddings: 4 */
	/* forced alignments: 1 */
	/* last cacheline: 8 bytes */
} __attribute__((__aligned__(8)));

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c8c634091a65..62e093a6d6d1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -367,11 +367,12 @@ struct napi_struct {
 	struct sk_buff		*skb;
 	struct list_head	rx_list; /* Pending GRO_NORMAL skbs */
 	int			rx_count; /* length of rx_list */
+	unsigned int		napi_id;
 	struct hrtimer		timer;
+	struct task_struct	*thread;
+	/* control-path-only fields follow */
 	struct list_head	dev_list;
 	struct hlist_node	napi_hash_node;
-	unsigned int		napi_id;
-	struct task_struct	*thread;
 };
 
 enum {
-- 
cgit v1.2.3


From c4bffeaa8d50b7279c5a76597efa4b06e709df63 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 2 Apr 2023 15:37:53 +0300
Subject: net: add struct kernel_hwtstamp_config and make
 net_hwtstamp_validate() use it

Jakub Kicinski suggested that we may want to add new UAPI for
controlling hardware timestamping through netlink in the future, and in
that case, we will be limited to the struct hwtstamp_config that is
currently passed in fixed binary format through the SIOCGHWTSTAMP and
SIOCSHWTSTAMP ioctls. It would be good if new kernel code already
started operating on an extensible kernel variant of that structure,
similar in concept to struct kernel_ethtool_coalesce vs struct
ethtool_coalesce.

Since struct hwtstamp_config is in include/uapi/linux/net_tstamp.h, here
we introduce include/linux/net_tstamp.h which shadows that other header,
but also includes it, so that existing includers of this header work as
before. In addition to that, we add the definition for the kernel-only
structure, and a helper which translates all fields by manual copying.
I am doing a manual copy in order to not force the alignment (or type)
of the fields of struct kernel_hwtstamp_config to be the same as those
of struct hwtstamp_config, even though, for now, they are the same.

Link: https://lore.kernel.org/netdev/20230330223519.36ce7d23@kernel.org/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net_tstamp.h | 33 +++++++++++++++++++++++++++++++++
 net/core/dev_ioctl.c       |  7 +++++--
 2 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/net_tstamp.h

(limited to 'include')

diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h
new file mode 100644
index 000000000000..fd67f3cc0c4b
--- /dev/null
+++ b/include/linux/net_tstamp.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_NET_TIMESTAMPING_H_
+#define _LINUX_NET_TIMESTAMPING_H_
+
+#include <uapi/linux/net_tstamp.h>
+
+/**
+ * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config
+ *
+ * @flags: see struct hwtstamp_config
+ * @tx_type: see struct hwtstamp_config
+ * @rx_filter: see struct hwtstamp_config
+ *
+ * Prefer using this structure for in-kernel processing of hardware
+ * timestamping configuration, over the inextensible struct hwtstamp_config
+ * exposed to the %SIOCGHWTSTAMP and %SIOCSHWTSTAMP ioctl UAPI.
+ */
+struct kernel_hwtstamp_config {
+	int flags;
+	int tx_type;
+	int rx_filter;
+};
+
+static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg,
+					     const struct hwtstamp_config *cfg)
+{
+	kernel_cfg->flags = cfg->flags;
+	kernel_cfg->tx_type = cfg->tx_type;
+	kernel_cfg->rx_filter = cfg->rx_filter;
+}
+
+#endif /* _LINUX_NET_TIMESTAMPING_H_ */
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 34a0da5fbcfc..c532ef4d5dff 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -183,7 +183,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
 	return err;
 }
 
-static int net_hwtstamp_validate(const struct hwtstamp_config *cfg)
+static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
 {
 	enum hwtstamp_tx_types tx_type;
 	enum hwtstamp_rx_filters rx_filter;
@@ -259,13 +259,16 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
+	struct kernel_hwtstamp_config kernel_cfg;
 	struct hwtstamp_config cfg;
 	int err;
 
 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
 		return -EFAULT;
 
-	err = net_hwtstamp_validate(&cfg);
+	hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
+
+	err = net_hwtstamp_validate(&kernel_cfg);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 88c0a6b503b7f4fffb68a8d49c3987870c5b1d6b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sun, 2 Apr 2023 15:37:55 +0300
Subject: net: create a netdev notifier for DSA to reject PTP on DSA master

The fact that PTP 2-step TX timestamping is broken on DSA switches if
the master also timestamps the same packets is documented by commit
f685e609a301 ("net: dsa: Deny PTP on master if switch supports it").
We attempt to help the users avoid shooting themselves in the foot by
making DSA reject the timestamping ioctls on an interface that is a DSA
master, and the switch tree beneath it contains switches which are aware
of PTP.

The only problem is that there isn't an established way of intercepting
ndo_eth_ioctl calls, so DSA creates avoidable burden upon the network
stack by creating a struct dsa_netdevice_ops with overlaid function
pointers that are manually checked from the relevant call sites. There
used to be 2 such dsa_netdevice_ops, but now, ndo_eth_ioctl is the only
one left.

There is an ongoing effort to migrate driver-visible hardware timestamping
control from the ndo_eth_ioctl() based API to a new ndo_hwtstamp_set()
model, but DSA actively prevents that migration, since dsa_master_ioctl()
is currently coded to manually call the master's legacy ndo_eth_ioctl(),
and so, whenever a network device driver would be converted to the new
API, DSA's restrictions would be circumvented, because any device could
be used as a DSA master.

The established way for unrelated modules to react to a net device event
is via netdevice notifiers. So we create a new notifier which gets
called whenever there is an attempt to change hardware timestamping
settings on a device.

Finally, there is another reason why a netdev notifier will be a good
idea, besides strictly DSA, and this has to do with PHY timestamping.

With ndo_eth_ioctl(), all MAC drivers must manually call
phy_has_hwtstamp() before deciding whether to act upon SIOCSHWTSTAMP,
otherwise they must pass this ioctl to the PHY driver via
phy_mii_ioctl().

With the new ndo_hwtstamp_set() API, it will be desirable to simply not
make any calls into the MAC device driver when timestamping should be
performed at the PHY level.

But there exist drivers, such as the lan966x switch, which need to
install packet traps for PTP regardless of whether they are the layer
that provides the hardware timestamps, or the PHY is. That would be
impossible to support with the new API.

The proposal there, too, is to introduce a netdev notifier which acts as
a better cue for switching drivers to add or remove PTP packet traps,
than ndo_hwtstamp_set(). The one introduced here "almost" works there as
well, except for the fact that packet traps should only be installed if
the PHY driver succeeded in enabling hardware timestamping, whereas here,
we need to deny hardware timestamping on the DSA master before it
actually gets enabled. This is why this notifier is called "PRE_", and
the notifier that would get used for PHY timestamping and packet traps
would be called NETDEV_CHANGE_HWTSTAMP. This isn't a new concept, for
example NETDEV_CHANGEUPPER and NETDEV_PRECHANGEUPPER do the same thing.

In expectation of future netlink UAPI, we also pass a non-NULL extack
pointer to the netdev notifier, and we make DSA populate it with an
informative reason for the rejection. To avoid making it go to waste, we
make the ioctl-based dev_set_hwtstamp() create a fake extack and print
the message to the kernel log.

Link: https://lore.kernel.org/netdev/20230401191215.tvveoi3lkawgg6g4@skbuf/
Link: https://lore.kernel.org/netdev/20230310164451.ls7bbs6pdzs4m6pw@skbuf/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  9 ++++++++-
 include/net/dsa.h         | 51 -----------------------------------------------
 net/core/dev.c            |  8 +++-----
 net/core/dev_ioctl.c      | 16 +++++++++++++--
 net/dsa/master.c          | 50 ++++++++++++++--------------------------------
 net/dsa/master.h          |  3 +++
 net/dsa/slave.c           | 11 ++++++++++
 7 files changed, 54 insertions(+), 94 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 62e093a6d6d1..a740be3bb911 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2878,6 +2878,7 @@ enum netdev_cmd {
 	NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 	NETDEV_XDP_FEAT_CHANGE,
+	NETDEV_PRE_CHANGE_HWTSTAMP,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
@@ -2928,6 +2929,11 @@ struct netdev_notifier_pre_changeaddr_info {
 	const unsigned char *dev_addr;
 };
 
+struct netdev_notifier_hwtstamp_info {
+	struct netdev_notifier_info info; /* must be first */
+	struct kernel_hwtstamp_config *config;
+};
+
 enum netdev_offload_xstats_type {
 	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
 };
@@ -2984,7 +2990,8 @@ netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
 }
 
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
-
+int call_netdevice_notifiers_info(unsigned long val,
+				  struct netdev_notifier_info *info);
 
 extern rwlock_t				dev_base_lock;		/* Device list lock */
 
diff --git a/include/net/dsa.h b/include/net/dsa.h
index a15f17a38eca..8903053fa5aa 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -109,16 +109,6 @@ struct dsa_device_ops {
 	bool promisc_on_master;
 };
 
-/* This structure defines the control interfaces that are overlayed by the
- * DSA layer on top of the DSA CPU/management net_device instance. This is
- * used by the core net_device layer while calling various net_device_ops
- * function pointers.
- */
-struct dsa_netdevice_ops {
-	int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr,
-			     int cmd);
-};
-
 struct dsa_lag {
 	struct net_device *dev;
 	unsigned int id;
@@ -317,11 +307,6 @@ struct dsa_port {
 	 */
 	const struct ethtool_ops *orig_ethtool_ops;
 
-	/*
-	 * Original copy of the master netdev net_device_ops
-	 */
-	const struct dsa_netdevice_ops *netdev_ops;
-
 	/* List of MAC addresses that must be forwarded on this port.
 	 * These are only valid on CPU ports and DSA links.
 	 */
@@ -1339,42 +1324,6 @@ static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb,
 #endif
 }
 
-#if IS_ENABLED(CONFIG_NET_DSA)
-static inline int __dsa_netdevice_ops_check(struct net_device *dev)
-{
-	int err = -EOPNOTSUPP;
-
-	if (!dev->dsa_ptr)
-		return err;
-
-	if (!dev->dsa_ptr->netdev_ops)
-		return err;
-
-	return 0;
-}
-
-static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr,
-				    int cmd)
-{
-	const struct dsa_netdevice_ops *ops;
-	int err;
-
-	err = __dsa_netdevice_ops_check(dev);
-	if (err)
-		return err;
-
-	ops = dev->dsa_ptr->netdev_ops;
-
-	return ops->ndo_eth_ioctl(dev, ifr, cmd);
-}
-#else
-static inline int dsa_ndo_eth_ioctl(struct net_device *dev, struct ifreq *ifr,
-				    int cmd)
-{
-	return -EOPNOTSUPP;
-}
-#endif
-
 void dsa_unregister_switch(struct dsa_switch *ds);
 int dsa_register_switch(struct dsa_switch *ds);
 void dsa_switch_shutdown(struct dsa_switch *ds);
diff --git a/net/core/dev.c b/net/core/dev.c
index 0c4b21291348..7ce5985be84b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -160,8 +160,6 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 
 static int netif_rx_internal(struct sk_buff *skb);
-static int call_netdevice_notifiers_info(unsigned long val,
-					 struct netdev_notifier_info *info);
 static int call_netdevice_notifiers_extack(unsigned long val,
 					   struct net_device *dev,
 					   struct netlink_ext_ack *extack);
@@ -1614,7 +1612,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
-	N(XDP_FEAT_CHANGE)
+	N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
@@ -1919,8 +1917,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev,
  *	are as for raw_notifier_call_chain().
  */
 
-static int call_netdevice_notifiers_info(unsigned long val,
-					 struct netdev_notifier_info *info)
+int call_netdevice_notifiers_info(unsigned long val,
+				  struct netdev_notifier_info *info)
 {
 	struct net *net = dev_net(info->dev);
 	int ret;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index c532ef4d5dff..6d772837eb3f 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -259,7 +259,11 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
+	struct netdev_notifier_hwtstamp_info info = {
+		.info.dev = dev,
+	};
 	struct kernel_hwtstamp_config kernel_cfg;
+	struct netlink_ext_ack extack = {};
 	struct hwtstamp_config cfg;
 	int err;
 
@@ -272,9 +276,17 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 	if (err)
 		return err;
 
-	err = dsa_ndo_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
-	if (err != -EOPNOTSUPP)
+	info.info.extack = &extack;
+	info.config = &kernel_cfg;
+
+	err = call_netdevice_notifiers_info(NETDEV_PRE_CHANGE_HWTSTAMP,
+					    &info.info);
+	err = notifier_to_errno(err);
+	if (err) {
+		if (extack._msg)
+			netdev_err(dev, "%s\n", extack._msg);
 		return err;
+	}
 
 	return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP);
 }
diff --git a/net/dsa/master.c b/net/dsa/master.c
index e397641382ca..c2cabe6248b1 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -195,38 +195,31 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 	}
 }
 
-static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+/* Deny PTP operations on master if there is at least one switch in the tree
+ * that is PTP capable.
+ */
+int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+				   const struct kernel_hwtstamp_config *config,
+				   struct netlink_ext_ack *extack)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
 	struct dsa_switch *ds = cpu_dp->ds;
 	struct dsa_switch_tree *dst;
-	int err = -EOPNOTSUPP;
 	struct dsa_port *dp;
 
 	dst = ds->dst;
 
-	switch (cmd) {
-	case SIOCGHWTSTAMP:
-	case SIOCSHWTSTAMP:
-		/* Deny PTP operations on master if there is at least one
-		 * switch in the tree that is PTP capable.
-		 */
-		list_for_each_entry(dp, &dst->ports, list)
-			if (dsa_port_supports_hwtstamp(dp))
-				return -EBUSY;
-		break;
+	list_for_each_entry(dp, &dst->ports, list) {
+		if (dsa_port_supports_hwtstamp(dp)) {
+			NL_SET_ERR_MSG(extack,
+				       "HW timestamping not allowed on DSA master when switch supports the operation");
+			return -EBUSY;
+		}
 	}
 
-	if (dev->netdev_ops->ndo_eth_ioctl)
-		err = dev->netdev_ops->ndo_eth_ioctl(dev, ifr, cmd);
-
-	return err;
+	return 0;
 }
 
-static const struct dsa_netdevice_ops dsa_netdev_ops = {
-	.ndo_eth_ioctl = dsa_master_ioctl,
-};
-
 static int dsa_master_ethtool_setup(struct net_device *dev)
 {
 	struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -267,15 +260,6 @@ static void dsa_master_ethtool_teardown(struct net_device *dev)
 	cpu_dp->orig_ethtool_ops = NULL;
 }
 
-static void dsa_netdev_ops_set(struct net_device *dev,
-			       const struct dsa_netdevice_ops *ops)
-{
-	if (netif_is_lag_master(dev))
-		return;
-
-	dev->dsa_ptr->netdev_ops = ops;
-}
-
 /* Keep the master always promiscuous if the tagging protocol requires that
  * (garbles MAC DA) or if it doesn't support unicast filtering, case in which
  * it would revert to promiscuous mode as soon as we call dev_uc_add() on it
@@ -414,16 +398,13 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
 	if (ret)
 		goto out_err_reset_promisc;
 
-	dsa_netdev_ops_set(dev, &dsa_netdev_ops);
-
 	ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
 	if (ret)
-		goto out_err_ndo_teardown;
+		goto out_err_ethtool_teardown;
 
 	return ret;
 
-out_err_ndo_teardown:
-	dsa_netdev_ops_set(dev, NULL);
+out_err_ethtool_teardown:
 	dsa_master_ethtool_teardown(dev);
 out_err_reset_promisc:
 	dsa_master_set_promiscuity(dev, -1);
@@ -433,7 +414,6 @@ out_err_reset_promisc:
 void dsa_master_teardown(struct net_device *dev)
 {
 	sysfs_remove_group(&dev->dev.kobj, &dsa_group);
-	dsa_netdev_ops_set(dev, NULL);
 	dsa_master_ethtool_teardown(dev);
 	dsa_master_reset_mtu(dev);
 	dsa_master_set_promiscuity(dev, -1);
diff --git a/net/dsa/master.h b/net/dsa/master.h
index 3fc0e610b5b5..80842f4e27f7 100644
--- a/net/dsa/master.h
+++ b/net/dsa/master.h
@@ -15,5 +15,8 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
 			 struct netlink_ext_ack *extack);
 void dsa_master_lag_teardown(struct net_device *lag_dev,
 			     struct dsa_port *cpu_dp);
+int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+				   const struct kernel_hwtstamp_config *config,
+				   struct netlink_ext_ack *extack);
 
 #endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 165bb2cb8431..8abc1658ac47 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -3289,6 +3289,7 @@ static int dsa_master_changeupper(struct net_device *dev,
 static int dsa_slave_netdevice_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
 {
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
 	switch (event) {
@@ -3418,6 +3419,16 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
 
 		return NOTIFY_OK;
 	}
+	case NETDEV_PRE_CHANGE_HWTSTAMP: {
+		struct netdev_notifier_hwtstamp_info *info = ptr;
+		int err;
+
+		if (!netdev_uses_dsa(dev))
+			return NOTIFY_DONE;
+
+		err = dsa_master_pre_change_hwtstamp(dev, info->config, extack);
+		return notifier_from_errno(err);
+	}
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 657de1cf258dbe2489906c81bd91e4af536de255 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sun, 2 Apr 2023 17:16:56 +0200
Subject: net: phy: smsc: add support for edpd tunable

This adds support for the EDPD PHY tunable.
By default, EDPD is disabled in interrupt mode; the tunable can be used
to override this, e.g. if the link partner doesn't use EDPD.
The interval to check for energy can be chosen between 1000ms and
2000ms. Note that this value consists of the 1000ms phylib interval
for state machine runs plus the time to wait for energy being detected.

v2:
- consider that phylib core holds phydev->lock when calling the
  phy tunable hooks

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/smsc.c  | 75 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/smscphy.h |  4 +++
 2 files changed, 79 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c
index 659a3ab10d82..0eba69ad5745 100644
--- a/drivers/net/phy/smsc.c
+++ b/drivers/net/phy/smsc.c
@@ -34,6 +34,8 @@
 #define SPECIAL_CTRL_STS_AMDIX_STATE_	0x2000
 
 #define EDPD_MAX_WAIT_DFLT_MS		640
+/* interval between phylib state machine runs in ms */
+#define PHY_STATE_MACH_MS		1000
 
 struct smsc_hw_stat {
 	const char *string;
@@ -295,6 +297,79 @@ static void smsc_get_stats(struct phy_device *phydev,
 		data[i] = smsc_get_stat(phydev, i);
 }
 
+static int smsc_phy_get_edpd(struct phy_device *phydev, u16 *edpd)
+{
+	struct smsc_phy_priv *priv = phydev->priv;
+
+	if (!priv)
+		return -EOPNOTSUPP;
+
+	if (!priv->edpd_enable)
+		*edpd = ETHTOOL_PHY_EDPD_DISABLE;
+	else if (!priv->edpd_max_wait_ms)
+		*edpd = ETHTOOL_PHY_EDPD_NO_TX;
+	else
+		*edpd = PHY_STATE_MACH_MS + priv->edpd_max_wait_ms;
+
+	return 0;
+}
+
+static int smsc_phy_set_edpd(struct phy_device *phydev, u16 edpd)
+{
+	struct smsc_phy_priv *priv = phydev->priv;
+
+	if (!priv)
+		return -EOPNOTSUPP;
+
+	switch (edpd) {
+	case ETHTOOL_PHY_EDPD_DISABLE:
+		priv->edpd_enable = false;
+		break;
+	case ETHTOOL_PHY_EDPD_NO_TX:
+		priv->edpd_enable = true;
+		priv->edpd_max_wait_ms = 0;
+		break;
+	case ETHTOOL_PHY_EDPD_DFLT_TX_MSECS:
+		edpd = PHY_STATE_MACH_MS + EDPD_MAX_WAIT_DFLT_MS;
+		fallthrough;
+	default:
+		if (phydev->irq != PHY_POLL)
+			return -EOPNOTSUPP;
+		if (edpd < PHY_STATE_MACH_MS || edpd > PHY_STATE_MACH_MS + 1000)
+			return -EINVAL;
+		priv->edpd_enable = true;
+		priv->edpd_max_wait_ms = edpd - PHY_STATE_MACH_MS;
+	}
+
+	priv->edpd_mode_set_by_user = true;
+
+	return smsc_phy_config_edpd(phydev);
+}
+
+int smsc_phy_get_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, void *data)
+{
+	switch (tuna->id) {
+	case ETHTOOL_PHY_EDPD:
+		return smsc_phy_get_edpd(phydev, data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(smsc_phy_get_tunable);
+
+int smsc_phy_set_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, const void *data)
+{
+	switch (tuna->id) {
+	case ETHTOOL_PHY_EDPD:
+		return smsc_phy_set_edpd(phydev, *(u16 *)data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+EXPORT_SYMBOL_GPL(smsc_phy_set_tunable);
+
 int smsc_phy_probe(struct phy_device *phydev)
 {
 	struct device *dev = &phydev->mdio.dev;
diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h
index 80f37c1dba58..e1c88627755a 100644
--- a/include/linux/smscphy.h
+++ b/include/linux/smscphy.h
@@ -32,6 +32,10 @@ int smsc_phy_config_intr(struct phy_device *phydev);
 irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev);
 int smsc_phy_config_init(struct phy_device *phydev);
 int lan87xx_read_status(struct phy_device *phydev);
+int smsc_phy_get_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, void *data);
+int smsc_phy_set_tunable(struct phy_device *phydev,
+			 struct ethtool_tunable *tuna, const void *data);
 int smsc_phy_probe(struct phy_device *phydev);
 
 #endif /* __LINUX_SMSCPHY_H__ */
-- 
cgit v1.2.3


From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:23 -0700
Subject: bpf: Remove unused arguments from btf_struct_access().

Remove unused arguments from btf_struct_access() callback.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h              |  3 +--
 include/linux/filter.h           |  3 +--
 kernel/bpf/verifier.c            |  4 ++--
 net/bpf/bpf_dummy_struct_ops.c   | 12 +++++-------
 net/core/filter.c                | 13 +++++--------
 net/ipv4/bpf_tcp_ca.c            |  3 +--
 net/netfilter/nf_conntrack_bpf.c |  3 +--
 7 files changed, 16 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2d8f3f639e68..4f689dda748f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -893,8 +893,7 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 	int (*btf_struct_access)(struct bpf_verifier_log *log,
 				 const struct bpf_reg_state *reg,
-				 int off, int size, enum bpf_access_type atype,
-				 u32 *next_btf_id, enum bpf_type_flag *flag);
+				 int off, int size);
 };
 
 struct bpf_prog_offload_ops {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 23c08c31bea9..5364b0c52c1d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 extern struct mutex nf_conn_btf_access_lock;
 extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 				     const struct bpf_reg_state *reg,
-				     int off, int size, enum bpf_access_type atype,
-				     u32 *next_btf_id, enum bpf_type_flag *flag);
+				     int off, int size);
 
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83984568ccb4..5ca520e5eddf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5459,7 +5459,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
 	enum bpf_type_flag flag = 0;
-	u32 btf_id;
+	u32 btf_id = 0;
 	int ret;
 
 	if (!env->allow_ptr_leaks) {
@@ -5509,7 +5509,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
 			return -EFAULT;
 		}
-		ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+		ret = env->ops->btf_struct_access(&env->log, reg, off, size);
 	} else {
 		/* Writes are permitted with default btf_struct_access for
 		 * program allocated objects (which always have ref_obj_id > 0),
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 9535c8506cda..5918d1b32e19 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -173,14 +173,11 @@ static int bpf_dummy_ops_check_member(const struct btf_type *t,
 
 static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
-					   int off, int size, enum bpf_access_type atype,
-					   u32 *next_btf_id,
-					   enum bpf_type_flag *flag)
+					   int off, int size)
 {
 	const struct btf_type *state;
 	const struct btf_type *t;
 	s32 type_id;
-	int err;
 
 	type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
 					BTF_KIND_STRUCT);
@@ -194,9 +191,10 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
 		return -EACCES;
 	}
 
-	err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
-	if (err < 0)
-		return err;
+	if (off + size > sizeof(struct bpf_dummy_ops_state)) {
+		bpf_log(log, "write access at off %d with size %d\n", off, size);
+		return -EACCES;
+	}
 
 	return NOT_INIT;
 }
diff --git a/net/core/filter.c b/net/core/filter.c
index 8b9f409a2ec3..1f2abf0f60e6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8742,20 +8742,18 @@ EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
 
 int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 			      const struct bpf_reg_state *reg,
-			      int off, int size, enum bpf_access_type atype,
-			      u32 *next_btf_id, enum bpf_type_flag *flag);
+			      int off, int size);
 EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
 
 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 					const struct bpf_reg_state *reg,
-					int off, int size, enum bpf_access_type atype,
-					u32 *next_btf_id, enum bpf_type_flag *flag)
+					int off, int size)
 {
 	int ret = -EACCES;
 
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
-		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+		ret = nfct_btf_struct_access(log, reg, off, size);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
@@ -8822,14 +8820,13 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
 static int xdp_btf_struct_access(struct bpf_verifier_log *log,
 				 const struct bpf_reg_state *reg,
-				 int off, int size, enum bpf_access_type atype,
-				 u32 *next_btf_id, enum bpf_type_flag *flag)
+				 int off, int size)
 {
 	int ret = -EACCES;
 
 	mutex_lock(&nf_conn_btf_access_lock);
 	if (nfct_btf_struct_access)
-		ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag);
+		ret = nfct_btf_struct_access(log, reg, off, size);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index d6465876bbf6..4406d796cc2f 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -72,8 +72,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size,
 
 static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 					const struct bpf_reg_state *reg,
-					int off, int size, enum bpf_access_type atype,
-					u32 *next_btf_id, enum bpf_type_flag *flag)
+					int off, int size)
 {
 	const struct btf_type *t;
 	size_t end;
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 002e9d24a1e9..3f821b7ba646 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -192,8 +192,7 @@ BTF_ID(struct, nf_conn___init)
 /* Check writes into `struct nf_conn` */
 static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
 					   const struct bpf_reg_state *reg,
-					   int off, int size, enum bpf_access_type atype,
-					   u32 *next_btf_id, enum bpf_type_flag *flag)
+					   int off, int size)
 {
 	const struct btf_type *ncit, *nct, *t;
 	size_t end;
-- 
cgit v1.2.3


From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 3 Apr 2023 21:50:24 -0700
Subject: bpf: Refactor btf_nested_type_is_trusted().

btf_nested_type_is_trusted() tries to find a struct member at the corresponding offset.
It works for flat structures and falls apart in more complex structs with nested structs.
The offset->member search is already performed by btf_struct_walk() including nested structs.
Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted()
instead of offset to make BTF_TYPE_SAFE*() logic more robust.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com
---
 include/linux/bpf.h   |  7 ++++---
 kernel/bpf/btf.c      | 44 +++++++++++++++++---------------------------
 kernel/bpf/verifier.c | 23 ++++++++++++-----------
 3 files changed, 33 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4f689dda748f..002a811b6b90 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size,
 int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct bpf_reg_state *reg,
 		      int off, int size, enum bpf_access_type atype,
-		      u32 *next_btf_id, enum bpf_type_flag *flag);
+		      u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name);
 bool btf_struct_ids_match(struct bpf_verifier_log *log,
 			  const struct btf *btf, u32 id, int off,
 			  const struct btf *need_btf, u32 need_type_id,
@@ -2302,7 +2302,7 @@ struct bpf_core_ctx {
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off, const char *suffix);
+				const char *field_name, u32 btf_id, const char *suffix);
 
 bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 			       const struct btf *reg_btf, u32 reg_id,
@@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id)
 static inline int btf_struct_access(struct bpf_verifier_log *log,
 				    const struct bpf_reg_state *reg,
 				    int off, int size, enum bpf_access_type atype,
-				    u32 *next_btf_id, enum bpf_type_flag *flag)
+				    u32 *next_btf_id, enum bpf_type_flag *flag,
+				    const char **field_name)
 {
 	return -EACCES;
 }
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b7e5a5510b91..593c45a294d0 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6166,7 +6166,8 @@ enum bpf_struct_walk_result {
 
 static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
 			   const struct btf_type *t, int off, int size,
-			   u32 *next_btf_id, enum bpf_type_flag *flag)
+			   u32 *next_btf_id, enum bpf_type_flag *flag,
+			   const char **field_name)
 {
 	u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
 	const struct btf_type *mtype, *elem_type = NULL;
@@ -6395,6 +6396,8 @@ error:
 			if (btf_type_is_struct(stype)) {
 				*next_btf_id = id;
 				*flag |= tmp_flag;
+				if (field_name)
+					*field_name = mname;
 				return WALK_PTR;
 			}
 		}
@@ -6421,7 +6424,8 @@ error:
 int btf_struct_access(struct bpf_verifier_log *log,
 		      const struct bpf_reg_state *reg,
 		      int off, int size, enum bpf_access_type atype __maybe_unused,
-		      u32 *next_btf_id, enum bpf_type_flag *flag)
+		      u32 *next_btf_id, enum bpf_type_flag *flag,
+		      const char **field_name)
 {
 	const struct btf *btf = reg->btf;
 	enum bpf_type_flag tmp_flag = 0;
@@ -6453,7 +6457,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
 
 	t = btf_type_by_id(btf, id);
 	do {
-		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
+		err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
 
 		switch (err) {
 		case WALK_PTR:
@@ -6528,7 +6532,7 @@ again:
 	type = btf_type_by_id(btf, id);
 	if (!type)
 		return false;
-	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
+	err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
 	if (err != WALK_STRUCT)
 		return false;
 
@@ -8488,16 +8492,15 @@ out:
 
 bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 				const struct bpf_reg_state *reg,
-				int off, const char *suffix)
+				const char *field_name, u32 btf_id, const char *suffix)
 {
 	struct btf *btf = reg->btf;
 	const struct btf_type *walk_type, *safe_type;
 	const char *tname;
 	char safe_tname[64];
 	long ret, safe_id;
-	const struct btf_member *member, *m_walk = NULL;
+	const struct btf_member *member;
 	u32 i;
-	const char *walk_name;
 
 	walk_type = btf_type_by_id(btf, reg->btf_id);
 	if (!walk_type)
@@ -8517,30 +8520,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
 	if (!safe_type)
 		return false;
 
-	for_each_member(i, walk_type, member) {
-		u32 moff;
-
-		/* We're looking for the PTR_TO_BTF_ID member in the struct
-		 * type we're walking which matches the specified offset.
-		 * Below, we'll iterate over the fields in the safe variant of
-		 * the struct and see if any of them has a matching type /
-		 * name.
-		 */
-		moff = __btf_member_bit_offset(walk_type, member) / 8;
-		if (off == moff) {
-			m_walk = member;
-			break;
-		}
-	}
-	if (m_walk == NULL)
-		return false;
-
-	walk_name = __btf_name_by_offset(btf, m_walk->name_off);
 	for_each_member(i, safe_type, member) {
 		const char *m_name = __btf_name_by_offset(btf, member->name_off);
+		const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+		u32 id;
+
+		if (!btf_type_is_ptr(mtype))
+			continue;
 
+		btf_type_skip_modifiers(btf, mtype->type, &id);
 		/* If we match on both type and name, the field is considered trusted. */
-		if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+		if (btf_id == id && !strcmp(field_name, m_name))
 			return true;
 	}
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5ca520e5eddf..2cd2e0b725cd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5400,12 +5400,12 @@ BTF_TYPE_SAFE_RCU(struct css_set) {
 
 /* full trusted: these fields are trusted even outside of RCU CS and never NULL */
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
-	__bpf_md_ptr(struct seq_file *, seq);
+	struct seq_file *seq;
 };
 
 BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
-	__bpf_md_ptr(struct bpf_iter_meta *, meta);
-	__bpf_md_ptr(struct task_struct *, task);
+	struct bpf_iter_meta *meta;
+	struct task_struct *task;
 };
 
 BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
@@ -5427,17 +5427,17 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) {
 
 static bool type_is_rcu(struct bpf_verifier_env *env,
 			struct bpf_reg_state *reg,
-			int off)
+			const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
 
-	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu");
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
 }
 
 static bool type_is_trusted(struct bpf_verifier_env *env,
 			    struct bpf_reg_state *reg,
-			    int off)
+			    const char *field_name, u32 btf_id)
 {
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
@@ -5446,7 +5446,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
 
-	return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted");
+	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
 }
 
 static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5458,6 +5458,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 	struct bpf_reg_state *reg = regs + regno;
 	const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
 	const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+	const char *field_name = NULL;
 	enum bpf_type_flag flag = 0;
 	u32 btf_id = 0;
 	int ret;
@@ -5526,7 +5527,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 			return -EFAULT;
 		}
 
-		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+		ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
 	}
 
 	if (ret < 0)
@@ -5554,10 +5555,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
 		 * A regular RCU-protected pointer with __rcu tag can also be deemed
 		 * trusted if we are in an RCU CS. Such pointer can be NULL.
 		 */
-		if (type_is_trusted(env, reg, off)) {
+		if (type_is_trusted(env, reg, field_name, btf_id)) {
 			flag |= PTR_TRUSTED;
 		} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
-			if (type_is_rcu(env, reg, off)) {
+			if (type_is_rcu(env, reg, field_name, btf_id)) {
 				/* ignore __rcu tag and mark it MEM_RCU */
 				flag |= MEM_RCU;
 			} else if (flag & MEM_RCU) {
@@ -5640,7 +5641,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
 	/* Simulate access to a PTR_TO_BTF_ID */
 	memset(&map_reg, 0, sizeof(map_reg));
 	mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
-	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
+	ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From 054fbf7ff8143d35ca7d3bb5414bb44ee1574194 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 3 Apr 2023 17:43:16 +0200
Subject: net: qrtr: correct types of trace event parameters

The arguments passed to the trace events are of type unsigned int,
however the signature of the events used __le32 parameters.

I may be missing the point here, but sparse flagged this and it
does seem incorrect to me.

  net/qrtr/ns.c: note: in included file (through include/trace/trace_events.h, include/trace/define_trace.h, include/trace/events/qrtr.h):
  ./include/trace/events/qrtr.h:11:1: warning: cast to restricted __le32
  ./include/trace/events/qrtr.h:11:1: warning: restricted __le32 degrades to integer
  ./include/trace/events/qrtr.h:11:1: warning: restricted __le32 degrades to integer
  ... (a lot more similar warnings)
  net/qrtr/ns.c:115:47:    expected restricted __le32 [usertype] service
  net/qrtr/ns.c:115:47:    got unsigned int service
  net/qrtr/ns.c:115:61: warning: incorrect type in argument 2 (different base types)
  ... (a lot more similar warnings)

Fixes: dfddb54043f0 ("net: qrtr: Add tracepoint support")
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230402-qrtr-trace-types-v1-1-92ad55008dd3@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/qrtr.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h
index b1de14c3bb93..441132c67133 100644
--- a/include/trace/events/qrtr.h
+++ b/include/trace/events/qrtr.h
@@ -10,15 +10,16 @@
 
 TRACE_EVENT(qrtr_ns_service_announce_new,
 
-	TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port),
+	TP_PROTO(unsigned int service, unsigned int instance,
+		 unsigned int node, unsigned int port),
 
 	TP_ARGS(service, instance, node, port),
 
 	TP_STRUCT__entry(
-		__field(__le32, service)
-		__field(__le32, instance)
-		__field(__le32, node)
-		__field(__le32, port)
+		__field(unsigned int, service)
+		__field(unsigned int, instance)
+		__field(unsigned int, node)
+		__field(unsigned int, port)
 	),
 
 	TP_fast_assign(
@@ -36,15 +37,16 @@ TRACE_EVENT(qrtr_ns_service_announce_new,
 
 TRACE_EVENT(qrtr_ns_service_announce_del,
 
-	TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port),
+	TP_PROTO(unsigned int service, unsigned int instance,
+		 unsigned int node, unsigned int port),
 
 	TP_ARGS(service, instance, node, port),
 
 	TP_STRUCT__entry(
-		__field(__le32, service)
-		__field(__le32, instance)
-		__field(__le32, node)
-		__field(__le32, port)
+		__field(unsigned int, service)
+		__field(unsigned int, instance)
+		__field(unsigned int, node)
+		__field(unsigned int, port)
 	),
 
 	TP_fast_assign(
@@ -62,15 +64,16 @@ TRACE_EVENT(qrtr_ns_service_announce_del,
 
 TRACE_EVENT(qrtr_ns_server_add,
 
-	TP_PROTO(__le32 service, __le32 instance, __le32 node, __le32 port),
+	TP_PROTO(unsigned int service, unsigned int instance,
+		 unsigned int node, unsigned int port),
 
 	TP_ARGS(service, instance, node, port),
 
 	TP_STRUCT__entry(
-		__field(__le32, service)
-		__field(__le32, instance)
-		__field(__le32, node)
-		__field(__le32, port)
+		__field(unsigned int, service)
+		__field(unsigned int, instance)
+		__field(unsigned int, node)
+		__field(unsigned int, port)
 	),
 
 	TP_fast_assign(
-- 
cgit v1.2.3


From 10739ea3132861b1344264b110ea48d951a0e3b0 Mon Sep 17 00:00:00 2001
From: Shenwei Wang <shenwei.wang@nxp.com>
Date: Mon, 3 Apr 2023 17:23:01 -0500
Subject: net: stmmac: add support for platform specific reset

This patch adds support for platform-specific reset logic in the
stmmac driver. Some SoCs require a different reset mechanism than
the standard dwmac IP reset. To support these platforms, a new function
pointer 'fix_soc_reset' is added to the plat_stmmacenet_data structure.
The stmmac_reset in hwif.h is modified to call the 'fix_soc_reset'
function if it exists. This enables the driver to use the platform-specific
reset logic when necessary.

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/20230403222302.328262-1-shenwei.wang@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/hwif.c | 13 +++++++++++++
 drivers/net/ethernet/stmicro/stmmac/hwif.h |  3 +--
 include/linux/stmmac.h                     |  1 +
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c
index bb7114f970f8..b8ba8f2d8041 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.c
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c
@@ -87,6 +87,19 @@ static int stmmac_dwxlgmac_quirks(struct stmmac_priv *priv)
 	return 0;
 }
 
+int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr)
+{
+	struct plat_stmmacenet_data *plat = priv ? priv->plat : NULL;
+
+	if (!priv)
+		return -EINVAL;
+
+	if (plat && plat->fix_soc_reset)
+		return plat->fix_soc_reset(plat, ioaddr);
+
+	return stmmac_do_callback(priv, dma, reset, ioaddr);
+}
+
 static const struct stmmac_hwif_entry {
 	bool gmac;
 	bool gmac4;
diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h
index 16a7421715cb..1cc286b000b6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@ -214,8 +214,6 @@ struct stmmac_dma_ops {
 	int (*enable_tbs)(void __iomem *ioaddr, bool en, u32 chan);
 };
 
-#define stmmac_reset(__priv, __args...) \
-	stmmac_do_callback(__priv, dma, reset, __args)
 #define stmmac_dma_init(__priv, __args...) \
 	stmmac_do_void_callback(__priv, dma, init, __args)
 #define stmmac_init_chan(__priv, __args...) \
@@ -640,6 +638,7 @@ extern const struct stmmac_mmc_ops dwxgmac_mmc_ops;
 #define GMAC_VERSION		0x00000020	/* GMAC CORE Version */
 #define GMAC4_VERSION		0x00000110	/* GMAC4+ CORE Version */
 
+int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr);
 int stmmac_hwif_init(struct stmmac_priv *priv);
 
 #endif /* __STMMAC_HWIF_H__ */
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index a2414c187483..dafa001e9e7a 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -223,6 +223,7 @@ struct plat_stmmacenet_data {
 	struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES];
 	struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES];
 	void (*fix_mac_speed)(void *priv, unsigned int speed);
+	int (*fix_soc_reset)(void *priv, void __iomem *ioaddr);
 	int (*serdes_powerup)(struct net_device *ndev, void *priv);
 	void (*serdes_powerdown)(struct net_device *ndev, void *priv);
 	void (*speed_mode_2500)(struct net_device *ndev, void *priv);
-- 
cgit v1.2.3


From 34bf93472f8fb60b4189aa2872471017e739cf0a Mon Sep 17 00:00:00 2001
From: Viktor Malik <vmalik@redhat.com>
Date: Thu, 30 Mar 2023 12:20:01 +0200
Subject: kallsyms: move module-related functions under correct configs

Functions for searching module kallsyms should have non-empty
definitions only if CONFIG_MODULES=y and CONFIG_KALLSYMS=y. Until now,
only CONFIG_MODULES check was used for many of these, which may have
caused compilation errors on some configs.

This patch moves all relevant functions under the correct configs.

Fixes: bd5314f8dd2d ("kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header")
Signed-off-by: Viktor Malik <vmalik@redhat.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202303181535.RFDCnz3E-lkp@intel.com/
Link: https://lore.kernel.org/r/20230330102001.2183693-1-vmalik@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/module.h | 135 +++++++++++++++++++++++++++----------------------
 1 file changed, 74 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 41cfd3be57e5..886d24877c7c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -608,16 +608,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
 /* Search for module by name: must be in a RCU-sched critical section. */
 struct module *find_module(const char *name);
 
-/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
-   symnum out of range. */
-int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
-			char *name, char *module_name, int *exported);
-
-/* Look for this name: can be of form module:name. */
-unsigned long module_kallsyms_lookup_name(const char *name);
-
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
-
 extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
 			long code);
 #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
@@ -664,17 +654,6 @@ static inline void __module_get(struct module *module)
 /* Dereference module function descriptor */
 void *dereference_module_function_descriptor(struct module *mod, void *ptr);
 
-/* For kallsyms to ask for address resolution.  namebuf should be at
- * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
- * found, otherwise NULL. */
-const char *module_address_lookup(unsigned long addr,
-			    unsigned long *symbolsize,
-			    unsigned long *offset,
-			    char **modname, const unsigned char **modbuildid,
-			    char *namebuf);
-int lookup_module_symbol_name(unsigned long addr, char *symname);
-int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name);
-
 int register_module_notifier(struct notifier_block *nb);
 int unregister_module_notifier(struct notifier_block *nb);
 
@@ -765,45 +744,6 @@ static inline void module_put(struct module *module)
 
 #define module_name(mod) "kernel"
 
-/* For kallsyms to ask for address resolution.  NULL means not found. */
-static inline const char *module_address_lookup(unsigned long addr,
-					  unsigned long *symbolsize,
-					  unsigned long *offset,
-					  char **modname,
-					  const unsigned char **modbuildid,
-					  char *namebuf)
-{
-	return NULL;
-}
-
-static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
-{
-	return -ERANGE;
-}
-
-static inline int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name)
-{
-	return -ERANGE;
-}
-
-static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
-					char *type, char *name,
-					char *module_name, int *exported)
-{
-	return -ERANGE;
-}
-
-static inline unsigned long module_kallsyms_lookup_name(const char *name)
-{
-	return 0;
-}
-
-static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
-						       const char *name)
-{
-	return 0;
-}
-
 static inline int register_module_notifier(struct notifier_block *nb)
 {
 	/* no events will happen anyway, so this can always succeed */
@@ -899,7 +839,36 @@ int module_kallsyms_on_each_symbol(const char *modname,
 				   int (*fn)(void *, const char *,
 					     struct module *, unsigned long),
 				   void *data);
-#else
+
+/* For kallsyms to ask for address resolution.  namebuf should be at
+ * least KSYM_NAME_LEN long: a pointer to namebuf is returned if
+ * found, otherwise NULL.
+ */
+const char *module_address_lookup(unsigned long addr,
+				  unsigned long *symbolsize,
+				  unsigned long *offset,
+				  char **modname, const unsigned char **modbuildid,
+				  char *namebuf);
+int lookup_module_symbol_name(unsigned long addr, char *symname);
+int lookup_module_symbol_attrs(unsigned long addr,
+			       unsigned long *size,
+			       unsigned long *offset,
+			       char *modname,
+			       char *name);
+
+/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
+ * symnum out of range.
+ */
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *name, char *module_name, int *exported);
+
+/* Look for this name: can be of form module:name. */
+unsigned long module_kallsyms_lookup_name(const char *name);
+
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
+
+#else	/* CONFIG_MODULES && CONFIG_KALLSYMS */
+
 static inline int module_kallsyms_on_each_symbol(const char *modname,
 						 int (*fn)(void *, const char *,
 						 struct module *, unsigned long),
@@ -907,6 +876,50 @@ static inline int module_kallsyms_on_each_symbol(const char *modname,
 {
 	return -EOPNOTSUPP;
 }
+
+/* For kallsyms to ask for address resolution.  NULL means not found. */
+static inline const char *module_address_lookup(unsigned long addr,
+						unsigned long *symbolsize,
+						unsigned long *offset,
+						char **modname,
+						const unsigned char **modbuildid,
+						char *namebuf)
+{
+	return NULL;
+}
+
+static inline int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+	return -ERANGE;
+}
+
+static inline int lookup_module_symbol_attrs(unsigned long addr,
+					     unsigned long *size,
+					     unsigned long *offset,
+					     char *modname,
+					     char *name)
+{
+	return -ERANGE;
+}
+
+static inline int module_get_kallsym(unsigned int symnum, unsigned long *value,
+				     char *type, char *name,
+				     char *module_name, int *exported)
+{
+	return -ERANGE;
+}
+
+static inline unsigned long module_kallsyms_lookup_name(const char *name)
+{
+	return 0;
+}
+
+static inline unsigned long find_kallsyms_symbol_value(struct module *mod,
+						       const char *name)
+{
+	return 0;
+}
+
 #endif  /* CONFIG_MODULES && CONFIG_KALLSYMS */
 
 #endif /* _LINUX_MODULE_H */
-- 
cgit v1.2.3


From d769ccaf957fe7391f357c0a923de71f594b8a2b Mon Sep 17 00:00:00 2001
From: Kal Conley <kal.conley@dectris.com>
Date: Thu, 6 Apr 2023 01:59:18 +0200
Subject: xsk: Fix unaligned descriptor validation

Make sure unaligned descriptors that straddle the end of the UMEM are
considered invalid. Currently, descriptor validation is broken for
zero-copy mode which only checks descriptors at page granularity.
For example, descriptors in zero-copy mode that overrun the end of the
UMEM but not a page boundary are (incorrectly) considered valid. The
UMEM boundary check needs to happen before the page boundary and
contiguity checks in xp_desc_crosses_non_contig_pg(). Do this check in
xp_unaligned_validate_desc() instead like xp_check_unaligned() already
does.

Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API")
Signed-off-by: Kal Conley <kal.conley@dectris.com>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/r/20230405235920.7305-2-kal.conley@dectris.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/xsk_buff_pool.h | 9 ++-------
 net/xdp/xsk_queue.h         | 1 +
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 3e952e569418..d318c769b445 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -180,13 +180,8 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
 	if (likely(!cross_pg))
 		return false;
 
-	if (pool->dma_pages_cnt) {
-		return !(pool->dma_pages[addr >> PAGE_SHIFT] &
-			 XSK_NEXT_PG_CONTIG_MASK);
-	}
-
-	/* skb path */
-	return addr + len > pool->addrs_cnt;
+	return pool->dma_pages_cnt &&
+	       !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK);
 }
 
 static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index bfb2a7e50c26..66c6f57c9c44 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -162,6 +162,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
 		return false;
 
 	if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
+	    addr + desc->len > pool->addrs_cnt ||
 	    xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
 		return false;
 
-- 
cgit v1.2.3


From 05f3ab7780b3c0cfe26a8134606bdf641c4f4bb2 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 5 Apr 2023 17:10:25 +0200
Subject: net: ethernet: mtk_eth_soc: add code for offloading flows from wlan
 devices

WED version 2 (on MT7986 and later) can offload flows originating from
wireless devices.
In order to make that work, ndo_setup_tc needs to be implemented on the
netdevs. This adds the required code to offload flows coming in from WED,
while keeping track of the incoming wed index used for selecting the
correct PPE device.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.h     |   3 +
 drivers/net/ethernet/mediatek/mtk_ppe_offload.c |  40 ++++++----
 drivers/net/ethernet/mediatek/mtk_wed.c         | 101 ++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk_wed.h            |   6 ++
 4 files changed, 136 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
index 23c7abeb5c14..cdcf8534283e 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -1276,6 +1276,9 @@ int mtk_gmac_rgmii_path_setup(struct mtk_eth *eth, int mac_id);
 int mtk_eth_offload_init(struct mtk_eth *eth);
 int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
 		     void *type_data);
+int mtk_flow_offload_cmd(struct mtk_eth *eth, struct flow_cls_offload *cls,
+			 int ppe_index);
+void mtk_flow_offload_cleanup(struct mtk_eth *eth, struct list_head *list);
 void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev);
 
 
diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
index 46634dc29d2f..02eebff02d45 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -235,7 +235,8 @@ out:
 }
 
 static int
-mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f)
+mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f,
+			 int ppe_index)
 {
 	struct flow_rule *rule = flow_cls_offload_flow_rule(f);
 	struct flow_action_entry *act;
@@ -452,6 +453,7 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f)
 	entry->cookie = f->cookie;
 	memcpy(&entry->data, &foe, sizeof(entry->data));
 	entry->wed_index = wed_index;
+	entry->ppe_index = ppe_index;
 
 	err = mtk_foe_entry_commit(eth->ppe[entry->ppe_index], entry);
 	if (err < 0)
@@ -520,25 +522,15 @@ mtk_flow_offload_stats(struct mtk_eth *eth, struct flow_cls_offload *f)
 
 static DEFINE_MUTEX(mtk_flow_offload_mutex);
 
-static int
-mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+int mtk_flow_offload_cmd(struct mtk_eth *eth, struct flow_cls_offload *cls,
+			 int ppe_index)
 {
-	struct flow_cls_offload *cls = type_data;
-	struct net_device *dev = cb_priv;
-	struct mtk_mac *mac = netdev_priv(dev);
-	struct mtk_eth *eth = mac->hw;
 	int err;
 
-	if (!tc_can_offload(dev))
-		return -EOPNOTSUPP;
-
-	if (type != TC_SETUP_CLSFLOWER)
-		return -EOPNOTSUPP;
-
 	mutex_lock(&mtk_flow_offload_mutex);
 	switch (cls->command) {
 	case FLOW_CLS_REPLACE:
-		err = mtk_flow_offload_replace(eth, cls);
+		err = mtk_flow_offload_replace(eth, cls, ppe_index);
 		break;
 	case FLOW_CLS_DESTROY:
 		err = mtk_flow_offload_destroy(eth, cls);
@@ -555,6 +547,26 @@ mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri
 	return err;
 }
 
+static int
+mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{
+	struct flow_cls_offload *cls = type_data;
+	struct net_device *dev = cb_priv;
+	struct mtk_mac *mac;
+	struct mtk_eth *eth;
+
+	mac = netdev_priv(dev);
+	eth = mac->hw;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	if (type != TC_SETUP_CLSFLOWER)
+		return -EOPNOTSUPP;
+
+	return mtk_flow_offload_cmd(eth, cls, 0);
+}
+
 static int
 mtk_eth_setup_tc_block(struct net_device *dev, struct flow_block_offload *f)
 {
diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 95d890870984..4c205afbd230 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -13,6 +13,8 @@
 #include <linux/mfd/syscon.h>
 #include <linux/debugfs.h>
 #include <linux/soc/mediatek/mtk_wed.h>
+#include <net/flow_offload.h>
+#include <net/pkt_cls.h>
 #include "mtk_eth_soc.h"
 #include "mtk_wed_regs.h"
 #include "mtk_wed.h"
@@ -41,6 +43,11 @@
 static struct mtk_wed_hw *hw_list[2];
 static DEFINE_MUTEX(hw_lock);
 
+struct mtk_wed_flow_block_priv {
+	struct mtk_wed_hw *hw;
+	struct net_device *dev;
+};
+
 static void
 wed_m32(struct mtk_wed_device *dev, u32 reg, u32 mask, u32 val)
 {
@@ -1745,6 +1752,99 @@ out:
 	mutex_unlock(&hw_lock);
 }
 
+static int
+mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{
+	struct mtk_wed_flow_block_priv *priv = cb_priv;
+	struct flow_cls_offload *cls = type_data;
+	struct mtk_wed_hw *hw = priv->hw;
+
+	if (!tc_can_offload(priv->dev))
+		return -EOPNOTSUPP;
+
+	if (type != TC_SETUP_CLSFLOWER)
+		return -EOPNOTSUPP;
+
+	return mtk_flow_offload_cmd(hw->eth, cls, hw->index);
+}
+
+static int
+mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev,
+		       struct flow_block_offload *f)
+{
+	struct mtk_wed_flow_block_priv *priv;
+	static LIST_HEAD(block_cb_list);
+	struct flow_block_cb *block_cb;
+	struct mtk_eth *eth = hw->eth;
+	flow_setup_cb_t *cb;
+
+	if (!eth->soc->offload_version)
+		return -EOPNOTSUPP;
+
+	if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+		return -EOPNOTSUPP;
+
+	cb = mtk_wed_setup_tc_block_cb;
+	f->driver_block_list = &block_cb_list;
+
+	switch (f->command) {
+	case FLOW_BLOCK_BIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (block_cb) {
+			flow_block_cb_incref(block_cb);
+			return 0;
+		}
+
+		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+		if (!priv)
+			return -ENOMEM;
+
+		priv->hw = hw;
+		priv->dev = dev;
+		block_cb = flow_block_cb_alloc(cb, dev, priv, NULL);
+		if (IS_ERR(block_cb)) {
+			kfree(priv);
+			return PTR_ERR(block_cb);
+		}
+
+		flow_block_cb_incref(block_cb);
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &block_cb_list);
+		return 0;
+	case FLOW_BLOCK_UNBIND:
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		if (!flow_block_cb_decref(block_cb)) {
+			flow_block_cb_remove(block_cb, f);
+			list_del(&block_cb->driver_list);
+			kfree(block_cb->cb_priv);
+		}
+		return 0;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int
+mtk_wed_setup_tc(struct mtk_wed_device *wed, struct net_device *dev,
+		 enum tc_setup_type type, void *type_data)
+{
+	struct mtk_wed_hw *hw = wed->hw;
+
+	if (hw->version < 2)
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case TC_SETUP_BLOCK:
+	case TC_SETUP_FT:
+		return mtk_wed_setup_tc_block(hw, dev, type_data);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		    void __iomem *wdma, phys_addr_t wdma_phy,
 		    int index)
@@ -1764,6 +1864,7 @@ void mtk_wed_add_hw(struct device_node *np, struct mtk_eth *eth,
 		.irq_set_mask = mtk_wed_irq_set_mask,
 		.detach = mtk_wed_detach,
 		.ppe_check = mtk_wed_ppe_check,
+		.setup_tc = mtk_wed_setup_tc,
 	};
 	struct device_node *eth_np = eth->dev->of_node;
 	struct platform_device *pdev;
diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h
index fd0b0605cf90..b2b28180dff7 100644
--- a/include/linux/soc/mediatek/mtk_wed.h
+++ b/include/linux/soc/mediatek/mtk_wed.h
@@ -6,6 +6,7 @@
 #include <linux/regmap.h>
 #include <linux/pci.h>
 #include <linux/skbuff.h>
+#include <linux/netdevice.h>
 
 #define MTK_WED_TX_QUEUES		2
 #define MTK_WED_RX_QUEUES		2
@@ -179,6 +180,8 @@ struct mtk_wed_ops {
 
 	u32 (*irq_get)(struct mtk_wed_device *dev, u32 mask);
 	void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask);
+	int (*setup_tc)(struct mtk_wed_device *wed, struct net_device *dev,
+			enum tc_setup_type type, void *type_data);
 };
 
 extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops;
@@ -237,6 +240,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev)
 	(_dev)->ops->msg_update(_dev, _id, _msg, _len)
 #define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev)
 #define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev)
+#define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) \
+	(_dev)->ops->setup_tc(_dev, _netdev, _type, _type_data)
 #else
 static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 {
@@ -255,6 +260,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev)
 #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV
 #define mtk_wed_device_stop(_dev) do {} while (0)
 #define mtk_wed_device_dma_reset(_dev) do {} while (0)
+#define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) -EOPNOTSUPP
 #endif
 
 #endif
-- 
cgit v1.2.3


From 5a17818682cf43ad0fdd6035945f3b7a8c9dc5e9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 6 Apr 2023 14:42:46 +0300
Subject: net: dsa: replace NETDEV_PRE_CHANGE_HWTSTAMP notifier with a stub

There was a sort of rush surrounding commit 88c0a6b503b7 ("net: create a
netdev notifier for DSA to reject PTP on DSA master"), due to a desire
to convert DSA's attempt to deny TX timestamping on a DSA master to
something that doesn't block the kernel-wide API conversion from
ndo_eth_ioctl() to ndo_hwtstamp_set().

What was required was a mechanism that did not depend on ndo_eth_ioctl(),
and what was provided was a mechanism that did not depend on
ndo_eth_ioctl(), while at the same time introducing something that
wasn't absolutely necessary - a new netdev notifier.

There have been objections from Jakub Kicinski that using notifiers in
general when they are not absolutely necessary creates complications to
the control flow and difficulties to maintainers who look at the code.
So there is a desire to not use notifiers.

In addition to that, the notifier chain gets called even if there is no
DSA in the system and no one is interested in applying any restriction.

Take the model of udp_tunnel_nic_ops and introduce a stub mechanism,
through which net/core/dev_ioctl.c can call into DSA even when
CONFIG_NET_DSA=m.

Compared to the code that existed prior to the notifier conversion, aka
what was added in commits:
- 4cfab3566710 ("net: dsa: Add wrappers for overloaded ndo_ops")
- 3369afba1e46 ("net: Call into DSA netdevice_ops wrappers")

this is different because we are not overloading any struct
net_device_ops of the DSA master anymore, but rather, we are exposing a
rather specific functionality which is orthogonal to which API is used
to enable it - ndo_eth_ioctl() or ndo_hwtstamp_set().

Also, what is similar is that both approaches use function pointers to
get from built-in code to DSA.

There is no point in replicating the function pointers towards
__dsa_master_hwtstamp_validate() once for every CPU port (dev->dsa_ptr).
Instead, it is sufficient to introduce a singleton struct dsa_stubs,
built into the kernel, which contains a single function pointer to
__dsa_master_hwtstamp_validate().

I find this approach preferable to what we had originally, because
dev->dsa_ptr->netdev_ops->ndo_do_ioctl() used to require going through
struct dsa_port (dev->dsa_ptr), and so, this was incompatible with any
attempts to add any data encapsulation and hide DSA data structures from
the outside world.

Link: https://lore.kernel.org/netdev/20230403083019.120b72fd@kernel.org/
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 ------
 include/net/dsa_stubs.h   | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 net/Makefile              |  2 +-
 net/core/dev.c            |  2 +-
 net/core/dev_ioctl.c      | 12 ++----------
 net/dsa/Makefile          |  6 ++++++
 net/dsa/dsa.c             | 19 +++++++++++++++++++
 net/dsa/master.c          |  2 +-
 net/dsa/master.h          |  2 +-
 net/dsa/slave.c           | 11 -----------
 net/dsa/stubs.c           | 10 ++++++++++
 11 files changed, 89 insertions(+), 31 deletions(-)
 create mode 100644 include/net/dsa_stubs.h
 create mode 100644 net/dsa/stubs.c

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a740be3bb911..1c25b39681b3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2878,7 +2878,6 @@ enum netdev_cmd {
 	NETDEV_OFFLOAD_XSTATS_REPORT_USED,
 	NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
 	NETDEV_XDP_FEAT_CHANGE,
-	NETDEV_PRE_CHANGE_HWTSTAMP,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
@@ -2929,11 +2928,6 @@ struct netdev_notifier_pre_changeaddr_info {
 	const unsigned char *dev_addr;
 };
 
-struct netdev_notifier_hwtstamp_info {
-	struct netdev_notifier_info info; /* must be first */
-	struct kernel_hwtstamp_config *config;
-};
-
 enum netdev_offload_xstats_type {
 	NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
 };
diff --git a/include/net/dsa_stubs.h b/include/net/dsa_stubs.h
new file mode 100644
index 000000000000..361811750a54
--- /dev/null
+++ b/include/net/dsa_stubs.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * include/net/dsa_stubs.h - Stubs for the Distributed Switch Architecture framework
+ */
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/net_tstamp.h>
+#include <net/dsa.h>
+
+#if IS_ENABLED(CONFIG_NET_DSA)
+
+extern const struct dsa_stubs *dsa_stubs;
+
+struct dsa_stubs {
+	int (*master_hwtstamp_validate)(struct net_device *dev,
+					const struct kernel_hwtstamp_config *config,
+					struct netlink_ext_ack *extack);
+};
+
+static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
+					       const struct kernel_hwtstamp_config *config,
+					       struct netlink_ext_ack *extack)
+{
+	if (!netdev_uses_dsa(dev))
+		return 0;
+
+	/* rtnl_lock() is a sufficient guarantee, because as long as
+	 * netdev_uses_dsa() returns true, the dsa_core module is still
+	 * registered, and so, dsa_unregister_stubs() couldn't have run.
+	 * For netdev_uses_dsa() to start returning false, it would imply that
+	 * dsa_master_teardown() has executed, which requires rtnl_lock().
+	 */
+	ASSERT_RTNL();
+
+	return dsa_stubs->master_hwtstamp_validate(dev, config, extack);
+}
+
+#else
+
+static inline int dsa_master_hwtstamp_validate(struct net_device *dev,
+					       const struct kernel_hwtstamp_config *config,
+					       struct netlink_ext_ack *extack)
+{
+	return 0;
+}
+
+#endif
diff --git a/net/Makefile b/net/Makefile
index 0914bea9c335..87592009366f 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -24,7 +24,7 @@ obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_BRIDGE)		+= bridge/
 obj-$(CONFIG_NET_DEVLINK)	+= devlink/
-obj-$(CONFIG_NET_DSA)		+= dsa/
+obj-y				+= dsa/
 obj-$(CONFIG_ATALK)		+= appletalk/
 obj-$(CONFIG_X25)		+= x25/
 obj-$(CONFIG_LAPB)		+= lapb/
diff --git a/net/core/dev.c b/net/core/dev.c
index 7ce5985be84b..480600a075ce 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1612,7 +1612,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
 	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
-	N(XDP_FEAT_CHANGE) N(PRE_CHANGE_HWTSTAMP)
+	N(XDP_FEAT_CHANGE)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 6d772837eb3f..3730945ee294 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -7,7 +7,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/wireless.h>
 #include <linux/if_bridge.h>
-#include <net/dsa.h>
+#include <net/dsa_stubs.h>
 #include <net/wext.h>
 
 #include "dev.h"
@@ -259,9 +259,6 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 
 static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 {
-	struct netdev_notifier_hwtstamp_info info = {
-		.info.dev = dev,
-	};
 	struct kernel_hwtstamp_config kernel_cfg;
 	struct netlink_ext_ack extack = {};
 	struct hwtstamp_config cfg;
@@ -276,12 +273,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
 	if (err)
 		return err;
 
-	info.info.extack = &extack;
-	info.config = &kernel_cfg;
-
-	err = call_netdevice_notifiers_info(NETDEV_PRE_CHANGE_HWTSTAMP,
-					    &info.info);
-	err = notifier_to_errno(err);
+	err = dsa_master_hwtstamp_validate(dev, &kernel_cfg, &extack);
 	if (err) {
 		if (extack._msg)
 			netdev_err(dev, "%s\n", extack._msg);
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index cc7e93a562fe..3835de286116 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
+
+# the stubs are built-in whenever DSA is built-in or module
+ifdef CONFIG_NET_DSA
+obj-y := stubs.o
+endif
+
 # the core
 obj-$(CONFIG_NET_DSA) += dsa_core.o
 dsa_core-y += \
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index e5f156940c67..ab1afe67fd18 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -17,6 +17,7 @@
 #include <linux/of.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
+#include <net/dsa_stubs.h>
 #include <net/sch_generic.h>
 
 #include "devlink.h"
@@ -1702,6 +1703,20 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
 }
 EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
 
+static const struct dsa_stubs __dsa_stubs = {
+	.master_hwtstamp_validate = __dsa_master_hwtstamp_validate,
+};
+
+static void dsa_register_stubs(void)
+{
+	dsa_stubs = &__dsa_stubs;
+}
+
+static void dsa_unregister_stubs(void)
+{
+	dsa_stubs = NULL;
+}
+
 static int __init dsa_init_module(void)
 {
 	int rc;
@@ -1721,6 +1736,8 @@ static int __init dsa_init_module(void)
 	if (rc)
 		goto netlink_register_fail;
 
+	dsa_register_stubs();
+
 	return 0;
 
 netlink_register_fail:
@@ -1735,6 +1752,8 @@ module_init(dsa_init_module);
 
 static void __exit dsa_cleanup_module(void)
 {
+	dsa_unregister_stubs();
+
 	rtnl_link_unregister(&dsa_link_ops);
 
 	dsa_slave_unregister_notifier();
diff --git a/net/dsa/master.c b/net/dsa/master.c
index c2cabe6248b1..6be89ab0cc01 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -198,7 +198,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 /* Deny PTP operations on master if there is at least one switch in the tree
  * that is PTP capable.
  */
-int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+int __dsa_master_hwtstamp_validate(struct net_device *dev,
 				   const struct kernel_hwtstamp_config *config,
 				   struct netlink_ext_ack *extack)
 {
diff --git a/net/dsa/master.h b/net/dsa/master.h
index 80842f4e27f7..76e39d3ec909 100644
--- a/net/dsa/master.h
+++ b/net/dsa/master.h
@@ -15,7 +15,7 @@ int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
 			 struct netlink_ext_ack *extack);
 void dsa_master_lag_teardown(struct net_device *lag_dev,
 			     struct dsa_port *cpu_dp);
-int dsa_master_pre_change_hwtstamp(struct net_device *dev,
+int __dsa_master_hwtstamp_validate(struct net_device *dev,
 				   const struct kernel_hwtstamp_config *config,
 				   struct netlink_ext_ack *extack);
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 8abc1658ac47..165bb2cb8431 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -3289,7 +3289,6 @@ static int dsa_master_changeupper(struct net_device *dev,
 static int dsa_slave_netdevice_event(struct notifier_block *nb,
 				     unsigned long event, void *ptr)
 {
-	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
 	switch (event) {
@@ -3419,16 +3418,6 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb,
 
 		return NOTIFY_OK;
 	}
-	case NETDEV_PRE_CHANGE_HWTSTAMP: {
-		struct netdev_notifier_hwtstamp_info *info = ptr;
-		int err;
-
-		if (!netdev_uses_dsa(dev))
-			return NOTIFY_DONE;
-
-		err = dsa_master_pre_change_hwtstamp(dev, info->config, extack);
-		return notifier_from_errno(err);
-	}
 	default:
 		break;
 	}
diff --git a/net/dsa/stubs.c b/net/dsa/stubs.c
new file mode 100644
index 000000000000..2ed8a6c85fbf
--- /dev/null
+++ b/net/dsa/stubs.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Stubs for DSA functionality called by the core network stack.
+ * These are necessary because CONFIG_NET_DSA can be a module, and built-in
+ * code cannot directly call symbols exported by modules.
+ */
+#include <net/dsa_stubs.h>
+
+const struct dsa_stubs *dsa_stubs;
+EXPORT_SYMBOL_GPL(dsa_stubs);
-- 
cgit v1.2.3


From c91c46de6bbc1147ae5dfe046b87f5f3d6593215 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 6 Apr 2023 18:25:33 -0700
Subject: net: provide macros for commonly copied lockless queue stop/wake code

A lot of drivers follow the same scheme to stop / start queues
without introducing locks between xmit and NAPI tx completions.
I'm guessing they all copy'n'paste each other's code.
The original code dates back all the way to e1000 and Linux 2.6.19.

Smaller drivers shy away from the scheme and introduce a lock
which may cause deadlocks in netpoll.

Provide macros which encapsulate the necessary logic.

The macros do not prevent false wake ups, the extra barrier
required to close that race is not worth it. See discussion in:
https://lore.kernel.org/all/c39312a2-4537-14b4-270c-9fe1fbb91e89@gmail.com/

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/driver.rst |   6 ++
 include/linux/netdevice.h           |   1 +
 include/net/netdev_queues.h         | 144 ++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+)
 create mode 100644 include/net/netdev_queues.h

(limited to 'include')

diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst
index 19c363291d04..4071f2c00f8b 100644
--- a/Documentation/networking/driver.rst
+++ b/Documentation/networking/driver.rst
@@ -104,6 +104,12 @@ and:
 	    TX_BUFFS_AVAIL(dp) > 0)
 		netif_wake_queue(dp->dev);
 
+Lockless queue stop / wake helper macros
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: include/net/netdev_queues.h
+   :doc: Lockless queue stopping / waking helpers.
+
 No exclusive ownership
 ----------------------
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1c25b39681b3..7bec9a2be8ef 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3335,6 +3335,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
 
 static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
 {
+	/* Must be an atomic op see netif_txq_try_stop() */
 	set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
 }
 
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
new file mode 100644
index 000000000000..5236d78bbdeb
--- /dev/null
+++ b/include/net/netdev_queues.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NET_QUEUES_H
+#define _LINUX_NET_QUEUES_H
+
+#include <linux/netdevice.h>
+
+/**
+ * DOC: Lockless queue stopping / waking helpers.
+ *
+ * The netif_txq_maybe_stop() and __netif_txq_completed_wake()
+ * macros are designed to safely implement stopping
+ * and waking netdev queues without full lock protection.
+ *
+ * We assume that there can be no concurrent stop attempts and no concurrent
+ * wake attempts. The try-stop should happen from the xmit handler,
+ * while wake up should be triggered from NAPI poll context.
+ * The two may run concurrently (single producer, single consumer).
+ *
+ * The try-stop side is expected to run from the xmit handler and therefore
+ * it does not reschedule Tx (netif_tx_start_queue() instead of
+ * netif_tx_wake_queue()). Uses of the ``stop`` macros outside of the xmit
+ * handler may lead to xmit queue being enabled but not run.
+ * The waking side does not have similar context restrictions.
+ *
+ * The macros guarantee that rings will not remain stopped if there's
+ * space available, but they do *not* prevent false wake ups when
+ * the ring is full! Drivers should check for ring full at the start
+ * of the xmit handler.
+ *
+ * All descriptor ring indexes (and other relevant shared state) must
+ * be updated before invoking the macros.
+ */
+
+#define netif_txq_try_stop(txq, get_desc, start_thrs)			\
+	({								\
+		int _res;						\
+									\
+		netif_tx_stop_queue(txq);				\
+		/* Producer index and stop bit must be visible		\
+		 * to consumer before we recheck.			\
+		 * Pairs with a barrier in __netif_txq_maybe_wake().	\
+		 */							\
+		smp_mb__after_atomic();					\
+									\
+		/* We need to check again in a case another		\
+		 * CPU has just made room available.			\
+		 */							\
+		_res = 0;						\
+		if (unlikely(get_desc >= start_thrs)) {			\
+			netif_tx_start_queue(txq);			\
+			_res = -1;					\
+		}							\
+		_res;							\
+	})								\
+
+/**
+ * netif_txq_maybe_stop() - locklessly stop a Tx queue, if needed
+ * @txq:	struct netdev_queue to stop/start
+ * @get_desc:	get current number of free descriptors (see requirements below!)
+ * @stop_thrs:	minimal number of available descriptors for queue to be left
+ *		enabled
+ * @start_thrs:	minimal number of descriptors to re-enable the queue, can be
+ *		equal to @stop_thrs or higher to avoid frequent waking
+ *
+ * All arguments may be evaluated multiple times, beware of side effects.
+ * @get_desc must be a formula or a function call, it must always
+ * return up-to-date information when evaluated!
+ * Expected to be used from ndo_start_xmit, see the comment on top of the file.
+ *
+ * Returns:
+ *	 0 if the queue was stopped
+ *	 1 if the queue was left enabled
+ *	-1 if the queue was re-enabled (raced with waking)
+ */
+#define netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs)	\
+	({								\
+		int _res;						\
+									\
+		_res = 1;						\
+		if (unlikely(get_desc < stop_thrs))			\
+			_res = netif_txq_try_stop(txq, get_desc, start_thrs); \
+		_res;							\
+	})								\
+
+
+/**
+ * __netif_txq_maybe_wake() - locklessly wake a Tx queue, if needed
+ * @txq:	struct netdev_queue to stop/start
+ * @get_desc:	get current number of free descriptors (see requirements below!)
+ * @start_thrs:	minimal number of descriptors to re-enable the queue
+ * @down_cond:	down condition, predicate indicating that the queue should
+ *		not be woken up even if descriptors are available
+ *
+ * All arguments may be evaluated multiple times.
+ * @get_desc must be a formula or a function call, it must always
+ * return up-to-date information when evaluated!
+ *
+ * Returns:
+ *	 0 if the queue was woken up
+ *	 1 if the queue was already enabled (or disabled but @down_cond is true)
+ *	-1 if the queue was left unchanged (@start_thrs not reached)
+ */
+#define __netif_txq_maybe_wake(txq, get_desc, start_thrs, down_cond)	\
+	({								\
+		int _res;						\
+									\
+		_res = -1;						\
+		if (likely(get_desc > start_thrs)) {			\
+			/* Make sure that anybody stopping the queue after \
+			 * this sees the new next_to_clean.		\
+			 */						\
+			smp_mb();					\
+			_res = 1;					\
+			if (unlikely(netif_tx_queue_stopped(txq)) &&	\
+			    !(down_cond)) {				\
+				netif_tx_wake_queue(txq);		\
+				_res = 0;				\
+			}						\
+		}							\
+		_res;							\
+	})
+
+#define netif_txq_maybe_wake(txq, get_desc, start_thrs)		\
+	__netif_txq_maybe_wake(txq, get_desc, start_thrs, false)
+
+/* subqueue variants follow */
+
+#define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs)		\
+	({								\
+		struct netdev_queue *txq;				\
+									\
+		txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_try_stop(txq, get_desc, start_thrs);		\
+	})
+
+#define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
+	({								\
+		struct netdev_queue *txq;				\
+									\
+		txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
+	})
+
+#endif
-- 
cgit v1.2.3


From 301f227fc860624d37ba5dae9da57dcf371268db Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 6 Apr 2023 18:25:36 -0700
Subject: net: piggy back on the memory barrier in bql when waking queues

Drivers call netdev_tx_completed_queue() right before
netif_txq_maybe_wake(). If BQL is enabled netdev_tx_completed_queue()
should issue a memory barrier, so we can depend on that separating
the stop check from the consumer index update, instead of adding
another barrier in netif_txq_maybe_wake().

This matters more than the barriers on the xmit path, because
the wake condition is almost always true. So we issue the
consumer side barrier often.

Wrap netdev_tx_completed_queue() in a local helper to issue
the barrier even if BQL is disabled. Keep the same semantics
as netdev_tx_completed_queue() (barrier only if bytes != 0)
to make it clear that the barrier is conditional.

Plus since macro gets pkt/byte counts as arguments now -
we can skip waking if there were no packets completed.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  6 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 ++++-----
 include/linux/netdevice.h                     |  2 +-
 include/net/netdev_queues.h                   | 39 ++++++++++++++++++++-------
 4 files changed, 38 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index de97bee25249..f7602d8d79e3 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -688,11 +688,11 @@ next_tx_int:
 		dev_kfree_skb_any(skb);
 	}
 
-	netdev_tx_completed_queue(txq, nr_pkts, tx_bytes);
 	txr->tx_cons = cons;
 
-	__netif_txq_maybe_wake(txq, bnxt_tx_avail(bp, txr), bp->tx_wake_thresh,
-			       READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING);
+	__netif_txq_completed_wake(txq, nr_pkts, tx_bytes,
+				   bnxt_tx_avail(bp, txr), bp->tx_wake_thresh,
+				   READ_ONCE(txr->dev_state) == BNXT_DEV_STATE_CLOSING);
 }
 
 static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cbbddee55db1..f2604fc05991 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1251,15 +1251,13 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 	if (ring_is_xdp(tx_ring))
 		return !!budget;
 
-	netdev_tx_completed_queue(txring_txq(tx_ring),
-				  total_packets, total_bytes);
-
 #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2)
 	txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
-	if (total_packets && netif_carrier_ok(tx_ring->netdev) &&
-	    !__netif_txq_maybe_wake(txq, ixgbe_desc_unused(tx_ring),
-				    TX_WAKE_THRESHOLD,
-				    test_bit(__IXGBE_DOWN, &adapter->state)))
+	/* last argument is the "do not wake" condition: no carrier or going down */
+	if (!__netif_txq_completed_wake(txq, total_packets, total_bytes,
+					ixgbe_desc_unused(tx_ring), TX_WAKE_THRESHOLD,
+					!netif_carrier_ok(tx_ring->netdev) ||
+					test_bit(__IXGBE_DOWN, &adapter->state)))
 		++tx_ring->tx_stats.restart_queue;
 
 	return !!budget;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7bec9a2be8ef..fe355592dfde 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3532,7 +3532,7 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
 	 * netdev_tx_sent_queue will miss the update and cause the queue to
 	 * be stopped forever
 	 */
-	smp_mb();
+	smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */
 
 	if (unlikely(dql_avail(&dev_queue->dql) < 0))
 		return;
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 5236d78bbdeb..b26fdb441e39 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -38,7 +38,7 @@
 		netif_tx_stop_queue(txq);				\
 		/* Producer index and stop bit must be visible		\
 		 * to consumer before we recheck.			\
-		 * Pairs with a barrier in __netif_txq_maybe_wake().	\
+		 * Pairs with a barrier in __netif_txq_completed_wake(). \
 		 */							\
 		smp_mb__after_atomic();					\
 									\
@@ -82,10 +82,24 @@
 		_res;							\
 	})								\
 
+/* Variant of netdev_tx_completed_queue() which guarantees smp_mb() if
+ * @bytes != 0, regardless of kernel config.
+ */
+static inline void
+netdev_txq_completed_mb(struct netdev_queue *dev_queue,
+			unsigned int pkts, unsigned int bytes)
+{
+	if (IS_ENABLED(CONFIG_BQL))
+		netdev_tx_completed_queue(dev_queue, pkts, bytes);
+	else if (bytes)
+		smp_mb();
+}
 
 /**
- * __netif_txq_maybe_wake() - locklessly wake a Tx queue, if needed
+ * __netif_txq_completed_wake() - locklessly wake a Tx queue, if needed
  * @txq:	struct netdev_queue to stop/start
+ * @pkts:	number of packets completed
+ * @bytes:	number of bytes completed
  * @get_desc:	get current number of free descriptors (see requirements below!)
  * @start_thrs:	minimal number of descriptors to re-enable the queue
  * @down_cond:	down condition, predicate indicating that the queue should
@@ -94,22 +108,27 @@
  * All arguments may be evaluated multiple times.
  * @get_desc must be a formula or a function call, it must always
  * return up-to-date information when evaluated!
+ * Reports completed pkts/bytes to BQL.
  *
  * Returns:
  *	 0 if the queue was woken up
  *	 1 if the queue was already enabled (or disabled but @down_cond is true)
  *	-1 if the queue was left unchanged (@start_thrs not reached)
  */
-#define __netif_txq_maybe_wake(txq, get_desc, start_thrs, down_cond)	\
+#define __netif_txq_completed_wake(txq, pkts, bytes,			\
+				   get_desc, start_thrs, down_cond)	\
 	({								\
 		int _res;						\
 									\
+		/* Report to BQL and piggy back on its barrier.		\
+		 * Barrier makes sure that anybody stopping the queue	\
+		 * after this point sees the new consumer index.	\
+		 * Pairs with barrier in netif_txq_try_stop().		\
+		 */							\
+		netdev_txq_completed_mb(txq, pkts, bytes);		\
+									\
 		_res = -1;						\
-		if (likely(get_desc > start_thrs)) {			\
-			/* Make sure that anybody stopping the queue after \
-			 * this sees the new next_to_clean.		\
-			 */						\
-			smp_mb();					\
+		if (pkts && likely(get_desc > start_thrs)) {		\
 			_res = 1;					\
 			if (unlikely(netif_tx_queue_stopped(txq)) &&	\
 			    !(down_cond)) {				\
@@ -120,8 +139,8 @@
 		_res;							\
 	})
 
-#define netif_txq_maybe_wake(txq, get_desc, start_thrs)		\
-	__netif_txq_maybe_wake(txq, get_desc, start_thrs, false)
+#define netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs) \
+	__netif_txq_completed_wake(txq, pkts, bytes, get_desc, start_thrs, false)
 
 /* subqueue variants follow */
 
-- 
cgit v1.2.3


From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:47 -0700
Subject: bpf: Split off basic BPF verifier log into separate file

kernel/bpf/verifier.c file is large and growing larger all the time. So
it's good to start splitting off more or less self-contained parts into
separate files to keep source code size (somewhat) under control.

This patch is one step in this direction, moving some of the BPF verifier
log routines into a separate kernel/bpf/log.c. Right now these are mostly
low-level and isolated routines to append data to the log, reset the log
to a previous position, etc. Eventually we could probably move verifier
state printing logic here as well, but this patch doesn't attempt to do
that yet.

Subsequent patches will add more logic to verifier log management, so
having basics in a separate file will make sure verifier.c doesn't grow
more with new changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 19 ++++------
 kernel/bpf/Makefile          |  3 +-
 kernel/bpf/log.c             | 85 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 69 -----------------------------------
 4 files changed, 94 insertions(+), 82 deletions(-)
 create mode 100644 kernel/bpf/log.c

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 81d525d057c7..83dff25545ee 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -498,11 +498,6 @@ struct bpf_verifier_log {
 	u32 len_total;
 };
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
-{
-	return log->len_used >= log->len_total - 1;
-}
-
 #define BPF_LOG_LEVEL1	1
 #define BPF_LOG_LEVEL2	2
 #define BPF_LOG_STATS	4
@@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
+static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
+{
+	return log->len_used >= log->len_total - 1;
+}
+
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log &&
@@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 		 log->level == BPF_LOG_KERNEL);
 }
 
-static inline bool
-bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
-{
-	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
-	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
-}
-
 #define BPF_MAX_SUBPROGS 256
 
 struct bpf_subprog_info {
@@ -608,12 +601,14 @@ struct bpf_verifier_env {
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
+bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log);
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
+void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 02242614dcc7..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
 endif
 CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
 
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..920061e38d2e
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+
+bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+	return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 &&
+	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+		       va_list args)
+{
+	unsigned int n;
+
+	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+		  "verifier log line truncated - local buffer too short\n");
+
+	if (log->level == BPF_LOG_KERNEL) {
+		bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+		return;
+	}
+
+	n = min(log->len_total - log->len_used - 1, n);
+	log->kbuf[n] = '\0';
+	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+		log->len_used += n;
+	else
+		log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+{
+	char zero = 0;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	log->len_used = new_pos;
+	if (put_user(zero, log->ubuf + new_pos))
+		log->ubuf = NULL;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+					   const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(&env->log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(&env->log, fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+			    const char *fmt, ...)
+{
+	va_list args;
+
+	if (!bpf_verifier_log_needed(log))
+		return;
+
+	va_start(args, fmt);
+	bpf_verifier_vlog(log, fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3660b573048a..745ae0cd01d4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -335,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
 	return &linfo[i - 1];
 }
 
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
-		       va_list args)
-{
-	unsigned int n;
-
-	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
-	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
-		  "verifier log line truncated - local buffer too short\n");
-
-	if (log->level == BPF_LOG_KERNEL) {
-		bool newline = n > 0 && log->kbuf[n - 1] == '\n';
-
-		pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
-		return;
-	}
-
-	n = min(log->len_total - log->len_used - 1, n);
-	log->kbuf[n] = '\0';
-	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
-		log->len_used += n;
-	else
-		log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
-	char zero = 0;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	log->len_used = new_pos;
-	if (put_user(zero, log->ubuf + new_pos))
-		log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
-					   const char *fmt, ...)
-{
-	va_list args;
-
-	if (!bpf_verifier_log_needed(&env->log))
-		return;
-
-	va_start(args, fmt);
-	bpf_verifier_vlog(&env->log, fmt, args);
-	va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
 __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 {
 	struct bpf_verifier_env *env = private_data;
@@ -403,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
 	va_end(args);
 }
 
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
-			    const char *fmt, ...)
-{
-	va_list args;
-
-	if (!bpf_verifier_log_needed(log))
-		return;
-
-	va_start(args, fmt);
-	bpf_verifier_vlog(log, fmt, args);
-	va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_log);
-
 static const char *ltrim(const char *s)
 {
 	while (isspace(*s))
-- 
cgit v1.2.3


From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:49 -0700
Subject: bpf: Switch BPF verifier log to be a rotating log by default

Currently, if user-supplied log buffer to collect BPF verifier log turns
out to be too small to contain full log, bpf() syscall returns -ENOSPC,
fails BPF program verification/load, and preserves first N-1 bytes of
the verifier log (where N is the size of user-supplied buffer).

This is problematic in a bunch of common scenarios, especially when
working with real-world BPF programs that tend to be pretty complex as
far as verification goes and require big log buffers. Typically, it's
when debugging tricky cases at log level 2 (verbose). Also, when BPF program
is successfully validated, log level 2 is the only way to actually see
verifier state progression and all the important details.

Even with log level 1, it's possible to get -ENOSPC even if the final
verifier log fits in log buffer, if there is a code path that's deep
enough to fill up entire log, even if normally it would be reset later
on (there is a logic to chop off successfully validated portions of BPF
verifier log).

In short, it's not always possible to pre-size log buffer. Also, what's
worse, in practice, the end of the log most often is way more important
than the beginning, but verifier stops emitting log as soon as initial
log buffer is filled up.

This patch switches BPF verifier log behavior to effectively behave as
rotating log. That is, if user-supplied log buffer turns out to be too
short, verifier will keep overwriting previously written log,
effectively treating user's log buffer as a ring buffer. -ENOSPC is
still going to be returned at the end, to notify user that log contents
was truncated, but the important last N bytes of the log would be
returned, which might be all that user really needs. This consistent
-ENOSPC behavior, regardless of rotating or fixed log behavior, allows
to prevent backwards compatibility breakage. The only user-visible
change is which portion of verifier log user ends up seeing *if buffer
is too small*. Given contents of verifier log itself is not an ABI,
there is no breakage due to this behavior change. Specialized tools that
rely on specific contents of verifier log in -ENOSPC scenario are
expected to be easily adapted to accommodate old and new behaviors.

Importantly, though, to preserve good user experience and not require
every user-space application to adapt to this new behavior, before
exiting to user-space verifier will rotate log (in place) to make it
start at the very beginning of user buffer as a continuous
zero-terminated string. The contents will be a chopped off N-1 last
bytes of full verifier log, of course.

Given beginning of log is sometimes important as well, we add
BPF_LOG_FIXED (which equals 8) flag to force old behavior, which allows
tools like veristat to request first part of verifier log, if necessary.
BPF_LOG_FIXED flag is also a simple and straightforward way to check if
BPF verifier supports rotating behavior.

On the implementation side, conceptually, it's all simple. We maintain
64-bit logical start and end positions. If we need to truncate the log,
start position will be adjusted accordingly to lag end position by
N bytes. We then use those logical positions to calculate their matching
actual positions in user buffer and handle wrap around the end of the
buffer properly. Finally, right before returning from bpf_check(), we
rotate user log buffer contents in-place as necessary, to make log
contents contiguous. See comments in relevant functions for details.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org
---
 include/linux/bpf_verifier.h                       |  33 +++-
 kernel/bpf/btf.c                                   |   3 +-
 kernel/bpf/log.c                                   | 198 ++++++++++++++++++++-
 kernel/bpf/verifier.c                              |  19 +-
 tools/testing/selftests/bpf/prog_tests/log_fixup.c |   1 +
 5 files changed, 228 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 83dff25545ee..4c926227f612 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -491,25 +491,42 @@ struct bpf_insn_aux_data {
 #define BPF_VERIFIER_TMP_LOG_SIZE	1024
 
 struct bpf_verifier_log {
-	u32 level;
-	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
+	/* Logical start and end positions of a "log window" of the verifier log.
+	 * start_pos == 0 means we haven't truncated anything.
+	 * Once truncation starts to happen, start_pos + len_total == end_pos,
+	 * except during log reset situations, in which (end_pos - start_pos)
+	 * might get smaller than len_total (see bpf_vlog_reset()).
+	 * Generally, (end_pos - start_pos) gives number of useful data in
+	 * user log buffer.
+	 */
+	u64 start_pos;
+	u64 end_pos;
 	char __user *ubuf;
-	u32 len_used;
+	u32 level;
 	u32 len_total;
+	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 };
 
 #define BPF_LOG_LEVEL1	1
 #define BPF_LOG_LEVEL2	2
 #define BPF_LOG_STATS	4
+#define BPF_LOG_FIXED	8
 #define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
-#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
+#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED)
 #define BPF_LOG_KERNEL	(BPF_LOG_MASK + 1) /* kernel internal flag */
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
+static inline u32 bpf_log_used(const struct bpf_verifier_log *log)
+{
+	return log->end_pos - log->start_pos;
+}
+
 static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
-	return log->len_used >= log->len_total - 1;
+	if (log->level & BPF_LOG_FIXED)
+		return bpf_log_used(log) >= log->len_total - 1;
+	return false;
 }
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
@@ -596,7 +613,7 @@ struct bpf_verifier_env {
 	u32 scratched_regs;
 	/* Same as scratched_regs but for stack slots */
 	u64 scratched_stack_slots;
-	u32 prev_log_len, prev_insn_print_len;
+	u64 prev_log_pos, prev_insn_print_pos;
 	/* buffer used in reg_type_str() to generate reg_type string */
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
@@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
-void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos);
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
+void bpf_vlog_finalize(struct bpf_verifier_log *log);
+bool bpf_vlog_truncated(const struct bpf_verifier_log *log);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 593c45a294d0..20a05b8932db 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5593,7 +5593,8 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		}
 	}
 
-	if (log->level && bpf_verifier_log_full(log)) {
+	bpf_vlog_finalize(log);
+	if (log->level && bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
 	}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 1974891fc324..92b1c8ad6601 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -8,6 +8,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <linux/bpf_verifier.h>
+#include <linux/math64.h>
 
 bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
@@ -32,23 +33,202 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		return;
 	}
 
-	n = min(log->len_total - log->len_used - 1, n);
-	log->kbuf[n] = '\0';
-	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
-		log->len_used += n;
-	else
-		log->ubuf = NULL;
+	if (log->level & BPF_LOG_FIXED) {
+		n = min(log->len_total - bpf_log_used(log) - 1, n);
+		log->kbuf[n] = '\0';
+		n += 1;
+
+		if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n))
+			goto fail;
+
+		log->end_pos += n - 1; /* don't count terminating '\0' */
+	} else {
+		u64 new_end, new_start, cur_pos;
+		u32 buf_start, buf_end, new_n;
+
+		n += 1;
+
+		new_end = log->end_pos + n;
+		if (new_end - log->start_pos >= log->len_total)
+			new_start = new_end - log->len_total;
+		else
+			new_start = log->start_pos;
+		new_n = min(n, log->len_total);
+		cur_pos = new_end - new_n;
+
+		div_u64_rem(cur_pos, log->len_total, &buf_start);
+		div_u64_rem(new_end, log->len_total, &buf_end);
+		/* new_end and buf_end are exclusive indices, so if buf_end is
+		 * exactly zero, then it actually points right to the end of
+		 * ubuf and there is no wrap around
+		 */
+		if (buf_end == 0)
+			buf_end = log->len_total;
+
+		/* if buf_start > buf_end, we wrapped around;
+		 * if buf_start == buf_end, then we fill ubuf completely; we
+		 * can't have buf_start == buf_end to mean that there is
+		 * nothing to write, because we always write at least
+		 * something, even if terminal '\0'
+		 */
+		if (buf_start < buf_end) {
+			/* message fits within contiguous chunk of ubuf */
+			if (copy_to_user(log->ubuf + buf_start,
+					 log->kbuf + n - new_n,
+					 buf_end - buf_start))
+				goto fail;
+		} else {
+			/* message wraps around the end of ubuf, copy in two chunks */
+			if (copy_to_user(log->ubuf + buf_start,
+					 log->kbuf + n - new_n,
+					 log->len_total - buf_start))
+				goto fail;
+			if (copy_to_user(log->ubuf,
+					 log->kbuf + n - buf_end,
+					 buf_end))
+				goto fail;
+		}
+
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+	}
+
+	return;
+fail:
+	log->ubuf = NULL;
 }
 
-void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 {
 	char zero = 0;
+	u32 pos;
+
+	if (WARN_ON_ONCE(new_pos > log->end_pos))
+		return;
 
 	if (!bpf_verifier_log_needed(log))
 		return;
 
-	log->len_used = new_pos;
-	if (put_user(zero, log->ubuf + new_pos))
+	/* if position to which we reset is beyond current log window,
+	 * then we didn't preserve any useful content and should adjust
+	 * start_pos to end up with an empty log (start_pos == end_pos)
+	 */
+	log->end_pos = new_pos;
+	if (log->end_pos < log->start_pos)
+		log->start_pos = log->end_pos;
+	div_u64_rem(new_pos, log->len_total, &pos);
+	if (put_user(zero, log->ubuf + pos))
+		log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+	int i, j;
+
+	for (i = 0, j = len - 1; i < j; i++, j--)
+		swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+	/* we split log->kbuf into two equal parts for both ends of array */
+	int n = sizeof(log->kbuf) / 2, nn;
+	char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+	/* Read ubuf's section [start, end) two chunks at a time, from left
+	 * and right side; within each chunk, swap all the bytes; after that
+	 * reverse the order of lbuf and rbuf and write result back to ubuf.
+	 * This way we'll end up with swapped contents of specified
+	 * [start, end) ubuf segment.
+	 */
+	while (end - start > 1) {
+		nn = min(n, (end - start ) / 2);
+
+		if (copy_from_user(lbuf, log->ubuf + start, nn))
+			return -EFAULT;
+		if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+			return -EFAULT;
+
+		bpf_vlog_reverse_kbuf(lbuf, nn);
+		bpf_vlog_reverse_kbuf(rbuf, nn);
+
+		/* we write lbuf to the right end of ubuf, while rbuf to the
+		 * left one to end up with properly reversed overall ubuf
+		 */
+		if (copy_to_user(log->ubuf + start, rbuf, nn))
+			return -EFAULT;
+		if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+			return -EFAULT;
+
+		start += nn;
+		end -= nn;
+	}
+
+	return 0;
+}
+
+bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
+{
+	if (log->level & BPF_LOG_FIXED)
+		return bpf_log_used(log) >= log->len_total - 1;
+	else
+		return log->start_pos > 0;
+}
+
+void bpf_vlog_finalize(struct bpf_verifier_log *log)
+{
+	u32 sublen;
+	int err;
+
+	if (!log || !log->level || !log->ubuf)
+		return;
+	if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL)
+		return;
+
+	/* If we never truncated log, there is nothing to move around. */
+	if (log->start_pos == 0)
+		return;
+
+	/* Otherwise we need to rotate log contents to make it start from the
+	 * buffer beginning and be a continuous zero-terminated string. Note
+	 * that if log->start_pos != 0 then we definitely filled up entire log
+	 * buffer with no gaps, and we just need to shift buffer contents to
+	 * the left by (log->start_pos % log->len_total) bytes.
+	 *
+	 * Unfortunately, user buffer could be huge and we don't want to
+	 * allocate temporary kernel memory of the same size just to shift
+	 * contents in a straightforward fashion. Instead, we'll be clever and
+	 * do in-place array rotation. This is a leetcode-style problem, which
+	 * could be solved by three rotations.
+	 *
+	 * Let's say we have log buffer that has to be shifted left by 7 bytes
+	 * (spaces and vertical bar is just for demonstrative purposes):
+	 *   E F G H I J K | A B C D
+	 *
+	 * First, we reverse entire array:
+	 *   D C B A | K J I H G F E
+	 *
+	 * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+	 * (KJIHGFE), resulting in a properly rotated array:
+	 *   A B C D | E F G H I J K
+	 *
+	 * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+	 * bytes, and write them back. Doing it byte-by-byte would be
+	 * unnecessarily inefficient. Altogether we are going to read and
+	 * write each byte twice, for total 4 memory copies between kernel and
+	 * user space.
+	 */
+
+	/* length of the chopped off part that will be the beginning;
+	 * len(ABCD) in the example above
+	 */
+	div_u64_rem(log->start_pos, log->len_total, &sublen);
+	sublen = log->len_total - sublen;
+
+	err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+	err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+	if (err)
 		log->ubuf = NULL;
 }
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 745ae0cd01d4..a476bb319685 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1439,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos)
 static void print_insn_state(struct bpf_verifier_env *env,
 			     const struct bpf_func_state *state)
 {
-	if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+	if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
 		/* remove new line character */
-		bpf_vlog_reset(&env->log, env->prev_log_len - 1);
-		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+		bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+		verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
 	} else {
 		verbose(env, "%d:", env->insn_idx);
 	}
@@ -1750,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
-	elem->log_pos = env->log.len_used;
+	elem->log_pos = env->log.end_pos;
 	env->head = elem;
 	env->stack_size++;
 	err = copy_verifier_state(&elem->st, cur);
@@ -2286,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
-	elem->log_pos = env->log.len_used;
+	elem->log_pos = env->log.end_pos;
 	env->head = elem;
 	env->stack_size++;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
@@ -15638,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env)
 				print_insn_state(env, state->frame[state->curframe]);
 
 			verbose_linfo(env, env->insn_idx, "; ");
-			env->prev_log_len = env->log.len_used;
+			env->prev_log_pos = env->log.end_pos;
 			verbose(env, "%d: ", env->insn_idx);
 			print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
-			env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
-			env->prev_log_len = env->log.len_used;
+			env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+			env->prev_log_pos = env->log.end_pos;
 		}
 
 		if (bpf_prog_is_offloaded(env->prog->aux)) {
@@ -18860,7 +18860,8 @@ skip_full_check:
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
 
-	if (log->level && bpf_verifier_log_full(log))
+	bpf_vlog_finalize(log);
+	if (log->level && bpf_vlog_truncated(log))
 		ret = -ENOSPC;
 	if (log->level && !log->ubuf) {
 		ret = -EFAULT;
diff --git a/tools/testing/selftests/bpf/prog_tests/log_fixup.c b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
index 239e1c5753b0..bc27170bdeb0 100644
--- a/tools/testing/selftests/bpf/prog_tests/log_fixup.c
+++ b/tools/testing/selftests/bpf/prog_tests/log_fixup.c
@@ -24,6 +24,7 @@ static void bad_core_relo(size_t log_buf_size, enum trunc_type trunc_type)
 	bpf_program__set_autoload(skel->progs.bad_relo, true);
 	memset(log_buf, 0, sizeof(log_buf));
 	bpf_program__set_log_buf(skel->progs.bad_relo, log_buf, log_buf_size ?: sizeof(log_buf));
+	bpf_program__set_log_level(skel->progs.bad_relo, 1 | 8); /* BPF_LOG_FIXED to force truncation */
 
 	err = test_log_fixup__load(skel);
 	if (!ASSERT_ERR(err, "load_fail"))
-- 
cgit v1.2.3


From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:57 -0700
Subject: bpf: Keep track of total log content size in both fixed and rolling
 modes

Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos
as *logical* log position. This means that we can go beyond physical log
buffer size now and be able to tell what log buffer size should be to
fit entire log contents without -ENOSPC.

To do this for BPF_LOG_FIXED mode, we need to remove a short-circuiting
logic of not vsnprintf()'ing further log content once we filled up
user-provided buffer, which is done by bpf_verifier_log_needed() checks.
We modify these checks to always keep going if log->level is non-zero
(i.e., log is requested), even if log->ubuf was NULL'ed out due to
copying data to user-space, or if entire log buffer is physically full.
We adapt the bpf_verifier_vlog() routine to work correctly with
log->ubuf == NULL condition, performing log formatting into temporary
kernel buffer, doing all the necessary accounting, but just avoiding
copying data out if buffer is full or NULL'ed out.

With these changes, it's now possible to do this sort of determination of
log contents size in both BPF_LOG_FIXED and default rolling log mode.
We need to keep in mind bpf_vlog_reset(), though, which shrinks log
contents after successful verification of a particular code path. This
log reset means that log->end_pos isn't always increasing, so to return
back to users what should be the log buffer size to fit all log content
without causing -ENOSPC even in the presence of log resetting, we need
to keep maximum over "lifetime" of logging. We do this accounting in
bpf_vlog_update_len_max() helper.

A related and subtle aspect is that with this logical log->end_pos even in
BPF_LOG_FIXED mode we could temporarily "overflow" buffer, but then reset
it back with bpf_vlog_reset() to a position inside user-supplied
log_buf. In such situation we still want to properly maintain
terminating zero. We will eventually return -ENOSPC even if final log
buffer is small (we detect this through log->len_max check). This
behavior is simpler to reason about and is consistent with current
behavior of verifier log. Handling of this required a small addition to
bpf_vlog_reset() logic to avoid doing put_user() beyond physical log
buffer dimensions.

Another issue to keep in mind is that we limit log buffer size to 32-bit
value and keep such log length as u32, but theoretically verifier could
produce huge log stretching beyond 4GB. Instead of keeping (and later
returning) 64-bit log length, we cap it at UINT_MAX. Current UAPI makes
it impossible to specify log buffer size bigger than 4GB anyways, so we
don't really lose anything here and keep everything consistently 32-bit
in UAPI. This property will be utilized in next patch.

Doing the same determination of maximum log buffer for rolling mode is
trivial, as log->end_pos and log->start_pos are already logical
positions, so there is nothing new there.

These changes do incidentally fix one small issue with previous logging
logic. Previously, if user provided log buffer of size N, and actual log
output was exactly N-1 bytes + terminating \0, kernel logic couldn't
distinguish this condition from log truncation scenario which would end
up with truncated log contents of N-1 bytes + terminating \0 as well.

But now with log->end_pos being logical position that could go beyond
actual log buffer size, we can distinguish these two conditions, which
we do in this patch. This plays nicely with returning log_size_actual
(implemented in UAPI in the next patch), as we can now guarantee that if
user takes such log_size_actual and provides log buffer of that exact
size, they will not get -ENOSPC in return.

All in all, all these changes do conceptually unify fixed and rolling
log modes much better, and allow a nice feature requested by users:
knowing what should be the size of the buffer to avoid -ENOSPC.

We'll plumb this through the UAPI and the code in the next patch.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 12 ++------
 kernel/bpf/log.c             | 67 ++++++++++++++++++++++++++++++--------------
 2 files changed, 49 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4c926227f612..98d2eb382dbb 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -504,6 +504,7 @@ struct bpf_verifier_log {
 	char __user *ubuf;
 	u32 level;
 	u32 len_total;
+	u32 len_max;
 	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 };
 
@@ -517,23 +518,16 @@ struct bpf_verifier_log {
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
-static inline u32 bpf_log_used(const struct bpf_verifier_log *log)
-{
-	return log->end_pos - log->start_pos;
-}
-
 static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
 	if (log->level & BPF_LOG_FIXED)
-		return bpf_log_used(log) >= log->len_total - 1;
+		return log->end_pos >= log->len_total;
 	return false;
 }
 
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
-	return log &&
-		((log->level && log->ubuf && !bpf_verifier_log_full(log)) ||
-		 log->level == BPF_LOG_KERNEL);
+	return log && log->level;
 }
 
 #define BPF_MAX_SUBPROGS 256
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index c778f3b290cb..47bea2fad6fe 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -16,10 +16,26 @@ bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+	/* add_len includes terminal \0, so no need for +1. */
+	u64 len = log->end_pos + add_len;
+
+	/* log->len_max could be larger than our current len due to
+	 * bpf_vlog_reset() calls, so we maintain the max of any length at any
+	 * previous point
+	 */
+	if (len > UINT_MAX)
+		log->len_max = UINT_MAX;
+	else if (len > log->len_max)
+		log->len_max = len;
+}
+
 void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		       va_list args)
 {
-	unsigned int n;
+	u64 cur_pos;
+	u32 new_n, n;
 
 	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
 
@@ -33,21 +49,27 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		return;
 	}
 
-	if (log->level & BPF_LOG_FIXED) {
-		n = min(log->len_total - bpf_log_used(log) - 1, n);
-		log->kbuf[n] = '\0';
-		n += 1;
+	n += 1; /* include terminating zero */
+	bpf_vlog_update_len_max(log, n);
 
-		if (copy_to_user(log->ubuf + log->end_pos, log->kbuf, n))
-			goto fail;
+	if (log->level & BPF_LOG_FIXED) {
+		/* check if we have at least something to put into user buf */
+		new_n = 0;
+		if (log->end_pos < log->len_total) {
+			new_n = min_t(u32, log->len_total - log->end_pos, n);
+			log->kbuf[new_n - 1] = '\0';
+		}
 
+		cur_pos = log->end_pos;
 		log->end_pos += n - 1; /* don't count terminating '\0' */
+
+		if (log->ubuf && new_n &&
+		    copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+			goto fail;
 	} else {
-		u64 new_end, new_start, cur_pos;
+		u64 new_end, new_start;
 		u32 buf_start, buf_end, new_n;
 
-		n += 1;
-
 		new_end = log->end_pos + n;
 		if (new_end - log->start_pos >= log->len_total)
 			new_start = new_end - log->len_total;
@@ -65,6 +87,12 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 		if (buf_end == 0)
 			buf_end = log->len_total;
 
+		log->start_pos = new_start;
+		log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+		if (!log->ubuf)
+			return;
+
 		/* if buf_start > buf_end, we wrapped around;
 		 * if buf_start == buf_end, then we fill ubuf completely; we
 		 * can't have buf_start == buf_end to mean that there is
@@ -88,9 +116,6 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
 					 buf_end))
 				goto fail;
 		}
-
-		log->start_pos = new_start;
-		log->end_pos = new_end - 1; /* don't count terminating '\0' */
 	}
 
 	return;
@@ -116,8 +141,13 @@ void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
 	log->end_pos = new_pos;
 	if (log->end_pos < log->start_pos)
 		log->start_pos = log->end_pos;
-	div_u64_rem(new_pos, log->len_total, &pos);
-	if (put_user(zero, log->ubuf + pos))
+
+	if (log->level & BPF_LOG_FIXED)
+		pos = log->end_pos + 1;
+	else
+		div_u64_rem(new_pos, log->len_total, &pos);
+
+	if (log->ubuf && pos < log->len_total && put_user(zero, log->ubuf + pos))
 		log->ubuf = NULL;
 }
 
@@ -169,12 +199,7 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 
 bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
-	if (!log->level)
-		return false;
-	else if (log->level & BPF_LOG_FIXED)
-		return bpf_log_used(log) >= log->len_total - 1;
-	else
-		return log->start_pos > 0;
+	return log->len_max > log->len_total;
 }
 
 void bpf_vlog_finalize(struct bpf_verifier_log *log)
-- 
cgit v1.2.3


From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:58 -0700
Subject: bpf: Add log_true_size output field to return necessary log buffer
 size

Add output-only log_true_size and btf_log_true_size fields to
BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return
the size of log buffer necessary to fit in all the log contents at
specified log_level. This is very useful for BPF loader libraries like
libbpf to be able to size log buffer correctly, but could be used by
users directly, if necessary, as well.

This patch plumbs all this through the code, taking into account actual
bpf_attr size provided by user to determine if these new fields are
expected by users. And if they are, set them from kernel on return.

We refactor the btf_parse() function to accommodate this, moving attr and
uattr handling inside it. The rest is very straightforward code, which
is split from the logging accounting changes in the previous patch to
make it simpler to review logic vs UAPI changes.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org
---
 include/linux/bpf.h            |  2 +-
 include/linux/btf.h            |  2 +-
 include/uapi/linux/bpf.h       | 10 ++++++++++
 kernel/bpf/btf.c               | 32 ++++++++++++++++++--------------
 kernel/bpf/syscall.c           | 16 ++++++++--------
 kernel/bpf/verifier.c          |  8 +++++++-
 tools/include/uapi/linux/bpf.h | 12 +++++++++++-
 7 files changed, 56 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 002a811b6b90..2c6095bd7d69 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
 			     size_t actual_size);
 
 /* verify correctness of eBPF program */
-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr);
+int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size);
 
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
diff --git a/include/linux/btf.h b/include/linux/btf.h
index d53b10cc55f2..495250162422 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -125,7 +125,7 @@ extern const struct file_operations btf_fops;
 
 void btf_get(struct btf *btf);
 void btf_put(struct btf *btf);
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr);
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz);
 struct btf *btf_get_by_fd(int fd);
 int btf_get_info_by_fd(const struct btf *btf,
 		       const union bpf_attr *attr,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e3d3b5160d26..3823100b7934 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1407,6 +1407,11 @@ union bpf_attr {
 		__aligned_u64	fd_array;	/* array of FDs */
 		__aligned_u64	core_relos;
 		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		log_true_size;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1492,6 +1497,11 @@ union bpf_attr {
 		__u32		btf_size;
 		__u32		btf_log_size;
 		__u32		btf_log_level;
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		btf_log_true_size;
 	};
 
 	struct {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 5aa540ee611f..0748cf4b8ab6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5504,9 +5504,10 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
-			     u32 log_level, char __user *log_ubuf, u32 log_size)
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
+	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
 	struct bpf_verifier_log *log;
@@ -5514,7 +5515,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	u8 *data;
 	int err;
 
-	if (btf_data_size > BTF_MAX_SIZE)
+	if (attr->btf_size > BTF_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
 	env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -5522,13 +5523,13 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 		return ERR_PTR(-ENOMEM);
 
 	log = &env->log;
-	if (log_level || log_ubuf || log_size) {
+	if (attr->btf_log_level || log_ubuf || attr->btf_log_size) {
 		/* user requested verbose verifier output
 		 * and supplied buffer to store the verification trace
 		 */
-		log->level = log_level;
+		log->level = attr->btf_log_level;
 		log->ubuf = log_ubuf;
-		log->len_total = log_size;
+		log->len_total = attr->btf_log_size;
 
 		/* log attributes have to be sane */
 		if (!bpf_verifier_log_attr_valid(log)) {
@@ -5544,16 +5545,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 	env->btf = btf;
 
-	data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+	data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
 	if (!data) {
 		err = -ENOMEM;
 		goto errout;
 	}
 
 	btf->data = data;
-	btf->data_size = btf_data_size;
+	btf->data_size = attr->btf_size;
 
-	if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+	if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
 		err = -EFAULT;
 		goto errout;
 	}
@@ -5594,6 +5595,12 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
 	}
 
 	bpf_vlog_finalize(log);
+	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+				  &log->len_max, sizeof(log->len_max))) {
+		err = -EFAULT;
+		goto errout_meta;
+	}
 	if (bpf_vlog_truncated(log)) {
 		err = -ENOSPC;
 		goto errout_meta;
@@ -7218,15 +7225,12 @@ static int __btf_new_fd(struct btf *btf)
 	return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
 }
 
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	struct btf *btf;
 	int ret;
 
-	btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
-			attr->btf_size, attr->btf_log_level,
-			u64_to_user_ptr(attr->btf_log_buf),
-			attr->btf_log_size);
+	btf = btf_parse(attr, uattr, uattr_size);
 	if (IS_ERR(btf))
 		return PTR_ERR(btf);
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e18ac7fdc210..6d575505f89c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2501,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define	BPF_PROG_LOAD_LAST_FIELD log_true_size
 
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
 	struct bpf_prog *prog, *dst_prog = NULL;
@@ -2653,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
 		goto free_prog_sec;
 
 	/* run eBPF verifier */
-	err = bpf_check(&prog, attr, uattr);
+	err = bpf_check(&prog, attr, uattr, uattr_size);
 	if (err < 0)
 		goto free_used_maps;
 
@@ -4371,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
 	return err;
 }
 
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
 
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
@@ -4381,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
 	if (!bpf_capable())
 		return -EPERM;
 
-	return btf_new_fd(attr, uattr);
+	return btf_new_fd(attr, uattr, uattr_size);
 }
 
 #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -5059,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = map_freeze(&attr);
 		break;
 	case BPF_PROG_LOAD:
-		err = bpf_prog_load(&attr, uattr);
+		err = bpf_prog_load(&attr, uattr, size);
 		break;
 	case BPF_OBJ_PIN:
 		err = bpf_obj_pin(&attr);
@@ -5104,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 		err = bpf_raw_tracepoint_open(&attr);
 		break;
 	case BPF_BTF_LOAD:
-		err = bpf_btf_load(&attr, uattr);
+		err = bpf_btf_load(&attr, uattr, size);
 		break;
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a98cbc046d1e..308e7abeb979 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18694,7 +18694,7 @@ struct btf *bpf_get_btf_vmlinux(void)
 	return btf_vmlinux;
 }
 
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
@@ -18861,6 +18861,12 @@ skip_full_check:
 	env->prog->aux->verified_insns = env->insn_processed;
 
 	bpf_vlog_finalize(log);
+	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+				  &log->len_max, sizeof(log->len_max))) {
+		ret = -EFAULT;
+		goto err_release_maps;
+	}
 	if (bpf_vlog_truncated(log))
 		ret = -ENOSPC;
 	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d6c5a022ae28..3823100b7934 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1407,6 +1407,11 @@ union bpf_attr {
 		__aligned_u64	fd_array;	/* array of FDs */
 		__aligned_u64	core_relos;
 		__u32		core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		log_true_size;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -1492,6 +1497,11 @@ union bpf_attr {
 		__u32		btf_size;
 		__u32		btf_log_size;
 		__u32		btf_log_level;
+		/* output: actual total log contents size (including termintaing zero).
+		 * It could be both larger than original log_size (if log was
+		 * truncated), or smaller (if log buffer wasn't filled completely).
+		 */
+		__u32		btf_log_true_size;
 	};
 
 	struct {
@@ -1513,7 +1523,7 @@ union bpf_attr {
 	struct { /* struct used by BPF_LINK_CREATE command */
 		union {
 			__u32		prog_fd;	/* eBPF program to attach */
-			__u32		map_fd;		/* eBPF struct_ops to attach */
+			__u32		map_fd;		/* struct_ops to attach */
 		};
 		union {
 			__u32		target_fd;	/* object to attach to */
-- 
cgit v1.2.3


From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 6 Apr 2023 16:41:59 -0700
Subject: bpf: Simplify internal verifier log interface

Simplify internal verifier log API down to bpf_vlog_init() and
bpf_vlog_finalize(). The former handles input arguments validation in
one place and makes it easier to change it. The latter subsumes -ENOSPC
(truncation) and -EFAULT handling and simplifies both caller's code
(bpf_check() and btf_parse()).

For btf_parse(), this patch also makes sure that verifier log
finalization happens even if there is some error condition during BTF
verification process prior to normal finalization step.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Lorenz Bauer <lmb@isovalent.com>
Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org
---
 include/linux/bpf_verifier.h | 13 ++-------
 kernel/bpf/btf.c             | 65 ++++++++++++++++++++++----------------------
 kernel/bpf/log.c             | 48 ++++++++++++++++++++++++++------
 kernel/bpf/verifier.c        | 39 +++++++++++---------------
 4 files changed, 90 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 98d2eb382dbb..f03852b89d28 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -518,13 +518,6 @@ struct bpf_verifier_log {
 #define BPF_LOG_MIN_ALIGNMENT 8U
 #define BPF_LOG_ALIGNMENT 40U
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
-{
-	if (log->level & BPF_LOG_FIXED)
-		return log->end_pos >= log->len_total;
-	return false;
-}
-
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log && log->level;
@@ -612,16 +605,16 @@ struct bpf_verifier_env {
 	char type_str_buf[TYPE_STR_BUF_LEN];
 };
 
-bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log);
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
 				      const char *fmt, va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 			    const char *fmt, ...);
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+		  char __user *log_buf, u32 log_size);
 void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos);
-void bpf_vlog_finalize(struct bpf_verifier_log *log);
-bool bpf_vlog_truncated(const struct bpf_verifier_log *log);
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual);
 
 static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env)
 {
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 0748cf4b8ab6..ffc31a1c84af 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5504,16 +5504,30 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
 	return 0;
 }
 
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+	u32 log_true_size;
+	int err;
+
+	err = bpf_vlog_finalize(log, &log_true_size);
+
+	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+				  &log_true_size, sizeof(log_true_size)))
+		err = -EFAULT;
+
+	return err;
+}
+
 static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
 	char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
 	struct btf_struct_metas *struct_meta_tab;
 	struct btf_verifier_env *env = NULL;
-	struct bpf_verifier_log *log;
 	struct btf *btf = NULL;
 	u8 *data;
-	int err;
+	int err, ret;
 
 	if (attr->btf_size > BTF_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
@@ -5522,21 +5536,13 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (!env)
 		return ERR_PTR(-ENOMEM);
 
-	log = &env->log;
-	if (attr->btf_log_level || log_ubuf || attr->btf_log_size) {
-		/* user requested verbose verifier output
-		 * and supplied buffer to store the verification trace
-		 */
-		log->level = attr->btf_log_level;
-		log->ubuf = log_ubuf;
-		log->len_total = attr->btf_log_size;
-
-		/* log attributes have to be sane */
-		if (!bpf_verifier_log_attr_valid(log)) {
-			err = -EINVAL;
-			goto errout;
-		}
-	}
+	/* user could have requested verbose verifier output
+	 * and supplied buffer to store the verification trace
+	 */
+	err = bpf_vlog_init(&env->log, attr->btf_log_level,
+			    log_ubuf, attr->btf_log_size);
+	if (err)
+		goto errout_free;
 
 	btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
 	if (!btf) {
@@ -5577,7 +5583,7 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 	if (err)
 		goto errout;
 
-	struct_meta_tab = btf_parse_struct_metas(log, btf);
+	struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
 	if (IS_ERR(struct_meta_tab)) {
 		err = PTR_ERR(struct_meta_tab);
 		goto errout;
@@ -5594,21 +5600,9 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 		}
 	}
 
-	bpf_vlog_finalize(log);
-	if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
-	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
-				  &log->len_max, sizeof(log->len_max))) {
-		err = -EFAULT;
-		goto errout_meta;
-	}
-	if (bpf_vlog_truncated(log)) {
-		err = -ENOSPC;
-		goto errout_meta;
-	}
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf) {
-		err = -EFAULT;
-		goto errout_meta;
-	}
+	err = finalize_log(&env->log, uattr, uattr_size);
+	if (err)
+		goto errout_free;
 
 	btf_verifier_env_free(env);
 	refcount_set(&btf->refcnt, 1);
@@ -5617,6 +5611,11 @@ static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uat
 errout_meta:
 	btf_free_struct_meta_tab(btf);
 errout:
+	/* overwrite err with -ENOSPC or -EFAULT */
+	ret = finalize_log(&env->log, uattr, uattr_size);
+	if (ret)
+		err = ret;
+errout_free:
 	btf_verifier_env_free(env);
 	if (btf)
 		btf_free(btf);
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 47bea2fad6fe..1fae2c5d7ae4 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -10,12 +10,26 @@
 #include <linux/bpf_verifier.h>
 #include <linux/math64.h>
 
-bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
 {
 	return log->len_total > 0 && log->len_total <= UINT_MAX >> 2 &&
 	       log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK);
 }
 
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+		  char __user *log_buf, u32 log_size)
+{
+	log->level = log_level;
+	log->ubuf = log_buf;
+	log->len_total = log_size;
+
+	/* log attributes have to be sane */
+	if (!bpf_verifier_log_attr_valid(log))
+		return -EINVAL;
+
+	return 0;
+}
+
 static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
 {
 	/* add_len includes terminal \0, so no need for +1. */
@@ -197,24 +211,25 @@ static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int en
 	return 0;
 }
 
-bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
+static bool bpf_vlog_truncated(const struct bpf_verifier_log *log)
 {
 	return log->len_max > log->len_total;
 }
 
-void bpf_vlog_finalize(struct bpf_verifier_log *log)
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
 {
 	u32 sublen;
 	int err;
 
-	if (!log || !log->level || !log->ubuf)
-		return;
-	if ((log->level & BPF_LOG_FIXED) || log->level == BPF_LOG_KERNEL)
-		return;
+	*log_size_actual = 0;
+	if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+		return 0;
 
+	if (!log->ubuf)
+		goto skip_log_rotate;
 	/* If we never truncated log, there is nothing to move around. */
-	if (log->start_pos == 0)
-		return;
+	if ((log->level & BPF_LOG_FIXED) || log->start_pos == 0)
+		goto skip_log_rotate;
 
 	/* Otherwise we need to rotate log contents to make it start from the
 	 * buffer beginning and be a continuous zero-terminated string. Note
@@ -257,6 +272,21 @@ void bpf_vlog_finalize(struct bpf_verifier_log *log)
 	err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
 	if (err)
 		log->ubuf = NULL;
+
+skip_log_rotate:
+	*log_size_actual = log->len_max;
+
+	/* properly initialized log has either both ubuf!=NULL and len_total>0
+	 * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+	 * we got a fault somewhere along the way, so report it back
+	 */
+	if (!!log->ubuf != !!log->len_total)
+		return -EFAULT;
+
+	if (bpf_vlog_truncated(log))
+		return -ENOSPC;
+
+	return 0;
 }
 
 /* log_level controls verbosity level of eBPF verifier.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 308e7abeb979..d6db6de3e9ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -18698,8 +18698,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 {
 	u64 start_time = ktime_get_ns();
 	struct bpf_verifier_env *env;
-	struct bpf_verifier_log *log;
-	int i, len, ret = -EINVAL;
+	int i, len, ret = -EINVAL, err;
+	u32 log_true_size;
 	bool is_priv;
 
 	/* no program is valid */
@@ -18712,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
-	log = &env->log;
 
 	len = (*prog)->len;
 	env->insn_aux_data =
@@ -18733,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (!is_priv)
 		mutex_lock(&bpf_verifier_lock);
 
-	if (attr->log_level || attr->log_buf || attr->log_size) {
-		/* user requested verbose verifier output
-		 * and supplied buffer to store the verification trace
-		 */
-		log->level = attr->log_level;
-		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
-		log->len_total = attr->log_size;
-
-		/* log attributes have to be sane */
-		if (!bpf_verifier_log_attr_valid(log)) {
-			ret = -EINVAL;
-			goto err_unlock;
-		}
-	}
+	/* user could have requested verbose verifier output
+	 * and supplied buffer to store the verification trace
+	 */
+	ret = bpf_vlog_init(&env->log, attr->log_level,
+			    (char __user *) (unsigned long) attr->log_buf,
+			    attr->log_size);
+	if (ret)
+		goto err_unlock;
 
 	mark_verifier_state_clean(env);
 
@@ -18860,17 +18853,17 @@ skip_full_check:
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
 
-	bpf_vlog_finalize(log);
+	/* preserve original error even if log finalization is successful */
+	err = bpf_vlog_finalize(&env->log, &log_true_size);
+	if (err)
+		ret = err;
+
 	if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
 	    copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
-				  &log->len_max, sizeof(log->len_max))) {
+				  &log_true_size, sizeof(log_true_size))) {
 		ret = -EFAULT;
 		goto err_release_maps;
 	}
-	if (bpf_vlog_truncated(log))
-		ret = -ENOSPC;
-	if (log->level && log->level != BPF_LOG_KERNEL && !log->ubuf)
-		ret = -EFAULT;
 
 	if (ret)
 		goto err_release_maps;
-- 
cgit v1.2.3


From e5688f6fb9e3d0254a8fb1c714c38e407f0baa64 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Wed, 1 Mar 2023 10:33:55 +0100
Subject: net/mlx5: Add mlx5_ifc definitions for bridge multicast support

Add the required hardware definitions to mlx5_ifc: fdb_uplink_hairpin,
fdb_multi_path_any_table_limit_regc, fdb_multi_path_any_table.

Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e47d6c58da35..02c628f4fe26 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -880,7 +880,12 @@ enum {
 
 struct mlx5_ifc_flow_table_eswitch_cap_bits {
 	u8      fdb_to_vport_reg_c_id[0x8];
-	u8      reserved_at_8[0xd];
+	u8      reserved_at_8[0x5];
+	u8      fdb_uplink_hairpin[0x1];
+	u8      fdb_multi_path_any_table_limit_regc[0x1];
+	u8      reserved_at_f[0x3];
+	u8      fdb_multi_path_any_table[0x1];
+	u8      reserved_at_13[0x2];
 	u8      fdb_modify_header_fwd_to_table[0x1];
 	u8      fdb_ipv4_ttl_modify[0x1];
 	u8      flow_source[0x1];
-- 
cgit v1.2.3


From 9df839a711aee437390b16ee39cf0b5c1620be6a Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Thu, 23 Apr 2020 08:27:59 -0500
Subject: net/mlx5: Create a new profile for SFs

Create a new profile for SFs in order to disable the command cache.
Each function's command cache consumes ~500KB of memory; when using a
large number of SFs, this saving is notable on memory-constrained
systems.

Use a new profile to provide for future differences between SFs and PFs.

The mr_cache is not used for non-PF functions, so it is excluded from the
new profile.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Bodong Wang <bodong@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c           | 6 +++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c          | 9 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h     | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c | 2 +-
 include/linux/mlx5/driver.h                             | 1 +
 5 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index b00e33ed05e9..d53de39539a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1802,7 +1802,7 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
 	if (in_size <= 16)
 		goto cache_miss;
 
-	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+	for (i = 0; i < dev->profile.num_cmd_caches; i++) {
 		ch = &cmd->cache[i];
 		if (in_size > ch->max_inbox_size)
 			continue;
@@ -2097,7 +2097,7 @@ static void destroy_msg_cache(struct mlx5_core_dev *dev)
 	struct mlx5_cmd_msg *n;
 	int i;
 
-	for (i = 0; i < MLX5_NUM_COMMAND_CACHES; i++) {
+	for (i = 0; i < dev->profile.num_cmd_caches; i++) {
 		ch = &dev->cmd.cache[i];
 		list_for_each_entry_safe(msg, n, &ch->head, list) {
 			list_del(&msg->list);
@@ -2127,7 +2127,7 @@ static void create_msg_cache(struct mlx5_core_dev *dev)
 	int k;
 
 	/* Initialize and fill the caches with initial entries */
-	for (k = 0; k < MLX5_NUM_COMMAND_CACHES; k++) {
+	for (k = 0; k < dev->profile.num_cmd_caches; k++) {
 		ch = &cmd->cache[k];
 		spin_lock_init(&ch->lock);
 		INIT_LIST_HEAD(&ch->head);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f95df73d1089..a95d1218def9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -100,15 +100,19 @@ enum {
 static struct mlx5_profile profile[] = {
 	[0] = {
 		.mask           = 0,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
 	},
 	[1] = {
 		.mask		= MLX5_PROF_MASK_QP_SIZE,
 		.log_max_qp	= 12,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
+
 	},
 	[2] = {
 		.mask		= MLX5_PROF_MASK_QP_SIZE |
 				  MLX5_PROF_MASK_MR_CACHE,
 		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
 		.mr_cache[0]	= {
 			.size	= 500,
 			.limit	= 250
@@ -174,6 +178,11 @@ static struct mlx5_profile profile[] = {
 			.limit	= 4
 		},
 	},
+	[3] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE,
+		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
+		.num_cmd_caches = 0,
+	},
 };
 
 static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index be0785f83083..5eaab99678ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -142,6 +142,7 @@ enum mlx5_semaphore_space_address {
 };
 
 #define MLX5_DEFAULT_PROF       2
+#define MLX5_SF_PROF		3
 
 static inline int mlx5_flexible_inlen(struct mlx5_core_dev *dev, size_t fixed,
 				      size_t item_size, size_t num_items,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
index a7377619ba6f..e2f26d0bc615 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/driver.c
@@ -28,7 +28,7 @@ static int mlx5_sf_dev_probe(struct auxiliary_device *adev, const struct auxilia
 	mdev->priv.adev_idx = adev->id;
 	sf_dev->mdev = mdev;
 
-	err = mlx5_mdev_init(mdev, MLX5_DEFAULT_PROF);
+	err = mlx5_mdev_init(mdev, MLX5_SF_PROF);
 	if (err) {
 		mlx5_core_warn(mdev, "mlx5_mdev_init on err=%d\n", err);
 		goto mdev_err;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f243bd10a5e1..135a3c8d8237 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -751,6 +751,7 @@ enum {
 struct mlx5_profile {
 	u64	mask;
 	u8	log_max_qp;
+	u8	num_cmd_caches;
 	struct {
 		int	size;
 		int	limit;
-- 
cgit v1.2.3


From 9fa7f1de3dda0bc74c964b5127b3f22bd8ef8fd2 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 30 Aug 2022 01:20:30 +0300
Subject: net/mlx5: Add mlx5_ifc bits for modify header argument

Add enum value for modify-header argument object and mlx5_bits
for the related capabilities.

Signed-off-by: Muhammad Sammar <muhammads@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 02c628f4fe26..6c84bf6eec85 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -78,12 +78,15 @@ enum {
 
 enum {
 	MLX5_OBJ_TYPE_SW_ICM = 0x0008,
+	MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT  = 0x23,
 };
 
 enum {
 	MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM),
 	MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11),
 	MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13),
+	MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT =
+		(1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT),
 	MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39),
 };
 
@@ -321,6 +324,10 @@ enum {
 	MLX5_FT_NIC_TX_RDMA_2_NIC_TX = BIT(1),
 };
 
+enum {
+	MLX5_CMD_OP_MOD_UPDATE_HEADER_MODIFY_ARGUMENT = 0x1,
+};
+
 struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         outer_dmac[0x1];
 	u8         outer_smac[0x1];
@@ -1927,7 +1934,14 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   reserved_at_750[0x4];
 	u8	   max_dynamic_vf_msix_table_size[0xc];
 
-	u8	   reserved_at_760[0x20];
+	u8         reserved_at_760[0x3];
+	u8         log_max_num_header_modify_argument[0x5];
+	u8         reserved_at_768[0x4];
+	u8         log_header_modify_argument_granularity[0x4];
+	u8         reserved_at_770[0x3];
+	u8         log_header_modify_argument_max_alloc[0x5];
+	u8         reserved_at_778[0x8];
+
 	u8	   vhca_tunnel_commands[0x40];
 	u8         match_definer_format_supported[0x40];
 };
@@ -6361,6 +6375,18 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_modify_header_arg_bits {
+	u8         reserved_at_0[0x80];
+
+	u8         reserved_at_80[0x8];
+	u8         access_pd[0x18];
+};
+
+struct mlx5_ifc_create_modify_header_arg_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+	struct mlx5_ifc_modify_header_arg_bits arg;
+};
+
 struct mlx5_ifc_create_match_definer_in_bits {
 	struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr;
 
-- 
cgit v1.2.3


From 977c4a3e7c89d5b48fdc548773fb9ed3e6c50cf8 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Tue, 30 Aug 2022 01:20:40 +0300
Subject: net/mlx5: Add new WQE for updating flow table

Add a new WQE type, FLOW_TBL_ACCESS, which will be used for
writing modify header arguments.
This type has a specific control segment and a special data segment.

Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 include/linux/mlx5/device.h |  2 ++
 include/linux/mlx5/qp.h     | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index af4dd536a52c..e4aa147ab390 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -442,6 +442,8 @@ enum {
 
 	MLX5_OPCODE_UMR			= 0x25,
 
+	MLX5_OPCODE_FLOW_TBL_ACCESS	= 0x2c,
+
 	MLX5_OPCODE_ACCESS_ASO		= 0x2d,
 };
 
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index df55fbb65717..bd53cf4be7bd 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -499,6 +499,16 @@ struct mlx5_stride_block_ctrl_seg {
 	__be16		num_entries;
 };
 
+struct mlx5_wqe_flow_update_ctrl_seg {
+	__be32		flow_idx_update;
+	__be32		dest_handle;
+	u8		reserved0[40];
+};
+
+struct mlx5_wqe_header_modify_argument_update_seg {
+	u8		argument_list[64];
+};
+
 struct mlx5_core_qp {
 	struct mlx5_core_rsc_common	common; /* must be first */
 	void (*event)		(struct mlx5_core_qp *, int);
-- 
cgit v1.2.3


From 7d6d2dd326a8a8d32091e9748f3428dd3be68367 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Wed, 5 Apr 2023 22:07:26 +0200
Subject: mmc: sdio: add Realtek SDIO vendor ID and various wifi device IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the SDIO vendor ID for Realtek and some device IDs extracted from
their GPL vendor driver. This will be useful in the future when the
rtw88 driver gains support for these chips.

Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Ping-Ke Shih <pkshih@realtek.com>
Reviewed-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20230405200729.632435-7-martin.blumenstingl@googlemail.com
---
 include/linux/mmc/sdio_ids.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 0e4ef9c5127a..66f503ed2448 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -112,6 +112,15 @@
 #define SDIO_VENDOR_ID_MICROCHIP_WILC		0x0296
 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000	0x5347
 
+#define SDIO_VENDOR_ID_REALTEK			0x024c
+#define SDIO_DEVICE_ID_REALTEK_RTW8723BS	0xb723
+#define SDIO_DEVICE_ID_REALTEK_RTW8821BS	0xb821
+#define SDIO_DEVICE_ID_REALTEK_RTW8822BS	0xb822
+#define SDIO_DEVICE_ID_REALTEK_RTW8821CS	0xc821
+#define SDIO_DEVICE_ID_REALTEK_RTW8822CS	0xc822
+#define SDIO_DEVICE_ID_REALTEK_RTW8723DS	0xd723
+#define SDIO_DEVICE_ID_REALTEK_RTW8821DS	0xd821
+
 #define SDIO_VENDOR_ID_SIANO			0x039a
 #define SDIO_DEVICE_ID_SIANO_NOVA_B0		0x0201
 #define SDIO_DEVICE_ID_SIANO_NICE		0x0202
-- 
cgit v1.2.3


From ac931d4cdec3df8b6eac3bc40a6871123021f078 Mon Sep 17 00:00:00 2001
From: Christian Ehrig <cehrig@cloudflare.com>
Date: Fri, 7 Apr 2023 15:38:53 +0200
Subject: ipip,ip_tunnel,sit: Add FOU support for externally controlled ipip
 devices

Today ipip devices in collect-metadata mode don't allow for sending FOU
or GUE encapsulated packets. This patch lifts the restriction by adding
a struct ip_tunnel_encap to the tunnel metadata.

On the egress path, the members of this struct can be set by the
bpf_skb_set_fou_encap kfunc via a BPF tc-hook. Instead of dropping packets
wishing to use additional UDP encapsulation, ip_md_tunnel_xmit now
evaluates the contents of this struct and adds the corresponding FOU or
GUE header. Furthermore, it is making sure that additional header bytes
are taken into account for PMTU discovery.

On the ingress path, an ipip device in collect-metadata mode will fill this
struct and a BPF tc-hook can obtain the information via a call to the
bpf_skb_get_fou_encap kfunc.

The minor change to ip_tunnel_encap, which now takes a pointer to
struct ip_tunnel_encap instead of struct ip_tunnel, allows us to control
FOU encap type and parameters on a per-packet level.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Link: https://lore.kernel.org/r/cfea47de655d0f870248abf725932f851b53960a.1680874078.git.cehrig@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/ip_tunnels.h | 28 +++++++++++++++-------------
 net/ipv4/ip_tunnel.c     | 22 ++++++++++++++++++++--
 net/ipv4/ipip.c          |  1 +
 net/ipv6/sit.c           |  2 +-
 4 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index fca357679816..7912f53caae0 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -57,6 +57,13 @@ struct ip_tunnel_key {
 	__u8			flow_flags;
 };
 
+struct ip_tunnel_encap {
+	u16			type;
+	u16			flags;
+	__be16			sport;
+	__be16			dport;
+};
+
 /* Flags for ip_tunnel_info mode. */
 #define IP_TUNNEL_INFO_TX	0x01	/* represents tx tunnel parameters */
 #define IP_TUNNEL_INFO_IPV6	0x02	/* key contains IPv6 addresses */
@@ -66,9 +73,9 @@ struct ip_tunnel_key {
 #define IP_TUNNEL_OPTS_MAX					\
 	GENMASK((sizeof_field(struct ip_tunnel_info,		\
 			      options_len) * BITS_PER_BYTE) - 1, 0)
-
 struct ip_tunnel_info {
 	struct ip_tunnel_key	key;
+	struct ip_tunnel_encap	encap;
 #ifdef CONFIG_DST_CACHE
 	struct dst_cache	dst_cache;
 #endif
@@ -86,13 +93,6 @@ struct ip_tunnel_6rd_parm {
 };
 #endif
 
-struct ip_tunnel_encap {
-	u16			type;
-	u16			flags;
-	__be16			sport;
-	__be16			dport;
-};
-
 struct ip_tunnel_prl_entry {
 	struct ip_tunnel_prl_entry __rcu *next;
 	__be32				addr;
@@ -293,6 +293,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 				   __be32 remote, __be32 local,
 				   __be32 key);
 
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info);
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 		  bool log_ecn_error);
@@ -371,22 +372,23 @@ static inline int ip_encap_hlen(struct ip_tunnel_encap *e)
 	return hlen;
 }
 
-static inline int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
+static inline int ip_tunnel_encap(struct sk_buff *skb,
+				  struct ip_tunnel_encap *e,
 				  u8 *protocol, struct flowi4 *fl4)
 {
 	const struct ip_tunnel_encap_ops *ops;
 	int ret = -EINVAL;
 
-	if (t->encap.type == TUNNEL_ENCAP_NONE)
+	if (e->type == TUNNEL_ENCAP_NONE)
 		return 0;
 
-	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
+	if (e->type >= MAX_IPTUN_ENCAP_OPS)
 		return -EINVAL;
 
 	rcu_read_lock();
-	ops = rcu_dereference(iptun_encaps[t->encap.type]);
+	ops = rcu_dereference(iptun_encaps[e->type]);
 	if (likely(ops && ops->build_header))
-		ret = ops->build_header(skb, &t->encap, protocol, fl4);
+		ret = ops->build_header(skb, e, protocol, fl4);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index de90b09dfe78..add437f710fc 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -359,6 +359,20 @@ err_dev_set_mtu:
 	return ERR_PTR(err);
 }
 
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	const struct udphdr *udph;
+
+	if (iph->protocol != IPPROTO_UDP)
+		return;
+
+	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
+	info->encap.sport = udph->source;
+	info->encap.dport = udph->dest;
+}
+EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
+
 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 		  bool log_ecn_error)
@@ -572,7 +586,11 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
 			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
 			    key->flow_flags);
-	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+
+	if (!tunnel_hlen)
+		tunnel_hlen = ip_encap_hlen(&tun_info->encap);
+
+	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
 		goto tx_error;
 
 	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
@@ -732,7 +750,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			    dev_net(dev), tunnel->parms.link,
 			    tunnel->fwmark, skb_get_hash(skb), 0);
 
-	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
 		goto tx_error;
 
 	if (connected && md) {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index abea77759b7e..27b8f83c6ea2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -241,6 +241,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
 			if (!tun_dst)
 				return 0;
+			ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
 		}
 		skb_reset_mac_header(skb);
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 70d81bba5093..063560e2cb1a 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1024,7 +1024,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		ttl = iph6->hop_limit;
 	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
 
-	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
+	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) {
 		ip_rt_put(rt);
 		goto tx_error;
 	}
-- 
cgit v1.2.3


From c50e96099edb134bf107fafc02715fbc4aa2277f Mon Sep 17 00:00:00 2001
From: Christian Ehrig <cehrig@cloudflare.com>
Date: Fri, 7 Apr 2023 15:38:54 +0200
Subject: bpf,fou: Add bpf_skb_{set,get}_fou_encap kfuncs

Add two new kfuncs that allow a BPF tc-hook, installed on an ipip
device in collect-metadata mode, to control FOU encap parameters on a
per-packet level. The set of kfuncs is registered with the fou module.

The bpf_skb_set_fou_encap kfunc is supposed to be used in tandem with, and after,
a successful call to the bpf_skb_set_tunnel_key bpf-helper. UDP source and
destination ports can be controlled by passing a struct bpf_fou_encap. A
source port of zero will auto-assign a source port. enum bpf_fou_encap_type
is used to specify if the egress path should FOU or GUE encap the packet.

On the ingress path bpf_skb_get_fou_encap can be used to read UDP source
and destination ports from the receiver's point of view and allows for
packet multiplexing across different destination ports within a single
BPF program and ipip device.

Signed-off-by: Christian Ehrig <cehrig@cloudflare.com>
Link: https://lore.kernel.org/r/e17c94a646b63e78ce0dbf3f04b2c33dc948a32d.1680874078.git.cehrig@cloudflare.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/fou.h   |   2 +
 net/ipv4/Makefile   |   2 +-
 net/ipv4/fou_bpf.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/fou_core.c |   5 +++
 4 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 net/ipv4/fou_bpf.c

(limited to 'include')

diff --git a/include/net/fou.h b/include/net/fou.h
index 80f56e275b08..824eb4b231fd 100644
--- a/include/net/fou.h
+++ b/include/net/fou.h
@@ -17,4 +17,6 @@ int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 		       u8 *protocol, __be16 *sport, int type);
 
+int register_fou_bpf(void);
+
 #endif
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 880277c9fd07..b18ba8ef93ad 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -26,7 +26,7 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
 gre-y := gre_demux.o
-fou-y := fou_core.o fou_nl.o
+fou-y := fou_core.o fou_nl.o fou_bpf.o
 obj-$(CONFIG_NET_FOU) += fou.o
 obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
new file mode 100644
index 000000000000..3760a14b6b57
--- /dev/null
+++ b/net/ipv4/fou_bpf.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Fou Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/fou.h>
+
+struct bpf_fou_encap {
+	__be16 sport;
+	__be16 dport;
+};
+
+enum bpf_fou_encap_type {
+	FOU_BPF_ENCAP_FOU,
+	FOU_BPF_ENCAP_GUE,
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in BTF");
+
+/* bpf_skb_set_fou_encap - Set FOU encap parameters
+ *
+ * This function allows for using GUE or FOU encapsulation together with an
+ * ipip device in collect-metadata mode.
+ *
+ * It is meant to be used in BPF tc-hooks and after a call to the
+ * bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
+ *
+ * Parameters:
+ * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap	Pointer to a `struct bpf_fou_encap` storing UDP src and
+ * 		dst ports. If sport is set to 0 the kernel will auto-assign a
+ * 		port. This is similar to using `encap-sport auto`.
+ * 		Cannot be NULL
+ * @type	Encapsulation type for the packet. Their definitions are
+ * 		specified in `enum bpf_fou_encap_type`
+ */
+__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
+				      struct bpf_fou_encap *encap, int type)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!encap))
+		return -EINVAL;
+
+	if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
+		return -EINVAL;
+
+	switch (type) {
+	case FOU_BPF_ENCAP_FOU:
+		info->encap.type = TUNNEL_ENCAP_FOU;
+		break;
+	case FOU_BPF_ENCAP_GUE:
+		info->encap.type = TUNNEL_ENCAP_GUE;
+		break;
+	default:
+		info->encap.type = TUNNEL_ENCAP_NONE;
+	}
+
+	if (info->key.tun_flags & TUNNEL_CSUM)
+		info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
+
+	info->encap.sport = encap->sport;
+	info->encap.dport = encap->dport;
+
+	return 0;
+}
+
+/* bpf_skb_get_fou_encap - Get FOU encap parameters
+ *
+ * This function allows for reading encap metadata from a packet received
+ * on an ipip device in collect-metadata mode.
+ *
+ * Parameters:
+ * @skb_ctx	Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap	Pointer to a struct bpf_fou_encap storing UDP source and
+ * 		destination port. Cannot be NULL
+ */
+__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
+				      struct bpf_fou_encap *encap)
+{
+	struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info))
+		return -EINVAL;
+
+	encap->sport = info->encap.sport;
+	encap->dport = info->encap.dport;
+
+	return 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(fou_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
+BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
+BTF_SET8_END(fou_kfunc_set)
+
+static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &fou_kfunc_set,
+};
+
+int register_fou_bpf(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+					 &fou_bpf_kfunc_set);
+}
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index cafec9b4eee0..0c41076e31ed 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -1236,10 +1236,15 @@ static int __init fou_init(void)
 	if (ret < 0)
 		goto unregister;
 
+	ret = register_fou_bpf();
+	if (ret < 0)
+		goto kfunc_failed;
+
 	ret = ip_tunnel_encap_add_fou_ops();
 	if (ret == 0)
 		return 0;
 
+kfunc_failed:
 	genl_unregister_family(&fou_nl_family);
 unregister:
 	unregister_pernet_device(&fou_net_ops);
-- 
cgit v1.2.3


From 33719b57f52e5b761234373f98f55f4e036d61c9 Mon Sep 17 00:00:00 2001
From: Andrew Halaney <ahalaney@redhat.com>
Date: Tue, 11 Apr 2023 15:04:06 -0500
Subject: net: stmmac: dwmac4: Allow platforms to specify some DMA/MTL offsets

Some platforms have dwmac4 implementations that have a different
address space layout than the default, resulting in the need to define
their own DMA/MTL offsets.

Extend the functions to allow a platform driver to indicate what its
addresses are, overriding the defaults.

Signed-off-by: Andrew Halaney <ahalaney@redhat.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Tested-by: Brian Masney <bmasney@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h      | 101 +++++++++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c |  36 +++--
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c  | 157 +++++++++++++---------
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h  |  54 +++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c  |  67 +++++----
 include/linux/stmmac.h                            |  19 +++
 6 files changed, 293 insertions(+), 141 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
index ccd49346d3b3..4538f334df57 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h
@@ -336,14 +336,25 @@ enum power_event {
 
 #define MTL_CHAN_BASE_ADDR		0x00000d00
 #define MTL_CHAN_BASE_OFFSET		0x40
-#define MTL_CHANX_BASE_ADDR(x)		(MTL_CHAN_BASE_ADDR + \
-					(x * MTL_CHAN_BASE_OFFSET))
 
-#define MTL_CHAN_TX_OP_MODE(x)		MTL_CHANX_BASE_ADDR(x)
-#define MTL_CHAN_TX_DEBUG(x)		(MTL_CHANX_BASE_ADDR(x) + 0x8)
-#define MTL_CHAN_INT_CTRL(x)		(MTL_CHANX_BASE_ADDR(x) + 0x2c)
-#define MTL_CHAN_RX_OP_MODE(x)		(MTL_CHANX_BASE_ADDR(x) + 0x30)
-#define MTL_CHAN_RX_DEBUG(x)		(MTL_CHANX_BASE_ADDR(x) + 0x38)
+static inline u32 mtl_chanx_base_addr(const struct dwmac4_addrs *addrs,
+				      const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_chan + (x * addrs->mtl_chan_offset);
+	else
+		addr = MTL_CHAN_BASE_ADDR + (x * MTL_CHAN_BASE_OFFSET);
+
+	return addr;
+}
+
+#define MTL_CHAN_TX_OP_MODE(addrs, x)	mtl_chanx_base_addr(addrs, x)
+#define MTL_CHAN_TX_DEBUG(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x8)
+#define MTL_CHAN_INT_CTRL(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x2c)
+#define MTL_CHAN_RX_OP_MODE(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x30)
+#define MTL_CHAN_RX_DEBUG(addrs, x)	(mtl_chanx_base_addr(addrs, x) + 0x38)
 
 #define MTL_OP_MODE_RSF			BIT(5)
 #define MTL_OP_MODE_TXQEN_MASK		GENMASK(3, 2)
@@ -388,8 +399,19 @@ enum power_event {
 /* MTL ETS Control register */
 #define MTL_ETS_CTRL_BASE_ADDR		0x00000d10
 #define MTL_ETS_CTRL_BASE_OFFSET	0x40
-#define MTL_ETSX_CTRL_BASE_ADDR(x)	(MTL_ETS_CTRL_BASE_ADDR + \
-					((x) * MTL_ETS_CTRL_BASE_OFFSET))
+
+static inline u32 mtl_etsx_ctrl_base_addr(const struct dwmac4_addrs *addrs,
+					  const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_ets_ctrl + (x * addrs->mtl_ets_ctrl_offset);
+	else
+		addr = MTL_ETS_CTRL_BASE_ADDR + (x * MTL_ETS_CTRL_BASE_OFFSET);
+
+	return addr;
+}
 
 #define MTL_ETS_CTRL_CC			BIT(3)
 #define MTL_ETS_CTRL_AVALG		BIT(2)
@@ -397,31 +419,76 @@ enum power_event {
 /* MTL Queue Quantum Weight */
 #define MTL_TXQ_WEIGHT_BASE_ADDR	0x00000d18
 #define MTL_TXQ_WEIGHT_BASE_OFFSET	0x40
-#define MTL_TXQX_WEIGHT_BASE_ADDR(x)	(MTL_TXQ_WEIGHT_BASE_ADDR + \
-					((x) * MTL_TXQ_WEIGHT_BASE_OFFSET))
+
+static inline u32 mtl_txqx_weight_base_addr(const struct dwmac4_addrs *addrs,
+					    const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_txq_weight + (x * addrs->mtl_txq_weight_offset);
+	else
+		addr = MTL_TXQ_WEIGHT_BASE_ADDR + (x * MTL_TXQ_WEIGHT_BASE_OFFSET);
+
+	return addr;
+}
+
 #define MTL_TXQ_WEIGHT_ISCQW_MASK	GENMASK(20, 0)
 
 /* MTL sendSlopeCredit register */
 #define MTL_SEND_SLP_CRED_BASE_ADDR	0x00000d1c
 #define MTL_SEND_SLP_CRED_OFFSET	0x40
-#define MTL_SEND_SLP_CREDX_BASE_ADDR(x)	(MTL_SEND_SLP_CRED_BASE_ADDR + \
-					((x) * MTL_SEND_SLP_CRED_OFFSET))
+
+static inline u32 mtl_send_slp_credx_base_addr(const struct dwmac4_addrs *addrs,
+					       const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_send_slp_cred + (x * addrs->mtl_send_slp_cred_offset);
+	else
+		addr = MTL_SEND_SLP_CRED_BASE_ADDR + (x * MTL_SEND_SLP_CRED_OFFSET);
+
+	return addr;
+}
 
 #define MTL_SEND_SLP_CRED_SSC_MASK	GENMASK(13, 0)
 
 /* MTL hiCredit register */
 #define MTL_HIGH_CRED_BASE_ADDR		0x00000d20
 #define MTL_HIGH_CRED_OFFSET		0x40
-#define MTL_HIGH_CREDX_BASE_ADDR(x)	(MTL_HIGH_CRED_BASE_ADDR + \
-					((x) * MTL_HIGH_CRED_OFFSET))
+
+static inline u32 mtl_high_credx_base_addr(const struct dwmac4_addrs *addrs,
+					   const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_high_cred + (x * addrs->mtl_high_cred_offset);
+	else
+		addr = MTL_HIGH_CRED_BASE_ADDR + (x * MTL_HIGH_CRED_OFFSET);
+
+	return addr;
+}
 
 #define MTL_HIGH_CRED_HC_MASK		GENMASK(28, 0)
 
 /* MTL loCredit register */
 #define MTL_LOW_CRED_BASE_ADDR		0x00000d24
 #define MTL_LOW_CRED_OFFSET		0x40
-#define MTL_LOW_CREDX_BASE_ADDR(x)	(MTL_LOW_CRED_BASE_ADDR + \
-					((x) * MTL_LOW_CRED_OFFSET))
+
+static inline u32 mtl_low_credx_base_addr(const struct dwmac4_addrs *addrs,
+					  const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->mtl_low_cred + (x * addrs->mtl_low_cred_offset);
+	else
+		addr = MTL_LOW_CRED_BASE_ADDR + (x * MTL_LOW_CRED_OFFSET);
+
+	return addr;
+}
 
 #define MTL_HIGH_CRED_LC_MASK		GENMASK(28, 0)
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index f44180519638..afaec3fb9ab6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -202,12 +202,14 @@ static void dwmac4_set_mtl_tx_queue_weight(struct stmmac_priv *priv,
 					   struct mac_device_info *hw,
 					   u32 weight, u32 queue)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	void __iomem *ioaddr = hw->pcsr;
-	u32 value = readl(ioaddr + MTL_TXQX_WEIGHT_BASE_ADDR(queue));
+	u32 value = readl(ioaddr + mtl_txqx_weight_base_addr(dwmac4_addrs,
+							     queue));
 
 	value &= ~MTL_TXQ_WEIGHT_ISCQW_MASK;
 	value |= weight & MTL_TXQ_WEIGHT_ISCQW_MASK;
-	writel(value, ioaddr + MTL_TXQX_WEIGHT_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_txqx_weight_base_addr(dwmac4_addrs, queue));
 }
 
 static void dwmac4_map_mtl_dma(struct mac_device_info *hw, u32 queue, u32 chan)
@@ -233,6 +235,7 @@ static void dwmac4_config_cbs(struct stmmac_priv *priv,
 			      u32 send_slope, u32 idle_slope,
 			      u32 high_credit, u32 low_credit, u32 queue)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	void __iomem *ioaddr = hw->pcsr;
 	u32 value;
 
@@ -243,31 +246,33 @@ static void dwmac4_config_cbs(struct stmmac_priv *priv,
 	pr_debug("\tlow_credit: 0x%08x\n", low_credit);
 
 	/* enable AV algorithm */
-	value = readl(ioaddr + MTL_ETSX_CTRL_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_etsx_ctrl_base_addr(dwmac4_addrs, queue));
 	value |= MTL_ETS_CTRL_AVALG;
 	value |= MTL_ETS_CTRL_CC;
-	writel(value, ioaddr + MTL_ETSX_CTRL_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_etsx_ctrl_base_addr(dwmac4_addrs, queue));
 
 	/* configure send slope */
-	value = readl(ioaddr + MTL_SEND_SLP_CREDX_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_send_slp_credx_base_addr(dwmac4_addrs,
+							    queue));
 	value &= ~MTL_SEND_SLP_CRED_SSC_MASK;
 	value |= send_slope & MTL_SEND_SLP_CRED_SSC_MASK;
-	writel(value, ioaddr + MTL_SEND_SLP_CREDX_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_send_slp_credx_base_addr(dwmac4_addrs,
+							    queue));
 
 	/* configure idle slope (same register as tx weight) */
 	dwmac4_set_mtl_tx_queue_weight(priv, hw, idle_slope, queue);
 
 	/* configure high credit */
-	value = readl(ioaddr + MTL_HIGH_CREDX_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_high_credx_base_addr(dwmac4_addrs, queue));
 	value &= ~MTL_HIGH_CRED_HC_MASK;
 	value |= high_credit & MTL_HIGH_CRED_HC_MASK;
-	writel(value, ioaddr + MTL_HIGH_CREDX_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_high_credx_base_addr(dwmac4_addrs, queue));
 
 	/* configure high credit */
-	value = readl(ioaddr + MTL_LOW_CREDX_BASE_ADDR(queue));
+	value = readl(ioaddr + mtl_low_credx_base_addr(dwmac4_addrs, queue));
 	value &= ~MTL_HIGH_CRED_LC_MASK;
 	value |= low_credit & MTL_HIGH_CRED_LC_MASK;
-	writel(value, ioaddr + MTL_LOW_CREDX_BASE_ADDR(queue));
+	writel(value, ioaddr + mtl_low_credx_base_addr(dwmac4_addrs, queue));
 }
 
 static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space)
@@ -764,6 +769,7 @@ static void dwmac4_phystatus(void __iomem *ioaddr, struct stmmac_extra_stats *x)
 static int dwmac4_irq_mtl_status(struct stmmac_priv *priv,
 				 struct mac_device_info *hw, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	void __iomem *ioaddr = hw->pcsr;
 	u32 mtl_int_qx_status;
 	int ret = 0;
@@ -773,12 +779,13 @@ static int dwmac4_irq_mtl_status(struct stmmac_priv *priv,
 	/* Check MTL Interrupt */
 	if (mtl_int_qx_status & MTL_INT_QX(chan)) {
 		/* read Queue x Interrupt status */
-		u32 status = readl(ioaddr + MTL_CHAN_INT_CTRL(chan));
+		u32 status = readl(ioaddr + MTL_CHAN_INT_CTRL(dwmac4_addrs,
+							      chan));
 
 		if (status & MTL_RX_OVERFLOW_INT) {
 			/*  clear Interrupt */
 			writel(status | MTL_RX_OVERFLOW_INT,
-			       ioaddr + MTL_CHAN_INT_CTRL(chan));
+			       ioaddr + MTL_CHAN_INT_CTRL(dwmac4_addrs, chan));
 			ret = CORE_IRQ_MTL_RX_OVERFLOW;
 		}
 	}
@@ -840,11 +847,12 @@ static void dwmac4_debug(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 struct stmmac_extra_stats *x,
 			 u32 rx_queues, u32 tx_queues)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 	u32 queue;
 
 	for (queue = 0; queue < tx_queues; queue++) {
-		value = readl(ioaddr + MTL_CHAN_TX_DEBUG(queue));
+		value = readl(ioaddr + MTL_CHAN_TX_DEBUG(dwmac4_addrs, queue));
 
 		if (value & MTL_DEBUG_TXSTSFSTS)
 			x->mtl_tx_status_fifo_full++;
@@ -869,7 +877,7 @@ static void dwmac4_debug(struct stmmac_priv *priv, void __iomem *ioaddr,
 	}
 
 	for (queue = 0; queue < rx_queues; queue++) {
-		value = readl(ioaddr + MTL_CHAN_RX_DEBUG(queue));
+		value = readl(ioaddr + MTL_CHAN_RX_DEBUG(dwmac4_addrs, queue));
 
 		if (value & MTL_DEBUG_RXFSTS_MASK) {
 			u32 rxfsts = (value & MTL_DEBUG_RXFSTS_MASK)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 6f255d12f60f..84d3a8551b03 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -13,6 +13,7 @@
 #include <linux/io.h>
 #include "dwmac4.h"
 #include "dwmac4_dma.h"
+#include "stmmac.h"
 
 static void dwmac4_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 {
@@ -73,18 +74,20 @@ static void dwmac4_dma_init_rx_chan(struct stmmac_priv *priv,
 				    struct stmmac_dma_cfg *dma_cfg,
 				    dma_addr_t dma_rx_phy, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 	u32 rxpbl = dma_cfg->rxpbl ?: dma_cfg->pbl;
 
-	value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 	value = value | (rxpbl << DMA_BUS_MODE_RPBL_SHIFT);
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	if (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && likely(dma_cfg->eame))
 		writel(upper_32_bits(dma_rx_phy),
-		       ioaddr + DMA_CHAN_RX_BASE_ADDR_HI(chan));
+		       ioaddr + DMA_CHAN_RX_BASE_ADDR_HI(dwmac4_addrs, chan));
 
-	writel(lower_32_bits(dma_rx_phy), ioaddr + DMA_CHAN_RX_BASE_ADDR(chan));
+	writel(lower_32_bits(dma_rx_phy),
+	       ioaddr + DMA_CHAN_RX_BASE_ADDR(dwmac4_addrs, chan));
 }
 
 static void dwmac4_dma_init_tx_chan(struct stmmac_priv *priv,
@@ -92,57 +95,61 @@ static void dwmac4_dma_init_tx_chan(struct stmmac_priv *priv,
 				    struct stmmac_dma_cfg *dma_cfg,
 				    dma_addr_t dma_tx_phy, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 	u32 txpbl = dma_cfg->txpbl ?: dma_cfg->pbl;
 
-	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 	value = value | (txpbl << DMA_BUS_MODE_PBL_SHIFT);
 
 	/* Enable OSP to get best performance */
 	value |= DMA_CONTROL_OSP;
 
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	if (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) && likely(dma_cfg->eame))
 		writel(upper_32_bits(dma_tx_phy),
-		       ioaddr + DMA_CHAN_TX_BASE_ADDR_HI(chan));
+		       ioaddr + DMA_CHAN_TX_BASE_ADDR_HI(dwmac4_addrs, chan));
 
-	writel(lower_32_bits(dma_tx_phy), ioaddr + DMA_CHAN_TX_BASE_ADDR(chan));
+	writel(lower_32_bits(dma_tx_phy),
+	       ioaddr + DMA_CHAN_TX_BASE_ADDR(dwmac4_addrs, chan));
 }
 
 static void dwmac4_dma_init_channel(struct stmmac_priv *priv,
 				    void __iomem *ioaddr,
 				    struct stmmac_dma_cfg *dma_cfg, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 
 	/* common channel control register config */
-	value = readl(ioaddr + DMA_CHAN_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 	if (dma_cfg->pblx8)
 		value = value | DMA_BUS_MODE_PBL;
-	writel(value, ioaddr + DMA_CHAN_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_CHAN_INTR_DEFAULT_MASK,
-	       ioaddr + DMA_CHAN_INTR_ENA(chan));
+	       ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 static void dwmac410_dma_init_channel(struct stmmac_priv *priv,
 				      void __iomem *ioaddr,
 				      struct stmmac_dma_cfg *dma_cfg, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 
 	/* common channel control register config */
-	value = readl(ioaddr + DMA_CHAN_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 	if (dma_cfg->pblx8)
 		value = value | DMA_BUS_MODE_PBL;
 
-	writel(value, ioaddr + DMA_CHAN_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 
 	/* Mask interrupts by writing to CSR7 */
 	writel(DMA_CHAN_INTR_DEFAULT_MASK_4_10,
-	       ioaddr + DMA_CHAN_INTR_ENA(chan));
+	       ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 static void dwmac4_dma_init(void __iomem *ioaddr,
@@ -184,40 +191,46 @@ static void _dwmac4_dump_dma_regs(struct stmmac_priv *priv,
 				  void __iomem *ioaddr, u32 channel,
 				  u32 *reg_space)
 {
-	reg_space[DMA_CHAN_CONTROL(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CONTROL(channel));
-	reg_space[DMA_CHAN_TX_CONTROL(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_CONTROL(channel));
-	reg_space[DMA_CHAN_RX_CONTROL(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_CONTROL(channel));
-	reg_space[DMA_CHAN_TX_BASE_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(channel));
-	reg_space[DMA_CHAN_RX_BASE_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(channel));
-	reg_space[DMA_CHAN_TX_END_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_END_ADDR(channel));
-	reg_space[DMA_CHAN_RX_END_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_END_ADDR(channel));
-	reg_space[DMA_CHAN_TX_RING_LEN(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_TX_RING_LEN(channel));
-	reg_space[DMA_CHAN_RX_RING_LEN(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_RING_LEN(channel));
-	reg_space[DMA_CHAN_INTR_ENA(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_INTR_ENA(channel));
-	reg_space[DMA_CHAN_RX_WATCHDOG(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_RX_WATCHDOG(channel));
-	reg_space[DMA_CHAN_SLOT_CTRL_STATUS(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(channel));
-	reg_space[DMA_CHAN_CUR_TX_DESC(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_TX_DESC(channel));
-	reg_space[DMA_CHAN_CUR_RX_DESC(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_RX_DESC(channel));
-	reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(channel));
-	reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(channel));
-	reg_space[DMA_CHAN_STATUS(channel) / 4] =
-		readl(ioaddr + DMA_CHAN_STATUS(channel));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	const struct dwmac4_addrs *default_addrs = NULL;
+
+	/* Purposely save the registers in the "normal" layout, regardless of
+	 * platform modifications, to keep reg_space size constant
+	 */
+	reg_space[DMA_CHAN_CONTROL(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_CONTROL(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_CONTROL(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_BASE_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_BASE_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_BASE_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_BASE_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_END_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_END_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_END_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_END_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_TX_RING_LEN(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_TX_RING_LEN(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_RING_LEN(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_RING_LEN(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_INTR_ENA(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_RX_WATCHDOG(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_RX_WATCHDOG(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_SLOT_CTRL_STATUS(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_SLOT_CTRL_STATUS(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_TX_DESC(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_TX_DESC(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_RX_DESC(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_RX_DESC(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_TX_BUF_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_TX_BUF_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_CUR_RX_BUF_ADDR(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_CUR_RX_BUF_ADDR(dwmac4_addrs, channel));
+	reg_space[DMA_CHAN_STATUS(default_addrs, channel) / 4] =
+		readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, channel));
 }
 
 static void dwmac4_dump_dma_regs(struct stmmac_priv *priv, void __iomem *ioaddr,
@@ -232,17 +245,20 @@ static void dwmac4_dump_dma_regs(struct stmmac_priv *priv, void __iomem *ioaddr,
 static void dwmac4_rx_watchdog(struct stmmac_priv *priv, void __iomem *ioaddr,
 			       u32 riwt, u32 queue)
 {
-	writel(riwt, ioaddr + DMA_CHAN_RX_WATCHDOG(queue));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(riwt, ioaddr + DMA_CHAN_RX_WATCHDOG(dwmac4_addrs, queue));
 }
 
 static void dwmac4_dma_rx_chan_op_mode(struct stmmac_priv *priv,
 				       void __iomem *ioaddr, int mode,
 				       u32 channel, int fifosz, u8 qmode)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	unsigned int rqs = fifosz / 256 - 1;
 	u32 mtl_rx_op;
 
-	mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(channel));
+	mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(dwmac4_addrs, channel));
 
 	if (mode == SF_DMA_MODE) {
 		pr_debug("GMAC: enable RX store and forward mode\n");
@@ -300,14 +316,16 @@ static void dwmac4_dma_rx_chan_op_mode(struct stmmac_priv *priv,
 		mtl_rx_op |= rfa << MTL_OP_MODE_RFA_SHIFT;
 	}
 
-	writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(channel));
+	writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(dwmac4_addrs, channel));
 }
 
 static void dwmac4_dma_tx_chan_op_mode(struct stmmac_priv *priv,
 				       void __iomem *ioaddr, int mode,
 				       u32 channel, int fifosz, u8 qmode)
 {
-	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(channel));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(dwmac4_addrs,
+							   channel));
 	unsigned int tqs = fifosz / 256 - 1;
 
 	if (mode == SF_DMA_MODE) {
@@ -353,7 +371,7 @@ static void dwmac4_dma_tx_chan_op_mode(struct stmmac_priv *priv,
 	mtl_tx_op &= ~MTL_OP_MODE_TQS_MASK;
 	mtl_tx_op |= tqs << MTL_OP_MODE_TQS_SHIFT;
 
-	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(channel));
+	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(dwmac4_addrs, channel));
 }
 
 static int dwmac4_get_hw_feature(void __iomem *ioaddr,
@@ -454,25 +472,28 @@ static int dwmac4_get_hw_feature(void __iomem *ioaddr,
 static void dwmac4_enable_tso(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      bool en, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value;
 
 	if (en) {
 		/* enable TSO */
-		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 		writel(value | DMA_CONTROL_TSE,
-		       ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		       ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 	} else {
 		/* enable TSO */
-		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 		writel(value & ~DMA_CONTROL_TSE,
-		       ioaddr + DMA_CHAN_TX_CONTROL(chan));
+		       ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 	}
 }
 
 static void dwmac4_qmode(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 u32 channel, u8 qmode)
 {
-	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(channel));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(dwmac4_addrs,
+							   channel));
 
 	mtl_tx_op &= ~MTL_OP_MODE_TXQEN_MASK;
 	if (qmode != MTL_QUEUE_AVB)
@@ -480,50 +501,54 @@ static void dwmac4_qmode(struct stmmac_priv *priv, void __iomem *ioaddr,
 	else
 		mtl_tx_op |= MTL_OP_MODE_TXQEN_AV;
 
-	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(channel));
+	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(dwmac4_addrs, channel));
 }
 
 static void dwmac4_set_bfsize(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      int bfsize, u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value &= ~DMA_RBSZ_MASK;
 	value |= (bfsize << DMA_RBSZ_SHIFT) & DMA_RBSZ_MASK;
 
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 }
 
 static void dwmac4_enable_sph(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      bool en, u32 chan)
 {
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
 	u32 value = readl(ioaddr + GMAC_EXT_CONFIG);
 
 	value &= ~GMAC_CONFIG_HDSMS;
 	value |= GMAC_CONFIG_HDSMS_256; /* Segment max 256 bytes */
 	writel(value, ioaddr + GMAC_EXT_CONFIG);
 
-	value = readl(ioaddr + DMA_CHAN_CONTROL(chan));
+	value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 	if (en)
 		value |= DMA_CONTROL_SPH;
 	else
 		value &= ~DMA_CONTROL_SPH;
-	writel(value, ioaddr + DMA_CHAN_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan));
 }
 
 static int dwmac4_enable_tbs(struct stmmac_priv *priv, void __iomem *ioaddr,
 			     bool en, u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	if (en)
 		value |= DMA_CONTROL_EDSE;
 	else
 		value &= ~DMA_CONTROL_EDSE;
 
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
-	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan)) & DMA_CONTROL_EDSE;
+	value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs,
+						   chan)) & DMA_CONTROL_EDSE;
 	if (en && !value)
 		return -EIO;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
index 740c3bc8d9a0..358e7dcb6a9a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h
@@ -95,29 +95,41 @@
 /* Following DMA defines are chanels oriented */
 #define DMA_CHAN_BASE_ADDR		0x00001100
 #define DMA_CHAN_BASE_OFFSET		0x80
-#define DMA_CHANX_BASE_ADDR(x)		(DMA_CHAN_BASE_ADDR + \
-					(x * DMA_CHAN_BASE_OFFSET))
+
+static inline u32 dma_chanx_base_addr(const struct dwmac4_addrs *addrs,
+				      const u32 x)
+{
+	u32 addr;
+
+	if (addrs)
+		addr = addrs->dma_chan + (x * addrs->dma_chan_offset);
+	else
+		addr = DMA_CHAN_BASE_ADDR + (x * DMA_CHAN_BASE_OFFSET);
+
+	return addr;
+}
+
 #define DMA_CHAN_REG_NUMBER		17
 
-#define DMA_CHAN_CONTROL(x)		DMA_CHANX_BASE_ADDR(x)
-#define DMA_CHAN_TX_CONTROL(x)		(DMA_CHANX_BASE_ADDR(x) + 0x4)
-#define DMA_CHAN_RX_CONTROL(x)		(DMA_CHANX_BASE_ADDR(x) + 0x8)
-#define DMA_CHAN_TX_BASE_ADDR_HI(x)	(DMA_CHANX_BASE_ADDR(x) + 0x10)
-#define DMA_CHAN_TX_BASE_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x14)
-#define DMA_CHAN_RX_BASE_ADDR_HI(x)	(DMA_CHANX_BASE_ADDR(x) + 0x18)
-#define DMA_CHAN_RX_BASE_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x1c)
-#define DMA_CHAN_TX_END_ADDR(x)		(DMA_CHANX_BASE_ADDR(x) + 0x20)
-#define DMA_CHAN_RX_END_ADDR(x)		(DMA_CHANX_BASE_ADDR(x) + 0x28)
-#define DMA_CHAN_TX_RING_LEN(x)		(DMA_CHANX_BASE_ADDR(x) + 0x2c)
-#define DMA_CHAN_RX_RING_LEN(x)		(DMA_CHANX_BASE_ADDR(x) + 0x30)
-#define DMA_CHAN_INTR_ENA(x)		(DMA_CHANX_BASE_ADDR(x) + 0x34)
-#define DMA_CHAN_RX_WATCHDOG(x)		(DMA_CHANX_BASE_ADDR(x) + 0x38)
-#define DMA_CHAN_SLOT_CTRL_STATUS(x)	(DMA_CHANX_BASE_ADDR(x) + 0x3c)
-#define DMA_CHAN_CUR_TX_DESC(x)		(DMA_CHANX_BASE_ADDR(x) + 0x44)
-#define DMA_CHAN_CUR_RX_DESC(x)		(DMA_CHANX_BASE_ADDR(x) + 0x4c)
-#define DMA_CHAN_CUR_TX_BUF_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x54)
-#define DMA_CHAN_CUR_RX_BUF_ADDR(x)	(DMA_CHANX_BASE_ADDR(x) + 0x5c)
-#define DMA_CHAN_STATUS(x)		(DMA_CHANX_BASE_ADDR(x) + 0x60)
+#define DMA_CHAN_CONTROL(addrs, x)	dma_chanx_base_addr(addrs, x)
+#define DMA_CHAN_TX_CONTROL(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x4)
+#define DMA_CHAN_RX_CONTROL(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x8)
+#define DMA_CHAN_TX_BASE_ADDR_HI(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x10)
+#define DMA_CHAN_TX_BASE_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x14)
+#define DMA_CHAN_RX_BASE_ADDR_HI(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x18)
+#define DMA_CHAN_RX_BASE_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x1c)
+#define DMA_CHAN_TX_END_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x20)
+#define DMA_CHAN_RX_END_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x28)
+#define DMA_CHAN_TX_RING_LEN(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x2c)
+#define DMA_CHAN_RX_RING_LEN(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x30)
+#define DMA_CHAN_INTR_ENA(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x34)
+#define DMA_CHAN_RX_WATCHDOG(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x38)
+#define DMA_CHAN_SLOT_CTRL_STATUS(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x3c)
+#define DMA_CHAN_CUR_TX_DESC(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x44)
+#define DMA_CHAN_CUR_RX_DESC(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x4c)
+#define DMA_CHAN_CUR_TX_BUF_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x54)
+#define DMA_CHAN_CUR_RX_BUF_ADDR(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x5c)
+#define DMA_CHAN_STATUS(addrs, x)	(dma_chanx_base_addr(addrs, x) + 0x60)
 
 /* DMA Control X */
 #define DMA_CONTROL_SPH			BIT(24)
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
index 5e9c495aa03e..df41eac54058 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c
@@ -11,6 +11,7 @@
 #include "common.h"
 #include "dwmac4_dma.h"
 #include "dwmac4.h"
+#include "stmmac.h"
 
 int dwmac4_dma_reset(void __iomem *ioaddr)
 {
@@ -28,22 +29,27 @@ int dwmac4_dma_reset(void __iomem *ioaddr)
 void dwmac4_set_rx_tail_ptr(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 tail_ptr, u32 chan)
 {
-	writel(tail_ptr, ioaddr + DMA_CHAN_RX_END_ADDR(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(tail_ptr, ioaddr + DMA_CHAN_RX_END_ADDR(dwmac4_addrs, chan));
 }
 
 void dwmac4_set_tx_tail_ptr(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 tail_ptr, u32 chan)
 {
-	writel(tail_ptr, ioaddr + DMA_CHAN_TX_END_ADDR(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(tail_ptr, ioaddr + DMA_CHAN_TX_END_ADDR(dwmac4_addrs, chan));
 }
 
 void dwmac4_dma_start_tx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	value |= DMA_CONTROL_ST;
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	value = readl(ioaddr + GMAC_CONFIG);
 	value |= GMAC_CONFIG_TE;
@@ -53,20 +59,24 @@ void dwmac4_dma_start_tx(struct stmmac_priv *priv, void __iomem *ioaddr,
 void dwmac4_dma_stop_tx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	u32 value = readl(ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 
 	value &= ~DMA_CONTROL_ST;
-	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_TX_CONTROL(dwmac4_addrs, chan));
 }
 
 void dwmac4_dma_start_rx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value |= DMA_CONTROL_SR;
 
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value = readl(ioaddr + GMAC_CONFIG);
 	value |= GMAC_CONFIG_RE;
@@ -76,81 +86,91 @@ void dwmac4_dma_start_rx(struct stmmac_priv *priv, void __iomem *ioaddr,
 void dwmac4_dma_stop_rx(struct stmmac_priv *priv, void __iomem *ioaddr,
 			u32 chan)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 
 	value &= ~DMA_CONTROL_SR;
-	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
+	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(dwmac4_addrs, chan));
 }
 
 void dwmac4_set_tx_ring_len(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 len, u32 chan)
 {
-	writel(len, ioaddr + DMA_CHAN_TX_RING_LEN(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(len, ioaddr + DMA_CHAN_TX_RING_LEN(dwmac4_addrs, chan));
 }
 
 void dwmac4_set_rx_ring_len(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 len, u32 chan)
 {
-	writel(len, ioaddr + DMA_CHAN_RX_RING_LEN(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+
+	writel(len, ioaddr + DMA_CHAN_RX_RING_LEN(dwmac4_addrs, chan));
 }
 
 void dwmac4_enable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			   u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value |= DMA_CHAN_INTR_DEFAULT_RX;
 	if (tx)
 		value |= DMA_CHAN_INTR_DEFAULT_TX;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 void dwmac410_enable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			     u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value |= DMA_CHAN_INTR_DEFAULT_RX_4_10;
 	if (tx)
 		value |= DMA_CHAN_INTR_DEFAULT_TX_4_10;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 void dwmac4_disable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			    u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_RX;
 	if (tx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_TX;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 void dwmac410_disable_dma_irq(struct stmmac_priv *priv, void __iomem *ioaddr,
 			      u32 chan, bool rx, bool tx)
 {
-	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 value = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 
 	if (rx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_RX_4_10;
 	if (tx)
 		value &= ~DMA_CHAN_INTR_DEFAULT_TX_4_10;
 
-	writel(value, ioaddr + DMA_CHAN_INTR_ENA(chan));
+	writel(value, ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 }
 
 int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr,
 			 struct stmmac_extra_stats *x, u32 chan, u32 dir)
 {
-	u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(chan));
-	u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(chan));
+	const struct dwmac4_addrs *dwmac4_addrs = priv->plat->dwmac4_addrs;
+	u32 intr_status = readl(ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan));
+	u32 intr_en = readl(ioaddr + DMA_CHAN_INTR_ENA(dwmac4_addrs, chan));
 	int ret = 0;
 
 	if (dir == DMA_DIR_RX)
@@ -195,7 +215,8 @@ int dwmac4_dma_interrupt(struct stmmac_priv *priv, void __iomem *ioaddr,
 	if (unlikely(intr_status & DMA_CHAN_STATUS_ERI))
 		x->rx_early_irq++;
 
-	writel(intr_status & intr_en, ioaddr + DMA_CHAN_STATUS(chan));
+	writel(intr_status & intr_en,
+	       ioaddr + DMA_CHAN_STATUS(dwmac4_addrs, chan));
 	return ret;
 }
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dafa001e9e7a..225751a8fd8e 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -186,6 +186,24 @@ struct stmmac_safety_feature_cfg {
 	u32 tmouten;
 };
 
+/* Addresses that may be customized by a platform */
+struct dwmac4_addrs {
+	u32 dma_chan;
+	u32 dma_chan_offset;
+	u32 mtl_chan;
+	u32 mtl_chan_offset;
+	u32 mtl_ets_ctrl;
+	u32 mtl_ets_ctrl_offset;
+	u32 mtl_txq_weight;
+	u32 mtl_txq_weight_offset;
+	u32 mtl_send_slp_cred;
+	u32 mtl_send_slp_cred_offset;
+	u32 mtl_high_cred;
+	u32 mtl_high_cred_offset;
+	u32 mtl_low_cred;
+	u32 mtl_low_cred_offset;
+};
+
 struct plat_stmmacenet_data {
 	int bus_id;
 	int phy_addr;
@@ -274,5 +292,6 @@ struct plat_stmmacenet_data {
 	bool use_phy_wol;
 	bool sph_disable;
 	bool serdes_up_after_phy_linkup;
+	const struct dwmac4_addrs *dwmac4_addrs;
 };
 #endif
-- 
cgit v1.2.3


From 4fdeb847130229dc94befa241461669c7359776b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Mar 2023 16:59:07 +0200
Subject: wifi: ieee80211: clean up public action codes

WLAN_PUBLIC_ACTION_FTM_RESPONSE duplicates WLAN_PUB_ACTION_FTM, which
would be better named WLAN_PUB_ACTION_FTM_RESPONSE. Drop the duplicate
enum, rename the remaining constant and update ieee80211_is_ftm().

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 2463bdd2a382..0583b2b0ce1f 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -9,7 +9,7 @@
  * Copyright (c) 2006, Michael Wu <flamingice@sourmilk.net>
  * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH
  * Copyright (c) 2016 - 2017 Intel Deutschland GmbH
- * Copyright (c) 2018 - 2022 Intel Corporation
+ * Copyright (c) 2018 - 2023 Intel Corporation
  */
 
 #ifndef LINUX_IEEE80211_H
@@ -3557,11 +3557,6 @@ enum ieee80211_unprotected_wnm_actioncode {
 	WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1,
 };
 
-/* Public action codes */
-enum ieee80211_public_actioncode {
-	WLAN_PUBLIC_ACTION_FTM_RESPONSE = 33,
-};
-
 /* Security key length */
 enum ieee80211_key_len {
 	WLAN_KEY_LEN_WEP40 = 5,
@@ -3653,7 +3648,7 @@ enum ieee80211_pub_actioncode {
 	WLAN_PUB_ACTION_NETWORK_CHANNEL_CONTROL = 30,
 	WLAN_PUB_ACTION_WHITE_SPACE_MAP_ANN = 31,
 	WLAN_PUB_ACTION_FTM_REQUEST = 32,
-	WLAN_PUB_ACTION_FTM = 33,
+	WLAN_PUB_ACTION_FTM_RESPONSE = 33,
 	WLAN_PUB_ACTION_FILS_DISCOVERY = 34,
 };
 
@@ -4383,7 +4378,7 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb)
 		return false;
 
 	if (mgmt->u.action.u.ftm.action_code ==
-		WLAN_PUBLIC_ACTION_FTM_RESPONSE &&
+		WLAN_PUB_ACTION_FTM_RESPONSE &&
 	    skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm))
 		return true;
 
-- 
cgit v1.2.3


From 2c9abe653bc5134eeab411c46dde008d8a1c37b0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Mar 2023 16:59:08 +0200
Subject: wifi: ieee80211: correctly mark FTM frames non-bufferable

The checks of whether or not a frame is bufferable were not
taking into account that some action frames aren't, such as
FTM. Check this, which requires some changes to the function
ieee80211_is_bufferable_mmpdu() since we need the whole skb
for the checks now.

Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c |  7 ++--
 drivers/net/wireless/mediatek/mt76/tx.c     |  2 +-
 include/linux/ieee80211.h                   | 52 +++++++++++++++++++++--------
 net/mac80211/tx.c                           |  4 +--
 4 files changed, 45 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 51f21cbf7a20..478442e16d43 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -604,8 +604,9 @@ static void iwl_mvm_skb_prepare_status(struct sk_buff *skb,
 static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm,
 				      struct iwl_mvm_vif_link_info *link,
 				      struct ieee80211_tx_info *info,
-				      struct ieee80211_hdr *hdr)
+				      struct sk_buff *skb)
 {
+	struct ieee80211_hdr *hdr = (void *)skb->data;
 	__le16 fc = hdr->frame_control;
 
 	switch (info->control.vif->type) {
@@ -622,7 +623,7 @@ static int iwl_mvm_get_ctrl_vif_queue(struct iwl_mvm *mvm,
 		 * reason 7 ("Class 3 frame received from nonassociated STA").
 		 */
 		if (ieee80211_is_mgmt(fc) &&
-		    (!ieee80211_is_bufferable_mmpdu(fc) ||
+		    (!ieee80211_is_bufferable_mmpdu(skb) ||
 		     ieee80211_is_deauth(fc) || ieee80211_is_disassoc(fc)))
 			return link->mgmt_queue;
 
@@ -755,7 +756,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
 				sta_id = link->mcast_sta.sta_id;
 
 			queue = iwl_mvm_get_ctrl_vif_queue(mvm, link, &info,
-							   hdr);
+							   skb);
 		} else if (info.control.vif->type == NL80211_IFTYPE_MONITOR) {
 			queue = mvm->snif_queue;
 			sta_id = mvm->snif_sta.sta_id;
diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index 1f309d05380a..3ad9742364ba 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -330,7 +330,7 @@ mt76_tx(struct mt76_phy *phy, struct ieee80211_sta *sta,
 	if ((dev->drv->drv_flags & MT_DRV_HW_MGMT_TXQ) &&
 	    !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
 	    !ieee80211_is_data(hdr->frame_control) &&
-	    !ieee80211_is_bufferable_mmpdu(hdr->frame_control)) {
+	    !ieee80211_is_bufferable_mmpdu(skb)) {
 		qid = MT_TXQ_PSD;
 	}
 
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 0583b2b0ce1f..c4cf296e7eaf 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -782,20 +782,6 @@ static inline bool ieee80211_is_any_nullfunc(__le16 fc)
 	return (ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc));
 }
 
-/**
- * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU
- * @fc: frame control field in little-endian byteorder
- */
-static inline bool ieee80211_is_bufferable_mmpdu(__le16 fc)
-{
-	/* IEEE 802.11-2012, definition of "bufferable management frame";
-	 * note that this ignores the IBSS special case. */
-	return ieee80211_is_mgmt(fc) &&
-	       (ieee80211_is_action(fc) ||
-		ieee80211_is_disassoc(fc) ||
-		ieee80211_is_deauth(fc));
-}
-
 /**
  * ieee80211_is_first_frag - check if IEEE80211_SCTL_FRAG is not set
  * @seq_ctrl: frame sequence control bytes in little-endian byteorder
@@ -4132,6 +4118,44 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr)
 		return hdr->addr1;
 }
 
+/**
+ * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU
+ * @skb: the skb to check, starting with the 802.11 header
+ */
+static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb)
+{
+	struct ieee80211_mgmt *mgmt = (void *)skb->data;
+	__le16 fc = mgmt->frame_control;
+
+	/*
+	 * IEEE 802.11 REVme D2.0 definition of bufferable MMPDU;
+	 * note that this ignores the IBSS special case.
+	 */
+	if (!ieee80211_is_mgmt(fc))
+		return false;
+
+	if (ieee80211_is_disassoc(fc) || ieee80211_is_deauth(fc))
+		return true;
+
+	if (!ieee80211_is_action(fc))
+		return false;
+
+	if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm.action_code))
+		return true;
+
+	/* action frame - additionally check for non-bufferable FTM */
+
+	if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC &&
+	    mgmt->u.action.category != WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION)
+		return true;
+
+	if (mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST ||
+	    mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE)
+		return false;
+
+	return true;
+}
+
 /**
  * _ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame
  * @hdr: the frame (buffer must include at least the first octet of payload)
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index dfe6b9c9b29e..1a3327407552 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -488,7 +488,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx)
 		int ac = skb_get_queue_mapping(tx->skb);
 
 		if (ieee80211_is_mgmt(hdr->frame_control) &&
-		    !ieee80211_is_bufferable_mmpdu(hdr->frame_control)) {
+		    !ieee80211_is_bufferable_mmpdu(tx->skb)) {
 			info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER;
 			return TX_CONTINUE;
 		}
@@ -1323,7 +1323,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
 	if (!(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP) &&
 	    unlikely(!ieee80211_is_data_present(hdr->frame_control))) {
 		if ((!ieee80211_is_mgmt(hdr->frame_control) ||
-		     ieee80211_is_bufferable_mmpdu(hdr->frame_control) ||
+		     ieee80211_is_bufferable_mmpdu(skb) ||
 		     vif->type == NL80211_IFTYPE_STATION) &&
 		    sta && sta->uploaded) {
 			/*
-- 
cgit v1.2.3


From d00800a289c9349bb659a698cbd7bc04521dc927 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 31 Mar 2023 16:59:17 +0200
Subject: wifi: mac80211: add flush_sta method

Some drivers like iwlwifi might have per-STA queues, so we
may want to flush/drop just those queues rather than all
when removing a station. Add a separate method for that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h    |  6 ++++++
 net/mac80211/driver-ops.h | 15 +++++++++++++++
 net/mac80211/sta_info.c   |  8 ++++++--
 net/mac80211/trace.h      |  7 +++++++
 4 files changed, 34 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 679421d37a42..a8dadbd83d95 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3950,6 +3950,10 @@ struct ieee80211_prep_tx_info {
  *	Note that vif can be NULL.
  *	The callback can sleep.
  *
+ * @flush_sta: Flush or drop all pending frames from the hardware queue(s) for
+ *	the given station, as it's about to be removed.
+ *	The callback can sleep.
+ *
  * @channel_switch: Drivers that need (or want) to offload the channel
  *	switch operation for CSAs received from the AP may implement this
  *	callback. They must then call ieee80211_chswitch_done() to indicate
@@ -4415,6 +4419,8 @@ struct ieee80211_ops {
 #endif
 	void (*flush)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 		      u32 queues, bool drop);
+	void (*flush_sta)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
+			  struct ieee80211_sta *sta);
 	void (*channel_switch)(struct ieee80211_hw *hw,
 			       struct ieee80211_vif *vif,
 			       struct ieee80211_channel_switch *ch_switch);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 0bf208f5bbc5..45d3e53c7383 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -649,6 +649,21 @@ static inline void drv_flush(struct ieee80211_local *local,
 	trace_drv_return_void(local);
 }
 
+static inline void drv_flush_sta(struct ieee80211_local *local,
+				 struct ieee80211_sub_if_data *sdata,
+				 struct sta_info *sta)
+{
+	might_sleep();
+
+	if (sdata && !check_sdata_in_driver(sdata))
+		return;
+
+	trace_drv_flush_sta(local, sdata, &sta->sta);
+	if (local->ops->flush_sta)
+		local->ops->flush_sta(&local->hw, &sdata->vif, &sta->sta);
+	trace_drv_return_void(local);
+}
+
 static inline void drv_channel_switch(struct ieee80211_local *local,
 				      struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_channel_switch *ch_switch)
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index ce7c3b997269..1400512e0dde 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1299,8 +1299,12 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
 	 * frames sitting on hardware queues might be sent out without
 	 * any encryption at all.
 	 */
-	if (local->ops->set_key)
-		ieee80211_flush_queues(local, sta->sdata, false);
+	if (local->ops->set_key) {
+		if (local->ops->flush_sta)
+			drv_flush_sta(local, sta->sdata, sta);
+		else
+			ieee80211_flush_queues(local, sta->sdata, false);
+	}
 
 	/* now keys can no longer be reached */
 	ieee80211_free_sta_keys(local, sta);
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index e0ccf5fe708a..de5d69f21306 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -1177,6 +1177,13 @@ TRACE_EVENT(drv_flush,
 	)
 );
 
+DEFINE_EVENT(sta_event, drv_flush_sta,
+	TP_PROTO(struct ieee80211_local *local,
+		 struct ieee80211_sub_if_data *sdata,
+		 struct ieee80211_sta *sta),
+	TP_ARGS(local, sdata, sta)
+);
+
 TRACE_EVENT(drv_channel_switch,
 	TP_PROTO(struct ieee80211_local *local,
 		 struct ieee80211_sub_if_data *sdata,
-- 
cgit v1.2.3


From 1cf3bfc60f9836f44da951f58b6ae24680484b35 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 13 Apr 2023 01:06:32 +0200
Subject: bpf: Support 64-bit pointers to kfuncs

test_ksyms_module fails to emit a kfunc call targeting a module on
s390x, because the verifier stores the difference between kfunc
address and __bpf_call_base in bpf_insn.imm, which is s32, and modules
are roughly (1 << 42) bytes away from the kernel on s390x.

Fix by keeping BTF id in bpf_insn.imm for BPF_PSEUDO_KFUNC_CALLs,
and storing the absolute address in bpf_kfunc_desc.

Introduce bpf_jit_supports_far_kfunc_call() in order to limit this new
behavior to the s390x JIT. Otherwise other JITs need to be modified,
which is not desired.

Introduce bpf_get_kfunc_addr() instead of exposing both
find_kfunc_desc() and struct bpf_kfunc_desc.

In addition to sorting kfuncs by imm, also sort them by offset, in
order to handle conflicting imms from different modules. Do this on
all architectures in order to simplify code.

Factor out resolving specialized kfuncs (XDP and dynptr) from
fixup_kfunc_call(). This was required in the first place, because
fixup_kfunc_call() uses find_kfunc_desc(), which returns a const
pointer, so it's not possible to modify kfunc addr without stripping
const, which is not nice. It also removes repetition of code like:

	if (bpf_jit_supports_far_kfunc_call())
		desc->addr = func;
	else
		insn->imm = BPF_CALL_IMM(func);

and separates kfunc_desc_tab fixups from kfunc_call fixups.

Suggested-by: Jiri Olsa <olsajiri@gmail.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20230412230632.885985-1-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/s390/net/bpf_jit_comp.c |   5 ++
 include/linux/bpf.h          |  10 ++++
 include/linux/filter.h       |   1 +
 kernel/bpf/core.c            |  11 ++++
 kernel/bpf/verifier.c        | 123 +++++++++++++++++++++++++++++--------------
 5 files changed, 110 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index d0846ba818ee..7102e4b674a0 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2001,6 +2001,11 @@ bool bpf_jit_supports_kfunc_call(void)
 	return true;
 }
 
+bool bpf_jit_supports_far_kfunc_call(void)
+{
+	return true;
+}
+
 int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
 		       void *old_addr, void *new_addr)
 {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2c6095bd7d69..88845aadc47d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2295,6 +2295,9 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog);
 const struct btf_func_model *
 bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 			 const struct bpf_insn *insn);
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+		       u16 btf_fd_idx, u8 **func_addr);
+
 struct bpf_core_ctx {
 	struct bpf_verifier_log *log;
 	const struct btf *btf;
@@ -2545,6 +2548,13 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 	return NULL;
 }
 
+static inline int
+bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+		   u16 btf_fd_idx, u8 **func_addr)
+{
+	return -ENOTSUPP;
+}
+
 static inline bool unprivileged_ebpf_enabled(void)
 {
 	return false;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5364b0c52c1d..bbce89937fde 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -920,6 +920,7 @@ void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_kfunc_call(void);
+bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_helper_changes_pkt_data(void *func);
 
 static inline bool bpf_dump_raw_ok(const struct cred *cred)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e2d256c82072..7421487422d4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1187,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 	s16 off = insn->off;
 	s32 imm = insn->imm;
 	u8 *addr;
+	int err;
 
 	*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
 	if (!*func_addr_fixed) {
@@ -1201,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
 			addr = (u8 *)prog->aux->func[off]->bpf_func;
 		else
 			return -EINVAL;
+	} else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+		   bpf_jit_supports_far_kfunc_call()) {
+		err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+		if (err)
+			return err;
 	} else {
 		/* Address of a BPF helper call. Since part of the core
 		 * kernel, it's always at a fixed location. __bpf_call_base
@@ -2732,6 +2738,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e3d5a7a2f428..4aa6d715e655 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -195,6 +195,8 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
 static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
 static int ref_set_non_owning(struct bpf_verifier_env *env,
 			      struct bpf_reg_state *reg);
+static void specialize_kfunc(struct bpf_verifier_env *env,
+			     u32 func_id, u16 offset, unsigned long *addr);
 
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
@@ -2374,6 +2376,7 @@ struct bpf_kfunc_desc {
 	u32 func_id;
 	s32 imm;
 	u16 offset;
+	unsigned long addr;
 };
 
 struct bpf_kfunc_btf {
@@ -2383,6 +2386,11 @@ struct bpf_kfunc_btf {
 };
 
 struct bpf_kfunc_desc_tab {
+	/* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+	 * verification. JITs do lookups by bpf_insn, where func_id may not be
+	 * available, therefore at the end of verification do_misc_fixups()
+	 * sorts this by imm and offset.
+	 */
 	struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
 	u32 nr_descs;
 };
@@ -2423,6 +2431,19 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
 		       sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
 }
 
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+		       u16 btf_fd_idx, u8 **func_addr)
+{
+	const struct bpf_kfunc_desc *desc;
+
+	desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+	if (!desc)
+		return -EFAULT;
+
+	*func_addr = (u8 *)desc->addr;
+	return 0;
+}
+
 static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
 					 s16 offset)
 {
@@ -2602,13 +2623,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 			func_name);
 		return -EINVAL;
 	}
+	specialize_kfunc(env, func_id, offset, &addr);
 
-	call_imm = BPF_CALL_IMM(addr);
-	/* Check whether or not the relative offset overflows desc->imm */
-	if ((unsigned long)(s32)call_imm != call_imm) {
-		verbose(env, "address of kernel function %s is out of range\n",
-			func_name);
-		return -EINVAL;
+	if (bpf_jit_supports_far_kfunc_call()) {
+		call_imm = func_id;
+	} else {
+		call_imm = BPF_CALL_IMM(addr);
+		/* Check whether the relative offset overflows desc->imm */
+		if ((unsigned long)(s32)call_imm != call_imm) {
+			verbose(env, "address of kernel function %s is out of range\n",
+				func_name);
+			return -EINVAL;
+		}
 	}
 
 	if (bpf_dev_bound_kfunc_id(func_id)) {
@@ -2621,6 +2647,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 	desc->func_id = func_id;
 	desc->imm = call_imm;
 	desc->offset = offset;
+	desc->addr = addr;
 	err = btf_distill_func_proto(&env->log, desc_btf,
 				     func_proto, func_name,
 				     &desc->func_model);
@@ -2630,19 +2657,19 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
 	return err;
 }
 
-static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
 {
 	const struct bpf_kfunc_desc *d0 = a;
 	const struct bpf_kfunc_desc *d1 = b;
 
-	if (d0->imm > d1->imm)
-		return 1;
-	else if (d0->imm < d1->imm)
-		return -1;
+	if (d0->imm != d1->imm)
+		return d0->imm < d1->imm ? -1 : 1;
+	if (d0->offset != d1->offset)
+		return d0->offset < d1->offset ? -1 : 1;
 	return 0;
 }
 
-static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
 {
 	struct bpf_kfunc_desc_tab *tab;
 
@@ -2651,7 +2678,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
 		return;
 
 	sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
-	     kfunc_desc_cmp_by_imm, NULL);
+	     kfunc_desc_cmp_by_imm_off, NULL);
 }
 
 bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -2665,13 +2692,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
 {
 	const struct bpf_kfunc_desc desc = {
 		.imm = insn->imm,
+		.offset = insn->off,
 	};
 	const struct bpf_kfunc_desc *res;
 	struct bpf_kfunc_desc_tab *tab;
 
 	tab = prog->aux->kfunc_tab;
 	res = bsearch(&desc, tab->descs, tab->nr_descs,
-		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+		      sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
 
 	return res ? &res->func_model : NULL;
 }
@@ -17293,11 +17321,45 @@ static int fixup_call_args(struct bpf_verifier_env *env)
 	return err;
 }
 
+/* replace a generic kfunc with a specialized version if necessary */
+static void specialize_kfunc(struct bpf_verifier_env *env,
+			     u32 func_id, u16 offset, unsigned long *addr)
+{
+	struct bpf_prog *prog = env->prog;
+	bool seen_direct_write;
+	void *xdp_kfunc;
+	bool is_rdonly;
+
+	if (bpf_dev_bound_kfunc_id(func_id)) {
+		xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+		if (xdp_kfunc) {
+			*addr = (unsigned long)xdp_kfunc;
+			return;
+		}
+		/* fallback to default kfunc when not supported by netdev */
+	}
+
+	if (offset)
+		return;
+
+	if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+		seen_direct_write = env->seen_direct_write;
+		is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+		if (is_rdonly)
+			*addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+		/* restore env->seen_direct_write to its original value, since
+		 * may_access_direct_pkt_data mutates it
+		 */
+		env->seen_direct_write = seen_direct_write;
+	}
+}
+
 static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
 {
 	const struct bpf_kfunc_desc *desc;
-	void *xdp_kfunc;
 
 	if (!insn->imm) {
 		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -17306,18 +17368,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 	*cnt = 0;
 
-	if (bpf_dev_bound_kfunc_id(insn->imm)) {
-		xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
-		if (xdp_kfunc) {
-			insn->imm = BPF_CALL_IMM(xdp_kfunc);
-			return 0;
-		}
-
-		/* fallback to default kfunc when not supported by netdev */
-	}
-
-	/* insn->imm has the btf func_id. Replace it with
-	 * an address (relative to __bpf_call_base).
+	/* insn->imm has the btf func_id. Replace it with an offset relative to
+	 * __bpf_call_base, unless the JIT needs to call functions that are
+	 * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
 	 */
 	desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
 	if (!desc) {
@@ -17326,7 +17379,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EFAULT;
 	}
 
-	insn->imm = desc->imm;
+	if (!bpf_jit_supports_far_kfunc_call())
+		insn->imm = BPF_CALL_IMM(desc->addr);
 	if (insn->off)
 		return 0;
 	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
@@ -17351,17 +17405,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
-	} else if (desc->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
-		bool seen_direct_write = env->seen_direct_write;
-		bool is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
-
-		if (is_rdonly)
-			insn->imm = BPF_CALL_IMM(bpf_dynptr_from_skb_rdonly);
-
-		/* restore env->seen_direct_write to its original value, since
-		 * may_access_direct_pkt_data mutates it
-		 */
-		env->seen_direct_write = seen_direct_write;
 	}
 	return 0;
 }
@@ -17891,7 +17934,7 @@ patch_call_imm:
 		}
 	}
 
-	sort_kfunc_descs_by_imm(env->prog);
+	sort_kfunc_descs_by_imm_off(env->prog);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 9ecd05794b8da1f6cfca4c3721a3b0fed2e21a82 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 12 Apr 2023 15:47:30 +0300
Subject: net: mscc: ocelot: strengthen type of "u32 reg" in I/O accessors

The "u32 reg" argument that is passed to these functions is not a plain
address, but rather a driver-specific encoding of another enum
ocelot_target target in the upper bits, and an index into the
u32 ocelot->map[target][] array in the lower bits. That encoded value
takes the type "enum ocelot_reg" and is what is passed to these I/O
functions, so let's actually use that to prevent type confusion.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot_io.c | 20 +++++++++++---------
 include/soc/mscc/ocelot.h             | 20 +++++++++++---------
 2 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c
index 2067382d0ee1..ddb96f32830d 100644
--- a/drivers/net/ethernet/mscc/ocelot_io.c
+++ b/drivers/net/ethernet/mscc/ocelot_io.c
@@ -10,8 +10,8 @@
 
 #include "ocelot.h"
 
-int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf,
-			  int count)
+int __ocelot_bulk_read_ix(struct ocelot *ocelot, enum ocelot_reg reg,
+			  u32 offset, void *buf, int count)
 {
 	u16 target = reg >> TARGET_OFFSET;
 
@@ -23,7 +23,7 @@ int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf,
 }
 EXPORT_SYMBOL_GPL(__ocelot_bulk_read_ix);
 
-u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset)
+u32 __ocelot_read_ix(struct ocelot *ocelot, enum ocelot_reg reg, u32 offset)
 {
 	u16 target = reg >> TARGET_OFFSET;
 	u32 val;
@@ -36,7 +36,8 @@ u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset)
 }
 EXPORT_SYMBOL_GPL(__ocelot_read_ix);
 
-void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset)
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, enum ocelot_reg reg,
+		       u32 offset)
 {
 	u16 target = reg >> TARGET_OFFSET;
 
@@ -47,8 +48,8 @@ void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset)
 }
 EXPORT_SYMBOL_GPL(__ocelot_write_ix);
 
-void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
-		     u32 offset)
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask,
+		     enum ocelot_reg reg, u32 offset)
 {
 	u16 target = reg >> TARGET_OFFSET;
 
@@ -60,7 +61,7 @@ void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
 }
 EXPORT_SYMBOL_GPL(__ocelot_rmw_ix);
 
-u32 ocelot_port_readl(struct ocelot_port *port, u32 reg)
+u32 ocelot_port_readl(struct ocelot_port *port, enum ocelot_reg reg)
 {
 	struct ocelot *ocelot = port->ocelot;
 	u16 target = reg >> TARGET_OFFSET;
@@ -73,7 +74,7 @@ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg)
 }
 EXPORT_SYMBOL_GPL(ocelot_port_readl);
 
-void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg)
+void ocelot_port_writel(struct ocelot_port *port, u32 val, enum ocelot_reg reg)
 {
 	struct ocelot *ocelot = port->ocelot;
 	u16 target = reg >> TARGET_OFFSET;
@@ -84,7 +85,8 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg)
 }
 EXPORT_SYMBOL_GPL(ocelot_port_writel);
 
-void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg)
+void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask,
+		      enum ocelot_reg reg)
 {
 	u32 cur = ocelot_port_readl(port, reg);
 
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index d757b5e26d26..277e6d1f2096 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -940,15 +940,17 @@ struct ocelot_policer {
 	__ocelot_target_write_ix(ocelot, target, val, reg, 0)
 
 /* I/O */
-u32 ocelot_port_readl(struct ocelot_port *port, u32 reg);
-void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg);
-void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg);
-int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf,
-			  int count);
-u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset);
-void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset);
-void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg,
-		     u32 offset);
+u32 ocelot_port_readl(struct ocelot_port *port, enum ocelot_reg reg);
+void ocelot_port_writel(struct ocelot_port *port, u32 val, enum ocelot_reg reg);
+void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask,
+		      enum ocelot_reg reg);
+int __ocelot_bulk_read_ix(struct ocelot *ocelot, enum ocelot_reg reg,
+			  u32 offset, void *buf, int count);
+u32 __ocelot_read_ix(struct ocelot *ocelot, enum ocelot_reg reg, u32 offset);
+void __ocelot_write_ix(struct ocelot *ocelot, u32 val, enum ocelot_reg reg,
+		       u32 offset);
+void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask,
+		     enum ocelot_reg reg, u32 offset);
 u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target,
 			    u32 reg, u32 offset);
 void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
-- 
cgit v1.2.3


From d54151aa0f4b5c89561705a00d8a5ebb4230028c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:49 +0300
Subject: net: ethtool: create and export ethtool_dev_mm_supported()

Create a wrapper over __ethtool_dev_mm_supported() which also calls
ethnl_ops_begin() and ethnl_ops_complete(). It can be used by other code
layers, such as tc, to make sure that preemptible TCs are supported
(this is true if an underlying MAC Merge layer exists).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool_netlink.h |  6 ++++++
 net/ethtool/mm.c                | 23 +++++++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h
index 17003b385756..fae0dfb9a9c8 100644
--- a/include/linux/ethtool_netlink.h
+++ b/include/linux/ethtool_netlink.h
@@ -39,6 +39,7 @@ void ethtool_aggregate_pause_stats(struct net_device *dev,
 				   struct ethtool_pause_stats *pause_stats);
 void ethtool_aggregate_rmon_stats(struct net_device *dev,
 				  struct ethtool_rmon_stats *rmon_stats);
+bool ethtool_dev_mm_supported(struct net_device *dev);
 
 #else
 static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd)
@@ -112,5 +113,10 @@ ethtool_aggregate_rmon_stats(struct net_device *dev,
 {
 }
 
+static inline bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+	return false;
+}
+
 #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */
 #endif /* _LINUX_ETHTOOL_NETLINK_H_ */
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
index fce3cc2734f9..e00d7d5cea7e 100644
--- a/net/ethtool/mm.c
+++ b/net/ethtool/mm.c
@@ -249,3 +249,26 @@ bool __ethtool_dev_mm_supported(struct net_device *dev)
 
 	return !ret;
 }
+
+bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	bool supported;
+	int ret;
+
+	ASSERT_RTNL();
+
+	if (!ops)
+		return false;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return false;
+
+	supported = __ethtool_dev_mm_supported(dev);
+
+	ethnl_ops_complete(dev);
+
+	return supported;
+}
+EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported);
-- 
cgit v1.2.3


From c54876cd5961ce0f8e74807f79a6739cd6b35ddf Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:53 +0300
Subject: net/sched: pass netlink extack to mqprio and taprio offload

With the multiplexed ndo_setup_tc() model which lacks a first-class
struct netlink_ext_ack * argument, the only way to pass the netlink
extended ACK message down to the device driver is to embed it within the
offload structure.

Do this for struct tc_mqprio_qopt_offload and struct tc_taprio_qopt_offload.

Since struct tc_taprio_qopt_offload also contains a tc_mqprio_qopt_offload
structure, and since device drivers might effectively reuse their mqprio
implementation for the mqprio portion of taprio, we make taprio set the
extack in both offload structures to point at the same netlink extack
message.

In fact, the taprio handling is a bit more tricky, for 2 reasons.

First is because the offload structure has a longer lifetime than the
extack structure. The driver is supposed to populate the extack
synchronously from ndo_setup_tc() and leave it alone afterwards.
To not have any use-after-free surprises, we zero out the extack pointer
when we leave taprio_enable_offload().

The second reason is because taprio does overwrite the extack message on
ndo_setup_tc() error. We need to switch to the weak form of setting an
extack message, which preserves a potential message set by the driver.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h |  2 ++
 net/sched/sch_mqprio.c  |  5 ++++-
 net/sched/sch_taprio.c  | 12 ++++++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index bb0bd69fb655..b43ed4733455 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -166,6 +166,7 @@ struct tc_mqprio_caps {
 struct tc_mqprio_qopt_offload {
 	/* struct tc_mqprio_qopt must always be the first element */
 	struct tc_mqprio_qopt qopt;
+	struct netlink_ext_ack *extack;
 	u16 mode;
 	u16 shaper;
 	u32 flags;
@@ -193,6 +194,7 @@ struct tc_taprio_sched_entry {
 
 struct tc_taprio_qopt_offload {
 	struct tc_mqprio_qopt_offload mqprio;
+	struct netlink_ext_ack *extack;
 	u8 enable;
 	ktime_t base_time;
 	u64 cycle_time;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index aae4d64dbf3f..67d77495c8fd 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -33,9 +33,12 @@ static int mqprio_enable_offload(struct Qdisc *sch,
 				 const struct tc_mqprio_qopt *qopt,
 				 struct netlink_ext_ack *extack)
 {
-	struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
 	struct mqprio_sched *priv = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	struct tc_mqprio_qopt_offload mqprio = {
+		.qopt = *qopt,
+		.extack = extack,
+	};
 	int err, i;
 
 	switch (priv->mode) {
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 1f469861eae3..cbad43019172 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1520,7 +1520,9 @@ static int taprio_enable_offload(struct net_device *dev,
 		return -ENOMEM;
 	}
 	offload->enable = 1;
+	offload->extack = extack;
 	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+	offload->mqprio.extack = extack;
 	taprio_sched_to_offload(dev, sched, offload, &caps);
 
 	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
@@ -1528,14 +1530,20 @@ static int taprio_enable_offload(struct net_device *dev,
 
 	err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
 	if (err < 0) {
-		NL_SET_ERR_MSG(extack,
-			       "Device failed to setup taprio offload");
+		NL_SET_ERR_MSG_WEAK(extack,
+				    "Device failed to setup taprio offload");
 		goto done;
 	}
 
 	q->offloaded = true;
 
 done:
+	/* The offload structure may linger around via a reference taken by the
+	 * device driver, so clear up the netlink extack pointer so that the
+	 * driver isn't tempted to dereference data which stopped being valid
+	 */
+	offload->extack = NULL;
+	offload->mqprio.extack = NULL;
 	taprio_offload_free(offload);
 
 	return err;
-- 
cgit v1.2.3


From f62af20bed2d9e824f51cfc97ff01bc261f40e58 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:54 +0300
Subject: net/sched: mqprio: allow per-TC user input of FP adminStatus

IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each
packet priority can be assigned to a "frame preemption status" value of
either "express" or "preemptible". Express priorities are transmitted by
the local device through the eMAC, and preemptible priorities through
the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge
layer).

The FP adminStatus is defined per packet priority, but 802.1Q clause
12.30.1.1.1 framePreemptionAdminStatus also says that:

| Priorities that all map to the same traffic class should be
| constrained to use the same value of preemption status.

It is impossible to ignore the cognitive dissonance in the standard
here, because it practically means that the FP adminStatus only takes
distinct values per traffic class, even though it is defined per
priority.

I can see no valid use case which is prevented by having the kernel take
the FP adminStatus as input per traffic class (what we do here).
In addition, this also enforces the above constraint by construction.
User space network managers which wish to expose FP adminStatus per
priority are free to do so; they must only observe the prio_tc_map of
the netdev (which presumably is also under their control, when
constructing the mqprio netlink attributes).

The reason for configuring frame preemption as a property of the Qdisc
layer is that the information about "preemptible TCs" is closest to the
place which handles the num_tc and prio_tc_map of the netdev. If the
UAPI would have been any other layer, it would be unclear what to do
with the FP information when num_tc collapses to 0. A key assumption is
that only mqprio/taprio change the num_tc and prio_tc_map of the netdev.
Not sure if that's a great assumption to make.

Having FP in tc-mqprio can be seen as an implementation of the use case
defined in 802.1Q Annex S.2 "Preemption used in isolation". There will
be a separate implementation of FP in tc-taprio, for the other use
cases.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/pkt_sched.h        |   1 +
 include/uapi/linux/pkt_sched.h |  16 ++++++
 net/sched/sch_mqprio.c         | 128 ++++++++++++++++++++++++++++++++++++++++-
 net/sched/sch_mqprio_lib.c     |  14 +++++
 net/sched/sch_mqprio_lib.h     |   2 +
 5 files changed, 160 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b43ed4733455..f436688b6efc 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -172,6 +172,7 @@ struct tc_mqprio_qopt_offload {
 	u32 flags;
 	u64 min_rate[TC_QOPT_MAX_QUEUE];
 	u64 max_rate[TC_QOPT_MAX_QUEUE];
+	unsigned long preemptible_tcs;
 };
 
 struct tc_taprio_caps {
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 000eec106856..b8d29be91b62 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -719,6 +719,11 @@ enum {
 
 #define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
 
+enum {
+	TC_FP_EXPRESS = 1,
+	TC_FP_PREEMPTIBLE = 2,
+};
+
 struct tc_mqprio_qopt {
 	__u8	num_tc;
 	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -732,12 +737,23 @@ struct tc_mqprio_qopt {
 #define TC_MQPRIO_F_MIN_RATE		0x4
 #define TC_MQPRIO_F_MAX_RATE		0x8
 
+enum {
+	TCA_MQPRIO_TC_ENTRY_UNSPEC,
+	TCA_MQPRIO_TC_ENTRY_INDEX,		/* u32 */
+	TCA_MQPRIO_TC_ENTRY_FP,			/* u32 */
+
+	/* add new constants above here */
+	__TCA_MQPRIO_TC_ENTRY_CNT,
+	TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
+};
+
 enum {
 	TCA_MQPRIO_UNSPEC,
 	TCA_MQPRIO_MODE,
 	TCA_MQPRIO_SHAPER,
 	TCA_MQPRIO_MIN_RATE64,
 	TCA_MQPRIO_MAX_RATE64,
+	TCA_MQPRIO_TC_ENTRY,
 	__TCA_MQPRIO_MAX,
 };
 
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 67d77495c8fd..dc5a0ff50b14 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -5,6 +5,7 @@
  * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
  */
 
+#include <linux/ethtool_netlink.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -27,6 +28,7 @@ struct mqprio_sched {
 	u32 flags;
 	u64 min_rate[TC_QOPT_MAX_QUEUE];
 	u64 max_rate[TC_QOPT_MAX_QUEUE];
+	u32 fp[TC_QOPT_MAX_QUEUE];
 };
 
 static int mqprio_enable_offload(struct Qdisc *sch,
@@ -63,6 +65,8 @@ static int mqprio_enable_offload(struct Qdisc *sch,
 		return -EINVAL;
 	}
 
+	mqprio_fp_to_offload(priv->fp, &mqprio);
+
 	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
 					    &mqprio);
 	if (err)
@@ -145,13 +149,95 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
 	return 0;
 }
 
+static const struct
+nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
+	[TCA_MQPRIO_TC_ENTRY_INDEX]	= NLA_POLICY_MAX(NLA_U32,
+							 TC_QOPT_MAX_QUEUE - 1),
+	[TCA_MQPRIO_TC_ENTRY_FP]	= NLA_POLICY_RANGE(NLA_U32,
+							   TC_FP_EXPRESS,
+							   TC_FP_PREEMPTIBLE),
+};
+
 static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
 	[TCA_MQPRIO_MODE]	= { .len = sizeof(u16) },
 	[TCA_MQPRIO_SHAPER]	= { .len = sizeof(u16) },
 	[TCA_MQPRIO_MIN_RATE64]	= { .type = NLA_NESTED },
 	[TCA_MQPRIO_MAX_RATE64]	= { .type = NLA_NESTED },
+	[TCA_MQPRIO_TC_ENTRY]	= { .type = NLA_NESTED },
 };
 
+static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
+				 struct nlattr *opt,
+				 unsigned long *seen_tcs,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1];
+	int err, tc;
+
+	err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
+			       mqprio_tc_entry_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) {
+		NL_SET_ERR_MSG(extack, "TC entry index missing");
+		return -EINVAL;
+	}
+
+	tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
+	if (*seen_tcs & BIT(tc)) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX],
+				    "Duplicate tc entry");
+		return -EINVAL;
+	}
+
+	*seen_tcs |= BIT(tc);
+
+	if (tb[TCA_MQPRIO_TC_ENTRY_FP])
+		fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
+
+	return 0;
+}
+
+static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
+				   int nlattr_opt_len,
+				   struct netlink_ext_ack *extack)
+{
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	bool have_preemption = false;
+	unsigned long seen_tcs = 0;
+	u32 fp[TC_QOPT_MAX_QUEUE];
+	struct nlattr *n;
+	int tc, rem;
+	int err = 0;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		fp[tc] = priv->fp[tc];
+
+	nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
+		if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
+			continue;
+
+		err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
+		if (err)
+			goto out;
+	}
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+		priv->fp[tc] = fp[tc];
+		if (fp[tc] == TC_FP_PREEMPTIBLE)
+			have_preemption = true;
+	}
+
+	if (have_preemption && !ethtool_dev_mm_supported(dev)) {
+		NL_SET_ERR_MSG(extack, "Device does not support preemption");
+		return -EOPNOTSUPP;
+	}
+out:
+	return err;
+}
+
 /* Parse the other netlink attributes that represent the payload of
  * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
  */
@@ -234,6 +320,13 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
 		priv->flags |= TC_MQPRIO_F_MAX_RATE;
 	}
 
+	if (tb[TCA_MQPRIO_TC_ENTRY]) {
+		err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
+					      extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
 
@@ -247,7 +340,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
 	int i, err = -EOPNOTSUPP;
 	struct tc_mqprio_qopt *qopt = NULL;
 	struct tc_mqprio_caps caps;
-	int len;
+	int len, tc;
 
 	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
 	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
@@ -265,6 +358,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
 	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
 
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		priv->fp[tc] = TC_FP_EXPRESS;
+
 	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
 				 &caps, sizeof(caps));
 
@@ -415,6 +511,33 @@ nla_put_failure:
 	return -1;
 }
 
+static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
+				  struct sk_buff *skb)
+{
+	struct nlattr *n;
+	int tc;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+		n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
+		if (!n)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, n);
+	}
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, n);
+	return -EMSGSIZE;
+}
+
 static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct net_device *dev = qdisc_dev(sch);
@@ -465,6 +588,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    (dump_rates(priv, &opt, skb) != 0))
 		goto nla_put_failure;
 
+	if (mqprio_dump_tc_entries(priv, skb))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, nla);
 nla_put_failure:
 	nlmsg_trim(skb, nla);
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
index c58a533b8ec5..83b3793c4012 100644
--- a/net/sched/sch_mqprio_lib.c
+++ b/net/sched/sch_mqprio_lib.c
@@ -114,4 +114,18 @@ void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt
 }
 EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
 
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+			  struct tc_mqprio_qopt_offload *mqprio)
+{
+	unsigned long preemptible_tcs = 0;
+	int tc;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		if (fp[tc] == TC_FP_PREEMPTIBLE)
+			preemptible_tcs |= BIT(tc);
+
+	mqprio->preemptible_tcs = preemptible_tcs;
+}
+EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
+
 MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
index 63f725ab8761..079f597072e3 100644
--- a/net/sched/sch_mqprio_lib.h
+++ b/net/sched/sch_mqprio_lib.h
@@ -14,5 +14,7 @@ int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
 			 struct netlink_ext_ack *extack);
 void mqprio_qopt_reconstruct(struct net_device *dev,
 			     struct tc_mqprio_qopt *qopt);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+			  struct tc_mqprio_qopt_offload *mqprio);
 
 #endif
-- 
cgit v1.2.3


From a721c3e54b80e45cd9202e7fca29ef018bed9069 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 11 Apr 2023 21:01:55 +0300
Subject: net/sched: taprio: allow per-TC user input of FP adminStatus

This is a duplication of the FP adminStatus logic introduced for
tc-mqprio. Offloading is done through the tc_mqprio_qopt_offload
structure embedded within tc_taprio_qopt_offload. So practically, if a
device driver is written to treat the mqprio portion of taprio just like
standalone mqprio, it gets unified handling of frame preemption.

I would have reused more code with taprio, but this is mostly netlink
attribute parsing, which is hard to transform into generic code without
having something that stinks as a result. We have the same variables
with the same semantics, just different nlattr type values
(TCA_MQPRIO_TC_ENTRY=5 vs TCA_TAPRIO_ATTR_TC_ENTRY=12;
TCA_MQPRIO_TC_ENTRY_FP=2 vs TCA_TAPRIO_TC_ENTRY_FP=3, etc) and
consequently, different policies for the nest.

Every time nla_parse_nested() is called, an on-stack table "tb" of
nlattr pointers is allocated statically, up to the maximum understood
nlattr type. That array size is hardcoded as a constant, but when
transforming this into a common parsing function, it would become either
a VLA (which the Linux kernel rightfully doesn't like) or a call to the
allocator.

Having FP adminStatus in tc-taprio can be seen as addressing the 802.1Q
Annex S.3 "Scheduling and preemption used in combination, no HOLD/RELEASE"
and S.4 "Scheduling and preemption used in combination with HOLD/RELEASE"
use cases. HOLD and RELEASE events are emitted towards the underlying
MAC Merge layer when the schedule hits a Set-And-Hold-MAC or a
Set-And-Release-MAC gate operation. So within the tc-taprio UAPI space,
one can distinguish between the 2 use cases by choosing whether to use
the TC_TAPRIO_CMD_SET_AND_HOLD and TC_TAPRIO_CMD_SET_AND_RELEASE gate
operations within the schedule, or just TC_TAPRIO_CMD_SET_GATES.

A small part of the change is dedicated to refactoring the max_sdu
nlattr parsing to put all logic under the "if" that tests for presence
of that nlattr.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h |  1 +
 net/sched/sch_taprio.c         | 65 +++++++++++++++++++++++++++++++++---------
 2 files changed, 53 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index b8d29be91b62..51a7addc56c6 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1252,6 +1252,7 @@ enum {
 	TCA_TAPRIO_TC_ENTRY_UNSPEC,
 	TCA_TAPRIO_TC_ENTRY_INDEX,		/* u32 */
 	TCA_TAPRIO_TC_ENTRY_MAX_SDU,		/* u32 */
+	TCA_TAPRIO_TC_ENTRY_FP,			/* u32 */
 
 	/* add new constants above here */
 	__TCA_TAPRIO_TC_ENTRY_CNT,
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index cbad43019172..76db9a10ef50 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -96,6 +97,7 @@ struct taprio_sched {
 	struct list_head taprio_list;
 	int cur_txq[TC_MAX_QUEUE];
 	u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
+	u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
 	u32 txtime_delay;
 };
 
@@ -1002,6 +1004,9 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
 static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
 	[TCA_TAPRIO_TC_ENTRY_INDEX]	   = { .type = NLA_U32 },
 	[TCA_TAPRIO_TC_ENTRY_MAX_SDU]	   = { .type = NLA_U32 },
+	[TCA_TAPRIO_TC_ENTRY_FP]	   = NLA_POLICY_RANGE(NLA_U32,
+							      TC_FP_EXPRESS,
+							      TC_FP_PREEMPTIBLE),
 };
 
 static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
@@ -1524,6 +1529,7 @@ static int taprio_enable_offload(struct net_device *dev,
 	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
 	offload->mqprio.extack = extack;
 	taprio_sched_to_offload(dev, sched, offload, &caps);
+	mqprio_fp_to_offload(q->fp, &offload->mqprio);
 
 	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
 		offload->max_sdu[tc] = q->max_sdu[tc];
@@ -1671,13 +1677,14 @@ out:
 static int taprio_parse_tc_entry(struct Qdisc *sch,
 				 struct nlattr *opt,
 				 u32 max_sdu[TC_QOPT_MAX_QUEUE],
+				 u32 fp[TC_QOPT_MAX_QUEUE],
 				 unsigned long *seen_tcs,
 				 struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
 	struct net_device *dev = qdisc_dev(sch);
-	u32 val = 0;
 	int err, tc;
+	u32 val;
 
 	err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt,
 			       taprio_tc_policy, extack);
@@ -1702,15 +1709,18 @@ static int taprio_parse_tc_entry(struct Qdisc *sch,
 
 	*seen_tcs |= BIT(tc);
 
-	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU])
+	if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) {
 		val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]);
+		if (val > dev->max_mtu) {
+			NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
+			return -ERANGE;
+		}
 
-	if (val > dev->max_mtu) {
-		NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
-		return -ERANGE;
+		max_sdu[tc] = val;
 	}
 
-	max_sdu[tc] = val;
+	if (tb[TCA_TAPRIO_TC_ENTRY_FP])
+		fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]);
 
 	return 0;
 }
@@ -1720,29 +1730,51 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
 				   struct netlink_ext_ack *extack)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
 	u32 max_sdu[TC_QOPT_MAX_QUEUE];
+	bool have_preemption = false;
 	unsigned long seen_tcs = 0;
+	u32 fp[TC_QOPT_MAX_QUEUE];
 	struct nlattr *n;
 	int tc, rem;
 	int err = 0;
 
-	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
 		max_sdu[tc] = q->max_sdu[tc];
+		fp[tc] = q->fp[tc];
+	}
 
 	nla_for_each_nested(n, opt, rem) {
 		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
 			continue;
 
-		err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+		err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
 					    extack);
 		if (err)
-			goto out;
+			return err;
 	}
 
-	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
 		q->max_sdu[tc] = max_sdu[tc];
+		q->fp[tc] = fp[tc];
+		if (fp[tc] != TC_FP_EXPRESS)
+			have_preemption = true;
+	}
+
+	if (have_preemption) {
+		if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
+			NL_SET_ERR_MSG(extack,
+				       "Preemption only supported with full offload");
+			return -EOPNOTSUPP;
+		}
+
+		if (!ethtool_dev_mm_supported(dev)) {
+			NL_SET_ERR_MSG(extack,
+				       "Device does not support preemption");
+			return -EOPNOTSUPP;
+		}
+	}
 
-out:
 	return err;
 }
 
@@ -2023,7 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
-	int i;
+	int i, tc;
 
 	spin_lock_init(&q->current_entry_lock);
 
@@ -2080,6 +2112,9 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
 		q->qdiscs[i] = qdisc;
 	}
 
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		q->fp[tc] = TC_FP_EXPRESS;
+
 	taprio_detect_broken_mqprio(q);
 
 	return taprio_change(sch, opt, extack);
@@ -2223,6 +2258,7 @@ error_nest:
 }
 
 static int taprio_dump_tc_entries(struct sk_buff *skb,
+				  struct taprio_sched *q,
 				  struct sched_gate_list *sched)
 {
 	struct nlattr *n;
@@ -2240,6 +2276,9 @@ static int taprio_dump_tc_entries(struct sk_buff *skb,
 				sched->max_sdu[tc]))
 			goto nla_put_failure;
 
+		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc]))
+			goto nla_put_failure;
+
 		nla_nest_end(skb, n);
 	}
 
@@ -2281,7 +2320,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
 		goto options_error;
 
-	if (oper && taprio_dump_tc_entries(skb, oper))
+	if (oper && taprio_dump_tc_entries(skb, q, oper))
 		goto options_error;
 
 	if (oper && dump_schedule(skb, oper))
-- 
cgit v1.2.3


From a2917b23497e4205db32271e4e06e142a9f8a6aa Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 12 Apr 2023 14:16:01 -0700
Subject: net: mana: Refactor RX buffer allocation code to prepare for various
 MTU

Move out common buffer allocation code from mana_process_rx_cqe() and
mana_alloc_rx_wqe() to helper functions.
Refactor related variables so they can be changed in one place, and buffer
sizes are in sync.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 154 +++++++++++++++-----------
 include/net/mana/mana.h                       |   6 +-
 2 files changed, 91 insertions(+), 69 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 112c642dc89b..911954ff84ee 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1282,14 +1282,64 @@ drop_xdp:
 	u64_stats_update_end(&rx_stats->syncp);
 
 drop:
-	WARN_ON_ONCE(rxq->xdp_save_page);
-	rxq->xdp_save_page = virt_to_page(buf_va);
+	WARN_ON_ONCE(rxq->xdp_save_va);
+	/* Save for reuse */
+	rxq->xdp_save_va = buf_va;
 
 	++ndev->stats.rx_dropped;
 
 	return;
 }
 
+static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
+			     dma_addr_t *da, bool is_napi)
+{
+	struct page *page;
+	void *va;
+
+	/* Reuse XDP dropped page if available */
+	if (rxq->xdp_save_va) {
+		va = rxq->xdp_save_va;
+		rxq->xdp_save_va = NULL;
+	} else {
+		page = dev_alloc_page();
+		if (!page)
+			return NULL;
+
+		va = page_to_virt(page);
+	}
+
+	*da = dma_map_single(dev, va + XDP_PACKET_HEADROOM, rxq->datasize,
+			     DMA_FROM_DEVICE);
+
+	if (dma_mapping_error(dev, *da)) {
+		put_page(virt_to_head_page(va));
+		return NULL;
+	}
+
+	return va;
+}
+
+/* Allocate frag for rx buffer, and save the old buf */
+static void mana_refill_rxoob(struct device *dev, struct mana_rxq *rxq,
+			      struct mana_recv_buf_oob *rxoob, void **old_buf)
+{
+	dma_addr_t da;
+	void *va;
+
+	va = mana_get_rxfrag(rxq, dev, &da, true);
+
+	if (!va)
+		return;
+
+	dma_unmap_single(dev, rxoob->sgl[0].address, rxq->datasize,
+			 DMA_FROM_DEVICE);
+	*old_buf = rxoob->buf_va;
+
+	rxoob->buf_va = va;
+	rxoob->sgl[0].address = da;
+}
+
 static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 				struct gdma_comp *cqe)
 {
@@ -1299,10 +1349,8 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	struct mana_recv_buf_oob *rxbuf_oob;
 	struct mana_port_context *apc;
 	struct device *dev = gc->dev;
-	void *new_buf, *old_buf;
-	struct page *new_page;
+	void *old_buf = NULL;
 	u32 curr, pktlen;
-	dma_addr_t da;
 
 	apc = netdev_priv(ndev);
 
@@ -1345,40 +1393,11 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	rxbuf_oob = &rxq->rx_oobs[curr];
 	WARN_ON_ONCE(rxbuf_oob->wqe_inf.wqe_size_in_bu != 1);
 
-	/* Reuse XDP dropped page if available */
-	if (rxq->xdp_save_page) {
-		new_page = rxq->xdp_save_page;
-		rxq->xdp_save_page = NULL;
-	} else {
-		new_page = alloc_page(GFP_ATOMIC);
-	}
-
-	if (new_page) {
-		da = dma_map_page(dev, new_page, XDP_PACKET_HEADROOM, rxq->datasize,
-				  DMA_FROM_DEVICE);
-
-		if (dma_mapping_error(dev, da)) {
-			__free_page(new_page);
-			new_page = NULL;
-		}
-	}
-
-	new_buf = new_page ? page_to_virt(new_page) : NULL;
-
-	if (new_buf) {
-		dma_unmap_page(dev, rxbuf_oob->buf_dma_addr, rxq->datasize,
-			       DMA_FROM_DEVICE);
-
-		old_buf = rxbuf_oob->buf_va;
-
-		/* refresh the rxbuf_oob with the new page */
-		rxbuf_oob->buf_va = new_buf;
-		rxbuf_oob->buf_dma_addr = da;
-		rxbuf_oob->sgl[0].address = rxbuf_oob->buf_dma_addr;
-	} else {
-		old_buf = NULL; /* drop the packet if no memory */
-	}
+	mana_refill_rxoob(dev, rxq, rxbuf_oob, &old_buf);
 
+	/* Unsuccessful refill will have old_buf == NULL.
+	 * In this case, mana_rx_skb() will drop the packet.
+	 */
 	mana_rx_skb(old_buf, oob, rxq);
 
 drop:
@@ -1659,8 +1678,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 
 	mana_deinit_cq(apc, &rxq->rx_cq);
 
-	if (rxq->xdp_save_page)
-		__free_page(rxq->xdp_save_page);
+	if (rxq->xdp_save_va)
+		put_page(virt_to_head_page(rxq->xdp_save_va));
 
 	for (i = 0; i < rxq->num_rx_buf; i++) {
 		rx_oob = &rxq->rx_oobs[i];
@@ -1668,10 +1687,10 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 		if (!rx_oob->buf_va)
 			continue;
 
-		dma_unmap_page(dev, rx_oob->buf_dma_addr, rxq->datasize,
-			       DMA_FROM_DEVICE);
+		dma_unmap_single(dev, rx_oob->sgl[0].address,
+				 rx_oob->sgl[0].size, DMA_FROM_DEVICE);
 
-		free_page((unsigned long)rx_oob->buf_va);
+		put_page(virt_to_head_page(rx_oob->buf_va));
 		rx_oob->buf_va = NULL;
 	}
 
@@ -1681,6 +1700,26 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 	kfree(rxq);
 }
 
+static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
+			    struct mana_rxq *rxq, struct device *dev)
+{
+	dma_addr_t da;
+	void *va;
+
+	va = mana_get_rxfrag(rxq, dev, &da, false);
+
+	if (!va)
+		return -ENOMEM;
+
+	rx_oob->buf_va = va;
+
+	rx_oob->sgl[0].address = da;
+	rx_oob->sgl[0].size = rxq->datasize;
+	rx_oob->sgl[0].mem_key = mem_key;
+
+	return 0;
+}
+
 #define MANA_WQE_HEADER_SIZE 16
 #define MANA_WQE_SGE_SIZE 16
 
@@ -1690,9 +1729,8 @@ static int mana_alloc_rx_wqe(struct mana_port_context *apc,
 	struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
 	struct mana_recv_buf_oob *rx_oob;
 	struct device *dev = gc->dev;
-	struct page *page;
-	dma_addr_t da;
 	u32 buf_idx;
+	int ret;
 
 	WARN_ON(rxq->datasize == 0 || rxq->datasize > PAGE_SIZE);
 
@@ -1703,25 +1741,12 @@ static int mana_alloc_rx_wqe(struct mana_port_context *apc,
 		rx_oob = &rxq->rx_oobs[buf_idx];
 		memset(rx_oob, 0, sizeof(*rx_oob));
 
-		page = alloc_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-
-		da = dma_map_page(dev, page, XDP_PACKET_HEADROOM, rxq->datasize,
-				  DMA_FROM_DEVICE);
-
-		if (dma_mapping_error(dev, da)) {
-			__free_page(page);
-			return -ENOMEM;
-		}
-
-		rx_oob->buf_va = page_to_virt(page);
-		rx_oob->buf_dma_addr = da;
-
 		rx_oob->num_sge = 1;
-		rx_oob->sgl[0].address = rx_oob->buf_dma_addr;
-		rx_oob->sgl[0].size = rxq->datasize;
-		rx_oob->sgl[0].mem_key = apc->ac->gdma_dev->gpa_mkey;
+
+		ret = mana_fill_rx_oob(rx_oob, apc->ac->gdma_dev->gpa_mkey, rxq,
+				       dev);
+		if (ret)
+			return ret;
 
 		rx_oob->wqe_req.sgl = rx_oob->sgl;
 		rx_oob->wqe_req.num_sge = rx_oob->num_sge;
@@ -1780,9 +1805,10 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	rxq->ndev = ndev;
 	rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE;
 	rxq->rxq_idx = rxq_idx;
-	rxq->datasize = ALIGN(MAX_FRAME_SIZE, 64);
 	rxq->rxobj = INVALID_MANA_HANDLE;
 
+	rxq->datasize = ALIGN(ETH_FRAME_LEN, 64);
+
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
 		goto out;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index bb11a6535d80..037bcabf6b98 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -36,9 +36,6 @@ enum TRI_STATE {
 
 #define COMP_ENTRY_SIZE 64
 
-#define ADAPTER_MTU_SIZE 1500
-#define MAX_FRAME_SIZE (ADAPTER_MTU_SIZE + 14)
-
 #define RX_BUFFERS_PER_QUEUE 512
 
 #define MAX_SEND_BUFFERS_PER_QUEUE 256
@@ -282,7 +279,6 @@ struct mana_recv_buf_oob {
 	struct gdma_wqe_request wqe_req;
 
 	void *buf_va;
-	dma_addr_t buf_dma_addr;
 
 	/* SGL of the buffer going to be sent has part of the work request. */
 	u32 num_sge;
@@ -322,7 +318,7 @@ struct mana_rxq {
 
 	struct bpf_prog __rcu *bpf_prog;
 	struct xdp_rxq_info xdp_rxq;
-	struct page *xdp_save_page;
+	void *xdp_save_va; /* for reusing */
 	bool xdp_flush;
 	int xdp_rc; /* XDP redirect return code */
 
-- 
cgit v1.2.3


From 2fbbd712baf1c60996554326728bbdbef5616e12 Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 12 Apr 2023 14:16:02 -0700
Subject: net: mana: Enable RX path to handle various MTU sizes

Update RX data path to allocate and use RX queue DMA buffers with
proper size based on potentially various MTU sizes.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 38 ++++++++++++++++++++-------
 include/net/mana/mana.h                       |  7 +++++
 2 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 911954ff84ee..8e7fa6e9c3b5 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1185,10 +1185,10 @@ static void mana_post_pkt_rxq(struct mana_rxq *rxq)
 	WARN_ON_ONCE(recv_buf_oob->wqe_inf.wqe_size_in_bu != 1);
 }
 
-static struct sk_buff *mana_build_skb(void *buf_va, uint pkt_len,
-				      struct xdp_buff *xdp)
+static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va,
+				      uint pkt_len, struct xdp_buff *xdp)
 {
-	struct sk_buff *skb = napi_build_skb(buf_va, PAGE_SIZE);
+	struct sk_buff *skb = napi_build_skb(buf_va, rxq->alloc_size);
 
 	if (!skb)
 		return NULL;
@@ -1196,11 +1196,12 @@ static struct sk_buff *mana_build_skb(void *buf_va, uint pkt_len,
 	if (xdp->data_hard_start) {
 		skb_reserve(skb, xdp->data - xdp->data_hard_start);
 		skb_put(skb, xdp->data_end - xdp->data);
-	} else {
-		skb_reserve(skb, XDP_PACKET_HEADROOM);
-		skb_put(skb, pkt_len);
+		return skb;
 	}
 
+	skb_reserve(skb, rxq->headroom);
+	skb_put(skb, pkt_len);
+
 	return skb;
 }
 
@@ -1233,7 +1234,7 @@ static void mana_rx_skb(void *buf_va, struct mana_rxcomp_oob *cqe,
 	if (act != XDP_PASS && act != XDP_TX)
 		goto drop_xdp;
 
-	skb = mana_build_skb(buf_va, pkt_len, &xdp);
+	skb = mana_build_skb(rxq, buf_va, pkt_len, &xdp);
 
 	if (!skb)
 		goto drop;
@@ -1301,6 +1302,14 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 	if (rxq->xdp_save_va) {
 		va = rxq->xdp_save_va;
 		rxq->xdp_save_va = NULL;
+	} else if (rxq->alloc_size > PAGE_SIZE) {
+		if (is_napi)
+			va = napi_alloc_frag(rxq->alloc_size);
+		else
+			va = netdev_alloc_frag(rxq->alloc_size);
+
+		if (!va)
+			return NULL;
 	} else {
 		page = dev_alloc_page();
 		if (!page)
@@ -1309,7 +1318,7 @@ static void *mana_get_rxfrag(struct mana_rxq *rxq, struct device *dev,
 		va = page_to_virt(page);
 	}
 
-	*da = dma_map_single(dev, va + XDP_PACKET_HEADROOM, rxq->datasize,
+	*da = dma_map_single(dev, va + rxq->headroom, rxq->datasize,
 			     DMA_FROM_DEVICE);
 
 	if (dma_mapping_error(dev, *da)) {
@@ -1732,7 +1741,7 @@ static int mana_alloc_rx_wqe(struct mana_port_context *apc,
 	u32 buf_idx;
 	int ret;
 
-	WARN_ON(rxq->datasize == 0 || rxq->datasize > PAGE_SIZE);
+	WARN_ON(rxq->datasize == 0);
 
 	*rxq_size = 0;
 	*cq_size = 0;
@@ -1788,6 +1797,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_obj_spec wq_spec;
 	struct mana_obj_spec cq_spec;
+	unsigned int mtu = ndev->mtu;
 	struct gdma_queue_spec spec;
 	struct mana_cq *cq = NULL;
 	struct gdma_context *gc;
@@ -1807,7 +1817,15 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	rxq->rxq_idx = rxq_idx;
 	rxq->rxobj = INVALID_MANA_HANDLE;
 
-	rxq->datasize = ALIGN(ETH_FRAME_LEN, 64);
+	rxq->datasize = ALIGN(mtu + ETH_HLEN, 64);
+
+	if (mtu > MANA_XDP_MTU_MAX) {
+		rxq->alloc_size = mtu + MANA_RXBUF_PAD;
+		rxq->headroom = 0;
+	} else {
+		rxq->alloc_size = mtu + MANA_RXBUF_PAD + XDP_PACKET_HEADROOM;
+		rxq->headroom = XDP_PACKET_HEADROOM;
+	}
 
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 037bcabf6b98..fee99d704281 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -291,6 +291,11 @@ struct mana_recv_buf_oob {
 	struct gdma_posted_wqe_info wqe_inf;
 };
 
+#define MANA_RXBUF_PAD (SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) \
+			+ ETH_HLEN)
+
+#define MANA_XDP_MTU_MAX (PAGE_SIZE - MANA_RXBUF_PAD - XDP_PACKET_HEADROOM)
+
 struct mana_rxq {
 	struct gdma_queue *gdma_rq;
 	/* Cache the gdma receive queue id */
@@ -300,6 +305,8 @@ struct mana_rxq {
 	u32 rxq_idx;
 
 	u32 datasize;
+	u32 alloc_size;
+	u32 headroom;
 
 	mana_handle_t rxobj;
 
-- 
cgit v1.2.3


From 80f6215b450eb8e92d8b1f117abf5ecf867f963e Mon Sep 17 00:00:00 2001
From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Wed, 12 Apr 2023 14:16:03 -0700
Subject: net: mana: Add support for jumbo frame

During probe, get the hardware-allowed max MTU by querying the device
configuration. Users can select MTU up to the device limit.
When XDP is in use, limit MTU settings so the buffer size is within
one page. And, when MTU is set to a too large value, XDP is not allowed
to run.
Also, to prevent a failed MTU change from leaving the NIC in a bad state,
pre-allocate all buffers before starting the change. So in a low memory
condition, it will return an error without affecting the NIC.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microsoft/mana/mana_bpf.c |  22 +--
 drivers/net/ethernet/microsoft/mana/mana_en.c  | 217 +++++++++++++++++++++++--
 include/net/mana/gdma.h                        |   4 +
 include/net/mana/mana.h                        |  14 ++
 4 files changed, 233 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
index 3caea631229c..23b1521c0df9 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c
@@ -133,12 +133,6 @@ out:
 	return act;
 }
 
-static unsigned int mana_xdp_fraglen(unsigned int len)
-{
-	return SKB_DATA_ALIGN(len) +
-	       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-}
-
 struct bpf_prog *mana_xdp_get(struct mana_port_context *apc)
 {
 	ASSERT_RTNL();
@@ -179,17 +173,18 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	struct bpf_prog *old_prog;
-	int buf_max;
+	struct gdma_context *gc;
+
+	gc = apc->ac->gdma_dev->gdma_context;
 
 	old_prog = mana_xdp_get(apc);
 
 	if (!old_prog && !prog)
 		return 0;
 
-	buf_max = XDP_PACKET_HEADROOM + mana_xdp_fraglen(ndev->mtu + ETH_HLEN);
-	if (prog && buf_max > PAGE_SIZE) {
-		netdev_err(ndev, "XDP: mtu:%u too large, buf_max:%u\n",
-			   ndev->mtu, buf_max);
+	if (prog && ndev->mtu > MANA_XDP_MTU_MAX) {
+		netdev_err(ndev, "XDP: mtu:%u too large, mtu_max:%lu\n",
+			   ndev->mtu, MANA_XDP_MTU_MAX);
 		NL_SET_ERR_MSG_MOD(extack, "XDP: mtu too large");
 
 		return -EOPNOTSUPP;
@@ -206,6 +201,11 @@ static int mana_xdp_set(struct net_device *ndev, struct bpf_prog *prog,
 	if (apc->port_is_up)
 		mana_chn_setxdp(apc, prog);
 
+	if (prog)
+		ndev->max_mtu = MANA_XDP_MTU_MAX;
+	else
+		ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 8e7fa6e9c3b5..cabecbfa1102 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -427,6 +427,192 @@ static u16 mana_select_queue(struct net_device *ndev, struct sk_buff *skb,
 	return txq;
 }
 
+/* Release pre-allocated RX buffers */
+static void mana_pre_dealloc_rxbufs(struct mana_port_context *mpc)
+{
+	struct device *dev;
+	int i;
+
+	dev = mpc->ac->gdma_dev->gdma_context->dev;
+
+	if (!mpc->rxbufs_pre)
+		goto out1;
+
+	if (!mpc->das_pre)
+		goto out2;
+
+	while (mpc->rxbpre_total) {
+		i = --mpc->rxbpre_total;
+		dma_unmap_single(dev, mpc->das_pre[i], mpc->rxbpre_datasize,
+				 DMA_FROM_DEVICE);
+		put_page(virt_to_head_page(mpc->rxbufs_pre[i]));
+	}
+
+	kfree(mpc->das_pre);
+	mpc->das_pre = NULL;
+
+out2:
+	kfree(mpc->rxbufs_pre);
+	mpc->rxbufs_pre = NULL;
+
+out1:
+	mpc->rxbpre_datasize = 0;
+	mpc->rxbpre_alloc_size = 0;
+	mpc->rxbpre_headroom = 0;
+}
+
+/* Get a buffer from the pre-allocated RX buffers */
+static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
+{
+	struct net_device *ndev = rxq->ndev;
+	struct mana_port_context *mpc;
+	void *va;
+
+	mpc = netdev_priv(ndev);
+
+	if (!mpc->rxbufs_pre || !mpc->das_pre || !mpc->rxbpre_total) {
+		netdev_err(ndev, "No RX pre-allocated bufs\n");
+		return NULL;
+	}
+
+	/* Check sizes to catch unexpected coding error */
+	if (mpc->rxbpre_datasize != rxq->datasize) {
+		netdev_err(ndev, "rxbpre_datasize mismatch: %u: %u\n",
+			   mpc->rxbpre_datasize, rxq->datasize);
+		return NULL;
+	}
+
+	if (mpc->rxbpre_alloc_size != rxq->alloc_size) {
+		netdev_err(ndev, "rxbpre_alloc_size mismatch: %u: %u\n",
+			   mpc->rxbpre_alloc_size, rxq->alloc_size);
+		return NULL;
+	}
+
+	if (mpc->rxbpre_headroom != rxq->headroom) {
+		netdev_err(ndev, "rxbpre_headroom mismatch: %u: %u\n",
+			   mpc->rxbpre_headroom, rxq->headroom);
+		return NULL;
+	}
+
+	mpc->rxbpre_total--;
+
+	*da = mpc->das_pre[mpc->rxbpre_total];
+	va = mpc->rxbufs_pre[mpc->rxbpre_total];
+	mpc->rxbufs_pre[mpc->rxbpre_total] = NULL;
+
+	/* Deallocate the array after all buffers are gone */
+	if (!mpc->rxbpre_total)
+		mana_pre_dealloc_rxbufs(mpc);
+
+	return va;
+}
+
+/* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
+static void mana_get_rxbuf_cfg(int mtu, u32 *datasize, u32 *alloc_size,
+			       u32 *headroom)
+{
+	if (mtu > MANA_XDP_MTU_MAX)
+		*headroom = 0; /* no support for XDP */
+	else
+		*headroom = XDP_PACKET_HEADROOM;
+
+	*alloc_size = mtu + MANA_RXBUF_PAD + *headroom;
+
+	*datasize = ALIGN(mtu + ETH_HLEN, MANA_RX_DATA_ALIGN);
+}
+
+static int mana_pre_alloc_rxbufs(struct mana_port_context *mpc, int new_mtu)
+{
+	struct device *dev;
+	struct page *page;
+	dma_addr_t da;
+	int num_rxb;
+	void *va;
+	int i;
+
+	mana_get_rxbuf_cfg(new_mtu, &mpc->rxbpre_datasize,
+			   &mpc->rxbpre_alloc_size, &mpc->rxbpre_headroom);
+
+	dev = mpc->ac->gdma_dev->gdma_context->dev;
+
+	num_rxb = mpc->num_queues * RX_BUFFERS_PER_QUEUE;
+
+	WARN(mpc->rxbufs_pre, "mana rxbufs_pre exists\n");
+	mpc->rxbufs_pre = kmalloc_array(num_rxb, sizeof(void *), GFP_KERNEL);
+	if (!mpc->rxbufs_pre)
+		goto error;
+
+	mpc->das_pre = kmalloc_array(num_rxb, sizeof(dma_addr_t), GFP_KERNEL);
+	if (!mpc->das_pre)
+		goto error;
+
+	mpc->rxbpre_total = 0;
+
+	for (i = 0; i < num_rxb; i++) {
+		if (mpc->rxbpre_alloc_size > PAGE_SIZE) {
+			va = netdev_alloc_frag(mpc->rxbpre_alloc_size);
+			if (!va)
+				goto error;
+		} else {
+			page = dev_alloc_page();
+			if (!page)
+				goto error;
+
+			va = page_to_virt(page);
+		}
+
+		da = dma_map_single(dev, va + mpc->rxbpre_headroom,
+				    mpc->rxbpre_datasize, DMA_FROM_DEVICE);
+
+		if (dma_mapping_error(dev, da)) {
+			put_page(virt_to_head_page(va));
+			goto error;
+		}
+
+		mpc->rxbufs_pre[i] = va;
+		mpc->das_pre[i] = da;
+		mpc->rxbpre_total = i + 1;
+	}
+
+	return 0;
+
+error:
+	mana_pre_dealloc_rxbufs(mpc);
+	return -ENOMEM;
+}
+
+static int mana_change_mtu(struct net_device *ndev, int new_mtu)
+{
+	struct mana_port_context *mpc = netdev_priv(ndev);
+	unsigned int old_mtu = ndev->mtu;
+	int err;
+
+	/* Pre-allocate buffers to prevent failure in mana_attach later */
+	err = mana_pre_alloc_rxbufs(mpc, new_mtu);
+	if (err) {
+		netdev_err(ndev, "Insufficient memory for new MTU\n");
+		return err;
+	}
+
+	err = mana_detach(ndev, false);
+	if (err) {
+		netdev_err(ndev, "mana_detach failed: %d\n", err);
+		goto out;
+	}
+
+	ndev->mtu = new_mtu;
+
+	err = mana_attach(ndev);
+	if (err) {
+		netdev_err(ndev, "mana_attach failed: %d\n", err);
+		ndev->mtu = old_mtu;
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(mpc);
+	return err;
+}
+
 static const struct net_device_ops mana_devops = {
 	.ndo_open		= mana_open,
 	.ndo_stop		= mana_close,
@@ -436,6 +622,7 @@ static const struct net_device_ops mana_devops = {
 	.ndo_get_stats64	= mana_get_stats64,
 	.ndo_bpf		= mana_bpf,
 	.ndo_xdp_xmit		= mana_xdp_xmit,
+	.ndo_change_mtu		= mana_change_mtu,
 };
 
 static void mana_cleanup_port_context(struct mana_port_context *apc)
@@ -625,6 +812,9 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 
 	mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG,
 			     sizeof(req), sizeof(resp));
+
+	req.hdr.resp.msg_version = GDMA_MESSAGE_V2;
+
 	req.proto_major_ver = proto_major_ver;
 	req.proto_minor_ver = proto_minor_ver;
 	req.proto_micro_ver = proto_micro_ver;
@@ -647,6 +837,11 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 
 	*max_num_vports = resp.max_num_vports;
 
+	if (resp.hdr.response.msg_version == GDMA_MESSAGE_V2)
+		gc->adapter_mtu = resp.adapter_mtu;
+	else
+		gc->adapter_mtu = ETH_FRAME_LEN;
+
 	return 0;
 }
 
@@ -1712,10 +1907,14 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
 static int mana_fill_rx_oob(struct mana_recv_buf_oob *rx_oob, u32 mem_key,
 			    struct mana_rxq *rxq, struct device *dev)
 {
+	struct mana_port_context *mpc = netdev_priv(rxq->ndev);
 	dma_addr_t da;
 	void *va;
 
-	va = mana_get_rxfrag(rxq, dev, &da, false);
+	if (mpc->rxbufs_pre)
+		va = mana_get_rxbuf_pre(rxq, &da);
+	else
+		va = mana_get_rxfrag(rxq, dev, &da, false);
 
 	if (!va)
 		return -ENOMEM;
@@ -1797,7 +1996,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_obj_spec wq_spec;
 	struct mana_obj_spec cq_spec;
-	unsigned int mtu = ndev->mtu;
 	struct gdma_queue_spec spec;
 	struct mana_cq *cq = NULL;
 	struct gdma_context *gc;
@@ -1817,15 +2015,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 	rxq->rxq_idx = rxq_idx;
 	rxq->rxobj = INVALID_MANA_HANDLE;
 
-	rxq->datasize = ALIGN(mtu + ETH_HLEN, 64);
-
-	if (mtu > MANA_XDP_MTU_MAX) {
-		rxq->alloc_size = mtu + MANA_RXBUF_PAD;
-		rxq->headroom = 0;
-	} else {
-		rxq->alloc_size = mtu + MANA_RXBUF_PAD + XDP_PACKET_HEADROOM;
-		rxq->headroom = XDP_PACKET_HEADROOM;
-	}
+	mana_get_rxbuf_cfg(ndev->mtu, &rxq->datasize, &rxq->alloc_size,
+			   &rxq->headroom);
 
 	err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size);
 	if (err)
@@ -2238,8 +2429,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	ndev->netdev_ops = &mana_devops;
 	ndev->ethtool_ops = &mana_ethtool_ops;
 	ndev->mtu = ETH_DATA_LEN;
-	ndev->max_mtu = ndev->mtu;
-	ndev->min_mtu = ndev->mtu;
+	ndev->max_mtu = gc->adapter_mtu - ETH_HLEN;
+	ndev->min_mtu = ETH_MIN_MTU;
 	ndev->needed_headroom = MANA_HEADROOM;
 	ndev->dev_port = port_idx;
 	SET_NETDEV_DEV(ndev, gc->dev);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 56189e4252da..96c120160f15 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -145,6 +145,7 @@ struct gdma_general_req {
 }; /* HW DATA */
 
 #define GDMA_MESSAGE_V1 1
+#define GDMA_MESSAGE_V2 2
 
 struct gdma_general_resp {
 	struct gdma_resp_hdr hdr;
@@ -354,6 +355,9 @@ struct gdma_context {
 	struct gdma_resource	msix_resource;
 	struct gdma_irq_context	*irq_contexts;
 
+	/* L2 MTU */
+	u16 adapter_mtu;
+
 	/* This maps a CQ index to the queue structure. */
 	unsigned int		max_num_cqs;
 	struct gdma_queue	**cq_table;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index fee99d704281..cd386aa7c7cc 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -37,6 +37,7 @@ enum TRI_STATE {
 #define COMP_ENTRY_SIZE 64
 
 #define RX_BUFFERS_PER_QUEUE 512
+#define MANA_RX_DATA_ALIGN 64
 
 #define MAX_SEND_BUFFERS_PER_QUEUE 256
 
@@ -390,6 +391,14 @@ struct mana_port_context {
 	/* This points to an array of num_queues of RQ pointers. */
 	struct mana_rxq **rxqs;
 
+	/* pre-allocated rx buffer array */
+	void **rxbufs_pre;
+	dma_addr_t *das_pre;
+	int rxbpre_total;
+	u32 rxbpre_datasize;
+	u32 rxbpre_alloc_size;
+	u32 rxbpre_headroom;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
@@ -489,6 +498,11 @@ struct mana_query_device_cfg_resp {
 	u16 max_num_vports;
 	u16 reserved;
 	u32 max_num_eqs;
+
+	/* response v2: */
+	u16 adapter_mtu;
+	u16 reserved2;
+	u32 reserved3;
 }; /* HW DATA */
 
 /* Query vPort Configuration */
-- 
cgit v1.2.3


From cc4cffc3c142d57df48c07851862444e1d33bdaa Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Fri, 7 Apr 2023 22:37:52 +0200
Subject: wifi: brcmfmac: add Cypress 43439 SDIO ids

Add SDIO ids for use with the muRata 1YN (Cypress CYW43439).
The odd thing about this is that the previous 1YN, populated
on an M.2 card for evaluation purposes, had a BRCM SDIO vendor ID,
while the chip populated on real hardware has a Cypress one.
The device ID also differs between the two devices. But they
are both 43439 otherwise, so add the IDs for both.

On-device 1YN (43439), the new one, chip label reads "1YN":
```
/sys/.../mmc_host/mmc2/mmc2:0001 # cat vendor device
0x04b4
0xbd3d
```

EA M.2 evaluation board 1YN (43439), the old one, chip label reads "1YN ES1.4":
```
/sys/.../mmc_host/mmc0/mmc0:0001/# cat vendor device
0x02d0
0xa9a6
```

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Marek Vasut <marex@denx.de>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20230407203752.128539-1-marex@denx.de
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 9 ++++++++-
 include/linux/mmc/sdio_ids.h                              | 5 ++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index b7c918f241c9..f5dc3bb11b64 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -965,6 +965,12 @@ out:
 		.driver_data = BRCMF_FWVENDOR_ ## fw_vend \
 	}
 
+#define CYW_SDIO_DEVICE(dev_id, fw_vend) \
+	{ \
+		SDIO_DEVICE(SDIO_VENDOR_ID_CYPRESS, dev_id), \
+		.driver_data = BRCMF_FWVENDOR_ ## fw_vend \
+	}
+
 /* devices we support, null terminated */
 static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43143, WCC),
@@ -979,6 +985,7 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4335_4339, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4339, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43430, WCC),
+	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43439, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4345, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43455, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4354, WCC),
@@ -986,9 +993,9 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = {
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359, WCC),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012, CYW),
-	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752, CYW),
 	BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359, CYW),
+	CYW_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439, CYW),
 	{ /* end: all zeroes */ }
 };
 MODULE_DEVICE_TABLE(sdio, brcmf_sdmmc_ids);
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 66f503ed2448..c653accdc7fd 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -74,10 +74,13 @@
 #define SDIO_DEVICE_ID_BROADCOM_43362		0xa962
 #define SDIO_DEVICE_ID_BROADCOM_43364		0xa9a4
 #define SDIO_DEVICE_ID_BROADCOM_43430		0xa9a6
-#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439	0xa9af
+#define SDIO_DEVICE_ID_BROADCOM_43439		0xa9af
 #define SDIO_DEVICE_ID_BROADCOM_43455		0xa9bf
 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752	0xaae8
 
+#define SDIO_VENDOR_ID_CYPRESS			0x04b4
+#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439	0xbd3d
+
 #define SDIO_VENDOR_ID_MARVELL			0x02df
 #define SDIO_DEVICE_ID_MARVELL_LIBERTAS		0x9103
 #define SDIO_DEVICE_ID_MARVELL_8688_WLAN	0x9104
-- 
cgit v1.2.3


From 8c48eea3adf3119e0a3fc57bd31f6966f26ee784 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 12 Apr 2023 21:26:04 -0700
Subject: page_pool: allow caching from safely localized NAPI

Recent patches to mlx5 mentioned a regression when moving from
driver local page pool to only using the generic page pool code.
Page pool has two recycling paths (1) direct one, which runs in
safe NAPI context (basically consumer context, so producing
can be lockless); and (2) via a ptr_ring, which takes a spin
lock because the freeing can happen from any CPU; producer
and consumer may run concurrently.

Since the page pool code was added, Eric introduced a revised version
of deferred skb freeing. TCP skbs are now usually returned to the CPU
which allocated them, and freed in softirq context. This places the
freeing (producing of pages back to the pool) enticingly close to
the allocation (consumer).

If we can prove that we're freeing in the same softirq context in which
the consumer NAPI will run - lockless use of the cache is perfectly fine,
no need for the lock.

Let drivers link the page pool to a NAPI instance. If the NAPI instance
is scheduled on the same CPU on which we're freeing - place the pages
in the direct cache.

With that and patched bnxt (XDP enabled to engage the page pool, sigh,
bnxt really needs page pool work :() I see a 2.6% perf boost with
a TCP stream test (app on a different physical core than softirq).

The CPU use of relevant functions decreases as expected:

  page_pool_refill_alloc_cache   1.17% -> 0%
  _raw_spin_lock                 2.41% -> 0.98%

Only consider the lockless path to be safe when NAPI is scheduled
- in practice this should cover the majority, if not all, of steady-state
workloads. It's usually the NAPI kicking in that causes the skb flush.

The main case we'll miss out on is when the application runs on the same
CPU as NAPI. In that case we don't use the deferred skb free path.

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Tested-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/page_pool.rst |  1 +
 include/linux/netdevice.h              |  3 +++
 include/linux/skbuff.h                 | 20 +++++++++++++-------
 include/net/page_pool.h                |  3 ++-
 net/core/dev.c                         |  3 +++
 net/core/page_pool.c                   | 15 +++++++++++++--
 net/core/skbuff.c                      |  4 ++--
 7 files changed, 37 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/page_pool.rst b/Documentation/networking/page_pool.rst
index 30f1344e7cca..873efd97f822 100644
--- a/Documentation/networking/page_pool.rst
+++ b/Documentation/networking/page_pool.rst
@@ -165,6 +165,7 @@ Registration
     pp_params.pool_size = DESC_NUM;
     pp_params.nid = NUMA_NO_NODE;
     pp_params.dev = priv->dev;
+    pp_params.napi = napi; /* only if locking is tied to NAPI */
     pp_params.dma_dir = xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
     page_pool = page_pool_create(&pp_params);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 96d27d558b0c..203c0df2046c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
 	unsigned long		gro_bitmask;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
+	/* CPU actively polling if netpoll is configured */
 	int			poll_owner;
 #endif
+	/* CPU on which NAPI has been scheduled for processing */
+	int			list_owner;
 	struct net_device	*dev;
 	struct gro_list		gro_hash[GRO_HASH_BUCKETS];
 	struct sk_buff		*skb;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 494a23a976b0..a823ec3aa326 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+	struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && page_pool_return_skb_page(page, napi_safe))
+		return;
+#endif
+	put_page(page);
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && page_pool_return_skb_page(page))
-		return;
-#endif
-	put_page(page);
+	napi_frag_unref(frag, recycle, false);
 }
 
 /**
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ddfa0b328677..91b808dade82 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
 	unsigned int	pool_size;
 	int		nid;  /* Numa node id to allocate from pages from */
 	struct device	*dev; /* device, for DMA pre-mapping purposes */
+	struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
 	enum dma_data_direction dma_dir; /* DMA mapping direction */
 	unsigned int	max_len; /* max DMA sync memory size */
 	unsigned int	offset;  /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
 	return pool->p.dma_dir;
 }
 
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
 
 struct page_pool *page_pool_create(const struct page_pool_params *params);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index c7f13742b56c..8aea68275172 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4359,6 +4359,7 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	}
 
 	list_add_tail(&napi->poll_list, &sd->poll_list);
+	WRITE_ONCE(napi->list_owner, smp_processor_id());
 	/* If not called from net_rx_action()
 	 * we have to raise NET_RX_SOFTIRQ.
 	 */
@@ -6069,6 +6070,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
+	WRITE_ONCE(n->list_owner, -1);
 
 	val = READ_ONCE(n->state);
 	do {
@@ -6384,6 +6386,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 #ifdef CONFIG_NETPOLL
 	napi->poll_owner = -1;
 #endif
+	napi->list_owner = -1;
 	set_bit(NAPI_STATE_SCHED, &napi->state);
 	set_bit(NAPI_STATE_NPSVC, &napi->state);
 	list_add_rcu(&napi->dev_list, &dev->napi_list);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 193c18799865..2f6bf422ed30 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -19,6 +19,7 @@
 #include <linux/mm.h> /* for put_page() */
 #include <linux/poison.h>
 #include <linux/ethtool.h>
+#include <linux/netdevice.h>
 
 #include <trace/events/page_pool.h>
 
@@ -874,9 +875,11 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
 }
 EXPORT_SYMBOL(page_pool_update_nid);
 
-bool page_pool_return_skb_page(struct page *page)
+bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 {
+	struct napi_struct *napi;
 	struct page_pool *pp;
+	bool allow_direct;
 
 	page = compound_head(page);
 
@@ -892,12 +895,20 @@ bool page_pool_return_skb_page(struct page *page)
 
 	pp = page->pp;
 
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 */
+	napi = pp->p.napi;
+	allow_direct = napi_safe && napi &&
+		READ_ONCE(napi->list_owner) == smp_processor_id();
+
 	/* Driver set this to memory recycling info. Reset it on recycle.
 	 * This will *not* work for NIC using a split-page memory model.
 	 * The page will be returned to the pool here regardless of the
 	 * 'flipped' fragment being in use or not.
 	 */
-	page_pool_put_full_page(pp, page, false);
+	page_pool_put_full_page(pp, page, allow_direct);
 
 	return true;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2b5a98c5cb49..ef81452759be 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -843,7 +843,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return page_pool_return_skb_page(virt_to_page(data));
+	return page_pool_return_skb_page(virt_to_page(data), napi_safe);
 }
 
 static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -889,7 +889,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
 
 free_head:
 	if (shinfo->frag_list)
-- 
cgit v1.2.3


From cd2a8079014aced27da9b2e669784f31680f1351 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:03 -0700
Subject: bpf: Remove btf_field_offs, use btf_record's fields instead

The btf_field_offs struct contains (offset, size) for btf_record fields,
sorted by offset. btf_field_offs is always used in conjunction with
btf_record, whose btf_field 'fields' array holds (offset, type) for each
field; the sizes stored in btf_field_offs are derived from those types
via btf_field_type_size.

This patch adds a size field to struct btf_field and sorts btf_record's
fields by offset, making it possible to get rid of btf_field_offs. Less
data duplication and less code complexity results.

Since btf_field_offs' lifetime closely followed the btf_record used to
populate it, most complexity wins are from removal of initialization
code like:

  if (btf_record_successfully_initialized) {
    foffs = btf_parse_field_offs(rec);
    if (IS_ERR_OR_NULL(foffs))
      // free the btf_record and return err
  }

Other changes in this patch are pretty mechanical:

  * foffs->field_off[i] -> rec->fields[i].offset
  * foffs->field_sz[i] -> rec->fields[i].size
  * Sort rec->fields in btf_parse_fields before returning
    * It's possible that this is necessary independently of other
      changes in this patch. btf_record_find in syscall.c expects
      btf_record's fields to be sorted by offset, yet there's no
      explicit sorting of them before this patch, record's fields are
      populated in the order they're read from BTF struct definition.
      BTF docs don't say anything about the sortedness of struct fields.
  * All functions taking struct btf_field_offs * input now instead take
    struct btf_record *. All callsites of these functions already have
    access to the correct btf_record.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-2-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     | 44 ++++++++++-------------
 include/linux/btf.h     |  2 --
 kernel/bpf/btf.c        | 93 +++++++++++--------------------------------------
 kernel/bpf/helpers.c    |  2 +-
 kernel/bpf/map_in_map.c | 15 --------
 kernel/bpf/syscall.c    | 17 ++-------
 6 files changed, 43 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 88845aadc47d..7888ed497432 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -210,6 +210,7 @@ struct btf_field_graph_root {
 
 struct btf_field {
 	u32 offset;
+	u32 size;
 	enum btf_field_type type;
 	union {
 		struct btf_field_kptr kptr;
@@ -225,12 +226,6 @@ struct btf_record {
 	struct btf_field fields[];
 };
 
-struct btf_field_offs {
-	u32 cnt;
-	u32 field_off[BTF_FIELDS_MAX];
-	u8 field_sz[BTF_FIELDS_MAX];
-};
-
 struct bpf_map {
 	/* The first two cachelines with read-mostly members of which some
 	 * are also accessed in fast-path (e.g. ops, max_entries).
@@ -257,7 +252,6 @@ struct bpf_map {
 	struct obj_cgroup *objcg;
 #endif
 	char name[BPF_OBJ_NAME_LEN];
-	struct btf_field_offs *field_offs;
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
 	 */
@@ -360,14 +354,14 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f
 	return rec->field_mask & type;
 }
 
-static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
+static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 {
 	int i;
 
-	if (!foffs)
+	if (IS_ERR_OR_NULL(rec))
 		return;
-	for (i = 0; i < foffs->cnt; i++)
-		memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]);
+	for (i = 0; i < rec->cnt; i++)
+		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
@@ -379,7 +373,7 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj)
  */
 static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
 {
-	bpf_obj_init(map->field_offs, dst);
+	bpf_obj_init(map->record, dst);
 }
 
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
@@ -399,14 +393,14 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
 }
 
 /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */
-static inline void bpf_obj_memcpy(struct btf_field_offs *foffs,
+static inline void bpf_obj_memcpy(struct btf_record *rec,
 				  void *dst, void *src, u32 size,
 				  bool long_memcpy)
 {
 	u32 curr_off = 0;
 	int i;
 
-	if (likely(!foffs)) {
+	if (IS_ERR_OR_NULL(rec)) {
 		if (long_memcpy)
 			bpf_long_memcpy(dst, src, round_up(size, 8));
 		else
@@ -414,49 +408,49 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs,
 		return;
 	}
 
-	for (i = 0; i < foffs->cnt; i++) {
-		u32 next_off = foffs->field_off[i];
+	for (i = 0; i < rec->cnt; i++) {
+		u32 next_off = rec->fields[i].offset;
 		u32 sz = next_off - curr_off;
 
 		memcpy(dst + curr_off, src + curr_off, sz);
-		curr_off += foffs->field_sz[i] + sz;
+		curr_off += rec->fields[i].size + sz;
 	}
 	memcpy(dst + curr_off, src + curr_off, size - curr_off);
 }
 
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
-	bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false);
+	bpf_obj_memcpy(map->record, dst, src, map->value_size, false);
 }
 
 static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src)
 {
-	bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true);
+	bpf_obj_memcpy(map->record, dst, src, map->value_size, true);
 }
 
-static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size)
+static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size)
 {
 	u32 curr_off = 0;
 	int i;
 
-	if (likely(!foffs)) {
+	if (IS_ERR_OR_NULL(rec)) {
 		memset(dst, 0, size);
 		return;
 	}
 
-	for (i = 0; i < foffs->cnt; i++) {
-		u32 next_off = foffs->field_off[i];
+	for (i = 0; i < rec->cnt; i++) {
+		u32 next_off = rec->fields[i].offset;
 		u32 sz = next_off - curr_off;
 
 		memset(dst + curr_off, 0, sz);
-		curr_off += foffs->field_sz[i] + sz;
+		curr_off += rec->fields[i].size + sz;
 	}
 	memset(dst + curr_off, 0, size - curr_off);
 }
 
 static inline void zero_map_value(struct bpf_map *map, void *dst)
 {
-	bpf_obj_memzero(map->field_offs, dst, map->value_size);
+	bpf_obj_memzero(map->record, dst, map->value_size);
 }
 
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 495250162422..813227bff58a 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -113,7 +113,6 @@ struct btf_id_dtor_kfunc {
 struct btf_struct_meta {
 	u32 btf_id;
 	struct btf_record *record;
-	struct btf_field_offs *field_offs;
 };
 
 struct btf_struct_metas {
@@ -207,7 +206,6 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t);
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size);
 int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec);
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec);
 bool btf_type_is_void(const struct btf_type *t);
 s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
 const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2c2d1fb9f410..f3c998feeccb 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1666,10 +1666,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab)
 
 	if (!tab)
 		return;
-	for (i = 0; i < tab->cnt; i++) {
+	for (i = 0; i < tab->cnt; i++)
 		btf_record_free(tab->types[i].record);
-		kfree(tab->types[i].field_offs);
-	}
 	kfree(tab);
 }
 
@@ -3700,12 +3698,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
 					    __alignof__(struct bpf_rb_node));
 }
 
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+	const struct btf_field *a = (const struct btf_field *)_a;
+	const struct btf_field *b = (const struct btf_field *)_b;
+
+	if (a->offset < b->offset)
+		return -1;
+	else if (a->offset > b->offset)
+		return 1;
+	return 0;
+}
+
 struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
 				    u32 field_mask, u32 value_size)
 {
 	struct btf_field_info info_arr[BTF_FIELDS_MAX];
+	u32 next_off = 0, field_type_size;
 	struct btf_record *rec;
-	u32 next_off = 0;
 	int ret, i, cnt;
 
 	ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
@@ -3725,7 +3735,8 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 	rec->spin_lock_off = -EINVAL;
 	rec->timer_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
-		if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+		field_type_size = btf_field_type_size(info_arr[i].type);
+		if (info_arr[i].off + field_type_size > value_size) {
 			WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
 			ret = -EFAULT;
 			goto end;
@@ -3734,11 +3745,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			ret = -EEXIST;
 			goto end;
 		}
-		next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
+		next_off = info_arr[i].off + field_type_size;
 
 		rec->field_mask |= info_arr[i].type;
 		rec->fields[i].offset = info_arr[i].off;
 		rec->fields[i].type = info_arr[i].type;
+		rec->fields[i].size = field_type_size;
 
 		switch (info_arr[i].type) {
 		case BPF_SPIN_LOCK:
@@ -3808,6 +3820,9 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 		goto end;
 	}
 
+	sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+	       NULL, rec);
+
 	return rec;
 end:
 	btf_record_free(rec);
@@ -3889,61 +3904,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
 	return 0;
 }
 
-static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
-{
-	const u32 a = *(const u32 *)_a;
-	const u32 b = *(const u32 *)_b;
-
-	if (a < b)
-		return -1;
-	else if (a > b)
-		return 1;
-	return 0;
-}
-
-static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
-{
-	struct btf_field_offs *foffs = (void *)priv;
-	u32 *off_base = foffs->field_off;
-	u32 *a = _a, *b = _b;
-	u8 *sz_a, *sz_b;
-
-	sz_a = foffs->field_sz + (a - off_base);
-	sz_b = foffs->field_sz + (b - off_base);
-
-	swap(*a, *b);
-	swap(*sz_a, *sz_b);
-}
-
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
-{
-	struct btf_field_offs *foffs;
-	u32 i, *off;
-	u8 *sz;
-
-	BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
-	if (IS_ERR_OR_NULL(rec))
-		return NULL;
-
-	foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
-	if (!foffs)
-		return ERR_PTR(-ENOMEM);
-
-	off = foffs->field_off;
-	sz = foffs->field_sz;
-	for (i = 0; i < rec->cnt; i++) {
-		off[i] = rec->fields[i].offset;
-		sz[i] = btf_field_type_size(rec->fields[i].type);
-	}
-	foffs->cnt = rec->cnt;
-
-	if (foffs->cnt == 1)
-		return foffs;
-	sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
-	       btf_field_offs_cmp, btf_field_offs_swap, foffs);
-	return foffs;
-}
-
 static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
 			      u32 type_id, void *data, u8 bits_offset,
 			      struct btf_show *show)
@@ -5386,7 +5346,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 	for (i = 1; i < n; i++) {
 		struct btf_struct_metas *new_tab;
 		const struct btf_member *member;
-		struct btf_field_offs *foffs;
 		struct btf_struct_meta *type;
 		struct btf_record *record;
 		const struct btf_type *t;
@@ -5428,17 +5387,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
 			goto free;
 		}
-		foffs = btf_parse_field_offs(record);
-		/* We need the field_offs to be valid for a valid record,
-		 * either both should be set or both should be unset.
-		 */
-		if (IS_ERR_OR_NULL(foffs)) {
-			btf_record_free(record);
-			ret = -EFAULT;
-			goto free;
-		}
 		type->record = record;
-		type->field_offs = foffs;
 		tab->cnt++;
 	}
 	return tab;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f04e60a4847f..e989b6460147 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1893,7 +1893,7 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	if (!p)
 		return NULL;
 	if (meta)
-		bpf_obj_init(meta->field_offs, p);
+		bpf_obj_init(meta->record, p);
 	return p;
 }
 
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 38136ec4e095..2c5c64c2a53b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -56,18 +56,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		ret = PTR_ERR(inner_map_meta->record);
 		goto free;
 	}
-	if (inner_map_meta->record) {
-		struct btf_field_offs *field_offs;
-		/* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
-		 * valid.
-		 */
-		field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
-		if (!field_offs) {
-			ret = -ENOMEM;
-			goto free_rec;
-		}
-		inner_map_meta->field_offs = field_offs;
-	}
 	/* Note: We must use the same BTF, as we also used btf_record_dup above
 	 * which relies on BTF being same for both maps, as some members like
 	 * record->fields.list_head have pointers like value_rec pointing into
@@ -88,8 +76,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 
 	fdput(f);
 	return inner_map_meta;
-free_rec:
-	btf_record_free(inner_map_meta->record);
 free:
 	kfree(inner_map_meta);
 put:
@@ -99,7 +85,6 @@ put:
 
 void bpf_map_meta_free(struct bpf_map *map_meta)
 {
-	kfree(map_meta->field_offs);
 	bpf_map_free_record(map_meta);
 	btf_put(map_meta->btf);
 	kfree(map_meta);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6d575505f89c..c08b7933bf8f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -717,14 +717,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
-	struct btf_field_offs *foffs = map->field_offs;
 	struct btf_record *rec = map->record;
 
 	security_bpf_map_free(map);
 	bpf_map_release_memcg(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
-	/* Delay freeing of field_offs and btf_record for maps, as map_free
+	/* Delay freeing of btf_record for maps, as map_free
 	 * callback usually needs access to them. It is better to do it here
 	 * than require each callback to do the free itself manually.
 	 *
@@ -733,7 +732,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a
 	 * template bpf_map struct used during verification.
 	 */
-	kfree(foffs);
 	btf_record_free(rec);
 }
 
@@ -1125,7 +1123,6 @@ free_map_tab:
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
-	struct btf_field_offs *foffs;
 	struct bpf_map *map;
 	int f_flags;
 	int err;
@@ -1205,17 +1202,9 @@ static int map_create(union bpf_attr *attr)
 			attr->btf_vmlinux_value_type_id;
 	}
 
-
-	foffs = btf_parse_field_offs(map->record);
-	if (IS_ERR(foffs)) {
-		err = PTR_ERR(foffs);
-		goto free_map;
-	}
-	map->field_offs = foffs;
-
 	err = security_bpf_map_alloc(map);
 	if (err)
-		goto free_map_field_offs;
+		goto free_map;
 
 	err = bpf_map_alloc_id(map);
 	if (err)
@@ -1239,8 +1228,6 @@ static int map_create(union bpf_attr *attr)
 
 free_map_sec:
 	security_bpf_map_free(map);
-free_map_field_offs:
-	kfree(map->field_offs);
 free_map:
 	btf_put(map->btf);
 	map->ops->map_free(map);
-- 
cgit v1.2.3


From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:04 -0700
Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing

A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types
meant for use in BPF programs. Similarly to other opaque types like
bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in
user-defined struct types a bpf_refcount can be located, so necessary
btf_record plumbing is added to enable this. bpf_refcount is sized to
hold a refcount_t.

Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in
btf_record as refcount_off in addition to being in the field array.
Caching refcount_off makes sense for this field because further patches
in the series will modify functions that take local kptrs (e.g.
bpf_obj_drop) to change their behavior if the type they're operating on
is refcounted. So enabling fast "is this type refcounted?" checks is
desirable.

No such verifier behavior changes are introduced in this patch, just
logic to recognize 'struct bpf_refcount' in btf_record.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  8 ++++++++
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/btf.c               | 12 +++++++++++-
 kernel/bpf/syscall.c           |  6 +++++-
 tools/include/uapi/linux/bpf.h |  4 ++++
 5 files changed, 32 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7888ed497432..be44d765b7a4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,7 @@ enum btf_field_type {
 	BPF_RB_NODE    = (1 << 7),
 	BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD |
 				 BPF_RB_NODE | BPF_RB_ROOT,
+	BPF_REFCOUNT   = (1 << 8),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -223,6 +224,7 @@ struct btf_record {
 	u32 field_mask;
 	int spin_lock_off;
 	int timer_off;
+	int refcount_off;
 	struct btf_field fields[];
 };
 
@@ -293,6 +295,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_root";
 	case BPF_RB_NODE:
 		return "bpf_rb_node";
+	case BPF_REFCOUNT:
+		return "bpf_refcount";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -317,6 +321,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_root);
 	case BPF_RB_NODE:
 		return sizeof(struct bpf_rb_node);
+	case BPF_REFCOUNT:
+		return sizeof(struct bpf_refcount);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -341,6 +347,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_root);
 	case BPF_RB_NODE:
 		return __alignof__(struct bpf_rb_node);
+	case BPF_REFCOUNT:
+		return __alignof__(struct bpf_refcount);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3823100b7934..4b20a7269bee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6985,6 +6985,10 @@ struct bpf_rb_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_refcount {
+	__u32 :32;
+} __attribute__((aligned(4)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index f3c998feeccb..14889fd5ba8e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3391,6 +3391,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
 	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
 	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
 	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
+	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & BPF_KPTR) {
@@ -3439,6 +3440,7 @@ static int btf_find_struct_field(const struct btf *btf,
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			ret = btf_find_struct(btf, member_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3504,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		case BPF_TIMER:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			ret = btf_find_struct(btf, var_type, off, sz, field_type,
 					      idx < info_cnt ? &info[idx] : &tmp);
 			if (ret < 0)
@@ -3734,6 +3737,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 
 	rec->spin_lock_off = -EINVAL;
 	rec->timer_off = -EINVAL;
+	rec->refcount_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
 		field_type_size = btf_field_type_size(info_arr[i].type);
 		if (info_arr[i].off + field_type_size > value_size) {
@@ -3763,6 +3767,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			/* Cache offset for faster lookup at runtime */
 			rec->timer_off = rec->fields[i].offset;
 			break;
+		case BPF_REFCOUNT:
+			WARN_ON_ONCE(rec->refcount_off >= 0);
+			/* Cache offset for faster lookup at runtime */
+			rec->refcount_off = rec->fields[i].offset;
+			break;
 		case BPF_KPTR_UNREF:
 		case BPF_KPTR_REF:
 			ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
@@ -5308,6 +5317,7 @@ static const char *alloc_obj_fields[] = {
 	"bpf_list_node",
 	"bpf_rb_root",
 	"bpf_rb_node",
+	"bpf_refcount",
 };
 
 static struct btf_struct_metas *
@@ -5381,7 +5391,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
 		type = &tab->types[tab->cnt];
 		type->btf_id = i;
 		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
-						  BPF_RB_ROOT | BPF_RB_NODE, t->size);
+						  BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
 		/* The record cannot be unset, treat it as an error if so */
 		if (IS_ERR_OR_NULL(record)) {
 			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c08b7933bf8f..28eac7434d32 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -552,6 +552,7 @@ void btf_record_free(struct btf_record *rec)
 		case BPF_RB_NODE:
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
+		case BPF_REFCOUNT:
 			/* Nothing to release */
 			break;
 		default:
@@ -599,6 +600,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_RB_NODE:
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
+		case BPF_REFCOUNT:
 			/* Nothing to acquire */
 			break;
 		default:
@@ -705,6 +707,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			break;
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_REFCOUNT:
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -1032,7 +1035,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
-				       BPF_RB_ROOT,
+				       BPF_RB_ROOT | BPF_REFCOUNT,
 				       map->value_size);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
@@ -1071,6 +1074,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 				break;
 			case BPF_KPTR_UNREF:
 			case BPF_KPTR_REF:
+			case BPF_REFCOUNT:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3823100b7934..4b20a7269bee 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6985,6 +6985,10 @@ struct bpf_rb_node {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_refcount {
+	__u32 :32;
+} __attribute__((aligned(4)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
-- 
cgit v1.2.3


From 1512217c47f0e8ea076dd0e67262e5a668a78f01 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:05 -0700
Subject: bpf: Support refcounted local kptrs in existing semantics

A local kptr is considered 'refcounted' when it is of a type that has a
bpf_refcount field. When such a kptr is created, its refcount should be
initialized to 1; when destroyed, the object should be free'd only if a
refcount decr results in 0 refcount.

Existing logic always frees the underlying memory when destroying a
local kptr, and 0-initializes all btf_record fields. This patch adds
checks for "is local kptr refcounted?" and new logic for that case in
the appropriate places.

This patch focuses on changing existing semantics and thus conspicuously
does _not_ provide a way for BPF programs to increment refcount. That
follows later in the series.

__bpf_obj_drop_impl is modified to do the right thing when it sees a
refcounted type. Container types for graph nodes (list, tree, stashed in
map) are migrated to use __bpf_obj_drop_impl as a destructor for their
nodes instead of each having custom destruction code in their _free
paths. Now that "drop" isn't a synonym for "free" when the type is
refcounted it makes sense to centralize this logic.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-4-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  |  3 +++
 kernel/bpf/helpers.c | 21 +++++++++++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index be44d765b7a4..b065de2cfcea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -370,6 +370,9 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 		return;
 	for (i = 0; i < rec->cnt; i++)
 		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);
+
+	if (rec->refcount_off >= 0)
+		refcount_set((refcount_t *)(obj + rec->refcount_off), 1);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e989b6460147..e2dbd9644e5c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1798,6 +1798,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock)
 {
@@ -1828,13 +1830,8 @@ unlock:
 		/* The contained type can also have resources, including a
 		 * bpf_list_head which needs to be freed.
 		 */
-		bpf_obj_free_fields(field->graph_root.value_rec, obj);
-		/* bpf_mem_free requires migrate_disable(), since we can be
-		 * called from map free path as well apart from BPF program (as
-		 * part of map ops doing bpf_obj_free_fields).
-		 */
 		migrate_disable();
-		bpf_mem_free(&bpf_global_ma, obj);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
 		migrate_enable();
 	}
 }
@@ -1871,10 +1868,9 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		obj = pos;
 		obj -= field->graph_root.node_offset;
 
-		bpf_obj_free_fields(field->graph_root.value_rec, obj);
 
 		migrate_disable();
-		bpf_mem_free(&bpf_global_ma, obj);
+		__bpf_obj_drop_impl(obj, field->graph_root.value_rec);
 		migrate_enable();
 	}
 }
@@ -1897,8 +1893,17 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
 	return p;
 }
 
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
 {
+	if (rec && rec->refcount_off >= 0 &&
+	    !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+		/* Object is refcounted and refcount_dec didn't result in 0
+		 * refcount. Return without freeing the object
+		 */
+		return;
+	}
+
 	if (rec)
 		bpf_obj_free_fields(rec, p);
 	bpf_mem_free(&bpf_global_ma, p);
-- 
cgit v1.2.3


From d2dcc67df910dd85253a701b6a5b747f955d28f5 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:07 -0700
Subject: bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to
 possibly fail

Consider this code snippet:

  struct node {
    long key;
    bpf_list_node l;
    bpf_rb_node r;
    bpf_refcount ref;
  }

  int some_bpf_prog(void *ctx)
  {
    struct node *n = bpf_obj_new(/*...*/), *m;

    bpf_spin_lock(&glock);

    bpf_rbtree_add(&some_tree, &n->r, /* ... */);
    m = bpf_refcount_acquire(n);
    bpf_rbtree_add(&other_tree, &m->r, /* ... */);

    bpf_spin_unlock(&glock);

    /* ... */
  }

After bpf_refcount_acquire, n and m point to the same underlying memory,
and that node's bpf_rb_node field is being used by the some_tree insert,
so overwriting it as a result of the second insert is an error. In order
to properly support refcounted nodes, the rbtree and list insert
functions must be allowed to fail. This patch adds such support.

The kfuncs bpf_rbtree_add, bpf_list_push_{front,back} are modified to
return an int indicating success/failure, with 0 -> success, nonzero ->
failure.

bpf_obj_drop on failure
=======================

Currently the only reason an insert can fail is the example above: the
bpf_{list,rb}_node is already in use. When such a failure occurs, the
insert kfuncs will bpf_obj_drop the input node. This allows the insert
operations to logically fail without changing their verifier owning ref
behavior, namely the unconditional release_reference of the input
owning ref.

With insert that always succeeds, ownership of the node is always passed
to the collection, since the node always ends up in the collection.

With a possibly-failed insert w/ bpf_obj_drop, ownership of the node
is always passed either to the collection (success), or to bpf_obj_drop
(failure). Regardless, it's correct to continue unconditionally
releasing the input owning ref, as something is always taking ownership
from the calling program on insert.

Keeping owning ref behavior unchanged results in a nice default UX for
insert functions that can fail. If the program's reaction to a failed
insert is "fine, just get rid of this owning ref for me and let me go
on with my business", then there's no reason to check for failure since
that's default behavior. e.g.:

  long important_failures = 0;

  int some_bpf_prog(void *ctx)
  {
    struct node *n, *m, *o; /* all bpf_obj_new'd */

    bpf_spin_lock(&glock);
    bpf_rbtree_add(&some_tree, &n->node, /* ... */);
    bpf_rbtree_add(&some_tree, &m->node, /* ... */);
    if (bpf_rbtree_add(&some_tree, &o->node, /* ... */)) {
      important_failures++;
    }
    bpf_spin_unlock(&glock);
  }

If we instead chose to pass ownership back to the program on failed
insert - by returning NULL on success or an owning ref on failure -
programs would always have to do something with the returned ref on
failure. The most likely action is probably "I'll just get rid of this
owning ref and go about my business", which ideally would look like:

  if (n = bpf_rbtree_add(&some_tree, &n->node, /* ... */))
    bpf_obj_drop(n);

But bpf_obj_drop isn't allowed in a critical section and inserts must
occur within one, so in reality error handling would become a
hard-to-parse mess.

For refcounted nodes, we can replicate the "pass ownership back to
program on failure" logic with this patch's semantics, albeit in an ugly
way:

  struct node *n = bpf_obj_new(/* ... */), *m;

  bpf_spin_lock(&glock);

  m = bpf_refcount_acquire(n);
  if (bpf_rbtree_add(&some_tree, &n->node, /* ... */)) {
    /* Do something with m */
  }

  bpf_spin_unlock(&glock);
  bpf_obj_drop(m);

bpf_refcount_acquire is used to simulate "return owning ref on failure".
This should be an uncommon occurrence, though.

Addition of two verifier-fixup'd args to collection inserts
===========================================================

The actual bpf_obj_drop kfunc is
bpf_obj_drop_impl(void *, struct btf_struct_meta *), with bpf_obj_drop
macro populating the second arg with 0 and the verifier later filling in
the arg during insn fixup.

Because bpf_rbtree_add and bpf_list_push_{front,back} now might do
bpf_obj_drop, these kfuncs need a btf_struct_meta parameter that can be
passed to bpf_obj_drop_impl.

Similarly, because the 'node' param to those insert functions is the
bpf_{list,rb}_node within the node type, and bpf_obj_drop expects a
pointer to the beginning of the node, the insert functions need to be
able to find the beginning of the node struct. A second
verifier-populated param is necessary: the offset of {list,rb}_node within the
node type.

These two new params allow the insert kfuncs to correctly call
__bpf_obj_drop_impl:

  beginning_of_node = bpf_rb_node_ptr - offset
  if (already_inserted)
    __bpf_obj_drop_impl(beginning_of_node, btf_struct_meta->record);

Similarly to other kfuncs with "hidden" verifier-populated params, the
insert functions are renamed with _impl suffix and a macro is provided
for common usage. For example, bpf_rbtree_add kfunc is now
bpf_rbtree_add_impl and bpf_rbtree_add is now a macro which sets
"hidden" args to 0.

Due to the two new args BPF progs will need to be recompiled to work
with the new _impl kfuncs.

This patch also rewrites the "hidden argument" explanation to more
directly say why the BPF program writer doesn't need to populate the
arguments with anything meaningful.

How does this new logic affect non-owning references?
=====================================================

Currently, non-owning refs are valid until the end of the critical
section in which they're created. We can make this guarantee because, if
a non-owning ref exists, the referent was added to some collection. The
collection will drop() its nodes when it goes away, but it can't go away
while our program is accessing it, so that's not a problem. If the
referent is removed from the collection in the same CS that it was added
in, it can't be bpf_obj_drop'd until after CS end. Those are the only
two ways to free the referent's memory and neither can happen until
after the non-owning ref's lifetime ends.

On first glance, having these collection insert functions potentially
bpf_obj_drop their input seems like it breaks the "can't be
bpf_obj_drop'd until after CS end" line of reasoning. But we care about
the memory not being _freed_ until after CS end, and a previous patch
in the series modified bpf_obj_drop such that it doesn't free refcounted
nodes until refcount == 0. So the statement can be more accurately
rewritten as "can't be free'd until after CS end".

We can prove that this rewritten statement holds for any non-owning
reference produced by collection insert functions:

* If the input to the insert function is _not_ refcounted
  * We have an owning reference to the input, and can conclude it isn't
    in any collection
    * Inserting a node in a collection turns owning refs into
      non-owning, and since our input type isn't refcounted, there's no
      way to obtain additional owning refs to the same underlying
      memory
  * Because our node isn't in any collection, the insert operation
    cannot fail, so bpf_obj_drop will not execute
  * If bpf_obj_drop is guaranteed not to execute, there's no risk of
    memory being free'd

* Otherwise, the input to the insert function is refcounted
  * If the insert operation fails due to the node's list_node or rb_node
    already being in some collection, there was some previous successful
    insert which passed refcount to the collection
  * We have an owning reference to the input; it must have been
    acquired via bpf_refcount_acquire, which bumped the refcount
  * refcount must be >= 2 since there's a valid owning reference and the
    node is already in a collection
  * Insert triggering bpf_obj_drop will decr refcount to >= 1, never
    resulting in a free

So although we may do bpf_obj_drop during the critical section, this
will never result in memory being free'd, and no changes to non-owning
ref logic are needed in this patch.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-6-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h                   |  7 ++-
 kernel/bpf/helpers.c                           | 65 +++++++++++++++------
 kernel/bpf/verifier.c                          | 78 ++++++++++++++++++--------
 tools/testing/selftests/bpf/bpf_experimental.h | 49 ++++++++++++----
 4 files changed, 148 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f03852b89d28..3dd29a53b711 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -464,7 +464,12 @@ struct bpf_insn_aux_data {
 		 */
 		struct bpf_loop_inline_state loop_inline_state;
 	};
-	u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */
+	union {
+		/* remember the size of type passed to bpf_obj_new to rewrite R1 */
+		u64 obj_new_size;
+		/* remember the offset of node field within type to rewrite */
+		u64 insert_off;
+	};
 	struct btf_struct_meta *kptr_struct_meta;
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 57ff8a60222c..5067f8d46872 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1931,7 +1931,8 @@ __bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta
 	return (void *)p__refcounted_kptr;
 }
 
-static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+			  bool tail, struct btf_record *rec, u64 off)
 {
 	struct list_head *n = (void *)node, *h = (void *)head;
 
@@ -1939,17 +1940,35 @@ static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *hea
 		INIT_LIST_HEAD(h);
 	if (unlikely(!n->next))
 		INIT_LIST_HEAD(n);
+	if (!list_empty(n)) {
+		/* Only called from BPF prog, no need to migrate_disable */
+		__bpf_obj_drop_impl(n - off, rec);
+		return -EINVAL;
+	}
+
 	tail ? list_add_tail(n, h) : list_add(n, h);
+
+	return 0;
 }
 
-__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+					 struct bpf_list_node *node,
+					 void *meta__ign, u64 off)
 {
-	return __bpf_list_add(node, head, false);
+	struct btf_struct_meta *meta = meta__ign;
+
+	return __bpf_list_add(node, head, false,
+			      meta ? meta->record : NULL, off);
 }
 
-__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+					struct bpf_list_node *node,
+					void *meta__ign, u64 off)
 {
-	return __bpf_list_add(node, head, true);
+	struct btf_struct_meta *meta = meta__ign;
+
+	return __bpf_list_add(node, head, true,
+			      meta ? meta->record : NULL, off);
 }
 
 static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
@@ -1989,14 +2008,23 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 /* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
  * program
  */
-static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-			     void *less)
+static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			    void *less, struct btf_record *rec, u64 off)
 {
 	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+	struct rb_node *parent = NULL, *n = (struct rb_node *)node;
 	bpf_callback_t cb = (bpf_callback_t)less;
-	struct rb_node *parent = NULL;
 	bool leftmost = true;
 
+	if (!n->__rb_parent_color)
+		RB_CLEAR_NODE(n);
+
+	if (!RB_EMPTY_NODE(n)) {
+		/* Only called from BPF prog, no need to migrate_disable */
+		__bpf_obj_drop_impl(n - off, rec);
+		return -EINVAL;
+	}
+
 	while (*link) {
 		parent = *link;
 		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
@@ -2007,15 +2035,18 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
 		}
 	}
 
-	rb_link_node((struct rb_node *)node, parent, link);
-	rb_insert_color_cached((struct rb_node *)node,
-			       (struct rb_root_cached *)root, leftmost);
+	rb_link_node(n, parent, link);
+	rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+	return 0;
 }
 
-__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-				bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+				    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+				    void *meta__ign, u64 off)
 {
-	__bpf_rbtree_add(root, node, (void *)less);
+	struct btf_struct_meta *meta = meta__ign;
+
+	return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
 }
 
 __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
@@ -2291,14 +2322,14 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
 BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
-BTF_ID_FLAGS(func, bpf_list_push_front)
-BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
 BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
-BTF_ID_FLAGS(func, bpf_rbtree_add)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
 BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
 
 #ifdef CONFIG_CGROUPS
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 29e106f7ccaa..736cb7cec0bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8500,10 +8500,10 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
 					 struct bpf_func_state *callee,
 					 int insn_idx)
 {
-	/* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+	/* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
 	 *                     bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
 	 *
-	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset
+	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
 	 * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
 	 * by this point, so look at 'root'
 	 */
@@ -9571,8 +9571,8 @@ enum special_kfunc_type {
 	KF_bpf_obj_new_impl,
 	KF_bpf_obj_drop_impl,
 	KF_bpf_refcount_acquire_impl,
-	KF_bpf_list_push_front,
-	KF_bpf_list_push_back,
+	KF_bpf_list_push_front_impl,
+	KF_bpf_list_push_back_impl,
 	KF_bpf_list_pop_front,
 	KF_bpf_list_pop_back,
 	KF_bpf_cast_to_kern_ctx,
@@ -9580,7 +9580,7 @@ enum special_kfunc_type {
 	KF_bpf_rcu_read_lock,
 	KF_bpf_rcu_read_unlock,
 	KF_bpf_rbtree_remove,
-	KF_bpf_rbtree_add,
+	KF_bpf_rbtree_add_impl,
 	KF_bpf_rbtree_first,
 	KF_bpf_dynptr_from_skb,
 	KF_bpf_dynptr_from_xdp,
@@ -9592,14 +9592,14 @@ BTF_SET_START(special_kfunc_set)
 BTF_ID(func, bpf_obj_new_impl)
 BTF_ID(func, bpf_obj_drop_impl)
 BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
 BTF_ID(func, bpf_cast_to_kern_ctx)
 BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
@@ -9611,8 +9611,8 @@ BTF_ID_LIST(special_kfunc_list)
 BTF_ID(func, bpf_obj_new_impl)
 BTF_ID(func, bpf_obj_drop_impl)
 BTF_ID(func, bpf_refcount_acquire_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
 BTF_ID(func, bpf_list_pop_front)
 BTF_ID(func, bpf_list_pop_back)
 BTF_ID(func, bpf_cast_to_kern_ctx)
@@ -9620,7 +9620,7 @@ BTF_ID(func, bpf_rdonly_cast)
 BTF_ID(func, bpf_rcu_read_lock)
 BTF_ID(func, bpf_rcu_read_unlock)
 BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
 BTF_ID(func, bpf_rbtree_first)
 BTF_ID(func, bpf_dynptr_from_skb)
 BTF_ID(func, bpf_dynptr_from_xdp)
@@ -9954,15 +9954,15 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
 
 static bool is_bpf_list_api_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
-	       btf_id == special_kfunc_list[KF_bpf_list_push_back] ||
+	return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+	       btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
 	       btf_id == special_kfunc_list[KF_bpf_list_pop_back];
 }
 
 static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
 	       btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
 	       btf_id == special_kfunc_list[KF_bpf_rbtree_first];
 }
@@ -9975,7 +9975,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
 
 static bool is_callback_calling_kfunc(u32 btf_id)
 {
-	return btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
 
 static bool is_rbtree_lock_required_kfunc(u32 btf_id)
@@ -10016,12 +10016,12 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
 
 	switch (node_field_type) {
 	case BPF_LIST_NODE:
-		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
-		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]);
+		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
 		break;
 	case BPF_RB_NODE:
 		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
-		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]);
+		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
 		break;
 	default:
 		verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -10702,10 +10702,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
-	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
-	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back] ||
-	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+	    meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
 		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+		insn_aux->insert_off = regs[BPF_REG_2].off;
 		err = ref_convert_owning_non_owning(env, release_ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
@@ -10721,7 +10722,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
-	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_rbtree_add_callback_state);
 		if (err) {
@@ -14764,7 +14765,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
 		       const struct bpf_reg_state *rcur,
 		       struct bpf_id_pair *idmap)
 {
-	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && 
+	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 	       check_ids(rold->id, rcur->id, idmap) &&
 	       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
 }
@@ -17407,6 +17408,23 @@ static void specialize_kfunc(struct bpf_verifier_env *env,
 	}
 }
 
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+					    u16 struct_meta_reg,
+					    u16 node_offset_reg,
+					    struct bpf_insn *insn,
+					    struct bpf_insn *insn_buf,
+					    int *cnt)
+{
+	struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+	struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+	insn_buf[0] = addr[0];
+	insn_buf[1] = addr[1];
+	insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+	insn_buf[3] = *insn;
+	*cnt = 4;
+}
+
 static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
 {
@@ -17453,6 +17471,20 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		insn_buf[1] = addr[1];
 		insn_buf[2] = *insn;
 		*cnt = 3;
+	} else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+		   desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+		int struct_meta_reg = BPF_REG_3;
+		int node_offset_reg = BPF_REG_4;
+
+		/* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+		if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+			struct_meta_reg = BPF_REG_4;
+			node_offset_reg = BPF_REG_5;
+		}
+
+		__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+						node_offset_reg, insn, insn_buf, cnt);
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 619afcab2ab0..209811b1993a 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -14,7 +14,8 @@
  *	type ID of a struct in program BTF.
  *
  *	The 'local_type_id' parameter must be a known constant.
- *	The 'meta' parameter is a hidden argument that is ignored.
+ *	The 'meta' parameter is rewritten by the verifier, no need for BPF
+ *	program to set it.
  * Returns
  *	A pointer to an object of the type corresponding to the passed in
  *	'local_type_id', or NULL on failure.
@@ -28,7 +29,8 @@ extern void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
  *	Free an allocated object. All fields of the object that require
  *	destruction will be destructed before the storage is freed.
  *
- *	The 'meta' parameter is a hidden argument that is ignored.
+ *	The 'meta' parameter is rewritten by the verifier, no need for BPF
+ *	program to set it.
  * Returns
  *	Void.
  */
@@ -41,7 +43,8 @@ extern void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
  *	Increment the refcount on a refcounted local kptr, turning the
  *	non-owning reference input into an owning reference in the process.
  *
- *	The 'meta' parameter is a hidden argument that is ignored.
+ *	The 'meta' parameter is rewritten by the verifier, no need for BPF
+ *	program to set it.
  * Returns
  *	An owning reference to the object pointed to by 'kptr'
  */
@@ -52,17 +55,35 @@ extern void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
 
 /* Description
  *	Add a new entry to the beginning of the BPF linked list.
+ *
+ *	The 'meta' and 'off' parameters are rewritten by the verifier, no need
+ *	for BPF programs to set them
  * Returns
- *	Void.
+ *	0 if the node was successfully added
+ *	-EINVAL if the node wasn't added because it's already in a list
  */
-extern void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+extern int bpf_list_push_front_impl(struct bpf_list_head *head,
+				    struct bpf_list_node *node,
+				    void *meta, __u64 off) __ksym;
+
+/* Convenience macro to wrap over bpf_list_push_front_impl */
+#define bpf_list_push_front(head, node) bpf_list_push_front_impl(head, node, NULL, 0)
 
 /* Description
  *	Add a new entry to the end of the BPF linked list.
+ *
+ *	The 'meta' and 'off' parameters are rewritten by the verifier, no need
+ *	for BPF programs to set them
  * Returns
- *	Void.
+ *	0 if the node was successfully added
+ *	-EINVAL if the node wasn't added because it's already in a list
  */
-extern void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+extern int bpf_list_push_back_impl(struct bpf_list_head *head,
+				   struct bpf_list_node *node,
+				   void *meta, __u64 off) __ksym;
+
+/* Convenience macro to wrap over bpf_list_push_back_impl */
+#define bpf_list_push_back(head, node) bpf_list_push_back_impl(head, node, NULL, 0)
 
 /* Description
  *	Remove the entry at the beginning of the BPF linked list.
@@ -88,11 +109,19 @@ extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 
 /* Description
  *	Add 'node' to rbtree with root 'root' using comparator 'less'
+ *
+ *	The 'meta' and 'off' parameters are rewritten by the verifier, no need
+ *	for BPF programs to set them
  * Returns
- *	Nothing
+ *	0 if the node was successfully added
+ *	-EINVAL if the node wasn't added because it's already in a tree
  */
-extern void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
-			   bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym;
+extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+			       bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+			       void *meta, __u64 off) __ksym;
+
+/* Convenience macro to wrap over bpf_rbtree_add_impl */
+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
 
 /* Description
  *	Return the first (leftmost) node in input tree
-- 
cgit v1.2.3


From 3e81740a90626024a9d9c6f9bfa3d36204dafefb Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Sat, 15 Apr 2023 13:18:10 -0700
Subject: bpf: Centralize btf_field-specific initialization logic

All btf_fields in an object are 0-initialized by memset in
bpf_obj_init. This might not be a valid initial state for some field
types, in which case kfuncs that use the type will properly initialize
their input if it's been 0-initialized. Some BPF graph collection types
and kfuncs do this: bpf_list_{head,node} and bpf_rb_node.

An earlier patch in this series added the bpf_refcount field, for which
the 0 state indicates that the refcounted object should be free'd.
bpf_obj_init treats this field specially, setting refcount to 1 instead
of relying on scattered "refcount is 0? Must have just been initialized,
let's set to 1" logic in kfuncs.

This patch extends this treatment to list and rbtree field types,
allowing most scattered initialization logic in kfuncs to be removed.

Note that bpf_{list_head,rb_root} may be inside a BPF map, in which case
they'll be 0-initialized without passing through the newly-added logic,
so scattered initialization logic must remain for these collection root
types.

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Link: https://lore.kernel.org/r/20230415201811.343116-9-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 33 +++++++++++++++++++++++++++++----
 kernel/bpf/helpers.c | 14 ++++++--------
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b065de2cfcea..18b592fde896 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,34 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 	}
 }
 
+static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
+{
+	memset(addr, 0, field->size);
+
+	switch (field->type) {
+	case BPF_REFCOUNT:
+		refcount_set((refcount_t *)addr, 1);
+		break;
+	case BPF_RB_NODE:
+		RB_CLEAR_NODE((struct rb_node *)addr);
+		break;
+	case BPF_LIST_HEAD:
+	case BPF_LIST_NODE:
+		INIT_LIST_HEAD((struct list_head *)addr);
+		break;
+	case BPF_RB_ROOT:
+		/* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
+	case BPF_SPIN_LOCK:
+	case BPF_TIMER:
+	case BPF_KPTR_UNREF:
+	case BPF_KPTR_REF:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return;
+	}
+}
+
 static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type)
 {
 	if (IS_ERR_OR_NULL(rec))
@@ -369,10 +397,7 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj)
 	if (IS_ERR_OR_NULL(rec))
 		return;
 	for (i = 0; i < rec->cnt; i++)
-		memset(obj + rec->fields[i].offset, 0, rec->fields[i].size);
-
-	if (rec->refcount_off >= 0)
-		refcount_set((refcount_t *)(obj + rec->refcount_off), 1);
+		bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset);
 }
 
 /* 'dst' must be a temporary buffer and should not point to memory that is being
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1835df333287..00e5fb0682ac 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1936,10 +1936,11 @@ static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head
 {
 	struct list_head *n = (void *)node, *h = (void *)head;
 
+	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+	 * called on its fields, so init here
+	 */
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
-	if (unlikely(!n->next))
-		INIT_LIST_HEAD(n);
 	if (!list_empty(n)) {
 		/* Only called from BPF prog, no need to migrate_disable */
 		__bpf_obj_drop_impl(n - off, rec);
@@ -1975,6 +1976,9 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
 {
 	struct list_head *n, *h = (void *)head;
 
+	/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+	 * called on its fields, so init here
+	 */
 	if (unlikely(!h->next))
 		INIT_LIST_HEAD(h);
 	if (list_empty(h))
@@ -2000,9 +2004,6 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
 	struct rb_root_cached *r = (struct rb_root_cached *)root;
 	struct rb_node *n = (struct rb_node *)node;
 
-	if (!n->__rb_parent_color)
-		RB_CLEAR_NODE(n);
-
 	if (RB_EMPTY_NODE(n))
 		return NULL;
 
@@ -2022,9 +2023,6 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
 	bpf_callback_t cb = (bpf_callback_t)less;
 	bool leftmost = true;
 
-	if (!n->__rb_parent_color)
-		RB_CLEAR_NODE(n);
-
 	if (!RB_EMPTY_NODE(n)) {
 		/* Only called from BPF prog, no need to migrate_disable */
 		__bpf_obj_drop_impl(n - off, rec);
-- 
cgit v1.2.3


From 7b4ddf3920d247c2949073b9c274301c8131332a Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Sun, 16 Apr 2023 03:49:27 -0500
Subject: bpf: Remove KF_KPTR_GET kfunc flag

We've managed to improve the UX for kptrs significantly over the last 9
months. All of the existing use cases which previously had KF_KPTR_GET
kfuncs (struct bpf_cpumask *, struct task_struct *, and struct cgroup *)
have all been updated to be synchronized using RCU. In other words,
their KF_KPTR_GET kfuncs have been removed in favor of KF_RCU |
KF_ACQUIRE kfuncs, with the pointers themselves also being readable from
maps in an RCU read region thanks to the types being RCU safe.

While KF_KPTR_GET was a logical starting point for kptrs, it's become
clear that it's not the correct abstraction. KF_KPTR_GET is a flag
that essentially does nothing other than enforcing that the argument to
a function is a pointer to a referenced kptr map value. At first glance,
that's a useful thing to guarantee to a kfunc. It gives kfuncs the
ability to try and acquire a reference on that kptr without requiring
the BPF prog to do something like this:

struct kptr_type *in_map, *new = NULL;

in_map = bpf_kptr_xchg(&map->value, NULL);
if (in_map) {
        new = bpf_kptr_type_acquire(in_map);
        in_map = bpf_kptr_xchg(&map->value, in_map);
        if (in_map)
                bpf_kptr_type_release(in_map);
}

That's clearly a pretty ugly (and racy) UX, and if using KF_KPTR_GET is
the only alternative, it's better than nothing. However, the problem
with any KF_KPTR_GET kfunc lies in the fact that it always requires some
kind of synchronization in order to safely do an opportunistic acquire
of the kptr in the map. This is because a BPF program running on another
CPU could do a bpf_kptr_xchg() on that map value, and free the kptr
after it's been read by the KF_KPTR_GET kfunc. For example, the
now-removed bpf_task_kptr_get() kfunc did the following:

struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
{
        struct task_struct *p;

        rcu_read_lock();
        p = READ_ONCE(*pp);
        /* If p is non-NULL, it could still be freed by another CPU,
         * so we have to do an opportunistic refcount_inc_not_zero()
         * and return NULL if the task will be freed after the
         * current RCU read region.
         */
        if (p && !refcount_inc_not_zero(&p->rcu_users))
                p = NULL;
        rcu_read_unlock();

        return p;
}

In other words, the kfunc uses RCU to ensure that the task remains valid
after it's been peeked from the map. However, this is completely
redundant with just defining a KF_RCU kfunc that itself does a
refcount_inc_not_zero(), which is exactly what bpf_task_acquire() now
does.

So, the question of whether KF_KPTR_GET is useful is actually, "Are
there any synchronization mechanisms / safety flags that are required by
certain kptrs, but which are not provided by the verifier to kfuncs?"
The answer to that question today is "No", because every kptr we
currently care about is RCU protected.

Even if the answer ever became "yes", the proper way to support that
referenced kptr type would be to add support for whatever
synchronization mechanism it requires in the verifier, rather than
giving kfuncs a flag that says, "Here's a pointer to a referenced kptr
in a map, do whatever you need to do."

With all that said -- so as to allow us to consolidate the kfunc API,
and simplify the verifier a bit, this patch removes KF_KPTR_GET, and all
relevant logic from the verifier.

Signed-off-by: David Vernet <void@manifault.com>
Link: https://lore.kernel.org/r/20230416084928.326135-3-void@manifault.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h   |  1 -
 kernel/bpf/verifier.c | 65 ---------------------------------------------------
 2 files changed, 66 deletions(-)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 813227bff58a..508199e38415 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -18,7 +18,6 @@
 #define KF_ACQUIRE	(1 << 0) /* kfunc is an acquire function */
 #define KF_RELEASE	(1 << 1) /* kfunc is a release function */
 #define KF_RET_NULL	(1 << 2) /* kfunc returns a pointer that may be NULL */
-#define KF_KPTR_GET	(1 << 3) /* kfunc returns reference to a kptr */
 /* Trusted arguments are those which are guaranteed to be valid when passed to
  * the kfunc. It is used to enforce that pointers obtained from either acquire
  * kfuncs, or from the main kernel on a tracepoint or struct_ops callback
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a41b69a424e..5dae11ee01c3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9339,11 +9339,6 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->kfunc_flags & KF_RCU;
 }
 
-static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
-{
-	return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
-}
-
 static bool __kfunc_param_match_suffix(const struct btf *btf,
 				       const struct btf_param *arg,
 				       const char *suffix)
@@ -9554,7 +9549,6 @@ enum kfunc_ptr_arg_type {
 	KF_ARG_PTR_TO_CTX,
 	KF_ARG_PTR_TO_ALLOC_BTF_ID,    /* Allocated object */
 	KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
-	KF_ARG_PTR_TO_KPTR,	       /* PTR_TO_KPTR but type specific */
 	KF_ARG_PTR_TO_DYNPTR,
 	KF_ARG_PTR_TO_ITER,
 	KF_ARG_PTR_TO_LIST_HEAD,
@@ -9666,21 +9660,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 	if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
 
-	if (is_kfunc_arg_kptr_get(meta, argno)) {
-		if (!btf_type_is_ptr(ref_t)) {
-			verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
-			return -EINVAL;
-		}
-		ref_t = btf_type_by_id(meta->btf, ref_t->type);
-		ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
-		if (!btf_type_is_struct(ref_t)) {
-			verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
-				meta->func_name, btf_type_str(ref_t), ref_tname);
-			return -EINVAL;
-		}
-		return KF_ARG_PTR_TO_KPTR;
-	}
-
 	if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
 		return KF_ARG_PTR_TO_DYNPTR;
 
@@ -9794,40 +9773,6 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
 	return 0;
 }
 
-static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
-				      struct bpf_reg_state *reg,
-				      const struct btf_type *ref_t,
-				      const char *ref_tname,
-				      struct bpf_kfunc_call_arg_meta *meta,
-				      int argno)
-{
-	struct btf_field *kptr_field;
-
-	/* check_func_arg_reg_off allows var_off for
-	 * PTR_TO_MAP_VALUE, but we need fixed offset to find
-	 * off_desc.
-	 */
-	if (!tnum_is_const(reg->var_off)) {
-		verbose(env, "arg#0 must have constant offset\n");
-		return -EINVAL;
-	}
-
-	kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
-	if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
-		verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
-			reg->off + reg->var_off.value);
-		return -EINVAL;
-	}
-
-	if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
-				  kptr_field->kptr.btf_id, true)) {
-		verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
-			meta->func_name, argno, btf_type_str(ref_t), ref_tname);
-		return -EINVAL;
-	}
-	return 0;
-}
-
 static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -10315,7 +10260,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			/* Trusted arguments have the same offset checks as release arguments */
 			arg_type |= OBJ_RELEASE;
 			break;
-		case KF_ARG_PTR_TO_KPTR:
 		case KF_ARG_PTR_TO_DYNPTR:
 		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
@@ -10368,15 +10312,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 				meta->arg_obj_drop.btf_id = reg->btf_id;
 			}
 			break;
-		case KF_ARG_PTR_TO_KPTR:
-			if (reg->type != PTR_TO_MAP_VALUE) {
-				verbose(env, "arg#0 expected pointer to map value\n");
-				return -EINVAL;
-			}
-			ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
-			if (ret < 0)
-				return ret;
-			break;
 		case KF_ARG_PTR_TO_DYNPTR:
 		{
 			enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
-- 
cgit v1.2.3


From bd4b28189469492df2b962d737842c311ce2659c Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 14 Apr 2023 17:21:15 -0400
Subject: sctp: delete the obsolete code for the host name address param

In the latest RFC9260, the Host Name Address param has been deprecated.
For INIT chunk:

  Note 3: An INIT chunk MUST NOT contain the Host Name Address
  parameter.  The receiver of an INIT chunk containing a Host Name
  Address parameter MUST send an ABORT chunk and MAY include an
  "Unresolvable Address" error cause.

For Supported Address Types:

  The value indicating the Host Name Address parameter MUST NOT be
  used when sending this parameter and MUST be ignored when receiving
  this parameter.

Currently Linux SCTP doesn't really support Host Name Address param,
but only saves a flag and prints debug info, which actually won't
even be triggered due to the verification in sctp_verify_param().
This patch deletes that dead code.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  1 -
 net/sctp/sm_make_chunk.c   | 10 +---------
 net/sctp/socket.c          |  2 +-
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a0933efd93c3..070c9458fff4 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1711,7 +1711,6 @@ struct sctp_association {
 		__u16	ecn_capable:1,      /* Can peer do ECN? */
 			ipv4_address:1,     /* Peer understands IPv4 addresses? */
 			ipv6_address:1,     /* Peer understands IPv6 addresses? */
-			hostname_address:1, /* Peer understands DNS addresses? */
 			asconf_capable:1,   /* Does peer support ADDIP? */
 			prsctp_capable:1,   /* Can peer do PR-SCTP? */
 			reconf_capable:1,   /* Can peer do RE-CONFIG? */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c7503fd64915..c8f4ec5d5f98 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2207,7 +2207,7 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
 		break;
 
 	case SCTP_PARAM_HOST_NAME_ADDRESS:
-		/* Tell the peer, we won't support this param.  */
+		/* This param has been Deprecated, send ABORT.  */
 		sctp_process_hn_param(asoc, param, chunk, err_chunk);
 		retval = SCTP_IERROR_ABORT;
 		break;
@@ -2589,10 +2589,6 @@ do_addr_param:
 		asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale);
 		break;
 
-	case SCTP_PARAM_HOST_NAME_ADDRESS:
-		pr_debug("%s: unimplemented SCTP_HOST_NAME_ADDRESS\n", __func__);
-		break;
-
 	case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES:
 		/* Turn off the default values first so we'll know which
 		 * ones are really set by the peer.
@@ -2624,10 +2620,6 @@ do_addr_param:
 					asoc->peer.ipv6_address = 1;
 				break;
 
-			case SCTP_PARAM_HOST_NAME_ADDRESS:
-				asoc->peer.hostname_address = 1;
-				break;
-
 			default: /* Just ignore anything else.  */
 				break;
 			}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 218e0982c370..079e726909b4 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5195,7 +5195,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 	mask = asoc->peer.ecn_capable << 1;
 	mask = (mask | asoc->peer.ipv4_address) << 1;
 	mask = (mask | asoc->peer.ipv6_address) << 1;
-	mask = (mask | asoc->peer.hostname_address) << 1;
+	mask = mask << 1;
 	mask = (mask | asoc->peer.asconf_capable) << 1;
 	mask = (mask | asoc->peer.prsctp_capable) << 1;
 	mask = (mask | asoc->peer.auth_capable);
-- 
cgit v1.2.3


From 59e498a3289f685261c076b998a8a2f8a516874f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 17 Apr 2023 15:49:15 +0200
Subject: bpf: Set skb redirect and from_ingress info in __bpf_tx_skb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are some use-cases where it is desirable to use bpf_redirect()
in combination with ifb device, which currently is not supported, for
example, around filtering inbound traffic with BPF to then push it to
ifb which holds the qdisc for shaping in contrast to doing that on the
egress device.

Toke mentions the following case related to OpenWrt:

   Because there's not always a single egress on the other side. These are
   mainly home routers, which tend to have one or more WiFi devices bridged
   to one or more ethernet ports on the LAN side, and a single upstream WAN
   port. And the objective is to control the total amount of traffic going
   over the WAN link (in both directions), to deal with bufferbloat in the
   ISP network (which is sadly still all too prevalent).

   In this setup, the traffic can be split arbitrarily between the links
   on the LAN side, and the only "single bottleneck" is the WAN link. So we
   install both egress and ingress shapers on this, configured to something
   like 95-98% of the true link bandwidth, thus moving the queues into the
   qdisc layer in the router. It's usually necessary to set the ingress
   bandwidth shaper a bit lower than the egress due to being "downstream"
   of the bottleneck link, but it does work surprisingly well.

   We usually use something like a matchall filter to put all ingress
   traffic on the ifb, so doing the redirect from BPF has not been an
   immediate requirement thus far. However, it does seem a bit odd that
   this is not possible, and we do have a BPF-based filter that layers on
   top of this kind of setup, which currently uses u32 as the ingress
   filter and so it could presumably be improved to use BPF instead if
   that was available.

Reported-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reported-by: Yafang Shao <laoar.shao@gmail.com>
Reported-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yafang Shao <laoar.shao@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://git.openwrt.org/?p=project/qosify.git;a=blob;f=README
Link: https://lore.kernel.org/bpf/875y9yzbuy.fsf@toke.dk
Link: https://lore.kernel.org/r/8cebc8b2b6e967e10cbafe2ffd6795050e74accd.1681739137.git.daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 9 +++++++++
 net/core/filter.c      | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 494a23a976b0..9ff2e3d57329 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -5041,6 +5041,15 @@ static inline void skb_reset_redirect(struct sk_buff *skb)
 	skb->redirected = 0;
 }
 
+static inline void skb_set_redirected_noclear(struct sk_buff *skb,
+					      bool from_ingress)
+{
+	skb->redirected = 1;
+#ifdef CONFIG_NET_REDIRECT
+	skb->from_ingress = from_ingress;
+#endif
+}
+
 static inline bool skb_csum_is_sctp(struct sk_buff *skb)
 {
 	return skb->csum_not_inet;
diff --git a/net/core/filter.c b/net/core/filter.c
index df0df59814ae..44fb997434ad 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2122,6 +2122,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 	}
 
 	skb->dev = dev;
+	skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
 	skb_clear_tstamp(skb);
 
 	dev_xmit_recursion_inc();
-- 
cgit v1.2.3


From 1210af3b99561bbe140af8d9a6b16d1bd0ba3fda Mon Sep 17 00:00:00 2001
From: Leon Romanovsky <leonro@nvidia.com>
Date: Thu, 13 Apr 2023 15:29:19 +0300
Subject: net/mlx5e: Add IPsec packet offload tunnel bits

Extend packet reformat types and flow table capabilities with
IPsec packet offload tunnel bits.

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Reviewed-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6c84bf6eec85..20d00e09b168 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -463,9 +463,11 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         max_ft_level[0x8];
 
 	u8         reformat_add_esp_trasport[0x1];
-	u8         reserved_at_41[0x2];
+	u8         reformat_l2_to_l3_esp_tunnel[0x1];
+	u8         reserved_at_42[0x1];
 	u8         reformat_del_esp_trasport[0x1];
-	u8         reserved_at_44[0x2];
+	u8         reformat_l3_esp_tunnel_to_l2[0x1];
+	u8         reserved_at_45[0x1];
 	u8         execute_aso[0x1];
 	u8         reserved_at_47[0x19];
 
@@ -6630,7 +6632,9 @@ enum mlx5_reformat_ctx_type {
 	MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3,
 	MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4,
 	MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5,
+	MLX5_REFORMAT_TYPE_L2_TO_L3_ESP_TUNNEL = 0x6,
 	MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8,
+	MLX5_REFORMAT_TYPE_L3_ESP_TUNNEL_TO_L2 = 0x9,
 	MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb,
 	MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf,
 	MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10,
-- 
cgit v1.2.3


From 15f93f46f31232da863316769182c699e364c45f Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:45 +0300
Subject: net: mscc: ocelot: export a single ocelot_mm_irq()

When the switch emits an IRQ, we don't know what caused it, and we
iterate through all ports to check the MAC Merge status.

Move that iteration inside the ocelot lib; we will change the locking in
a future change and it would be good to encapsulate that lock completely
within the ocelot lib.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  5 +----
 drivers/net/ethernet/mscc/ocelot_mm.c  | 12 ++++++++++--
 include/soc/mscc/ocelot.h              |  2 +-
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index dddb28984bdf..478893c06f56 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -2610,12 +2610,9 @@ static const struct felix_info felix_info_vsc9959 = {
 static irqreturn_t felix_irq_handler(int irq, void *data)
 {
 	struct ocelot *ocelot = (struct ocelot *)data;
-	int port;
 
 	ocelot_get_txtstamp(ocelot);
-
-	for (port = 0; port < ocelot->num_phys_ports; port++)
-		ocelot_port_mm_irq(ocelot, port);
+	ocelot_mm_irq(ocelot);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index 0a8f21ae23f0..ddaf1fb05e48 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -49,7 +49,7 @@ static enum ethtool_mm_verify_status ocelot_mm_verify_status(u32 val)
 	}
 }
 
-void ocelot_port_mm_irq(struct ocelot *ocelot, int port)
+static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct ocelot_mm_state *mm = &ocelot->mm[port];
@@ -91,7 +91,15 @@ void ocelot_port_mm_irq(struct ocelot *ocelot, int port)
 
 	mutex_unlock(&mm->lock);
 }
-EXPORT_SYMBOL_GPL(ocelot_port_mm_irq);
+
+void ocelot_mm_irq(struct ocelot *ocelot)
+{
+	int port;
+
+	for (port = 0; port < ocelot->num_phys_ports; port++)
+		ocelot_mm_update_port_status(ocelot, port);
+}
+EXPORT_SYMBOL_GPL(ocelot_mm_irq);
 
 int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_cfg *cfg,
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 277e6d1f2096..eb8e3935375d 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -1148,7 +1148,7 @@ int ocelot_vcap_policer_add(struct ocelot *ocelot, u32 pol_ix,
 			    struct ocelot_policer *pol);
 int ocelot_vcap_policer_del(struct ocelot *ocelot, u32 pol_ix);
 
-void ocelot_port_mm_irq(struct ocelot *ocelot, int port);
+void ocelot_mm_irq(struct ocelot *ocelot);
 int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_cfg *cfg,
 		       struct netlink_ext_ack *extack);
-- 
cgit v1.2.3


From 3ff468ef987e38740de9ca0a811c55e11bfb2141 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:46 +0300
Subject: net: mscc: ocelot: remove struct ocelot_mm_state :: lock

Unfortunately, the workarounds for the hardware bugs make it pointless
to keep fine-grained locking for the MAC Merge state of each port.

Our vsc9959_cut_through_fwd() implementation requires
ocelot->fwd_domain_lock to be held, in order to serialize with changes
to the bridging domains and to port speed changes (which affect which
ports can be cut-through). Simultaneously, the traffic classes which can
be cut-through cannot be preemptible at the same time, and this will
depend on the MAC Merge layer state (which changes from threaded
interrupt context).

Since vsc9959_cut_through_fwd() would have to hold the mm->lock of all
ports for a correct and race-free implementation with respect to
ocelot_mm_irq(), in practice it means that any time a port's mm->lock is
held, it would potentially block holders of ocelot->fwd_domain_lock.

In the interest of simple locking rules, make all MAC Merge layer state
changes (and preemptible traffic class changes) be serialized by the
ocelot->fwd_domain_lock.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot_mm.c | 20 ++++++++------------
 include/soc/mscc/ocelot.h             |  1 -
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index ddaf1fb05e48..d2df47e6f8f6 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -56,8 +56,6 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 	enum ethtool_mm_verify_status verify_status;
 	u32 val;
 
-	mutex_lock(&mm->lock);
-
 	val = ocelot_port_readl(ocelot_port, DEV_MM_STATUS);
 
 	verify_status = ocelot_mm_verify_status(val);
@@ -88,16 +86,18 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 	}
 
 	ocelot_port_writel(ocelot_port, val, DEV_MM_STATUS);
-
-	mutex_unlock(&mm->lock);
 }
 
 void ocelot_mm_irq(struct ocelot *ocelot)
 {
 	int port;
 
+	mutex_lock(&ocelot->fwd_domain_lock);
+
 	for (port = 0; port < ocelot->num_phys_ports; port++)
 		ocelot_mm_update_port_status(ocelot, port);
+
+	mutex_unlock(&ocelot->fwd_domain_lock);
 }
 EXPORT_SYMBOL_GPL(ocelot_mm_irq);
 
@@ -107,14 +107,11 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	u32 mm_enable = 0, verify_disable = 0, add_frag_size;
-	struct ocelot_mm_state *mm;
 	int err;
 
 	if (!ocelot->mm_supported)
 		return -EOPNOTSUPP;
 
-	mm = &ocelot->mm[port];
-
 	err = ethtool_mm_frag_size_min_to_add(cfg->tx_min_frag_size,
 					      &add_frag_size, extack);
 	if (err)
@@ -129,7 +126,7 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 	if (!cfg->verify_enabled)
 		verify_disable = DEV_MM_CONFIG_VERIF_CONFIG_PRM_VERIFY_DIS;
 
-	mutex_lock(&mm->lock);
+	mutex_lock(&ocelot->fwd_domain_lock);
 
 	ocelot_port_rmwl(ocelot_port, mm_enable,
 			 DEV_MM_CONFIG_ENABLE_CONFIG_MM_TX_ENA |
@@ -148,7 +145,7 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       QSYS_PREEMPTION_CFG,
 		       port);
 
-	mutex_unlock(&mm->lock);
+	mutex_unlock(&ocelot->fwd_domain_lock);
 
 	return 0;
 }
@@ -166,7 +163,7 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 
 	mm = &ocelot->mm[port];
 
-	mutex_lock(&mm->lock);
+	mutex_lock(&ocelot->fwd_domain_lock);
 
 	val = ocelot_port_readl(ocelot_port, DEV_MM_ENABLE_CONFIG);
 	state->pmac_enabled = !!(val & DEV_MM_CONFIG_ENABLE_CONFIG_MM_RX_ENA);
@@ -185,7 +182,7 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 	state->verify_status = mm->verify_status;
 	state->tx_active = mm->tx_active;
 
-	mutex_unlock(&mm->lock);
+	mutex_unlock(&ocelot->fwd_domain_lock);
 
 	return 0;
 }
@@ -209,7 +206,6 @@ int ocelot_mm_init(struct ocelot *ocelot)
 		u32 val;
 
 		mm = &ocelot->mm[port];
-		mutex_init(&mm->lock);
 		ocelot_port = ocelot->ports[port];
 
 		/* Update initial status variable for the
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index eb8e3935375d..9599be6a0a39 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -744,7 +744,6 @@ struct ocelot_mirror {
 };
 
 struct ocelot_mm_state {
-	struct mutex lock;
 	enum ethtool_mm_verify_status verify_status;
 	bool tx_active;
 };
-- 
cgit v1.2.3


From 7bf4a5b071e59f48de8d39dfde07a3a65e7f6488 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:47 +0300
Subject: net: mscc: ocelot: optimize ocelot_mm_irq()

The MAC Merge IRQ of all ports is shared with the PTP TX timestamp IRQ
of all ports, which means that currently, when a PTP TX timestamp is
generated, felix_irq_handler() also polls for the MAC Merge layer status
of all ports, looking for changes. This makes the kernel do more work,
and under certain circumstances may make ptp4l require a
tx_timestamp_timeout argument higher than before.

Changes to the MAC Merge layer status are only to be expected under
certain conditions - its TX direction needs to be enabled - so we can
check early if that is the case, and omit register access otherwise.

Make ocelot_mm_update_port_status() skip register access if
mm->tx_enabled is unset, and also call it once more, outside IRQ
context, from ocelot_port_set_mm(), when mm->tx_enabled transitions from
true to false, because an IRQ is also expected in that case.

Also, a port may have its MAC Merge layer enabled but it may not have
generated the interrupt. In that case, there's no point in writing to
DEV_MM_STATUS to acknowledge that IRQ. We can reduce the number of
register writes per port with MM enabled by keeping an "ack" variable
which writes the "write-one-to-clear" bits. Those are 3 in number:
PRMPT_ACTIVE_STICKY, UNEXP_RX_PFRM_STICKY and UNEXP_TX_PFRM_STICKY.
The other fields in DEV_MM_STATUS are read-only and it doesn't matter
what is written to them, so writing zero is just fine.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mscc/ocelot_mm.c | 30 ++++++++++++++++++++++++++++--
 include/soc/mscc/ocelot.h             |  1 +
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index d2df47e6f8f6..ce6429d46814 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -54,7 +54,10 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	struct ocelot_mm_state *mm = &ocelot->mm[port];
 	enum ethtool_mm_verify_status verify_status;
-	u32 val;
+	u32 val, ack = 0;
+
+	if (!mm->tx_enabled)
+		return;
 
 	val = ocelot_port_readl(ocelot_port, DEV_MM_STATUS);
 
@@ -71,21 +74,28 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 
 		dev_dbg(ocelot->dev, "Port %d TX preemption %s\n",
 			port, mm->tx_active ? "active" : "inactive");
+
+		ack |= DEV_MM_STAT_MM_STATUS_PRMPT_ACTIVE_STICKY;
 	}
 
 	if (val & DEV_MM_STAT_MM_STATUS_UNEXP_RX_PFRM_STICKY) {
 		dev_err(ocelot->dev,
 			"Unexpected P-frame received on port %d while verification was unsuccessful or not yet verified\n",
 			port);
+
+		ack |= DEV_MM_STAT_MM_STATUS_UNEXP_RX_PFRM_STICKY;
 	}
 
 	if (val & DEV_MM_STAT_MM_STATUS_UNEXP_TX_PFRM_STICKY) {
 		dev_err(ocelot->dev,
 			"Unexpected P-frame requested to be transmitted on port %d while verification was unsuccessful or not yet verified, or MM_TX_ENA=0\n",
 			port);
+
+		ack |= DEV_MM_STAT_MM_STATUS_UNEXP_TX_PFRM_STICKY;
 	}
 
-	ocelot_port_writel(ocelot_port, val, DEV_MM_STATUS);
+	if (ack)
+		ocelot_port_writel(ocelot_port, ack, DEV_MM_STATUS);
 }
 
 void ocelot_mm_irq(struct ocelot *ocelot)
@@ -107,11 +117,14 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
 	u32 mm_enable = 0, verify_disable = 0, add_frag_size;
+	struct ocelot_mm_state *mm;
 	int err;
 
 	if (!ocelot->mm_supported)
 		return -EOPNOTSUPP;
 
+	mm = &ocelot->mm[port];
+
 	err = ethtool_mm_frag_size_min_to_add(cfg->tx_min_frag_size,
 					      &add_frag_size, extack);
 	if (err)
@@ -145,6 +158,19 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       QSYS_PREEMPTION_CFG,
 		       port);
 
+	/* The switch will emit an IRQ when TX is disabled, to notify that it
+	 * has become inactive. We optimize ocelot_mm_update_port_status() to
+	 * not bother processing MM IRQs at all for ports with TX disabled,
+	 * but we need to ACK this IRQ now, while mm->tx_enabled is still set,
+	 * otherwise we get an IRQ storm.
+	 */
+	if (mm->tx_enabled && !cfg->tx_enabled) {
+		ocelot_mm_update_port_status(ocelot, port);
+		WARN_ON(mm->tx_active);
+	}
+
+	mm->tx_enabled = cfg->tx_enabled;
+
 	mutex_unlock(&ocelot->fwd_domain_lock);
 
 	return 0;
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 9599be6a0a39..ee8d43dc5c06 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -745,6 +745,7 @@ struct ocelot_mirror {
 
 struct ocelot_mm_state {
 	enum ethtool_mm_verify_status verify_status;
+	bool tx_enabled;
 	bool tx_active;
 };
 
-- 
cgit v1.2.3


From aac80140dc31963d818a65a522c2e2da81979857 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:49 +0300
Subject: net: mscc: ocelot: add support for mqprio offload

This doesn't apply anything to hardware and in general doesn't do
anything that the software variant doesn't do, except for checking that
there isn't more than 1 TXQ per TC (TXQs for a DSA switch are a dubious
concept anyway). The reason we add this is to be able to parse one more
field added to struct tc_mqprio_qopt_offload, namely preemptible_tcs.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  9 ++++++
 drivers/net/ethernet/mscc/ocelot.c     | 50 ++++++++++++++++++++++++++++++++++
 include/soc/mscc/ocelot.h              |  4 +++
 3 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index 478893c06f56..66ec2740e3cb 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -1612,6 +1612,13 @@ static int vsc9959_qos_port_cbs_set(struct dsa_switch *ds, int port,
 static int vsc9959_qos_query_caps(struct tc_query_caps_base *base)
 {
 	switch (base->type) {
+	case TC_SETUP_QDISC_MQPRIO: {
+		struct tc_mqprio_caps *caps = base->caps;
+
+		caps->validate_queue_counts = true;
+
+		return 0;
+	}
 	case TC_SETUP_QDISC_TAPRIO: {
 		struct tc_taprio_caps *caps = base->caps;
 
@@ -1635,6 +1642,8 @@ static int vsc9959_port_setup_tc(struct dsa_switch *ds, int port,
 		return vsc9959_qos_query_caps(type_data);
 	case TC_SETUP_QDISC_TAPRIO:
 		return vsc9959_qos_port_tas_set(ocelot, port, type_data);
+	case TC_SETUP_QDISC_MQPRIO:
+		return ocelot_port_mqprio(ocelot, port, type_data);
 	case TC_SETUP_QDISC_CBS:
 		return vsc9959_qos_port_cbs_set(ds, port, type_data);
 	default:
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 1502bb2c8ea7..8dc5fb1bc61b 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -8,6 +8,7 @@
 #include <linux/if_bridge.h>
 #include <linux/iopoll.h>
 #include <linux/phy/phy.h>
+#include <net/pkt_sched.h>
 #include <soc/mscc/ocelot_hsio.h>
 #include <soc/mscc/ocelot_vcap.h>
 #include "ocelot.h"
@@ -2699,6 +2700,55 @@ void ocelot_port_mirror_del(struct ocelot *ocelot, int from, bool ingress)
 }
 EXPORT_SYMBOL_GPL(ocelot_port_mirror_del);
 
+static void ocelot_port_reset_mqprio(struct ocelot *ocelot, int port)
+{
+	struct net_device *dev = ocelot->ops->port_to_netdev(ocelot, port);
+
+	netdev_reset_tc(dev);
+}
+
+int ocelot_port_mqprio(struct ocelot *ocelot, int port,
+		       struct tc_mqprio_qopt_offload *mqprio)
+{
+	struct net_device *dev = ocelot->ops->port_to_netdev(ocelot, port);
+	struct netlink_ext_ack *extack = mqprio->extack;
+	struct tc_mqprio_qopt *qopt = &mqprio->qopt;
+	int num_tc = qopt->num_tc;
+	int tc, err;
+
+	if (!num_tc) {
+		ocelot_port_reset_mqprio(ocelot, port);
+		return 0;
+	}
+
+	err = netdev_set_num_tc(dev, num_tc);
+	if (err)
+		return err;
+
+	for (tc = 0; tc < num_tc; tc++) {
+		if (qopt->count[tc] != 1) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Only one TXQ per TC supported");
+			return -EINVAL;
+		}
+
+		err = netdev_set_tc_queue(dev, tc, 1, qopt->offset[tc]);
+		if (err)
+			goto err_reset_tc;
+	}
+
+	err = netif_set_real_num_tx_queues(dev, num_tc);
+	if (err)
+		goto err_reset_tc;
+
+	return 0;
+
+err_reset_tc:
+	ocelot_port_reset_mqprio(ocelot, port);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ocelot_port_mqprio);
+
 void ocelot_init_port(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index ee8d43dc5c06..9596c79e9223 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -11,6 +11,8 @@
 #include <linux/regmap.h>
 #include <net/dsa.h>
 
+struct tc_mqprio_qopt_offload;
+
 /* Port Group IDs (PGID) are masks of destination ports.
  *
  * For L2 forwarding, the switch performs 3 lookups in the PGID table for each
@@ -1154,6 +1156,8 @@ int ocelot_port_set_mm(struct ocelot *ocelot, int port,
 		       struct netlink_ext_ack *extack);
 int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_state *state);
+int ocelot_port_mqprio(struct ocelot *ocelot, int port,
+		       struct tc_mqprio_qopt_offload *mqprio);
 
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
 int ocelot_mrp_add(struct ocelot *ocelot, int port,
-- 
cgit v1.2.3


From 403ffc2c34de5297d007e0e169bf022094d444c2 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Sat, 15 Apr 2023 20:05:51 +0300
Subject: net: mscc: ocelot: add support for preemptible traffic classes

In order to not transmit (preemptible) frames which will be received by
the link partner as corrupted (because it doesn't support FP), the
hardware requires the driver to program the QSYS_PREEMPTION_CFG_P_QUEUES
register only after the MAC Merge layer becomes active (verification
succeeds, or was disabled).

There are some cases when FP is known (through experimentation) to be
broken. Give priority to FP over cut-through switching, and disable FP
for known broken link modes.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/ocelot/felix_vsc9959.c |  7 +++--
 drivers/net/ethernet/mscc/ocelot.c     | 10 ++++++-
 drivers/net/ethernet/mscc/ocelot.h     |  3 ++
 drivers/net/ethernet/mscc/ocelot_mm.c  | 54 ++++++++++++++++++++++++++++++++++
 include/soc/mscc/ocelot.h              |  3 ++
 5 files changed, 74 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c
index e055b3980ccc..cfb3faeaa5bf 100644
--- a/drivers/net/dsa/ocelot/felix_vsc9959.c
+++ b/drivers/net/dsa/ocelot/felix_vsc9959.c
@@ -2519,6 +2519,7 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot)
 
 	for (port = 0; port < ocelot->num_phys_ports; port++) {
 		struct ocelot_port *ocelot_port = ocelot->ports[port];
+		struct ocelot_mm_state *mm = &ocelot->mm[port];
 		int min_speed = ocelot_port->speed;
 		unsigned long mask = 0;
 		u32 tmp, val = 0;
@@ -2559,10 +2560,12 @@ static void vsc9959_cut_through_fwd(struct ocelot *ocelot)
 
 		/* Enable cut-through forwarding for all traffic classes that
 		 * don't have oversized dropping enabled, since this check is
-		 * bypassed in cut-through mode.
+		 * bypassed in cut-through mode. Also exclude preemptible
+		 * traffic classes, since these would hang the port for some
+		 * reason, if sent as cut-through.
 		 */
 		if (ocelot_port->speed == min_speed) {
-			val = GENMASK(7, 0);
+			val = GENMASK(7, 0) & ~mm->active_preemptible_tcs;
 
 			for (tc = 0; tc < OCELOT_NUM_TC; tc++)
 				if (vsc9959_port_qmaxsdu_get(ocelot, port, tc))
diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
index 8dc5fb1bc61b..1f5f00b30441 100644
--- a/drivers/net/ethernet/mscc/ocelot.c
+++ b/drivers/net/ethernet/mscc/ocelot.c
@@ -1006,7 +1006,12 @@ void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port,
 	 */
 	if (ocelot->ops->cut_through_fwd) {
 		mutex_lock(&ocelot->fwd_domain_lock);
-		ocelot->ops->cut_through_fwd(ocelot);
+		/* Workaround for hardware bug - FP doesn't work
+		 * at all link speeds for all PHY modes. The function
+		 * below also calls ocelot->ops->cut_through_fwd(),
+		 * so we don't need to do it twice.
+		 */
+		ocelot_port_update_active_preemptible_tcs(ocelot, port);
 		mutex_unlock(&ocelot->fwd_domain_lock);
 	}
 
@@ -2705,6 +2710,7 @@ static void ocelot_port_reset_mqprio(struct ocelot *ocelot, int port)
 	struct net_device *dev = ocelot->ops->port_to_netdev(ocelot, port);
 
 	netdev_reset_tc(dev);
+	ocelot_port_change_fp(ocelot, port, 0);
 }
 
 int ocelot_port_mqprio(struct ocelot *ocelot, int port,
@@ -2741,6 +2747,8 @@ int ocelot_port_mqprio(struct ocelot *ocelot, int port,
 	if (err)
 		goto err_reset_tc;
 
+	ocelot_port_change_fp(ocelot, port, mqprio->preemptible_tcs);
+
 	return 0;
 
 err_reset_tc:
diff --git a/drivers/net/ethernet/mscc/ocelot.h b/drivers/net/ethernet/mscc/ocelot.h
index d920ca930690..14440a3b04c3 100644
--- a/drivers/net/ethernet/mscc/ocelot.h
+++ b/drivers/net/ethernet/mscc/ocelot.h
@@ -119,6 +119,9 @@ int ocelot_stats_init(struct ocelot *ocelot);
 void ocelot_stats_deinit(struct ocelot *ocelot);
 
 int ocelot_mm_init(struct ocelot *ocelot);
+void ocelot_port_change_fp(struct ocelot *ocelot, int port,
+			   unsigned long preemptible_tcs);
+void ocelot_port_update_active_preemptible_tcs(struct ocelot *ocelot, int port);
 
 extern struct notifier_block ocelot_netdevice_nb;
 extern struct notifier_block ocelot_switchdev_nb;
diff --git a/drivers/net/ethernet/mscc/ocelot_mm.c b/drivers/net/ethernet/mscc/ocelot_mm.c
index 3e458f72f645..fb3145118d68 100644
--- a/drivers/net/ethernet/mscc/ocelot_mm.c
+++ b/drivers/net/ethernet/mscc/ocelot_mm.c
@@ -49,6 +49,59 @@ static enum ethtool_mm_verify_status ocelot_mm_verify_status(u32 val)
 	}
 }
 
+void ocelot_port_update_active_preemptible_tcs(struct ocelot *ocelot, int port)
+{
+	struct ocelot_port *ocelot_port = ocelot->ports[port];
+	struct ocelot_mm_state *mm = &ocelot->mm[port];
+	u32 val = 0;
+
+	lockdep_assert_held(&ocelot->fwd_domain_lock);
+
+	/* Only commit preemptible TCs when MAC Merge is active.
+	 * On NXP LS1028A, when using QSGMII, the port hangs if transmitting
+	 * preemptible frames at any other link speed than gigabit, so avoid
+	 * preemption at lower speeds in this PHY mode.
+	 */
+	if ((ocelot_port->phy_mode != PHY_INTERFACE_MODE_QSGMII ||
+	     ocelot_port->speed == SPEED_1000) && mm->tx_active)
+		val = mm->preemptible_tcs;
+
+	/* Cut through switching doesn't work for preemptible priorities,
+	 * so first make sure it is disabled.
+	 */
+	mm->active_preemptible_tcs = val;
+	ocelot->ops->cut_through_fwd(ocelot);
+
+	dev_dbg(ocelot->dev,
+		"port %d %s/%s, MM TX %s, preemptible TCs 0x%x, active 0x%x\n",
+		port, phy_modes(ocelot_port->phy_mode),
+		phy_speed_to_str(ocelot_port->speed),
+		mm->tx_active ? "active" : "inactive", mm->preemptible_tcs,
+		mm->active_preemptible_tcs);
+
+	ocelot_rmw_rix(ocelot, QSYS_PREEMPTION_CFG_P_QUEUES(val),
+		       QSYS_PREEMPTION_CFG_P_QUEUES_M,
+		       QSYS_PREEMPTION_CFG, port);
+}
+
+void ocelot_port_change_fp(struct ocelot *ocelot, int port,
+			   unsigned long preemptible_tcs)
+{
+	struct ocelot_mm_state *mm = &ocelot->mm[port];
+
+	mutex_lock(&ocelot->fwd_domain_lock);
+
+	if (mm->preemptible_tcs == preemptible_tcs)
+		goto out_unlock;
+
+	mm->preemptible_tcs = preemptible_tcs;
+
+	ocelot_port_update_active_preemptible_tcs(ocelot, port);
+
+out_unlock:
+	mutex_unlock(&ocelot->fwd_domain_lock);
+}
+
 static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 {
 	struct ocelot_port *ocelot_port = ocelot->ports[port];
@@ -74,6 +127,7 @@ static void ocelot_mm_update_port_status(struct ocelot *ocelot, int port)
 
 		dev_dbg(ocelot->dev, "Port %d TX preemption %s\n",
 			port, mm->tx_active ? "active" : "inactive");
+		ocelot_port_update_active_preemptible_tcs(ocelot, port);
 
 		ack |= DEV_MM_STAT_MM_STATUS_PRMPT_ACTIVE_STICKY;
 	}
diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
index 9596c79e9223..cb8fbb241879 100644
--- a/include/soc/mscc/ocelot.h
+++ b/include/soc/mscc/ocelot.h
@@ -749,6 +749,8 @@ struct ocelot_mm_state {
 	enum ethtool_mm_verify_status verify_status;
 	bool tx_enabled;
 	bool tx_active;
+	u8 preemptible_tcs;
+	u8 active_preemptible_tcs;
 };
 
 struct ocelot_port;
@@ -1158,6 +1160,7 @@ int ocelot_port_get_mm(struct ocelot *ocelot, int port,
 		       struct ethtool_mm_state *state);
 int ocelot_port_mqprio(struct ocelot *ocelot, int port,
 		       struct tc_mqprio_qopt_offload *mqprio);
+void ocelot_port_update_preemptible_tcs(struct ocelot *ocelot, int port);
 
 #if IS_ENABLED(CONFIG_BRIDGE_MRP)
 int ocelot_mrp_add(struct ocelot *ocelot, int port,
-- 
cgit v1.2.3


From cb18e5595df7f4e01d0dd4a0f9c4e71b68ae351e Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 17 Apr 2023 11:36:15 +0200
Subject: net: add macro netif_subqueue_completed_wake

Add netif_subqueue_completed_wake, complementing the subqueue versions
netif_subqueue_try_stop and netif_subqueue_maybe_stop.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/netdev_queues.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index b26fdb441e39..d68b0a483431 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -160,4 +160,14 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue,
 		netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
 	})
 
+#define netif_subqueue_completed_wake(dev, idx, pkts, bytes,		\
+				      get_desc, start_thrs)		\
+	({								\
+		struct netdev_queue *txq;				\
+									\
+		txq = netdev_get_tx_queue(dev, idx);			\
+		netif_txq_completed_wake(txq, pkts, bytes,		\
+					 get_desc, start_thrs);		\
+	})
+
 #endif
-- 
cgit v1.2.3


From 9ae708f00161e1d789268fa9cc05bf6bec2af474 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 17 Apr 2023 15:37:51 +0200
Subject: wifi: mac80211: remove ieee80211_tx_status_8023

It is unused and should not be used. In order to avoid limitations in
4-address mode, the driver should always use ieee80211_tx_status_ext for
802.3 frames with a valid sta pointer.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Link: https://lore.kernel.org/r/20230417133751.79160-1-nbd@nbd.name
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 20 --------------------
 net/mac80211/status.c  | 24 ------------------------
 2 files changed, 44 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a8dadbd83d95..ac0370e76874 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5215,26 +5215,6 @@ static inline void ieee80211_tx_status_ni(struct ieee80211_hw *hw,
 void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw,
 				 struct sk_buff *skb);
 
-/**
- * ieee80211_tx_status_8023 - transmit status callback for 802.3 frame format
- *
- * Call this function for all transmitted data frames after their transmit
- * completion. This callback should only be called for data frames which
- * are using driver's (or hardware's) offload capability of encap/decap
- * 802.11 frames.
- *
- * This function may not be called in IRQ context. Calls to this function
- * for a single hardware must be synchronized against each other and all
- * calls in the same tx status family.
- *
- * @hw: the hardware the frame was transmitted by
- * @vif: the interface for which the frame was transmitted
- * @skb: the frame that was transmitted, owned by mac80211 after this call
- */
-void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
-			       struct ieee80211_vif *vif,
-			       struct sk_buff *skb);
-
 /**
  * ieee80211_report_low_ack - report non-responding station
  *
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 3f9ddd7f04b6..2b13a52ce96c 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -1244,30 +1244,6 @@ void ieee80211_tx_rate_update(struct ieee80211_hw *hw,
 }
 EXPORT_SYMBOL(ieee80211_tx_rate_update);
 
-void ieee80211_tx_status_8023(struct ieee80211_hw *hw,
-			      struct ieee80211_vif *vif,
-			      struct sk_buff *skb)
-{
-	struct ieee80211_sub_if_data *sdata;
-	struct ieee80211_tx_status status = {
-		.skb = skb,
-		.info = IEEE80211_SKB_CB(skb),
-	};
-	struct sta_info *sta;
-
-	sdata = vif_to_sdata(vif);
-
-	rcu_read_lock();
-
-	if (!ieee80211_lookup_ra_sta(sdata, skb, &sta) && !IS_ERR(sta))
-		status.sta = &sta->sta;
-
-	ieee80211_tx_status_ext(hw, &status);
-
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL(ieee80211_tx_status_8023);
-
 void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets)
 {
 	struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
-- 
cgit v1.2.3


From 980f0799a15c75403f1f9284a32b6056b9660144 Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 18 Apr 2023 11:48:41 +0800
Subject: bonding: add software tx timestamping support

Currently, bonding only obtains the timestamp (ts) information of
the active slave, which is available only for modes 1, 5, and 6.
For other modes, bonding only has software rx timestamping support.

However, some users who use modes such as LACP also want tx timestamp
support. To address this issue, let's check the ts information of each
slave. If all slaves support tx timestamping, we can enable tx
timestamping support for the bond.

Add a note in ethtool.h that get_ts_info may be called with RCU, or rtnl,
or a reference on the device.

Suggested-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
Link: https://lore.kernel.org/r/20230418034841.2566262-1-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c | 30 ++++++++++++++++++++++++++++++
 include/linux/ethtool.h         |  1 +
 2 files changed, 31 insertions(+)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8cc9a74789b7..db7e650d9ebb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -5696,9 +5696,13 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 				    struct ethtool_ts_info *info)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
+	struct ethtool_ts_info ts_info;
 	const struct ethtool_ops *ops;
 	struct net_device *real_dev;
+	bool sw_tx_support = false;
 	struct phy_device *phydev;
+	struct list_head *iter;
+	struct slave *slave;
 	int ret = 0;
 
 	rcu_read_lock();
@@ -5717,10 +5721,36 @@ static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
 			ret = ops->get_ts_info(real_dev, info);
 			goto out;
 		}
+	} else {
+		/* Check if all slaves support software tx timestamping */
+		rcu_read_lock();
+		bond_for_each_slave_rcu(bond, slave, iter) {
+			ret = -1;
+			ops = slave->dev->ethtool_ops;
+			phydev = slave->dev->phydev;
+
+			if (phy_has_tsinfo(phydev))
+				ret = phy_ts_info(phydev, &ts_info);
+			else if (ops->get_ts_info)
+				ret = ops->get_ts_info(slave->dev, &ts_info);
+
+			if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) {
+				sw_tx_support = true;
+				continue;
+			}
+
+			sw_tx_support = false;
+			break;
+		}
+		rcu_read_unlock();
 	}
 
+	ret = 0;
 	info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
 				SOF_TIMESTAMPING_SOFTWARE;
+	if (sw_tx_support)
+		info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+
 	info->phc_index = -1;
 
 out:
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 798d35890118..62b61527bcc4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -711,6 +711,7 @@ struct ethtool_mm_stats {
  * @get_dump_data: Get dump data.
  * @set_dump: Set dump specific flags to the device.
  * @get_ts_info: Get the time stamping and PTP hardware clock capabilities.
+ *	It may be called with RCU, or rtnl or reference on the device.
  *	Drivers supporting transmit time stamps in software should set this to
  *	ethtool_op_get_ts_info().
  * @get_module_info: Get the size and type of the eeprom contained within
-- 
cgit v1.2.3


From 73db1b8f2bb6725b7391e85aab41fdf592b3c0c1 Mon Sep 17 00:00:00 2001
From: Tzung-Bi Shih <tzungbi@kernel.org>
Date: Wed, 19 Apr 2023 13:15:26 +0800
Subject: netfilter: conntrack: fix wrong ct->timeout value

(struct nf_conn)->timeout is an interval before the conntrack is
confirmed.  After confirmation, it becomes a timestamp.

The following is observed for the timeout of an unconfirmed conntrack:
- When it is set by calling ctnetlink_change_timeout(),
  `nfct_time_stamp` is wrongly added to `ct->timeout` twice.
- When it is read by calling ctnetlink_dump_timeout(),
  `nfct_time_stamp` is wrongly subtracted.

Call Trace:
 <TASK>
 dump_stack_lvl
 ctnetlink_dump_timeout
 __ctnetlink_glue_build
 ctnetlink_glue_build
 __nfqnl_enqueue_packet
 nf_queue
 nf_hook_slow
 ip_mc_output
 ? __pfx_ip_finish_output
 ip_send_skb
 ? __pfx_dst_output
 udp_send_skb
 udp_sendmsg
 ? __pfx_ip_generic_getfrag
 sock_sendmsg

Separate the 2 cases in:
- Setting `ct->timeout` in __nf_ct_set_timeout().
- Getting `ct->timeout` in ctnetlink_dump_timeout().

Pablo appends:

Update ctnetlink to set up the timeout _after_ the IPS_CONFIRMED flag is
set on, otherwise conntrack creation via ctnetlink breaks.

Note that the problem described in this patch has existed since the
introduction of the nfnetlink_queue conntrack support; select a
sufficiently old Fixes: tag for -stable kernels to pick up this fix.

Fixes: a4b4766c3ceb ("netfilter: nfnetlink_queue: rename related to nfqueue attaching conntrack info")
Signed-off-by: Tzung-Bi Shih <tzungbi@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h |  6 +++++-
 net/netfilter/nf_conntrack_netlink.c      | 13 +++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 71d1269fe4d4..3384859a8921 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -89,7 +89,11 @@ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
 {
 	if (timeout > INT_MAX)
 		timeout = INT_MAX;
-	WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
+
+	if (nf_ct_is_confirmed(ct))
+		WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout);
+	else
+		ct->timeout = (u32)timeout;
 }
 
 int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index d3ee18854698..6f3b23a6653c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -176,7 +176,12 @@ nla_put_failure:
 static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct,
 				  bool skip_zero)
 {
-	long timeout = nf_ct_expires(ct) / HZ;
+	long timeout;
+
+	if (nf_ct_is_confirmed(ct))
+		timeout = nf_ct_expires(ct) / HZ;
+	else
+		timeout = ct->timeout / HZ;
 
 	if (skip_zero && timeout == 0)
 		return 0;
@@ -2253,9 +2258,6 @@ ctnetlink_create_conntrack(struct net *net,
 	if (!cda[CTA_TIMEOUT])
 		goto err1;
 
-	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
-	__nf_ct_set_timeout(ct, timeout);
-
 	rcu_read_lock();
  	if (cda[CTA_HELP]) {
 		char *helpname = NULL;
@@ -2319,6 +2321,9 @@ ctnetlink_create_conntrack(struct net *net,
 	/* we must add conntrack extensions before confirmation. */
 	ct->status |= IPS_CONFIRMED;
 
+	timeout = (u64)ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ;
+	__nf_ct_set_timeout(ct, timeout);
+
 	if (cda[CTA_STATUS]) {
 		err = ctnetlink_change_status(ct, cda);
 		if (err < 0)
-- 
cgit v1.2.3


From e5029edd53937a29801ef507cee12e657ff31ea9 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:26 +0200
Subject: leds: Provide stubs for when CLASS_LED & NEW_LEDS are disabled

Provide stubs for devm_led_classdev_register_ext() and
led_init_default_state_get() so that LED drivers embedded within other
drivers such as PHYs and Ethernet switches still build when LEDS_CLASS
or NEW_LEDS are disabled. This also helps with Kconfig dependencies,
which are somewhat hairy for phylib and mdio and only get worse when
adding a dependency on LEDS_CLASS.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/leds.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/linux/leds.h b/include/linux/leds.h
index d71201a968b6..aa48e643f655 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -82,7 +82,15 @@ struct led_init_data {
 	bool devname_mandatory;
 };
 
+#if IS_ENABLED(CONFIG_NEW_LEDS)
 enum led_default_state led_init_default_state_get(struct fwnode_handle *fwnode);
+#else
+static inline enum led_default_state
+led_init_default_state_get(struct fwnode_handle *fwnode)
+{
+	return LEDS_DEFSTATE_OFF;
+}
+#endif
 
 struct led_hw_trigger_type {
 	int dummy;
@@ -217,9 +225,19 @@ static inline int led_classdev_register(struct device *parent,
 	return led_classdev_register_ext(parent, led_cdev, NULL);
 }
 
+#if IS_ENABLED(CONFIG_LEDS_CLASS)
 int devm_led_classdev_register_ext(struct device *parent,
 					  struct led_classdev *led_cdev,
 					  struct led_init_data *init_data);
+#else
+static inline int
+devm_led_classdev_register_ext(struct device *parent,
+			       struct led_classdev *led_cdev,
+			       struct led_init_data *init_data)
+{
+	return 0;
+}
+#endif
 
 static inline int devm_led_classdev_register(struct device *parent,
 					     struct led_classdev *led_cdev)
-- 
cgit v1.2.3


From 01e5b728e9e43ae444e0369695a5f72209906464 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:27 +0200
Subject: net: phy: Add a binding for PHY LEDs

Define common binding parsing for all PHY drivers with LEDs using
phylib. Parse the DT as part of the phy_probe and add LEDs to the
linux LED class infrastructure. For the moment, provide a dummy
brightness function, which will later be replaced with a call into the
PHY driver. This allows testing since the LED core might otherwise
reject an LED whose brightness cannot be set.

Add a dependency on LEDS_CLASS. It either needs to be built in, or not
enabled, since a modular build can result in linker errors.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/Kconfig      |  1 +
 drivers/net/phy/phy_device.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/phy.h          | 16 ++++++++++
 3 files changed, 93 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 6b9525def973..b8cc49820ced 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -18,6 +18,7 @@ menuconfig PHYLIB
 	depends on NETDEVICES
 	select MDIO_DEVICE
 	select MDIO_DEVRES
+	depends on LEDS_CLASS || LEDS_CLASS=n
 	help
 	  Ethernet controllers are usually attached to PHY
 	  devices.  This option provides infrastructure for
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 917ba84105fc..61b971251de5 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -19,10 +19,12 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/list.h>
 #include <linux/mdio.h>
 #include <linux/mii.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/of.h>
 #include <linux/netdevice.h>
 #include <linux/phy.h>
 #include <linux/phy_led_triggers.h>
@@ -674,6 +676,7 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id,
 	device_initialize(&mdiodev->dev);
 
 	dev->state = PHY_DOWN;
+	INIT_LIST_HEAD(&dev->leds);
 
 	mutex_init(&dev->lock);
 	INIT_DELAYED_WORK(&dev->state_queue, phy_state_machine);
@@ -2988,6 +2991,74 @@ static bool phy_drv_supports_irq(struct phy_driver *phydrv)
 	return phydrv->config_intr && phydrv->handle_interrupt;
 }
 
+/* Dummy implementation until calls into PHY driver are added */
+static int phy_led_set_brightness(struct led_classdev *led_cdev,
+				  enum led_brightness value)
+{
+	return 0;
+}
+
+static int of_phy_led(struct phy_device *phydev,
+		      struct device_node *led)
+{
+	struct device *dev = &phydev->mdio.dev;
+	struct led_init_data init_data = {};
+	struct led_classdev *cdev;
+	struct phy_led *phyled;
+	int err;
+
+	phyled = devm_kzalloc(dev, sizeof(*phyled), GFP_KERNEL);
+	if (!phyled)
+		return -ENOMEM;
+
+	cdev = &phyled->led_cdev;
+
+	err = of_property_read_u8(led, "reg", &phyled->index);
+	if (err)
+		return err;
+
+	cdev->brightness_set_blocking = phy_led_set_brightness;
+	cdev->max_brightness = 1;
+	init_data.devicename = dev_name(&phydev->mdio.dev);
+	init_data.fwnode = of_fwnode_handle(led);
+	init_data.devname_mandatory = true;
+
+	err = devm_led_classdev_register_ext(dev, cdev, &init_data);
+	if (err)
+		return err;
+
+	list_add(&phyled->list, &phydev->leds);
+
+	return 0;
+}
+
+static int of_phy_leds(struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	struct device_node *leds, *led;
+	int err;
+
+	if (!IS_ENABLED(CONFIG_OF_MDIO))
+		return 0;
+
+	if (!node)
+		return 0;
+
+	leds = of_get_child_by_name(node, "leds");
+	if (!leds)
+		return 0;
+
+	for_each_available_child_of_node(leds, led) {
+		err = of_phy_led(phydev, led);
+		if (err) {
+			of_node_put(led);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
 /**
  * fwnode_mdio_find_device - Given a fwnode, find the mdio_device
  * @fwnode: pointer to the mdio_device's fwnode
@@ -3183,6 +3254,11 @@ static int phy_probe(struct device *dev)
 	/* Set the state to READY by default */
 	phydev->state = PHY_READY;
 
+	/* Get the LEDs from the device tree, and instantiate standard
+	 * LEDs for them.
+	 */
+	err = of_phy_leds(phydev);
+
 out:
 	/* Re-assert the reset signal on error */
 	if (err)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 2f83cfc206e5..bd6b5e9bb729 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -14,6 +14,7 @@
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
+#include <linux/leds.h>
 #include <linux/linkmode.h>
 #include <linux/netlink.h>
 #include <linux/mdio.h>
@@ -600,6 +601,7 @@ struct macsec_ops;
  * @phy_num_led_triggers: Number of triggers in @phy_led_triggers
  * @led_link_trigger: LED trigger for link up/down
  * @last_triggered: last LED trigger for link speed
+ * @leds: list of PHY LED structures
  * @master_slave_set: User requested master/slave configuration
  * @master_slave_get: Current master/slave advertisement
  * @master_slave_state: Current master/slave configuration
@@ -699,6 +701,7 @@ struct phy_device {
 
 	struct phy_led_trigger *led_link_trigger;
 #endif
+	struct list_head leds;
 
 	/*
 	 * Interrupt number for this PHY
@@ -834,6 +837,19 @@ struct phy_plca_status {
 	bool pst;
 };
 
+/**
+ * struct phy_led: An LED driven by the PHY
+ *
+ * @list: List of LEDs
+ * @led_cdev: Standard LED class structure
+ * @index: Number of the LED
+ */
+struct phy_led {
+	struct list_head list;
+	struct led_classdev led_cdev;
+	u8 index;
+};
+
 /**
  * struct phy_driver - Driver structure for a particular PHY type
  *
-- 
cgit v1.2.3


From 684818189b04b095b34964ed4a3ea5249a840eab Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:28 +0200
Subject: net: phy: phy_device: Call into the PHY driver to set LED brightness

Linux LEDs can be software controlled via the brightness file in /sys.
LED drivers need to implement a brightness_set function which the core
will call. Implement an intermediary in phy_device, which will call
into the phy driver if it implements the necessary function.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 15 ++++++++++++---
 include/linux/phy.h          | 13 +++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 61b971251de5..5c1200160c51 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -2991,11 +2991,18 @@ static bool phy_drv_supports_irq(struct phy_driver *phydrv)
 	return phydrv->config_intr && phydrv->handle_interrupt;
 }
 
-/* Dummy implementation until calls into PHY driver are added */
 static int phy_led_set_brightness(struct led_classdev *led_cdev,
 				  enum led_brightness value)
 {
-	return 0;
+	struct phy_led *phyled = to_phy_led(led_cdev);
+	struct phy_device *phydev = phyled->phydev;
+	int err;
+
+	mutex_lock(&phydev->lock);
+	err = phydev->drv->led_brightness_set(phydev, phyled->index, value);
+	mutex_unlock(&phydev->lock);
+
+	return err;
 }
 
 static int of_phy_led(struct phy_device *phydev,
@@ -3012,12 +3019,14 @@ static int of_phy_led(struct phy_device *phydev,
 		return -ENOMEM;
 
 	cdev = &phyled->led_cdev;
+	phyled->phydev = phydev;
 
 	err = of_property_read_u8(led, "reg", &phyled->index);
 	if (err)
 		return err;
 
-	cdev->brightness_set_blocking = phy_led_set_brightness;
+	if (phydev->drv->led_brightness_set)
+		cdev->brightness_set_blocking = phy_led_set_brightness;
 	cdev->max_brightness = 1;
 	init_data.devicename = dev_name(&phydev->mdio.dev);
 	init_data.fwnode = of_fwnode_handle(led);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index bd6b5e9bb729..f3c7e3c99f24 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -841,15 +841,19 @@ struct phy_plca_status {
  * struct phy_led: An LED driven by the PHY
  *
  * @list: List of LEDs
+ * @phydev: PHY this LED is attached to
  * @led_cdev: Standard LED class structure
  * @index: Number of the LED
  */
 struct phy_led {
 	struct list_head list;
+	struct phy_device *phydev;
 	struct led_classdev led_cdev;
 	u8 index;
 };
 
+#define to_phy_led(d) container_of(d, struct phy_led, led_cdev)
+
 /**
  * struct phy_driver - Driver structure for a particular PHY type
  *
@@ -1072,6 +1076,15 @@ struct phy_driver {
 	/** @get_plca_status: Return the current PLCA status info */
 	int (*get_plca_status)(struct phy_device *dev,
 			       struct phy_plca_status *plca_st);
+
+	/**
+	 * @led_brightness_set: Set a PHY LED brightness. Index
+	 * indicates which of the PHYs led should be set. Value
+	 * follows the standard LED class meaning, e.g. LED_OFF,
+	 * LED_HALF, LED_FULL.
+	 */
+	int (*led_brightness_set)(struct phy_device *dev,
+				  u8 index, enum led_brightness value);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
-- 
cgit v1.2.3


From 4e901018432e38eab35d2a352661ce4727795be1 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Mon, 17 Apr 2023 17:17:30 +0200
Subject: net: phy: phy_device: Call into the PHY driver to set LED blinking

Linux LEDs can be requested to perform hardware accelerated
blinking. Pass this to the PHY driver, if it implements the op.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy_device.c | 18 ++++++++++++++++++
 include/linux/phy.h          | 12 ++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 5c1200160c51..538523a7cd51 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3005,6 +3005,22 @@ static int phy_led_set_brightness(struct led_classdev *led_cdev,
 	return err;
 }
 
+static int phy_led_blink_set(struct led_classdev *led_cdev,
+			     unsigned long *delay_on,
+			     unsigned long *delay_off)
+{
+	struct phy_led *phyled = to_phy_led(led_cdev);
+	struct phy_device *phydev = phyled->phydev;
+	int err;
+
+	mutex_lock(&phydev->lock);
+	err = phydev->drv->led_blink_set(phydev, phyled->index,
+					 delay_on, delay_off);
+	mutex_unlock(&phydev->lock);
+
+	return err;
+}
+
 static int of_phy_led(struct phy_device *phydev,
 		      struct device_node *led)
 {
@@ -3027,6 +3043,8 @@ static int of_phy_led(struct phy_device *phydev,
 
 	if (phydev->drv->led_brightness_set)
 		cdev->brightness_set_blocking = phy_led_set_brightness;
+	if (phydev->drv->led_blink_set)
+		cdev->blink_set = phy_led_blink_set;
 	cdev->max_brightness = 1;
 	init_data.devicename = dev_name(&phydev->mdio.dev);
 	init_data.fwnode = of_fwnode_handle(led);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f3c7e3c99f24..c5a0dc829714 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1085,6 +1085,18 @@ struct phy_driver {
 	 */
 	int (*led_brightness_set)(struct phy_device *dev,
 				  u8 index, enum led_brightness value);
+
+	/**
+	 * @led_blink_set: Set a PHY LED blinking.  Index indicates
+	 * which of the PHYs led should be configured to blink. Delays
+	 * are in milliseconds and if both are zero then a sensible
+	 * default should be chosen.  The call should adjust the
+	 * timings in that case and if it can't match the values
+	 * specified exactly.
+	 */
+	int (*led_blink_set)(struct phy_device *dev, u8 index,
+			     unsigned long *delay_on,
+			     unsigned long *delay_off);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
-- 
cgit v1.2.3


From eb6fba7555a812c07aa984fb9e8e9b151a65ca16 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:46 -0700
Subject: net: skbuff: hide wifi_acked when CONFIG_WIRELESS not set

Datacenter kernel builds will very likely not include WIRELESS,
so let them shave 2 bits off the skb by hiding the wifi fields.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 11 +++++++++++
 include/net/sock.h     |  2 +-
 net/core/skbuff.c      |  2 ++
 net/socket.c           |  2 ++
 4 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a823ec3aa326..513f03b23a73 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -953,8 +953,10 @@ struct sk_buff {
 
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
+#ifdef CONFIG_WIRELESS
 	__u8			wifi_acked_valid:1;
 	__u8			wifi_acked:1;
+#endif
 	__u8			no_fcs:1;
 	/* Indicates the inner headers are valid in the skbuff. */
 	__u8			encapsulation:1;
@@ -1187,6 +1189,15 @@ static inline unsigned int skb_napi_id(const struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
+{
+#ifdef CONFIG_WIRELESS
+	return skb->wifi_acked_valid;
+#else
+	return 0;
+#endif
+}
+
 /**
  * skb_unref - decrement the skb's reference count
  * @skb: buffer
diff --git a/include/net/sock.h b/include/net/sock.h
index 5edf0038867c..8b7ed7167243 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2697,7 +2697,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
 	else
 		sock_write_timestamp(sk, kt);
 
-	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
+	if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb))
 		__sock_recv_wifi_status(msg, sk, skb);
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ef81452759be..768f9d04911f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5189,6 +5189,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 }
 EXPORT_SYMBOL_GPL(skb_tstamp_tx);
 
+#ifdef CONFIG_WIRELESS
 void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 {
 	struct sock *sk = skb->sk;
@@ -5214,6 +5215,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 		kfree_skb(skb);
 }
 EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+#endif /* CONFIG_WIRELESS */
 
 /**
  * skb_partial_csum_set - set up and verify partial csum values for packet
diff --git a/net/socket.c b/net/socket.c
index 73e493da4589..a7b4b37d86df 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -957,6 +957,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
 
+#ifdef CONFIG_WIRELESS
 void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
 	struct sk_buff *skb)
 {
@@ -972,6 +973,7 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
 	put_cmsg(msg, SOL_SOCKET, SCM_WIFI_STATUS, sizeof(ack), &ack);
 }
 EXPORT_SYMBOL_GPL(__sock_recv_wifi_status);
+#endif
 
 static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk,
 				   struct sk_buff *skb)
-- 
cgit v1.2.3


From c24831a13ba2e472f874483525084da2f2feb890 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:47 -0700
Subject: net: skbuff: hide csum_not_inet when CONFIG_IP_SCTP not set

SCTP is not universally deployed, allow hiding its bit
from the skb.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 14 ++++++++++++++
 net/core/dev.c         |  3 +--
 net/sched/act_csum.c   |  3 +--
 3 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 513f03b23a73..98d6b48f4dcf 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -983,7 +983,9 @@ struct sk_buff {
 	__u8			decrypted:1;
 #endif
 	__u8			slow_gro:1;
+#if IS_ENABLED(CONFIG_IP_SCTP)
 	__u8			csum_not_inet:1;
+#endif
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -5060,7 +5062,19 @@ static inline void skb_reset_redirect(struct sk_buff *skb)
 
 static inline bool skb_csum_is_sctp(struct sk_buff *skb)
 {
+#if IS_ENABLED(CONFIG_IP_SCTP)
 	return skb->csum_not_inet;
+#else
+	return 0;
+#endif
+}
+
+static inline void skb_reset_csum_not_inet(struct sk_buff *skb)
+{
+	skb->ip_summed = CHECKSUM_NONE;
+#if IS_ENABLED(CONFIG_IP_SCTP)
+	skb->csum_not_inet = 0;
+#endif
 }
 
 static inline void skb_set_kcov_handle(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 8aea68275172..3fc4dba71f9d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3315,8 +3315,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
 						  skb->len - start, ~(__u32)0,
 						  crc32c_csum_stub));
 	*(__le32 *)(skb->data + offset) = crc32c_csum;
-	skb->ip_summed = CHECKSUM_NONE;
-	skb->csum_not_inet = 0;
+	skb_reset_csum_not_inet(skb);
 out:
 	return ret;
 }
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 95e9304024b7..8ed285023a40 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -376,8 +376,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
 
 	sctph->checksum = sctp_compute_cksum(skb,
 					     skb_network_offset(skb) + ihl);
-	skb->ip_summed = CHECKSUM_NONE;
-	skb->csum_not_inet = 0;
+	skb_reset_csum_not_inet(skb);
 
 	return 1;
 }
-- 
cgit v1.2.3


From 4398f3f6d1380d9f71a484e1e5d869ba0eaf23d5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:48 -0700
Subject: net: skbuff: move alloc_cpu into a potential hole

alloc_cpu is currently between 4 byte fields, so it's almost
guaranteed to create a 2B hole. It has a knock on effect of
creating a 4B hole after @end (and @end and @tail being in
different cachelines).

None of this matters hugely, but for kernel configs which
don't enable all the features there may well be a 2B hole
after the bitfield. Move alloc_cpu there.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 98d6b48f4dcf..2595b2cfba0d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -991,6 +991,8 @@ struct sk_buff {
 	__u16			tc_index;	/* traffic control index */
 #endif
 
+	u16			alloc_cpu;
+
 	union {
 		__wsum		csum;
 		struct {
@@ -1014,7 +1016,6 @@ struct sk_buff {
 		unsigned int	sender_cpu;
 	};
 #endif
-	u16			alloc_cpu;
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32		secmark;
 #endif
-- 
cgit v1.2.3


From 4c60d04c2888a542f96fe77ecbdfc242b484f943 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:49 -0700
Subject: net: skbuff: push nf_trace down the bitfield

nf_trace is a debug feature, AFAIU, and yet it sits oddly
high in the sk_buff bitfield. Move it down, pushing up
dst_pending_confirm and inner_protocol_type.

Next change will make nf_trace optional (under Kconfig)
and all optional fields should be placed after 2b fields
to avoid 2b fields straddling bytes.

dst_pending_confirm is L3, so it makes sense next to ignore_df.
inner_protocol_type goes up just to keep the balance.

Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2595b2cfba0d..3ae9e8868afa 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -934,7 +934,7 @@ struct sk_buff {
 	/* public: */
 	__u8			pkt_type:3; /* see PKT_TYPE_MAX */
 	__u8			ignore_df:1;
-	__u8			nf_trace:1;
+	__u8			dst_pending_confirm:1;
 	__u8			ip_summed:2;
 	__u8			ooo_okay:1;
 
@@ -949,7 +949,7 @@ struct sk_buff {
 	__u8			remcsum_offload:1;
 	__u8			csum_complete_sw:1;
 	__u8			csum_level:2;
-	__u8			dst_pending_confirm:1;
+	__u8			inner_protocol_type:1;
 
 	__u8			l4_hash:1;
 	__u8			sw_hash:1;
@@ -967,7 +967,7 @@ struct sk_buff {
 #endif
 
 	__u8			ipvs_property:1;
-	__u8			inner_protocol_type:1;
+	__u8			nf_trace:1;
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 	__u8			offload_l3_fwd_mark:1;
-- 
cgit v1.2.3


From 48d80c394d3d1afcb49d26398917f5be27bf44cb Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 17 Apr 2023 08:53:50 -0700
Subject: net: skbuff: hide nf_trace and ipvs_property

Accesses to nf_trace and ipvs_property are already wrapped
by ifdefs where necessary. Don't allocate the bits for those
fields at all if possible.

Acked-by: Florian Westphal <fw@strlen.de>
Acked-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3ae9e8868afa..5f120bbab9da 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -966,8 +966,12 @@ struct sk_buff {
 	__u8			ndisc_nodetype:2;
 #endif
 
+#if IS_ENABLED(CONFIG_IP_VS)
 	__u8			ipvs_property:1;
+#endif
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
 	__u8			nf_trace:1;
+#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 	__u8			offload_l3_fwd_mark:1;
-- 
cgit v1.2.3


From 3b3009ea8abb713b022d94fba95ec270cf6e7eae Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 17 Apr 2023 10:32:26 -0400
Subject: net/handshake: Create a NETLINK service for handling handshake
 requests

When a kernel consumer needs a transport layer security session, it
first needs a handshake to negotiate and establish a session. This
negotiation can be done in user space via one of the several
existing library implementations, or it can be done in the kernel.

No in-kernel handshake implementations yet exist. In their absence,
we add a netlink service that can:

a. Notify a user space daemon that a handshake is needed.

b. Once notified, the daemon calls the kernel back via this
   netlink service to get the handshake parameters, including an
   open socket on which to establish the session.

c. Once the handshake is complete, the daemon reports the
   session status and other information via a second netlink
   operation. This operation marks that it is safe for the
   kernel to use the open socket and the security session
   established there.

The notification service uses a multicast group. Each handshake
mechanism (eg, tlshd) adopts its own group number so that the
handshake services are completely independent of one another. The
kernel can then tell via netlink_has_listeners() whether a handshake
service is active and prepared to handle a handshake request.

A new netlink operation, ACCEPT, acts like accept(2) in that it
instantiates a file descriptor in the user space daemon's fd table.
If this operation is successful, the reply carries the fd number,
which can be treated as an open and ready file descriptor.

While user space is performing the handshake, the kernel keeps its
muddy paws off the open socket. A second new netlink operation,
DONE, indicates that the user space daemon is finished with the
socket and it is safe for the kernel to use again. The operation
also indicates whether a session was established successfully.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/handshake.yaml | 122 +++++++++++
 MAINTAINERS                                |   9 +
 include/trace/events/handshake.h           | 159 ++++++++++++++
 include/uapi/linux/handshake.h             |  71 ++++++
 net/Kconfig                                |   5 +
 net/Makefile                               |   1 +
 net/handshake/Makefile                     |  11 +
 net/handshake/genl.c                       |  57 +++++
 net/handshake/genl.h                       |  23 ++
 net/handshake/handshake.h                  |  82 +++++++
 net/handshake/netlink.c                    | 312 ++++++++++++++++++++++++++
 net/handshake/request.c                    | 339 +++++++++++++++++++++++++++++
 net/handshake/trace.c                      |  20 ++
 13 files changed, 1211 insertions(+)
 create mode 100644 Documentation/netlink/specs/handshake.yaml
 create mode 100644 include/trace/events/handshake.h
 create mode 100644 include/uapi/linux/handshake.h
 create mode 100644 net/handshake/Makefile
 create mode 100644 net/handshake/genl.c
 create mode 100644 net/handshake/genl.h
 create mode 100644 net/handshake/handshake.h
 create mode 100644 net/handshake/netlink.c
 create mode 100644 net/handshake/request.c
 create mode 100644 net/handshake/trace.c

(limited to 'include')

diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml
new file mode 100644
index 000000000000..0333d92b1438
--- /dev/null
+++ b/Documentation/netlink/specs/handshake.yaml
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+name: handshake
+
+protocol: genetlink
+
+doc: Netlink protocol to request a transport layer security handshake.
+
+definitions:
+  -
+    type: enum
+    name: handler-class
+    value-start: 0
+    entries: [ none, max ]
+  -
+    type: enum
+    name: msg-type
+    value-start: 0
+    entries: [ unspec, clienthello, serverhello ]
+  -
+    type: enum
+    name: auth
+    value-start: 0
+    entries: [ unspec, unauth, psk, x509 ]
+
+attribute-sets:
+  -
+    name: x509
+    attributes:
+      -
+        name: cert
+        type: u32
+      -
+        name: privkey
+        type: u32
+  -
+    name: accept
+    attributes:
+      -
+        name: sockfd
+        type: u32
+      -
+        name: handler-class
+        type: u32
+        enum: handler-class
+      -
+        name: message-type
+        type: u32
+        enum: msg-type
+      -
+        name: timeout
+        type: u32
+      -
+        name: auth-mode
+        type: u32
+        enum: auth
+      -
+        name: peer-identity
+        type: u32
+        multi-attr: true
+      -
+        name: certificate
+        type: nest
+        nested-attributes: x509
+        multi-attr: true
+  -
+    name: done
+    attributes:
+      -
+        name: status
+        type: u32
+      -
+        name: sockfd
+        type: u32
+      -
+        name: remote-auth
+        type: u32
+        multi-attr: true
+
+operations:
+  list:
+    -
+      name: ready
+      doc: Notify handlers that a new handshake request is waiting
+      notify: accept
+    -
+      name: accept
+      doc: Handler retrieves next queued handshake request
+      attribute-set: accept
+      flags: [ admin-perm ]
+      do:
+        request:
+          attributes:
+            - handler-class
+        reply:
+          attributes:
+            - sockfd
+            - message-type
+            - timeout
+            - auth-mode
+            - peer-identity
+            - certificate
+    -
+      name: done
+      doc: Handler reports handshake completion
+      attribute-set: done
+      do:
+        request:
+          attributes:
+            - status
+            - sockfd
+            - remote-auth
+
+mcast-groups:
+  list:
+    -
+      name: none
diff --git a/MAINTAINERS b/MAINTAINERS
index 4fc57dfd5fd0..cdc7748d15b8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8947,6 +8947,15 @@ Q:	http://patchwork.linuxtv.org/project/linux-media/list/
 T:	git git://linuxtv.org/anttip/media_tree.git
 F:	drivers/media/usb/hackrf/
 
+HANDSHAKE UPCALL FOR TRANSPORT LAYER SECURITY
+M:	Chuck Lever <chuck.lever@oracle.com>
+L:	kernel-tls-handshake@lists.linux.dev
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	Documentation/netlink/specs/handshake.yaml
+F:	include/trace/events/handshake.h
+F:	net/handshake/
+
 HANTRO VPU CODEC DRIVER
 M:	Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
 M:	Philipp Zabel <p.zabel@pengutronix.de>
diff --git a/include/trace/events/handshake.h b/include/trace/events/handshake.h
new file mode 100644
index 000000000000..8dadcab5f12a
--- /dev/null
+++ b/include/trace/events/handshake.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM handshake
+
+#if !defined(_TRACE_HANDSHAKE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HANDSHAKE_H
+
+#include <linux/net.h>
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(handshake_event_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk
+	),
+	TP_ARGS(net, req, sk),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = sk;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p",
+		__entry->req, __entry->sk
+	)
+);
+#define DEFINE_HANDSHAKE_EVENT(name)				\
+	DEFINE_EVENT(handshake_event_class, name,		\
+		TP_PROTO(					\
+			const struct net *net,			\
+			const struct handshake_req *req,	\
+			const struct sock *sk			\
+		),						\
+		TP_ARGS(net, req, sk))
+
+/*
+ * Event class for netlink commands that carry a file descriptor.
+ *
+ * Records the request pointer, a socket pointer, the fd and the inode
+ * number of the network namespace; only the first three are printed.
+ *
+ * Note that the recorded socket is read from req->hr_sk rather than
+ * taken from the @sk argument, so @req is dereferenced here and must
+ * not be NULL.
+ */
+DECLARE_EVENT_CLASS(handshake_fd_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk,
+		int fd
+	),
+	TP_ARGS(net, req, sk, fd),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(int, fd)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = req->hr_sk;
+		__entry->fd = fd;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p fd=%d",
+		__entry->req, __entry->sk, __entry->fd
+	)
+);
+#define DEFINE_HANDSHAKE_FD_EVENT(name)				\
+	DEFINE_EVENT(handshake_fd_class, name,			\
+		TP_PROTO(					\
+			const struct net *net,			\
+			const struct handshake_req *req,	\
+			const struct sock *sk,			\
+			int fd					\
+		),						\
+		TP_ARGS(net, req, sk, fd))
+
+DECLARE_EVENT_CLASS(handshake_error_class,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk,
+		int err
+	),
+	TP_ARGS(net, req, sk, err),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(int, err)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = sk;
+		__entry->err = err;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p err=%d",
+		__entry->req, __entry->sk, __entry->err
+	)
+);
+#define DEFINE_HANDSHAKE_ERROR(name)				\
+	DEFINE_EVENT(handshake_error_class, name,		\
+		TP_PROTO(					\
+			const struct net *net,			\
+			const struct handshake_req *req,	\
+			const struct sock *sk,			\
+			int err					\
+		),						\
+		TP_ARGS(net, req, sk, err))
+
+
+/*
+ * Request lifetime events
+ */
+
+DEFINE_HANDSHAKE_EVENT(handshake_submit);
+DEFINE_HANDSHAKE_ERROR(handshake_submit_err);
+DEFINE_HANDSHAKE_EVENT(handshake_cancel);
+DEFINE_HANDSHAKE_EVENT(handshake_cancel_none);
+DEFINE_HANDSHAKE_EVENT(handshake_cancel_busy);
+DEFINE_HANDSHAKE_EVENT(handshake_destruct);
+
+
+TRACE_EVENT(handshake_complete,
+	TP_PROTO(
+		const struct net *net,
+		const struct handshake_req *req,
+		const struct sock *sk,
+		int status
+	),
+	TP_ARGS(net, req, sk, status),
+	TP_STRUCT__entry(
+		__field(const void *, req)
+		__field(const void *, sk)
+		__field(int, status)
+		__field(unsigned int, netns_ino)
+	),
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->sk = sk;
+		__entry->status = status;
+		__entry->netns_ino = net->ns.inum;
+	),
+	TP_printk("req=%p sk=%p status=%d",
+		__entry->req, __entry->sk, __entry->status
+	)
+);
+
+/*
+ * Netlink events
+ */
+
+DEFINE_HANDSHAKE_ERROR(handshake_notify_err);
+DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_accept);
+DEFINE_HANDSHAKE_ERROR(handshake_cmd_accept_err);
+DEFINE_HANDSHAKE_FD_EVENT(handshake_cmd_done);
+DEFINE_HANDSHAKE_ERROR(handshake_cmd_done_err);
+
+#endif /* _TRACE_HANDSHAKE_H */
+
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
new file mode 100644
index 000000000000..7f66ff489b87
--- /dev/null
+++ b/include/uapi/linux/handshake.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _UAPI_LINUX_HANDSHAKE_H
+#define _UAPI_LINUX_HANDSHAKE_H
+
+#define HANDSHAKE_FAMILY_NAME		"handshake"
+#define HANDSHAKE_FAMILY_VERSION	1
+
+enum handshake_handler_class {
+	HANDSHAKE_HANDLER_CLASS_NONE,
+	HANDSHAKE_HANDLER_CLASS_MAX,
+};
+
+enum handshake_msg_type {
+	HANDSHAKE_MSG_TYPE_UNSPEC,
+	HANDSHAKE_MSG_TYPE_CLIENTHELLO,
+	HANDSHAKE_MSG_TYPE_SERVERHELLO,
+};
+
+enum handshake_auth {
+	HANDSHAKE_AUTH_UNSPEC,
+	HANDSHAKE_AUTH_UNAUTH,
+	HANDSHAKE_AUTH_PSK,
+	HANDSHAKE_AUTH_X509,
+};
+
+enum {
+	HANDSHAKE_A_X509_CERT = 1,
+	HANDSHAKE_A_X509_PRIVKEY,
+
+	__HANDSHAKE_A_X509_MAX,
+	HANDSHAKE_A_X509_MAX = (__HANDSHAKE_A_X509_MAX - 1)
+};
+
+enum {
+	HANDSHAKE_A_ACCEPT_SOCKFD = 1,
+	HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+	HANDSHAKE_A_ACCEPT_MESSAGE_TYPE,
+	HANDSHAKE_A_ACCEPT_TIMEOUT,
+	HANDSHAKE_A_ACCEPT_AUTH_MODE,
+	HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+	HANDSHAKE_A_ACCEPT_CERTIFICATE,
+
+	__HANDSHAKE_A_ACCEPT_MAX,
+	HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1)
+};
+
+enum {
+	HANDSHAKE_A_DONE_STATUS = 1,
+	HANDSHAKE_A_DONE_SOCKFD,
+	HANDSHAKE_A_DONE_REMOTE_AUTH,
+
+	__HANDSHAKE_A_DONE_MAX,
+	HANDSHAKE_A_DONE_MAX = (__HANDSHAKE_A_DONE_MAX - 1)
+};
+
+enum {
+	HANDSHAKE_CMD_READY = 1,
+	HANDSHAKE_CMD_ACCEPT,
+	HANDSHAKE_CMD_DONE,
+
+	__HANDSHAKE_CMD_MAX,
+	HANDSHAKE_CMD_MAX = (__HANDSHAKE_CMD_MAX - 1)
+};
+
+#define HANDSHAKE_MCGRP_NONE	"none"
+
+#endif /* _UAPI_LINUX_HANDSHAKE_H */
diff --git a/net/Kconfig b/net/Kconfig
index f806722bccf4..4b800706cc76 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -68,6 +68,11 @@ source "net/iucv/Kconfig"
 source "net/smc/Kconfig"
 source "net/xdp/Kconfig"
 
+config NET_HANDSHAKE
+	bool
+	depends on SUNRPC || NVME_TARGET_TCP || NVME_TCP
+	default y
+
 config INET
 	bool "TCP/IP networking"
 	help
diff --git a/net/Makefile b/net/Makefile
index 87592009366f..4c4dc535453d 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -79,3 +79,4 @@ obj-$(CONFIG_NET_NCSI)		+= ncsi/
 obj-$(CONFIG_XDP_SOCKETS)	+= xdp/
 obj-$(CONFIG_MPTCP)		+= mptcp/
 obj-$(CONFIG_MCTP)		+= mctp/
+obj-$(CONFIG_NET_HANDSHAKE)	+= handshake/
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
new file mode 100644
index 000000000000..d38736de45da
--- /dev/null
+++ b/net/handshake/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the Generic HANDSHAKE service
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+obj-y += handshake.o
+handshake-y := genl.o netlink.o request.o trace.o
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
new file mode 100644
index 000000000000..652f37d19bd6
--- /dev/null
+++ b/net/handshake/genl.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "genl.h"
+
+#include <linux/handshake.h>
+
+/* HANDSHAKE_CMD_ACCEPT - do */
+static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
+	[HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 1),
+};
+
+/* HANDSHAKE_CMD_DONE - do */
+static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
+	[HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
+	[HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_U32, },
+	[HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
+};
+
+/* Ops table for handshake */
+static const struct genl_split_ops handshake_nl_ops[] = {
+	{
+		.cmd		= HANDSHAKE_CMD_ACCEPT,
+		.doit		= handshake_nl_accept_doit,
+		.policy		= handshake_accept_nl_policy,
+		.maxattr	= HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd		= HANDSHAKE_CMD_DONE,
+		.doit		= handshake_nl_done_doit,
+		.policy		= handshake_done_nl_policy,
+		.maxattr	= HANDSHAKE_A_DONE_REMOTE_AUTH,
+		.flags		= GENL_CMD_CAP_DO,
+	},
+};
+
+static const struct genl_multicast_group handshake_nl_mcgrps[] = {
+	[HANDSHAKE_NLGRP_NONE] = { "none", },
+};
+
+struct genl_family handshake_nl_family __ro_after_init = {
+	.name		= HANDSHAKE_FAMILY_NAME,
+	.version	= HANDSHAKE_FAMILY_VERSION,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.split_ops	= handshake_nl_ops,
+	.n_split_ops	= ARRAY_SIZE(handshake_nl_ops),
+	.mcgrps		= handshake_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(handshake_nl_mcgrps),
+};
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
new file mode 100644
index 000000000000..a1eb7ccccc7f
--- /dev/null
+++ b/net/handshake/genl.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_HANDSHAKE_GEN_H
+#define _LINUX_HANDSHAKE_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <linux/handshake.h>
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info);
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+	HANDSHAKE_NLGRP_NONE,
+};
+
+extern struct genl_family handshake_nl_family;
+
+#endif /* _LINUX_HANDSHAKE_GEN_H */
diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h
new file mode 100644
index 000000000000..52568dbe24f1
--- /dev/null
+++ b/net/handshake/handshake.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _INTERNAL_HANDSHAKE_H
+#define _INTERNAL_HANDSHAKE_H
+
+/*
+ * Per-net namespace context
+ *
+ * One instance exists per network namespace and is looked up with
+ * net_generic() using the handshake pernet id.
+ */
+struct handshake_net {
+	spinlock_t		hn_lock;	/* protects next 3 fields */
+	/* Number of requests currently queued on hn_requests */
+	int			hn_pending;
+	/* Upper bound for hn_pending; new submissions fail above it */
+	int			hn_pending_max;
+	/* Requests submitted but not yet accepted by a handler */
+	struct list_head	hn_requests;
+
+	/* Bit flags, indexed by enum hn_flags_bits */
+	unsigned long		hn_flags;
+};
+
+enum hn_flags_bits {
+	HANDSHAKE_F_NET_DRAINING,
+};
+
+struct handshake_proto;
+
+/*
+ * One handshake request
+ *
+ * Allocated by handshake_req_alloc() with room for hp_privsize bytes
+ * of protocol-private data in the trailing hr_priv[] array.
+ */
+struct handshake_req {
+	/* Linkage on the per-net pending list (handshake_net::hn_requests) */
+	struct list_head		hr_list;
+	/* Linkage in the sock -> request hash table */
+	struct rhash_head		hr_rhash;
+	/* Bit flags, indexed by enum hr_flags_bits */
+	unsigned long			hr_flags;
+	/* Protocol operations and invariants for this request */
+	const struct handshake_proto	*hr_proto;
+	/* Socket the handshake runs on; also the hash table key */
+	struct sock			*hr_sk;
+	/* The socket's sk_destruct callback saved at submit time */
+	void				(*hr_odestruct)(struct sock *sk);
+
+	/* Always the last field */
+	char				hr_priv[];
+};
+
+enum hr_flags_bits {
+	HANDSHAKE_F_REQ_COMPLETED,
+};
+
+/* Invariants for all handshake requests for one transport layer
+ * security protocol
+ *
+ * handshake_req_alloc() rejects a protocol whose hp_handler_class is
+ * not strictly between HANDSHAKE_HANDLER_CLASS_NONE and
+ * HANDSHAKE_HANDLER_CLASS_MAX, or whose hp_accept or hp_done is NULL.
+ * hp_destroy is optional.
+ */
+struct handshake_proto {
+	/* Handler class; also used as the netlink multicast group */
+	int			hp_handler_class;
+	/* Bytes of private data to reserve in handshake_req::hr_priv */
+	size_t			hp_privsize;
+
+	/* Invoked for the ACCEPT command; a non-zero return fails it */
+	int			(*hp_accept)(struct handshake_req *req,
+					     struct genl_info *info, int fd);
+	/* Completion callback; receives the handshake status */
+	void			(*hp_done)(struct handshake_req *req,
+					   unsigned int status,
+					   struct genl_info *info);
+	/* Optional: called just before a request is freed */
+	void			(*hp_destroy)(struct handshake_req *req);
+};
+
+/* netlink.c */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+			  gfp_t flags);
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+				    struct genl_info *info);
+struct handshake_net *handshake_pernet(struct net *net);
+
+/* request.c */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+					  gfp_t flags);
+int handshake_req_hash_init(void);
+void handshake_req_hash_destroy(void);
+void *handshake_req_private(struct handshake_req *req);
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk);
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class);
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+			 gfp_t flags);
+void handshake_complete(struct handshake_req *req, unsigned int status,
+			struct genl_info *info);
+bool handshake_req_cancel(struct sock *sk);
+
+#endif /* _INTERNAL_HANDSHAKE_H */
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
new file mode 100644
index 000000000000..7264cac04047
--- /dev/null
+++ b/net/handshake/netlink.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+#include "genl.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * handshake_genl_notify - Notify handlers that a request is waiting
+ * @net: target network namespace
+ * @proto: handshake protocol
+ * @flags: memory allocation control flags
+ *
+ * Multicasts a HANDSHAKE_CMD_READY message, carrying the handler class
+ * of @proto, to the multicast group indexed by that handler class.
+ * @flags is used both to allocate the message and to deliver it, so a
+ * caller that cannot sleep may pass a non-blocking allocation mode.
+ *
+ * Return values:
+ *   %0: Notification was handed to the multicast group
+ *   %-ESRCH: No listener is subscribed to the handler class
+ *   %-ENOMEM: Failed to allocate the message buffer
+ *   %-EMSGSIZE: Failed to construct the notification message
+ *   Any other negative errno comes from genlmsg_multicast_netns().
+ */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+			  gfp_t flags)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (!genl_has_listeners(&handshake_nl_family, net,
+				proto->hp_handler_class))
+		return -ESRCH;
+
+	/* Honor the caller's allocation context instead of GFP_KERNEL */
+	msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, flags);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0,
+			  HANDSHAKE_CMD_READY);
+	if (!hdr)
+		goto out_free;
+
+	if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+			proto->hp_handler_class) < 0) {
+		genlmsg_cancel(msg, hdr);
+		goto out_free;
+	}
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_multicast_netns(&handshake_nl_family, net, msg,
+				       0, proto->hp_handler_class, flags);
+
+out_free:
+	nlmsg_free(msg);
+	return -EMSGSIZE;
+}
+
+/**
+ * handshake_genl_put - Create a generic netlink message header
+ * @msg: buffer in which to create the header
+ * @info: generic netlink message context
+ *
+ * The header is addressed to the sender of @info and repeats the
+ * command of the request that @info describes.
+ *
+ * Returns a ready-to-use header, or NULL.
+ */
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+				    struct genl_info *info)
+{
+	u32 portid = info->snd_portid;
+	u32 seq = info->snd_seq;
+	u8 cmd = info->genlhdr->cmd;
+
+	return genlmsg_put(msg, portid, seq, &handshake_nl_family, 0, cmd);
+}
+EXPORT_SYMBOL(handshake_genl_put);
+
+/*
+ * dup() a kernel socket for use as a user space file descriptor
+ * in the current process. The kernel socket must have an
+ * instantiated struct file.
+ *
+ * Implicit argument: "current"
+ *
+ * Returns the new file descriptor, -EBADF if @sock has no struct
+ * file, or the negative errno from get_unused_fd_flags(). On failure
+ * no file reference is left held by this function.
+ */
+static int handshake_dup(struct socket *sock)
+{
+	struct file *file;
+	int newfd;
+
+	if (!sock->file)
+		return -EBADF;
+
+	/* This reference is handed to the fd table by fd_install() */
+	file = get_file(sock->file);
+	newfd = get_unused_fd_flags(O_CLOEXEC);
+	if (newfd < 0) {
+		/* No fd to own the reference, so give it back */
+		fput(file);
+		return newfd;
+	}
+
+	fd_install(newfd, file);
+	return newfd;
+}
+
+/*
+ * HANDSHAKE_CMD_ACCEPT handler: dequeue the next pending request for
+ * the handler class named in the message, install a file descriptor
+ * for its socket in the calling process, and let the protocol's
+ * hp_accept callback finish the command.
+ *
+ * Returns zero on success. Otherwise returns:
+ *   -EOPNOTSUPP if the per-net context is not available,
+ *   -EINVAL if the handler class attribute is missing,
+ *   -EAGAIN if no request of that class is pending,
+ *   or the error from handshake_dup() or hp_accept. In the last two
+ *   cases the request is completed with a status of -EIO.
+ */
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct handshake_net *hn = handshake_pernet(net);
+	struct handshake_req *req = NULL;
+	struct socket *sock;
+	int class, fd, err;
+
+	err = -EOPNOTSUPP;
+	if (!hn)
+		goto out_status;
+
+	err = -EINVAL;
+	if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS))
+		goto out_status;
+	class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]);
+
+	err = -EAGAIN;
+	req = handshake_req_next(hn, class);
+	if (!req)
+		goto out_status;
+
+	sock = req->hr_sk->sk_socket;
+	fd = handshake_dup(sock);
+	if (fd < 0) {
+		err = fd;
+		goto out_complete;
+	}
+	err = req->hr_proto->hp_accept(req, info, fd);
+	if (err)
+		goto out_complete;
+
+	trace_handshake_cmd_accept(net, req, req->hr_sk, fd);
+	return 0;
+
+out_complete:
+	/*
+	 * Do not fput(sock->file) here. When handshake_dup() fails it
+	 * holds no file reference on return (and sock->file may even be
+	 * NULL). When hp_accept fails, the reference taken by
+	 * handshake_dup() is owned by the installed fd and is dropped
+	 * when that fd is closed. An extra fput() in either case would
+	 * underflow the file's reference count.
+	 */
+	handshake_complete(req, -EIO, NULL);
+out_status:
+	trace_handshake_cmd_accept_err(net, req, NULL, err);
+	return err;
+}
+
+/*
+ * HANDSHAKE_CMD_DONE handler: the handler reports that it is finished
+ * with the socket identified by the SOCKFD attribute. The matching
+ * request is completed with the reported status, or with -EIO when
+ * the STATUS attribute is absent.
+ *
+ * Returns zero on success. Otherwise returns:
+ *   -EINVAL if the SOCKFD attribute is missing,
+ *   -EBADF if the descriptor does not resolve to a socket,
+ *   -EBUSY if no handshake request is outstanding for that socket.
+ */
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct handshake_req *req;
+	struct socket *sock;
+	int fd, status, err;
+
+	if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD))
+		return -EINVAL;
+	fd = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
+
+	err = 0;
+	sock = sockfd_lookup(fd, &err);
+	if (!sock) {
+		/* There is neither a socket nor a request to report */
+		err = -EBADF;
+		trace_handshake_cmd_done_err(net, NULL, NULL, err);
+		return err;
+	}
+
+	req = handshake_req_hash_lookup(sock->sk);
+	if (!req) {
+		err = -EBUSY;
+		/* Trace before dropping the reference that pins @sock */
+		trace_handshake_cmd_done_err(net, NULL, sock->sk, err);
+		fput(sock->file);
+		return err;
+	}
+
+	trace_handshake_cmd_done(net, req, sock->sk, fd);
+
+	status = -EIO;
+	if (info->attrs[HANDSHAKE_A_DONE_STATUS])
+		status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
+
+	handshake_complete(req, status, info);
+	/* Balances the reference taken by sockfd_lookup() */
+	fput(sock->file);
+	return 0;
+}
+
+static unsigned int handshake_net_id;
+
+/*
+ * Set up the handshake context of a new network namespace: an empty
+ * pending list, cleared flags and a memory-scaled cap on the number
+ * of pending requests. Always returns zero.
+ */
+static int __net_init handshake_net_init(struct net *net)
+{
+	struct handshake_net *hn = net_generic(net, handshake_net_id);
+	struct sysinfo meminfo;
+	unsigned long limit;
+
+	spin_lock_init(&hn->hn_lock);
+	INIT_LIST_HEAD(&hn->hn_requests);
+	hn->hn_flags = 0;
+	hn->hn_pending = 0;
+
+	/*
+	 * Arbitrary limit to prevent handshakes that do not make
+	 * progress from clogging up the system. The cap is derived
+	 * from the amount of physical memory on the system and is
+	 * kept between 3 and 50 requests.
+	 */
+	si_meminfo(&meminfo);
+	limit = meminfo.totalram / (25 * meminfo.mem_unit);
+	hn->hn_pending_max = clamp(limit, 3UL, 50UL);
+
+	return 0;
+}
+
+/*
+ * Tear down the handshake context of an exiting network namespace.
+ *
+ * Marks the namespace as draining so that no new request is queued,
+ * then completes every request that is still pending with a status
+ * of -ETIMEDOUT.
+ */
+static void __net_exit handshake_net_exit(struct net *net)
+{
+	struct handshake_net *hn = net_generic(net, handshake_net_id);
+	struct handshake_req *req;
+	LIST_HEAD(requests);
+
+	/*
+	 * Drain the net's pending list. Requests that have been
+	 * accepted and are in progress will be destroyed when
+	 * the socket is closed.
+	 *
+	 * list_splice_init(list, head) moves @list onto @head, so the
+	 * per-net list is the source and the local list the target.
+	 */
+	spin_lock(&hn->hn_lock);
+	set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags);
+	list_splice_init(&hn->hn_requests, &requests);
+	spin_unlock(&hn->hn_lock);
+
+	while (!list_empty(&requests)) {
+		req = list_first_entry(&requests, struct handshake_req, hr_list);
+		/*
+		 * Re-initialize hr_list, as __remove_pending_locked()
+		 * does: an empty hr_list is how the rest of the code
+		 * recognizes a request that is no longer pending.
+		 */
+		list_del_init(&req->hr_list);
+
+		/*
+		 * Requests on this list have not yet been
+		 * accepted, so they do not have an fd to put.
+		 */
+
+		handshake_complete(req, -ETIMEDOUT, NULL);
+	}
+}
+
+/*
+ * Per-net namespace hooks. The init and exit callbacks obtain their
+ * struct handshake_net with net_generic() and handshake_net_id;
+ * .id and .size are what tie that id to a per-net area of this size.
+ */
+static struct pernet_operations __net_initdata handshake_genl_net_ops = {
+	.init		= handshake_net_init,
+	.exit		= handshake_net_exit,
+	.id		= &handshake_net_id,
+	.size		= sizeof(struct handshake_net),
+};
+
+/**
+ * handshake_pernet - Get the handshake private per-net structure
+ * @net: network namespace
+ *
+ * A pernet id of zero means that the handshake module never finished
+ * (or has undone) its pernet registration.
+ *
+ * Returns a pointer to the net's private per-net structure for the
+ * handshake module, or NULL if handshake_init() failed.
+ */
+struct handshake_net *handshake_pernet(struct net *net)
+{
+	if (!handshake_net_id)
+		return NULL;
+
+	return net_generic(net, handshake_net_id);
+}
+
+/*
+ * Module initialization: set up the request hash table, register the
+ * generic netlink family, then register the pernet subsystem. Each
+ * failure is logged and undoes the steps that already succeeded.
+ *
+ * Returns zero on success or the negative errno of the failing step.
+ */
+static int __init handshake_init(void)
+{
+	int ret;
+
+	ret = handshake_req_hash_init();
+	if (ret) {
+		pr_warn("handshake: hash initialization failed (%d)\n", ret);
+		return ret;
+	}
+
+	ret = genl_register_family(&handshake_nl_family);
+	if (ret) {
+		pr_warn("handshake: netlink registration failed (%d)\n", ret);
+		goto out_hash;
+	}
+
+	/*
+	 * ORDER: register_pernet_subsys must be done last.
+	 *
+	 *	If initialization does not make it past pernet_subsys
+	 *	registration, then handshake_net_id will remain 0. That
+	 *	makes handshake_pernet() return NULL, so callers bail
+	 *	out with EOPNOTSUPP instead of dereferencing something
+	 *	that hasn't been allocated.
+	 */
+	ret = register_pernet_subsys(&handshake_genl_net_ops);
+	if (ret) {
+		pr_warn("handshake: pernet registration failed (%d)\n", ret);
+		goto out_family;
+	}
+
+	return 0;
+
+out_family:
+	genl_unregister_family(&handshake_nl_family);
+out_hash:
+	handshake_req_hash_destroy();
+	return ret;
+}
+
+/*
+ * Module teardown: unregister the pernet subsystem, destroy the
+ * request hash table and unregister the netlink family.
+ */
+static void __exit handshake_exit(void)
+{
+	unregister_pernet_subsys(&handshake_genl_net_ops);
+	/* From here on handshake_pernet() returns NULL */
+	handshake_net_id = 0;
+
+	handshake_req_hash_destroy();
+	genl_unregister_family(&handshake_nl_family);
+}
+
+module_init(handshake_init);
+module_exit(handshake_exit);
diff --git a/net/handshake/request.c b/net/handshake/request.c
new file mode 100644
index 000000000000..d5b2bc6de057
--- /dev/null
+++ b/net/handshake/request.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handshake request lifetime events
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/fdtable.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/*
+ * We need both a handshake_req -> sock mapping, and a sock ->
+ * handshake_req mapping. Both are one-to-one.
+ *
+ * To avoid adding another pointer field to struct sock, net/handshake
+ * maintains a hash table, indexed by the memory address of @sock, to
+ * find the struct handshake_req outstanding for that socket. The
+ * reverse direction uses a simple pointer field in the handshake_req
+ * struct.
+ */
+
+/* Maps a struct sock pointer to its outstanding handshake_req */
+static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp;
+
+/*
+ * The key is the hr_sk field itself, i.e. the value of the socket
+ * pointer, and entries are linked through hr_rhash.
+ */
+static const struct rhashtable_params handshake_rhash_params = {
+	.key_len		= sizeof_field(struct handshake_req, hr_sk),
+	.key_offset		= offsetof(struct handshake_req, hr_sk),
+	.head_offset		= offsetof(struct handshake_req, hr_rhash),
+	.automatic_shrinking	= true,
+};
+
+/*
+ * Initialize the sock -> handshake_req hash table.
+ * Returns the result of rhashtable_init().
+ */
+int handshake_req_hash_init(void)
+{
+	return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params);
+}
+
+/* Destroy the sock -> handshake_req hash table. */
+void handshake_req_hash_destroy(void)
+{
+	rhashtable_destroy(&handshake_rhashtbl);
+}
+
+/*
+ * Find the handshake request outstanding for @sk.
+ *
+ * The table is keyed by the socket pointer value, hence the address
+ * of the local @sk is passed as the key. Returns whatever
+ * rhashtable_lookup_fast() finds; callers treat NULL as "no request".
+ */
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk)
+{
+	return rhashtable_lookup_fast(&handshake_rhashtbl, &sk,
+				      handshake_rhash_params);
+}
+
+/*
+ * Insert @req into the hash table under its hr_sk key, which must
+ * already be set.
+ *
+ * Returns %true if the insertion succeeded, otherwise %false.
+ */
+static bool handshake_req_hash_add(struct handshake_req *req)
+{
+	return !rhashtable_lookup_insert_fast(&handshake_rhashtbl,
+					      &req->hr_rhash,
+					      handshake_rhash_params);
+}
+
+/*
+ * Release @req: give the protocol a chance to clean up its private
+ * data, unhash the request, then free it.
+ */
+static void handshake_req_destroy(struct handshake_req *req)
+{
+	/* hp_destroy is optional */
+	if (req->hr_proto->hp_destroy)
+		req->hr_proto->hp_destroy(req);
+	rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash,
+			       handshake_rhash_params);
+	kfree(req);
+}
+
+/*
+ * Socket destructor installed by handshake_req_submit() in place of
+ * the socket's own sk_destruct callback.
+ *
+ * Destroys the handshake request attached to @sk and then chains to
+ * the destructor that was saved in the request. If no request is
+ * found for @sk, it returns without calling any destructor.
+ */
+static void handshake_sk_destruct(struct sock *sk)
+{
+	void (*sk_destruct)(struct sock *sk);
+	struct handshake_req *req;
+
+	req = handshake_req_hash_lookup(sk);
+	if (!req)
+		return;
+
+	trace_handshake_destruct(sock_net(sk), req, sk);
+	/* Copy the saved callback out before @req is freed */
+	sk_destruct = req->hr_odestruct;
+	handshake_req_destroy(req);
+	if (sk_destruct)
+		sk_destruct(sk);
+}
+
+/**
+ * handshake_req_alloc - Allocate a handshake request
+ * @proto: security protocol
+ * @flags: memory allocation flags
+ *
+ * The request is zero-filled and includes @proto->hp_privsize bytes
+ * of trailing private data.
+ *
+ * Returns an initialized handshake_req, or NULL if @proto is invalid
+ * or the allocation failed.
+ */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+					  gfp_t flags)
+{
+	struct handshake_req *req;
+	size_t size;
+
+	/*
+	 * A usable protocol names a handler class strictly between
+	 * NONE and MAX and supplies both mandatory callbacks.
+	 */
+	if (!proto ||
+	    proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE ||
+	    proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX ||
+	    !proto->hp_accept || !proto->hp_done)
+		return NULL;
+
+	size = struct_size(req, hr_priv, proto->hp_privsize);
+	req = kzalloc(size, flags);
+	if (req) {
+		INIT_LIST_HEAD(&req->hr_list);
+		req->hr_proto = proto;
+	}
+	return req;
+}
+EXPORT_SYMBOL(handshake_req_alloc);
+
+/**
+ * handshake_req_private - Get per-handshake private data
+ * @req: handshake arguments
+ *
+ * Returns the address of the private data area that trails @req.
+ */
+void *handshake_req_private(struct handshake_req *req)
+{
+	return req->hr_priv;
+}
+EXPORT_SYMBOL(handshake_req_private);
+
+/*
+ * Queue @req at the tail of @hn's pending list and count it.
+ * Called with hn->hn_lock held.
+ *
+ * Returns %false, with a one-time warning, if @req is already linked
+ * on a list; otherwise %true.
+ */
+static bool __add_pending_locked(struct handshake_net *hn,
+				 struct handshake_req *req)
+{
+	if (WARN_ON_ONCE(!list_empty(&req->hr_list)))
+		return false;
+	hn->hn_pending++;
+	list_add_tail(&req->hr_list, &hn->hn_requests);
+	return true;
+}
+
+/*
+ * Unlink @req from @hn's pending list and uncount it.
+ * Called with hn->hn_lock held.
+ */
+static void __remove_pending_locked(struct handshake_net *hn,
+				    struct handshake_req *req)
+{
+	hn->hn_pending--;
+	/* Leaves hr_list empty, which marks @req as not pending */
+	list_del_init(&req->hr_list);
+}
+
+/*
+ * Take @req off @hn's pending list if it is still queued there.
+ *
+ * Returns %true if the request was found on @net's pending list,
+ * otherwise %false.
+ *
+ * If @req was on a pending list, it has not yet been accepted.
+ */
+static bool remove_pending(struct handshake_net *hn, struct handshake_req *req)
+{
+	bool was_pending;
+
+	spin_lock(&hn->hn_lock);
+	was_pending = !list_empty(&req->hr_list);
+	if (was_pending)
+		__remove_pending_locked(hn, req);
+	spin_unlock(&hn->hn_lock);
+
+	return was_pending;
+}
+
+/*
+ * Dequeue the oldest pending request on @hn whose protocol uses
+ * handler class @class.
+ *
+ * Returns the request, now removed from the pending list, or NULL if
+ * no pending request matches.
+ */
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class)
+{
+	struct handshake_req *found = NULL;
+	struct handshake_req *cur;
+
+	spin_lock(&hn->hn_lock);
+	list_for_each_entry(cur, &hn->hn_requests, hr_list) {
+		if (cur->hr_proto->hp_handler_class == class) {
+			/* Stop iterating right after unlinking @cur */
+			__remove_pending_locked(hn, cur);
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&hn->hn_lock);
+
+	return found;
+}
+
+/**
+ * handshake_req_submit - Submit a handshake request
+ * @sock: open socket on which to perform the handshake
+ * @req: handshake arguments
+ * @flags: memory allocation flags
+ *
+ * Return values:
+ *   %0: Request queued
+ *   %-EINVAL: Invalid argument
+ *   %-EBUSY: A handshake is already under way for this socket
+ *   %-ESRCH: No handshake agent is available
+ *   %-EAGAIN: Too many pending handshake requests
+ *   %-ENOMEM: Failed to allocate memory
+ *   %-EMSGSIZE: Failed to construct notification message
+ *   %-EOPNOTSUPP: Handshake module not initialized
+ *
+ * A zero return value from handshake_req_submit() means that
+ * exactly one subsequent completion callback is guaranteed.
+ *
+ * A negative return value from handshake_req_submit() means that
+ * no completion callback will be done, that @req has been destroyed,
+ * and that the socket's sk_destruct is as it was on entry.
+ */
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+			 gfp_t flags)
+{
+	struct handshake_net *hn;
+	struct net *net;
+	int ret;
+
+	if (!sock || !req || !sock->file || !sock->sk) {
+		kfree(req);
+		return -EINVAL;
+	}
+
+	req->hr_sk = sock->sk;
+	req->hr_odestruct = req->hr_sk->sk_destruct;
+	req->hr_sk->sk_destruct = handshake_sk_destruct;
+
+	ret = -EOPNOTSUPP;
+	net = sock_net(req->hr_sk);
+	hn = handshake_pernet(net);
+	if (!hn)
+		goto out_err;
+
+	ret = -EAGAIN;
+	if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max)
+		goto out_err;
+
+	spin_lock(&hn->hn_lock);
+	ret = -EOPNOTSUPP;
+	if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags))
+		goto out_unlock;
+	ret = -EBUSY;
+	if (!handshake_req_hash_add(req))
+		goto out_unlock;
+	if (!__add_pending_locked(hn, req))
+		goto out_unlock;
+	spin_unlock(&hn->hn_lock);
+
+	ret = handshake_genl_notify(net, req->hr_proto, flags);
+	if (ret) {
+		trace_handshake_notify_err(net, req, req->hr_sk, ret);
+		if (remove_pending(hn, req))
+			goto out_err;
+	}
+
+	/* Prevent socket release while a handshake request is pending */
+	sock_hold(req->hr_sk);
+
+	trace_handshake_submit(net, req, req->hr_sk);
+	return 0;
+
+out_unlock:
+	spin_unlock(&hn->hn_lock);
+out_err:
+	/* Put back the destructor saved in @req before @req is destroyed,
+	 * otherwise the socket is left with handshake_sk_destruct installed.
+	 */
+	req->hr_sk->sk_destruct = req->hr_odestruct;
+	trace_handshake_submit_err(net, req, req->hr_sk, ret);
+	handshake_req_destroy(req);
+	return ret;
+}
+EXPORT_SYMBOL(handshake_req_submit);
+
+void handshake_complete(struct handshake_req *req, unsigned int status,
+			struct genl_info *info)
+{
+	struct sock *sk = req->hr_sk;
+	struct net *net = sock_net(sk);
+
+	if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+		trace_handshake_complete(net, req, sk, status);
+		req->hr_proto->hp_done(req, status, info);
+
+		/* Handshake request is no longer pending */
+		sock_put(sk);
+	}
+}
+
+/**
+ * handshake_req_cancel - Cancel an in-progress handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ *   %true - Uncompleted handshake request was canceled
+ *   %false - Handshake request already completed or not found
+ */
+bool handshake_req_cancel(struct sock *sk)
+{
+	struct handshake_req *req;
+	struct handshake_net *hn;
+	struct net *net;
+
+	net = sock_net(sk);
+	req = handshake_req_hash_lookup(sk);
+	if (!req) {
+		trace_handshake_cancel_none(net, req, sk);
+		return false;
+	}
+
+	hn = handshake_pernet(net);
+	if (hn && remove_pending(hn, req)) {
+		/* Request hadn't been accepted */
+		goto out_true;
+	}
+	if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+		/* Request already completed */
+		trace_handshake_cancel_busy(net, req, sk);
+		return false;
+	}
+
+out_true:
+	trace_handshake_cancel(net, req, sk);
+
+	/* Handshake request is no longer pending */
+	sock_put(sk);
+	return true;
+}
+EXPORT_SYMBOL(handshake_req_cancel);
diff --git a/net/handshake/trace.c b/net/handshake/trace.c
new file mode 100644
index 000000000000..1c4d8e27e17a
--- /dev/null
+++ b/net/handshake/trace.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace points for transport security layer handshakes.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+
+#include <net/sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "handshake.h"
+
+#define CREATE_TRACE_POINTS
+
+#include <trace/events/handshake.h>
-- 
cgit v1.2.3


From 2fd5532044a89d2403b543520b4902e196f7d165 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 17 Apr 2023 10:32:33 -0400
Subject: net/handshake: Add a kernel API for requesting a TLSv1.3 handshake

To enable kernel consumers of TLS to request a TLS handshake, add
support to net/handshake/ to request a handshake upcall.

This patch also acts as a template for adding handshake upcall
support for other kernel transport layer security providers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/handshake.yaml |   4 +-
 Documentation/networking/index.rst         |   1 +
 Documentation/networking/tls-handshake.rst | 217 +++++++++++++++
 MAINTAINERS                                |   2 +
 include/net/handshake.h                    |  43 +++
 include/uapi/linux/handshake.h             |   2 +
 net/handshake/Makefile                     |   2 +-
 net/handshake/genl.c                       |   3 +-
 net/handshake/genl.h                       |   1 +
 net/handshake/tlshd.c                      | 417 +++++++++++++++++++++++++++++
 10 files changed, 689 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/networking/tls-handshake.rst
 create mode 100644 include/net/handshake.h
 create mode 100644 net/handshake/tlshd.c

(limited to 'include')

diff --git a/Documentation/netlink/specs/handshake.yaml b/Documentation/netlink/specs/handshake.yaml
index 0333d92b1438..614f1a585511 100644
--- a/Documentation/netlink/specs/handshake.yaml
+++ b/Documentation/netlink/specs/handshake.yaml
@@ -16,7 +16,7 @@ definitions:
     type: enum
     name: handler-class
     value-start: 0
-    entries: [ none, max ]
+    entries: [ none, tlshd, max ]
   -
     type: enum
     name: msg-type
@@ -120,3 +120,5 @@ mcast-groups:
   list:
     -
       name: none
+    -
+      name: tlshd
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 24bb256d6d53..a164ff074356 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -36,6 +36,7 @@ Contents:
    scaling
    tls
    tls-offload
+   tls-handshake
    nfc
    6lowpan
    6pack
diff --git a/Documentation/networking/tls-handshake.rst b/Documentation/networking/tls-handshake.rst
new file mode 100644
index 000000000000..a2817a88e905
--- /dev/null
+++ b/Documentation/networking/tls-handshake.rst
@@ -0,0 +1,217 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+In-Kernel TLS Handshake
+=======================
+
+Overview
+========
+
+Transport Layer Security (TLS) is an Upper Layer Protocol (ULP) that runs
+over TCP. TLS provides end-to-end data integrity and confidentiality in
+addition to peer authentication.
+
+The kernel's kTLS implementation handles the TLS record subprotocol, but
+does not handle the TLS handshake subprotocol which is used to establish
+a TLS session. Kernel consumers can use the API described here to
+request TLS session establishment.
+
+There are several possible ways to provide a handshake service in the
+kernel. The API described here is designed to hide the details of those
+implementations so that in-kernel TLS consumers do not need to be
+aware of how the handshake gets done.
+
+
+User handshake agent
+====================
+
+As of this writing, there is no TLS handshake implementation in the
+Linux kernel. To provide a handshake service, a handshake agent
+(typically in user space) is started in each network namespace where a
+kernel consumer might require a TLS handshake. Handshake agents listen
+for events sent from the kernel that indicate a handshake request is
+waiting.
+
+An open socket is passed to a handshake agent via a netlink operation,
+which creates a socket descriptor in the agent's file descriptor table.
+If the handshake completes successfully, the handshake agent promotes
+the socket to use the TLS ULP and sets the session information using the
+SOL_TLS socket options. The handshake agent returns the socket to the
+kernel via a second netlink operation.
+
+
+Kernel Handshake API
+====================
+
+A kernel TLS consumer initiates a client-side TLS handshake on an open
+socket by invoking one of the tls_client_hello() functions. First, it
+fills in a structure that contains the parameters of the request:
+
+.. code-block:: c
+
+  struct tls_handshake_args {
+        struct socket   *ta_sock;
+        tls_done_func_t ta_done;
+        void            *ta_data;
+        unsigned int    ta_timeout_ms;
+        key_serial_t    ta_keyring;
+        key_serial_t    ta_my_cert;
+        key_serial_t    ta_my_privkey;
+        unsigned int    ta_num_peerids;
+        key_serial_t    ta_my_peerids[5];
+  };
+
+The @ta_sock field references an open and connected socket. The consumer
+must hold a reference on the socket to prevent it from being destroyed
+while the handshake is in progress. The consumer must also have
+instantiated a struct file in sock->file.
+
+
+@ta_done contains a callback function that is invoked when the handshake
+has completed. Further explanation of this function is in the "Handshake
+Completion" section below.
+
+The consumer can fill in the @ta_timeout_ms field to force the servicing
+handshake agent to exit after a number of milliseconds. This enables the
+socket to be fully closed once both the kernel and the handshake agent
+have closed their endpoints.
+
+Authentication material such as x.509 certificates, private certificate
+keys, and pre-shared keys are provided to the handshake agent in keys
+that are instantiated by the consumer before making the handshake
+request. The consumer can provide a private keyring that is linked into
+the handshake agent's process keyring in the @ta_keyring field to prevent
+access of those keys by other subsystems.
+
+To request an x.509-authenticated TLS session, the consumer fills in
+the @ta_my_cert and @ta_my_privkey fields with the serial numbers of
+keys containing an x.509 certificate and the private key for that
+certificate. Then, it invokes this function:
+
+.. code-block:: c
+
+  ret = tls_client_hello_x509(args, gfp_flags);
+
+The function returns zero when the handshake request is under way. A
+zero return guarantees the callback function @ta_done will be invoked
+for this socket. The function returns a negative errno if the handshake
+could not be started. A negative errno guarantees the callback function
+@ta_done will not be invoked on this socket.
+
+
+To initiate a client-side TLS handshake with a pre-shared key, use:
+
+.. code-block:: c
+
+  ret = tls_client_hello_psk(args, gfp_flags);
+
+However, in this case, the consumer fills in the @ta_my_peerids array
+with serial numbers of keys containing the peer identities it wishes
+to offer, and the @ta_num_peerids field with the number of array
+entries it has filled in. The other fields are filled in as above.
+
+
+To initiate an anonymous client-side TLS handshake use:
+
+.. code-block:: c
+
+  ret = tls_client_hello_anon(args, gfp_flags);
+
+The handshake agent presents no peer identity information to the remote
+during this type of handshake. Only server authentication (i.e., the client
+verifies the server's identity) is performed during the handshake. Thus
+the established session uses encryption only.
+
+
+Consumers that are in-kernel servers use:
+
+.. code-block:: c
+
+  ret = tls_server_hello_x509(args, gfp_flags);
+
+or
+
+.. code-block:: c
+
+  ret = tls_server_hello_psk(args, gfp_flags);
+
+The argument structure is filled in as above.
+
+
+If the consumer needs to cancel the handshake request, say, due to a ^C
+or other exigent event, the consumer can invoke:
+
+.. code-block:: c
+
+  bool tls_handshake_cancel(sock);
+
+This function returns true if the handshake request associated with
+@sock has been canceled. The consumer's handshake completion callback
+will not be invoked. If this function returns false, then the consumer's
+completion callback has already been invoked.
+
+
+Handshake Completion
+====================
+
+When the handshake agent has completed processing, it notifies the
+kernel that the socket may be used by the consumer again. At this point,
+the consumer's handshake completion callback, provided in the @ta_done
+field in the tls_handshake_args structure, is invoked.
+
+The synopsis of this function is:
+
+.. code-block:: c
+
+  typedef void	(*tls_done_func_t)(void *data, int status,
+                                   key_serial_t peerid);
+
+The consumer provides a cookie in the @ta_data field of the
+tls_handshake_args structure that is returned in the @data parameter of
+this callback. The consumer uses the cookie to match the callback to the
+thread waiting for the handshake to complete.
+
+The success status of the handshake is returned via the @status
+parameter:
+
++------------+----------------------------------------------+
+|  status    |  meaning                                     |
++============+==============================================+
+|  0         |  TLS session established successfully        |
++------------+----------------------------------------------+
+|  -EACCES   |  Remote peer rejected the handshake or       |
+|            |  authentication failed                       |
++------------+----------------------------------------------+
+|  -ENOMEM   |  Temporary resource allocation failure       |
++------------+----------------------------------------------+
+|  -EINVAL   |  Consumer provided an invalid argument       |
++------------+----------------------------------------------+
+|  -ENOKEY   |  Missing authentication material             |
++------------+----------------------------------------------+
+|  -EIO      |  An unexpected fault occurred                |
++------------+----------------------------------------------+
+
+The @peerid parameter contains the serial number of a key containing the
+remote peer's identity or the value TLS_NO_PEERID if the session is not
+authenticated.
+
+A best practice is to close and destroy the socket immediately if the
+handshake failed.
+
+
+Other considerations
+--------------------
+
+While a handshake is under way, the kernel consumer must alter the
+socket's sk_data_ready callback function to ignore all incoming data.
+Once the handshake completion callback function has been invoked, normal
+receive operation can be resumed.
+
+Once a TLS session is established, the consumer must provide a buffer
+for and then examine the control message (CMSG) that is part of every
+subsequent sock_recvmsg(). Each control message indicates whether the
+received message data is TLS record data or session metadata.
+
+See tls.rst for details on how a kTLS consumer recognizes incoming
+(decrypted) application data, alerts, and handshake packets once the
+socket has been promoted to use the TLS ULP.
diff --git a/MAINTAINERS b/MAINTAINERS
index cdc7748d15b8..04ebde8ccb75 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8953,6 +8953,8 @@ L:	kernel-tls-handshake@lists.linux.dev
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/netlink/specs/handshake.yaml
+F:	Documentation/networking/tls-handshake.rst
+F:	include/net/handshake.h
 F:	include/trace/events/handshake.h
 F:	net/handshake/
 
diff --git a/include/net/handshake.h b/include/net/handshake.h
new file mode 100644
index 000000000000..3352b1ab43b3
--- /dev/null
+++ b/include/net/handshake.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink HANDSHAKE service.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _NET_HANDSHAKE_H
+#define _NET_HANDSHAKE_H
+
+enum {
+	TLS_NO_KEYRING = 0,
+	TLS_NO_PEERID = 0,
+	TLS_NO_CERT = 0,
+	TLS_NO_PRIVKEY = 0,
+};
+
+typedef void	(*tls_done_func_t)(void *data, int status,
+				   key_serial_t peerid);
+
+struct tls_handshake_args {
+	struct socket		*ta_sock;
+	tls_done_func_t		ta_done;
+	void			*ta_data;
+	unsigned int		ta_timeout_ms;
+	key_serial_t		ta_keyring;
+	key_serial_t		ta_my_cert;
+	key_serial_t		ta_my_privkey;
+	unsigned int		ta_num_peerids;
+	key_serial_t		ta_my_peerids[5];
+};
+
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags);
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags);
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags);
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags);
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags);
+
+bool tls_handshake_cancel(struct sock *sk);
+
+#endif /* _NET_HANDSHAKE_H */
diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
index 7f66ff489b87..1de4d0b95325 100644
--- a/include/uapi/linux/handshake.h
+++ b/include/uapi/linux/handshake.h
@@ -11,6 +11,7 @@
 
 enum handshake_handler_class {
 	HANDSHAKE_HANDLER_CLASS_NONE,
+	HANDSHAKE_HANDLER_CLASS_TLSHD,
 	HANDSHAKE_HANDLER_CLASS_MAX,
 };
 
@@ -67,5 +68,6 @@ enum {
 };
 
 #define HANDSHAKE_MCGRP_NONE	"none"
+#define HANDSHAKE_MCGRP_TLSHD	"tlshd"
 
 #endif /* _UAPI_LINUX_HANDSHAKE_H */
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
index d38736de45da..a089f7e3df24 100644
--- a/net/handshake/Makefile
+++ b/net/handshake/Makefile
@@ -8,4 +8,4 @@
 #
 
 obj-y += handshake.o
-handshake-y := genl.o netlink.o request.o trace.o
+handshake-y := genl.o netlink.o request.o tlshd.o trace.o
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
index 652f37d19bd6..9f29efb1493e 100644
--- a/net/handshake/genl.c
+++ b/net/handshake/genl.c
@@ -12,7 +12,7 @@
 
 /* HANDSHAKE_CMD_ACCEPT - do */
 static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
-	[HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 1),
+	[HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 2),
 };
 
 /* HANDSHAKE_CMD_DONE - do */
@@ -42,6 +42,7 @@ static const struct genl_split_ops handshake_nl_ops[] = {
 
 static const struct genl_multicast_group handshake_nl_mcgrps[] = {
 	[HANDSHAKE_NLGRP_NONE] = { "none", },
+	[HANDSHAKE_NLGRP_TLSHD] = { "tlshd", },
 };
 
 struct genl_family handshake_nl_family __ro_after_init = {
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
index a1eb7ccccc7f..2c1f1aa6a02a 100644
--- a/net/handshake/genl.h
+++ b/net/handshake/genl.h
@@ -16,6 +16,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
 
 enum {
 	HANDSHAKE_NLGRP_NONE,
+	HANDSHAKE_NLGRP_TLSHD,
 };
 
 extern struct genl_family handshake_nl_family;
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
new file mode 100644
index 000000000000..1b8353296060
--- /dev/null
+++ b/net/handshake/tlshd.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Establish a TLS session for a kernel socket consumer
+ * using the tlshd user space handler.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2021-2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/keyctl.h>
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+struct tls_handshake_req {
+	void			(*th_consumer_done)(void *data, int status,
+						    key_serial_t peerid);
+	void			*th_consumer_data;
+
+	int			th_type;
+	unsigned int		th_timeout_ms;
+	int			th_auth_mode;
+	key_serial_t		th_keyring;
+	key_serial_t		th_certificate;
+	key_serial_t		th_privkey;
+
+	unsigned int		th_num_peerids;
+	key_serial_t		th_peerid[5];
+};
+
+static struct tls_handshake_req *
+tls_handshake_req_init(struct handshake_req *req,
+		       const struct tls_handshake_args *args)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+
+	treq->th_timeout_ms = args->ta_timeout_ms;
+	treq->th_consumer_done = args->ta_done;
+	treq->th_consumer_data = args->ta_data;
+	treq->th_keyring = args->ta_keyring;
+	treq->th_num_peerids = 0;
+	treq->th_certificate = TLS_NO_CERT;
+	treq->th_privkey = TLS_NO_PRIVKEY;
+	return treq;
+}
+
+static void tls_handshake_remote_peerids(struct tls_handshake_req *treq,
+					 struct genl_info *info)
+{
+	struct nlattr *head = nlmsg_attrdata(info->nlhdr, GENL_HDRLEN);
+	int rem, len = nlmsg_attrlen(info->nlhdr, GENL_HDRLEN);
+	struct nlattr *nla;
+	unsigned int i;
+
+	i = 0;
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+			i++;
+	}
+	if (!i)
+		return;
+	treq->th_num_peerids = min_t(unsigned int, i,
+				     ARRAY_SIZE(treq->th_peerid));
+
+	i = 0;
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla_type(nla) == HANDSHAKE_A_DONE_REMOTE_AUTH)
+			treq->th_peerid[i++] = nla_get_u32(nla);
+		if (i >= treq->th_num_peerids)
+			break;
+	}
+}
+
+/**
+ * tls_handshake_done - callback to handle a CMD_DONE request
+ * @req: handshake request that is being completed
+ * @status: session status code
+ * @info: full results of session establishment
+ *
+ */
+static void tls_handshake_done(struct handshake_req *req,
+			       unsigned int status, struct genl_info *info)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+
+	treq->th_peerid[0] = TLS_NO_PEERID;
+	if (info)
+		tls_handshake_remote_peerids(treq, info);
+
+	treq->th_consumer_done(treq->th_consumer_data, -status,
+			       treq->th_peerid[0]);
+}
+
+#if IS_ENABLED(CONFIG_KEYS)
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+	key_ref_t process_keyring_ref, keyring_ref;
+	int ret;
+
+	if (treq->th_keyring == TLS_NO_KEYRING)
+		return 0;
+
+	process_keyring_ref = lookup_user_key(KEY_SPEC_PROCESS_KEYRING,
+					      KEY_LOOKUP_CREATE,
+					      KEY_NEED_WRITE);
+	if (IS_ERR(process_keyring_ref)) {
+		ret = PTR_ERR(process_keyring_ref);
+		goto out;
+	}
+
+	keyring_ref = lookup_user_key(treq->th_keyring, KEY_LOOKUP_CREATE,
+				      KEY_NEED_LINK);
+	if (IS_ERR(keyring_ref)) {
+		ret = PTR_ERR(keyring_ref);
+		goto out_put_key;
+	}
+
+	ret = key_link(key_ref_to_ptr(process_keyring_ref),
+		       key_ref_to_ptr(keyring_ref));
+
+	key_ref_put(keyring_ref);
+out_put_key:
+	key_ref_put(process_keyring_ref);
+out:
+	return ret;
+}
+#else
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+	return 0;
+}
+#endif
+
+static int tls_handshake_put_peer_identity(struct sk_buff *msg,
+					   struct tls_handshake_req *treq)
+{
+	unsigned int i;
+
+	for (i = 0; i < treq->th_num_peerids; i++)
+		if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+				treq->th_peerid[i]) < 0)
+			return -EMSGSIZE;
+	return 0;
+}
+
+static int tls_handshake_put_certificate(struct sk_buff *msg,
+					 struct tls_handshake_req *treq)
+{
+	struct nlattr *entry_attr;
+
+	if (treq->th_certificate == TLS_NO_CERT &&
+	    treq->th_privkey == TLS_NO_PRIVKEY)
+		return 0;
+
+	entry_attr = nla_nest_start(msg, HANDSHAKE_A_ACCEPT_CERTIFICATE);
+	if (!entry_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(msg, HANDSHAKE_A_X509_CERT,
+			treq->th_certificate) ||
+	    nla_put_u32(msg, HANDSHAKE_A_X509_PRIVKEY,
+			treq->th_privkey)) {
+		nla_nest_cancel(msg, entry_attr);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(msg, entry_attr);
+	return 0;
+}
+
+/**
+ * tls_handshake_accept - callback to construct a CMD_ACCEPT response
+ * @req: handshake parameters to return
+ * @info: generic netlink message context
+ * @fd: file descriptor to be returned
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+static int tls_handshake_accept(struct handshake_req *req,
+				struct genl_info *info, int fd)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+	struct nlmsghdr *hdr;
+	struct sk_buff *msg;
+	int ret;
+
+	ret = tls_handshake_private_keyring(treq);
+	if (ret < 0)
+		goto out;
+
+	ret = -ENOMEM;
+	msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		goto out;
+	hdr = handshake_genl_put(msg, info);
+	if (!hdr)
+		goto out_cancel;
+
+	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
+	if (ret < 0)
+		goto out_cancel;
+	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type);
+	if (ret < 0)
+		goto out_cancel;
+	if (treq->th_timeout_ms) {
+		ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms);
+		if (ret < 0)
+			goto out_cancel;
+	}
+
+	ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE, treq->th_auth_mode);
+	if (ret < 0)
+		goto out_cancel;
+	switch (treq->th_auth_mode) {
+	case HANDSHAKE_AUTH_PSK:
+		ret = tls_handshake_put_peer_identity(msg, treq);
+		if (ret < 0)
+			goto out_cancel;
+		break;
+	case HANDSHAKE_AUTH_X509:
+		ret = tls_handshake_put_certificate(msg, treq);
+		if (ret < 0)
+			goto out_cancel;
+		break;
+	}
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+out_cancel:
+	genlmsg_cancel(msg, hdr);
+	/* Cancelling does not release the skb; free the unsent reply */
+	nlmsg_free(msg);
+out:
+	return ret;
+}
+
+static const struct handshake_proto tls_handshake_proto = {
+	.hp_handler_class	= HANDSHAKE_HANDLER_CLASS_TLSHD,
+	.hp_privsize		= sizeof(struct tls_handshake_req),
+
+	.hp_accept		= tls_handshake_accept,
+	.hp_done		= tls_handshake_done,
+};
+
+/**
+ * tls_client_hello_anon - request an anonymous TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueued; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_UNAUTH;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_anon);
+
+/**
+ * tls_client_hello_x509 - request an x.509-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueued; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+	treq->th_certificate = args->ta_my_cert;
+	treq->th_privkey = args->ta_my_privkey;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_x509);
+
+/**
+ * tls_client_hello_psk - request a PSK-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueued; ->done will be called when complete
+ *   %-EINVAL: Wrong number of local peer IDs
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+	unsigned int i;
+
+	if (!args->ta_num_peerids ||
+	    args->ta_num_peerids > ARRAY_SIZE(treq->th_peerid))
+		return -EINVAL;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+	treq->th_num_peerids = args->ta_num_peerids;
+	for (i = 0; i < args->ta_num_peerids; i++)
+		treq->th_peerid[i] = args->ta_my_peerids[i];
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_psk);
+
+/**
+ * tls_server_hello_x509 - request an x.509-based server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueued; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+	treq->th_certificate = args->ta_my_cert;
+	treq->th_privkey = args->ta_my_privkey;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_x509);
+
+/**
+ * tls_server_hello_psk - request a PSK-based server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ *   %0: Handshake request enqueued; ->done will be called when complete
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+	treq->th_num_peerids = 1;
+	treq->th_peerid[0] = args->ta_my_peerids[0];
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_psk);
+
+/**
+ * tls_handshake_cancel - cancel a pending handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ *   %true - Uncompleted handshake request was canceled
+ *   %false - Handshake request already completed or not found
+ */
+bool tls_handshake_cancel(struct sock *sk)
+{
+	return handshake_req_cancel(sk);
+}
+EXPORT_SYMBOL(tls_handshake_cancel);
-- 
cgit v1.2.3


From b0bc615df488abd0e95107e4a9ecefb9bf8c250a Mon Sep 17 00:00:00 2001
From: Maher Sanalla <msanalla@nvidia.com>
Date: Tue, 21 Mar 2023 00:10:16 +0200
Subject: net/mlx5: Add vnic devlink health reporter to PFs/VFs

Create a vnic devlink health reporter for PFs/VFs interfaces.
The reporter's diagnose callback displays the values of vNIC/vport
transport debug counters of PFs/VFs, as follows:

$ devlink health diagnose pci/0000:08:00.0 reporter vnic
 vNIC env counters:
    total_error_queues: 0 send_queue_priority_update_flow: 0
    comp_eq_overrun: 0 async_eq_overrun: 0 cq_overrun: 0
    invalid_command: 0 quota_exceeded_command: 0
    nic_receive_steering_discard: 0

Moreover, add documentation on the reporter functionality and the
counters description.

While at it, expose the vNIC counters diagnose function to be used by
the downstream patch, which will reveal the counters for representor
interfaces.

Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 .../ethernet/mellanox/mlx5/devlink.rst             |  30 +++++
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 .../mellanox/mlx5/core/diag/reporter_vnic.c        | 125 +++++++++++++++++++++
 .../mellanox/mlx5/core/diag/reporter_vnic.h        |  16 +++
 drivers/net/ethernet/mellanox/mlx5/core/health.c   |   4 +
 include/linux/mlx5/driver.h                        |   1 +
 6 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst
index 0995e4e5acd7..ceab18e46456 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5/devlink.rst
@@ -257,3 +257,33 @@ User commands examples:
     $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal
 
 NOTE: This command can run only on PF.
+
+vnic reporter
+-------------
+The vnic reporter implements only the `diagnose` callback.
+It is responsible for querying the vnic diagnostic counters from fw and displaying
+them in realtime.
+
+Description of the vnic counters:
+total_error_queues: number of queues in an error state due to
+an async error or errored command.
+send_queue_priority_update_flow: number of QP/SQ priority/SL update
+events.
+cq_overrun: number of times CQ entered an error state due to an
+overflow.
+async_eq_overrun: number of times an EQ mapped to async events was
+overrun.
+comp_eq_overrun: number of times an EQ mapped to completion events was
+overrun.
+quota_exceeded_command: number of commands issued and failed due to quota
+exceeded.
+invalid_command: number of commands issued and failed due to any reason
+other than quota exceeded.
+nic_receive_steering_discard: number of packets that completed RX flow
+steering but were discarded due to a mismatch in flow table.
+
+User commands examples:
+- Diagnose PF/VF vnic counters
+        $ devlink health diagnose pci/0000:82:00.1 reporter vnic
+
+NOTE: This command can run only on PF/VF ports.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 68f6a4544f7e..ddf1e352f51d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -16,7 +16,7 @@ mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
 		transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \
 		fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \
 		lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \
-		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o \
+		diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \
 		fw_reset.o qos.o lib/tout.o lib/aso.o
 
 #
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
new file mode 100644
index 000000000000..9114661cd967
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. */
+
+#include "reporter_vnic.h"
+#include "devlink.h"
+
+#define VNIC_ENV_GET64(vnic_env_stats, c) \
+	MLX5_GET64(query_vnic_env_out, (vnic_env_stats)->query_vnic_env_out, \
+		 vport_env.c)
+
+struct mlx5_vnic_diag_stats {
+	__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
+};
+
+int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
+					 struct devlink_fmsg *fmsg,
+					 u16 vport_num, bool other_vport)
+{
+	u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {};
+	struct mlx5_vnic_diag_stats vnic;
+	int err;
+
+	MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV);
+	MLX5_SET(query_vnic_env_in, in, vport_number, vport_num);
+	MLX5_SET(query_vnic_env_in, in, other_vport, !!other_vport);
+
+	err = mlx5_cmd_exec_inout(dev, query_vnic_env, in, &vnic.query_vnic_env_out);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_start(fmsg, "vNIC env counters");
+	if (err)
+		return err;
+
+	err = devlink_fmsg_obj_nest_start(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "total_error_queues",
+					VNIC_ENV_GET64(&vnic, total_error_queues));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "send_queue_priority_update_flow",
+					VNIC_ENV_GET64(&vnic, send_queue_priority_update_flow));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "comp_eq_overrun",
+					VNIC_ENV_GET64(&vnic, comp_eq_overrun));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "async_eq_overrun",
+					VNIC_ENV_GET64(&vnic, async_eq_overrun));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "cq_overrun",
+					VNIC_ENV_GET64(&vnic, cq_overrun));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "invalid_command",
+					VNIC_ENV_GET64(&vnic, invalid_command));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "quota_exceeded_command",
+					VNIC_ENV_GET64(&vnic, quota_exceeded_command));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_u64_pair_put(fmsg, "nic_receive_steering_discard",
+					VNIC_ENV_GET64(&vnic, nic_receive_steering_discard));
+	if (err)
+		return err;
+
+	err = devlink_fmsg_obj_nest_end(fmsg);
+	if (err)
+		return err;
+
+	err = devlink_fmsg_pair_nest_end(fmsg);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static int mlx5_reporter_vnic_diagnose(struct devlink_health_reporter *reporter,
+				       struct devlink_fmsg *fmsg,
+				       struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
+
+	return mlx5_reporter_vnic_diagnose_counters(dev, fmsg, 0, false);
+}
+
+static const struct devlink_health_reporter_ops mlx5_reporter_vnic_ops = {
+	.name = "vnic",
+	.diagnose = mlx5_reporter_vnic_diagnose,
+};
+
+void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct devlink *devlink = priv_to_devlink(dev);
+
+	health->vnic_reporter =
+		devlink_health_reporter_create(devlink,
+					       &mlx5_reporter_vnic_ops,
+					       0, dev);
+	if (IS_ERR(health->vnic_reporter))
+		mlx5_core_warn(dev,
+			       "Failed to create vnic reporter, err = %ld\n",
+			       PTR_ERR(health->vnic_reporter));
+}
+
+void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	if (!IS_ERR_OR_NULL(health->vnic_reporter))
+		devlink_health_reporter_destroy(health->vnic_reporter);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h
new file mode 100644
index 000000000000..eba87a39e9b1
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef __MLX5_REPORTER_VNIC_H
+#define __MLX5_REPORTER_VNIC_H
+
+#include "mlx5_core.h"
+
+void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev);
+void mlx5_reporter_vnic_destroy(struct mlx5_core_dev *dev);
+
+int mlx5_reporter_vnic_diagnose_counters(struct mlx5_core_dev *dev,
+					 struct devlink_fmsg *fmsg,
+					 u16 vport_num, bool other_vport);
+
+#endif /* __MLX5_REPORTER_VNIC_H */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 016c5f99c470..871c32dda66e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -42,6 +42,7 @@
 #include "lib/pci_vsc.h"
 #include "lib/tout.h"
 #include "diag/fw_tracer.h"
+#include "diag/reporter_vnic.h"
 
 enum {
 	MAX_MISSES			= 3,
@@ -898,6 +899,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
 
 	cancel_delayed_work_sync(&health->update_fw_log_ts_work);
 	destroy_workqueue(health->wq);
+	mlx5_reporter_vnic_destroy(dev);
 	mlx5_fw_reporters_destroy(dev);
 }
 
@@ -907,6 +909,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	char *name;
 
 	mlx5_fw_reporters_create(dev);
+	mlx5_reporter_vnic_create(dev);
 
 	health = &dev->priv.health;
 	name = kmalloc(64, GFP_KERNEL);
@@ -926,6 +929,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
 	return 0;
 
 out_err:
+	mlx5_reporter_vnic_destroy(dev);
 	mlx5_fw_reporters_destroy(dev);
 	return -ENOMEM;
 }
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 135a3c8d8237..5d25c4c73046 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -439,6 +439,7 @@ struct mlx5_core_health {
 	struct work_struct		report_work;
 	struct devlink_health_reporter *fw_reporter;
 	struct devlink_health_reporter *fw_fatal_reporter;
+	struct devlink_health_reporter *vnic_reporter;
 	struct delayed_work		update_fw_log_ts_work;
 };
 
-- 
cgit v1.2.3


From f9c895a72a390656f9582e048fdcc3d2cec1dd7c Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Wed, 22 Mar 2023 10:21:46 +0200
Subject: net/mlx5: Update op_mode to op_mod for port selection

To be consistent with the other enum keys use OP_MOD
instead of OP_MODE.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +-
 include/linux/mlx5/mlx5_ifc.h                  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index a95d1218def9..89a65779494e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -717,7 +717,7 @@ static int handle_hca_cap_port_selection(struct mlx5_core_dev *dev,
 	       MLX5_ST_SZ_BYTES(port_selection_cap));
 	MLX5_SET(port_selection_cap, set_hca_cap, port_select_flow_table_bypass, 1);
 
-	err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION);
+	err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_PORT_SELECTION);
 
 	return err;
 }
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 20d00e09b168..b42696d74c9f 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -69,7 +69,7 @@ enum {
 	MLX5_SET_HCA_CAP_OP_MOD_ATOMIC                = 0x3,
 	MLX5_SET_HCA_CAP_OP_MOD_ROCE                  = 0x4,
 	MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2       = 0x20,
-	MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION       = 0x25,
+	MLX5_SET_HCA_CAP_OP_MOD_PORT_SELECTION        = 0x25,
 };
 
 enum {
-- 
cgit v1.2.3


From dd64b232deb8d48812a2ea739d1fedaeaffb59ed Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 19 Apr 2023 11:20:06 -0700
Subject: page_pool: unlink from napi during destroy

Jesper points out that we must prevent recycling into cache
after page_pool_destroy() is called, because page_pool_destroy()
is not synchronized with recycling (some pages may still be
outstanding when destroy() gets called).

I assumed this will not happen because NAPI can't be scheduled
if its page pool is being destroyed. But I missed the fact that
NAPI may get reused. For instance when user changes ring configuration
driver may allocate a new page pool, stop NAPI, swap, start NAPI,
and then destroy the old pool. The NAPI is running so old page
pool will think it can recycle to the cache, but the consumer
at that point is the destroy() path, not NAPI.

To avoid extra synchronization let the drivers do "unlinking"
during the "swap" stage while NAPI is indeed disabled.

Fixes: 8c48eea3adf3 ("page_pool: allow caching from safely localized NAPI")
Reported-by: Jesper Dangaard Brouer <jbrouer@redhat.com>
Link: https://lore.kernel.org/all/e8df2654-6a5b-3c92-489d-2fe5e444135f@redhat.com/
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Link: https://lore.kernel.org/r/20230419182006.719923-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/page_pool.h |  5 +++++
 net/core/page_pool.c    | 18 +++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index 91b808dade82..c8ec2f34722b 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -247,6 +247,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params);
 struct xdp_mem_info;
 
 #ifdef CONFIG_PAGE_POOL
+void page_pool_unlink_napi(struct page_pool *pool);
 void page_pool_destroy(struct page_pool *pool);
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 			   struct xdp_mem_info *mem);
@@ -254,6 +255,10 @@ void page_pool_release_page(struct page_pool *pool, struct page *page);
 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count);
 #else
+static inline void page_pool_unlink_napi(struct page_pool *pool)
+{
+}
+
 static inline void page_pool_destroy(struct page_pool *pool)
 {
 }
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 97f20f7ff4fc..e212e9d7edcb 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -839,6 +839,21 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 	pool->xdp_mem_id = mem->id;
 }
 
+void page_pool_unlink_napi(struct page_pool *pool)
+{
+	if (!pool->p.napi)
+		return;
+
+	/* To avoid races with recycling and additional barriers make sure
+	 * pool and NAPI are unlinked when NAPI is disabled.
+	 */
+	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
+		READ_ONCE(pool->p.napi->list_owner) != -1);
+
+	WRITE_ONCE(pool->p.napi, NULL);
+}
+EXPORT_SYMBOL(page_pool_unlink_napi);
+
 void page_pool_destroy(struct page_pool *pool)
 {
 	if (!pool)
@@ -847,6 +862,7 @@ void page_pool_destroy(struct page_pool *pool)
 	if (!page_pool_put(pool))
 		return;
 
+	page_pool_unlink_napi(pool);
 	page_pool_free_frag(pool);
 
 	if (!page_pool_release(pool))
@@ -900,7 +916,7 @@ bool page_pool_return_skb_page(struct page *page, bool napi_safe)
 	 * in the same context as the consumer would run, so there's
 	 * no possible race.
 	 */
-	napi = pp->p.napi;
+	napi = READ_ONCE(pp->p.napi);
 	allow_direct = napi_safe && napi &&
 		READ_ONCE(napi->list_owner) == smp_processor_id();
 
-- 
cgit v1.2.3


From 8c966a10eb8478dea333a0f7f77b11c3cb3f7dcb Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Wed, 19 Apr 2023 13:34:35 +0200
Subject: flow_dissector: Address kdoc warnings

Address a number of warnings flagged by
./scripts/kernel-doc -none include/net/flow_dissector.h

 include/net/flow_dissector.h:23: warning: Function parameter or member 'addr_type' not described in 'flow_dissector_key_control'
 include/net/flow_dissector.h:23: warning: Function parameter or member 'flags' not described in 'flow_dissector_key_control'
 include/net/flow_dissector.h:46: warning: Function parameter or member 'padding' not described in 'flow_dissector_key_basic'
 include/net/flow_dissector.h:145: warning: Function parameter or member 'tipckey' not described in 'flow_dissector_key_addrs'
 include/net/flow_dissector.h:157: warning: cannot understand function prototype: 'struct flow_dissector_key_arp '
 include/net/flow_dissector.h:171: warning: cannot understand function prototype: 'struct flow_dissector_key_ports '
 include/net/flow_dissector.h:203: warning: cannot understand function prototype: 'struct flow_dissector_key_icmp '

Also improve indentation on adjacent lines to those changed
to address the above.

No functional changes intended.

Signed-off-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20230419-flow-dissector-kdoc-v1-1-1aa0cca1118b@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/flow_dissector.h | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 5ccf52ef8809..85b2281576ed 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -14,7 +14,9 @@ struct sk_buff;
 
 /**
  * struct flow_dissector_key_control:
- * @thoff: Transport header offset
+ * @thoff:     Transport header offset
+ * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_*
+ * @flags:     Key flags. Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION)
  */
 struct flow_dissector_key_control {
 	u16	thoff;
@@ -36,8 +38,9 @@ enum flow_dissect_ret {
 
 /**
  * struct flow_dissector_key_basic:
- * @n_proto: Network header protocol (eg. IPv4/IPv6)
+ * @n_proto:  Network header protocol (eg. IPv4/IPv6)
  * @ip_proto: Transport header protocol (eg. TCP/UDP)
+ * @padding:  Unused
  */
 struct flow_dissector_key_basic {
 	__be16	n_proto;
@@ -135,6 +138,7 @@ struct flow_dissector_key_tipc {
  * struct flow_dissector_key_addrs:
  * @v4addrs: IPv4 addresses
  * @v6addrs: IPv6 addresses
+ * @tipckey: TIPC key
  */
 struct flow_dissector_key_addrs {
 	union {
@@ -145,14 +149,12 @@ struct flow_dissector_key_addrs {
 };
 
 /**
- * flow_dissector_key_arp:
- *	@ports: Operation, source and target addresses for an ARP header
- *              for Ethernet hardware addresses and IPv4 protocol addresses
- *		sip: Sender IP address
- *		tip: Target IP address
- *		op:  Operation
- *		sha: Sender hardware address
- *		tpa: Target hardware address
+ * struct flow_dissector_key_arp:
+ * @sip: Sender IP address
+ * @tip: Target IP address
+ * @op:  Operation
+ * @sha: Sender hardware address
+ * @tha: Target hardware address
  */
 struct flow_dissector_key_arp {
 	__u32 sip;
@@ -163,10 +165,10 @@ struct flow_dissector_key_arp {
 };
 
 /**
- * flow_dissector_key_tp_ports:
- *	@ports: port numbers of Transport header
- *		src: source port number
- *		dst: destination port number
+ * struct flow_dissector_key_ports:
+ * @ports: port numbers of Transport header
+ * @src: source port number
+ * @dst: destination port number
  */
 struct flow_dissector_key_ports {
 	union {
@@ -195,10 +197,10 @@ struct flow_dissector_key_ports_range {
 };
 
 /**
- * flow_dissector_key_icmp:
- *		type: ICMP type
- *		code: ICMP code
- *		id:   session identifier
+ * struct flow_dissector_key_icmp:
+ * @type: ICMP type
+ * @code: ICMP code
+ * @id:   Session identifier
  */
 struct flow_dissector_key_icmp {
 	struct {
-- 
cgit v1.2.3


From 8fa66e4a1bdd41d55d7842928e60a40fed65715d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 19 Apr 2023 19:00:05 -0700
Subject: net: skbuff: update and rename __kfree_skb_defer()

__kfree_skb_defer() uses the old naming where "defer" meant
slab bulk free/alloc APIs. In the meantime we also made
__kfree_skb_defer() feed the per-NAPI skb cache, which
implies bulk APIs. So take away the 'defer' and add 'napi'.

While at it add a drop reason. This only matters on the
tx_action path, if the skb has a frag_list. But getting
rid of a SKB_DROP_REASON_NOT_SPECIFIED seems like a net
benefit so why not.

Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20230420020005.815854-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 2 +-
 net/core/dev.c         | 3 ++-
 net/core/gro.c         | 2 +-
 net/core/skbuff.c      | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index db5973559042..03aa7ed076f0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3253,7 +3253,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void napi_skb_free_stolen_head(struct sk_buff *skb);
-void __kfree_skb_defer(struct sk_buff *skb);
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);
 
 /**
  * __dev_alloc_pages - allocate page for network Rx
diff --git a/net/core/dev.c b/net/core/dev.c
index 3fc4dba71f9d..1551aabac343 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5040,7 +5040,8 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 				__kfree_skb(skb);
 			else
-				__kfree_skb_defer(skb);
+				__napi_kfree_skb(skb,
+						 get_kfree_skb_cb(skb)->reason);
 		}
 	}
 
diff --git a/net/core/gro.c b/net/core/gro.c
index a606705a0859..2d84165cb4f1 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -633,7 +633,7 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
 		else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
 			__kfree_skb(skb);
 		else
-			__kfree_skb_defer(skb);
+			__napi_kfree_skb(skb, SKB_CONSUMED);
 		break;
 
 	case GRO_HELD:
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 768f9d04911f..8764653bede7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1226,9 +1226,9 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 	}
 }
 
-void __kfree_skb_defer(struct sk_buff *skb)
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, true);
+	skb_release_all(skb, reason, true);
 	napi_skb_cache_put(skb);
 }
 
-- 
cgit v1.2.3


From 7ab75456be144a354fbb3df1516d82fc24d3d67d Mon Sep 17 00:00:00 2001
From: Mahesh Bandewar <maheshb@google.com>
Date: Tue, 18 Apr 2023 18:32:38 -0700
Subject: ipv6: add icmpv6_error_anycast_as_unicast for ICMPv6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ICMPv6 error packets are not sent to the anycast destinations and this
prevents things like traceroute from working. So create a setting similar
to ECHO when dealing with Anycast sources (icmpv6_echo_ignore_anycast).

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Maciej Żenczykowski <maze@google.com>
Link: https://lore.kernel.org/r/20230419013238.2691167-1-maheshb@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ip-sysctl.rst |  7 +++++++
 include/net/netns/ipv6.h               |  1 +
 net/ipv6/af_inet6.c                    |  1 +
 net/ipv6/icmp.c                        | 15 +++++++++++++--
 4 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 58a78a316697..6ec06a33688a 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2721,6 +2721,13 @@ echo_ignore_anycast - BOOLEAN
 
 	Default: 0
 
+error_anycast_as_unicast - BOOLEAN
+	If set to 1, then the kernel will respond with ICMP Errors
+	resulting from requests sent to it over the IPv6 protocol destined
+	to an anycast address, essentially treating anycast as unicast.
+
+	Default: 0
+
 xfrm6_gc_thresh - INTEGER
 	(Obsolete since linux-4.14)
 	The threshold at which we will start garbage collecting for IPv6
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index b4af4837d80b..3cceb3e9320b 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -55,6 +55,7 @@ struct netns_sysctl_ipv6 {
 	u64 ioam6_id_wide;
 	bool skip_notify_on_dev_down;
 	u8 fib_notify_on_flag_change;
+	u8 icmpv6_error_anycast_as_unicast;
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index e1b679a590c9..2bbf13216a3d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -952,6 +952,7 @@ static int __net_init inet6_net_init(struct net *net)
 	net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
 	net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
 	net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
+	net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;
 
 	/* By default, rate limit error messages.
 	 * Except for pmtu discovery, it would break it.
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 1f53f2a74480..9edf1f45b1ed 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -362,9 +362,10 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
 
 	/*
 	 * We won't send icmp if the destination is known
-	 * anycast.
+	 * anycast unless we need to treat anycast as unicast.
 	 */
-	if (ipv6_anycast_destination(dst, &fl6->daddr)) {
+	if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
+	    ipv6_anycast_destination(dst, &fl6->daddr)) {
 		net_dbg_ratelimited("icmp6_send: acast source\n");
 		dst_release(dst);
 		return ERR_PTR(-EINVAL);
@@ -1195,6 +1196,15 @@ static struct ctl_table ipv6_icmp_table_template[] = {
 		.mode		= 0644,
 		.proc_handler = proc_do_large_bitmap,
 	},
+	{
+		.procname	= "error_anycast_as_unicast",
+		.data		= &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{ },
 };
 
@@ -1212,6 +1222,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
 		table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
 		table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
 		table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
+		table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
 	}
 	return table;
 }
-- 
cgit v1.2.3


From 5b8285cca6fed9bc5baabe2e5699a5a5c0d96371 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Apr 2023 14:52:52 +0200
Subject: net: move dropreason.h to dropreason-core.h

This will, after the next patch, hold only the core
drop reasons and minimal infrastructure. Fix a small
kernel-doc issue while at it, to avoid the move
triggering a checker.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h     |   2 +-
 include/linux/skbuff.h        |   2 +-
 include/net/dropreason-core.h | 364 ++++++++++++++++++++++++++++++++++++++++++
 include/net/dropreason.h      | 363 -----------------------------------------
 include/net/inet_frag.h       |   2 +-
 5 files changed, 367 insertions(+), 366 deletions(-)
 create mode 100644 include/net/dropreason-core.h
 delete mode 100644 include/net/dropreason.h

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 203c0df2046c..a6a3e9457d6c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,7 +52,7 @@
 #include <linux/rbtree.h>
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
-#include <net/dropreason.h>
+#include <net/dropreason-core.h>
 
 struct netpoll_info;
 struct device;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03aa7ed076f0..eb9e7bb76fa6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -37,7 +37,7 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 #endif
 #include <net/net_debug.h>
-#include <net/dropreason.h>
+#include <net/dropreason-core.h>
 
 /**
  * DOC: skb checksums
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
new file mode 100644
index 000000000000..ade6d5b9186c
--- /dev/null
+++ b/include/net/dropreason-core.h
@@ -0,0 +1,364 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_DROPREASON_CORE_H
+#define _LINUX_DROPREASON_CORE_H
+
+#define DEFINE_DROP_REASON(FN, FNe)	\
+	FN(NOT_SPECIFIED)		\
+	FN(NO_SOCKET)			\
+	FN(PKT_TOO_SMALL)		\
+	FN(TCP_CSUM)			\
+	FN(SOCKET_FILTER)		\
+	FN(UDP_CSUM)			\
+	FN(NETFILTER_DROP)		\
+	FN(OTHERHOST)			\
+	FN(IP_CSUM)			\
+	FN(IP_INHDR)			\
+	FN(IP_RPFILTER)			\
+	FN(UNICAST_IN_L2_MULTICAST)	\
+	FN(XFRM_POLICY)			\
+	FN(IP_NOPROTO)			\
+	FN(SOCKET_RCVBUFF)		\
+	FN(PROTO_MEM)			\
+	FN(TCP_MD5NOTFOUND)		\
+	FN(TCP_MD5UNEXPECTED)		\
+	FN(TCP_MD5FAILURE)		\
+	FN(SOCKET_BACKLOG)		\
+	FN(TCP_FLAGS)			\
+	FN(TCP_ZEROWINDOW)		\
+	FN(TCP_OLD_DATA)		\
+	FN(TCP_OVERWINDOW)		\
+	FN(TCP_OFOMERGE)		\
+	FN(TCP_RFC7323_PAWS)		\
+	FN(TCP_INVALID_SEQUENCE)	\
+	FN(TCP_RESET)			\
+	FN(TCP_INVALID_SYN)		\
+	FN(TCP_CLOSE)			\
+	FN(TCP_FASTOPEN)		\
+	FN(TCP_OLD_ACK)			\
+	FN(TCP_TOO_OLD_ACK)		\
+	FN(TCP_ACK_UNSENT_DATA)		\
+	FN(TCP_OFO_QUEUE_PRUNE)		\
+	FN(TCP_OFO_DROP)		\
+	FN(IP_OUTNOROUTES)		\
+	FN(BPF_CGROUP_EGRESS)		\
+	FN(IPV6DISABLED)		\
+	FN(NEIGH_CREATEFAIL)		\
+	FN(NEIGH_FAILED)		\
+	FN(NEIGH_QUEUEFULL)		\
+	FN(NEIGH_DEAD)			\
+	FN(TC_EGRESS)			\
+	FN(QDISC_DROP)			\
+	FN(CPU_BACKLOG)			\
+	FN(XDP)				\
+	FN(TC_INGRESS)			\
+	FN(UNHANDLED_PROTO)		\
+	FN(SKB_CSUM)			\
+	FN(SKB_GSO_SEG)			\
+	FN(SKB_UCOPY_FAULT)		\
+	FN(DEV_HDR)			\
+	FN(DEV_READY)			\
+	FN(FULL_RING)			\
+	FN(NOMEM)			\
+	FN(HDR_TRUNC)			\
+	FN(TAP_FILTER)			\
+	FN(TAP_TXFILTER)		\
+	FN(ICMP_CSUM)			\
+	FN(INVALID_PROTO)		\
+	FN(IP_INADDRERRORS)		\
+	FN(IP_INNOROUTES)		\
+	FN(PKT_TOO_BIG)			\
+	FN(DUP_FRAG)			\
+	FN(FRAG_REASM_TIMEOUT)		\
+	FN(FRAG_TOO_FAR)		\
+	FN(TCP_MINTTL)			\
+	FN(IPV6_BAD_EXTHDR)		\
+	FN(IPV6_NDISC_FRAG)		\
+	FN(IPV6_NDISC_HOP_LIMIT)	\
+	FN(IPV6_NDISC_BAD_CODE)		\
+	FN(IPV6_NDISC_BAD_OPTIONS)	\
+	FN(IPV6_NDISC_NS_OTHERHOST)	\
+	FNe(MAX)
+
+/**
+ * enum skb_drop_reason - the reasons of skb drops
+ *
+ * The reason of skb drop, which is used in kfree_skb_reason().
+ */
+enum skb_drop_reason {
+	/**
+	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
+	 */
+	SKB_NOT_DROPPED_YET = 0,
+	/** @SKB_CONSUMED: packet has been consumed */
+	SKB_CONSUMED,
+	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
+	SKB_DROP_REASON_NOT_SPECIFIED,
+	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
+	SKB_DROP_REASON_NO_SOCKET,
+	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
+	SKB_DROP_REASON_PKT_TOO_SMALL,
+	/** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */
+	SKB_DROP_REASON_TCP_CSUM,
+	/** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */
+	SKB_DROP_REASON_SOCKET_FILTER,
+	/** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */
+	SKB_DROP_REASON_UDP_CSUM,
+	/** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */
+	SKB_DROP_REASON_NETFILTER_DROP,
+	/**
+	 * @SKB_DROP_REASON_OTHERHOST: packet don't belong to current host
+	 * (interface is in promisc mode)
+	 */
+	SKB_DROP_REASON_OTHERHOST,
+	/** @SKB_DROP_REASON_IP_CSUM: IP checksum error */
+	SKB_DROP_REASON_IP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_IP_INHDR: there is something wrong with IP header (see
+	 * IPSTATS_MIB_INHDRERRORS)
+	 */
+	SKB_DROP_REASON_IP_INHDR,
+	/**
+	 * @SKB_DROP_REASON_IP_RPFILTER: IP rpfilter validate failed. see the
+	 * document for rp_filter in ip-sysctl.rst for more information
+	 */
+	SKB_DROP_REASON_IP_RPFILTER,
+	/**
+	 * @SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST: destination address of L2 is
+	 * multicast, but L3 is unicast.
+	 */
+	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,
+	/** @SKB_DROP_REASON_XFRM_POLICY: xfrm policy check failed */
+	SKB_DROP_REASON_XFRM_POLICY,
+	/** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */
+	SKB_DROP_REASON_IP_NOPROTO,
+	/** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */
+	SKB_DROP_REASON_SOCKET_RCVBUFF,
+	/**
+	 * @SKB_DROP_REASON_PROTO_MEM: proto memory limition, such as udp packet
+	 * drop out of udp_memory_allocated.
+	 */
+	SKB_DROP_REASON_PROTO_MEM,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected,
+	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
+	 */
+	SKB_DROP_REASON_TCP_MD5NOTFOUND,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5UNEXPECTED: MD5 hash and we're not expecting
+	 * one, corresponding to LINUX_MIB_TCPMD5UNEXPECTED
+	 */
+	SKB_DROP_REASON_TCP_MD5UNEXPECTED,
+	/**
+	 * @SKB_DROP_REASON_TCP_MD5FAILURE: MD5 hash and its wrong, corresponding
+	 * to LINUX_MIB_TCPMD5FAILURE
+	 */
+	SKB_DROP_REASON_TCP_MD5FAILURE,
+	/**
+	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
+	 * see LINUX_MIB_TCPBACKLOGDROP)
+	 */
+	SKB_DROP_REASON_SOCKET_BACKLOG,
+	/** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */
+	SKB_DROP_REASON_TCP_FLAGS,
+	/**
+	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
+	 * see LINUX_MIB_TCPZEROWINDOWDROP
+	 */
+	SKB_DROP_REASON_TCP_ZEROWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data reveived is already
+	 * received before (spurious retrans may happened), see
+	 * LINUX_MIB_DELAYEDACKLOST
+	 */
+	SKB_DROP_REASON_TCP_OLD_DATA,
+	/**
+	 * @SKB_DROP_REASON_TCP_OVERWINDOW: the TCP data is out of window,
+	 * the seq of the first byte exceed the right edges of receive
+	 * window
+	 */
+	SKB_DROP_REASON_TCP_OVERWINDOW,
+	/**
+	 * @SKB_DROP_REASON_TCP_OFOMERGE: the data of skb is already in the ofo
+	 * queue, corresponding to LINUX_MIB_TCPOFOMERGE
+	 */
+	SKB_DROP_REASON_TCP_OFOMERGE,
+	/**
+	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
+	 * LINUX_MIB_PAWSESTABREJECTED
+	 */
+	SKB_DROP_REASON_TCP_RFC7323_PAWS,
+	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
+	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
+	/** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */
+	SKB_DROP_REASON_TCP_RESET,
+	/**
+	 * @SKB_DROP_REASON_TCP_INVALID_SYN: Incoming packet has unexpected
+	 * SYN flag
+	 */
+	SKB_DROP_REASON_TCP_INVALID_SYN,
+	/** @SKB_DROP_REASON_TCP_CLOSE: TCP socket in CLOSE state */
+	SKB_DROP_REASON_TCP_CLOSE,
+	/** @SKB_DROP_REASON_TCP_FASTOPEN: dropped by FASTOPEN request socket */
+	SKB_DROP_REASON_TCP_FASTOPEN,
+	/** @SKB_DROP_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
+	SKB_DROP_REASON_TCP_OLD_ACK,
+	/** @SKB_DROP_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
+	SKB_DROP_REASON_TCP_TOO_OLD_ACK,
+	/**
+	 * @SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
+	 * sent yet
+	 */
+	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA,
+	/** @SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE: pruned from TCP OFO queue */
+	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE,
+	/** @SKB_DROP_REASON_TCP_OFO_DROP: data already in receive queue */
+	SKB_DROP_REASON_TCP_OFO_DROP,
+	/** @SKB_DROP_REASON_IP_OUTNOROUTES: route lookup failed */
+	SKB_DROP_REASON_IP_OUTNOROUTES,
+	/**
+	 * @SKB_DROP_REASON_BPF_CGROUP_EGRESS: dropped by BPF_PROG_TYPE_CGROUP_SKB
+	 * eBPF program
+	 */
+	SKB_DROP_REASON_BPF_CGROUP_EGRESS,
+	/** @SKB_DROP_REASON_IPV6DISABLED: IPv6 is disabled on the device */
+	SKB_DROP_REASON_IPV6DISABLED,
+	/** @SKB_DROP_REASON_NEIGH_CREATEFAIL: failed to create neigh entry */
+	SKB_DROP_REASON_NEIGH_CREATEFAIL,
+	/** @SKB_DROP_REASON_NEIGH_FAILED: neigh entry in failed state */
+	SKB_DROP_REASON_NEIGH_FAILED,
+	/** @SKB_DROP_REASON_NEIGH_QUEUEFULL: arp_queue for neigh entry is full */
+	SKB_DROP_REASON_NEIGH_QUEUEFULL,
+	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
+	SKB_DROP_REASON_NEIGH_DEAD,
+	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
+	SKB_DROP_REASON_TC_EGRESS,
+	/**
+	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
+	 * failed to enqueue to current qdisc)
+	 */
+	SKB_DROP_REASON_QDISC_DROP,
+	/**
+	 * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU
+	 * backlog queue. This can be caused by backlog queue full (see
+	 * netdev_max_backlog in net.rst) or RPS flow limit
+	 */
+	SKB_DROP_REASON_CPU_BACKLOG,
+	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
+	SKB_DROP_REASON_XDP,
+	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
+	SKB_DROP_REASON_TC_INGRESS,
+	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
+	SKB_DROP_REASON_UNHANDLED_PROTO,
+	/** @SKB_DROP_REASON_SKB_CSUM: sk_buff checksum computation error */
+	SKB_DROP_REASON_SKB_CSUM,
+	/** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */
+	SKB_DROP_REASON_SKB_GSO_SEG,
+	/**
+	 * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
+	 * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
+	 */
+	SKB_DROP_REASON_SKB_UCOPY_FAULT,
+	/** @SKB_DROP_REASON_DEV_HDR: device driver specific header/metadata is invalid */
+	SKB_DROP_REASON_DEV_HDR,
+	/**
+	 * @SKB_DROP_REASON_DEV_READY: the device is not ready to xmit/recv due to
+	 * any of its data structure that is not up/ready/initialized,
+	 * e.g., the IFF_UP is not set, or driver specific tun->tfiles[txq]
+	 * is not initialized
+	 */
+	SKB_DROP_REASON_DEV_READY,
+	/** @SKB_DROP_REASON_FULL_RING: ring buffer is full */
+	SKB_DROP_REASON_FULL_RING,
+	/** @SKB_DROP_REASON_NOMEM: error due to OOM */
+	SKB_DROP_REASON_NOMEM,
+	/**
+	 * @SKB_DROP_REASON_HDR_TRUNC: failed to trunc/extract the header from
+	 * networking data, e.g., failed to pull the protocol header from
+	 * frags via pskb_may_pull()
+	 */
+	SKB_DROP_REASON_HDR_TRUNC,
+	/**
+	 * @SKB_DROP_REASON_TAP_FILTER: dropped by (ebpf) filter directly attached
+	 * to tun/tap, e.g., via TUNSETFILTEREBPF
+	 */
+	SKB_DROP_REASON_TAP_FILTER,
+	/**
+	 * @SKB_DROP_REASON_TAP_TXFILTER: dropped by tx filter implemented at
+	 * tun/tap, e.g., check_filter()
+	 */
+	SKB_DROP_REASON_TAP_TXFILTER,
+	/** @SKB_DROP_REASON_ICMP_CSUM: ICMP checksum error */
+	SKB_DROP_REASON_ICMP_CSUM,
+	/**
+	 * @SKB_DROP_REASON_INVALID_PROTO: the packet doesn't follow RFC 2211,
+	 * such as a broadcasts ICMP_TIMESTAMP
+	 */
+	SKB_DROP_REASON_INVALID_PROTO,
+	/**
+	 * @SKB_DROP_REASON_IP_INADDRERRORS: host unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INADDRERRORS,
+	/**
+	 * @SKB_DROP_REASON_IP_INNOROUTES: network unreachable, corresponding to
+	 * IPSTATS_MIB_INADDRERRORS
+	 */
+	SKB_DROP_REASON_IP_INNOROUTES,
+	/**
+	 * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the
+	 * MTU)
+	 */
+	SKB_DROP_REASON_PKT_TOO_BIG,
+	/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
+	SKB_DROP_REASON_DUP_FRAG,
+	/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
+	SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
+	/**
+	 * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
+	 * (/proc/sys/net/ipv4/ipfrag_max_dist)
+	 */
+	SKB_DROP_REASON_FRAG_TOO_FAR,
+	/**
+	 * @SKB_DROP_REASON_TCP_MINTTL: ipv4 ttl or ipv6 hoplimit below
+	 * the threshold (IP_MINTTL or IPV6_MINHOPCOUNT).
+	 */
+	SKB_DROP_REASON_TCP_MINTTL,
+	/** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */
+	SKB_DROP_REASON_IPV6_BAD_EXTHDR,
+	/** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */
+	SKB_DROP_REASON_IPV6_NDISC_FRAG,
+	/** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */
+	SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT,
+	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */
+	SKB_DROP_REASON_IPV6_NDISC_BAD_CODE,
+	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */
+	SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS,
+	/**
+	 * @SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST: NEIGHBOUR SOLICITATION
+	 * for another host.
+	 */
+	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
+	/**
+	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
+	 * used as a real 'reason'
+	 */
+	SKB_DROP_REASON_MAX,
+};
+
+#define SKB_DR_INIT(name, reason)				\
+	enum skb_drop_reason name = SKB_DROP_REASON_##reason
+#define SKB_DR(name)						\
+	SKB_DR_INIT(name, NOT_SPECIFIED)
+#define SKB_DR_SET(name, reason)				\
+	(name = SKB_DROP_REASON_##reason)
+#define SKB_DR_OR(name, reason)					\
+	do {							\
+		if (name == SKB_DROP_REASON_NOT_SPECIFIED ||	\
+		    name == SKB_NOT_DROPPED_YET)		\
+			SKB_DR_SET(name, reason);		\
+	} while (0)
+
+extern const char * const drop_reasons[];
+
+#endif
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
deleted file mode 100644
index c0a3ea806cd5..000000000000
--- a/include/net/dropreason.h
+++ /dev/null
@@ -1,363 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-
-#ifndef _LINUX_DROPREASON_H
-#define _LINUX_DROPREASON_H
-
-#define DEFINE_DROP_REASON(FN, FNe)	\
-	FN(NOT_SPECIFIED)		\
-	FN(NO_SOCKET)			\
-	FN(PKT_TOO_SMALL)		\
-	FN(TCP_CSUM)			\
-	FN(SOCKET_FILTER)		\
-	FN(UDP_CSUM)			\
-	FN(NETFILTER_DROP)		\
-	FN(OTHERHOST)			\
-	FN(IP_CSUM)			\
-	FN(IP_INHDR)			\
-	FN(IP_RPFILTER)			\
-	FN(UNICAST_IN_L2_MULTICAST)	\
-	FN(XFRM_POLICY)			\
-	FN(IP_NOPROTO)			\
-	FN(SOCKET_RCVBUFF)		\
-	FN(PROTO_MEM)			\
-	FN(TCP_MD5NOTFOUND)		\
-	FN(TCP_MD5UNEXPECTED)		\
-	FN(TCP_MD5FAILURE)		\
-	FN(SOCKET_BACKLOG)		\
-	FN(TCP_FLAGS)			\
-	FN(TCP_ZEROWINDOW)		\
-	FN(TCP_OLD_DATA)		\
-	FN(TCP_OVERWINDOW)		\
-	FN(TCP_OFOMERGE)		\
-	FN(TCP_RFC7323_PAWS)		\
-	FN(TCP_INVALID_SEQUENCE)	\
-	FN(TCP_RESET)			\
-	FN(TCP_INVALID_SYN)		\
-	FN(TCP_CLOSE)			\
-	FN(TCP_FASTOPEN)		\
-	FN(TCP_OLD_ACK)			\
-	FN(TCP_TOO_OLD_ACK)		\
-	FN(TCP_ACK_UNSENT_DATA)		\
-	FN(TCP_OFO_QUEUE_PRUNE)		\
-	FN(TCP_OFO_DROP)		\
-	FN(IP_OUTNOROUTES)		\
-	FN(BPF_CGROUP_EGRESS)		\
-	FN(IPV6DISABLED)		\
-	FN(NEIGH_CREATEFAIL)		\
-	FN(NEIGH_FAILED)		\
-	FN(NEIGH_QUEUEFULL)		\
-	FN(NEIGH_DEAD)			\
-	FN(TC_EGRESS)			\
-	FN(QDISC_DROP)			\
-	FN(CPU_BACKLOG)			\
-	FN(XDP)				\
-	FN(TC_INGRESS)			\
-	FN(UNHANDLED_PROTO)		\
-	FN(SKB_CSUM)			\
-	FN(SKB_GSO_SEG)			\
-	FN(SKB_UCOPY_FAULT)		\
-	FN(DEV_HDR)			\
-	FN(DEV_READY)			\
-	FN(FULL_RING)			\
-	FN(NOMEM)			\
-	FN(HDR_TRUNC)			\
-	FN(TAP_FILTER)			\
-	FN(TAP_TXFILTER)		\
-	FN(ICMP_CSUM)			\
-	FN(INVALID_PROTO)		\
-	FN(IP_INADDRERRORS)		\
-	FN(IP_INNOROUTES)		\
-	FN(PKT_TOO_BIG)			\
-	FN(DUP_FRAG)			\
-	FN(FRAG_REASM_TIMEOUT)		\
-	FN(FRAG_TOO_FAR)		\
-	FN(TCP_MINTTL)			\
-	FN(IPV6_BAD_EXTHDR)		\
-	FN(IPV6_NDISC_FRAG)		\
-	FN(IPV6_NDISC_HOP_LIMIT)	\
-	FN(IPV6_NDISC_BAD_CODE)		\
-	FN(IPV6_NDISC_BAD_OPTIONS)	\
-	FN(IPV6_NDISC_NS_OTHERHOST)	\
-	FNe(MAX)
-
-/**
- * enum skb_drop_reason - the reasons of skb drops
- *
- * The reason of skb drop, which is used in kfree_skb_reason().
- */
-enum skb_drop_reason {
-	/**
-	 * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case)
-	 */
-	SKB_NOT_DROPPED_YET = 0,
-	/** @SKB_CONSUMED: packet has been consumed */
-	SKB_CONSUMED,
-	/** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */
-	SKB_DROP_REASON_NOT_SPECIFIED,
-	/** @SKB_DROP_REASON_NO_SOCKET: socket not found */
-	SKB_DROP_REASON_NO_SOCKET,
-	/** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */
-	SKB_DROP_REASON_PKT_TOO_SMALL,
-	/** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */
-	SKB_DROP_REASON_TCP_CSUM,
-	/** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */
-	SKB_DROP_REASON_SOCKET_FILTER,
-	/** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */
-	SKB_DROP_REASON_UDP_CSUM,
-	/** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */
-	SKB_DROP_REASON_NETFILTER_DROP,
-	/**
-	 * @SKB_DROP_REASON_OTHERHOST: packet don't belong to current host
-	 * (interface is in promisc mode)
-	 */
-	SKB_DROP_REASON_OTHERHOST,
-	/** @SKB_DROP_REASON_IP_CSUM: IP checksum error */
-	SKB_DROP_REASON_IP_CSUM,
-	/**
-	 * @SKB_DROP_REASON_IP_INHDR: there is something wrong with IP header (see
-	 * IPSTATS_MIB_INHDRERRORS)
-	 */
-	SKB_DROP_REASON_IP_INHDR,
-	/**
-	 * @SKB_DROP_REASON_IP_RPFILTER: IP rpfilter validate failed. see the
-	 * document for rp_filter in ip-sysctl.rst for more information
-	 */
-	SKB_DROP_REASON_IP_RPFILTER,
-	/**
-	 * @SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST: destination address of L2 is
-	 * multicast, but L3 is unicast.
-	 */
-	SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST,
-	/** @SKB_DROP_REASON_XFRM_POLICY: xfrm policy check failed */
-	SKB_DROP_REASON_XFRM_POLICY,
-	/** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */
-	SKB_DROP_REASON_IP_NOPROTO,
-	/** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */
-	SKB_DROP_REASON_SOCKET_RCVBUFF,
-	/**
-	 * @SKB_DROP_REASON_PROTO_MEM: proto memory limition, such as udp packet
-	 * drop out of udp_memory_allocated.
-	 */
-	SKB_DROP_REASON_PROTO_MEM,
-	/**
-	 * @SKB_DROP_REASON_TCP_MD5NOTFOUND: no MD5 hash and one expected,
-	 * corresponding to LINUX_MIB_TCPMD5NOTFOUND
-	 */
-	SKB_DROP_REASON_TCP_MD5NOTFOUND,
-	/**
-	 * @SKB_DROP_REASON_TCP_MD5UNEXPECTED: MD5 hash and we're not expecting
-	 * one, corresponding to LINUX_MIB_TCPMD5UNEXPECTED
-	 */
-	SKB_DROP_REASON_TCP_MD5UNEXPECTED,
-	/**
-	 * @SKB_DROP_REASON_TCP_MD5FAILURE: MD5 hash and its wrong, corresponding
-	 * to LINUX_MIB_TCPMD5FAILURE
-	 */
-	SKB_DROP_REASON_TCP_MD5FAILURE,
-	/**
-	 * @SKB_DROP_REASON_SOCKET_BACKLOG: failed to add skb to socket backlog (
-	 * see LINUX_MIB_TCPBACKLOGDROP)
-	 */
-	SKB_DROP_REASON_SOCKET_BACKLOG,
-	/** @SKB_DROP_REASON_TCP_FLAGS: TCP flags invalid */
-	SKB_DROP_REASON_TCP_FLAGS,
-	/**
-	 * @SKB_DROP_REASON_TCP_ZEROWINDOW: TCP receive window size is zero,
-	 * see LINUX_MIB_TCPZEROWINDOWDROP
-	 */
-	SKB_DROP_REASON_TCP_ZEROWINDOW,
-	/**
-	 * @SKB_DROP_REASON_TCP_OLD_DATA: the TCP data reveived is already
-	 * received before (spurious retrans may happened), see
-	 * LINUX_MIB_DELAYEDACKLOST
-	 */
-	SKB_DROP_REASON_TCP_OLD_DATA,
-	/**
-	 * @SKB_DROP_REASON_TCP_OVERWINDOW: the TCP data is out of window,
-	 * the seq of the first byte exceed the right edges of receive
-	 * window
-	 */
-	SKB_DROP_REASON_TCP_OVERWINDOW,
-	/**
-	 * @SKB_DROP_REASON_TCP_OFOMERGE: the data of skb is already in the ofo
-	 * queue, corresponding to LINUX_MIB_TCPOFOMERGE
-	 */
-	SKB_DROP_REASON_TCP_OFOMERGE,
-	/**
-	 * @SKB_DROP_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
-	 * LINUX_MIB_PAWSESTABREJECTED
-	 */
-	SKB_DROP_REASON_TCP_RFC7323_PAWS,
-	/** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
-	SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
-	/** @SKB_DROP_REASON_TCP_RESET: Invalid RST packet */
-	SKB_DROP_REASON_TCP_RESET,
-	/**
-	 * @SKB_DROP_REASON_TCP_INVALID_SYN: Incoming packet has unexpected
-	 * SYN flag
-	 */
-	SKB_DROP_REASON_TCP_INVALID_SYN,
-	/** @SKB_DROP_REASON_TCP_CLOSE: TCP socket in CLOSE state */
-	SKB_DROP_REASON_TCP_CLOSE,
-	/** @SKB_DROP_REASON_TCP_FASTOPEN: dropped by FASTOPEN request socket */
-	SKB_DROP_REASON_TCP_FASTOPEN,
-	/** @SKB_DROP_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
-	SKB_DROP_REASON_TCP_OLD_ACK,
-	/** @SKB_DROP_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
-	SKB_DROP_REASON_TCP_TOO_OLD_ACK,
-	/**
-	 * @SKB_DROP_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
-	 * sent yet
-	 */
-	SKB_DROP_REASON_TCP_ACK_UNSENT_DATA,
-	/** @SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE: pruned from TCP OFO queue */
-	SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE,
-	/** @SKB_DROP_REASON_TCP_OFO_DROP: data already in receive queue */
-	SKB_DROP_REASON_TCP_OFO_DROP,
-	/** @SKB_DROP_REASON_IP_OUTNOROUTES: route lookup failed */
-	SKB_DROP_REASON_IP_OUTNOROUTES,
-	/**
-	 * @SKB_DROP_REASON_BPF_CGROUP_EGRESS: dropped by BPF_PROG_TYPE_CGROUP_SKB
-	 * eBPF program
-	 */
-	SKB_DROP_REASON_BPF_CGROUP_EGRESS,
-	/** @SKB_DROP_REASON_IPV6DISABLED: IPv6 is disabled on the device */
-	SKB_DROP_REASON_IPV6DISABLED,
-	/** @SKB_DROP_REASON_NEIGH_CREATEFAIL: failed to create neigh entry */
-	SKB_DROP_REASON_NEIGH_CREATEFAIL,
-	/** @SKB_DROP_REASON_NEIGH_FAILED: neigh entry in failed state */
-	SKB_DROP_REASON_NEIGH_FAILED,
-	/** @SKB_DROP_REASON_NEIGH_QUEUEFULL: arp_queue for neigh entry is full */
-	SKB_DROP_REASON_NEIGH_QUEUEFULL,
-	/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
-	SKB_DROP_REASON_NEIGH_DEAD,
-	/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
-	SKB_DROP_REASON_TC_EGRESS,
-	/**
-	 * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
-	 * failed to enqueue to current qdisc)
-	 */
-	SKB_DROP_REASON_QDISC_DROP,
-	/**
-	 * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU
-	 * backlog queue. This can be caused by backlog queue full (see
-	 * netdev_max_backlog in net.rst) or RPS flow limit
-	 */
-	SKB_DROP_REASON_CPU_BACKLOG,
-	/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
-	SKB_DROP_REASON_XDP,
-	/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
-	SKB_DROP_REASON_TC_INGRESS,
-	/** @SKB_DROP_REASON_UNHANDLED_PROTO: protocol not implemented or not supported */
-	SKB_DROP_REASON_UNHANDLED_PROTO,
-	/** @SKB_DROP_REASON_SKB_CSUM: sk_buff checksum computation error */
-	SKB_DROP_REASON_SKB_CSUM,
-	/** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */
-	SKB_DROP_REASON_SKB_GSO_SEG,
-	/**
-	 * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
-	 * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
-	 */
-	SKB_DROP_REASON_SKB_UCOPY_FAULT,
-	/** @SKB_DROP_REASON_DEV_HDR: device driver specific header/metadata is invalid */
-	SKB_DROP_REASON_DEV_HDR,
-	/**
-	 * @SKB_DROP_REASON_DEV_READY: the device is not ready to xmit/recv due to
-	 * any of its data structure that is not up/ready/initialized,
-	 * e.g., the IFF_UP is not set, or driver specific tun->tfiles[txq]
-	 * is not initialized
-	 */
-	SKB_DROP_REASON_DEV_READY,
-	/** @SKB_DROP_REASON_FULL_RING: ring buffer is full */
-	SKB_DROP_REASON_FULL_RING,
-	/** @SKB_DROP_REASON_NOMEM: error due to OOM */
-	SKB_DROP_REASON_NOMEM,
-	/**
-	 * @SKB_DROP_REASON_HDR_TRUNC: failed to trunc/extract the header from
-	 * networking data, e.g., failed to pull the protocol header from
-	 * frags via pskb_may_pull()
-	 */
-	SKB_DROP_REASON_HDR_TRUNC,
-	/**
-	 * @SKB_DROP_REASON_TAP_FILTER: dropped by (ebpf) filter directly attached
-	 * to tun/tap, e.g., via TUNSETFILTEREBPF
-	 */
-	SKB_DROP_REASON_TAP_FILTER,
-	/**
-	 * @SKB_DROP_REASON_TAP_TXFILTER: dropped by tx filter implemented at
-	 * tun/tap, e.g., check_filter()
-	 */
-	SKB_DROP_REASON_TAP_TXFILTER,
-	/** @SKB_DROP_REASON_ICMP_CSUM: ICMP checksum error */
-	SKB_DROP_REASON_ICMP_CSUM,
-	/**
-	 * @SKB_DROP_REASON_INVALID_PROTO: the packet doesn't follow RFC 2211,
-	 * such as a broadcasts ICMP_TIMESTAMP
-	 */
-	SKB_DROP_REASON_INVALID_PROTO,
-	/**
-	 * @SKB_DROP_REASON_IP_INADDRERRORS: host unreachable, corresponding to
-	 * IPSTATS_MIB_INADDRERRORS
-	 */
-	SKB_DROP_REASON_IP_INADDRERRORS,
-	/**
-	 * @SKB_DROP_REASON_IP_INNOROUTES: network unreachable, corresponding to
-	 * IPSTATS_MIB_INADDRERRORS
-	 */
-	SKB_DROP_REASON_IP_INNOROUTES,
-	/**
-	 * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the
-	 * MTU)
-	 */
-	SKB_DROP_REASON_PKT_TOO_BIG,
-	/** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */
-	SKB_DROP_REASON_DUP_FRAG,
-	/** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */
-	SKB_DROP_REASON_FRAG_REASM_TIMEOUT,
-	/**
-	 * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far.
-	 * (/proc/sys/net/ipv4/ipfrag_max_dist)
-	 */
-	SKB_DROP_REASON_FRAG_TOO_FAR,
-	/**
-	 * @SKB_DROP_REASON_TCP_MINTTL: ipv4 ttl or ipv6 hoplimit below
-	 * the threshold (IP_MINTTL or IPV6_MINHOPCOUNT).
-	 */
-	SKB_DROP_REASON_TCP_MINTTL,
-	/** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */
-	SKB_DROP_REASON_IPV6_BAD_EXTHDR,
-	/** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */
-	SKB_DROP_REASON_IPV6_NDISC_FRAG,
-	/** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */
-	SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT,
-	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_CODE: invalid NDISC icmp6 code. */
-	SKB_DROP_REASON_IPV6_NDISC_BAD_CODE,
-	/** @SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS: invalid NDISC options. */
-	SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS,
-	/** @SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST: NEIGHBOUR SOLICITATION
-	 * for another host.
-	 */
-	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
-	/**
-	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
-	 * used as a real 'reason'
-	 */
-	SKB_DROP_REASON_MAX,
-};
-
-#define SKB_DR_INIT(name, reason)				\
-	enum skb_drop_reason name = SKB_DROP_REASON_##reason
-#define SKB_DR(name)						\
-	SKB_DR_INIT(name, NOT_SPECIFIED)
-#define SKB_DR_SET(name, reason)				\
-	(name = SKB_DROP_REASON_##reason)
-#define SKB_DR_OR(name, reason)					\
-	do {							\
-		if (name == SKB_DROP_REASON_NOT_SPECIFIED ||	\
-		    name == SKB_NOT_DROPPED_YET)		\
-			SKB_DR_SET(name, reason);		\
-	} while (0)
-
-extern const char * const drop_reasons[];
-
-#endif
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index b23ddec3cd5c..325ad893f624 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -7,7 +7,7 @@
 #include <linux/in6.h>
 #include <linux/rbtree_types.h>
 #include <linux/refcount.h>
-#include <net/dropreason.h>
+#include <net/dropreason-core.h>
 
 /* Per netns frag queues directory */
 struct fqdir {
-- 
cgit v1.2.3


From 071c0fc6fb919dcf29c676a842dda08a674877d7 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Apr 2023 14:52:53 +0200
Subject: net: extend drop reasons for multiple subsystems

Extend drop reasons to make them usable by subsystems
other than core by reserving the high 16 bits for a
new subsystem ID, of which 0 of course is used for the
existing reasons immediately.

To still be able to have string reasons, restructure
that code a bit to do the lookup under RCU. The only
user of this (right now) is drop_monitor.

Link: https://lore.kernel.org/netdev/00659771ed54353f92027702c5bbb84702da62ce.camel@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason-core.h | 14 +++++++---
 include/net/dropreason.h      | 31 +++++++++++++++++++++++
 net/core/drop_monitor.c       | 33 +++++++++++++++++-------
 net/core/skbuff.c             | 59 ++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 include/net/dropreason.h

(limited to 'include')

diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index ade6d5b9186c..a2b953b57689 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -340,12 +340,20 @@ enum skb_drop_reason {
 	 */
 	SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST,
 	/**
-	 * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be
-	 * used as a real 'reason'
+	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
+	 * shouldn't be used as a real 'reason' - only for tracing code gen
 	 */
 	SKB_DROP_REASON_MAX,
+
+	/**
+	 * @SKB_DROP_REASON_SUBSYS_MASK: subsystem mask in drop reasons,
+	 * see &enum skb_drop_reason_subsys
+	 */
+	SKB_DROP_REASON_SUBSYS_MASK = 0xffff0000,
 };
 
+#define SKB_DROP_REASON_SUBSYS_SHIFT	16
+
 #define SKB_DR_INIT(name, reason)				\
 	enum skb_drop_reason name = SKB_DROP_REASON_##reason
 #define SKB_DR(name)						\
@@ -359,6 +367,4 @@ enum skb_drop_reason {
 			SKB_DR_SET(name, reason);		\
 	} while (0)
 
-extern const char * const drop_reasons[];
-
 #endif
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
new file mode 100644
index 000000000000..f0f2378dbed0
--- /dev/null
+++ b/include/net/dropreason.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_DROPREASON_H
+#define _LINUX_DROPREASON_H
+#include <net/dropreason-core.h>
+
+/**
+ * enum skb_drop_reason_subsys - subsystem tag for (extended) drop reasons
+ */
+enum skb_drop_reason_subsys {
+	/** @SKB_DROP_REASON_SUBSYS_CORE: core drop reasons defined above */
+	SKB_DROP_REASON_SUBSYS_CORE,
+
+	/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
+	SKB_DROP_REASON_SUBSYS_NUM
+};
+
+struct drop_reason_list {
+	const char * const *reasons;
+	size_t n_reasons;
+};
+
+/* Note: due to dynamic registrations, access must be under RCU */
+extern const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM];
+
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+				  const struct drop_reason_list *list);
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys);
+
+#endif
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 5a782d1d8fd3..aff31cd944c2 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -21,6 +21,7 @@
 #include <linux/workqueue.h>
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
+#include <linux/bitfield.h>
 #include <linux/percpu.h>
 #include <linux/timer.h>
 #include <linux/bitops.h>
@@ -29,6 +30,7 @@
 #include <net/genetlink.h>
 #include <net/netevent.h>
 #include <net/flow_offload.h>
+#include <net/dropreason.h>
 #include <net/devlink.h>
 
 #include <trace/events/skb.h>
@@ -504,8 +506,6 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
 	if (!nskb)
 		return;
 
-	if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0))
-		reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	cb = NET_DM_SKB_CB(nskb);
 	cb->reason = reason;
 	cb->pc = location;
@@ -552,9 +552,9 @@ static size_t net_dm_in_port_size(void)
 }
 
 #define NET_DM_MAX_SYMBOL_LEN 40
+#define NET_DM_MAX_REASON_LEN 50
 
-static size_t net_dm_packet_report_size(size_t payload_len,
-					enum skb_drop_reason reason)
+static size_t net_dm_packet_report_size(size_t payload_len)
 {
 	size_t size;
 
@@ -576,7 +576,7 @@ static size_t net_dm_packet_report_size(size_t payload_len,
 	       /* NET_DM_ATTR_PROTO */
 	       nla_total_size(sizeof(u16)) +
 	       /* NET_DM_ATTR_REASON */
-	       nla_total_size(strlen(drop_reasons[reason]) + 1) +
+	       nla_total_size(NET_DM_MAX_REASON_LEN + 1) +
 	       /* NET_DM_ATTR_PAYLOAD */
 	       nla_total_size(payload_len);
 }
@@ -610,6 +610,8 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 				     size_t payload_len)
 {
 	struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
+	const struct drop_reason_list *list = NULL;
+	unsigned int subsys, subsys_reason;
 	char buf[NET_DM_MAX_SYMBOL_LEN];
 	struct nlattr *attr;
 	void *hdr;
@@ -627,9 +629,24 @@ static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
 			      NET_DM_ATTR_PAD))
 		goto nla_put_failure;
 
+	rcu_read_lock();
+	subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK);
+	if (subsys < SKB_DROP_REASON_SUBSYS_NUM)
+		list = rcu_dereference(drop_reasons_by_subsys[subsys]);
+	subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK;
+	if (!list ||
+	    subsys_reason >= list->n_reasons ||
+	    !list->reasons[subsys_reason] ||
+	    strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) {
+		list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]);
+		subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	}
 	if (nla_put_string(msg, NET_DM_ATTR_REASON,
-			   drop_reasons[cb->reason]))
+			   list->reasons[subsys_reason])) {
+		rcu_read_unlock();
 		goto nla_put_failure;
+	}
+	rcu_read_unlock();
 
 	snprintf(buf, sizeof(buf), "%pS", cb->pc);
 	if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
@@ -687,9 +704,7 @@ static void net_dm_packet_report(struct sk_buff *skb)
 	if (net_dm_trunc_len)
 		payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
 
-	msg = nlmsg_new(net_dm_packet_report_size(payload_len,
-						  NET_DM_SKB_CB(skb)->reason),
-			GFP_KERNEL);
+	msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
 	if (!msg)
 		goto out;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 8764653bede7..0d998806b377 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -58,6 +58,7 @@
 #include <linux/scatterlist.h>
 #include <linux/errqueue.h>
 #include <linux/prefetch.h>
+#include <linux/bitfield.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
 #include <linux/kcov.h>
@@ -72,6 +73,7 @@
 #include <net/mptcp.h>
 #include <net/mctp.h>
 #include <net/page_pool.h>
+#include <net/dropreason.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -122,11 +124,59 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
 
 #undef FN
 #define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
-const char * const drop_reasons[] = {
+static const char * const drop_reasons[] = {
 	[SKB_CONSUMED] = "CONSUMED",
 	DEFINE_DROP_REASON(FN, FN)
 };
-EXPORT_SYMBOL(drop_reasons);
+
+static const struct drop_reason_list drop_reasons_core = {
+	.reasons = drop_reasons,
+	.n_reasons = ARRAY_SIZE(drop_reasons),
+};
+
+const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
+	[SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
+};
+EXPORT_SYMBOL(drop_reasons_by_subsys);
+
+/**
+ * drop_reasons_register_subsys - register another drop reason subsystem
+ * @subsys: the subsystem to register, must not be the core
+ * @list: the list of drop reasons within the subsystem, must point to
+ *	a statically initialized list
+ */
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+				  const struct drop_reason_list *list)
+{
+	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+		 "invalid subsystem %d\n", subsys))
+		return;
+
+	/* must point to statically allocated memory, so INIT is OK */
+	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
+}
+EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
+
+/**
+ * drop_reasons_unregister_subsys - unregister a drop reason subsystem
+ * @subsys: the subsystem to remove, must not be the core
+ *
+ * Note: This will synchronize_rcu() to ensure no users when it returns.
+ */
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
+{
+	if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+		 subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+		 "invalid subsystem %d\n", subsys))
+		return;
+
+	RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
 
 /**
  *	skb_panic - private function for out-of-line support
@@ -986,7 +1036,10 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
 	if (unlikely(!skb_unref(skb)))
 		return false;
 
-	DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+	DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
+			       u32_get_bits(reason,
+					    SKB_DROP_REASON_SUBSYS_MASK) >=
+				SKB_DROP_REASON_SUBSYS_NUM);
 
 	if (reason == SKB_CONSUMED)
 		trace_consume_skb(skb, __builtin_return_address(0));
-- 
cgit v1.2.3


From baa951a1c1771810f3a378a95fc93e81953027d5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 19 Apr 2023 14:52:54 +0200
Subject: mac80211: use the new drop reasons infrastructure

It can be really hard to analyse or debug why packets are
going missing in mac80211, so add the needed infrastructure
to use the new per-subsystem drop reasons.

We actually use two drop reason subsystems here because of
the different handling of frames that are dropped but still
go to monitor for old versions of hostapd, and those that
are just completely unusable (e.g. crypto failed.)

Annotate a few reasons here just to illustrate this, we'll
need to go through and annotate more of them later.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dropreason.h   | 12 ++++++++++
 net/mac80211/drop.h        | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/ieee80211_i.h |  8 +------
 net/mac80211/main.c        | 31 +++++++++++++++++++++++++
 net/mac80211/rx.c          | 55 +++++++++++++++++++++------------------------
 net/mac80211/wpa.c         | 24 ++++++++++----------
 6 files changed, 138 insertions(+), 48 deletions(-)
 create mode 100644 net/mac80211/drop.h

(limited to 'include')

diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index f0f2378dbed0..685fb37df8e8 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -11,6 +11,18 @@ enum skb_drop_reason_subsys {
 	/** @SKB_DROP_REASON_SUBSYS_CORE: core drop reasons defined above */
 	SKB_DROP_REASON_SUBSYS_CORE,
 
+	/**
+	 * @SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE: mac80211 drop reasons
+	 * for unusable frames, see net/mac80211/drop.h
+	 */
+	SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE,
+
+	/**
+	 * @SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR: mac80211 drop reasons
+	 * for frames still going to monitor, see net/mac80211/drop.h
+	 */
+	SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR,
+
 	/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
 	SKB_DROP_REASON_SUBSYS_NUM
 };
diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h
new file mode 100644
index 000000000000..49dc809cab29
--- /dev/null
+++ b/net/mac80211/drop.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mac80211 drop reason list
+ *
+ * Copyright (C) 2023 Intel Corporation
+ */
+
+#ifndef MAC80211_DROP_H
+#define MAC80211_DROP_H
+#include <net/dropreason.h>
+
+typedef unsigned int __bitwise ieee80211_rx_result;
+
+#define MAC80211_DROP_REASONS_MONITOR(R)	\
+	R(RX_DROP_M_UNEXPECTED_4ADDR_FRAME)	\
+	R(RX_DROP_M_BAD_BCN_KEYIDX)		\
+	R(RX_DROP_M_BAD_MGMT_KEYIDX)		\
+/* this line for the trailing \ - add before this */
+
+#define MAC80211_DROP_REASONS_UNUSABLE(R)	\
+	R(RX_DROP_U_MIC_FAIL)			\
+	R(RX_DROP_U_REPLAY)			\
+	R(RX_DROP_U_BAD_MMIE)			\
+/* this line for the trailing \ - add before this */
+
+/* having two enums allows for checking ieee80211_rx_result use with sparse */
+enum ___mac80211_drop_reason {
+/* if we get to the end of handlers with RX_CONTINUE this will be the reason */
+	___RX_CONTINUE	= SKB_CONSUMED,
+
+/* this never gets used as an argument to kfree_skb_reason() */
+	___RX_QUEUED	= SKB_NOT_DROPPED_YET,
+
+#define ENUM(x) ___ ## x,
+	___RX_DROP_MONITOR = SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR <<
+		SKB_DROP_REASON_SUBSYS_SHIFT,
+	MAC80211_DROP_REASONS_MONITOR(ENUM)
+
+	___RX_DROP_UNUSABLE = SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE <<
+		SKB_DROP_REASON_SUBSYS_SHIFT,
+	MAC80211_DROP_REASONS_UNUSABLE(ENUM)
+#undef ENUM
+};
+
+enum mac80211_drop_reason {
+	RX_CONTINUE	 = (__force ieee80211_rx_result)___RX_CONTINUE,
+	RX_QUEUED	 = (__force ieee80211_rx_result)___RX_QUEUED,
+	RX_DROP_MONITOR	 = (__force ieee80211_rx_result)___RX_DROP_MONITOR,
+	RX_DROP_UNUSABLE = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE,
+#define DEF(x) x = (__force ieee80211_rx_result)___ ## x,
+	MAC80211_DROP_REASONS_MONITOR(DEF)
+	MAC80211_DROP_REASONS_UNUSABLE(DEF)
+#undef DEF
+};
+
+#endif /* MAC80211_DROP_H */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9b7e184430b8..a0a7839cb961 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -33,6 +33,7 @@
 #include "key.h"
 #include "sta_info.h"
 #include "debug.h"
+#include "drop.h"
 
 extern const struct cfg80211_ops mac80211_config_ops;
 
@@ -170,13 +171,6 @@ struct ieee80211_tx_data {
 	unsigned int flags;
 };
 
-
-typedef unsigned __bitwise ieee80211_rx_result;
-#define RX_CONTINUE		((__force ieee80211_rx_result) 0u)
-#define RX_DROP_UNUSABLE	((__force ieee80211_rx_result) 1u)
-#define RX_DROP_MONITOR		((__force ieee80211_rx_result) 2u)
-#define RX_QUEUED		((__force ieee80211_rx_result) 3u)
-
 /**
  * enum ieee80211_packet_rx_flags - packet RX flags
  * @IEEE80211_RX_AMSDU: a-MSDU packet
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ddf2b7811c55..55cdfaef0f5d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -22,6 +22,7 @@
 #include <linux/bitmap.h>
 #include <linux/inetdevice.h>
 #include <net/net_namespace.h>
+#include <net/dropreason.h>
 #include <net/cfg80211.h>
 #include <net/addrconf.h>
 
@@ -1542,6 +1543,28 @@ void ieee80211_free_hw(struct ieee80211_hw *hw)
 }
 EXPORT_SYMBOL(ieee80211_free_hw);
 
+static const char * const drop_reasons_monitor[] = {
+#define V(x)	#x,
+	[0] = "RX_DROP_MONITOR",
+	MAC80211_DROP_REASONS_MONITOR(V)
+};
+
+static struct drop_reason_list drop_reason_list_monitor = {
+	.reasons = drop_reasons_monitor,
+	.n_reasons = ARRAY_SIZE(drop_reasons_monitor),
+};
+
+static const char * const drop_reasons_unusable[] = {
+	[0] = "RX_DROP_UNUSABLE",
+	MAC80211_DROP_REASONS_UNUSABLE(V)
+#undef V
+};
+
+static struct drop_reason_list drop_reason_list_unusable = {
+	.reasons = drop_reasons_unusable,
+	.n_reasons = ARRAY_SIZE(drop_reasons_unusable),
+};
+
 static int __init ieee80211_init(void)
 {
 	struct sk_buff *skb;
@@ -1559,6 +1582,11 @@ static int __init ieee80211_init(void)
 	if (ret)
 		goto err_netdev;
 
+	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR,
+				     &drop_reason_list_monitor);
+	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE,
+				     &drop_reason_list_unusable);
+
 	return 0;
  err_netdev:
 	rc80211_minstrel_exit();
@@ -1574,6 +1602,9 @@ static void __exit ieee80211_exit(void)
 
 	ieee80211_iface_exit();
 
+	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR);
+	drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE);
+
 	rcu_barrier();
 }
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index db3451f5f2fb..58222c077898 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1826,7 +1826,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
 				cfg80211_rx_unexpected_4addr_frame(
 					rx->sdata->dev, sta->sta.addr,
 					GFP_ATOMIC);
-			return RX_DROP_MONITOR;
+			return RX_DROP_M_UNEXPECTED_4ADDR_FRAME;
 		}
 		/*
 		 * Update counter and free packet here to avoid
@@ -1961,7 +1961,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 				cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev,
 							     skb->data,
 							     skb->len);
-			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+			return RX_DROP_M_BAD_BCN_KEYIDX;
 		}
 
 		rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx);
@@ -1975,7 +1975,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
 
 		if (mmie_keyidx < NUM_DEFAULT_KEYS ||
 		    mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
-			return RX_DROP_MONITOR; /* unexpected BIP keyidx */
+			return RX_DROP_M_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */
 		if (rx->link_sta) {
 			if (ieee80211_is_group_privacy_action(skb) &&
 			    test_sta_flag(rx->sta, WLAN_STA_MFP))
@@ -3960,7 +3960,8 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
 }
 
 static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
-					struct ieee80211_rate *rate)
+					struct ieee80211_rate *rate,
+					ieee80211_rx_result reason)
 {
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_local *local = rx->local;
@@ -4024,42 +4025,38 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
 	}
 
  out_free_skb:
-	dev_kfree_skb(skb);
+	kfree_skb_reason(skb, (__force u32)reason);
 }
 
 static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
 					 ieee80211_rx_result res)
 {
-	switch (res) {
-	case RX_DROP_MONITOR:
-		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
-		if (rx->sta)
-			rx->link_sta->rx_stats.dropped++;
-		fallthrough;
-	case RX_CONTINUE: {
-		struct ieee80211_rate *rate = NULL;
-		struct ieee80211_supported_band *sband;
-		struct ieee80211_rx_status *status;
-
-		status = IEEE80211_SKB_RXCB((rx->skb));
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+	struct ieee80211_supported_band *sband;
+	struct ieee80211_rate *rate = NULL;
 
-		sband = rx->local->hw.wiphy->bands[status->band];
-		if (status->encoding == RX_ENC_LEGACY)
-			rate = &sband->bitrates[status->rate_idx];
+	if (res == RX_QUEUED) {
+		I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
+		return;
+	}
 
-		ieee80211_rx_cooked_monitor(rx, rate);
-		break;
-		}
-	case RX_DROP_UNUSABLE:
+	if (res != RX_CONTINUE) {
 		I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop);
 		if (rx->sta)
 			rx->link_sta->rx_stats.dropped++;
-		dev_kfree_skb(rx->skb);
-		break;
-	case RX_QUEUED:
-		I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued);
-		break;
 	}
+
+	if (u32_get_bits((__force u32)res, SKB_DROP_REASON_SUBSYS_MASK) ==
+			SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE) {
+		kfree_skb_reason(rx->skb, (__force u32)res);
+		return;
+	}
+
+	sband = rx->local->hw.wiphy->bands[status->band];
+	if (status->encoding == RX_ENC_LEGACY)
+		rate = &sband->bitrates[status->rate_idx];
+
+	ieee80211_rx_cooked_monitor(rx, rate, res);
 }
 
 static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 20f742b5503b..4133496da378 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -550,7 +550,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 		if (res < 0 ||
 		    (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
 			key->u.ccmp.replays++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_REPLAY;
 		}
 
 		if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -564,7 +564,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
 				    skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN,
 				    data_len,
 				    skb->data + skb->len - mic_len))
-				return RX_DROP_UNUSABLE;
+				return RX_DROP_U_MIC_FAIL;
 		}
 
 		memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN);
@@ -746,7 +746,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 		if (res < 0 ||
 		    (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) {
 			key->u.gcmp.replays++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_REPLAY;
 		}
 
 		if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -761,7 +761,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
 				    data_len,
 				    skb->data + skb->len -
 				    IEEE80211_GCMP_MIC_LEN))
-				return RX_DROP_UNUSABLE;
+				return RX_DROP_U_MIC_FAIL;
 		}
 
 		memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN);
@@ -930,13 +930,13 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
 		(skb->data + skb->len - sizeof(*mmie));
 	if (mmie->element_id != WLAN_EID_MMIE ||
 	    mmie->length != sizeof(*mmie) - 2)
-		return RX_DROP_UNUSABLE; /* Invalid MMIE */
+		return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
 
 	bip_ipn_swap(ipn, mmie->sequence_number);
 
 	if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
 		key->u.aes_cmac.replays++;
-		return RX_DROP_UNUSABLE;
+		return RX_DROP_U_REPLAY;
 	}
 
 	if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -946,7 +946,7 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx)
 				   skb->data + 24, skb->len - 24, mic);
 		if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
 			key->u.aes_cmac.icverrors++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_MIC_FAIL;
 		}
 	}
 
@@ -986,7 +986,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
 
 	if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) {
 		key->u.aes_cmac.replays++;
-		return RX_DROP_UNUSABLE;
+		return RX_DROP_U_REPLAY;
 	}
 
 	if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -996,7 +996,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx)
 				       skb->data + 24, skb->len - 24, mic);
 		if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
 			key->u.aes_cmac.icverrors++;
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_MIC_FAIL;
 		}
 	}
 
@@ -1079,13 +1079,13 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
 		(skb->data + skb->len - sizeof(*mmie));
 	if (mmie->element_id != WLAN_EID_MMIE ||
 	    mmie->length != sizeof(*mmie) - 2)
-		return RX_DROP_UNUSABLE; /* Invalid MMIE */
+		return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */
 
 	bip_ipn_swap(ipn, mmie->sequence_number);
 
 	if (memcmp(ipn, key->u.aes_gmac.rx_pn, 6) <= 0) {
 		key->u.aes_gmac.replays++;
-		return RX_DROP_UNUSABLE;
+		return RX_DROP_U_REPLAY;
 	}
 
 	if (!(status->flag & RX_FLAG_DECRYPTED)) {
@@ -1104,7 +1104,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx)
 		    crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) {
 			key->u.aes_gmac.icverrors++;
 			kfree(mic);
-			return RX_DROP_UNUSABLE;
+			return RX_DROP_U_MIC_FAIL;
 		}
 		kfree(mic);
 	}
-- 
cgit v1.2.3


From add7370a398930077c6bc257ef5016b040d476eb Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:28 -0400
Subject: sctp: delete the nested flexible array params

This patch deletes the flexible-array params[] from the structure
sctp_inithdr, sctp_addiphdr and sctp_reconf_chunk to avoid some
sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/input.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:278:29: warning: nested flexible array
  ./include/linux/sctp.h:675:30: warning: nested flexible array

This warning is reported if a structure having a flexible array
member is included by other structures.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  6 +++---
 include/net/sctp/sctp.h  |  8 ++++----
 net/sctp/input.c         |  2 +-
 net/sctp/sm_make_chunk.c | 18 +++++++++---------
 net/sctp/sm_statefuns.c  |  2 +-
 net/sctp/stream.c        |  2 +-
 6 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 358dc08e0831..0ff36a2737a3 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -270,7 +270,7 @@ struct sctp_inithdr {
 	__be16 num_outbound_streams;
 	__be16 num_inbound_streams;
 	__be32 initial_tsn;
-	__u8  params[];
+	/* __u8  params[]; */
 };
 
 struct sctp_init_chunk {
@@ -667,7 +667,7 @@ struct sctp_addip_param {
 
 struct sctp_addiphdr {
 	__be32	serial;
-	__u8	params[];
+	/* __u8	params[]; */
 };
 
 struct sctp_addip_chunk {
@@ -742,7 +742,7 @@ struct sctp_infox {
 
 struct sctp_reconf_chunk {
 	struct sctp_chunkhdr chunk_hdr;
-	__u8 params[];
+	/* __u8 params[]; */
 };
 
 struct sctp_strreset_outreq {
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index c335dd01a597..74fae532b944 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -425,11 +425,11 @@ static inline bool sctp_chunk_pending(const struct sctp_chunk *chunk)
  * the chunk length to indicate when to stop.  Make sure
  * there is room for a param header too.
  */
-#define sctp_walk_params(pos, chunk, member)\
-_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member)
+#define sctp_walk_params(pos, chunk)\
+_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length))
 
-#define _sctp_walk_params(pos, chunk, end, member)\
-for (pos.v = chunk->member;\
+#define _sctp_walk_params(pos, chunk, end)\
+for (pos.v = (u8 *)(chunk + 1);\
      (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\
       (void *)chunk + end) &&\
      pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 127bf28a6033..2613c4d74b16 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -1150,7 +1150,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net,
 	init = (struct sctp_init_chunk *)skb->data;
 
 	/* Walk the parameters looking for embedded addresses. */
-	sctp_walk_params(params, init, init_hdr.params) {
+	sctp_walk_params(params, init) {
 
 		/* Note: Ignoring hostname addresses. */
 		af = sctp_get_af_specific(param_type2af(params.p->type));
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index c8f4ec5d5f98..4dbbbc2a7742 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2306,7 +2306,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
 	    ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW)
 		return sctp_process_inv_mandatory(asoc, chunk, errp);
 
-	sctp_walk_params(param, peer_init, init_hdr.params) {
+	sctp_walk_params(param, peer_init) {
 		if (param.p->type == SCTP_PARAM_STATE_COOKIE)
 			has_cookie = true;
 	}
@@ -2329,7 +2329,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
 						  chunk, errp);
 
 	/* Verify all the variable length parameters */
-	sctp_walk_params(param, peer_init, init_hdr.params) {
+	sctp_walk_params(param, peer_init) {
 		result = sctp_verify_param(net, ep, asoc, param, cid,
 					   chunk, errp);
 		switch (result) {
@@ -2381,7 +2381,7 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
 		src_match = 1;
 
 	/* Process the initialization parameters.  */
-	sctp_walk_params(param, peer_init, init_hdr.params) {
+	sctp_walk_params(param, peer_init) {
 		if (!src_match &&
 		    (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
 		     param.p->type == SCTP_PARAM_IPV6_ADDRESS)) {
@@ -3202,7 +3202,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
 	union sctp_params param;
 
 	addip = (struct sctp_addip_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, addip, addip_hdr.params) {
+	sctp_walk_params(param, addip) {
 		size_t length = ntohs(param.p->length);
 
 		*errp = param.p;
@@ -3215,14 +3215,14 @@ bool sctp_verify_asconf(const struct sctp_association *asoc,
 			/* ensure there is only one addr param and it's in the
 			 * beginning of addip_hdr params, or we reject it.
 			 */
-			if (param.v != addip->addip_hdr.params)
+			if (param.v != (addip + 1))
 				return false;
 			addr_param_seen = true;
 			break;
 		case SCTP_PARAM_IPV6_ADDRESS:
 			if (length != sizeof(struct sctp_ipv6addr_param))
 				return false;
-			if (param.v != addip->addip_hdr.params)
+			if (param.v != (addip + 1))
 				return false;
 			addr_param_seen = true;
 			break;
@@ -3302,7 +3302,7 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc,
 		goto done;
 
 	/* Process the TLVs contained within the ASCONF chunk. */
-	sctp_walk_params(param, addip, addip_hdr.params) {
+	sctp_walk_params(param, addip) {
 		/* Skip preceeding address parameters. */
 		if (param.p->type == SCTP_PARAM_IPV4_ADDRESS ||
 		    param.p->type == SCTP_PARAM_IPV6_ADDRESS)
@@ -3636,7 +3636,7 @@ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc,
 		return NULL;
 
 	reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
-	retval->param_hdr.v = reconf->params;
+	retval->param_hdr.v = (u8 *)(reconf + 1);
 
 	return retval;
 }
@@ -3878,7 +3878,7 @@ bool sctp_verify_reconf(const struct sctp_association *asoc,
 	__u16 cnt = 0;
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, hdr, params) {
+	sctp_walk_params(param, hdr) {
 		__u16 length = ntohs(param.p->length);
 
 		*errp = param.p;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index ce5426171206..39d416e7f795 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4142,7 +4142,7 @@ enum sctp_disposition sctp_sf_do_reconf(struct net *net,
 						  (void *)err_param, commands);
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, hdr, params) {
+	sctp_walk_params(param, hdr) {
 		struct sctp_chunk *reply = NULL;
 		struct sctp_ulpevent *ev = NULL;
 
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ee6514af830f..c241cc552e8d 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -491,7 +491,7 @@ static struct sctp_paramhdr *sctp_chunk_lookup_strreset_param(
 		return NULL;
 
 	hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr;
-	sctp_walk_params(param, hdr, params) {
+	sctp_walk_params(param, hdr) {
 		/* sctp_strreset_tsnreq is actually the basic structure
 		 * of all stream reconf params, so it's safe to use it
 		 * to access request_seq.
-- 
cgit v1.2.3


From 73175a042955e531ec355a8708585befa67a22db Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:29 -0400
Subject: sctp: delete the nested flexible array skip

This patch deletes the flexible-array skip[] from the structure
sctp_ifwdtsn/fwdtsn_hdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/stream_interleave.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:611:32: warning: nested flexible array
  ./include/linux/sctp.h:628:33: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h         | 4 ++--
 include/net/sctp/sctp.h      | 4 ++--
 net/sctp/stream_interleave.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 0ff36a2737a3..9815b801fec0 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -603,7 +603,7 @@ struct sctp_fwdtsn_skip {
 
 struct sctp_fwdtsn_hdr {
 	__be32 new_cum_tsn;
-	struct sctp_fwdtsn_skip skip[];
+	/* struct sctp_fwdtsn_skip skip[]; */
 };
 
 struct sctp_fwdtsn_chunk {
@@ -620,7 +620,7 @@ struct sctp_ifwdtsn_skip {
 
 struct sctp_ifwdtsn_hdr {
 	__be32 new_cum_tsn;
-	struct sctp_ifwdtsn_skip skip[];
+	/* struct sctp_ifwdtsn_skip skip[]; */
 };
 
 struct sctp_ifwdtsn_chunk {
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 74fae532b944..2a67100b2a17 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -452,8 +452,8 @@ for (err = (struct sctp_errhdr *)((void *)chunk_hdr + \
 _sctp_walk_fwdtsn((pos), (chunk), ntohs((chunk)->chunk_hdr->length) - sizeof(struct sctp_fwdtsn_chunk))
 
 #define _sctp_walk_fwdtsn(pos, chunk, end)\
-for (pos = chunk->subh.fwdtsn_hdr->skip;\
-     (void *)pos <= (void *)chunk->subh.fwdtsn_hdr->skip + end - sizeof(struct sctp_fwdtsn_skip);\
+for (pos = (void *)(chunk->subh.fwdtsn_hdr + 1);\
+     (void *)pos <= (void *)(chunk->subh.fwdtsn_hdr + 1) + end - sizeof(struct sctp_fwdtsn_skip);\
      pos++)
 
 /* External references. */
diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c
index b046b11200c9..840f24045ae2 100644
--- a/net/sctp/stream_interleave.c
+++ b/net/sctp/stream_interleave.c
@@ -1153,8 +1153,8 @@ static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn)
 }
 
 #define _sctp_walk_ifwdtsn(pos, chunk, end) \
-	for (pos = chunk->subh.ifwdtsn_hdr->skip; \
-	     (void *)pos <= (void *)chunk->subh.ifwdtsn_hdr->skip + (end) - \
+	for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \
+	     (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \
 			    sizeof(struct sctp_ifwdtsn_skip); pos++)
 
 #define sctp_walk_ifwdtsn(pos, ch) \
-- 
cgit v1.2.3


From 9789c1c6619e0a5eccfc31abe49b1ce5ca3cd11f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:30 -0400
Subject: sctp: delete the nested flexible array variable

This patch deletes the flexible-array variable[] from the structure
sctp_sackhdr and sctp_errhdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/sm_statefuns.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:451:28: warning: nested flexible array
  ./include/linux/sctp.h:393:29: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h     |  4 ++--
 net/sctp/outqueue.c      | 11 +++++++----
 net/sctp/sm_sideeffect.c |  3 +--
 net/sctp/sm_statefuns.c  |  2 +-
 4 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 9815b801fec0..01a0eb7e9fa1 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -385,7 +385,7 @@ struct sctp_sackhdr {
 	__be32 a_rwnd;
 	__be16 num_gap_ack_blocks;
 	__be16 num_dup_tsns;
-	union sctp_sack_variable variable[];
+	/* union sctp_sack_variable variable[]; */
 };
 
 struct sctp_sack_chunk {
@@ -443,7 +443,7 @@ struct sctp_shutdown_chunk {
 struct sctp_errhdr {
 	__be16 cause;
 	__be16 length;
-	__u8  variable[];
+	/* __u8  variable[]; */
 };
 
 struct sctp_operr_chunk {
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 20831079fb09..0dc6b8ab9963 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1231,7 +1231,7 @@ static void sctp_sack_update_unack_data(struct sctp_association *assoc,
 
 	unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1;
 
-	frags = sack->variable;
+	frags = (union sctp_sack_variable *)(sack + 1);
 	for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) {
 		unack_data -= ((ntohs(frags[i].gab.end) -
 				ntohs(frags[i].gab.start) + 1));
@@ -1252,7 +1252,6 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 	struct sctp_transport *transport;
 	struct sctp_chunk *tchunk = NULL;
 	struct list_head *lchunk, *transport_list, *temp;
-	union sctp_sack_variable *frags = sack->variable;
 	__u32 sack_ctsn, ctsn, tsn;
 	__u32 highest_tsn, highest_new_tsn;
 	__u32 sack_a_rwnd;
@@ -1313,8 +1312,12 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 
 	/* Get the highest TSN in the sack. */
 	highest_tsn = sack_ctsn;
-	if (gap_ack_blocks)
+	if (gap_ack_blocks) {
+		union sctp_sack_variable *frags =
+			(union sctp_sack_variable *)(sack + 1);
+
 		highest_tsn += ntohs(frags[gap_ack_blocks - 1].gab.end);
+	}
 
 	if (TSN_lt(asoc->highest_sacked, highest_tsn))
 		asoc->highest_sacked = highest_tsn;
@@ -1789,7 +1792,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn)
 	 *  Block are assumed to have been received correctly.
 	 */
 
-	frags = sack->variable;
+	frags = (union sctp_sack_variable *)(sack + 1);
 	blocks = ntohs(sack->num_gap_ack_blocks);
 	tsn_offset = tsn - ctsn;
 	for (i = 0; i < blocks; ++i) {
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 463c4a58d2c3..7fbeb99d8d32 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -984,8 +984,7 @@ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds,
 		{
 			struct sctp_chunkhdr *unk_chunk_hdr;
 
-			unk_chunk_hdr = (struct sctp_chunkhdr *)
-							err_hdr->variable;
+			unk_chunk_hdr = (struct sctp_chunkhdr *)(err_hdr + 1);
 			switch (unk_chunk_hdr->type) {
 			/* ADDIP 4.1 A9) If the peer responds to an ASCONF with
 			 * an ERROR chunk reporting that it did not recognized
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 39d416e7f795..8d0cfd689b20 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1337,7 +1337,7 @@ static int sctp_sf_send_restart_abort(struct net *net, union sctp_addr *ssa,
 	 * throughout the code today.
 	 */
 	errhdr = (struct sctp_errhdr *)buffer;
-	addrparm = (union sctp_addr_param *)errhdr->variable;
+	addrparm = (union sctp_addr_param *)(errhdr + 1);
 
 	/* Copy into a parm format. */
 	len = af->to_addr_param(ssa, addrparm);
-- 
cgit v1.2.3


From f97278ff346a5f11c68d0ac499999d5ad06a6db2 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:31 -0400
Subject: sctp: delete the nested flexible array peer_init

This patch deletes the flexible-array peer_init[] from the structure
sctp_cookie to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/sm_make_chunk.c: note: in included file (through include/net/sctp/sctp.h):
  ./include/net/sctp/structs.h:1588:28: warning: nested flexible array
  ./include/net/sctp/structs.h:343:28: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 2 +-
 net/sctp/associola.c       | 5 +++--
 net/sctp/sm_make_chunk.c   | 4 ++--
 net/sctp/sm_statefuns.c    | 8 +++-----
 4 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 070c9458fff4..5c72d1864dd6 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -332,7 +332,7 @@ struct sctp_cookie {
 	 * the association TCB is re-constructed from the cookie.
 	 */
 	__u32 raw_addr_list_len;
-	struct sctp_init_chunk peer_init[];
+	/* struct sctp_init_chunk peer_init[]; */
 };
 
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 63ba5551c13f..796529167e8d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1597,9 +1597,10 @@ int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
 					 struct sctp_cookie *cookie,
 					 gfp_t gfp)
 {
-	int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
+	struct sctp_init_chunk *peer_init = (struct sctp_init_chunk *)(cookie + 1);
+	int var_size2 = ntohs(peer_init->chunk_hdr.length);
 	int var_size3 = cookie->raw_addr_list_len;
-	__u8 *raw = (__u8 *)cookie->peer_init + var_size2;
+	__u8 *raw = (__u8 *)peer_init + var_size2;
 
 	return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3,
 				      asoc->ep->base.bind_addr.port, gfp);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 4dbbbc2a7742..08527d882e56 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1707,11 +1707,11 @@ static struct sctp_cookie_param *sctp_pack_cookie(
 					 ktime_get_real());
 
 	/* Copy the peer's init packet.  */
-	memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
+	memcpy(cookie + 1, init_chunk->chunk_hdr,
 	       ntohs(init_chunk->chunk_hdr->length));
 
 	/* Copy the raw local address list of the association. */
-	memcpy((__u8 *)&cookie->c.peer_init[0] +
+	memcpy((__u8 *)(cookie + 1) +
 	       ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
 
 	if (sctp_sk(ep->base.sk)->hmac) {
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8d0cfd689b20..7b8eb735fa88 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -794,8 +794,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
 	/* This is a brand-new association, so these are not yet side
 	 * effects--it is safe to run them here.
 	 */
-	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
-
+	peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
 	if (!sctp_process_init(new_asoc, chunk,
 			       &chunk->subh.cookie_hdr->c.peer_addr,
 			       peer_init, GFP_ATOMIC))
@@ -1869,8 +1868,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_a(
 	/* new_asoc is a brand-new association, so these are not yet
 	 * side effects--it is safe to run them here.
 	 */
-	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
-
+	peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
 	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
 			       GFP_ATOMIC))
 		goto nomem;
@@ -1990,7 +1988,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
 	/* new_asoc is a brand-new association, so these are not yet
 	 * side effects--it is safe to run them here.
 	 */
-	peer_init = &chunk->subh.cookie_hdr->c.peer_init[0];
+	peer_init = (struct sctp_init_chunk *)(chunk->subh.cookie_hdr + 1);
 	if (!sctp_process_init(new_asoc, chunk, sctp_source(chunk), peer_init,
 			       GFP_ATOMIC))
 		goto nomem;
-- 
cgit v1.2.3


From 2ab399a931dddacdf7202cd4b49a5187154623d1 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:32 -0400
Subject: sctp: delete the nested flexible array hmac

This patch deletes the flexible-array hmac[] from the structure
sctp_authhdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/auth.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:735:29: warning: nested flexible array

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h    | 2 +-
 net/sctp/auth.c         | 2 +-
 net/sctp/sm_statefuns.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 01a0eb7e9fa1..d182e8c41985 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -727,7 +727,7 @@ struct sctp_addip_chunk {
 struct sctp_authhdr {
 	__be16 shkey_id;
 	__be16 hmac_id;
-	__u8   hmac[];
+	/* __u8   hmac[]; */
 };
 
 struct sctp_auth_chunk {
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 34964145514e..c58fffc86a0c 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -738,7 +738,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
 
 	tfm = asoc->ep->auth_hmacs[hmac_id];
 
-	digest = auth->auth_hdr.hmac;
+	digest = (u8 *)(&auth->auth_hdr + 1);
 	if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
 		goto free;
 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 7b8eb735fa88..97f1155a2045 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4391,7 +4391,7 @@ static enum sctp_ierror sctp_sf_authenticate(
 	 *  3. Compute the new digest
 	 *  4. Compare saved and new digests.
 	 */
-	digest = auth_hdr->hmac;
+	digest = (u8 *)(auth_hdr + 1);
 	skb_pull(chunk->skb, sig_len);
 
 	save_digest = kmemdup(digest, sig_len, GFP_ATOMIC);
-- 
cgit v1.2.3


From dbda0fba7a14f14835c34d59fd329cb90a887862 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 19 Apr 2023 11:16:33 -0400
Subject: sctp: delete the nested flexible array payload

This patch deletes the flexible-array payload[] from the structure
sctp_datahdr to avoid some sparse warnings:

  # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/
  net/sctp/socket.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h):
  ./include/linux/sctp.h:230:29: warning: nested flexible array

This member is not even used anywhere.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sctp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index d182e8c41985..836a7e200f39 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -222,7 +222,7 @@ struct sctp_datahdr {
 	__be16 stream;
 	__be16 ssn;
 	__u32 ppid;
-	__u8  payload[];
+	/* __u8  payload[]; */
 };
 
 struct sctp_data_chunk {
-- 
cgit v1.2.3


From a714e3ec230892039b5d5ae6902b58bb084a15c1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 19 Apr 2023 18:34:54 +0300
Subject: bridge: Add internal flags for per-{Port, VLAN} neighbor suppression

Add two internal flags that will be used to enable / disable per-{Port,
VLAN} neighbor suppression:

1. 'BR_NEIGH_VLAN_SUPPRESS': A per-port flag used to indicate that
per-{Port, VLAN} neighbor suppression is enabled on the bridge port.
When set, 'BR_NEIGH_SUPPRESS' has no effect.

2. 'BR_VLFLAG_NEIGH_SUPPRESS_ENABLED': A per-VLAN flag used to indicate
that neighbor suppression is enabled on the given VLAN.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 1 +
 net/bridge/br_private.h   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 1668ac4d7adc..3ff96ae31bf6 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -60,6 +60,7 @@ struct br_ip_list {
 #define BR_TX_FWD_OFFLOAD	BIT(20)
 #define BR_PORT_LOCKED		BIT(21)
 #define BR_PORT_MAB		BIT(22)
+#define BR_NEIGH_VLAN_SUPPRESS	BIT(23)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 1ff4d64ab584..b17fc821ecc8 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -178,6 +178,7 @@ enum {
 	BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1),
 	BR_VLFLAG_MCAST_ENABLED = BIT(2),
 	BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3),
+	BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4),
 };
 
 /**
-- 
cgit v1.2.3


From 83f6d600796c65ab34b08dbddb5795099dfda4d1 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 19 Apr 2023 18:34:58 +0300
Subject: bridge: vlan: Allow setting VLAN neighbor suppression state

Add a new VLAN attribute that allows user space to set the neighbor
suppression state of the port VLAN. Example:

 # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]'
 false
 # bridge vlan set vid 10 dev swp1 neigh_suppress on
 # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]'
 true
 # bridge vlan set vid 10 dev swp1 neigh_suppress off
 # bridge -d -j -p vlan show dev swp1 vid 10 | jq '.[]["vlans"][]["neigh_suppress"]'
 false

 # bridge vlan set vid 10 dev br0 neigh_suppress on
 Error: bridge: Can't set neigh_suppress for non-port vlans.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_vlan.c           |  1 +
 net/bridge/br_vlan_options.c   | 20 +++++++++++++++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index c9d624f528c5..f95326fce6bb 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -525,6 +525,7 @@ enum {
 	BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
 	BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS,
 	BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS,
+	BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
 	__BRIDGE_VLANDB_ENTRY_MAX,
 };
 #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1)
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 8a3dbc09ba38..15f44d026e75 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -2134,6 +2134,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
 	[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]	= { .type = NLA_U8 },
 	[BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS]	= { .type = NLA_REJECT },
 	[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]	= { .type = NLA_U32 },
+	[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]	= NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index e378c2f3a9e2..8fa89b04ee94 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -52,7 +52,9 @@ bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
 		       const struct net_bridge_port *p)
 {
 	if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
-	    !__vlan_tun_put(skb, v))
+	    !__vlan_tun_put(skb, v) ||
+	    nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
+		       !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED)))
 		return false;
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -80,6 +82,7 @@ size_t br_vlan_opts_nl_size(void)
 	       + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */
 	       + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */
 #endif
+	       + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */
 	       + 0;
 }
 
@@ -239,6 +242,21 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
 	}
 #endif
 
+	if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) {
+		bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+		bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]);
+
+		if (!p) {
+			NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans");
+			return -EINVAL;
+		}
+
+		if (val != enabled) {
+			v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+			*changed = true;
+		}
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 160656d7201d861a1f2a0bf279a765e8cda2317a Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Wed, 19 Apr 2023 18:34:59 +0300
Subject: bridge: Allow setting per-{Port, VLAN} neighbor suppression state

Add a new bridge port attribute that allows user space to enable
per-{Port, VLAN} neighbor suppression. Example:

 # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]'
 false
 # bridge link set dev swp1 neigh_vlan_suppress on
 # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]'
 true
 # bridge link set dev swp1 neigh_vlan_suppress off
 # bridge -d -j -p link show dev swp1 | jq '.[]["neigh_vlan_suppress"]'
 false

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 net/bridge/br_netlink.c      | 8 +++++++-
 net/core/rtnetlink.c         | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8d679688efe0..4ac1000b0ef2 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -569,6 +569,7 @@ enum {
 	IFLA_BRPORT_MAB,
 	IFLA_BRPORT_MCAST_N_GROUPS,
 	IFLA_BRPORT_MCAST_MAX_GROUPS,
+	IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index fefb1c0e248b..05c5863d2e20 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -189,6 +189,7 @@ static inline size_t br_port_info_size(void)
 		+ nla_total_size(1)	/* IFLA_BRPORT_ISOLATED */
 		+ nla_total_size(1)	/* IFLA_BRPORT_LOCKED */
 		+ nla_total_size(1)	/* IFLA_BRPORT_MAB */
+		+ nla_total_size(1)	/* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_ROOT_ID */
 		+ nla_total_size(sizeof(struct ifla_bridge_id))	/* IFLA_BRPORT_BRIDGE_ID */
 		+ nla_total_size(sizeof(u16))	/* IFLA_BRPORT_DESIGNATED_PORT */
@@ -278,7 +279,9 @@ static int br_port_fill_attrs(struct sk_buff *skb,
 		       !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
 	    nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
-	    nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)))
+	    nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) ||
+	    nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+		       !!(p->flags & BR_NEIGH_VLAN_SUPPRESS)))
 		return -EMSGSIZE;
 
 	timerval = br_timer_value(&p->message_age_timer);
@@ -891,6 +894,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
 	[IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
 	[IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+	[IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
 };
 
 /* Change the state of the port and notify spanning tree */
@@ -957,6 +961,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
 	br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
 	br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
 	br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
+	br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+			 BR_NEIGH_VLAN_SUPPRESS);
 
 	if ((p->flags & BR_PORT_MAB) &&
 	    (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e844d75220fb..653901a1bf75 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -61,7 +61,7 @@
 #include "dev.h"
 
 #define RTNL_MAX_TYPE		50
-#define RTNL_SLAVE_MAX_TYPE	42
+#define RTNL_SLAVE_MAX_TYPE	43
 
 struct rtnl_link {
 	rtnl_doit_func		doit;
-- 
cgit v1.2.3


From 55435ea7729accb5b8a330de751836c4be524834 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:14 -0700
Subject: pds_core: initial framework for pds_core PF driver

This is the initial PCI driver framework for the new pds_core device
driver and its family of devices.  This does the very basics of
registering for the new PF PCI device 1dd8:100c, setting up debugfs
entries, and registering with devlink.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../device_drivers/ethernet/amd/pds_core.rst       |  35 ++
 .../networking/device_drivers/ethernet/index.rst   |   1 +
 drivers/net/ethernet/amd/pds_core/Makefile         |   8 +
 drivers/net/ethernet/amd/pds_core/core.h           |  56 ++
 drivers/net/ethernet/amd/pds_core/debugfs.c        |  31 ++
 drivers/net/ethernet/amd/pds_core/main.c           | 277 ++++++++++
 include/linux/pds/pds_common.h                     |  14 +
 include/linux/pds/pds_core_if.h                    | 571 +++++++++++++++++++++
 8 files changed, 993 insertions(+)
 create mode 100644 Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
 create mode 100644 drivers/net/ethernet/amd/pds_core/Makefile
 create mode 100644 drivers/net/ethernet/amd/pds_core/core.h
 create mode 100644 drivers/net/ethernet/amd/pds_core/debugfs.c
 create mode 100644 drivers/net/ethernet/amd/pds_core/main.c
 create mode 100644 include/linux/pds/pds_common.h
 create mode 100644 include/linux/pds/pds_core_if.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
new file mode 100644
index 000000000000..99a70026f1bc
--- /dev/null
+++ b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
@@ -0,0 +1,35 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+========================================================
+Linux Driver for the AMD/Pensando(R) DSC adapter family
+========================================================
+
+Copyright(c) 2023 Advanced Micro Devices, Inc
+
+Identifying the Adapter
+=======================
+
+To find if one or more AMD/Pensando PCI Core devices are installed on the
+host, check for the PCI devices::
+
+  # lspci -d 1dd8:100c
+  b5:00.0 Processing accelerators: Pensando Systems Device 100c
+  b6:00.0 Processing accelerators: Pensando Systems Device 100c
+
+If such devices are listed as above, then the pds_core.ko driver should find
+and configure them for use.  There should be log entries in the kernel
+messages such as these::
+
+  $ dmesg | grep pds_core
+  pds_core 0000:b5:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
+  pds_core 0000:b5:00.0: FW: 1.60.0-73
+  pds_core 0000:b6:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
+  pds_core 0000:b6:00.0: FW: 1.60.0-73
+
+Support
+=======
+
+For general Linux networking support, please use the netdev mailing
+list, which is monitored by AMD/Pensando personnel::
+
+  netdev@vger.kernel.org
diff --git a/Documentation/networking/device_drivers/ethernet/index.rst b/Documentation/networking/device_drivers/ethernet/index.rst
index 6e9e7012d000..417ca514a4d0 100644
--- a/Documentation/networking/device_drivers/ethernet/index.rst
+++ b/Documentation/networking/device_drivers/ethernet/index.rst
@@ -14,6 +14,7 @@ Contents:
    3com/vortex
    amazon/ena
    altera/altera_tse
+   amd/pds_core
    aquantia/atlantic
    chelsio/cxgb
    cirrus/cs89x0
diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
new file mode 100644
index 000000000000..de3bf1d1886c
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2023 Advanced Micro Devices, Inc.
+
+obj-$(CONFIG_PDS_CORE) := pds_core.o
+
+pds_core-y := main.o
+
+pds_core-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
new file mode 100644
index 000000000000..34ef837e8cfe
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDSC_H_
+#define _PDSC_H_
+
+#include <linux/debugfs.h>
+#include <net/devlink.h>
+
+#include <linux/pds/pds_common.h>
+#include <linux/pds/pds_core_if.h>
+
+#define PDSC_DRV_DESCRIPTION	"AMD/Pensando Core Driver"
+
+struct pdsc_dev_bar {
+	void __iomem *vaddr;
+	phys_addr_t bus_addr;
+	unsigned long len;
+	int res_index;
+};
+
+/* No state flags set means we are in a steady running state */
+enum pdsc_state_flags {
+	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
+	PDSC_S_INITING_DRIVER,	    /* initial startup from probe */
+	PDSC_S_STOPPING_DRIVER,	    /* driver remove */
+
+	/* leave this as last */
+	PDSC_S_STATE_SIZE
+};
+
+struct pdsc {
+	struct pci_dev *pdev;
+	struct dentry *dentry;
+	struct device *dev;
+	struct pdsc_dev_bar bars[PDS_CORE_BARS_MAX];
+	int hw_index;
+	int uid;
+
+	unsigned long state;
+
+	struct pds_core_dev_info_regs __iomem *info_regs;
+	struct pds_core_dev_cmd_regs __iomem *cmd_regs;
+	struct pds_core_intr __iomem *intr_ctrl;
+	u64 __iomem *intr_status;
+	u64 __iomem *db_pages;
+	dma_addr_t phy_db_pages;
+	u64 __iomem *kern_dbpage;
+};
+
+void pdsc_debugfs_create(void);
+void pdsc_debugfs_destroy(void);
+void pdsc_debugfs_add_dev(struct pdsc *pdsc);
+void pdsc_debugfs_del_dev(struct pdsc *pdsc);
+
+#endif /* _PDSC_H_ */
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
new file mode 100644
index 000000000000..b2f7cb795c20
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/pci.h>
+
+#include "core.h"
+
+static struct dentry *pdsc_dir;
+
+void pdsc_debugfs_create(void)
+{
+	pdsc_dir = debugfs_create_dir(PDS_CORE_DRV_NAME, NULL);
+}
+
+void pdsc_debugfs_destroy(void)
+{
+	debugfs_remove_recursive(pdsc_dir);
+}
+
+void pdsc_debugfs_add_dev(struct pdsc *pdsc)
+{
+	pdsc->dentry = debugfs_create_dir(pci_name(pdsc->pdev), pdsc_dir);
+
+	debugfs_create_ulong("state", 0400, pdsc->dentry, &pdsc->state);
+}
+
+void pdsc_debugfs_del_dev(struct pdsc *pdsc)
+{
+	debugfs_remove_recursive(pdsc->dentry);
+	pdsc->dentry = NULL;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
new file mode 100644
index 000000000000..c2b12f226959
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/pci.h>
+
+#include <linux/pds/pds_common.h>
+
+#include "core.h"
+
+MODULE_DESCRIPTION(PDSC_DRV_DESCRIPTION);
+MODULE_AUTHOR("Advanced Micro Devices, Inc");
+MODULE_LICENSE("GPL");
+
+/* Supported devices */
+static const struct pci_device_id pdsc_id_table[] = {
+	{ PCI_VDEVICE(PENSANDO, PCI_DEVICE_ID_PENSANDO_CORE_PF) },
+	{ 0, }	/* end of table */
+};
+MODULE_DEVICE_TABLE(pci, pdsc_id_table);
+
+static void pdsc_unmap_bars(struct pdsc *pdsc)
+{
+	struct pdsc_dev_bar *bars = pdsc->bars;
+	unsigned int i;
+
+	for (i = 0; i < PDS_CORE_BARS_MAX; i++) {
+		if (bars[i].vaddr)
+			pci_iounmap(pdsc->pdev, bars[i].vaddr);
+	}
+}
+
+static int pdsc_map_bars(struct pdsc *pdsc)
+{
+	struct pdsc_dev_bar *bar = pdsc->bars;
+	struct pci_dev *pdev = pdsc->pdev;
+	struct device *dev = pdsc->dev;
+	struct pdsc_dev_bar *bars;
+	unsigned int i, j;
+	int num_bars = 0;
+	int err;
+	u32 sig;
+
+	bars = pdsc->bars;
+
+	/* Since the PCI interface in the hardware is configurable,
+	 * we need to poke into all the bars to find the set we're
+	 * expecting.
+	 */
+	for (i = 0, j = 0; i < PDS_CORE_BARS_MAX; i++) {
+		if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+			continue;
+
+		bars[j].len = pci_resource_len(pdev, i);
+		bars[j].bus_addr = pci_resource_start(pdev, i);
+		bars[j].res_index = i;
+
+		/* only map the whole bar 0 */
+		if (j > 0) {
+			bars[j].vaddr = NULL;
+		} else {
+			bars[j].vaddr = pci_iomap(pdev, i, bars[j].len);
+			if (!bars[j].vaddr) {
+				dev_err(dev, "Cannot map BAR %d, aborting\n", i);
+				return -ENODEV;
+			}
+		}
+
+		j++;
+	}
+	num_bars = j;
+
+	/* BAR0: dev_cmd and interrupts */
+	if (num_bars < 1) {
+		dev_err(dev, "No bars found\n");
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	if (bar->len < PDS_CORE_BAR0_SIZE) {
+		dev_err(dev, "Resource bar size %lu too small\n", bar->len);
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	pdsc->info_regs = bar->vaddr + PDS_CORE_BAR0_DEV_INFO_REGS_OFFSET;
+	pdsc->cmd_regs = bar->vaddr + PDS_CORE_BAR0_DEV_CMD_REGS_OFFSET;
+	pdsc->intr_status = bar->vaddr + PDS_CORE_BAR0_INTR_STATUS_OFFSET;
+	pdsc->intr_ctrl = bar->vaddr + PDS_CORE_BAR0_INTR_CTRL_OFFSET;
+
+	sig = ioread32(&pdsc->info_regs->signature);
+	if (sig != PDS_CORE_DEV_INFO_SIGNATURE) {
+		dev_err(dev, "Incompatible firmware signature %x\n", sig);
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	/* BAR1: doorbells */
+	bar++;
+	if (num_bars < 2) {
+		dev_err(dev, "Doorbell bar missing\n");
+		err = -EFAULT;
+		goto err_out;
+	}
+
+	pdsc->db_pages = bar->vaddr;
+	pdsc->phy_db_pages = bar->bus_addr;
+
+	return 0;
+
+err_out:
+	pdsc_unmap_bars(pdsc);
+	return err;
+}
+
+static int pdsc_init_vf(struct pdsc *vf)
+{
+	return -1;
+}
+
+static int pdsc_init_pf(struct pdsc *pdsc)
+{
+	struct devlink *dl;
+	int err;
+
+	pcie_print_link_status(pdsc->pdev);
+
+	err = pci_request_regions(pdsc->pdev, PDS_CORE_DRV_NAME);
+	if (err) {
+		dev_err(pdsc->dev, "Cannot request PCI regions: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	err = pdsc_map_bars(pdsc);
+	if (err)
+		goto err_out_release_regions;
+
+	dl = priv_to_devlink(pdsc);
+	devl_lock(dl);
+	devl_register(dl);
+	devl_unlock(dl);
+
+	return 0;
+
+err_out_release_regions:
+	pci_release_regions(pdsc->pdev);
+
+	return err;
+}
+
+static const struct devlink_ops pdsc_dl_ops = {
+};
+
+static const struct devlink_ops pdsc_dl_vf_ops = {
+};
+
+static DEFINE_IDA(pdsc_ida);
+
+static int pdsc_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+	struct device *dev = &pdev->dev;
+	const struct devlink_ops *ops;
+	struct devlink *dl;
+	struct pdsc *pdsc;
+	bool is_pf;
+	int err;
+
+	is_pf = !pdev->is_virtfn;
+	ops = is_pf ? &pdsc_dl_ops : &pdsc_dl_vf_ops;
+	dl = devlink_alloc(ops, sizeof(struct pdsc), dev);
+	if (!dl)
+		return -ENOMEM;
+	pdsc = devlink_priv(dl);
+
+	pdsc->pdev = pdev;
+	pdsc->dev = &pdev->dev;
+	set_bit(PDSC_S_INITING_DRIVER, &pdsc->state);
+	pci_set_drvdata(pdev, pdsc);
+	pdsc_debugfs_add_dev(pdsc);
+
+	err = ida_alloc(&pdsc_ida, GFP_KERNEL);
+	if (err < 0) {
+		dev_err(pdsc->dev, "%s: id alloc failed: %pe\n",
+			__func__, ERR_PTR(err));
+		goto err_out_free_devlink;
+	}
+	pdsc->uid = err;
+
+	/* Query system for DMA addressing limitation for the device. */
+	err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(PDS_CORE_ADDR_LEN));
+	if (err) {
+		dev_err(dev, "Unable to obtain 64-bit DMA for consistent allocations, aborting: %pe\n",
+			ERR_PTR(err));
+		goto err_out_free_ida;
+	}
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(dev, "Cannot enable PCI device: %pe\n", ERR_PTR(err));
+		goto err_out_free_ida;
+	}
+	pci_set_master(pdev);
+
+	if (is_pf)
+		err = pdsc_init_pf(pdsc);
+	else
+		err = pdsc_init_vf(pdsc);
+	if (err) {
+		dev_err(dev, "Cannot init device: %pe\n", ERR_PTR(err));
+		goto err_out_clear_master;
+	}
+
+	clear_bit(PDSC_S_INITING_DRIVER, &pdsc->state);
+	return 0;
+
+err_out_clear_master:
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+err_out_free_ida:
+	ida_free(&pdsc_ida, pdsc->uid);
+err_out_free_devlink:
+	pdsc_debugfs_del_dev(pdsc);
+	devlink_free(dl);
+
+	return err;
+}
+
+static void pdsc_remove(struct pci_dev *pdev)
+{
+	struct pdsc *pdsc = pci_get_drvdata(pdev);
+	struct devlink *dl;
+
+	/* Unhook the registrations first to be sure there
+	 * are no requests while we're stopping.
+	 */
+	dl = priv_to_devlink(pdsc);
+	devl_lock(dl);
+	devl_unregister(dl);
+	devl_unlock(dl);
+
+	pdsc_unmap_bars(pdsc);
+	pci_release_regions(pdev);
+
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+
+	ida_free(&pdsc_ida, pdsc->uid);
+	pdsc_debugfs_del_dev(pdsc);
+	devlink_free(dl);
+}
+
+static struct pci_driver pdsc_driver = {
+	.name = PDS_CORE_DRV_NAME,
+	.id_table = pdsc_id_table,
+	.probe = pdsc_probe,
+	.remove = pdsc_remove,
+};
+
+static int __init pdsc_init_module(void)
+{
+	if (strcmp(KBUILD_MODNAME, PDS_CORE_DRV_NAME))
+		return -EINVAL;
+
+	pdsc_debugfs_create();
+	return pci_register_driver(&pdsc_driver);
+}
+
+static void __exit pdsc_cleanup_module(void)
+{
+	pci_unregister_driver(&pdsc_driver);
+	pdsc_debugfs_destroy();
+}
+
+module_init(pdsc_init_module);
+module_exit(pdsc_cleanup_module);
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
new file mode 100644
index 000000000000..bd041a5170a6
--- /dev/null
+++ b/include/linux/pds/pds_common.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PDS_COMMON_H_
+#define _PDS_COMMON_H_
+
+#define PDS_CORE_DRV_NAME			"pds_core"
+
+/* the device's internal addressing uses up to 52 bits */
+#define PDS_CORE_ADDR_LEN	52
+#define PDS_CORE_ADDR_MASK	(BIT_ULL(PDS_CORE_ADDR_LEN) - 1)
+#define PDS_PAGE_SIZE		4096
+
+#endif /* _PDS_COMMON_H_ */
diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h
new file mode 100644
index 000000000000..e838a2b90440
--- /dev/null
+++ b/include/linux/pds/pds_core_if.h
@@ -0,0 +1,571 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PDS_CORE_IF_H_
+#define _PDS_CORE_IF_H_
+
+#define PCI_VENDOR_ID_PENSANDO			0x1dd8
+#define PCI_DEVICE_ID_PENSANDO_CORE_PF		0x100c
+#define PCI_DEVICE_ID_VIRTIO_NET_TRANS		0x1000
+#define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF	0x1003
+#define PCI_DEVICE_ID_PENSANDO_VDPA_VF		0x100b
+#define PDS_CORE_BARS_MAX			4
+#define PDS_CORE_PCI_BAR_DBELL			1
+
+/* Bar0 */
+#define PDS_CORE_DEV_INFO_SIGNATURE		0x44455649 /* 'DEVI' */
+#define PDS_CORE_BAR0_SIZE			0x8000
+#define PDS_CORE_BAR0_DEV_INFO_REGS_OFFSET	0x0000
+#define PDS_CORE_BAR0_DEV_CMD_REGS_OFFSET	0x0800
+#define PDS_CORE_BAR0_DEV_CMD_DATA_REGS_OFFSET	0x0c00
+#define PDS_CORE_BAR0_INTR_STATUS_OFFSET	0x1000
+#define PDS_CORE_BAR0_INTR_CTRL_OFFSET		0x2000
+#define PDS_CORE_DEV_CMD_DONE			0x00000001
+
+#define PDS_CORE_DEVCMD_TIMEOUT			5
+
+#define PDS_CORE_CLIENT_ID			0
+#define PDS_CORE_ASIC_TYPE_CAPRI		0
+
+/*
+ * enum pds_core_cmd_opcode - Device commands
+ */
+enum pds_core_cmd_opcode {
+	/* Core init */
+	PDS_CORE_CMD_NOP		= 0,
+	PDS_CORE_CMD_IDENTIFY		= 1,
+	PDS_CORE_CMD_RESET		= 2,
+	PDS_CORE_CMD_INIT		= 3,
+
+	PDS_CORE_CMD_FW_DOWNLOAD	= 4,
+	PDS_CORE_CMD_FW_CONTROL		= 5,
+
+	/* SR/IOV commands */
+	PDS_CORE_CMD_VF_GETATTR		= 60,
+	PDS_CORE_CMD_VF_SETATTR		= 61,
+	PDS_CORE_CMD_VF_CTRL		= 62,
+
+	/* Add commands before this line */
+	PDS_CORE_CMD_MAX,
+	PDS_CORE_CMD_COUNT
+};
+
+/*
+ * enum pds_core_status_code - Device command return codes
+ */
+enum pds_core_status_code {
+	PDS_RC_SUCCESS	= 0,	/* Success */
+	PDS_RC_EVERSION	= 1,	/* Incorrect version for request */
+	PDS_RC_EOPCODE	= 2,	/* Invalid cmd opcode */
+	PDS_RC_EIO	= 3,	/* I/O error */
+	PDS_RC_EPERM	= 4,	/* Permission denied */
+	PDS_RC_EQID	= 5,	/* Bad qid */
+	PDS_RC_EQTYPE	= 6,	/* Bad qtype */
+	PDS_RC_ENOENT	= 7,	/* No such element */
+	PDS_RC_EINTR	= 8,	/* operation interrupted */
+	PDS_RC_EAGAIN	= 9,	/* Try again */
+	PDS_RC_ENOMEM	= 10,	/* Out of memory */
+	PDS_RC_EFAULT	= 11,	/* Bad address */
+	PDS_RC_EBUSY	= 12,	/* Device or resource busy */
+	PDS_RC_EEXIST	= 13,	/* object already exists */
+	PDS_RC_EINVAL	= 14,	/* Invalid argument */
+	PDS_RC_ENOSPC	= 15,	/* No space left or alloc failure */
+	PDS_RC_ERANGE	= 16,	/* Parameter out of range */
+	PDS_RC_BAD_ADDR	= 17,	/* Descriptor contains a bad ptr */
+	PDS_RC_DEV_CMD	= 18,	/* Device cmd attempted on AdminQ */
+	PDS_RC_ENOSUPP	= 19,	/* Operation not supported */
+	PDS_RC_ERROR	= 29,	/* Generic error */
+	PDS_RC_ERDMA	= 30,	/* Generic RDMA error */
+	PDS_RC_EVFID	= 31,	/* VF ID does not exist */
+	PDS_RC_BAD_FW	= 32,	/* FW file is invalid or corrupted */
+	PDS_RC_ECLIENT	= 33,   /* No such client id */
+};
+
+/**
+ * struct pds_core_drv_identity - Driver identity information
+ * @drv_type:         Driver type (enum pds_core_driver_type)
+ * @os_dist:          OS distribution, numeric format
+ * @os_dist_str:      OS distribution, string format
+ * @kernel_ver:       Kernel version, numeric format
+ * @kernel_ver_str:   Kernel version, string format
+ * @driver_ver_str:   Driver version, string format
+ */
+struct pds_core_drv_identity {
+	__le32 drv_type;
+	__le32 os_dist;
+	char   os_dist_str[128];
+	__le32 kernel_ver;
+	char   kernel_ver_str[32];
+	char   driver_ver_str[32];
+};
+
+#define PDS_DEV_TYPE_MAX	16
+/**
+ * struct pds_core_dev_identity - Device identity information
+ * @version:	      Version of device identify
+ * @type:	      Identify type (0 for now)
+ * @state:	      Device state
+ * @rsvd:	      Word boundary padding
+ * @nlifs:	      Number of LIFs provisioned
+ * @nintrs:	      Number of interrupts provisioned
+ * @ndbpgs_per_lif:   Number of doorbell pages per LIF
+ * @intr_coal_mult:   Interrupt coalescing multiplication factor
+ *		      Scale user-supplied interrupt coalescing
+ *		      value in usecs to device units using:
+ *		      device units = usecs * mult / div
+ * @intr_coal_div:    Interrupt coalescing division factor
+ *		      Scale user-supplied interrupt coalescing
+ *		      value in usecs to device units using:
+ *		      device units = usecs * mult / div
+ * @vif_types:        How many of each VIF device type is supported
+ */
+struct pds_core_dev_identity {
+	u8     version;
+	u8     type;
+	u8     state;
+	u8     rsvd;
+	__le32 nlifs;
+	__le32 nintrs;
+	__le32 ndbpgs_per_lif;
+	__le32 intr_coal_mult;
+	__le32 intr_coal_div;
+	__le16 vif_types[PDS_DEV_TYPE_MAX];
+};
+
+#define PDS_CORE_IDENTITY_VERSION_1	1
+
+/**
+ * struct pds_core_dev_identify_cmd - Driver/device identify command
+ * @opcode:	Opcode PDS_CORE_CMD_IDENTIFY
+ * @ver:	Highest version of identify supported by driver
+ *
+ * Expects to find driver identification info (struct pds_core_drv_identity)
+ * in cmd_regs->data.  Driver should keep the devcmd interface locked
+ * while preparing the driver info.
+ */
+struct pds_core_dev_identify_cmd {
+	u8 opcode;
+	u8 ver;
+};
+
+/**
+ * struct pds_core_dev_identify_comp - Device identify command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @ver:	Version of identify returned by device
+ *
+ * Device identification info (struct pds_core_dev_identity) can be found
+ * in cmd_regs->data.  Driver should keep the devcmd interface locked
+ * while reading the results.
+ */
+struct pds_core_dev_identify_comp {
+	u8 status;
+	u8 ver;
+};
+
+/**
+ * struct pds_core_dev_reset_cmd - Device reset command
+ * @opcode:	Opcode PDS_CORE_CMD_RESET
+ *
+ * Resets and clears all LIFs, VDevs, and VIFs on the device.
+ */
+struct pds_core_dev_reset_cmd {
+	u8 opcode;
+};
+
+/**
+ * struct pds_core_dev_reset_comp - Reset command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ */
+struct pds_core_dev_reset_comp {
+	u8 status;
+};
+
+/*
+ * struct pds_core_dev_init_data - Pointers and info needed for the Core
+ * initialization PDS_CORE_CMD_INIT command.  The in and out structs are
+ * overlays on the pds_core_dev_cmd_regs.data space for passing data down
+ * to the firmware on init, and then returning initialization results.
+ */
+struct pds_core_dev_init_data_in {
+	__le64 adminq_q_base;
+	__le64 adminq_cq_base;
+	__le64 notifyq_cq_base;
+	__le32 flags;
+	__le16 intr_index;
+	u8     adminq_ring_size;
+	u8     notifyq_ring_size;
+};
+
+struct pds_core_dev_init_data_out {
+	__le32 core_hw_index;
+	__le32 adminq_hw_index;
+	__le32 notifyq_hw_index;
+	u8     adminq_hw_type;
+	u8     notifyq_hw_type;
+};
+
+/**
+ * struct pds_core_dev_init_cmd - Core device initialize
+ * @opcode:          opcode PDS_CORE_CMD_INIT
+ *
+ * Initializes the core device and sets up the AdminQ and NotifyQ.
+ * Expects to find initialization data (struct pds_core_dev_init_data_in)
+ * in cmd_regs->data.  Driver should keep the devcmd interface locked
+ * while preparing the initialization data.
+ */
+struct pds_core_dev_init_cmd {
+	u8     opcode;
+};
+
+/**
+ * struct pds_core_dev_init_comp - Core init completion
+ * @status:     Status of the command (enum pds_core_status_code)
+ *
+ * Initialization result data (struct pds_core_dev_init_data_out)
+ * is found in cmd_regs->data.
+ */
+struct pds_core_dev_init_comp {
+	u8     status;
+};
+
+/**
+ * struct pds_core_fw_download_cmd - Firmware download command
+ * @opcode:     opcode
+ * @rsvd:	Word boundary padding
+ * @addr:       DMA address of the firmware buffer
+ * @offset:     offset of the firmware buffer within the full image
+ * @length:     number of valid bytes in the firmware buffer
+ */
+struct pds_core_fw_download_cmd {
+	u8     opcode;
+	u8     rsvd[3];
+	__le32 offset;
+	__le64 addr;
+	__le32 length;
+};
+
+/**
+ * struct pds_core_fw_download_comp - Firmware download completion
+ * @status:     Status of the command (enum pds_core_status_code)
+ */
+struct pds_core_fw_download_comp {
+	u8     status;
+};
+
+/**
+ * enum pds_core_fw_control_oper - FW control operations
+ * @PDS_CORE_FW_INSTALL_ASYNC:     Install firmware asynchronously
+ * @PDS_CORE_FW_INSTALL_STATUS:    Firmware installation status
+ * @PDS_CORE_FW_ACTIVATE_ASYNC:    Activate firmware asynchronously
+ * @PDS_CORE_FW_ACTIVATE_STATUS:   Firmware activate status
+ * @PDS_CORE_FW_UPDATE_CLEANUP:    Cleanup any firmware update leftovers
+ * @PDS_CORE_FW_GET_BOOT:          Return current active firmware slot
+ * @PDS_CORE_FW_SET_BOOT:          Set active firmware slot for next boot
+ * @PDS_CORE_FW_GET_LIST:          Return list of installed firmware images
+ */
+enum pds_core_fw_control_oper {
+	PDS_CORE_FW_INSTALL_ASYNC          = 0,
+	PDS_CORE_FW_INSTALL_STATUS         = 1,
+	PDS_CORE_FW_ACTIVATE_ASYNC         = 2,
+	PDS_CORE_FW_ACTIVATE_STATUS        = 3,
+	PDS_CORE_FW_UPDATE_CLEANUP         = 4,
+	PDS_CORE_FW_GET_BOOT               = 5,
+	PDS_CORE_FW_SET_BOOT               = 6,
+	PDS_CORE_FW_GET_LIST               = 7,
+};
+
+enum pds_core_fw_slot {
+	PDS_CORE_FW_SLOT_INVALID    = 0,
+	PDS_CORE_FW_SLOT_A	    = 1,
+	PDS_CORE_FW_SLOT_B          = 2,
+	PDS_CORE_FW_SLOT_GOLD       = 3,
+};
+
+/**
+ * struct pds_core_fw_control_cmd - Firmware control command
+ * @opcode:    opcode
+ * @rsvd:      Word boundary padding
+ * @oper:      firmware control operation (enum pds_core_fw_control_oper)
+ * @slot:      slot to operate on (enum pds_core_fw_slot)
+ */
+struct pds_core_fw_control_cmd {
+	u8  opcode;
+	u8  rsvd[3];
+	u8  oper;
+	u8  slot;
+};
+
+/**
+ * struct pds_core_fw_control_comp - Firmware control completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:	Word alignment space
+ * @slot:	Slot number (enum pds_core_fw_slot)
+ * @rsvd1:	Struct padding
+ * @color:	Color bit
+ */
+struct pds_core_fw_control_comp {
+	u8     status;
+	u8     rsvd[3];
+	u8     slot;
+	u8     rsvd1[10];
+	u8     color;
+};
+
+struct pds_core_fw_name_info {
+#define PDS_CORE_FWSLOT_BUFLEN		8
+#define PDS_CORE_FWVERS_BUFLEN		32
+	char   slotname[PDS_CORE_FWSLOT_BUFLEN];
+	char   fw_version[PDS_CORE_FWVERS_BUFLEN];
+};
+
+struct pds_core_fw_list_info {
+#define PDS_CORE_FWVERS_LIST_LEN	16
+	u8 num_fw_slots;
+	struct pds_core_fw_name_info fw_names[PDS_CORE_FWVERS_LIST_LEN];
+} __packed;
+
+enum pds_core_vf_attr {
+	PDS_CORE_VF_ATTR_SPOOFCHK	= 1,
+	PDS_CORE_VF_ATTR_TRUST		= 2,
+	PDS_CORE_VF_ATTR_MAC		= 3,
+	PDS_CORE_VF_ATTR_LINKSTATE	= 4,
+	PDS_CORE_VF_ATTR_VLAN		= 5,
+	PDS_CORE_VF_ATTR_RATE		= 6,
+	PDS_CORE_VF_ATTR_STATSADDR	= 7,
+};
+
+/**
+ * enum pds_core_vf_link_status - Virtual Function link status
+ * @PDS_CORE_VF_LINK_STATUS_AUTO:   Use link state of the uplink
+ * @PDS_CORE_VF_LINK_STATUS_UP:     Link always up
+ * @PDS_CORE_VF_LINK_STATUS_DOWN:   Link always down
+ */
+enum pds_core_vf_link_status {
+	PDS_CORE_VF_LINK_STATUS_AUTO = 0,
+	PDS_CORE_VF_LINK_STATUS_UP   = 1,
+	PDS_CORE_VF_LINK_STATUS_DOWN = 2,
+};
+
+/**
+ * struct pds_core_vf_setattr_cmd - Set VF attributes on the NIC
+ * @opcode:     Opcode
+ * @attr:       Attribute type (enum pds_core_vf_attr)
+ * @vf_index:   VF index
+ * @macaddr:	mac address
+ * @vlanid:	vlan ID
+ * @maxrate:	max Tx rate in Mbps
+ * @spoofchk:	enable address spoof checking
+ * @trust:	enable VF trust
+ * @linkstate:	set link up or down
+ * @stats:	stats addr struct
+ * @stats.pa:	set DMA address for VF stats
+ * @stats.len:	length of VF stats space
+ * @pad:	force union to specific size
+ */
+struct pds_core_vf_setattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 vf_index;
+	union {
+		u8     macaddr[6];
+		__le16 vlanid;
+		__le32 maxrate;
+		u8     spoofchk;
+		u8     trust;
+		u8     linkstate;
+		struct {
+			__le64 pa;
+			__le32 len;
+		} stats;
+		u8     pad[60];
+	} __packed;
+};
+
+struct pds_core_vf_setattr_comp {
+	u8     status;
+	u8     attr;
+	__le16 vf_index;
+	__le16 comp_index;
+	u8     rsvd[9];
+	u8     color;
+};
+
+/**
+ * struct pds_core_vf_getattr_cmd - Get VF attributes from the NIC
+ * @opcode:     Opcode
+ * @attr:       Attribute type (enum pds_core_vf_attr)
+ * @vf_index:   VF index
+ */
+struct pds_core_vf_getattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 vf_index;
+};
+
+struct pds_core_vf_getattr_comp {
+	u8     status;
+	u8     attr;
+	__le16 vf_index;
+	union {
+		u8     macaddr[6];
+		__le16 vlanid;
+		__le32 maxrate;
+		u8     spoofchk;
+		u8     trust;
+		u8     linkstate;
+		__le64 stats_pa;
+		u8     pad[11];
+	} __packed;
+	u8     color;
+};
+
+enum pds_core_vf_ctrl_opcode {
+	PDS_CORE_VF_CTRL_START_ALL	= 0,
+	PDS_CORE_VF_CTRL_START		= 1,
+};
+
+/**
+ * struct pds_core_vf_ctrl_cmd - VF control command
+ * @opcode:         Opcode for the command
+ * @ctrl_opcode:    VF control operation type
+ * @vf_index:       VF Index. It is unused if op START_ALL is used.
+ */
+
+struct pds_core_vf_ctrl_cmd {
+	u8	opcode;
+	u8	ctrl_opcode;
+	__le16	vf_index;
+};
+
+/**
+ * struct pds_core_vf_ctrl_comp - VF_CTRL command completion.
+ * @status:     Status of the command (enum pds_core_status_code)
+ */
+struct pds_core_vf_ctrl_comp {
+	u8	status;
+};
+
+/*
+ * union pds_core_dev_cmd - Overlay of core device command structures
+ */
+union pds_core_dev_cmd {
+	u8     opcode;
+	u32    words[16];
+
+	struct pds_core_dev_identify_cmd identify;
+	struct pds_core_dev_init_cmd     init;
+	struct pds_core_dev_reset_cmd    reset;
+	struct pds_core_fw_download_cmd  fw_download;
+	struct pds_core_fw_control_cmd   fw_control;
+
+	struct pds_core_vf_setattr_cmd   vf_setattr;
+	struct pds_core_vf_getattr_cmd   vf_getattr;
+	struct pds_core_vf_ctrl_cmd      vf_ctrl;
+};
+
+/*
+ * union pds_core_dev_comp - Overlay of core device completion structures
+ */
+union pds_core_dev_comp {
+	u8                                status;
+	u8                                bytes[16];
+
+	struct pds_core_dev_identify_comp identify;
+	struct pds_core_dev_reset_comp    reset;
+	struct pds_core_dev_init_comp     init;
+	struct pds_core_fw_download_comp  fw_download;
+	struct pds_core_fw_control_comp   fw_control;
+
+	struct pds_core_vf_setattr_comp   vf_setattr;
+	struct pds_core_vf_getattr_comp   vf_getattr;
+	struct pds_core_vf_ctrl_comp      vf_ctrl;
+};
+
+/**
+ * struct pds_core_dev_hwstamp_regs - Hardware current timestamp registers
+ * @tick_low:        Low 32 bits of hardware timestamp
+ * @tick_high:       High 32 bits of hardware timestamp
+ */
+struct pds_core_dev_hwstamp_regs {
+	u32    tick_low;
+	u32    tick_high;
+};
+
+/**
+ * struct pds_core_dev_info_regs - Device info register format (read-only)
+ * @signature:       Signature value of 0x44455649 ('DEVI')
+ * @version:         Current version of info
+ * @asic_type:       Asic type
+ * @asic_rev:        Asic revision
+ * @fw_status:       Firmware status
+ *			bit 0   - 1 = fw running
+ *			bit 4-7 - 4 bit generation number, changes on fw restart
+ * @fw_heartbeat:    Firmware heartbeat counter
+ * @serial_num:      Serial number
+ * @fw_version:      Firmware version
+ * @oprom_regs:      oprom_regs to store oprom debug enable/disable and bmp
+ * @rsvd_pad1024:    Struct padding
+ * @hwstamp:         Hardware current timestamp registers
+ * @rsvd_pad2048:    Struct padding
+ */
+struct pds_core_dev_info_regs {
+#define PDS_CORE_DEVINFO_FWVERS_BUFLEN 32
+#define PDS_CORE_DEVINFO_SERIAL_BUFLEN 32
+	u32    signature;
+	u8     version;
+	u8     asic_type;
+	u8     asic_rev;
+#define PDS_CORE_FW_STS_F_STOPPED	0x00
+#define PDS_CORE_FW_STS_F_RUNNING	0x01
+#define PDS_CORE_FW_STS_F_GENERATION	0xF0
+	u8     fw_status;
+	__le32 fw_heartbeat;
+	char   fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN];
+	char   serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN];
+	u8     oprom_regs[32];     /* reserved */
+	u8     rsvd_pad1024[916];
+	struct pds_core_dev_hwstamp_regs hwstamp;   /* on 1k boundary */
+	u8     rsvd_pad2048[1016];
+} __packed;
+
+/**
+ * struct pds_core_dev_cmd_regs - Device command register format (read-write)
+ * @doorbell:	Device Cmd Doorbell, write-only
+ *              Write a 1 to signal device to process cmd
+ * @done:	Command completed indicator, poll for completion
+ *              bit 0 == 1 when command is complete
+ * @cmd:	Opcode-specific command bytes
+ * @comp:	Opcode-specific response bytes
+ * @rsvd:	Struct padding
+ * @data:	Opcode-specific side-data
+ */
+struct pds_core_dev_cmd_regs {
+	u32                     doorbell;
+	u32                     done;
+	union pds_core_dev_cmd  cmd;
+	union pds_core_dev_comp comp;
+	u8                      rsvd[48];
+	u32                     data[478];
+} __packed;
+
+/**
+ * struct pds_core_dev_regs - Device register format for bar 0 page 0
+ * @info:            Device info registers
+ * @devcmd:          Device command registers
+ */
+struct pds_core_dev_regs {
+	struct pds_core_dev_info_regs info;
+	struct pds_core_dev_cmd_regs  devcmd;
+} __packed;
+
+#ifndef __CHECKER__
+static_assert(sizeof(struct pds_core_drv_identity) <= 1912);
+static_assert(sizeof(struct pds_core_dev_identity) <= 1912);
+static_assert(sizeof(union pds_core_dev_cmd) == 64);
+static_assert(sizeof(union pds_core_dev_comp) == 16);
+static_assert(sizeof(struct pds_core_dev_info_regs) == 2048);
+static_assert(sizeof(struct pds_core_dev_cmd_regs) == 2048);
+static_assert(sizeof(struct pds_core_dev_regs) == 4096);
+#endif /* __CHECKER__ */
+
+#endif /* _PDS_CORE_IF_H_ */
-- 
cgit v1.2.3


From 523847df1b3718d6286dce0ed1c83742fe0ffa94 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:15 -0700
Subject: pds_core: add devcmd device interfaces

The devcmd interface is the basic connection to the device through the
PCI BAR for low level identification and command services.  This does
the early device initialization and finds the identity data, and adds
devcmd routines to be used by later driver bits.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/Makefile  |   4 +-
 drivers/net/ethernet/amd/pds_core/core.c    |  36 +++
 drivers/net/ethernet/amd/pds_core/core.h    |  50 ++++
 drivers/net/ethernet/amd/pds_core/debugfs.c |  38 +++
 drivers/net/ethernet/amd/pds_core/dev.c     | 348 ++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/main.c    |  33 ++-
 include/linux/pds/pds_common.h              |  30 +++
 include/linux/pds/pds_intr.h                | 163 +++++++++++++
 8 files changed, 699 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/pds_core/core.c
 create mode 100644 drivers/net/ethernet/amd/pds_core/dev.c
 create mode 100644 include/linux/pds/pds_intr.h

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
index de3bf1d1886c..95a6c31e92d2 100644
--- a/drivers/net/ethernet/amd/pds_core/Makefile
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -3,6 +3,8 @@
 
 obj-$(CONFIG_PDS_CORE) := pds_core.o
 
-pds_core-y := main.o
+pds_core-y := main.o \
+	      dev.o \
+	      core.o
 
 pds_core-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
new file mode 100644
index 000000000000..80d2ecb045df
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include "core.h"
+
+int pdsc_setup(struct pdsc *pdsc, bool init)
+{
+	int err = 0;
+
+	if (init)
+		err = pdsc_dev_init(pdsc);
+	else
+		err = pdsc_dev_reinit(pdsc);
+	if (err)
+		return err;
+
+	clear_bit(PDSC_S_FW_DEAD, &pdsc->state);
+	return 0;
+}
+
+void pdsc_teardown(struct pdsc *pdsc, bool removing)
+{
+	pdsc_devcmd_reset(pdsc);
+
+	if (removing) {
+		kfree(pdsc->intr_info);
+		pdsc->intr_info = NULL;
+	}
+
+	if (pdsc->kern_dbpage) {
+		iounmap(pdsc->kern_dbpage);
+		pdsc->kern_dbpage = NULL;
+	}
+
+	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
+}
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 34ef837e8cfe..fcf6c6545c49 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -9,8 +9,13 @@
 
 #include <linux/pds/pds_common.h>
 #include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_intr.h>
 
 #define PDSC_DRV_DESCRIPTION	"AMD/Pensando Core Driver"
+#define PDSC_TEARDOWN_RECOVERY	false
+#define PDSC_TEARDOWN_REMOVING	true
+#define PDSC_SETUP_RECOVERY	false
+#define PDSC_SETUP_INIT		true
 
 struct pdsc_dev_bar {
 	void __iomem *vaddr;
@@ -19,6 +24,22 @@ struct pdsc_dev_bar {
 	int res_index;
 };
 
+struct pdsc_devinfo {
+	u8 asic_type;
+	u8 asic_rev;
+	char fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN + 1];
+	char serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN + 1];
+};
+
+#define PDSC_INTR_NAME_MAX_SZ		32
+
+struct pdsc_intr_info {
+	char name[PDSC_INTR_NAME_MAX_SZ];
+	unsigned int index;
+	unsigned int vector;
+	void *data;
+};
+
 /* No state flags set means we are in a steady running state */
 enum pdsc_state_flags {
 	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
@@ -38,7 +59,19 @@ struct pdsc {
 	int uid;
 
 	unsigned long state;
+	u8 fw_status;
+	u8 fw_generation;
+	unsigned long last_fw_time;
+	u32 last_hb;
+
+	struct pdsc_devinfo dev_info;
+	struct pds_core_dev_identity dev_ident;
+	unsigned int nintrs;
+	struct pdsc_intr_info *intr_info;	/* array of nintrs elements */
 
+	unsigned int devcmd_timeout;
+	struct mutex devcmd_lock;	/* lock for dev_cmd operations */
+	struct mutex config_lock;	/* lock for configuration operations */
 	struct pds_core_dev_info_regs __iomem *info_regs;
 	struct pds_core_dev_cmd_regs __iomem *cmd_regs;
 	struct pds_core_intr __iomem *intr_ctrl;
@@ -52,5 +85,22 @@ void pdsc_debugfs_create(void);
 void pdsc_debugfs_destroy(void);
 void pdsc_debugfs_add_dev(struct pdsc *pdsc);
 void pdsc_debugfs_del_dev(struct pdsc *pdsc);
+void pdsc_debugfs_add_ident(struct pdsc *pdsc);
+void pdsc_debugfs_add_irqs(struct pdsc *pdsc);
+
+int pdsc_err_to_errno(enum pds_core_status_code code);
+bool pdsc_is_fw_running(struct pdsc *pdsc);
+bool pdsc_is_fw_good(struct pdsc *pdsc);
+int pdsc_devcmd(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		union pds_core_dev_comp *comp, int max_seconds);
+int pdsc_devcmd_locked(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		       union pds_core_dev_comp *comp, int max_seconds);
+int pdsc_devcmd_init(struct pdsc *pdsc);
+int pdsc_devcmd_reset(struct pdsc *pdsc);
+int pdsc_dev_reinit(struct pdsc *pdsc);
+int pdsc_dev_init(struct pdsc *pdsc);
+
+int pdsc_setup(struct pdsc *pdsc, bool init);
+void pdsc_teardown(struct pdsc *pdsc, bool removing);
 
 #endif /* _PDSC_H_ */
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
index b2f7cb795c20..601431b41abb 100644
--- a/drivers/net/ethernet/amd/pds_core/debugfs.c
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -29,3 +29,41 @@ void pdsc_debugfs_del_dev(struct pdsc *pdsc)
 	debugfs_remove_recursive(pdsc->dentry);
 	pdsc->dentry = NULL;
 }
+
+static int identity_show(struct seq_file *seq, void *v)
+{
+	struct pdsc *pdsc = seq->private;
+	struct pds_core_dev_identity *ident;
+	int vt;
+
+	ident = &pdsc->dev_ident;
+
+	seq_printf(seq, "fw_heartbeat:     0x%x\n",
+		   ioread32(&pdsc->info_regs->fw_heartbeat));
+
+	seq_printf(seq, "nlifs:            %d\n",
+		   le32_to_cpu(ident->nlifs));
+	seq_printf(seq, "nintrs:           %d\n",
+		   le32_to_cpu(ident->nintrs));
+	seq_printf(seq, "ndbpgs_per_lif:   %d\n",
+		   le32_to_cpu(ident->ndbpgs_per_lif));
+	seq_printf(seq, "intr_coal_mult:   %d\n",
+		   le32_to_cpu(ident->intr_coal_mult));
+	seq_printf(seq, "intr_coal_div:    %d\n",
+		   le32_to_cpu(ident->intr_coal_div));
+
+	seq_puts(seq, "vif_types:        ");
+	for (vt = 0; vt < PDS_DEV_TYPE_MAX; vt++)
+		seq_printf(seq, "%d ",
+			   le16_to_cpu(pdsc->dev_ident.vif_types[vt]));
+	seq_puts(seq, "\n");
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(identity);
+
+void pdsc_debugfs_add_ident(struct pdsc *pdsc)
+{
+	debugfs_create_file("identity", 0400, pdsc->dentry,
+			    pdsc, &identity_fops);
+}
diff --git a/drivers/net/ethernet/amd/pds_core/dev.c b/drivers/net/ethernet/amd/pds_core/dev.c
new file mode 100644
index 000000000000..f082d69c5128
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/dev.c
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/utsname.h>
+
+#include "core.h"
+
+int pdsc_err_to_errno(enum pds_core_status_code code)
+{
+	switch (code) {
+	case PDS_RC_SUCCESS:
+		return 0;
+	case PDS_RC_EVERSION:
+	case PDS_RC_EQTYPE:
+	case PDS_RC_EQID:
+	case PDS_RC_EINVAL:
+	case PDS_RC_ENOSUPP:
+		return -EINVAL;
+	case PDS_RC_EPERM:
+		return -EPERM;
+	case PDS_RC_ENOENT:
+		return -ENOENT;
+	case PDS_RC_EAGAIN:
+		return -EAGAIN;
+	case PDS_RC_ENOMEM:
+		return -ENOMEM;
+	case PDS_RC_EFAULT:
+		return -EFAULT;
+	case PDS_RC_EBUSY:
+		return -EBUSY;
+	case PDS_RC_EEXIST:
+		return -EEXIST;
+	case PDS_RC_EVFID:
+		return -ENODEV;
+	case PDS_RC_ECLIENT:
+		return -ECHILD;
+	case PDS_RC_ENOSPC:
+		return -ENOSPC;
+	case PDS_RC_ERANGE:
+		return -ERANGE;
+	case PDS_RC_BAD_ADDR:
+		return -EFAULT;
+	case PDS_RC_EOPCODE:
+	case PDS_RC_EINTR:
+	case PDS_RC_DEV_CMD:
+	case PDS_RC_ERROR:
+	case PDS_RC_ERDMA:
+	case PDS_RC_EIO:
+	default:
+		return -EIO;
+	}
+}
+
+bool pdsc_is_fw_running(struct pdsc *pdsc)
+{
+	pdsc->fw_status = ioread8(&pdsc->info_regs->fw_status);
+	pdsc->last_fw_time = jiffies;
+	pdsc->last_hb = ioread32(&pdsc->info_regs->fw_heartbeat);
+
+	/* Firmware is useful only if the running bit is set and
+	 * fw_status != 0xff (bad PCI read)
+	 */
+	return (pdsc->fw_status != 0xff) &&
+		(pdsc->fw_status & PDS_CORE_FW_STS_F_RUNNING);
+}
+
+bool pdsc_is_fw_good(struct pdsc *pdsc)
+{
+	u8 gen = pdsc->fw_status & PDS_CORE_FW_STS_F_GENERATION;
+
+	return pdsc_is_fw_running(pdsc) && gen == pdsc->fw_generation;
+}
+
+static u8 pdsc_devcmd_status(struct pdsc *pdsc)
+{
+	return ioread8(&pdsc->cmd_regs->comp.status);
+}
+
+static bool pdsc_devcmd_done(struct pdsc *pdsc)
+{
+	return ioread32(&pdsc->cmd_regs->done) & PDS_CORE_DEV_CMD_DONE;
+}
+
+static void pdsc_devcmd_dbell(struct pdsc *pdsc)
+{
+	iowrite32(0, &pdsc->cmd_regs->done);
+	iowrite32(1, &pdsc->cmd_regs->doorbell);
+}
+
+static void pdsc_devcmd_clean(struct pdsc *pdsc)
+{
+	iowrite32(0, &pdsc->cmd_regs->doorbell);
+	memset_io(&pdsc->cmd_regs->cmd, 0, sizeof(pdsc->cmd_regs->cmd));
+}
+
+static const char *pdsc_devcmd_str(int opcode)
+{
+	switch (opcode) {
+	case PDS_CORE_CMD_NOP:
+		return "PDS_CORE_CMD_NOP";
+	case PDS_CORE_CMD_IDENTIFY:
+		return "PDS_CORE_CMD_IDENTIFY";
+	case PDS_CORE_CMD_RESET:
+		return "PDS_CORE_CMD_RESET";
+	case PDS_CORE_CMD_INIT:
+		return "PDS_CORE_CMD_INIT";
+	case PDS_CORE_CMD_FW_DOWNLOAD:
+		return "PDS_CORE_CMD_FW_DOWNLOAD";
+	case PDS_CORE_CMD_FW_CONTROL:
+		return "PDS_CORE_CMD_FW_CONTROL";
+	default:
+		return "PDS_CORE_CMD_UNKNOWN";
+	}
+}
+
+static int pdsc_devcmd_wait(struct pdsc *pdsc, int max_seconds)
+{
+	struct device *dev = pdsc->dev;
+	unsigned long start_time;
+	unsigned long max_wait;
+	unsigned long duration;
+	int timeout = 0;
+	int done = 0;
+	int err = 0;
+	int status;
+	int opcode;
+
+	opcode = ioread8(&pdsc->cmd_regs->cmd.opcode);
+
+	start_time = jiffies;
+	max_wait = start_time + (max_seconds * HZ);
+
+	while (!done && !timeout) {
+		done = pdsc_devcmd_done(pdsc);
+		if (done)
+			break;
+
+		timeout = time_after(jiffies, max_wait);
+		if (timeout)
+			break;
+
+		usleep_range(100, 200);
+	}
+	duration = jiffies - start_time;
+
+	if (done && duration > HZ)
+		dev_dbg(dev, "DEVCMD %d %s after %ld secs\n",
+			opcode, pdsc_devcmd_str(opcode), duration / HZ);
+
+	if (!done || timeout) {
+		dev_err(dev, "DEVCMD %d %s timeout, done %d timeout %d max_seconds=%d\n",
+			opcode, pdsc_devcmd_str(opcode), done, timeout,
+			max_seconds);
+		err = -ETIMEDOUT;
+		pdsc_devcmd_clean(pdsc);
+	}
+
+	status = pdsc_devcmd_status(pdsc);
+	err = pdsc_err_to_errno(status);
+	if (err && err != -EAGAIN)
+		dev_err(dev, "DEVCMD %d %s failed, status=%d err %d %pe\n",
+			opcode, pdsc_devcmd_str(opcode), status, err,
+			ERR_PTR(err));
+
+	return err;
+}
+
+int pdsc_devcmd_locked(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		       union pds_core_dev_comp *comp, int max_seconds)
+{
+	int err;
+
+	memcpy_toio(&pdsc->cmd_regs->cmd, cmd, sizeof(*cmd));
+	pdsc_devcmd_dbell(pdsc);
+	err = pdsc_devcmd_wait(pdsc, max_seconds);
+	memcpy_fromio(comp, &pdsc->cmd_regs->comp, sizeof(*comp));
+
+	return err;
+}
+
+int pdsc_devcmd(struct pdsc *pdsc, union pds_core_dev_cmd *cmd,
+		union pds_core_dev_comp *comp, int max_seconds)
+{
+	int err;
+
+	mutex_lock(&pdsc->devcmd_lock);
+	err = pdsc_devcmd_locked(pdsc, cmd, comp, max_seconds);
+	mutex_unlock(&pdsc->devcmd_lock);
+
+	return err;
+}
+
+int pdsc_devcmd_init(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.opcode = PDS_CORE_CMD_INIT,
+	};
+
+	return pdsc_devcmd(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+}
+
+int pdsc_devcmd_reset(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.reset.opcode = PDS_CORE_CMD_RESET,
+	};
+
+	return pdsc_devcmd(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+}
+
+static int pdsc_devcmd_identify_locked(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.identify.opcode = PDS_CORE_CMD_IDENTIFY,
+		.identify.ver = PDS_CORE_IDENTITY_VERSION_1,
+	};
+
+	return pdsc_devcmd_locked(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+}
+
+static void pdsc_init_devinfo(struct pdsc *pdsc)
+{
+	pdsc->dev_info.asic_type = ioread8(&pdsc->info_regs->asic_type);
+	pdsc->dev_info.asic_rev = ioread8(&pdsc->info_regs->asic_rev);
+	pdsc->fw_generation = PDS_CORE_FW_STS_F_GENERATION &
+			      ioread8(&pdsc->info_regs->fw_status);
+
+	memcpy_fromio(pdsc->dev_info.fw_version,
+		      pdsc->info_regs->fw_version,
+		      PDS_CORE_DEVINFO_FWVERS_BUFLEN);
+	pdsc->dev_info.fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN] = 0;
+
+	memcpy_fromio(pdsc->dev_info.serial_num,
+		      pdsc->info_regs->serial_num,
+		      PDS_CORE_DEVINFO_SERIAL_BUFLEN);
+	pdsc->dev_info.serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN] = 0;
+
+	dev_dbg(pdsc->dev, "fw_version %s\n", pdsc->dev_info.fw_version);
+}
+
+static int pdsc_identify(struct pdsc *pdsc)
+{
+	struct pds_core_drv_identity drv = {};
+	size_t sz;
+	int err;
+
+	drv.drv_type = cpu_to_le32(PDS_DRIVER_LINUX);
+	snprintf(drv.driver_ver_str, sizeof(drv.driver_ver_str),
+		 "%s %s", PDS_CORE_DRV_NAME, utsname()->release);
+
+	/* Next let's get some info about the device
+	 * We use the devcmd_lock at this level in order to
+	 * get safe access to the cmd_regs->data before anyone
+	 * else can mess it up
+	 */
+	mutex_lock(&pdsc->devcmd_lock);
+
+	sz = min_t(size_t, sizeof(drv), sizeof(pdsc->cmd_regs->data));
+	memcpy_toio(&pdsc->cmd_regs->data, &drv, sz);
+
+	err = pdsc_devcmd_identify_locked(pdsc);
+	if (!err) {
+		sz = min_t(size_t, sizeof(pdsc->dev_ident),
+			   sizeof(pdsc->cmd_regs->data));
+		memcpy_fromio(&pdsc->dev_ident, &pdsc->cmd_regs->data, sz);
+	}
+	mutex_unlock(&pdsc->devcmd_lock);
+
+	if (err) {
+		dev_err(pdsc->dev, "Cannot identify device: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	if (isprint(pdsc->dev_info.fw_version[0]) &&
+	    isascii(pdsc->dev_info.fw_version[0]))
+		dev_info(pdsc->dev, "FW: %.*s\n",
+			 (int)(sizeof(pdsc->dev_info.fw_version) - 1),
+			 pdsc->dev_info.fw_version);
+	else
+		dev_info(pdsc->dev, "FW: (invalid string) 0x%02x 0x%02x 0x%02x 0x%02x ...\n",
+			 (u8)pdsc->dev_info.fw_version[0],
+			 (u8)pdsc->dev_info.fw_version[1],
+			 (u8)pdsc->dev_info.fw_version[2],
+			 (u8)pdsc->dev_info.fw_version[3]);
+
+	return 0;
+}
+
+int pdsc_dev_reinit(struct pdsc *pdsc)
+{
+	pdsc_init_devinfo(pdsc);
+
+	return pdsc_identify(pdsc);
+}
+
+int pdsc_dev_init(struct pdsc *pdsc)
+{
+	unsigned int nintrs;
+	int err;
+
+	/* Initial init and reset of device */
+	pdsc_init_devinfo(pdsc);
+	pdsc->devcmd_timeout = PDS_CORE_DEVCMD_TIMEOUT;
+
+	err = pdsc_devcmd_reset(pdsc);
+	if (err)
+		return err;
+
+	err = pdsc_identify(pdsc);
+	if (err)
+		return err;
+
+	pdsc_debugfs_add_ident(pdsc);
+
+	/* Now we can reserve interrupts */
+	nintrs = le32_to_cpu(pdsc->dev_ident.nintrs);
+	nintrs = min_t(unsigned int, num_online_cpus(), nintrs);
+
+	/* Get intr_info struct array for tracking */
+	pdsc->intr_info = kcalloc(nintrs, sizeof(*pdsc->intr_info), GFP_KERNEL);
+	if (!pdsc->intr_info) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	err = pci_alloc_irq_vectors(pdsc->pdev, nintrs, nintrs, PCI_IRQ_MSIX);
+	if (err != nintrs) {
+		dev_err(pdsc->dev, "Can't get %d intrs from OS: %pe\n",
+			nintrs, ERR_PTR(err));
+		err = -ENOSPC;
+		goto err_out;
+	}
+	pdsc->nintrs = nintrs;
+
+	return 0;
+
+err_out:
+	kfree(pdsc->intr_info);
+	pdsc->intr_info = NULL;
+
+	return err;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index c2b12f226959..09afb069dcb3 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -137,6 +137,18 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 	if (err)
 		goto err_out_release_regions;
 
+	mutex_init(&pdsc->devcmd_lock);
+	mutex_init(&pdsc->config_lock);
+
+	mutex_lock(&pdsc->config_lock);
+	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
+
+	err = pdsc_setup(pdsc, PDSC_SETUP_INIT);
+	if (err)
+		goto err_out_unmap_bars;
+
+	mutex_unlock(&pdsc->config_lock);
+
 	dl = priv_to_devlink(pdsc);
 	devl_lock(dl);
 	devl_register(dl);
@@ -144,6 +156,12 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 
 	return 0;
 
+err_out_unmap_bars:
+	mutex_unlock(&pdsc->config_lock);
+	mutex_destroy(&pdsc->config_lock);
+	mutex_destroy(&pdsc->devcmd_lock);
+	pci_free_irq_vectors(pdsc->pdev);
+	pdsc_unmap_bars(pdsc);
 err_out_release_regions:
 	pci_release_regions(pdsc->pdev);
 
@@ -240,8 +258,19 @@ static void pdsc_remove(struct pci_dev *pdev)
 	devl_unregister(dl);
 	devl_unlock(dl);
 
-	pdsc_unmap_bars(pdsc);
-	pci_release_regions(pdev);
+	if (!pdev->is_virtfn) {
+		mutex_lock(&pdsc->config_lock);
+		set_bit(PDSC_S_STOPPING_DRIVER, &pdsc->state);
+
+		pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING);
+		mutex_unlock(&pdsc->config_lock);
+		mutex_destroy(&pdsc->config_lock);
+		mutex_destroy(&pdsc->devcmd_lock);
+
+		pci_free_irq_vectors(pdev);
+		pdsc_unmap_bars(pdsc);
+		pci_release_regions(pdev);
+	}
 
 	pci_clear_master(pdev);
 	pci_disable_device(pdev);
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index bd041a5170a6..f0798ce01acf 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -11,4 +11,34 @@
 #define PDS_CORE_ADDR_MASK	(BIT_ULL(PDS_ADDR_LEN) - 1)
 #define PDS_PAGE_SIZE		4096
 
+enum pds_core_driver_type {
+	PDS_DRIVER_LINUX   = 1,
+	PDS_DRIVER_WIN     = 2,
+	PDS_DRIVER_DPDK    = 3,
+	PDS_DRIVER_FREEBSD = 4,
+	PDS_DRIVER_IPXE    = 5,
+	PDS_DRIVER_ESXI    = 6,
+};
+
+#define PDS_CORE_IFNAMSIZ		16
+
+/**
+ * enum pds_core_logical_qtype - Logical Queue Types
+ * @PDS_CORE_QTYPE_ADMINQ:    Administrative Queue
+ * @PDS_CORE_QTYPE_NOTIFYQ:   Notify Queue
+ * @PDS_CORE_QTYPE_RXQ:       Receive Queue
+ * @PDS_CORE_QTYPE_TXQ:       Transmit Queue
+ * @PDS_CORE_QTYPE_EQ:        Event Queue
+ * @PDS_CORE_QTYPE_MAX:       Max queue type supported
+ */
+enum pds_core_logical_qtype {
+	PDS_CORE_QTYPE_ADMINQ  = 0,
+	PDS_CORE_QTYPE_NOTIFYQ = 1,
+	PDS_CORE_QTYPE_RXQ     = 2,
+	PDS_CORE_QTYPE_TXQ     = 3,
+	PDS_CORE_QTYPE_EQ      = 4,
+
+	PDS_CORE_QTYPE_MAX     = 16   /* don't change - used in struct size */
+};
+
 #endif /* _PDS_COMMON_H_ */
diff --git a/include/linux/pds/pds_intr.h b/include/linux/pds/pds_intr.h
new file mode 100644
index 000000000000..56277c37248c
--- /dev/null
+++ b/include/linux/pds/pds_intr.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _PDS_INTR_H_
+#define _PDS_INTR_H_
+
+/*
+ * Interrupt control register
+ * @coal_init:        Coalescing timer initial value, in
+ *                    device units.  Use @identity->intr_coal_mult
+ *                    and @identity->intr_coal_div to convert from
+ *                    usecs to device units:
+ *
+ *                      coal_init = coal_usecs * coal_mult / coal_div
+ *
+ *                    When an interrupt is sent the interrupt
+ *                    coalescing timer current value
+ *                    (@coalescing_curr) is initialized with this
+ *                    value and begins counting down.  No more
+ *                    interrupts are sent until the coalescing
+ *                    timer reaches 0.  When @coal_init=0
+ *                    interrupt coalescing is effectively disabled
+ *                    and every interrupt assert results in an
+ *                    interrupt.  Reset value: 0
+ * @mask:             Interrupt mask.  When @mask=1 the interrupt
+ *                    resource will not send an interrupt.  When
+ *                    @mask=0 the interrupt resource will send an
+ *                    interrupt if an interrupt event is pending
+ *                    or on the next interrupt assertion event.
+ *                    Reset value: 1
+ * @credits:          Interrupt credits.  This register indicates
+ *                    how many interrupt events the hardware has
+ *                    sent.  When written by software this
+ *                    register atomically decrements @credits
+ *                    by the value written.  When @credits
+ *                    becomes 0 then the "pending interrupt" bit
+ *                    in the Interrupt Status register is cleared
+ *                    by the hardware and any pending but unsent
+ *                    interrupts are cleared.
+ *                    !!!IMPORTANT!!! This is a signed register.
+ * @flags:            Interrupt control flags
+ *                       @unmask -- When this bit is written with a 1
+ *                       the interrupt resource will set mask=0.
+ *                       @coal_timer_reset -- When this
+ *                       bit is written with a 1 the
+ *                       @coalescing_curr will be reloaded with
+ *                       @coal_init to reset the coalescing
+ *                       timer.
+ * @mask_on_assert:   Automatically mask on assertion.  When
+ *                    @mask_on_assert=1 the interrupt resource
+ *                    will set @mask=1 whenever an interrupt is
+ *                    sent.  When using interrupts in Legacy
+ *                    Interrupt mode the driver must select
+ *                    @mask_on_assert=0 for proper interrupt
+ *                    operation.
+ * @coalescing_curr:  Coalescing timer current value, in
+ *                    microseconds.  When this value reaches 0
+ *                    the interrupt resource is again eligible to
+ *                    send an interrupt.  If an interrupt event
+ *                    is already pending when @coalescing_curr
+ *                    reaches 0 the pending interrupt will be
+ *                    sent, otherwise an interrupt will be sent
+ *                    on the next interrupt assertion event.
+ */
+struct pds_core_intr {
+	u32 coal_init;
+	u32 mask;
+	u16 credits;
+	u16 flags;
+#define PDS_CORE_INTR_F_UNMASK		0x0001
+#define PDS_CORE_INTR_F_TIMER_RESET	0x0002
+	u32 mask_on_assert;
+	u32 coalescing_curr;
+	u32 rsvd6[3];
+};
+
+#ifndef __CHECKER__
+static_assert(sizeof(struct pds_core_intr) == 32);
+#endif /* __CHECKER__ */
+
+#define PDS_CORE_INTR_CTRL_REGS_MAX		2048
+#define PDS_CORE_INTR_CTRL_COAL_MAX		0x3F
+#define PDS_CORE_INTR_INDEX_NOT_ASSIGNED	-1
+
+struct pds_core_intr_status {
+	u32 status[2];
+};
+
+/**
+ * enum pds_core_intr_mask_vals - valid values for mask and mask_on_assert.
+ * @PDS_CORE_INTR_MASK_CLEAR:	unmask interrupt.
+ * @PDS_CORE_INTR_MASK_SET:	mask interrupt.
+ */
+enum pds_core_intr_mask_vals {
+	PDS_CORE_INTR_MASK_CLEAR	= 0,
+	PDS_CORE_INTR_MASK_SET		= 1,
+};
+
+/**
+ * enum pds_core_intr_credits_bits - Bitwise composition of credits values.
+ * @PDS_CORE_INTR_CRED_COUNT:	bit mask of credit count, no shift needed.
+ * @PDS_CORE_INTR_CRED_COUNT_SIGNED: bit mask of credit count, including sign bit.
+ * @PDS_CORE_INTR_CRED_UNMASK:	unmask the interrupt.
+ * @PDS_CORE_INTR_CRED_RESET_COALESCE: reset the coalesce timer.
+ * @PDS_CORE_INTR_CRED_REARM:	unmask the interrupt and reset the timer.
+ */
+enum pds_core_intr_credits_bits {
+	PDS_CORE_INTR_CRED_COUNT		= 0x7fffu,
+	PDS_CORE_INTR_CRED_COUNT_SIGNED		= 0xffffu,
+	PDS_CORE_INTR_CRED_UNMASK		= 0x10000u,
+	PDS_CORE_INTR_CRED_RESET_COALESCE	= 0x20000u,
+	PDS_CORE_INTR_CRED_REARM		= (PDS_CORE_INTR_CRED_UNMASK |
+					   PDS_CORE_INTR_CRED_RESET_COALESCE),
+};
+
+static inline void
+pds_core_intr_coal_init(struct pds_core_intr __iomem *intr_ctrl, u32 coal)
+{
+	iowrite32(coal, &intr_ctrl->coal_init);
+}
+
+static inline void
+pds_core_intr_mask(struct pds_core_intr __iomem *intr_ctrl, u32 mask)
+{
+	iowrite32(mask, &intr_ctrl->mask);
+}
+
+static inline void
+pds_core_intr_credits(struct pds_core_intr __iomem *intr_ctrl,
+		      u32 cred, u32 flags)
+{
+	if (WARN_ON_ONCE(cred > PDS_CORE_INTR_CRED_COUNT)) {
+		cred = ioread32(&intr_ctrl->credits);
+		cred &= PDS_CORE_INTR_CRED_COUNT_SIGNED;
+	}
+
+	iowrite32(cred | flags, &intr_ctrl->credits);
+}
+
+static inline void
+pds_core_intr_clean_flags(struct pds_core_intr __iomem *intr_ctrl, u32 flags)
+{
+	u32 cred;
+
+	cred = ioread32(&intr_ctrl->credits);
+	cred &= PDS_CORE_INTR_CRED_COUNT_SIGNED;
+	cred |= flags;
+	iowrite32(cred, &intr_ctrl->credits);
+}
+
+static inline void
+pds_core_intr_clean(struct pds_core_intr __iomem *intr_ctrl)
+{
+	pds_core_intr_clean_flags(intr_ctrl, PDS_CORE_INTR_CRED_RESET_COALESCE);
+}
+
+static inline void
+pds_core_intr_mask_assert(struct pds_core_intr __iomem *intr_ctrl, u32 mask)
+{
+	iowrite32(mask, &intr_ctrl->mask_on_assert);
+}
+
+#endif /* _PDS_INTR_H_ */
-- 
cgit v1.2.3


From 45d76f492938cdc27ddadc16e1e75103f4cfbf56 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:18 -0700
Subject: pds_core: set up device and adminq

Set up the basic adminq and notifyq queue structures.  These are
used mostly by the client drivers for feature configuration.
These are essentially the same adminq and notifyq as in the
ionic driver.

Part of this includes querying for device identity and FW
information, so we can make that available to devlink dev info.

  $ devlink dev info pci/0000:b5:00.0
  pci/0000:b5:00.0:
    driver pds_core
    serial_number FLM18420073
    versions:
        fixed:
          asic.id 0x0
          asic.rev 0x0
        running:
          fw 1.51.0-73
        stored:
          fw.goldfw 1.15.9-C-22
          fw.mainfwa 1.60.0-73
          fw.mainfwb 1.60.0-57

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../device_drivers/ethernet/amd/pds_core.rst       |  47 ++
 drivers/net/ethernet/amd/pds_core/core.c           | 431 +++++++++++++-
 drivers/net/ethernet/amd/pds_core/core.h           | 151 +++++
 drivers/net/ethernet/amd/pds_core/debugfs.c        |  77 +++
 drivers/net/ethernet/amd/pds_core/devlink.c        |  61 ++
 drivers/net/ethernet/amd/pds_core/main.c           |  17 +-
 include/linux/pds/pds_adminq.h                     | 638 +++++++++++++++++++++
 7 files changed, 1418 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/pds/pds_adminq.h

(limited to 'include')

diff --git a/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
index 5b88173a20ff..a48eafb3d0d3 100644
--- a/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
+++ b/Documentation/networking/device_drivers/ethernet/amd/pds_core.rst
@@ -26,6 +26,53 @@ messages such as these::
   pds_core 0000:b6:00.0: 252.048 Gb/s available PCIe bandwidth (16.0 GT/s PCIe x16 link)
   pds_core 0000:b6:00.0: FW: 1.60.0-73
 
+Driver and firmware version information can be gathered with devlink::
+
+  $ devlink dev info pci/0000:b5:00.0
+  pci/0000:b5:00.0:
+    driver pds_core
+    serial_number FLM18420073
+    versions:
+        fixed:
+          asic.id 0x0
+          asic.rev 0x0
+        running:
+          fw 1.51.0-73
+        stored:
+          fw.goldfw 1.15.9-C-22
+          fw.mainfwa 1.60.0-73
+          fw.mainfwb 1.60.0-57
+
+Info versions
+=============
+
+The ``pds_core`` driver reports the following versions
+
+.. list-table:: devlink info versions implemented
+   :widths: 5 5 90
+
+   * - Name
+     - Type
+     - Description
+   * - ``fw``
+     - running
+     - Version of firmware running on the device
+   * - ``fw.goldfw``
+     - stored
+     - Version of firmware stored in the goldfw slot
+   * - ``fw.mainfwa``
+     - stored
+     - Version of firmware stored in the mainfwa slot
+   * - ``fw.mainfwb``
+     - stored
+     - Version of firmware stored in the mainfwb slot
+   * - ``asic.id``
+     - fixed
+     - The ASIC type for this device
+   * - ``asic.rev``
+     - fixed
+     - The revision of the ASIC for this device
+
 Health Reporters
 ================
 
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index ab8531386226..8c0dbdb5efc5 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -1,10 +1,365 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2023 Advanced Micro Devices, Inc */
 
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
 #include "core.h"
 
+void pdsc_work_thread(struct work_struct *work)
+{
+	/* stub */
+}
+
+irqreturn_t pdsc_adminq_isr(int irq, void *data)
+{
+	/* stub */
+	return IRQ_HANDLED;
+}
+
+void pdsc_intr_free(struct pdsc *pdsc, int index)
+{
+	struct pdsc_intr_info *intr_info;
+
+	if (index >= pdsc->nintrs || index < 0) {
+		WARN(true, "bad intr index %d\n", index);
+		return;
+	}
+
+	intr_info = &pdsc->intr_info[index];
+	if (!intr_info->vector)
+		return;
+	dev_dbg(pdsc->dev, "%s: idx %d vec %d name %s\n",
+		__func__, index, intr_info->vector, intr_info->name);
+
+	pds_core_intr_mask(&pdsc->intr_ctrl[index], PDS_CORE_INTR_MASK_SET);
+	pds_core_intr_clean(&pdsc->intr_ctrl[index]);
+
+	free_irq(intr_info->vector, intr_info->data);
+
+	memset(intr_info, 0, sizeof(*intr_info));
+}
+
+int pdsc_intr_alloc(struct pdsc *pdsc, char *name,
+		    irq_handler_t handler, void *data)
+{
+	struct pdsc_intr_info *intr_info;
+	unsigned int index;
+	int err;
+
+	/* Find the first available interrupt */
+	for (index = 0; index < pdsc->nintrs; index++)
+		if (!pdsc->intr_info[index].vector)
+			break;
+	if (index >= pdsc->nintrs) {
+		dev_warn(pdsc->dev, "%s: no intr, index=%d nintrs=%d\n",
+			 __func__, index, pdsc->nintrs);
+		return -ENOSPC;
+	}
+
+	pds_core_intr_clean_flags(&pdsc->intr_ctrl[index],
+				  PDS_CORE_INTR_CRED_RESET_COALESCE);
+
+	intr_info = &pdsc->intr_info[index];
+
+	intr_info->index = index;
+	intr_info->data = data;
+	strscpy(intr_info->name, name, sizeof(intr_info->name));
+
+	/* Get the OS vector number for the interrupt */
+	err = pci_irq_vector(pdsc->pdev, index);
+	if (err < 0) {
+		dev_err(pdsc->dev, "failed to get intr vector index %d: %pe\n",
+			index, ERR_PTR(err));
+		goto err_out_free_intr;
+	}
+	intr_info->vector = err;
+
+	/* Init the device's intr mask */
+	pds_core_intr_clean(&pdsc->intr_ctrl[index]);
+	pds_core_intr_mask_assert(&pdsc->intr_ctrl[index], 1);
+	pds_core_intr_mask(&pdsc->intr_ctrl[index], PDS_CORE_INTR_MASK_SET);
+
+	/* Register the isr with a name */
+	err = request_irq(intr_info->vector, handler, 0, intr_info->name, data);
+	if (err) {
+		dev_err(pdsc->dev, "failed to get intr irq vector %d: %pe\n",
+			intr_info->vector, ERR_PTR(err));
+		goto err_out_free_intr;
+	}
+
+	return index;
+
+err_out_free_intr:
+	pdsc_intr_free(pdsc, index);
+	return err;
+}
+
+static void pdsc_qcq_intr_free(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	if (!(qcq->flags & PDS_CORE_QCQ_F_INTR) ||
+	    qcq->intx == PDS_CORE_INTR_INDEX_NOT_ASSIGNED)
+		return;
+
+	pdsc_intr_free(pdsc, qcq->intx);
+	qcq->intx = PDS_CORE_INTR_INDEX_NOT_ASSIGNED;
+}
+
+static int pdsc_qcq_intr_alloc(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	char name[PDSC_INTR_NAME_MAX_SZ];
+	int index;
+
+	if (!(qcq->flags & PDS_CORE_QCQ_F_INTR)) {
+		qcq->intx = PDS_CORE_INTR_INDEX_NOT_ASSIGNED;
+		return 0;
+	}
+
+	snprintf(name, sizeof(name), "%s-%d-%s",
+		 PDS_CORE_DRV_NAME, pdsc->pdev->bus->number, qcq->q.name);
+	index = pdsc_intr_alloc(pdsc, name, pdsc_adminq_isr, qcq);
+	if (index < 0)
+		return index;
+	qcq->intx = index;
+
+	return 0;
+}
+
+void pdsc_qcq_free(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	struct device *dev = pdsc->dev;
+
+	if (!(qcq && qcq->pdsc))
+		return;
+
+	pdsc_debugfs_del_qcq(qcq);
+
+	pdsc_qcq_intr_free(pdsc, qcq);
+
+	if (qcq->q_base)
+		dma_free_coherent(dev, qcq->q_size,
+				  qcq->q_base, qcq->q_base_pa);
+
+	if (qcq->cq_base)
+		dma_free_coherent(dev, qcq->cq_size,
+				  qcq->cq_base, qcq->cq_base_pa);
+
+	if (qcq->cq.info)
+		vfree(qcq->cq.info);
+
+	if (qcq->q.info)
+		vfree(qcq->q.info);
+
+	memset(qcq, 0, sizeof(*qcq));
+}
+
+static void pdsc_q_map(struct pdsc_queue *q, void *base, dma_addr_t base_pa)
+{
+	struct pdsc_q_info *cur;
+	unsigned int i;
+
+	q->base = base;
+	q->base_pa = base_pa;
+
+	for (i = 0, cur = q->info; i < q->num_descs; i++, cur++)
+		cur->desc = base + (i * q->desc_size);
+}
+
+static void pdsc_cq_map(struct pdsc_cq *cq, void *base, dma_addr_t base_pa)
+{
+	struct pdsc_cq_info *cur;
+	unsigned int i;
+
+	cq->base = base;
+	cq->base_pa = base_pa;
+
+	for (i = 0, cur = cq->info; i < cq->num_descs; i++, cur++)
+		cur->comp = base + (i * cq->desc_size);
+}
+
+int pdsc_qcq_alloc(struct pdsc *pdsc, unsigned int type, unsigned int index,
+		   const char *name, unsigned int flags, unsigned int num_descs,
+		   unsigned int desc_size, unsigned int cq_desc_size,
+		   unsigned int pid, struct pdsc_qcq *qcq)
+{
+	struct device *dev = pdsc->dev;
+	void *q_base, *cq_base;
+	dma_addr_t cq_base_pa;
+	dma_addr_t q_base_pa;
+	int err;
+
+	qcq->q.info = vzalloc(num_descs * sizeof(*qcq->q.info));
+	if (!qcq->q.info) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	qcq->pdsc = pdsc;
+	qcq->flags = flags;
+	INIT_WORK(&qcq->work, pdsc_work_thread);
+
+	qcq->q.type = type;
+	qcq->q.index = index;
+	qcq->q.num_descs = num_descs;
+	qcq->q.desc_size = desc_size;
+	qcq->q.tail_idx = 0;
+	qcq->q.head_idx = 0;
+	qcq->q.pid = pid;
+	snprintf(qcq->q.name, sizeof(qcq->q.name), "%s%u", name, index);
+
+	err = pdsc_qcq_intr_alloc(pdsc, qcq);
+	if (err)
+		goto err_out_free_q_info;
+
+	qcq->cq.info = vzalloc(num_descs * sizeof(*qcq->cq.info));
+	if (!qcq->cq.info) {
+		err = -ENOMEM;
+		goto err_out_free_irq;
+	}
+
+	qcq->cq.bound_intr = &pdsc->intr_info[qcq->intx];
+	qcq->cq.num_descs = num_descs;
+	qcq->cq.desc_size = cq_desc_size;
+	qcq->cq.tail_idx = 0;
+	qcq->cq.done_color = 1;
+
+	if (flags & PDS_CORE_QCQ_F_NOTIFYQ) {
+		/* q & cq need to be contiguous in case of notifyq */
+		qcq->q_size = PDS_PAGE_SIZE +
+			      ALIGN(num_descs * desc_size, PDS_PAGE_SIZE) +
+			      ALIGN(num_descs * cq_desc_size, PDS_PAGE_SIZE);
+		qcq->q_base = dma_alloc_coherent(dev,
+						 qcq->q_size + qcq->cq_size,
+						 &qcq->q_base_pa,
+						 GFP_KERNEL);
+		if (!qcq->q_base) {
+			err = -ENOMEM;
+			goto err_out_free_cq_info;
+		}
+		q_base = PTR_ALIGN(qcq->q_base, PDS_PAGE_SIZE);
+		q_base_pa = ALIGN(qcq->q_base_pa, PDS_PAGE_SIZE);
+		pdsc_q_map(&qcq->q, q_base, q_base_pa);
+
+		cq_base = PTR_ALIGN(q_base +
+				    ALIGN(num_descs * desc_size, PDS_PAGE_SIZE),
+				    PDS_PAGE_SIZE);
+		cq_base_pa = ALIGN(qcq->q_base_pa +
+				   ALIGN(num_descs * desc_size, PDS_PAGE_SIZE),
+				   PDS_PAGE_SIZE);
+
+	} else {
+		/* q DMA descriptors */
+		qcq->q_size = PDS_PAGE_SIZE + (num_descs * desc_size);
+		qcq->q_base = dma_alloc_coherent(dev, qcq->q_size,
+						 &qcq->q_base_pa,
+						 GFP_KERNEL);
+		if (!qcq->q_base) {
+			err = -ENOMEM;
+			goto err_out_free_cq_info;
+		}
+		q_base = PTR_ALIGN(qcq->q_base, PDS_PAGE_SIZE);
+		q_base_pa = ALIGN(qcq->q_base_pa, PDS_PAGE_SIZE);
+		pdsc_q_map(&qcq->q, q_base, q_base_pa);
+
+		/* cq DMA descriptors */
+		qcq->cq_size = PDS_PAGE_SIZE + (num_descs * cq_desc_size);
+		qcq->cq_base = dma_alloc_coherent(dev, qcq->cq_size,
+						  &qcq->cq_base_pa,
+						  GFP_KERNEL);
+		if (!qcq->cq_base) {
+			err = -ENOMEM;
+			goto err_out_free_q;
+		}
+		cq_base = PTR_ALIGN(qcq->cq_base, PDS_PAGE_SIZE);
+		cq_base_pa = ALIGN(qcq->cq_base_pa, PDS_PAGE_SIZE);
+	}
+
+	pdsc_cq_map(&qcq->cq, cq_base, cq_base_pa);
+	qcq->cq.bound_q = &qcq->q;
+
+	pdsc_debugfs_add_qcq(pdsc, qcq);
+
+	return 0;
+
+err_out_free_q:
+	dma_free_coherent(dev, qcq->q_size, qcq->q_base, qcq->q_base_pa);
+err_out_free_cq_info:
+	vfree(qcq->cq.info);
+err_out_free_irq:
+	pdsc_qcq_intr_free(pdsc, qcq);
+err_out_free_q_info:
+	vfree(qcq->q.info);
+	memset(qcq, 0, sizeof(*qcq));
+err_out:
+	dev_err(dev, "qcq alloc of %s%d failed %d\n", name, index, err);
+	return err;
+}
+
+static int pdsc_core_init(struct pdsc *pdsc)
+{
+	union pds_core_dev_comp comp = {};
+	union pds_core_dev_cmd cmd = {
+		.init.opcode = PDS_CORE_CMD_INIT,
+	};
+	struct pds_core_dev_init_data_out cido;
+	struct pds_core_dev_init_data_in cidi;
+	u32 dbid_count;
+	u32 dbpage_num;
+	size_t sz;
+	int err;
+
+	cidi.adminq_q_base = cpu_to_le64(pdsc->adminqcq.q_base_pa);
+	cidi.adminq_cq_base = cpu_to_le64(pdsc->adminqcq.cq_base_pa);
+	cidi.notifyq_cq_base = cpu_to_le64(pdsc->notifyqcq.cq.base_pa);
+	cidi.flags = cpu_to_le32(PDS_CORE_QINIT_F_IRQ | PDS_CORE_QINIT_F_ENA);
+	cidi.intr_index = cpu_to_le16(pdsc->adminqcq.intx);
+	cidi.adminq_ring_size = ilog2(pdsc->adminqcq.q.num_descs);
+	cidi.notifyq_ring_size = ilog2(pdsc->notifyqcq.q.num_descs);
+
+	mutex_lock(&pdsc->devcmd_lock);
+
+	sz = min_t(size_t, sizeof(cidi), sizeof(pdsc->cmd_regs->data));
+	memcpy_toio(&pdsc->cmd_regs->data, &cidi, sz);
+
+	err = pdsc_devcmd_locked(pdsc, &cmd, &comp, pdsc->devcmd_timeout);
+	if (!err) {
+		sz = min_t(size_t, sizeof(cido), sizeof(pdsc->cmd_regs->data));
+		memcpy_fromio(&cido, &pdsc->cmd_regs->data, sz);
+	}
+
+	mutex_unlock(&pdsc->devcmd_lock);
+	if (err) {
+		dev_err(pdsc->dev, "Device init command failed: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	pdsc->hw_index = le32_to_cpu(cido.core_hw_index);
+
+	dbid_count = le32_to_cpu(pdsc->dev_ident.ndbpgs_per_lif);
+	dbpage_num = pdsc->hw_index * dbid_count;
+	pdsc->kern_dbpage = pdsc_map_dbpage(pdsc, dbpage_num);
+	if (!pdsc->kern_dbpage) {
+		dev_err(pdsc->dev, "Cannot map dbpage, aborting\n");
+		return -ENOMEM;
+	}
+
+	pdsc->adminqcq.q.hw_type = cido.adminq_hw_type;
+	pdsc->adminqcq.q.hw_index = le32_to_cpu(cido.adminq_hw_index);
+	pdsc->adminqcq.q.dbval = PDS_CORE_DBELL_QID(pdsc->adminqcq.q.hw_index);
+
+	pdsc->notifyqcq.q.hw_type = cido.notifyq_hw_type;
+	pdsc->notifyqcq.q.hw_index = le32_to_cpu(cido.notifyq_hw_index);
+	pdsc->notifyqcq.q.dbval = PDS_CORE_DBELL_QID(pdsc->notifyqcq.q.hw_index);
+
+	pdsc->last_eid = 0;
+
+	return err;
+}
+
 int pdsc_setup(struct pdsc *pdsc, bool init)
 {
+	int numdescs;
 	int err;
 
 	if (init)
@@ -14,17 +369,60 @@ int pdsc_setup(struct pdsc *pdsc, bool init)
 	if (err)
 		return err;
 
+	/* Scale the descriptor ring length based on number of CPUs and VFs */
+	numdescs = max_t(int, PDSC_ADMINQ_MIN_LENGTH, num_online_cpus());
+	numdescs += 2 * pci_sriov_get_totalvfs(pdsc->pdev);
+	numdescs = roundup_pow_of_two(numdescs);
+	err = pdsc_qcq_alloc(pdsc, PDS_CORE_QTYPE_ADMINQ, 0, "adminq",
+			     PDS_CORE_QCQ_F_CORE | PDS_CORE_QCQ_F_INTR,
+			     numdescs,
+			     sizeof(union pds_core_adminq_cmd),
+			     sizeof(union pds_core_adminq_comp),
+			     0, &pdsc->adminqcq);
+	if (err)
+		goto err_out_teardown;
+
+	err = pdsc_qcq_alloc(pdsc, PDS_CORE_QTYPE_NOTIFYQ, 0, "notifyq",
+			     PDS_CORE_QCQ_F_NOTIFYQ,
+			     PDSC_NOTIFYQ_LENGTH,
+			     sizeof(struct pds_core_notifyq_cmd),
+			     sizeof(union pds_core_notifyq_comp),
+			     0, &pdsc->notifyqcq);
+	if (err)
+		goto err_out_teardown;
+
+	/* NotifyQ rides on the AdminQ interrupt */
+	pdsc->notifyqcq.intx = pdsc->adminqcq.intx;
+
+	/* Set up the Core with the AdminQ and NotifyQ info */
+	err = pdsc_core_init(pdsc);
+	if (err)
+		goto err_out_teardown;
+
 	clear_bit(PDSC_S_FW_DEAD, &pdsc->state);
 	return 0;
+
+err_out_teardown:
+	pdsc_teardown(pdsc, init);
+	return err;
 }
 
 void pdsc_teardown(struct pdsc *pdsc, bool removing)
 {
+	int i;
+
 	pdsc_devcmd_reset(pdsc);
+	pdsc_qcq_free(pdsc, &pdsc->notifyqcq);
+	pdsc_qcq_free(pdsc, &pdsc->adminqcq);
+
+	if (pdsc->intr_info) {
+		for (i = 0; i < pdsc->nintrs; i++)
+			pdsc_intr_free(pdsc, i);
 
-	if (removing) {
-		kfree(pdsc->intr_info);
-		pdsc->intr_info = NULL;
+		if (removing) {
+			kfree(pdsc->intr_info);
+			pdsc->intr_info = NULL;
+		}
 	}
 
 	if (pdsc->kern_dbpage) {
@@ -35,6 +433,28 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing)
 	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
 }
 
+int pdsc_start(struct pdsc *pdsc)
+{
+	pds_core_intr_mask(&pdsc->intr_ctrl[pdsc->adminqcq.intx],
+			   PDS_CORE_INTR_MASK_CLEAR);
+
+	return 0;
+}
+
+void pdsc_stop(struct pdsc *pdsc)
+{
+	int i;
+
+	if (!pdsc->intr_info)
+		return;
+
+	/* Mask interrupts that are in use */
+	for (i = 0; i < pdsc->nintrs; i++)
+		if (pdsc->intr_info[i].vector)
+			pds_core_intr_mask(&pdsc->intr_ctrl[i],
+					   PDS_CORE_INTR_MASK_SET);
+}
+
 static void pdsc_fw_down(struct pdsc *pdsc)
 {
 	if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) {
@@ -44,6 +464,7 @@ static void pdsc_fw_down(struct pdsc *pdsc)
 
 	devlink_health_report(pdsc->fw_reporter, "FW down reported", pdsc);
 
+	pdsc_stop(pdsc);
 	pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY);
 }
 
@@ -60,6 +481,10 @@ static void pdsc_fw_up(struct pdsc *pdsc)
 	if (err)
 		goto err_out;
 
+	err = pdsc_start(pdsc);
+	if (err)
+		goto err_out;
+
 	pdsc->fw_recoveries++;
 	devlink_health_reporter_state_update(pdsc->fw_reporter,
 					     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 32aa38c40024..7adbd10cb6d9 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -9,11 +9,15 @@
 
 #include <linux/pds/pds_common.h>
 #include <linux/pds/pds_core_if.h>
+#include <linux/pds/pds_adminq.h>
 #include <linux/pds/pds_intr.h>
 
 #define PDSC_DRV_DESCRIPTION	"AMD/Pensando Core Driver"
 
 #define PDSC_WATCHDOG_SECS	5
+#define PDSC_QUEUE_NAME_MAX_SZ  32
+#define PDSC_ADMINQ_MIN_LENGTH	16	/* must be a power of two */
+#define PDSC_NOTIFYQ_LENGTH	64	/* must be a power of two */
 #define PDSC_TEARDOWN_RECOVERY	false
 #define PDSC_TEARDOWN_REMOVING	true
 #define PDSC_SETUP_RECOVERY	false
@@ -33,6 +37,28 @@ struct pdsc_devinfo {
 	char serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN + 1];
 };
 
+struct pdsc_queue {
+	struct pdsc_q_info *info;
+	u64 dbval;
+	u16 head_idx;
+	u16 tail_idx;
+	u8 hw_type;
+	unsigned int index;
+	unsigned int num_descs;
+	u64 dbell_count;
+	u64 features;
+	unsigned int type;
+	unsigned int hw_index;
+	union {
+		void *base;
+		struct pds_core_admin_cmd *adminq;
+	};
+	dma_addr_t base_pa;	/* must be page aligned */
+	unsigned int desc_size;
+	unsigned int pid;
+	char name[PDSC_QUEUE_NAME_MAX_SZ];
+};
+
 #define PDSC_INTR_NAME_MAX_SZ		32
 
 struct pdsc_intr_info {
@@ -42,6 +68,61 @@ struct pdsc_intr_info {
 	void *data;
 };
 
+struct pdsc_cq_info {
+	void *comp;
+};
+
+struct pdsc_buf_info {
+	struct page *page;
+	dma_addr_t dma_addr;
+	u32 page_offset;
+	u32 len;
+};
+
+struct pdsc_q_info {
+	union {
+		void *desc;
+		struct pdsc_admin_cmd *adminq_desc;
+	};
+	unsigned int bytes;
+	unsigned int nbufs;
+	struct pdsc_buf_info bufs[PDS_CORE_MAX_FRAGS];
+	struct pdsc_wait_context *wc;
+	void *dest;
+};
+
+struct pdsc_cq {
+	struct pdsc_cq_info *info;
+	struct pdsc_queue *bound_q;
+	struct pdsc_intr_info *bound_intr;
+	u16 tail_idx;
+	bool done_color;
+	unsigned int num_descs;
+	unsigned int desc_size;
+	void *base;
+	dma_addr_t base_pa;	/* must be page aligned */
+} ____cacheline_aligned_in_smp;
+
+struct pdsc_qcq {
+	struct pdsc *pdsc;
+	void *q_base;
+	dma_addr_t q_base_pa;	/* might not be page aligned */
+	void *cq_base;
+	dma_addr_t cq_base_pa;	/* might not be page aligned */
+	u32 q_size;
+	u32 cq_size;
+	bool armed;
+	unsigned int flags;
+
+	struct work_struct work;
+	struct pdsc_queue q;
+	struct pdsc_cq cq;
+	int intx;
+
+	u32 accum_work;
+	struct dentry *dentry;
+};
+
 /* No state flags set means we are in a steady running state */
 enum pdsc_state_flags {
 	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
@@ -81,6 +162,7 @@ struct pdsc {
 	unsigned int devcmd_timeout;
 	struct mutex devcmd_lock;	/* lock for dev_cmd operations */
 	struct mutex config_lock;	/* lock for configuration operations */
+	spinlock_t adminq_lock;		/* lock for adminq operations */
 	struct pds_core_dev_info_regs __iomem *info_regs;
 	struct pds_core_dev_cmd_regs __iomem *cmd_regs;
 	struct pds_core_intr __iomem *intr_ctrl;
@@ -88,11 +170,64 @@ struct pdsc {
 	u64 __iomem *db_pages;
 	dma_addr_t phy_db_pages;
 	u64 __iomem *kern_dbpage;
+
+	struct pdsc_qcq adminqcq;
+	struct pdsc_qcq notifyqcq;
+	u64 last_eid;
 };
 
+/** enum pds_core_dbell_bits - bitwise composition of dbell values.
+ *
+ * @PDS_CORE_DBELL_QID_MASK:	unshifted mask of valid queue id bits.
+ * @PDS_CORE_DBELL_QID_SHIFT:	queue id shift amount in dbell value.
+ * @PDS_CORE_DBELL_QID:		macro to build QID component of dbell value.
+ *
+ * @PDS_CORE_DBELL_RING_MASK:	unshifted mask of valid ring bits.
+ * @PDS_CORE_DBELL_RING_SHIFT:	ring shift amount in dbell value.
+ * @PDS_CORE_DBELL_RING:	macro to build ring component of dbell value.
+ *
+ * @PDS_CORE_DBELL_RING_0:	ring zero dbell component value.
+ * @PDS_CORE_DBELL_RING_1:	ring one dbell component value.
+ * @PDS_CORE_DBELL_RING_2:	ring two dbell component value.
+ * @PDS_CORE_DBELL_RING_3:	ring three dbell component value.
+ *
+ * @PDS_CORE_DBELL_INDEX_MASK:	bit mask of valid index bits, no shift needed.
+ */
+enum pds_core_dbell_bits {
+	PDS_CORE_DBELL_QID_MASK		= 0xffffff,
+	PDS_CORE_DBELL_QID_SHIFT		= 24,
+
+#define PDS_CORE_DBELL_QID(n) \
+	(((u64)(n) & PDS_CORE_DBELL_QID_MASK) << PDS_CORE_DBELL_QID_SHIFT)
+
+	PDS_CORE_DBELL_RING_MASK		= 0x7,
+	PDS_CORE_DBELL_RING_SHIFT		= 16,
+
+#define PDS_CORE_DBELL_RING(n) \
+	(((u64)(n) & PDS_CORE_DBELL_RING_MASK) << PDS_CORE_DBELL_RING_SHIFT)
+
+	PDS_CORE_DBELL_RING_0		= 0,
+	PDS_CORE_DBELL_RING_1		= PDS_CORE_DBELL_RING(1),
+	PDS_CORE_DBELL_RING_2		= PDS_CORE_DBELL_RING(2),
+	PDS_CORE_DBELL_RING_3		= PDS_CORE_DBELL_RING(3),
+
+	PDS_CORE_DBELL_INDEX_MASK		= 0xffff,
+};
+
+static inline void pds_core_dbell_ring(u64 __iomem *db_page,
+				       enum pds_core_logical_qtype qtype,
+				       u64 val)
+{
+	writeq(val, &db_page[qtype]);
+}
+
 int pdsc_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
 			      struct devlink_fmsg *fmsg,
 			      struct netlink_ext_ack *extack);
+int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
+		     struct netlink_ext_ack *extack);
+
+void __iomem *pdsc_map_dbpage(struct pdsc *pdsc, int page_num);
 
 void pdsc_debugfs_create(void);
 void pdsc_debugfs_destroy(void);
@@ -100,6 +235,8 @@ void pdsc_debugfs_add_dev(struct pdsc *pdsc);
 void pdsc_debugfs_del_dev(struct pdsc *pdsc);
 void pdsc_debugfs_add_ident(struct pdsc *pdsc);
 void pdsc_debugfs_add_irqs(struct pdsc *pdsc);
+void pdsc_debugfs_add_qcq(struct pdsc *pdsc, struct pdsc_qcq *qcq);
+void pdsc_debugfs_del_qcq(struct pdsc_qcq *qcq);
 
 int pdsc_err_to_errno(enum pds_core_status_code code);
 bool pdsc_is_fw_running(struct pdsc *pdsc);
@@ -113,8 +250,22 @@ int pdsc_devcmd_reset(struct pdsc *pdsc);
 int pdsc_dev_reinit(struct pdsc *pdsc);
 int pdsc_dev_init(struct pdsc *pdsc);
 
+int pdsc_intr_alloc(struct pdsc *pdsc, char *name,
+		    irq_handler_t handler, void *data);
+void pdsc_intr_free(struct pdsc *pdsc, int index);
+void pdsc_qcq_free(struct pdsc *pdsc, struct pdsc_qcq *qcq);
+int pdsc_qcq_alloc(struct pdsc *pdsc, unsigned int type, unsigned int index,
+		   const char *name, unsigned int flags, unsigned int num_descs,
+		   unsigned int desc_size, unsigned int cq_desc_size,
+		   unsigned int pid, struct pdsc_qcq *qcq);
 int pdsc_setup(struct pdsc *pdsc, bool init);
 void pdsc_teardown(struct pdsc *pdsc, bool removing);
+int pdsc_start(struct pdsc *pdsc);
+void pdsc_stop(struct pdsc *pdsc);
 void pdsc_health_thread(struct work_struct *work);
 
+void pdsc_process_adminq(struct pdsc_qcq *qcq);
+void pdsc_work_thread(struct work_struct *work);
+irqreturn_t pdsc_adminq_isr(int irq, void *data);
+
 #endif /* _PDSC_H_ */
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
index 601431b41abb..b83e5016644b 100644
--- a/drivers/net/ethernet/amd/pds_core/debugfs.c
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -67,3 +67,80 @@ void pdsc_debugfs_add_ident(struct pdsc *pdsc)
 	debugfs_create_file("identity", 0400, pdsc->dentry,
 			    pdsc, &identity_fops);
 }
+
+static const struct debugfs_reg32 intr_ctrl_regs[] = {
+	{ .name = "coal_init", .offset = 0, },
+	{ .name = "mask", .offset = 4, },
+	{ .name = "credits", .offset = 8, },
+	{ .name = "mask_on_assert", .offset = 12, },
+	{ .name = "coal_timer", .offset = 16, },
+};
+
+/* Expose a qcq's queue, completion queue and interrupt state in debugfs */
+void pdsc_debugfs_add_qcq(struct pdsc *pdsc, struct pdsc_qcq *qcq)
+{
+	struct dentry *qcq_dentry, *q_dentry, *cq_dentry, *intr_dentry;
+	struct debugfs_regset32 *intr_ctrl_regset;
+	struct pdsc_intr_info *intr = &pdsc->intr_info[qcq->intx];
+	struct pdsc_queue *q = &qcq->q;
+	struct pdsc_cq *cq = &qcq->cq;
+
+	qcq_dentry = debugfs_create_dir(q->name, pdsc->dentry);
+	if (IS_ERR_OR_NULL(qcq_dentry))
+		return;
+	qcq->dentry = qcq_dentry;
+
+	debugfs_create_x64("q_base_pa", 0400, qcq_dentry, &qcq->q_base_pa);
+	debugfs_create_x32("q_size", 0400, qcq_dentry, &qcq->q_size);
+	debugfs_create_x64("cq_base_pa", 0400, qcq_dentry, &qcq->cq_base_pa);
+	debugfs_create_x32("cq_size", 0400, qcq_dentry, &qcq->cq_size);
+	debugfs_create_x32("accum_work", 0400, qcq_dentry, &qcq->accum_work);
+
+	q_dentry = debugfs_create_dir("q", qcq->dentry);
+	if (IS_ERR_OR_NULL(q_dentry))
+		return;
+
+	debugfs_create_u32("index", 0400, q_dentry, &q->index);
+	debugfs_create_u32("num_descs", 0400, q_dentry, &q->num_descs);
+	debugfs_create_u32("desc_size", 0400, q_dentry, &q->desc_size);
+	debugfs_create_u32("pid", 0400, q_dentry, &q->pid);
+
+	debugfs_create_u16("tail", 0400, q_dentry, &q->tail_idx);
+	debugfs_create_u16("head", 0400, q_dentry, &q->head_idx);
+
+	cq_dentry = debugfs_create_dir("cq", qcq->dentry);
+	if (IS_ERR_OR_NULL(cq_dentry))
+		return;
+
+	debugfs_create_x64("base_pa", 0400, cq_dentry, &cq->base_pa);
+	debugfs_create_u32("num_descs", 0400, cq_dentry, &cq->num_descs);
+	debugfs_create_u32("desc_size", 0400, cq_dentry, &cq->desc_size);
+	debugfs_create_bool("done_color", 0400, cq_dentry, &cq->done_color);
+	debugfs_create_u16("tail", 0400, cq_dentry, &cq->tail_idx);
+
+	if (qcq->flags & PDS_CORE_QCQ_F_INTR) {
+		intr_dentry = debugfs_create_dir("intr", qcq->dentry);
+		if (IS_ERR_OR_NULL(intr_dentry))
+			return;
+
+		debugfs_create_u32("index", 0400, intr_dentry, &intr->index);
+		debugfs_create_u32("vector", 0400, intr_dentry, &intr->vector);
+		/* devm-managed: pdsc_debugfs_del_qcq() keeps no pointer to free */
+		intr_ctrl_regset = devm_kzalloc(pdsc->dev,
+						sizeof(*intr_ctrl_regset), GFP_KERNEL);
+		if (!intr_ctrl_regset)
+			return;
+		intr_ctrl_regset->regs = intr_ctrl_regs;
+		intr_ctrl_regset->nregs = ARRAY_SIZE(intr_ctrl_regs);
+		intr_ctrl_regset->base = &pdsc->intr_ctrl[intr->index];
+
+		debugfs_create_regset32("intr_ctrl", 0400, intr_dentry,
+					intr_ctrl_regset);
+	}
+}
+
+void pdsc_debugfs_del_qcq(struct pdsc_qcq *qcq)
+{
+	debugfs_remove_recursive(qcq->dentry);
+	qcq->dentry = NULL;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c
index 3b05b1af65d1..89f23d620d1d 100644
--- a/drivers/net/ethernet/amd/pds_core/devlink.c
+++ b/drivers/net/ethernet/amd/pds_core/devlink.c
@@ -3,6 +3,67 @@
 
 #include "core.h"
 
+static char *fw_slotnames[] = {
+	"fw.goldfw",
+	"fw.mainfwa",
+	"fw.mainfwb",
+};
+
+int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
+		     struct netlink_ext_ack *extack)
+{
+	union pds_core_dev_cmd cmd = {
+		.fw_control.opcode = PDS_CORE_CMD_FW_CONTROL,
+		.fw_control.oper = PDS_CORE_FW_GET_LIST,
+	};
+	struct pds_core_fw_list_info fw_list;
+	struct pdsc *pdsc = devlink_priv(dl);
+	union pds_core_dev_comp comp;
+	char buf[16];
+	int listlen, err, i;
+
+	mutex_lock(&pdsc->devcmd_lock);
+	err = pdsc_devcmd_locked(pdsc, &cmd, &comp, pdsc->devcmd_timeout * 2);
+	memcpy_fromio(&fw_list, pdsc->cmd_regs->data, sizeof(fw_list));
+	mutex_unlock(&pdsc->devcmd_lock);
+	if (err && err != -EIO)
+		return err;
+	/* clamp the device-reported count so fw_names[] is never overrun */
+	listlen = min_t(int, fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names));
+	for (i = 0; i < listlen; i++) {
+		if (i < ARRAY_SIZE(fw_slotnames))
+			strscpy(buf, fw_slotnames[i], sizeof(buf));
+		else
+			snprintf(buf, sizeof(buf), "fw.slot_%d", i);
+		err = devlink_info_version_stored_put(req, buf,
+						      fw_list.fw_names[i].fw_version);
+		if (err)
+			return err;
+	}
+
+	err = devlink_info_version_running_put(req,
+					       DEVLINK_INFO_VERSION_GENERIC_FW,
+					       pdsc->dev_info.fw_version);
+	if (err)
+		return err;
+
+	snprintf(buf, sizeof(buf), "0x%x", pdsc->dev_info.asic_type);
+	err = devlink_info_version_fixed_put(req,
+					     DEVLINK_INFO_VERSION_GENERIC_ASIC_ID,
+					     buf);
+	if (err)
+		return err;
+
+	snprintf(buf, sizeof(buf), "0x%x", pdsc->dev_info.asic_rev);
+	err = devlink_info_version_fixed_put(req,
+					     DEVLINK_INFO_VERSION_GENERIC_ASIC_REV,
+					     buf);
+	if (err)
+		return err;
+
+	return devlink_info_serial_number_put(req, pdsc->dev_info.serial_num);
+}
+
 int pdsc_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
 			      struct devlink_fmsg *fmsg,
 			      struct netlink_ext_ack *extack)
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index 54f3aed7adb1..eaff311d4a10 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -125,6 +125,13 @@ err_out:
 	return err;
 }
 
+void __iomem *pdsc_map_dbpage(struct pdsc *pdsc, int page_num)
+{
+	return pci_iomap_range(pdsc->pdev,
+			       pdsc->bars[PDS_CORE_PCI_BAR_DBELL].res_index,
+			       (u64)page_num << PAGE_SHIFT, PAGE_SIZE);
+}
+
 static int pdsc_init_vf(struct pdsc *vf)
 {
 	return -1;
@@ -166,6 +173,7 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 
 	mutex_init(&pdsc->devcmd_lock);
 	mutex_init(&pdsc->config_lock);
+	spin_lock_init(&pdsc->adminq_lock);
 
 	mutex_lock(&pdsc->config_lock);
 	set_bit(PDSC_S_FW_DEAD, &pdsc->state);
@@ -173,6 +181,9 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 	err = pdsc_setup(pdsc, PDSC_SETUP_INIT);
 	if (err)
 		goto err_out_unmap_bars;
+	err = pdsc_start(pdsc);
+	if (err)
+		goto err_out_teardown;
 
 	mutex_unlock(&pdsc->config_lock);
 
@@ -184,7 +195,7 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 		dev_warn(pdsc->dev, "Failed to create fw reporter: %pe\n", hr);
 		err = PTR_ERR(hr);
 		devl_unlock(dl);
-		goto err_out_teardown;
+		goto err_out_stop;
 	}
 	pdsc->fw_reporter = hr;
 
@@ -196,6 +207,8 @@ static int pdsc_init_pf(struct pdsc *pdsc)
 
 	return 0;
 
+err_out_stop:
+	pdsc_stop(pdsc);
 err_out_teardown:
 	pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING);
 err_out_unmap_bars:
@@ -214,6 +227,7 @@ err_out_release_regions:
 }
 
 static const struct devlink_ops pdsc_dl_ops = {
+	.info_get	= pdsc_dl_info_get,
 };
 
 static const struct devlink_ops pdsc_dl_vf_ops = {
@@ -315,6 +329,7 @@ static void pdsc_remove(struct pci_dev *pdev)
 		mutex_lock(&pdsc->config_lock);
 		set_bit(PDSC_S_STOPPING_DRIVER, &pdsc->state);
 
+		pdsc_stop(pdsc);
 		pdsc_teardown(pdsc, PDSC_TEARDOWN_REMOVING);
 		mutex_unlock(&pdsc->config_lock);
 		mutex_destroy(&pdsc->config_lock);
diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
new file mode 100644
index 000000000000..dd5fbe3ee141
--- /dev/null
+++ b/include/linux/pds/pds_adminq.h
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDS_CORE_ADMINQ_H_
+#define _PDS_CORE_ADMINQ_H_
+
+enum pds_core_adminq_flags {
+	PDS_AQ_FLAG_FASTPOLL	= BIT(1),	/* completion poll at 1ms */
+};
+
+/*
+ * enum pds_core_adminq_opcode - AdminQ command opcodes
+ * These commands are only processed on AdminQ, not available in devcmd
+ */
+enum pds_core_adminq_opcode {
+	PDS_AQ_CMD_NOP			= 0,
+
+	/* Client control */
+	PDS_AQ_CMD_CLIENT_REG		= 6,
+	PDS_AQ_CMD_CLIENT_UNREG		= 7,
+	PDS_AQ_CMD_CLIENT_CMD		= 8,
+
+	/* LIF commands */
+	PDS_AQ_CMD_LIF_IDENTIFY		= 20,
+	PDS_AQ_CMD_LIF_INIT		= 21,
+	PDS_AQ_CMD_LIF_RESET		= 22,
+	PDS_AQ_CMD_LIF_GETATTR		= 23,
+	PDS_AQ_CMD_LIF_SETATTR		= 24,
+	PDS_AQ_CMD_LIF_SETPHC		= 25,
+
+	PDS_AQ_CMD_RX_MODE_SET		= 30,
+	PDS_AQ_CMD_RX_FILTER_ADD	= 31,
+	PDS_AQ_CMD_RX_FILTER_DEL	= 32,
+
+	/* Queue commands */
+	PDS_AQ_CMD_Q_IDENTIFY		= 39,
+	PDS_AQ_CMD_Q_INIT		= 40,
+	PDS_AQ_CMD_Q_CONTROL		= 41,
+
+	/* SR/IOV commands */
+	PDS_AQ_CMD_VF_GETATTR		= 60,
+	PDS_AQ_CMD_VF_SETATTR		= 61,
+};
+
+/*
+ * enum pds_core_notifyq_opcode - NotifyQ event codes
+ */
+enum pds_core_notifyq_opcode {
+	PDS_EVENT_LINK_CHANGE		= 1,
+	PDS_EVENT_RESET			= 2,
+	PDS_EVENT_XCVR			= 5,
+	PDS_EVENT_CLIENT		= 6,
+};
+
+#define PDS_COMP_COLOR_MASK  0x80
+
+/**
+ * struct pds_core_notifyq_event - Generic event reporting structure
+ * @eid:   event number
+ * @ecode: event code
+ *
+ * This is the generic event report struct from which the other
+ * actual events will be formed.
+ */
+struct pds_core_notifyq_event {
+	__le64 eid;
+	__le16 ecode;
+};
+
+/**
+ * struct pds_core_link_change_event - Link change event notification
+ * @eid:		event number
+ * @ecode:		event code = PDS_EVENT_LINK_CHANGE
+ * @link_status:	link up/down, with error bits
+ * @link_speed:		speed of the network link
+ *
+ * Sent when the network link state changes between UP and DOWN
+ */
+struct pds_core_link_change_event {
+	__le64 eid;
+	__le16 ecode;
+	__le16 link_status;
+	__le32 link_speed;	/* units of 1Mbps: e.g. 10000 = 10Gbps */
+};
+
+/**
+ * struct pds_core_reset_event - Reset event notification
+ * @eid:		event number
+ * @ecode:		event code = PDS_EVENT_RESET
+ * @reset_code:		reset type
+ * @state:		0=pending, 1=complete, 2=error
+ *
+ * Sent when the NIC or some subsystem is going to be or
+ * has been reset.
+ */
+struct pds_core_reset_event {
+	__le64 eid;
+	__le16 ecode;
+	u8     reset_code;
+	u8     state;
+};
+
+/**
+ * struct pds_core_client_event - Client event notification
+ * @eid:		event number
+ * @ecode:		event code = PDS_EVENT_CLIENT
+ * @client_id:          client to send event to
+ * @client_event:       wrapped event struct for the client
+ *
+ * Sent when an event needs to be passed on to a client
+ */
+struct pds_core_client_event {
+	__le64 eid;
+	__le16 ecode;
+	__le16 client_id;
+	u8     client_event[54];
+};
+
+/**
+ * struct pds_core_notifyq_cmd - Placeholder for building qcq
+ * @data:      anonymous field for building the qcq
+ */
+struct pds_core_notifyq_cmd {
+	__le32 data;	/* Not used but needed for qcq structure */
+};
+
+/*
+ * union pds_core_notifyq_comp - Overlay of notifyq event structures
+ */
+union pds_core_notifyq_comp {
+	struct {
+		__le64 eid;
+		__le16 ecode;
+	};
+	struct pds_core_notifyq_event     event;
+	struct pds_core_link_change_event link_change;
+	struct pds_core_reset_event       reset;
+	u8     data[64];
+};
+
+#define PDS_DEVNAME_LEN		32
+/**
+ * struct pds_core_client_reg_cmd - Register a new client with DSC
+ * @opcode:         opcode PDS_AQ_CMD_CLIENT_REG
+ * @rsvd:           word boundary padding
+ * @devname:        text name of client device
+ * @vif_type:       what type of device (enum pds_core_vif_types)
+ *
+ * Tell the DSC of the new client, and receive a client_id from DSC.
+ */
+struct pds_core_client_reg_cmd {
+	u8     opcode;
+	u8     rsvd[3];
+	char   devname[PDS_DEVNAME_LEN];
+	u8     vif_type;
+};
+
+/**
+ * struct pds_core_client_reg_comp - Client registration completion
+ * @status:     Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index: Index in the descriptor ring for which this is the completion
+ * @client_id:  New id assigned by DSC
+ * @rsvd1:      Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_client_reg_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	__le16 client_id;
+	u8     rsvd1[9];
+	u8     color;
+};
+
+/**
+ * struct pds_core_client_unreg_cmd - Unregister a client from DSC
+ * @opcode:     opcode PDS_AQ_CMD_CLIENT_UNREG
+ * @rsvd:       word boundary padding
+ * @client_id:  id of client being removed
+ *
+ * Tell the DSC this client is going away and remove its context
+ * This uses the generic completion.
+ */
+struct pds_core_client_unreg_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 client_id;
+};
+
+/**
+ * struct pds_core_client_request_cmd - Pass along a wrapped client AdminQ cmd
+ * @opcode:     opcode PDS_AQ_CMD_CLIENT_CMD
+ * @rsvd:       word boundary padding
+ * @client_id:  id of client the command is for
+ * @client_cmd: the wrapped client command
+ *
+ * Proxy post an adminq command for the client.
+ * This uses the generic completion.
+ */
+struct pds_core_client_request_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 client_id;
+	u8     client_cmd[60];
+};
+
+#define PDS_CORE_MAX_FRAGS		16
+
+#define PDS_CORE_QCQ_F_INITED		BIT(0)
+#define PDS_CORE_QCQ_F_SG		BIT(1)
+#define PDS_CORE_QCQ_F_INTR		BIT(2)
+#define PDS_CORE_QCQ_F_TX_STATS		BIT(3)
+#define PDS_CORE_QCQ_F_RX_STATS		BIT(4)
+#define PDS_CORE_QCQ_F_NOTIFYQ		BIT(5)
+#define PDS_CORE_QCQ_F_CMB_RINGS	BIT(6)
+#define PDS_CORE_QCQ_F_CORE		BIT(7)
+
+enum pds_core_lif_type {
+	PDS_CORE_LIF_TYPE_DEFAULT = 0,
+};
+
+/**
+ * union pds_core_lif_config - LIF configuration
+ * @state:	    LIF state (enum pds_core_lif_state)
+ * @rsvd:           Word boundary padding
+ * @name:	    LIF name
+ * @rsvd2:          Word boundary padding
+ * @features:	    LIF features active (enum pds_core_hw_features)
+ * @queue_count:    Queue counts per queue-type
+ * @words:          Full union buffer size
+ */
+union pds_core_lif_config {
+	struct {
+		u8     state;
+		u8     rsvd[3];
+		char   name[PDS_CORE_IFNAMSIZ];
+		u8     rsvd2[12];
+		__le64 features;
+		__le32 queue_count[PDS_CORE_QTYPE_MAX];
+	} __packed;
+	__le32 words[64];
+};
+
+/**
+ * struct pds_core_lif_status - LIF status register
+ * @eid:	     most recent NotifyQ event id
+ * @rsvd:            full struct size
+ */
+struct pds_core_lif_status {
+	__le64 eid;
+	u8     rsvd[56];
+};
+
+/**
+ * struct pds_core_lif_info - LIF info structure
+ * @config:	LIF configuration structure
+ * @status:	LIF status structure
+ */
+struct pds_core_lif_info {
+	union pds_core_lif_config config;
+	struct pds_core_lif_status status;
+};
+
+/**
+ * struct pds_core_lif_identity - LIF identity information (type-specific)
+ * @features:		LIF features (see enum pds_core_hw_features)
+ * @version:		Identify structure version
+ * @hw_index:		LIF hardware index
+ * @rsvd:		Word boundary padding
+ * @max_nb_sessions:	Maximum number of sessions supported
+ * @rsvd2:		buffer padding
+ * @config:		LIF config struct with features, q counts
+ */
+struct pds_core_lif_identity {
+	__le64 features;
+	u8     version;
+	u8     hw_index;
+	u8     rsvd[2];
+	__le32 max_nb_sessions;
+	u8     rsvd2[120];
+	union pds_core_lif_config config;
+};
+
+/**
+ * struct pds_core_lif_identify_cmd - Get LIF identity info command
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_IDENTIFY
+ * @type:	LIF type (enum pds_core_lif_type)
+ * @client_id:	Client identifier
+ * @ver:	Version of identify returned by device
+ * @rsvd:       Word boundary padding
+ * @ident_pa:	DMA address to receive identity info
+ *
+ * Firmware will copy LIF identity data (struct pds_core_lif_identity)
+ * into the buffer address given.
+ */
+struct pds_core_lif_identify_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	u8     ver;
+	u8     rsvd[3];
+	__le64 ident_pa;
+};
+
+/**
+ * struct pds_core_lif_identify_comp - LIF identify command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @ver:	Version of identify returned by device
+ * @bytes:	Bytes copied into the buffer
+ * @rsvd:       Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_lif_identify_comp {
+	u8     status;
+	u8     ver;
+	__le16 bytes;
+	u8     rsvd[11];
+	u8     color;
+};
+
+/**
+ * struct pds_core_lif_init_cmd - LIF init command
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_INIT
+ * @type:	LIF type (enum pds_core_lif_type)
+ * @client_id:	Client identifier
+ * @rsvd:       Word boundary padding
+ * @info_pa:	Destination address for LIF info (struct pds_core_lif_info)
+ */
+struct pds_core_lif_init_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	__le32 rsvd;
+	__le64 info_pa;
+};
+
+/**
+ * struct pds_core_lif_init_comp - LIF init command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @hw_index:	Hardware index of the initialized LIF
+ * @rsvd1:      Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_lif_init_comp {
+	u8 status;
+	u8 rsvd;
+	__le16 hw_index;
+	u8     rsvd1[11];
+	u8     color;
+};
+
+/**
+ * struct pds_core_lif_reset_cmd - LIF reset command
+ * Will reset only the specified LIF.
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_RESET
+ * @rsvd:       Word boundary padding
+ * @client_id:	Client identifier
+ */
+struct pds_core_lif_reset_cmd {
+	u8     opcode;
+	u8     rsvd;
+	__le16 client_id;
+};
+
+/**
+ * enum pds_core_lif_attr - List of LIF attributes
+ * @PDS_CORE_LIF_ATTR_STATE:		LIF state attribute
+ * @PDS_CORE_LIF_ATTR_NAME:		LIF name attribute
+ * @PDS_CORE_LIF_ATTR_FEATURES:		LIF features attribute
+ * @PDS_CORE_LIF_ATTR_STATS_CTRL:	LIF statistics control attribute
+ */
+enum pds_core_lif_attr {
+	PDS_CORE_LIF_ATTR_STATE		= 0,
+	PDS_CORE_LIF_ATTR_NAME		= 1,
+	PDS_CORE_LIF_ATTR_FEATURES	= 4,
+	PDS_CORE_LIF_ATTR_STATS_CTRL	= 6,
+};
+
+/**
+ * struct pds_core_lif_setattr_cmd - Set LIF attributes on the NIC
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_SETATTR
+ * @attr:	Attribute type (enum pds_core_lif_attr)
+ * @client_id:	Client identifier
+ * @state:	LIF state (enum pds_core_lif_state)
+ * @name:	The name string, 0 terminated
+ * @features:	Features (enum pds_core_hw_features)
+ * @stats_ctl:	Stats control commands (enum pds_core_stats_ctl_cmd)
+ * @rsvd:       Command Buffer padding
+ */
+struct pds_core_lif_setattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 client_id;
+	union {
+		u8      state;
+		char    name[PDS_CORE_IFNAMSIZ];
+		__le64  features;
+		u8      stats_ctl;
+		u8      rsvd[60];
+	} __packed;
+};
+
+/**
+ * struct pds_core_lif_setattr_comp - LIF set attr command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index: Index in the descriptor ring for which this is the completion
+ * @features:	Features (enum pds_core_hw_features)
+ * @rsvd2:      Word boundary padding
+ * @color:	Color bit
+ */
+struct pds_core_lif_setattr_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	union {
+		__le64  features;
+		u8      rsvd2[11];
+	} __packed;
+	u8     color;
+};
+
+/**
+ * struct pds_core_lif_getattr_cmd - Get LIF attributes from the NIC
+ * @opcode:	Opcode PDS_AQ_CMD_LIF_GETATTR
+ * @attr:	Attribute type (enum pds_core_lif_attr)
+ * @client_id:	Client identifier
+ */
+struct pds_core_lif_getattr_cmd {
+	u8     opcode;
+	u8     attr;
+	__le16 client_id;
+};
+
+/**
+ * struct pds_core_lif_getattr_comp - LIF get attr command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index: Index in the descriptor ring for which this is the completion
+ * @state:	LIF state (enum pds_core_lif_state)
+ * @name:	LIF name string, 0 terminated
+ * @features:	Features (enum pds_core_hw_features)
+ * @rsvd2:      Word boundary padding
+ * @color:	Color bit
+ */
+struct pds_core_lif_getattr_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	union {
+		u8      state;
+		__le64  features;
+		u8      rsvd2[11];
+	} __packed;
+	u8     color;
+};
+
+/**
+ * struct pds_core_q_identity - Queue identity information
+ * @version:	Queue type version that can be used with FW
+ * @supported:	Bitfield of queue versions, first bit = ver 0
+ * @rsvd:       Word boundary padding
+ * @features:	Queue features
+ * @desc_sz:	Descriptor size
+ * @comp_sz:	Completion descriptor size
+ * @rsvd2:      Word boundary padding
+ */
+struct pds_core_q_identity {
+	u8      version;
+	u8      supported;
+	u8      rsvd[6];
+#define PDS_CORE_QIDENT_F_CQ	0x01	/* queue has completion ring */
+	__le64  features;
+	__le16  desc_sz;
+	__le16  comp_sz;
+	u8      rsvd2[6];
+};
+
+/**
+ * struct pds_core_q_identify_cmd - queue identify command
+ * @opcode:	Opcode PDS_AQ_CMD_Q_IDENTIFY
+ * @type:	Logical queue type (enum pds_core_logical_qtype)
+ * @client_id:	Client identifier
+ * @ver:	Highest queue type version that the driver supports
+ * @rsvd:       Word boundary padding
+ * @ident_pa:   DMA address to receive the data (struct pds_core_q_identity)
+ */
+struct pds_core_q_identify_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	u8     ver;
+	u8     rsvd[3];
+	__le64 ident_pa;
+};
+
+/**
+ * struct pds_core_q_identify_comp - queue identify command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index:	Index in the descriptor ring for which this is the completion
+ * @ver:	Queue type version that can be used with FW
+ * @rsvd1:      Word boundary padding
+ * @color:      Color bit
+ */
+struct pds_core_q_identify_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	u8     ver;
+	u8     rsvd1[10];
+	u8     color;
+};
+
+/**
+ * struct pds_core_q_init_cmd - Queue init command
+ * @opcode:	  Opcode PDS_AQ_CMD_Q_INIT
+ * @type:	  Logical queue type
+ * @client_id:	  Client identifier
+ * @ver:	  Queue type version
+ * @rsvd:         Word boundary padding
+ * @index:	  (LIF, qtype) relative admin queue index
+ * @intr_index:	  Interrupt control register index, or Event queue index
+ * @pid:	  Process ID
+ * @flags:
+ *    IRQ:	  Interrupt requested on completion
+ *    ENA:	  Enable the queue.  If ENA=0 the queue is initialized
+ *		  but remains disabled, to be later enabled with the
+ *		  Queue Enable command. If ENA=1, then queue is
+ *		  initialized and then enabled.
+ * @cos:	  Class of service for this queue
+ * @ring_size:	  Queue ring size, encoded as a log2(size), in
+ *		  number of descriptors.  The actual ring size is
+ *		  (1 << ring_size).  For example, to select a ring size
+ *		  of 64 descriptors write ring_size = 6. The minimum
+ *		  ring_size value is 2 for a ring of 4 descriptors.
+ *		  The maximum ring_size value is 12 for a ring of 4k
+ *		  descriptors. Values of ring_size <2 and >12 are
+ *		  reserved.
+ * @ring_base:	  Queue ring base address
+ * @cq_ring_base: Completion queue ring base address
+ */
+struct pds_core_q_init_cmd {
+	u8     opcode;
+	u8     type;
+	__le16 client_id;
+	u8     ver;
+	u8     rsvd[3];
+	__le32 index;
+	__le16 pid;
+	__le16 intr_index;
+	__le16 flags;
+#define PDS_CORE_QINIT_F_IRQ	0x01	/* Request interrupt on completion */
+#define PDS_CORE_QINIT_F_ENA	0x02	/* Enable the queue */
+	u8     cos;
+#define PDS_CORE_QSIZE_MIN_LG2	2
+#define PDS_CORE_QSIZE_MAX_LG2	12
+	u8     ring_size;
+	__le64 ring_base;
+	__le64 cq_ring_base;
+} __packed;
+
+/**
+ * struct pds_core_q_init_comp - Queue init command completion
+ * @status:	Status of the command (enum pds_core_status_code)
+ * @rsvd:       Word boundary padding
+ * @comp_index:	Index in the descriptor ring for which this is the completion
+ * @hw_index:	Hardware Queue ID
+ * @hw_type:	Hardware Queue type
+ * @rsvd2:      Word boundary padding
+ * @color:	Color
+ */
+struct pds_core_q_init_comp {
+	u8     status;
+	u8     rsvd;
+	__le16 comp_index;
+	__le32 hw_index;
+	u8     hw_type;
+	u8     rsvd2[6];
+	u8     color;
+};
+
+union pds_core_adminq_cmd {
+	u8     opcode;
+	u8     bytes[64];
+
+	struct pds_core_client_reg_cmd     client_reg;
+	struct pds_core_client_unreg_cmd   client_unreg;
+	struct pds_core_client_request_cmd client_request;
+
+	struct pds_core_lif_identify_cmd  lif_ident;
+	struct pds_core_lif_init_cmd      lif_init;
+	struct pds_core_lif_reset_cmd     lif_reset;
+	struct pds_core_lif_setattr_cmd   lif_setattr;
+	struct pds_core_lif_getattr_cmd   lif_getattr;
+
+	struct pds_core_q_identify_cmd    q_ident;
+	struct pds_core_q_init_cmd        q_init;
+};
+
+union pds_core_adminq_comp {
+	struct {
+		u8     status;
+		u8     rsvd;
+		__le16 comp_index;
+		u8     rsvd2[11];
+		u8     color;
+	};
+	u32    words[4];
+
+	struct pds_core_client_reg_comp   client_reg;
+
+	struct pds_core_lif_identify_comp lif_ident;
+	struct pds_core_lif_init_comp     lif_init;
+	struct pds_core_lif_setattr_comp  lif_setattr;
+	struct pds_core_lif_getattr_comp  lif_getattr;
+
+	struct pds_core_q_identify_comp   q_ident;
+	struct pds_core_q_init_comp       q_init;
+};
+
+#ifndef __CHECKER__
+static_assert(sizeof(union pds_core_adminq_cmd) == 64);
+static_assert(sizeof(union pds_core_adminq_comp) == 16);
+static_assert(sizeof(union pds_core_notifyq_comp) == 64);
+#endif /* __CHECKER__ */
+
+/* The color bit is a 'done' bit for the completion descriptors
+ * where the meaning alternates between '1' and '0' for alternating
+ * passes through the completion descriptor ring.
+ */
+static inline u8 pdsc_color_match(u8 color, u8 done_color)
+{
+	return (!!(color & PDS_COMP_COLOR_MASK)) == done_color;
+}
+#endif /* _PDS_CORE_ADMINQ_H_ */
-- 
cgit v1.2.3


From 01ba61b55b2041a39c54aefb3153c770dd59a0ef Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:19 -0700
Subject: pds_core: Add adminq processing and commands

Add the service routines for submitting and processing
the adminq messages and for handling notifyq events.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/Makefile |   1 +
 drivers/net/ethernet/amd/pds_core/adminq.c | 288 +++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.c   |  11 --
 include/linux/pds/pds_adminq.h             |  11 +-
 4 files changed, 299 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/pds_core/adminq.c

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
index eaca8557ba66..ef76dcd7fccd 100644
--- a/drivers/net/ethernet/amd/pds_core/Makefile
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_PDS_CORE) := pds_core.o
 pds_core-y := main.o \
 	      devlink.o \
 	      dev.o \
+	      adminq.o \
 	      core.o
 
 pds_core-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c
new file mode 100644
index 000000000000..fb2ba3f62480
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/adminq.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/dynamic_debug.h>
+
+#include "core.h"
+
+struct pdsc_wait_context {
+	struct pdsc_qcq *qcq;
+	struct completion wait_completion;
+};
+
+static int pdsc_process_notifyq(struct pdsc_qcq *qcq)
+{
+	union pds_core_notifyq_comp *comp;
+	struct pdsc *pdsc = qcq->pdsc;
+	struct pdsc_cq *cq = &qcq->cq;
+	struct pdsc_cq_info *cq_info;
+	int nq_work = 0;
+	u64 eid;
+
+	cq_info = &cq->info[cq->tail_idx];
+	comp = cq_info->comp;
+	eid = le64_to_cpu(comp->event.eid);
+	while (eid > pdsc->last_eid) {
+		u16 ecode = le16_to_cpu(comp->event.ecode);
+
+		switch (ecode) {
+		case PDS_EVENT_LINK_CHANGE:
+			dev_info(pdsc->dev, "NotifyQ LINK_CHANGE ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+
+		case PDS_EVENT_RESET:
+			dev_info(pdsc->dev, "NotifyQ RESET ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+
+		case PDS_EVENT_XCVR:
+			dev_info(pdsc->dev, "NotifyQ XCVR ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+
+		default:
+			dev_info(pdsc->dev, "NotifyQ ecode %d eid %lld\n",
+				 ecode, eid);
+			break;
+		}
+
+		pdsc->last_eid = eid;
+		cq->tail_idx = (cq->tail_idx + 1) & (cq->num_descs - 1);
+		cq_info = &cq->info[cq->tail_idx];
+		comp = cq_info->comp;
+		eid = le64_to_cpu(comp->event.eid);
+
+		nq_work++;
+	}
+
+	qcq->accum_work += nq_work;
+
+	return nq_work;
+}
+
+void pdsc_process_adminq(struct pdsc_qcq *qcq)
+{
+	union pds_core_adminq_comp *comp;
+	struct pdsc_queue *q = &qcq->q;
+	struct pdsc *pdsc = qcq->pdsc;
+	struct pdsc_cq *cq = &qcq->cq;
+	struct pdsc_q_info *q_info;
+	unsigned long irqflags;
+	int nq_work = 0;
+	int aq_work = 0;
+	int credits;
+
+	/* Don't process AdminQ when shutting down */
+	if (pdsc->state & BIT_ULL(PDSC_S_STOPPING_DRIVER)) {
+		dev_err(pdsc->dev, "%s: called while PDSC_S_STOPPING_DRIVER\n",
+			__func__);
+		return;
+	}
+
+	/* Check for NotifyQ event */
+	nq_work = pdsc_process_notifyq(&pdsc->notifyqcq);
+
+	/* Check for empty queue, which can happen if the interrupt was
+	 * for a NotifyQ event and there are no new AdminQ completions.
+	 */
+	if (q->tail_idx == q->head_idx)
+		goto credits;
+
+	/* Find the first completion to clean,
+	 * run the callback in the related q_info,
+	 * and continue while we still match done color
+	 */
+	spin_lock_irqsave(&pdsc->adminq_lock, irqflags);
+	comp = cq->info[cq->tail_idx].comp;
+	while (pdsc_color_match(comp->color, cq->done_color)) {
+		q_info = &q->info[q->tail_idx];
+		q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
+
+		/* Copy out the completion data */
+		memcpy(q_info->dest, comp, sizeof(*comp));
+
+		complete_all(&q_info->wc->wait_completion);
+
+		if (cq->tail_idx == cq->num_descs - 1)
+			cq->done_color = !cq->done_color;
+		cq->tail_idx = (cq->tail_idx + 1) & (cq->num_descs - 1);
+		comp = cq->info[cq->tail_idx].comp;
+
+		aq_work++;
+	}
+	spin_unlock_irqrestore(&pdsc->adminq_lock, irqflags);
+
+	qcq->accum_work += aq_work;
+
+credits:
+	/* Return the interrupt credits, one for each completion */
+	credits = nq_work + aq_work;
+	if (credits)
+		pds_core_intr_credits(&pdsc->intr_ctrl[qcq->intx],
+				      credits,
+				      PDS_CORE_INTR_CRED_REARM);
+}
+
+void pdsc_work_thread(struct work_struct *work)
+{
+	struct pdsc_qcq *qcq = container_of(work, struct pdsc_qcq, work);
+
+	pdsc_process_adminq(qcq);
+}
+
+irqreturn_t pdsc_adminq_isr(int irq, void *data)
+{
+	struct pdsc_qcq *qcq = data;
+	struct pdsc *pdsc = qcq->pdsc;
+
+	/* Don't process AdminQ when shutting down */
+	if (pdsc->state & BIT_ULL(PDSC_S_STOPPING_DRIVER)) {
+		dev_err(pdsc->dev, "%s: called while PDSC_S_STOPPING_DRIVER\n",
+			__func__);
+		return IRQ_HANDLED;
+	}
+
+	queue_work(pdsc->wq, &qcq->work);
+	/* index by the qcq's interrupt slot; 'irq' is the Linux IRQ number */
+	pds_core_intr_mask(&pdsc->intr_ctrl[qcq->intx], PDS_CORE_INTR_MASK_CLEAR);
+	return IRQ_HANDLED;
+}
+
+static int __pdsc_adminq_post(struct pdsc *pdsc,
+			      struct pdsc_qcq *qcq,
+			      union pds_core_adminq_cmd *cmd,
+			      union pds_core_adminq_comp *comp,
+			      struct pdsc_wait_context *wc)
+{
+	struct pdsc_queue *q = &qcq->q;
+	struct pdsc_q_info *q_info;
+	unsigned long irqflags;
+	unsigned int avail;
+	int index;
+	int ret;
+
+	spin_lock_irqsave(&pdsc->adminq_lock, irqflags);
+
+	/* Check for space in the queue */
+	avail = q->tail_idx;
+	if (q->head_idx >= avail)
+		avail += q->num_descs - q->head_idx - 1;
+	else
+		avail -= q->head_idx + 1;
+	if (!avail) {
+		ret = -ENOSPC;
+		goto err_out_unlock;
+	}
+
+	/* Check that the FW is running */
+	if (!pdsc_is_fw_running(pdsc)) {
+		u8 fw_status = ioread8(&pdsc->info_regs->fw_status);
+
+		dev_info(pdsc->dev, "%s: post failed - fw not running %#02x:\n",
+			 __func__, fw_status);
+		ret = -ENXIO;
+
+		goto err_out_unlock;
+	}
+
+	/* Post the request */
+	index = q->head_idx;
+	q_info = &q->info[index];
+	q_info->wc = wc;
+	q_info->dest = comp;
+	memcpy(q_info->desc, cmd, sizeof(*cmd));
+
+	dev_dbg(pdsc->dev, "head_idx %d tail_idx %d\n",
+		q->head_idx, q->tail_idx);
+	dev_dbg(pdsc->dev, "post admin queue command:\n");
+	dynamic_hex_dump("cmd ", DUMP_PREFIX_OFFSET, 16, 1,
+			 cmd, sizeof(*cmd), true);
+
+	q->head_idx = (q->head_idx + 1) & (q->num_descs - 1);
+
+	pds_core_dbell_ring(pdsc->kern_dbpage,
+			    q->hw_type, q->dbval | q->head_idx);
+	ret = index;
+
+err_out_unlock:
+	spin_unlock_irqrestore(&pdsc->adminq_lock, irqflags);
+	return ret;
+}
+
+int pdsc_adminq_post(struct pdsc *pdsc,
+		     union pds_core_adminq_cmd *cmd,
+		     union pds_core_adminq_comp *comp,
+		     bool fast_poll)
+{
+	struct pdsc_wait_context wc = {
+		.wait_completion =
+			COMPLETION_INITIALIZER_ONSTACK(wc.wait_completion),
+	};
+	unsigned long poll_interval = 1;
+	unsigned long poll_jiffies;
+	unsigned long time_limit;
+	unsigned long time_start;
+	unsigned long time_done;
+	unsigned long remaining;
+	int err = 0;
+	int index;
+
+	wc.qcq = &pdsc->adminqcq;
+	index = __pdsc_adminq_post(pdsc, &pdsc->adminqcq, cmd, comp, &wc);
+	if (index < 0) {
+		err = index;
+		goto err_out;
+	}
+
+	time_start = jiffies;
+	time_limit = time_start + HZ * pdsc->devcmd_timeout;
+	do {
+		/* Timeslice the actual wait to catch IO errors etc early */
+		poll_jiffies = msecs_to_jiffies(poll_interval);
+		remaining = wait_for_completion_timeout(&wc.wait_completion,
+							poll_jiffies);
+		if (remaining)
+			break;
+
+		if (!pdsc_is_fw_running(pdsc)) {
+			u8 fw_status = ioread8(&pdsc->info_regs->fw_status);
+
+			dev_dbg(pdsc->dev, "%s: post wait failed - fw not running %#02x:\n",
+				__func__, fw_status);
+			err = -ENXIO;
+			break;
+		}
+
+		/* When fast_poll is not requested, prevent aggressive polling
+		 * on failures due to timeouts by doing exponential back off.
+		 */
+		if (!fast_poll && poll_interval < PDSC_ADMINQ_MAX_POLL_INTERVAL)
+			poll_interval <<= 1;
+	} while (time_before(jiffies, time_limit));
+	time_done = jiffies;
+	dev_dbg(pdsc->dev, "%s: elapsed %d msecs\n",
+		__func__, jiffies_to_msecs(time_done - time_start));
+
+	/* Timed out only if the loop ran dry with no completion or fw error */
+	if (!remaining && !err)
+		err = -ETIMEDOUT;
+
+	dev_dbg(pdsc->dev, "read admin queue completion idx %d:\n", index);
+	dynamic_hex_dump("comp ", DUMP_PREFIX_OFFSET, 16, 1,
+			 comp, sizeof(*comp), true);
+
+	if (remaining && comp->status)
+		err = pdsc_err_to_errno(comp->status);
+
+err_out:
+	if (err) {
+		dev_dbg(pdsc->dev, "%s: opcode %d status %d err %pe\n",
+			__func__, cmd->opcode, comp->status, ERR_PTR(err));
+		if (err == -ENXIO || err == -ETIMEDOUT)
+			queue_work(pdsc->wq, &pdsc->health_work);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pdsc_adminq_post);
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index 8c0dbdb5efc5..59daf8a67ac6 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -6,17 +6,6 @@
 
 #include "core.h"
 
-void pdsc_work_thread(struct work_struct *work)
-{
-	/* stub */
-}
-
-irqreturn_t pdsc_adminq_isr(int irq, void *data)
-{
-	/* stub */
-	return IRQ_HANDLED;
-}
-
 void pdsc_intr_free(struct pdsc *pdsc, int index)
 {
 	struct pdsc_intr_info *intr_info;
diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index dd5fbe3ee141..98a60ce87b92 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -4,6 +4,8 @@
 #ifndef _PDS_CORE_ADMINQ_H_
 #define _PDS_CORE_ADMINQ_H_
 
+#define PDSC_ADMINQ_MAX_POLL_INTERVAL	256
+
 enum pds_core_adminq_flags {
 	PDS_AQ_FLAG_FASTPOLL	= BIT(1),	/* completion poll at 1ms */
 };
@@ -631,8 +633,15 @@ static_assert(sizeof(union pds_core_notifyq_comp) == 64);
  * where the meaning alternates between '1' and '0' for alternating
  * passes through the completion descriptor ring.
  */
-static inline u8 pdsc_color_match(u8 color, u8 done_color)
+static inline bool pdsc_color_match(u8 color, bool done_color)
 {
 	return (!!(color & PDS_COMP_COLOR_MASK)) == done_color;
 }
+
+struct pdsc;
+int pdsc_adminq_post(struct pdsc *pdsc,
+		     union pds_core_adminq_cmd *cmd,
+		     union pds_core_adminq_comp *comp,
+		     bool fast_poll);
+
 #endif /* _PDS_CORE_ADMINQ_H_ */
-- 
cgit v1.2.3


From 65e0185ad764d2801811bb2e7c122e92557208c4 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:21 -0700
Subject: pds_core: set up the VIF definitions and defaults

The Virtual Interfaces (VIFs) supported by the DSC's
configuration (vDPA, Eth, RDMA, etc) are reported in the
dev_ident struct and made visible in debugfs.  At this point
only vDPA is supported in this driver so we only setup
devices for that feature.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/core.c    | 48 +++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.h    | 11 +++++++
 drivers/net/ethernet/amd/pds_core/debugfs.c | 24 +++++++++++++++
 include/linux/pds/pds_common.h              | 19 ++++++++++++
 4 files changed, 102 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index 59daf8a67ac6..b2fca3b99f02 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -346,6 +346,43 @@ static int pdsc_core_init(struct pdsc *pdsc)
 	return err;
 }
 
+static struct pdsc_viftype pdsc_viftype_defaults[] = {
+	[PDS_DEV_TYPE_VDPA] = { .name = PDS_DEV_TYPE_VDPA_STR,
+				.vif_id = PDS_DEV_TYPE_VDPA,
+				.dl_id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET },
+	[PDS_DEV_TYPE_MAX] = {}
+};
+
+static int pdsc_viftypes_init(struct pdsc *pdsc)
+{
+	enum pds_core_vif_types vt;
+
+	pdsc->viftype_status = kzalloc(sizeof(pdsc_viftype_defaults),
+				       GFP_KERNEL);
+	if (!pdsc->viftype_status)
+		return -ENOMEM;
+
+	for (vt = 0; vt < PDS_DEV_TYPE_MAX; vt++) {
+		bool vt_support;
+
+		if (!pdsc_viftype_defaults[vt].name)
+			continue;
+
+		/* Start from the static pdsc_viftype_defaults[] entry */
+		pdsc->viftype_status[vt] = pdsc_viftype_defaults[vt];
+
+		/* A non-zero dev_ident.vif_types[vt] is treated as supported */
+		vt_support = !!le16_to_cpu(pdsc->dev_ident.vif_types[vt]);
+		dev_dbg(pdsc->dev, "VIF %s is %ssupported\n",
+			pdsc->viftype_status[vt].name,
+			vt_support ? "" : "not ");
+
+		pdsc->viftype_status[vt].supported = vt_support;
+	}
+
+	return 0;
+}
+
 int pdsc_setup(struct pdsc *pdsc, bool init)
 {
 	int numdescs;
@@ -388,6 +425,14 @@ int pdsc_setup(struct pdsc *pdsc, bool init)
 	if (err)
 		goto err_out_teardown;
 
+	/* Set up the VIFs */
+	err = pdsc_viftypes_init(pdsc);
+	if (err)
+		goto err_out_teardown;
+
+	if (init)
+		pdsc_debugfs_add_viftype(pdsc);
+
 	clear_bit(PDSC_S_FW_DEAD, &pdsc->state);
 	return 0;
 
@@ -404,6 +449,9 @@ void pdsc_teardown(struct pdsc *pdsc, bool removing)
 	pdsc_qcq_free(pdsc, &pdsc->notifyqcq);
 	pdsc_qcq_free(pdsc, &pdsc->adminqcq);
 
+	kfree(pdsc->viftype_status);
+	pdsc->viftype_status = NULL;
+
 	if (pdsc->intr_info) {
 		for (i = 0; i < pdsc->nintrs; i++)
 			pdsc_intr_free(pdsc, i);
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 7eb02b359f3a..ac0480d7f0f1 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -123,6 +123,15 @@ struct pdsc_qcq {
 	struct dentry *dentry;
 };
 
+struct pdsc_viftype {
+	char *name;
+	bool supported;
+	bool enabled;
+	int dl_id;
+	int vif_id;
+	struct pds_auxiliary_dev *padev;
+};
+
 /* No state flags set means we are in a steady running state */
 enum pdsc_state_flags {
 	PDSC_S_FW_DEAD,		    /* stopped, wait on startup or recovery */
@@ -174,6 +183,7 @@ struct pdsc {
 	struct pdsc_qcq adminqcq;
 	struct pdsc_qcq notifyqcq;
 	u64 last_eid;
+	struct pdsc_viftype *viftype_status;
 };
 
 /** enum pds_core_dbell_bits - bitwise composition of dbell values.
@@ -237,6 +247,7 @@ void pdsc_debugfs_destroy(void);
 void pdsc_debugfs_add_dev(struct pdsc *pdsc);
 void pdsc_debugfs_del_dev(struct pdsc *pdsc);
 void pdsc_debugfs_add_ident(struct pdsc *pdsc);
+void pdsc_debugfs_add_viftype(struct pdsc *pdsc);
 void pdsc_debugfs_add_irqs(struct pdsc *pdsc);
 void pdsc_debugfs_add_qcq(struct pdsc *pdsc, struct pdsc_qcq *qcq);
 void pdsc_debugfs_del_qcq(struct pdsc_qcq *qcq);
diff --git a/drivers/net/ethernet/amd/pds_core/debugfs.c b/drivers/net/ethernet/amd/pds_core/debugfs.c
index b83e5016644b..8ec392299b7d 100644
--- a/drivers/net/ethernet/amd/pds_core/debugfs.c
+++ b/drivers/net/ethernet/amd/pds_core/debugfs.c
@@ -68,6 +68,30 @@ void pdsc_debugfs_add_ident(struct pdsc *pdsc)
 			    pdsc, &identity_fops);
 }
 
+static int viftype_show(struct seq_file *seq, void *v)
+{
+	struct pdsc *pdsc = seq->private;
+	int vt;
+
+	for (vt = 0; vt < PDS_DEV_TYPE_MAX; vt++) {
+		if (!pdsc->viftype_status[vt].name)
+			continue;
+
+		seq_printf(seq, "%s\t%d supported %d enabled\n",
+			   pdsc->viftype_status[vt].name,
+			   pdsc->viftype_status[vt].supported,
+			   pdsc->viftype_status[vt].enabled);
+	}
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(viftype);
+
+void pdsc_debugfs_add_viftype(struct pdsc *pdsc)
+{
+	debugfs_create_file("viftypes", 0400, pdsc->dentry,
+			    pdsc, &viftype_fops);
+}
+
 static const struct debugfs_reg32 intr_ctrl_regs[] = {
 	{ .name = "coal_init", .offset = 0, },
 	{ .name = "mask", .offset = 4, },
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index f0798ce01acf..b2be14ebadb6 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -20,6 +20,25 @@ enum pds_core_driver_type {
 	PDS_DRIVER_ESXI    = 6,
 };
 
+enum pds_core_vif_types {
+	PDS_DEV_TYPE_CORE	= 0,
+	PDS_DEV_TYPE_VDPA	= 1,
+	PDS_DEV_TYPE_VFIO	= 2,
+	PDS_DEV_TYPE_ETH	= 3,
+	PDS_DEV_TYPE_RDMA	= 4,
+	PDS_DEV_TYPE_LM		= 5,
+
+	/* new ones added before this line */
+	PDS_DEV_TYPE_MAX	= 16   /* don't change - used in struct size */
+};
+
+#define PDS_DEV_TYPE_CORE_STR	"Core"
+#define PDS_DEV_TYPE_VDPA_STR	"vDPA"
+#define PDS_DEV_TYPE_VFIO_STR	"VFio"
+#define PDS_DEV_TYPE_ETH_STR	"Eth"
+#define PDS_DEV_TYPE_RDMA_STR	"RDMA"
+#define PDS_DEV_TYPE_LM_STR	"LM"
+
 #define PDS_CORE_IFNAMSIZ		16
 
 /**
-- 
cgit v1.2.3


From 4569cce43bc61e4cdd76597a1cf9b608846c18cc Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:23 -0700
Subject: pds_core: add auxiliary_bus devices

An auxiliary_bus device is created for each vDPA type VF at VF
probe and destroyed at VF remove.  The aux device name comes
from the driver name + VIF type + the unique id assigned at PCI
probe.  The VFs are always removed on PF remove, so there should
be no issues with VFs trying to access missing PF structures.

The auxiliary_device names will look like "pds_core.vDPA.nn"
where 'nn' is the VF's uid.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/Makefile |   1 +
 drivers/net/ethernet/amd/pds_core/auxbus.c | 115 +++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.h   |   6 ++
 drivers/net/ethernet/amd/pds_core/main.c   |  36 ++++++++-
 include/linux/pds/pds_auxbus.h             |  14 ++++
 include/linux/pds/pds_common.h             |   1 +
 6 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/pds_core/auxbus.c
 create mode 100644 include/linux/pds/pds_auxbus.h

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/Makefile b/drivers/net/ethernet/amd/pds_core/Makefile
index 6d1d6c58a1fa..0abc33ce826c 100644
--- a/drivers/net/ethernet/amd/pds_core/Makefile
+++ b/drivers/net/ethernet/amd/pds_core/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_PDS_CORE) := pds_core.o
 
 pds_core-y := main.o \
 	      devlink.o \
+	      auxbus.o \
 	      dev.o \
 	      adminq.o \
 	      core.o \
diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c
new file mode 100644
index 000000000000..adee516b3f0c
--- /dev/null
+++ b/drivers/net/ethernet/amd/pds_core/auxbus.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#include <linux/pci.h>
+
+#include "core.h"
+#include <linux/pds/pds_auxbus.h>
+
+static void pdsc_auxbus_dev_release(struct device *dev)
+{
+	struct pds_auxiliary_dev *padev =
+		container_of(dev, struct pds_auxiliary_dev, aux_dev.dev);
+
+	kfree(padev);
+}
+
+static struct pds_auxiliary_dev *pdsc_auxbus_dev_register(struct pdsc *cf,
+							  struct pdsc *pf,
+							  char *name)
+{
+	struct auxiliary_device *aux_dev;
+	struct pds_auxiliary_dev *padev;
+	int err;
+
+	padev = kzalloc(sizeof(*padev), GFP_KERNEL);
+	if (!padev)
+		return ERR_PTR(-ENOMEM);
+
+	padev->vf_pdev = cf->pdev;
+
+	aux_dev = &padev->aux_dev;
+	aux_dev->name = name;
+	aux_dev->id = cf->uid;
+	aux_dev->dev.parent = cf->dev;
+	aux_dev->dev.release = pdsc_auxbus_dev_release;
+
+	err = auxiliary_device_init(aux_dev);
+	if (err < 0) {
+		dev_warn(cf->dev, "auxiliary_device_init of %s failed: %pe\n",
+			 name, ERR_PTR(err));
+		/* init failed, so no device ref is held yet: free directly */
+		kfree(padev);
+		return ERR_PTR(err);
+	}
+
+	err = auxiliary_device_add(aux_dev);
+	if (err) {
+		dev_warn(cf->dev, "auxiliary_device_add of %s failed: %pe\n",
+			 name, ERR_PTR(err));
+		/* uninit drops the ref from init, so the release callback
+		 * frees padev - a kfree() here would be a double free
+		 */
+		auxiliary_device_uninit(aux_dev);
+		return ERR_PTR(err);
+	}
+
+	return padev;
+}
+
+int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf)
+{
+	struct pds_auxiliary_dev *padev;
+	int err = 0;
+
+	mutex_lock(&pf->config_lock);
+
+	padev = pf->vfs[cf->vf_id].padev;
+	if (padev) {
+		auxiliary_device_delete(&padev->aux_dev);
+		auxiliary_device_uninit(&padev->aux_dev);
+	}
+	pf->vfs[cf->vf_id].padev = NULL;
+
+	mutex_unlock(&pf->config_lock);
+	return err;
+}
+
+int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf)
+{
+	struct pds_auxiliary_dev *padev;
+	enum pds_core_vif_types vt;
+	u16 vt_support;
+	int err = 0;
+
+	mutex_lock(&pf->config_lock);
+
+	/* We only support vDPA so far, so it is the only one to
+	 * be verified that it is available in the Core device and
+	 * enabled in the devlink param.  In the future this might
+	 * become a loop for several VIF types.
+	 */
+
+	/* Verify that the type is supported and enabled.  It is not
+	 * an error if there is no auxbus device support for this
+	 * VF, it just means something else needs to happen with it.
+	 */
+	vt = PDS_DEV_TYPE_VDPA;
+	vt_support = !!le16_to_cpu(pf->dev_ident.vif_types[vt]);
+	if (!(vt_support &&
+	      pf->viftype_status[vt].supported &&
+	      pf->viftype_status[vt].enabled))
+		goto out_unlock;
+
+	padev = pdsc_auxbus_dev_register(cf, pf,
+					 pf->viftype_status[vt].name);
+	if (IS_ERR(padev)) {
+		err = PTR_ERR(padev);
+		goto out_unlock;
+	}
+	pf->vfs[cf->vf_id].padev = padev;
+
+out_unlock:
+	mutex_unlock(&pf->config_lock);
+	return err;
+}
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 1ec8784773f1..36099d3ac3dd 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -30,8 +30,11 @@ struct pdsc_dev_bar {
 	int res_index;
 };
 
+struct pdsc;
+
 struct pdsc_vf {
 	struct pds_auxiliary_dev *padev;
+	struct pdsc *vf;
 	u16     index;
 	__le16  vif_types[PDS_DEV_TYPE_MAX];
 };
@@ -287,6 +290,9 @@ int pdsc_start(struct pdsc *pdsc);
 void pdsc_stop(struct pdsc *pdsc);
 void pdsc_health_thread(struct work_struct *work);
 
+int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf);
+int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf);
+
 void pdsc_process_adminq(struct pdsc_qcq *qcq);
 void pdsc_work_thread(struct work_struct *work);
 irqreturn_t pdsc_adminq_isr(int irq, void *data);
diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c
index 511cb91a295e..b848f3360fe2 100644
--- a/drivers/net/ethernet/amd/pds_core/main.c
+++ b/drivers/net/ethernet/amd/pds_core/main.c
@@ -169,6 +169,12 @@ no_vfs:
 static int pdsc_init_vf(struct pdsc *vf)
 {
 	struct devlink *dl;
+	struct pdsc *pf;
+	int err;
+
+	pf = pdsc_get_pf_struct(vf->pdev);
+	if (IS_ERR_OR_NULL(pf))
+		return PTR_ERR(pf) ?: -1;
 
 	vf->vf_id = pci_iov_vf_id(vf->pdev);
 
@@ -177,7 +183,15 @@ static int pdsc_init_vf(struct pdsc *vf)
 	devl_register(dl);
 	devl_unlock(dl);
 
-	return 0;
+	pf->vfs[vf->vf_id].vf = vf;
+	err = pdsc_auxbus_dev_add(vf, pf);
+	if (err) {
+		devl_lock(dl);
+		devl_unregister(dl);
+		devl_unlock(dl);
+	}
+
+	return err;
 }
 
 static const struct devlink_health_reporter_ops pdsc_fw_reporter_ops = {
@@ -365,7 +379,19 @@ static void pdsc_remove(struct pci_dev *pdev)
 	}
 	devl_unlock(dl);
 
-	if (!pdev->is_virtfn) {
+	if (pdev->is_virtfn) {
+		struct pdsc *pf;
+
+		pf = pdsc_get_pf_struct(pdsc->pdev);
+		if (!IS_ERR(pf)) {
+			pdsc_auxbus_dev_del(pdsc, pf);
+			pf->vfs[pdsc->vf_id].vf = NULL;
+		}
+	} else {
+		/* Remove the VFs and their aux_bus connections before other
+		 * cleanup so that the clients can use the AdminQ to cleanly
+		 * shut themselves down.
+		 */
 		pdsc_sriov_configure(pdev, 0);
 
 		del_timer_sync(&pdsc->wdtimer);
@@ -402,6 +428,12 @@ static struct pci_driver pdsc_driver = {
 	.sriov_configure = pdsc_sriov_configure,
 };
 
+void *pdsc_get_pf_struct(struct pci_dev *vf_pdev)
+{
+	return pci_iov_get_pf_drvdata(vf_pdev, &pdsc_driver);
+}
+EXPORT_SYMBOL_GPL(pdsc_get_pf_struct);
+
 static int __init pdsc_init_module(void)
 {
 	if (strcmp(KBUILD_MODNAME, PDS_CORE_DRV_NAME))
diff --git a/include/linux/pds/pds_auxbus.h b/include/linux/pds/pds_auxbus.h
new file mode 100644
index 000000000000..493f75b1995e
--- /dev/null
+++ b/include/linux/pds/pds_auxbus.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc */
+
+#ifndef _PDSC_AUXBUS_H_
+#define _PDSC_AUXBUS_H_
+
+#include <linux/auxiliary_bus.h>
+
+struct pds_auxiliary_dev {
+	struct auxiliary_device aux_dev;
+	struct pci_dev *vf_pdev;
+	u16 client_id;
+};
+#endif /* _PDSC_AUXBUS_H_ */
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index b2be14ebadb6..961b3d02c69f 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -60,4 +60,5 @@ enum pds_core_logical_qtype {
 	PDS_CORE_QTYPE_MAX     = 16   /* don't change - used in struct size */
 };
 
+void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
 #endif /* _PDS_COMMON_H_ */
-- 
cgit v1.2.3


From 10659034c622738bc1bfab8a76fc576c52d5acce Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:25 -0700
Subject: pds_core: add the aux client API

Add the client API operations for running adminq commands.
The core registers the client with the FW, then the client
has a context for requesting adminq services.  We expect
to add additional operations for other clients, including
requesting additional private adminqs and IRQs, but don't have
the need yet.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/auxbus.c | 151 ++++++++++++++++++++++++++++-
 include/linux/pds/pds_auxbus.h             |   6 ++
 include/linux/pds/pds_common.h             |   2 +
 3 files changed, 158 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c
index adee516b3f0c..561af8e5b3ea 100644
--- a/drivers/net/ethernet/amd/pds_core/auxbus.c
+++ b/drivers/net/ethernet/amd/pds_core/auxbus.c
@@ -6,6 +6,136 @@
 #include "core.h"
 #include <linux/pds/pds_auxbus.h>
 
+/**
+ * pds_client_register - Link the client to the firmware
+ * @pf_pdev:	ptr to the PF's pci_dev (its drvdata is the PF's struct pdsc)
+ * @devname:	name that includes service info, e.g. pds_core.vDPA
+ *
+ * Return: positive client_id on success, or
+ *         negative for error
+ */
+int pds_client_register(struct pci_dev *pf_pdev, char *devname)
+{
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {};
+	struct pdsc *pf;
+	int err;
+	u16 ci;
+
+	pf = pci_get_drvdata(pf_pdev);
+	if (pf->state)
+		return -ENXIO;
+
+	cmd.client_reg.opcode = PDS_AQ_CMD_CLIENT_REG;
+	strscpy(cmd.client_reg.devname, devname,
+		sizeof(cmd.client_reg.devname));
+
+	err = pdsc_adminq_post(pf, &cmd, &comp, false);
+	if (err) {
+		dev_info(pf->dev, "register dev_name %s with DSC failed, status %d: %pe\n",
+			 devname, comp.status, ERR_PTR(err));
+		return err;
+	}
+
+	ci = le16_to_cpu(comp.client_reg.client_id);
+	if (!ci) {
+		dev_err(pf->dev, "%s: device returned null client_id\n",
+			__func__);
+		return -EIO;
+	}
+
+	dev_dbg(pf->dev, "%s: device returned client_id %d for %s\n",
+		__func__, ci, devname);
+
+	return ci;
+}
+EXPORT_SYMBOL_GPL(pds_client_register);
+
+/**
+ * pds_client_unregister - Unlink the client from the firmware
+ * @pf_pdev:	ptr to the PF's pci_dev (its drvdata is the PF's struct pdsc)
+ * @client_id:	id returned from pds_client_register()
+ *
+ * Return: 0 on success, or
+ *         negative for error
+ */
+int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id)
+{
+	union pds_core_adminq_comp comp = {};
+	union pds_core_adminq_cmd cmd = {};
+	struct pdsc *pf;
+	int err;
+
+	pf = pci_get_drvdata(pf_pdev);
+	if (pf->state)
+		return -ENXIO;
+
+	cmd.client_unreg.opcode = PDS_AQ_CMD_CLIENT_UNREG;
+	cmd.client_unreg.client_id = cpu_to_le16(client_id);
+
+	err = pdsc_adminq_post(pf, &cmd, &comp, false);
+	if (err)
+		dev_info(pf->dev, "unregister client_id %d failed, status %d: %pe\n",
+			 client_id, comp.status, ERR_PTR(err));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pds_client_unregister);
+
+/**
+ * pds_client_adminq_cmd - Process an adminq request for the client
+ * @padev:   ptr to the client device
+ * @req:     ptr to buffer with request
+ * @req_len: length of actual struct used for request
+ * @resp:    ptr to buffer where answer is to be copied
+ * @flags:   pds_core_adminq_flags; only PDS_AQ_FLAG_FASTPOLL is examined here
+ *
+ * Return: 0 on success, or
+ *         negative for error
+ *
+ * Client sends pointers to request and response buffers
+ * Core copies request data into pds_core_client_request_cmd
+ * Core sets other fields as needed
+ * Core posts to AdminQ
+ * Core copies completion data into response buffer
+ */
+int pds_client_adminq_cmd(struct pds_auxiliary_dev *padev,
+			  union pds_core_adminq_cmd *req,
+			  size_t req_len,
+			  union pds_core_adminq_comp *resp,
+			  u64 flags)
+{
+	union pds_core_adminq_cmd cmd = {};
+	struct pci_dev *pf_pdev;
+	struct pdsc *pf;
+	size_t cp_len;
+	int err;
+
+	pf_pdev = pci_physfn(padev->vf_pdev);
+	pf = pci_get_drvdata(pf_pdev);
+
+	dev_dbg(pf->dev, "%s: %s opcode %d\n",
+		__func__, dev_name(&padev->aux_dev.dev), req->opcode);
+
+	if (pf->state)
+		return -ENXIO;
+
+	/* Wrap the client's request, truncated to fit client_cmd */
+	cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD;
+	cmd.client_request.client_id = cpu_to_le16(padev->client_id);
+	cp_len = min_t(size_t, req_len, sizeof(cmd.client_request.client_cmd));
+	memcpy(cmd.client_request.client_cmd, req, cp_len);
+
+	err = pdsc_adminq_post(pf, &cmd, resp,
+			       !!(flags & PDS_AQ_FLAG_FASTPOLL));
+	if (err && err != -EAGAIN)
+		dev_info(pf->dev, "client admin cmd failed: %pe\n",
+			 ERR_PTR(err));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pds_client_adminq_cmd);
+
 static void pdsc_auxbus_dev_release(struct device *dev)
 {
 	struct pds_auxiliary_dev *padev =
@@ -16,6 +146,7 @@ static void pdsc_auxbus_dev_release(struct device *dev)
 
 static struct pds_auxiliary_dev *pdsc_auxbus_dev_register(struct pdsc *cf,
 							  struct pdsc *pf,
+							  u16 client_id,
 							  char *name)
 {
 	struct auxiliary_device *aux_dev;
@@ -27,6 +158,7 @@ static struct pds_auxiliary_dev *pdsc_auxbus_dev_register(struct pdsc *cf,
 		return ERR_PTR(-ENOMEM);
 
 	padev->vf_pdev = cf->pdev;
+	padev->client_id = client_id;
 
 	aux_dev = &padev->aux_dev;
 	aux_dev->name = name;
@@ -66,8 +198,10 @@ int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf)
 
 	padev = pf->vfs[cf->vf_id].padev;
 	if (padev) {
+		pds_client_unregister(pf->pdev, padev->client_id);
 		auxiliary_device_delete(&padev->aux_dev);
 		auxiliary_device_uninit(&padev->aux_dev);
+		padev->client_id = 0;
 	}
 	pf->vfs[cf->vf_id].padev = NULL;
 
@@ -79,7 +213,9 @@ int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf)
 {
 	struct pds_auxiliary_dev *padev;
 	enum pds_core_vif_types vt;
+	char devname[PDS_DEVNAME_LEN];
 	u16 vt_support;
+	int client_id;
 	int err = 0;
 
 	mutex_lock(&pf->config_lock);
@@ -101,9 +237,22 @@ int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf)
 	      pf->viftype_status[vt].enabled))
 		goto out_unlock;
 
-	padev = pdsc_auxbus_dev_register(cf, pf,
+	/* Need to register with FW and get the client_id before
+	 * creating the aux device so that the aux client can run
+	 * adminq commands as part of its probe
+	 */
+	snprintf(devname, sizeof(devname), "%s.%s.%d",
+		 PDS_CORE_DRV_NAME, pf->viftype_status[vt].name, cf->uid);
+	client_id = pds_client_register(pf->pdev, devname);
+	if (client_id < 0) {
+		err = client_id;
+		goto out_unlock;
+	}
+
+	padev = pdsc_auxbus_dev_register(cf, pf, client_id,
 					 pf->viftype_status[vt].name);
 	if (IS_ERR(padev)) {
+		pds_client_unregister(pf->pdev, client_id);
 		err = PTR_ERR(padev);
 		goto out_unlock;
 	}
diff --git a/include/linux/pds/pds_auxbus.h b/include/linux/pds/pds_auxbus.h
index 493f75b1995e..214ef12302d0 100644
--- a/include/linux/pds/pds_auxbus.h
+++ b/include/linux/pds/pds_auxbus.h
@@ -11,4 +11,10 @@ struct pds_auxiliary_dev {
 	struct pci_dev *vf_pdev;
 	u16 client_id;
 };
+
+int pds_client_adminq_cmd(struct pds_auxiliary_dev *padev,
+			  union pds_core_adminq_cmd *req,
+			  size_t req_len,
+			  union pds_core_adminq_comp *resp,
+			  u64 flags);
 #endif /* _PDSC_AUXBUS_H_ */
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 961b3d02c69f..4b37675fde3e 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -61,4 +61,6 @@ enum pds_core_logical_qtype {
 };
 
 void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
+int pds_client_register(struct pci_dev *pf_pdev, char *devname);
+int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id);
 #endif /* _PDS_COMMON_H_ */
-- 
cgit v1.2.3


From d24c28278a01dc4c80d1470533c667cf406f0e88 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@amd.com>
Date: Wed, 19 Apr 2023 10:04:26 -0700
Subject: pds_core: publish events to the clients

When the Core device gets an event from the device, or notices
the device FW to be up or down, it needs to send those events
on to the clients that have an event handler.  Add the code to
pass along the events to the clients.

The entry points pdsc_register_notify() and pdsc_unregister_notify()
are EXPORTed for other drivers that want to listen for these events.

Signed-off-by: Shannon Nelson <shannon.nelson@amd.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/amd/pds_core/adminq.c |  2 ++
 drivers/net/ethernet/amd/pds_core/core.c   | 32 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/amd/pds_core/core.h   |  3 +++
 include/linux/pds/pds_common.h             |  2 ++
 4 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c
index fb2ba3f62480..045fe133f6ee 100644
--- a/drivers/net/ethernet/amd/pds_core/adminq.c
+++ b/drivers/net/ethernet/amd/pds_core/adminq.c
@@ -29,11 +29,13 @@ static int pdsc_process_notifyq(struct pdsc_qcq *qcq)
 		case PDS_EVENT_LINK_CHANGE:
 			dev_info(pdsc->dev, "NotifyQ LINK_CHANGE ecode %d eid %lld\n",
 				 ecode, eid);
+			pdsc_notify(PDS_EVENT_LINK_CHANGE, comp);
 			break;
 
 		case PDS_EVENT_RESET:
 			dev_info(pdsc->dev, "NotifyQ RESET ecode %d eid %lld\n",
 				 ecode, eid);
+			pdsc_notify(PDS_EVENT_RESET, comp);
 			break;
 
 		case PDS_EVENT_XCVR:
diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c
index b2fca3b99f02..483a070d96fa 100644
--- a/drivers/net/ethernet/amd/pds_core/core.c
+++ b/drivers/net/ethernet/amd/pds_core/core.c
@@ -6,6 +6,25 @@
 
 #include "core.h"
 
+static BLOCKING_NOTIFIER_HEAD(pds_notify_chain);
+
+int pdsc_register_notify(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&pds_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(pdsc_register_notify);
+
+void pdsc_unregister_notify(struct notifier_block *nb)
+{
+	blocking_notifier_chain_unregister(&pds_notify_chain, nb);
+}
+EXPORT_SYMBOL_GPL(pdsc_unregister_notify);
+
+void pdsc_notify(unsigned long event, void *data)
+{
+	blocking_notifier_call_chain(&pds_notify_chain, event, data);
+}
+
 void pdsc_intr_free(struct pdsc *pdsc, int index)
 {
 	struct pdsc_intr_info *intr_info;
@@ -494,12 +513,19 @@ void pdsc_stop(struct pdsc *pdsc)
 
 static void pdsc_fw_down(struct pdsc *pdsc)
 {
+	union pds_core_notifyq_comp reset_event = {
+		.reset.ecode = cpu_to_le16(PDS_EVENT_RESET),
+		.reset.state = 0,
+	};
+
 	if (test_and_set_bit(PDSC_S_FW_DEAD, &pdsc->state)) {
 		dev_err(pdsc->dev, "%s: already happening\n", __func__);
 		return;
 	}
 
+	/* Notify clients of fw_down */
 	devlink_health_report(pdsc->fw_reporter, "FW down reported", pdsc);
+	pdsc_notify(PDS_EVENT_RESET, &reset_event);
 
 	pdsc_stop(pdsc);
 	pdsc_teardown(pdsc, PDSC_TEARDOWN_RECOVERY);
@@ -507,6 +533,10 @@ static void pdsc_fw_down(struct pdsc *pdsc)
 
 static void pdsc_fw_up(struct pdsc *pdsc)
 {
+	union pds_core_notifyq_comp reset_event = {
+		.reset.ecode = cpu_to_le16(PDS_EVENT_RESET),
+		.reset.state = 1,
+	};
 	int err;
 
 	if (!test_bit(PDSC_S_FW_DEAD, &pdsc->state)) {
@@ -522,9 +552,11 @@ static void pdsc_fw_up(struct pdsc *pdsc)
 	if (err)
 		goto err_out;
 
+	/* Notify clients of fw_up */
 	pdsc->fw_recoveries++;
 	devlink_health_reporter_state_update(pdsc->fw_reporter,
 					     DEVLINK_HEALTH_REPORTER_STATE_HEALTHY);
+	pdsc_notify(PDS_EVENT_RESET, &reset_event);
 
 	return;
 
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index 9e01a9ee6868..e545fafc4819 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -297,6 +297,9 @@ int pdsc_start(struct pdsc *pdsc);
 void pdsc_stop(struct pdsc *pdsc);
 void pdsc_health_thread(struct work_struct *work);
 
+int pdsc_register_notify(struct notifier_block *nb);
+void pdsc_unregister_notify(struct notifier_block *nb);
+void pdsc_notify(unsigned long event, void *data);
 int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf);
 int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf);
 
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 4b37675fde3e..060331486d50 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -60,6 +60,8 @@ enum pds_core_logical_qtype {
 	PDS_CORE_QTYPE_MAX     = 16   /* don't change - used in struct size */
 };
 
+int pdsc_register_notify(struct notifier_block *nb);
+void pdsc_unregister_notify(struct notifier_block *nb);
 void *pdsc_get_pf_struct(struct pci_dev *vf_pdev);
 int pds_client_register(struct pci_dev *pf_pdev, char *devname);
 int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id);
-- 
cgit v1.2.3


From dfc39d4026fb2432363c0f77543c4cf3adca4c7b Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <henry.tjf@antgroup.com>
Date: Wed, 19 Apr 2023 15:24:16 +0800
Subject: net/packet: support mergeable feature of virtio

Packet sockets, like tap, can be used as the backend for kernel vhost.
In packet sockets, virtio net header size is currently hardcoded to be
the size of struct virtio_net_hdr, which is 10 bytes; however, it is not
always the case: some virtio features, such as mrg_rxbuf, need virtio
net header to be 12-byte long.

Mergeable buffers, as a virtio feature, is worthy of supporting: packets
that are larger than one-mbuf size will be dropped in vhost worker's
handle_rx if mrg_rxbuf feature is not used, but large packets
cannot be avoided and increasing mbuf's size is not economical.

With this virtio feature enabled by virtio-user, packet sockets with a
hardcoded 10-byte virtio net header will parse the mac header incorrectly
in packet_snd by taking the last two bytes of the virtio net header as
part of the mac header.
This incorrect mac header parsing will cause the packet to be dropped by
the invalid ether header check in the underlying device's receive path.

By adding extra field vnet_hdr_sz with utilizing holes in struct
packet_sock to record currently used virtio net header size and supporting
extra sockopt PACKET_VNET_HDR_SZ to set specified vnet_hdr_sz, packet
sockets can know the exact length of virtio net header that virtio user
gives.
In packet_snd, tpacket_snd and packet_recvmsg, instead of using
hardcoded virtio net header size, it can get the exact vnet_hdr_sz from
corresponding packet_sock, and parse mac header correctly based on this
information to avoid the packets being mistakenly dropped.

Signed-off-by: Jianfeng Tan <henry.tjf@antgroup.com>
Co-developed-by: Anqi Shen <amy.saq@antgroup.com>
Signed-off-by: Anqi Shen <amy.saq@antgroup.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_packet.h |  1 +
 net/packet/af_packet.c         | 95 +++++++++++++++++++++++++-----------------
 net/packet/diag.c              |  2 +-
 net/packet/internal.h          |  2 +-
 4 files changed, 60 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 78c981d6a9d4..9efc42382fdb 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -59,6 +59,7 @@ struct sockaddr_ll {
 #define PACKET_ROLLOVER_STATS		21
 #define PACKET_FANOUT_DATA		22
 #define PACKET_IGNORE_OUTGOING		23
+#define PACKET_VNET_HDR_SZ		24
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 568f8d76e3c1..6080c0db0814 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2090,18 +2090,18 @@ static unsigned int run_filter(struct sk_buff *skb,
 }
 
 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
-			   size_t *len)
+			   size_t *len, int vnet_hdr_sz)
 {
-	struct virtio_net_hdr vnet_hdr;
+	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
 
-	if (*len < sizeof(vnet_hdr))
+	if (*len < vnet_hdr_sz)
 		return -EINVAL;
-	*len -= sizeof(vnet_hdr);
+	*len -= vnet_hdr_sz;
 
-	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
+	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
 		return -EINVAL;
 
-	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
 }
 
 /*
@@ -2250,7 +2250,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	__u32 ts_status;
 	bool is_drop_n_account = false;
 	unsigned int slot_id = 0;
-	bool do_vnet = false;
+	int vnet_hdr_sz = 0;
 
 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 	 * We may add members to them until current aligned size without forcing
@@ -2308,10 +2308,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
 				       (maclen < 16 ? 16 : maclen)) +
 				       po->tp_reserve;
-		if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
-			netoff += sizeof(struct virtio_net_hdr);
-			do_vnet = true;
-		}
+		vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
+		if (vnet_hdr_sz)
+			netoff += vnet_hdr_sz;
 		macoff = netoff - maclen;
 	}
 	if (netoff > USHRT_MAX) {
@@ -2337,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			snaplen = po->rx_ring.frame_size - macoff;
 			if ((int)snaplen < 0) {
 				snaplen = 0;
-				do_vnet = false;
+				vnet_hdr_sz = 0;
 			}
 		}
 	} else if (unlikely(macoff + snaplen >
@@ -2351,7 +2350,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (unlikely((int)snaplen < 0)) {
 			snaplen = 0;
 			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
-			do_vnet = false;
+			vnet_hdr_sz = 0;
 		}
 	}
 	spin_lock(&sk->sk_receive_queue.lock);
@@ -2367,7 +2366,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		__set_bit(slot_id, po->rx_ring.rx_owner_map);
 	}
 
-	if (do_vnet &&
+	if (vnet_hdr_sz &&
 	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
 				    sizeof(struct virtio_net_hdr),
 				    vio_le(), true, 0)) {
@@ -2551,16 +2550,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
 }
 
 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
-				 struct virtio_net_hdr *vnet_hdr)
+				 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
 {
-	if (*len < sizeof(*vnet_hdr))
+	int ret;
+
+	if (*len < vnet_hdr_sz)
 		return -EINVAL;
-	*len -= sizeof(*vnet_hdr);
+	*len -= vnet_hdr_sz;
 
 	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
 		return -EFAULT;
 
-	return __packet_snd_vnet_parse(vnet_hdr, *len);
+	ret = __packet_snd_vnet_parse(vnet_hdr, *len);
+	if (ret)
+		return ret;
+
+	/* move iter to point to the start of mac header */
+	if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
+		iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
+
+	return 0;
 }
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
@@ -2722,6 +2731,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	void *ph;
 	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
 	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
 	unsigned char *addr = NULL;
 	int tp_len, size_max;
 	void *data;
@@ -2779,8 +2789,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-	if ((size_max > dev->mtu + reserve + VLAN_HLEN) &&
-	    !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
+	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	reinit_completion(&po->skb_completion);
@@ -2809,10 +2818,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		status = TP_STATUS_SEND_REQUEST;
 		hlen = LL_RESERVED_SPACE(dev);
 		tlen = dev->needed_tailroom;
-		if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
+		if (vnet_hdr_sz) {
 			vnet_hdr = data;
-			data += sizeof(*vnet_hdr);
-			tp_len -= sizeof(*vnet_hdr);
+			data += vnet_hdr_sz;
+			tp_len -= vnet_hdr_sz;
 			if (tp_len < 0 ||
 			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
 				tp_len = -EINVAL;
@@ -2837,7 +2846,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 					  addr, hlen, copylen, &sockc);
 		if (likely(tp_len >= 0) &&
 		    tp_len > dev->mtu + reserve &&
-		    !packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR) &&
+		    !vnet_hdr_sz &&
 		    !packet_extra_vlan_len_allowed(dev, skb))
 			tp_len = -EMSGSIZE;
 
@@ -2856,7 +2865,7 @@ tpacket_error:
 			}
 		}
 
-		if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
+		if (vnet_hdr_sz) {
 			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
 				tp_len = -EINVAL;
 				goto tpacket_error;
@@ -2946,7 +2955,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int offset = 0;
 	struct packet_sock *po = pkt_sk(sk);
-	bool has_vnet_hdr = false;
+	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
 	int hlen, tlen, linear;
 	int extra_len = 0;
 
@@ -2990,11 +2999,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	if (sock->type == SOCK_RAW)
 		reserve = dev->hard_header_len;
-	if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR)) {
-		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+	if (vnet_hdr_sz) {
+		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
 		if (err)
 			goto out_unlock;
-		has_vnet_hdr = true;
 	}
 
 	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -3064,11 +3072,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	packet_parse_headers(skb, sock);
 
-	if (has_vnet_hdr) {
+	if (vnet_hdr_sz) {
 		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
 		if (err)
 			goto out_free;
-		len += sizeof(vnet_hdr);
+		len += vnet_hdr_sz;
 		virtio_net_hdr_set_proto(skb, &vnet_hdr);
 	}
 
@@ -3408,7 +3416,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
 	int copied, err;
-	int vnet_hdr_len = 0;
+	int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
 	unsigned int origlen = 0;
 
 	err = -EINVAL;
@@ -3449,11 +3457,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	packet_rcv_try_clear_pressure(pkt_sk(sk));
 
-	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_HAS_VNET_HDR)) {
-		err = packet_rcv_vnet(msg, skb, &len);
+	if (vnet_hdr_len) {
+		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
 		if (err)
 			goto out_free;
-		vnet_hdr_len = sizeof(struct virtio_net_hdr);
 	}
 
 	/* You lose any data beyond the buffer you gave. If it worries
@@ -3915,8 +3922,9 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		return 0;
 	}
 	case PACKET_VNET_HDR:
+	case PACKET_VNET_HDR_SZ:
 	{
-		int val;
+		int val, hdr_len;
 
 		if (sock->type != SOCK_RAW)
 			return -EINVAL;
@@ -3925,11 +3933,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
+		if (optname == PACKET_VNET_HDR_SZ) {
+			if (val && val != sizeof(struct virtio_net_hdr) &&
+			    val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
+				return -EINVAL;
+			hdr_len = val;
+		} else {
+			hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
+		}
 		lock_sock(sk);
 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
 			ret = -EBUSY;
 		} else {
-			packet_sock_flag_set(po, PACKET_SOCK_HAS_VNET_HDR, val);
+			WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
 			ret = 0;
 		}
 		release_sock(sk);
@@ -4062,7 +4078,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
 		break;
 	case PACKET_VNET_HDR:
-		val = packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR);
+		val = !!READ_ONCE(po->vnet_hdr_sz);
+		break;
+	case PACKET_VNET_HDR_SZ:
+		val = READ_ONCE(po->vnet_hdr_sz);
 		break;
 	case PACKET_VERSION:
 		val = po->tp_version;
diff --git a/net/packet/diag.c b/net/packet/diag.c
index de4ced5cf3e8..d0c4eda4cdc6 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -27,7 +27,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
 		pinfo.pdi_flags |= PDI_AUXDATA;
 	if (packet_sock_flag(po, PACKET_SOCK_ORIGDEV))
 		pinfo.pdi_flags |= PDI_ORIGDEV;
-	if (packet_sock_flag(po, PACKET_SOCK_HAS_VNET_HDR))
+	if (READ_ONCE(po->vnet_hdr_sz))
 		pinfo.pdi_flags |= PDI_VNETHDR;
 	if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS))
 		pinfo.pdi_flags |= PDI_LOSS;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 27930f69f368..63f4865202c1 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -118,6 +118,7 @@ struct packet_sock {
 	struct mutex		pg_vec_lock;
 	unsigned long		flags;
 	int			ifindex;	/* bound device		*/
+	u8			vnet_hdr_sz;
 	__be16			num;
 	struct packet_rollover	*rollover;
 	struct packet_mclist	*mclist;
@@ -139,7 +140,6 @@ enum packet_sock_flags {
 	PACKET_SOCK_AUXDATA,
 	PACKET_SOCK_TX_HAS_OFF,
 	PACKET_SOCK_TP_LOSS,
-	PACKET_SOCK_HAS_VNET_HDR,
 	PACKET_SOCK_RUNNING,
 	PACKET_SOCK_PRESSURE,
 	PACKET_SOCK_QDISC_BYPASS,
-- 
cgit v1.2.3


From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:54 +0200
Subject: bpf: add bpf_link support for BPF_NETFILTER programs

Add bpf_link support skeleton.  To keep this reviewable, no bpf program
can be invoked yet: if a program is attached, only a C stub is called and
not the actual bpf program.

Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig.

Uapi example usage:
	union bpf_attr attr = { };

	attr.link_create.prog_fd = progfd;
	attr.link_create.attach_type = 0; /* unused */
	attr.link_create.netfilter.pf = PF_INET;
	attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN;
	attr.link_create.netfilter.priority = -128;

	err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr));

... this would attach progfd to ipv4:input hook.

Such hook gets removed automatically if the calling program exits.

BPF_NETFILTER program invocation is added in followup change.

The NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook.  It
allows telling userspace which program is attached at the given hook
when the user runs the 'nft hook list' command, rather than just the
priority and a not-very-helpful 'this hook runs a bpf prog but I can't
tell which one'.

Will also be used to disallow registration of two bpf programs with
same priority in a followup patch.

v4: arm32 cmpxchg only supports 32bit operand
    s/prio/priority/
v3: restrict prog attachment to ip/ip6 for now; let's lift restrictions if
    more use cases pop up (arptables, ebtables, netdev ingress/egress etc).

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netfilter.h           |   1 +
 include/net/netfilter/nf_bpf_link.h |  10 +++
 include/uapi/linux/bpf.h            |  14 ++++
 kernel/bpf/syscall.c                |   6 ++
 net/netfilter/Kconfig               |   3 +
 net/netfilter/Makefile              |   1 +
 net/netfilter/nf_bpf_link.c         | 159 ++++++++++++++++++++++++++++++++++++
 7 files changed, 194 insertions(+)
 create mode 100644 include/net/netfilter/nf_bpf_link.h
 create mode 100644 net/netfilter/nf_bpf_link.c

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index c8e03bcaecaa..0762444e3767 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv,
 enum nf_hook_ops_type {
 	NF_HOOK_OP_UNDEFINED,
 	NF_HOOK_OP_NF_TABLES,
+	NF_HOOK_OP_BPF,
 };
 
 struct nf_hook_ops {
diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h
new file mode 100644
index 000000000000..eeaeaf3d15de
--- /dev/null
+++ b/include/net/netfilter/nf_bpf_link.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK)
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
+#else
+static inline int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+#endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4b20a7269bee..1bb11a6ee667 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_NETFILTER,
 };
 
 enum bpf_attach_type {
@@ -1050,6 +1051,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_PERF_EVENT = 7,
 	BPF_LINK_TYPE_KPROBE_MULTI = 8,
 	BPF_LINK_TYPE_STRUCT_OPS = 9,
+	BPF_LINK_TYPE_NETFILTER = 10,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -1560,6 +1562,12 @@ union bpf_attr {
 				 */
 				__u64		cookie;
 			} tracing;
+			struct {
+				__u32		pf;
+				__u32		hooknum;
+				__s32		priority;
+				__u32		flags;
+			} netfilter;
 		};
 	} link_create;
 
@@ -6410,6 +6418,12 @@ struct bpf_link_info {
 		struct {
 			__u32 map_id;
 		} struct_ops;
+		struct {
+			__u32 pf;
+			__u32 hooknum;
+			__s32 priority;
+			__u32 flags;
+		} netfilter;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bcf1a1920ddd..14f39c1e573e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
+#include <net/netfilter/nf_bpf_link.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -2462,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
+	case BPF_PROG_TYPE_NETFILTER:
 		return true;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		/* always unpriv */
@@ -4588,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 
 	switch (prog->type) {
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_NETFILTER:
 		break;
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
@@ -4654,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_NETFILTER:
+		ret = bpf_nf_link_attach(attr, prog);
+		break;
 #endif
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index d0bf630482c1..441d1f134110 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE
 config NETFILTER_FAMILY_ARP
 	bool
 
+config NETFILTER_BPF_LINK
+	def_bool BPF_SYSCALL
+
 config NETFILTER_NETLINK_HOOK
 	tristate "Netfilter base hook dump support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5ffef1cd6143..d4958e7e7631 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -22,6 +22,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
 endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
new file mode 100644
index 000000000000..efa4f3390742
--- /dev/null
+++ b/net/netfilter/nf_bpf_link.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+#include <uapi/linux/netfilter_ipv4.h>
+
+static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
+				    const struct nf_hook_state *s)
+{
+	return NF_ACCEPT;
+}
+
+struct bpf_nf_link {
+	struct bpf_link link;
+	struct nf_hook_ops hook_ops;
+	struct net *net;
+	u32 dead;
+};
+
+static void bpf_nf_link_release(struct bpf_link *link)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	if (nf_link->dead)
+		return;
+
+	/* prevent hook-not-found warning splat from netfilter core when
+	 * .detach was already called
+	 */
+	if (!cmpxchg(&nf_link->dead, 0, 1))
+		nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+}
+
+static void bpf_nf_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	kfree(nf_link);
+}
+
+static int bpf_nf_link_detach(struct bpf_link *link)
+{
+	bpf_nf_link_release(link);
+	return 0;
+}
+
+static void bpf_nf_link_show_info(const struct bpf_link *link,
+				  struct seq_file *seq)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
+		   nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
+		   nf_link->hook_ops.priority);
+}
+
+static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
+				      struct bpf_link_info *info)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	info->netfilter.pf = nf_link->hook_ops.pf;
+	info->netfilter.hooknum = nf_link->hook_ops.hooknum;
+	info->netfilter.priority = nf_link->hook_ops.priority;
+	info->netfilter.flags = 0;
+
+	return 0;
+}
+
+static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+			      struct bpf_prog *old_prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct bpf_link_ops bpf_nf_link_lops = {
+	.release = bpf_nf_link_release,
+	.dealloc = bpf_nf_link_dealloc,
+	.detach = bpf_nf_link_detach,
+	.show_fdinfo = bpf_nf_link_show_info,
+	.fill_link_info = bpf_nf_link_fill_link_info,
+	.update_prog = bpf_nf_link_update,
+};
+
+static int bpf_nf_check_pf_and_hooks(const union bpf_attr *attr)
+{
+	switch (attr->link_create.netfilter.pf) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+		if (attr->link_create.netfilter.hooknum >= NF_INET_NUMHOOKS)
+			return -EPROTO;
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	if (attr->link_create.netfilter.flags)
+		return -EOPNOTSUPP;
+
+	/* make sure conntrack confirm is always last.
+	 *
+	 * In the future, if userspace can e.g. request defrag, then
+	 * "defrag_requested && prio before NF_IP_PRI_CONNTRACK_DEFRAG"
+	 * should fail.
+	 */
+	switch (attr->link_create.netfilter.priority) {
+	case NF_IP_PRI_FIRST: return -ERANGE; /* sabotage_in and other warts */
+	case NF_IP_PRI_LAST: return -ERANGE; /* e.g. conntrack confirm */
+	}
+
+	return 0;
+}
+
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_link_primer link_primer;
+	struct bpf_nf_link *link;
+	int err;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	err = bpf_nf_check_pf_and_hooks(attr);
+	if (err)
+		return err;
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog);
+
+	link->hook_ops.hook = nf_hook_run_bpf;
+	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
+	link->hook_ops.priv = prog;
+
+	link->hook_ops.pf = attr->link_create.netfilter.pf;
+	link->hook_ops.priority = attr->link_create.netfilter.priority;
+	link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
+
+	link->net = net;
+	link->dead = false;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	err = nf_register_net_hook(net, &link->hook_ops);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		return err;
+	}
+
+	return bpf_link_settle(&link_primer);
+}
-- 
cgit v1.2.3


From fd9c663b9ad67dedfc9a3fd3429ddd3e83782b4d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:55 +0200
Subject: bpf: minimal support for programs hooked into netfilter framework

This adds minimal support for BPF_PROG_TYPE_NETFILTER bpf programs
that will be invoked via the NF_HOOK() points in the ip stack.

Invocation incurs an indirect call.  This is not a necessity: it's
possible to add 'DEFINE_BPF_DISPATCHER(nf_progs)' and handle the
program invocation with the same method already used for xdp progs.

This isn't done here to keep the size of this chunk down.

Verifier restricts verdicts to either DROP or ACCEPT.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-3-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_types.h           |  4 +++
 include/net/netfilter/nf_bpf_link.h |  5 +++
 kernel/bpf/btf.c                    |  6 ++++
 kernel/bpf/verifier.c               |  3 ++
 net/core/filter.c                   |  1 +
 net/netfilter/nf_bpf_link.c         | 70 ++++++++++++++++++++++++++++++++++++-
 6 files changed, 88 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d4ee3ccd3753..39a999abb0ce 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -79,6 +79,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 #endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 	      void *, void *)
+#ifdef CONFIG_NETFILTER
+BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
+	      struct bpf_nf_ctx, struct bpf_nf_ctx)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/net/netfilter/nf_bpf_link.h b/include/net/netfilter/nf_bpf_link.h
index eeaeaf3d15de..6c984b0ea838 100644
--- a/include/net/netfilter/nf_bpf_link.h
+++ b/include/net/netfilter/nf_bpf_link.h
@@ -1,5 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
+struct bpf_nf_ctx {
+	const struct nf_hook_state *state;
+	struct sk_buff *skb;
+};
+
 #if IS_ENABLED(CONFIG_NETFILTER_BPF_LINK)
 int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7db4ec125fbd..6b682b8e4b50 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,9 @@
 #include <linux/bsearch.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
 #include <net/sock.h>
 #include "../tools/lib/bpf/relo_core.h"
 
@@ -212,6 +215,7 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_SK_SKB,
 	BTF_KFUNC_HOOK_SOCKET_FILTER,
 	BTF_KFUNC_HOOK_LWT,
+	BTF_KFUNC_HOOK_NETFILTER,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -7802,6 +7806,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_LWT_XMIT:
 	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
 		return BTF_KFUNC_HOOK_LWT;
+	case BPF_PROG_TYPE_NETFILTER:
+		return BTF_KFUNC_HOOK_NETFILTER;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1e05355facdc..fc7281d39e46 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13816,6 +13816,9 @@ static int check_return_code(struct bpf_verifier_env *env)
 		}
 		break;
 
+	case BPF_PROG_TYPE_NETFILTER:
+		range = tnum_range(NF_DROP, NF_ACCEPT);
+		break;
 	case BPF_PROG_TYPE_EXT:
 		/* freplace program can return anything as its return value
 		 * depends on the to-be-replaced kernel func or bpf program.
diff --git a/net/core/filter.c b/net/core/filter.c
index 44fb997434ad..d9ce04ca22ce 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -11717,6 +11717,7 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
 }
 late_initcall(bpf_kfunc_init);
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index efa4f3390742..49cfc5215386 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/bpf.h>
+#include <linux/filter.h>
 #include <linux/netfilter.h>
 
 #include <net/netfilter/nf_bpf_link.h>
@@ -8,7 +9,13 @@
 static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,
 				    const struct nf_hook_state *s)
 {
-	return NF_ACCEPT;
+	const struct bpf_prog *prog = bpf_prog;
+	struct bpf_nf_ctx ctx = {
+		.state = s,
+		.skb = skb,
+	};
+
+	return bpf_prog_run(prog, &ctx);
 }
 
 struct bpf_nf_link {
@@ -157,3 +164,64 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 
 	return bpf_link_settle(&link_primer);
 }
+
+const struct bpf_prog_ops netfilter_prog_ops = {
+};
+
+static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
+{
+	struct btf *btf;
+	s32 type_id;
+
+	btf = bpf_get_btf_vmlinux();
+	if (IS_ERR_OR_NULL(btf))
+		return false;
+
+	type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+	if (WARN_ON_ONCE(type_id < 0))
+		return false;
+
+	info->btf = btf;
+	info->btf_id = type_id;
+	info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+	return true;
+}
+
+static bool nf_is_valid_access(int off, int size, enum bpf_access_type type,
+			       const struct bpf_prog *prog,
+			       struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(struct bpf_nf_ctx))
+		return false;
+
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_nf_ctx, skb):
+		if (size != sizeof_field(struct bpf_nf_ctx, skb))
+			return false;
+
+		return nf_ptr_to_btf_id(info, "sk_buff");
+	case bpf_ctx_range(struct bpf_nf_ctx, state):
+		if (size != sizeof_field(struct bpf_nf_ctx, state))
+			return false;
+
+		return nf_ptr_to_btf_id(info, "nf_hook_state");
+	default:
+		return false;
+	}
+
+	return false;
+}
+
+static const struct bpf_func_proto *
+bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id);
+}
+
+const struct bpf_verifier_ops netfilter_verifier_ops = {
+	.is_valid_access	= nf_is_valid_access,
+	.get_func_proto		= bpf_nf_func_proto,
+};
-- 
cgit v1.2.3


From 506a74db7e019a277e987fa65654bdd953859d5b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:56 +0200
Subject: netfilter: nfnetlink hook: dump bpf prog id

This allows userspace ("nft list hooks") to show which bpf program
is attached to which hook.

Without this, the user only knows that bpf progs are attached at prio
x, y, z at INPUT and FORWARD, but can't tell which program is where.

v4: kdoc fixups (Simon Horman)

Link: https://lore.kernel.org/bpf/ZEELzpNCnYJuZyod@corigine.com/
Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-4-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/netfilter/nfnetlink_hook.h | 24 +++++++-
 net/netfilter/nfnetlink_hook.c                | 81 ++++++++++++++++++++++-----
 2 files changed, 89 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h
index bbcd285b22e1..84a561a74b98 100644
--- a/include/uapi/linux/netfilter/nfnetlink_hook.h
+++ b/include/uapi/linux/netfilter/nfnetlink_hook.h
@@ -32,8 +32,12 @@ enum nfnl_hook_attributes {
 /**
  * enum nfnl_hook_chain_info_attributes - chain description
  *
- * NFNLA_HOOK_INFO_DESC: nft chain and table name (enum nft_table_attributes) (NLA_NESTED)
- * NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32)
+ * @NFNLA_HOOK_INFO_DESC: nft chain and table name (NLA_NESTED)
+ * @NFNLA_HOOK_INFO_TYPE: chain type (enum nfnl_hook_chaintype) (NLA_U32)
+ *
+ * NFNLA_HOOK_INFO_DESC depends on NFNLA_HOOK_INFO_TYPE value:
+ *   NFNL_HOOK_TYPE_NFTABLES: enum nft_table_attributes
+ *   NFNL_HOOK_TYPE_BPF: enum nfnl_hook_bpf_attributes
  */
 enum nfnl_hook_chain_info_attributes {
 	NFNLA_HOOK_INFO_UNSPEC,
@@ -55,10 +59,24 @@ enum nfnl_hook_chain_desc_attributes {
 /**
  * enum nfnl_hook_chaintype - chain type
  *
- * @NFNL_HOOK_TYPE_NFTABLES nf_tables base chain
+ * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain
+ * @NFNL_HOOK_TYPE_BPF: bpf program
  */
 enum nfnl_hook_chaintype {
 	NFNL_HOOK_TYPE_NFTABLES = 0x1,
+	NFNL_HOOK_TYPE_BPF,
+};
+
+/**
+ * enum nfnl_hook_bpf_attributes - bpf prog description
+ *
+ * @NFNLA_HOOK_BPF_ID: bpf program id (NLA_U32)
+ */
+enum nfnl_hook_bpf_attributes {
+	NFNLA_HOOK_BPF_UNSPEC,
+	NFNLA_HOOK_BPF_ID,
+	__NFNLA_HOOK_BPF_MAX,
 };
+#define NFNLA_HOOK_BPF_MAX (__NFNLA_HOOK_BPF_MAX - 1)
 
 #endif /* _NFNL_HOOK_H */
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
index 8120aadf6a0f..ade8ee1988b1 100644
--- a/net/netfilter/nfnetlink_hook.c
+++ b/net/netfilter/nfnetlink_hook.c
@@ -5,6 +5,7 @@
  * Author: Florian Westphal <fw@strlen.de>
  */
 
+#include <linux/bpf.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kernel.h>
@@ -57,35 +58,76 @@ struct nfnl_dump_hook_data {
 	u8 hook;
 };
 
+static struct nlattr *nfnl_start_info_type(struct sk_buff *nlskb, enum nfnl_hook_chaintype t)
+{
+	struct nlattr *nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+	int ret;
+
+	if (!nest)
+		return NULL;
+
+	ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE, htonl(t));
+	if (ret == 0)
+		return nest;
+
+	nla_nest_cancel(nlskb, nest);
+	return NULL;
+}
+
+static int nfnl_hook_put_bpf_prog_info(struct sk_buff *nlskb,
+				       const struct nfnl_dump_hook_data *ctx,
+				       unsigned int seq,
+				       const struct bpf_prog *prog)
+{
+	struct nlattr *nest, *nest2;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_NETFILTER_BPF_LINK))
+		return 0;
+
+	if (WARN_ON_ONCE(!prog))
+		return 0;
+
+	nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_BPF);
+	if (!nest)
+		return -EMSGSIZE;
+
+	nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
+	if (!nest2)
+		goto cancel_nest;
+
+	ret = nla_put_be32(nlskb, NFNLA_HOOK_BPF_ID, htonl(prog->aux->id));
+	if (ret)
+		goto cancel_nest;
+
+	nla_nest_end(nlskb, nest2);
+	nla_nest_end(nlskb, nest);
+	return 0;
+
+cancel_nest:
+	nla_nest_cancel(nlskb, nest);
+	return -EMSGSIZE;
+}
+
 static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
 					const struct nfnl_dump_hook_data *ctx,
 					unsigned int seq,
-					const struct nf_hook_ops *ops)
+					struct nft_chain *chain)
 {
 	struct net *net = sock_net(nlskb->sk);
 	struct nlattr *nest, *nest2;
-	struct nft_chain *chain;
 	int ret = 0;
 
-	if (ops->hook_ops_type != NF_HOOK_OP_NF_TABLES)
-		return 0;
-
-	chain = ops->priv;
 	if (WARN_ON_ONCE(!chain))
 		return 0;
 
 	if (!nft_is_active(net, chain))
 		return 0;
 
-	nest = nla_nest_start(nlskb, NFNLA_HOOK_CHAIN_INFO);
+	nest = nfnl_start_info_type(nlskb, NFNL_HOOK_TYPE_NFTABLES);
 	if (!nest)
 		return -EMSGSIZE;
 
-	ret = nla_put_be32(nlskb, NFNLA_HOOK_INFO_TYPE,
-			   htonl(NFNL_HOOK_TYPE_NFTABLES));
-	if (ret)
-		goto cancel_nest;
-
 	nest2 = nla_nest_start(nlskb, NFNLA_HOOK_INFO_DESC);
 	if (!nest2)
 		goto cancel_nest;
@@ -171,7 +213,20 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb,
 	if (ret)
 		goto nla_put_failure;
 
-	ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops);
+	switch (ops->hook_ops_type) {
+	case NF_HOOK_OP_NF_TABLES:
+		ret = nfnl_hook_put_nft_chain_info(nlskb, ctx, seq, ops->priv);
+		break;
+	case NF_HOOK_OP_BPF:
+		ret = nfnl_hook_put_bpf_prog_info(nlskb, ctx, seq, ops->priv);
+		break;
+	case NF_HOOK_OP_UNDEFINED:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
 	if (ret)
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From 2b99ef22e0d237e08bfc437e7d051f78f352aeb2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 21 Apr 2023 19:02:59 +0200
Subject: bpf: add test_run support for netfilter program type

Add glue code so a bpf program can be run using userspace-provided
netfilter state and packet/skb.

The default is to use the ipv4:output hook point, but this can be
overridden by userspace.  Userspace-provided netfilter state is restricted:
only the hook and protocol family can be overridden, and only to ipv4/ipv6.

Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230421170300.24115-7-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h         |   3 +
 net/bpf/test_run.c          | 158 ++++++++++++++++++++++++++++++++++++++++++++
 net/netfilter/nf_bpf_link.c |   1 +
 3 files changed, 162 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 18b592fde896..e53ceee1df37 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2264,6 +2264,9 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
 int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 				const union bpf_attr *kattr,
 				union bpf_attr __user *uattr);
+int bpf_prog_test_run_nf(struct bpf_prog *prog,
+			 const union bpf_attr *kattr,
+			 union bpf_attr __user *uattr);
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f170e8a17974..e79e3a415ca9 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -19,7 +19,9 @@
 #include <linux/error-injection.h>
 #include <linux/smp.h>
 #include <linux/sock_diag.h>
+#include <linux/netfilter.h>
 #include <net/xdp.h>
+#include <net/netfilter/nf_bpf_link.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/bpf_test_run.h>
@@ -1691,6 +1693,162 @@ out:
 	return err;
 }
 
+static int verify_and_copy_hook_state(struct nf_hook_state *state,
+				      const struct nf_hook_state *user,
+				      struct net_device *dev)
+{
+	if (user->in || user->out)
+		return -EINVAL;
+
+	if (user->net || user->sk || user->okfn)
+		return -EINVAL;
+
+	switch (user->pf) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+		switch (state->hook) {
+		case NF_INET_PRE_ROUTING:
+			state->in = dev;
+			break;
+		case NF_INET_LOCAL_IN:
+			state->in = dev;
+			break;
+		case NF_INET_FORWARD:
+			state->in = dev;
+			state->out = dev;
+			break;
+		case NF_INET_LOCAL_OUT:
+			state->out = dev;
+			break;
+		case NF_INET_POST_ROUTING:
+			state->out = dev;
+			break;
+		}
+
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	state->pf = user->pf;
+	state->hook = user->hook;
+
+	return 0;
+}
+
+static __be16 nfproto_eth(int nfproto)
+{
+	switch (nfproto) {
+	case NFPROTO_IPV4:
+		return htons(ETH_P_IP);
+	case NFPROTO_IPV6:
+		break;
+	}
+
+	return htons(ETH_P_IPV6);
+}
+
+int bpf_prog_test_run_nf(struct bpf_prog *prog,
+			 const union bpf_attr *kattr,
+			 union bpf_attr __user *uattr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct net_device *dev = net->loopback_dev;
+	struct nf_hook_state *user_ctx, hook_state = {
+		.pf = NFPROTO_IPV4,
+		.hook = NF_INET_LOCAL_OUT,
+	};
+	u32 size = kattr->test.data_size_in;
+	u32 repeat = kattr->test.repeat;
+	struct bpf_nf_ctx ctx = {
+		.state = &hook_state,
+	};
+	struct sk_buff *skb = NULL;
+	u32 retval, duration;
+	void *data;
+	int ret;
+
+	if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+		return -EINVAL;
+
+	if (size < sizeof(struct iphdr))
+		return -EINVAL;
+
+	data = bpf_test_init(kattr, kattr->test.data_size_in, size,
+			     NET_SKB_PAD + NET_IP_ALIGN,
+			     SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	if (!repeat)
+		repeat = 1;
+
+	user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state));
+	if (IS_ERR(user_ctx)) {
+		kfree(data);
+		return PTR_ERR(user_ctx);
+	}
+
+	if (user_ctx) {
+		ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev);
+		if (ret)
+			goto out;
+	}
+
+	skb = slab_build_skb(data);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	data = NULL; /* data released via kfree_skb */
+
+	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+	__skb_put(skb, size);
+
+	ret = -EINVAL;
+
+	if (hook_state.hook != NF_INET_LOCAL_OUT) {
+		if (size < ETH_HLEN + sizeof(struct iphdr))
+			goto out;
+
+		skb->protocol = eth_type_trans(skb, dev);
+		switch (skb->protocol) {
+		case htons(ETH_P_IP):
+			if (hook_state.pf == NFPROTO_IPV4)
+				break;
+			goto out;
+		case htons(ETH_P_IPV6):
+			if (size < ETH_HLEN + sizeof(struct ipv6hdr))
+				goto out;
+			if (hook_state.pf == NFPROTO_IPV6)
+				break;
+			goto out;
+		default:
+			ret = -EPROTO;
+			goto out;
+		}
+
+		skb_reset_network_header(skb);
+	} else {
+		skb->protocol = nfproto_eth(hook_state.pf);
+	}
+
+	ctx.skb = skb;
+
+	ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false);
+	if (ret)
+		goto out;
+
+	ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration);
+
+out:
+	kfree(user_ctx);
+	kfree_skb(skb);
+	kfree(data);
+	return ret;
+}
+
 static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
 	.owner = THIS_MODULE,
 	.set   = &test_sk_check_kfunc_ids,
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
index 49cfc5215386..c36da56d756f 100644
--- a/net/netfilter/nf_bpf_link.c
+++ b/net/netfilter/nf_bpf_link.c
@@ -166,6 +166,7 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
 }
 
 const struct bpf_prog_ops netfilter_prog_ops = {
+	.test_run = bpf_prog_test_run_nf,
 };
 
 static bool nf_ptr_to_btf_id(struct bpf_insn_access_aux *info, const char *name)
-- 
cgit v1.2.3


From 63e9bbbcca60333490e13744ae736d8f988e4950 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 11 Apr 2023 16:29:47 +0200
Subject: netfilter: nf_tables: don't store chain address on jump

Now that the rule trailer/end marker and the rcu head reside in the
same structure, we no longer need to save/restore the chain pointer
when performing/returning from a jump.

We can simply let the trace infra walk the evaluated rule until it
hits the end marker and then fetch the chain pointer from there.

When the rule is NULL (policy tracing), then chain and basechain
pointers were already identical, so just use the basechain.

This cuts size of jumpstack in half, from 256 to 128 bytes in 64bit,
scripts/stackusage says:

nf_tables_core.c:251 nft_do_chain    328     static

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 14 ++++++++++++--
 net/netfilter/nf_tables_api.c     |  7 -------
 net/netfilter/nf_tables_core.c    | 21 ++++++---------------
 net/netfilter/nf_tables_trace.c   | 30 ++++++++++++++++++++++++++----
 4 files changed, 44 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 1b8e305bb54a..f476fd030626 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1046,6 +1046,18 @@ struct nft_rule_dp {
 		__attribute__((aligned(__alignof__(struct nft_expr))));
 };
 
+struct nft_rule_dp_last {
+	struct nft_rule_dp end;		/* end of nft_rule_blob marker */
+	struct rcu_head h;		/* call_rcu head */
+	struct nft_rule_blob *blob;	/* ptr to free via call_rcu */
+	const struct nft_chain *chain;	/* for nftables tracing */
+};
+
+static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule)
+{
+	return (void *)rule + sizeof(*rule) + rule->dlen;
+}
+
 struct nft_rule_blob {
 	unsigned long			size;
 	unsigned char			data[]
@@ -1396,7 +1408,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@pkt: pktinfo currently processed
  *	@basechain: base chain currently processed
- *	@chain: chain currently processed
  *	@rule:  rule that was evaluated
  *	@verdict: verdict given by rule
  */
@@ -1408,7 +1419,6 @@ struct nft_traceinfo {
 	u32				skbid;
 	const struct nft_pktinfo	*pkt;
 	const struct nft_base_chain	*basechain;
-	const struct nft_chain		*chain;
 	const struct nft_rule_dp	*rule;
 	const struct nft_verdict	*verdict;
 };
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 79848a27e640..0e1c86bb51a2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2110,13 +2110,6 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook)
 	module_put(hook->type->owner);
 }
 
-struct nft_rule_dp_last {
-	struct nft_rule_dp end;	/* end of nft_rule_blob marker */
-	struct rcu_head h;
-	struct nft_rule_blob *blob;
-	const struct nft_chain *chain;	/* for tracing */
-};
-
 static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
 {
 	struct nft_rule_dp_last *lrule;
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index ec3bab751092..89c05b64c2a2 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -42,13 +42,11 @@ static inline void nf_skip_indirect_calls_enable(void) { }
 #endif
 
 static noinline void __nft_trace_packet(struct nft_traceinfo *info,
-					const struct nft_chain *chain,
 					enum nft_trace_types type)
 {
 	if (!info->trace || !info->nf_trace)
 		return;
 
-	info->chain = chain;
 	info->type = type;
 
 	nft_trace_notify(info);
@@ -56,14 +54,13 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 				    struct nft_traceinfo *info,
-				    const struct nft_chain *chain,
 				    const struct nft_rule_dp *rule,
 				    enum nft_trace_types type)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
 		info->rule = rule;
-		__nft_trace_packet(info, chain, type);
+		__nft_trace_packet(info, type);
 	}
 }
 
@@ -111,7 +108,6 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
 }
 
 static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
-					 const struct nft_chain *chain,
 					 const struct nft_regs *regs)
 {
 	enum nft_trace_types type;
@@ -133,17 +129,16 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
 		break;
 	}
 
-	__nft_trace_packet(info, chain, type);
+	__nft_trace_packet(info, type);
 }
 
 static inline void nft_trace_verdict(struct nft_traceinfo *info,
-				     const struct nft_chain *chain,
 				     const struct nft_rule_dp *rule,
 				     const struct nft_regs *regs)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->rule = rule;
-		__nft_trace_verdict(info, chain, regs);
+		__nft_trace_verdict(info, regs);
 	}
 }
 
@@ -203,7 +198,6 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain,
 }
 
 struct nft_jumpstack {
-	const struct nft_chain *chain;
 	const struct nft_rule_dp *rule;
 };
 
@@ -247,7 +241,6 @@ indirect_call:
 #define nft_rule_expr_first(rule)	(struct nft_expr *)&rule->data[0]
 #define nft_rule_expr_next(expr)	((void *)expr) + expr->ops->size
 #define nft_rule_expr_last(rule)	(struct nft_expr *)&rule->data[rule->dlen]
-#define nft_rule_next(rule)		(void *)rule + sizeof(*rule) + rule->dlen
 
 #define nft_rule_dp_for_each_expr(expr, last, rule) \
         for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \
@@ -302,14 +295,14 @@ next_rule:
 			nft_trace_copy_nftrace(pkt, &info);
 			continue;
 		case NFT_CONTINUE:
-			nft_trace_packet(pkt, &info, chain, rule,
+			nft_trace_packet(pkt, &info, rule,
 					 NFT_TRACETYPE_RULE);
 			continue;
 		}
 		break;
 	}
 
-	nft_trace_verdict(&info, chain, rule, &regs);
+	nft_trace_verdict(&info, rule, &regs);
 
 	switch (regs.verdict.code & NF_VERDICT_MASK) {
 	case NF_ACCEPT:
@@ -323,7 +316,6 @@ next_rule:
 	case NFT_JUMP:
 		if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE))
 			return NF_DROP;
-		jumpstack[stackptr].chain = chain;
 		jumpstack[stackptr].rule = nft_rule_next(rule);
 		stackptr++;
 		fallthrough;
@@ -339,12 +331,11 @@ next_rule:
 
 	if (stackptr > 0) {
 		stackptr--;
-		chain = jumpstack[stackptr].chain;
 		rule = jumpstack[stackptr].rule;
 		goto next_rule;
 	}
 
-	nft_trace_packet(pkt, &info, basechain, NULL, NFT_TRACETYPE_POLICY);
+	nft_trace_packet(pkt, &info, NULL, NFT_TRACETYPE_POLICY);
 
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 1163ba9c1401..3d9b83d84a84 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -164,9 +164,29 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
 	return true;
 }
 
+static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *info)
+{
+	const struct nft_rule_dp *rule = info->rule;
+	const struct nft_rule_dp_last *last;
+
+	if (!rule)
+		return &info->basechain->chain;
+
+	while (!rule->is_last)
+		rule = nft_rule_next(rule);
+
+	last = (const struct nft_rule_dp_last *)rule;
+
+	if (WARN_ON_ONCE(!last->chain))
+		return &info->basechain->chain;
+
+	return last->chain;
+}
+
 void nft_trace_notify(struct nft_traceinfo *info)
 {
 	const struct nft_pktinfo *pkt = info->pkt;
+	const struct nft_chain *chain;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
 	unsigned int size;
@@ -176,9 +196,11 @@ void nft_trace_notify(struct nft_traceinfo *info)
 	if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
 		return;
 
+	chain = nft_trace_get_chain(info);
+
 	size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
-		nla_total_size(strlen(info->chain->table->name)) +
-		nla_total_size(strlen(info->chain->name)) +
+		nla_total_size(strlen(chain->table->name)) +
+		nla_total_size(strlen(chain->name)) +
 		nla_total_size_64bit(sizeof(__be64)) +	/* rule handle */
 		nla_total_size(sizeof(__be32)) +	/* trace type */
 		nla_total_size(0) +			/* VERDICT, nested */
@@ -217,10 +239,10 @@ void nft_trace_notify(struct nft_traceinfo *info)
 	if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid))
 		goto nla_put_failure;
 
-	if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name))
+	if (nla_put_string(skb, NFTA_TRACE_CHAIN, chain->name))
 		goto nla_put_failure;
 
-	if (nla_put_string(skb, NFTA_TRACE_TABLE, info->chain->table->name))
+	if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
 		goto nla_put_failure;
 
 	if (nf_trace_fill_rule_info(skb, info))
-- 
cgit v1.2.3


From 9a32e9850686599ed194ccdceb6cd3dd56b2d9b9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Apr 2023 17:13:19 +0200
Subject: netfilter: nf_tables: don't write table validation state without
 mutex

The ->cleanup callback needs to be removed: it doesn't work anymore, as
the transaction mutex is already released in the ->abort function.

Instead, reset the validation state right after a successful validation
pass; this happens from either the commit or the abort phase, where the
transaction mutex is held.

Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h | 1 -
 net/netfilter/nf_tables_api.c       | 8 ++------
 net/netfilter/nfnetlink.c           | 2 --
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 241e005f290a..e9a9ab34a7cc 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -45,7 +45,6 @@ struct nfnetlink_subsystem {
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb,
 		     enum nfnl_abort_action action);
-	void (*cleanup)(struct net *net);
 	bool (*valid_genid)(struct net *net, u32 genid);
 };
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0e1c86bb51a2..21eb273d2740 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -8639,6 +8639,8 @@ static int nf_tables_validate(struct net *net)
 			if (nft_table_validate(net, table) < 0)
 				return -EAGAIN;
 		}
+
+		nft_validate_state_update(net, NFT_VALIDATE_SKIP);
 		break;
 	}
 
@@ -9578,11 +9580,6 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 	return 0;
 }
 
-static void nf_tables_cleanup(struct net *net)
-{
-	nft_validate_state_update(net, NFT_VALIDATE_SKIP);
-}
-
 static int nf_tables_abort(struct net *net, struct sk_buff *skb,
 			   enum nfnl_abort_action action)
 {
@@ -9616,7 +9613,6 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.cb		= nf_tables_cb,
 	.commit		= nf_tables_commit,
 	.abort		= nf_tables_abort,
-	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 81c7737c803a..ae7146475d17 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -590,8 +590,6 @@ done:
 			goto replay_abort;
 		}
 	}
-	if (ss->cleanup)
-		ss->cleanup(net);
 
 	nfnl_err_deliver(&err_list, oskb);
 	kfree_skb(skb);
-- 
cgit v1.2.3


From 00c320f9b75560628e840bef027a27c746706759 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 13 Apr 2023 17:13:20 +0200
Subject: netfilter: nf_tables: make validation state per table

We only need to validate tables that saw changes in the current
transaction.

The existing code revalidates all tables, but this isn't needed as
cross-table jumps are not allowed (chains have table scope).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  3 ++-
 net/netfilter/nf_tables_api.c     | 38 ++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index f476fd030626..ec347d9cff9e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1209,6 +1209,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@genmask: generation mask
  *	@afinfo: address family info
  *	@name: name of the table
+ *	@validate_state: internal, set when transaction adds jumps
  */
 struct nft_table {
 	struct list_head		list;
@@ -1227,6 +1228,7 @@ struct nft_table {
 	char				*name;
 	u16				udlen;
 	u8				*udata;
+	u8				validate_state;
 };
 
 static inline bool nft_table_has_owner(const struct nft_table *table)
@@ -1698,7 +1700,6 @@ struct nftables_pernet {
 	struct mutex		commit_mutex;
 	u64			table_handle;
 	unsigned int		base_seq;
-	u8			validate_state;
 };
 
 extern unsigned int nf_tables_net_id;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 21eb273d2740..44ebc5f9598e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -102,11 +102,9 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
 	[NFT_MSG_DELFLOWTABLE]	= AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
 };
 
-static void nft_validate_state_update(struct net *net, u8 new_validate_state)
+static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
 {
-	struct nftables_pernet *nft_net = nft_pernet(net);
-
-	switch (nft_net->validate_state) {
+	switch (table->validate_state) {
 	case NFT_VALIDATE_SKIP:
 		WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
 		break;
@@ -117,7 +115,7 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state)
 			return;
 	}
 
-	nft_net->validate_state = new_validate_state;
+	table->validate_state = new_validate_state;
 }
 static void nf_tables_trans_destroy_work(struct work_struct *w);
 static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
@@ -1224,6 +1222,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
 	if (table == NULL)
 		goto err_kzalloc;
 
+	table->validate_state = NFT_VALIDATE_SKIP;
 	table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
 	if (table->name == NULL)
 		goto err_strdup;
@@ -3660,7 +3659,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 		}
 
 		if (expr_info[i].ops->validate)
-			nft_validate_state_update(net, NFT_VALIDATE_NEED);
+			nft_validate_state_update(table, NFT_VALIDATE_NEED);
 
 		expr_info[i].ops = NULL;
 		expr = nft_expr_next(expr);
@@ -3710,7 +3709,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 	if (flow)
 		nft_trans_flow_rule(trans) = flow;
 
-	if (nft_net->validate_state == NFT_VALIDATE_DO)
+	if (table->validate_state == NFT_VALIDATE_DO)
 		return nft_table_validate(net, table);
 
 	return 0;
@@ -6312,7 +6311,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			if (desc.type == NFT_DATA_VERDICT &&
 			    (elem.data.val.verdict.code == NFT_GOTO ||
 			     elem.data.val.verdict.code == NFT_JUMP))
-				nft_validate_state_update(ctx->net,
+				nft_validate_state_update(ctx->table,
 							  NFT_VALIDATE_NEED);
 		}
 
@@ -6437,7 +6436,6 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 				const struct nfnl_info *info,
 				const struct nlattr * const nla[])
 {
-	struct nftables_pernet *nft_net = nft_pernet(info->net);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_next(info->net);
 	u8 family = info->nfmsg->nfgen_family;
@@ -6476,7 +6474,7 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
 		}
 	}
 
-	if (nft_net->validate_state == NFT_VALIDATE_DO)
+	if (table->validate_state == NFT_VALIDATE_DO)
 		return nft_table_validate(net, table);
 
 	return 0;
@@ -8628,19 +8626,20 @@ static int nf_tables_validate(struct net *net)
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nft_table *table;
 
-	switch (nft_net->validate_state) {
-	case NFT_VALIDATE_SKIP:
-		break;
-	case NFT_VALIDATE_NEED:
-		nft_validate_state_update(net, NFT_VALIDATE_DO);
-		fallthrough;
-	case NFT_VALIDATE_DO:
-		list_for_each_entry(table, &nft_net->tables, list) {
+	list_for_each_entry(table, &nft_net->tables, list) {
+		switch (table->validate_state) {
+		case NFT_VALIDATE_SKIP:
+			continue;
+		case NFT_VALIDATE_NEED:
+			nft_validate_state_update(table, NFT_VALIDATE_DO);
+			fallthrough;
+		case NFT_VALIDATE_DO:
 			if (nft_table_validate(net, table) < 0)
 				return -EAGAIN;
+
+			nft_validate_state_update(table, NFT_VALIDATE_SKIP);
 		}
 
-		nft_validate_state_update(net, NFT_VALIDATE_SKIP);
 		break;
 	}
 
@@ -10355,7 +10354,6 @@ static int __net_init nf_tables_init_net(struct net *net)
 	INIT_LIST_HEAD(&nft_net->notify_list);
 	mutex_init(&nft_net->commit_mutex);
 	nft_net->base_seq = 1;
-	nft_net->validate_state = NFT_VALIDATE_SKIP;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 698bb828a6c20c86e30b307175be1827c071ce23 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 14 Apr 2023 15:01:32 +0200
Subject: netfilter: nf_tables: do not store pktinfo in traceinfo structure

Pass it as an argument instead.  No change in object size.

Stack usage decreases by 8 bytes:
 nf_tables_core.c:254  nft_do_chain       320     static

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  5 ++---
 net/netfilter/nf_tables_core.c    | 21 ++++++++++++---------
 net/netfilter/nf_tables_trace.c   |  5 ++---
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ec347d9cff9e..cab351928cd2 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1408,7 +1408,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@type: event type (enum nft_trace_types)
  *	@skbid: hash of skb to be used as trace id
  *	@packet_dumped: packet headers sent in a previous traceinfo message
- *	@pkt: pktinfo currently processed
  *	@basechain: base chain currently processed
  *	@rule:  rule that was evaluated
  *	@verdict: verdict given by rule
@@ -1419,7 +1418,6 @@ struct nft_traceinfo {
 	bool				packet_dumped;
 	enum nft_trace_types		type:8;
 	u32				skbid;
-	const struct nft_pktinfo	*pkt;
 	const struct nft_base_chain	*basechain;
 	const struct nft_rule_dp	*rule;
 	const struct nft_verdict	*verdict;
@@ -1429,7 +1427,8 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 		    const struct nft_verdict *verdict,
 		    const struct nft_chain *basechain);
 
-void nft_trace_notify(struct nft_traceinfo *info);
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      struct nft_traceinfo *info);
 
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
 	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index bed855638050..776eb2b9f632 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -41,7 +41,8 @@ static inline bool nf_skip_indirect_calls(void) { return false; }
 static inline void nf_skip_indirect_calls_enable(void) { }
 #endif
 
-static noinline void __nft_trace_packet(struct nft_traceinfo *info,
+static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+					struct nft_traceinfo *info,
 					enum nft_trace_types type)
 {
 	if (!info->trace || !info->nf_trace)
@@ -49,7 +50,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info,
 
 	info->type = type;
 
-	nft_trace_notify(info);
+	nft_trace_notify(pkt, info);
 }
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
@@ -60,7 +61,7 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
 		info->rule = rule;
-		__nft_trace_packet(info, type);
+		__nft_trace_packet(pkt, info, type);
 	}
 }
 
@@ -105,7 +106,8 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
 	regs->verdict.code = NFT_BREAK;
 }
 
-static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
+static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
+					 struct nft_traceinfo *info,
 					 const struct nft_regs *regs)
 {
 	enum nft_trace_types type;
@@ -123,20 +125,21 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info,
 		type = NFT_TRACETYPE_RULE;
 
 		if (info->trace)
-			info->nf_trace = info->pkt->skb->nf_trace;
+			info->nf_trace = pkt->skb->nf_trace;
 		break;
 	}
 
-	__nft_trace_packet(info, type);
+	__nft_trace_packet(pkt, info, type);
 }
 
-static inline void nft_trace_verdict(struct nft_traceinfo *info,
+static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
+				     struct nft_traceinfo *info,
 				     const struct nft_rule_dp *rule,
 				     const struct nft_regs *regs)
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->rule = rule;
-		__nft_trace_verdict(info, regs);
+		__nft_trace_verdict(pkt, info, regs);
 	}
 }
 
@@ -300,7 +303,7 @@ next_rule:
 		break;
 	}
 
-	nft_trace_verdict(&info, rule, &regs);
+	nft_trace_verdict(pkt, &info, rule, &regs);
 
 	switch (regs.verdict.code & NF_VERDICT_MASK) {
 	case NF_ACCEPT:
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 3d9b83d84a84..0a0dcf2587fd 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -183,9 +183,9 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *i
 	return last->chain;
 }
 
-void nft_trace_notify(struct nft_traceinfo *info)
+void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      struct nft_traceinfo *info)
 {
-	const struct nft_pktinfo *pkt = info->pkt;
 	const struct nft_chain *chain;
 	struct nlmsghdr *nlh;
 	struct sk_buff *skb;
@@ -305,7 +305,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 	info->trace = true;
 	info->nf_trace = pkt->skb->nf_trace;
 	info->packet_dumped = false;
-	info->pkt = pkt;
 	info->verdict = verdict;
 
 	net_get_random_once(&trace_key, sizeof(trace_key));
-- 
cgit v1.2.3


From 0a202145d5f9277dd24474aca8245731d030f29e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 14 Apr 2023 15:01:33 +0200
Subject: netfilter: nf_tables: do not store verdict in traceinfo structure

Just pass it as an argument to nft_trace_notify(). Stack is reduced by 8 bytes:

nf_tables_core.c:256 nft_do_chain    312     static

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 +---
 net/netfilter/nf_tables_core.c    | 14 ++++++++------
 net/netfilter/nf_tables_trace.c   | 21 +++++++++++----------
 3 files changed, 20 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cab351928cd2..693469ecfa54 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1410,7 +1410,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@basechain: base chain currently processed
  *	@rule:  rule that was evaluated
- *	@verdict: verdict given by rule
  */
 struct nft_traceinfo {
 	bool				trace;
@@ -1420,14 +1419,13 @@ struct nft_traceinfo {
 	u32				skbid;
 	const struct nft_base_chain	*basechain;
 	const struct nft_rule_dp	*rule;
-	const struct nft_verdict	*verdict;
 };
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
-		    const struct nft_verdict *verdict,
 		    const struct nft_chain *basechain);
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      const struct nft_verdict *verdict,
 		      struct nft_traceinfo *info);
 
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 776eb2b9f632..6debe8b2623f 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -42,6 +42,7 @@ static inline void nf_skip_indirect_calls_enable(void) { }
 #endif
 
 static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
+					const struct nft_verdict *verdict,
 					struct nft_traceinfo *info,
 					enum nft_trace_types type)
 {
@@ -50,10 +51,11 @@ static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
 
 	info->type = type;
 
-	nft_trace_notify(pkt, info);
+	nft_trace_notify(pkt, verdict, info);
 }
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+				    struct nft_verdict *verdict,
 				    struct nft_traceinfo *info,
 				    const struct nft_rule_dp *rule,
 				    enum nft_trace_types type)
@@ -61,7 +63,7 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
 		info->rule = rule;
-		__nft_trace_packet(pkt, info, type);
+		__nft_trace_packet(pkt, verdict, info, type);
 	}
 }
 
@@ -129,7 +131,7 @@ static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
 		break;
 	}
 
-	__nft_trace_packet(pkt, info, type);
+	__nft_trace_packet(pkt, &regs->verdict, info, type);
 }
 
 static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
@@ -264,7 +266,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
 
 	info.trace = false;
 	if (static_branch_unlikely(&nft_trace_enabled))
-		nft_trace_init(&info, pkt, &regs.verdict, basechain);
+		nft_trace_init(&info, pkt, basechain);
 do_chain:
 	if (genbit)
 		blob = rcu_dereference(chain->blob_gen_1);
@@ -296,7 +298,7 @@ next_rule:
 			nft_trace_copy_nftrace(pkt, &info);
 			continue;
 		case NFT_CONTINUE:
-			nft_trace_packet(pkt, &info, rule,
+			nft_trace_packet(pkt, &regs.verdict,  &info, rule,
 					 NFT_TRACETYPE_RULE);
 			continue;
 		}
@@ -336,7 +338,7 @@ next_rule:
 		goto next_rule;
 	}
 
-	nft_trace_packet(pkt, &info, NULL, NFT_TRACETYPE_POLICY);
+	nft_trace_packet(pkt, &regs.verdict, &info, NULL, NFT_TRACETYPE_POLICY);
 
 	if (static_branch_unlikely(&nft_counters_enabled))
 		nft_update_chain_stats(basechain, pkt);
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 0a0dcf2587fd..e635104a42be 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -124,6 +124,7 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
 }
 
 static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
+				   const struct nft_verdict *verdict,
 				   const struct nft_traceinfo *info)
 {
 	if (!info->rule || info->rule->is_last)
@@ -135,7 +136,7 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 	 * Since no rule matched, the ->rule pointer is invalid.
 	 */
 	if (info->type == NFT_TRACETYPE_RETURN &&
-	    info->verdict->code == NFT_CONTINUE)
+	    verdict->code == NFT_CONTINUE)
 		return 0;
 
 	return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
@@ -143,7 +144,8 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 			    NFTA_TRACE_PAD);
 }
 
-static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
+static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
+					 struct nft_traceinfo *info)
 {
 	switch (info->type) {
 	case NFT_TRACETYPE_RETURN:
@@ -153,7 +155,7 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info)
 		return false;
 	}
 
-	switch (info->verdict->code) {
+	switch (verdict->code) {
 	case NFT_JUMP:
 	case NFT_GOTO:
 		break;
@@ -184,6 +186,7 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *i
 }
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
+		      const struct nft_verdict *verdict,
 		      struct nft_traceinfo *info)
 {
 	const struct nft_chain *chain;
@@ -217,8 +220,8 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 		nla_total_size(sizeof(u32)) +		/* nfproto */
 		nla_total_size(sizeof(u32));		/* policy */
 
-	if (nft_trace_have_verdict_chain(info))
-		size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */
+	if (nft_trace_have_verdict_chain(verdict, info))
+		size += nla_total_size(strlen(verdict->chain->name)); /* jump target */
 
 	skb = nlmsg_new(size, GFP_ATOMIC);
 	if (!skb)
@@ -245,7 +248,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 	if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
 		goto nla_put_failure;
 
-	if (nf_trace_fill_rule_info(skb, info))
+	if (nf_trace_fill_rule_info(skb, verdict, info))
 		goto nla_put_failure;
 
 	switch (info->type) {
@@ -254,11 +257,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 		break;
 	case NFT_TRACETYPE_RETURN:
 	case NFT_TRACETYPE_RULE:
-		if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
+		if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, verdict))
 			goto nla_put_failure;
 
 		/* pkt->skb undefined iff NF_STOLEN, disable dump */
-		if (info->verdict->code == NF_STOLEN)
+		if (verdict->code == NF_STOLEN)
 			info->packet_dumped = true;
 		else
 			mark = pkt->skb->mark;
@@ -295,7 +298,6 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 }
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
-		    const struct nft_verdict *verdict,
 		    const struct nft_chain *chain)
 {
 	static siphash_key_t trace_key __read_mostly;
@@ -305,7 +307,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 	info->trace = true;
 	info->nf_trace = pkt->skb->nf_trace;
 	info->packet_dumped = false;
-	info->verdict = verdict;
 
 	net_get_random_once(&trace_key, sizeof(trace_key));
 
-- 
cgit v1.2.3


From 46df417544f4f7fa3438caada0dc0e534a694343 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 14 Apr 2023 15:01:34 +0200
Subject: netfilter: nf_tables: do not store rule in traceinfo structure

pass it as argument instead.  This reduces size of traceinfo to
16 bytes.  Total stack usage:

 nf_tables_core.c:252 nft_do_chain    304     static

While it's possible to also pass basechain as an argument, doing so
increases nft_do_chain function size.

Unlike pktinfo/verdict/rule the basechain info isn't used in
the expression evaluation path. gcc places it on the stack, which
results in extra push/pop when it gets passed to the trace helpers
as argument rather than as part of the traceinfo structure.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  3 +--
 net/netfilter/nf_tables_core.c    | 15 +++++++--------
 net/netfilter/nf_tables_trace.c   | 14 ++++++++------
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 693469ecfa54..58a4d217faaf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1409,7 +1409,6 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
  *	@skbid: hash of skb to be used as trace id
  *	@packet_dumped: packet headers sent in a previous traceinfo message
  *	@basechain: base chain currently processed
- *	@rule:  rule that was evaluated
  */
 struct nft_traceinfo {
 	bool				trace;
@@ -1418,7 +1417,6 @@ struct nft_traceinfo {
 	enum nft_trace_types		type:8;
 	u32				skbid;
 	const struct nft_base_chain	*basechain;
-	const struct nft_rule_dp	*rule;
 };
 
 void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
@@ -1426,6 +1424,7 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
 		      const struct nft_verdict *verdict,
+		      const struct nft_rule_dp *rule,
 		      struct nft_traceinfo *info);
 
 #define MODULE_ALIAS_NFT_CHAIN(family, name) \
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 6debe8b2623f..4d0ce12221f6 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -43,6 +43,7 @@ static inline void nf_skip_indirect_calls_enable(void) { }
 
 static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
 					const struct nft_verdict *verdict,
+					const struct nft_rule_dp *rule,
 					struct nft_traceinfo *info,
 					enum nft_trace_types type)
 {
@@ -51,7 +52,7 @@ static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
 
 	info->type = type;
 
-	nft_trace_notify(pkt, verdict, info);
+	nft_trace_notify(pkt, verdict, rule, info);
 }
 
 static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
@@ -62,8 +63,7 @@ static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
 {
 	if (static_branch_unlikely(&nft_trace_enabled)) {
 		info->nf_trace = pkt->skb->nf_trace;
-		info->rule = rule;
-		__nft_trace_packet(pkt, verdict, info, type);
+		__nft_trace_packet(pkt, verdict, rule, info, type);
 	}
 }
 
@@ -110,6 +110,7 @@ static void nft_cmp16_fast_eval(const struct nft_expr *expr,
 
 static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
 					 struct nft_traceinfo *info,
+					 const struct nft_rule_dp *rule,
 					 const struct nft_regs *regs)
 {
 	enum nft_trace_types type;
@@ -131,7 +132,7 @@ static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt,
 		break;
 	}
 
-	__nft_trace_packet(pkt, &regs->verdict, info, type);
+	__nft_trace_packet(pkt, &regs->verdict, rule, info, type);
 }
 
 static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
@@ -139,10 +140,8 @@ static inline void nft_trace_verdict(const struct nft_pktinfo *pkt,
 				     const struct nft_rule_dp *rule,
 				     const struct nft_regs *regs)
 {
-	if (static_branch_unlikely(&nft_trace_enabled)) {
-		info->rule = rule;
-		__nft_trace_verdict(pkt, info, regs);
-	}
+	if (static_branch_unlikely(&nft_trace_enabled))
+		__nft_trace_verdict(pkt, info, rule, regs);
 }
 
 static bool nft_payload_fast_eval(const struct nft_expr *expr,
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index e635104a42be..6d41c0bd3d78 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -125,9 +125,10 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
 
 static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 				   const struct nft_verdict *verdict,
+				   const struct nft_rule_dp *rule,
 				   const struct nft_traceinfo *info)
 {
-	if (!info->rule || info->rule->is_last)
+	if (!rule || rule->is_last)
 		return 0;
 
 	/* a continue verdict with ->type == RETURN means that this is
@@ -140,7 +141,7 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
 		return 0;
 
 	return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
-			    cpu_to_be64(info->rule->handle),
+			    cpu_to_be64(rule->handle),
 			    NFTA_TRACE_PAD);
 }
 
@@ -166,9 +167,9 @@ static bool nft_trace_have_verdict_chain(const struct nft_verdict *verdict,
 	return true;
 }
 
-static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *info)
+static const struct nft_chain *nft_trace_get_chain(const struct nft_rule_dp *rule,
+						   const struct nft_traceinfo *info)
 {
-	const struct nft_rule_dp *rule = info->rule;
 	const struct nft_rule_dp_last *last;
 
 	if (!rule)
@@ -187,6 +188,7 @@ static const struct nft_chain *nft_trace_get_chain(const struct nft_traceinfo *i
 
 void nft_trace_notify(const struct nft_pktinfo *pkt,
 		      const struct nft_verdict *verdict,
+		      const struct nft_rule_dp *rule,
 		      struct nft_traceinfo *info)
 {
 	const struct nft_chain *chain;
@@ -199,7 +201,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 	if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE))
 		return;
 
-	chain = nft_trace_get_chain(info);
+	chain = nft_trace_get_chain(rule, info);
 
 	size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
 		nla_total_size(strlen(chain->table->name)) +
@@ -248,7 +250,7 @@ void nft_trace_notify(const struct nft_pktinfo *pkt,
 	if (nla_put_string(skb, NFTA_TRACE_TABLE, chain->table->name))
 		goto nla_put_failure;
 
-	if (nf_trace_fill_rule_info(skb, verdict, info))
+	if (nf_trace_fill_rule_info(skb, verdict, rule, info))
 		goto nla_put_failure;
 
 	switch (info->type) {
-- 
cgit v1.2.3


From e3478c68f6704638d08f437cbc552ca5970c151a Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 17 Apr 2023 17:10:45 +0200
Subject: ipvs: Update width of source for ip_vs_sync_conn_options

In ip_vs_sync_conn_v0() copy is made to struct ip_vs_sync_conn_options.
That structure looks like this:

struct ip_vs_sync_conn_options {
        struct ip_vs_seq        in_seq;
        struct ip_vs_seq        out_seq;
};

The source of the copy is the in_seq field of struct ip_vs_conn, whose
type is struct ip_vs_seq. Thus we can see that the source is not as
wide as the amount of data copied, which is the width of struct
ip_vs_sync_conn_options.

The copy is safe because the next field in struct ip_vs_conn is another
struct ip_vs_seq. Make use of struct_group() to annotate this.

Flagged by gcc-13 as:

 In file included from ./include/linux/string.h:254,
                  from ./include/linux/bitmap.h:11,
                  from ./include/linux/cpumask.h:12,
                  from ./arch/x86/include/asm/paravirt.h:17,
                  from ./arch/x86/include/asm/cpuid.h:62,
                  from ./arch/x86/include/asm/processor.h:19,
                  from ./arch/x86/include/asm/timex.h:5,
                  from ./include/linux/timex.h:67,
                  from ./include/linux/time32.h:13,
                  from ./include/linux/time.h:60,
                  from ./include/linux/stat.h:19,
                  from ./include/linux/module.h:13,
                  from net/netfilter/ipvs/ip_vs_sync.c:38:
 In function 'fortify_memcpy_chk',
     inlined from 'ip_vs_sync_conn_v0' at net/netfilter/ipvs/ip_vs_sync.c:606:3:
 ./include/linux/fortify-string.h:529:25: error: call to '__read_overflow2_field' declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror=attribute-warning]
   529 |                         __read_overflow2_field(q_size_field, size);
       |

Compile tested only.

Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             | 6 ++++--
 net/netfilter/ipvs/ip_vs_sync.c | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 6d71a5ff52df..e20f1f92066d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -630,8 +630,10 @@ struct ip_vs_conn {
 	 */
 	struct ip_vs_app        *app;           /* bound ip_vs_app object */
 	void                    *app_data;      /* Application private data */
-	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
-	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+	struct_group(sync_conn_opt,
+		struct ip_vs_seq  in_seq;       /* incoming seq. struct */
+		struct ip_vs_seq  out_seq;      /* outgoing seq. struct */
+	);
 
 	const struct ip_vs_pe	*pe;
 	char			*pe_data;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 4963fec815da..d4fe7bb4f853 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -603,7 +603,7 @@ static void ip_vs_sync_conn_v0(struct netns_ipvs *ipvs, struct ip_vs_conn *cp,
 	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
 		struct ip_vs_sync_conn_options *opt =
 			(struct ip_vs_sync_conn_options *)&s[1];
-		memcpy(opt, &cp->in_seq, sizeof(*opt));
+		memcpy(opt, &cp->sync_conn_opt, sizeof(*opt));
 	}
 
 	m->nr_conns++;
-- 
cgit v1.2.3


From 210ffe4a74caead4d6790747d32b63c6152c70b7 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 17 Apr 2023 17:10:47 +0200
Subject: ipvs: Remove {Enter,Leave}Function

Remove EnterFunction and LeaveFunction.

These debugging macros seem well past their use-by date and seem to
have little value these days. Removing them allows some trivial cleanup
of some exit paths for some functions. These are also included in this
patch. There is likely scope for further cleanup of both debugging and
unwind paths. But let's leave that for another day.

Only intended to change debug output, and only when CONFIG_IP_VS_DEBUG
is enabled. Compile tested only.

Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h             | 20 -------------
 net/netfilter/ipvs/ip_vs_core.c |  8 ------
 net/netfilter/ipvs/ip_vs_ctl.c  | 26 +----------------
 net/netfilter/ipvs/ip_vs_sync.c |  5 ----
 net/netfilter/ipvs/ip_vs_xmit.c | 62 ++++++-----------------------------------
 5 files changed, 9 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e20f1f92066d..a3adc246ee31 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -265,26 +265,6 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
 			pr_err(msg, ##__VA_ARGS__);			\
 	} while (0)
 
-#ifdef CONFIG_IP_VS_DEBUG
-#define EnterFunction(level)						\
-	do {								\
-		if (level <= ip_vs_get_debug_level())			\
-			printk(KERN_DEBUG				\
-			       pr_fmt("Enter: %s, %s line %i\n"),	\
-			       __func__, __FILE__, __LINE__);		\
-	} while (0)
-#define LeaveFunction(level)						\
-	do {								\
-		if (level <= ip_vs_get_debug_level())			\
-			printk(KERN_DEBUG				\
-			       pr_fmt("Leave: %s, %s line %i\n"),	\
-			       __func__, __FILE__, __LINE__);		\
-	} while (0)
-#else
-#define EnterFunction(level)   do {} while (0)
-#define LeaveFunction(level)   do {} while (0)
-#endif
-
 /* The port number of FTP service (in network order). */
 #define FTPPORT  cpu_to_be16(21)
 #define FTPDATA  cpu_to_be16(20)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2fcc26507d69..cb83ca506c5c 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1140,7 +1140,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
 	__be16 vport;
 	unsigned int flags;
 
-	EnterFunction(12);
 	vaddr = &svc->addr;
 	vport = svc->port;
 	daddr = &iph->saddr;
@@ -1208,7 +1207,6 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
 		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
 		      cp->flags, refcount_read(&cp->refcnt));
-	LeaveFunction(12);
 	return cp;
 }
 
@@ -1316,13 +1314,11 @@ after_nat:
 		ip_vs_update_conntrack(skb, cp, 0);
 	ip_vs_conn_put(cp);
 
-	LeaveFunction(11);
 	return NF_ACCEPT;
 
 drop:
 	ip_vs_conn_put(cp);
 	kfree_skb(skb);
-	LeaveFunction(11);
 	return NF_STOLEN;
 }
 
@@ -1341,8 +1337,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat
 	int af = state->pf;
 	struct sock *sk;
 
-	EnterFunction(11);
-
 	/* Already marked as IPVS request or reply? */
 	if (skb->ipvs_property)
 		return NF_ACCEPT;
@@ -2365,7 +2359,6 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
 	struct netns_ipvs *ipvs;
 	struct net *net;
 
-	EnterFunction(2);
 	list_for_each_entry(net, net_list, exit_list) {
 		ipvs = net_ipvs(net);
 		ip_vs_unregister_hooks(ipvs, AF_INET);
@@ -2374,7 +2367,6 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
 		smp_wmb();
 		ip_vs_sync_net_cleanup(ipvs);
 	}
-	LeaveFunction(2);
 }
 
 static struct pernet_operations ipvs_core_ops = {
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 2a5ed71c82c3..62606fb44d02 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1061,8 +1061,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	unsigned int atype;
 	int ret;
 
-	EnterFunction(2);
-
 #ifdef CONFIG_IP_VS_IPV6
 	if (udest->af == AF_INET6) {
 		atype = ipv6_addr_type(&udest->addr.in6);
@@ -1111,7 +1109,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	spin_lock_init(&dest->dst_lock);
 	__ip_vs_update_dest(svc, dest, udest, 1);
 
-	LeaveFunction(2);
 	return 0;
 
 err_stats:
@@ -1134,8 +1131,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	__be16 dport = udest->port;
 	int ret;
 
-	EnterFunction(2);
-
 	if (udest->weight < 0) {
 		pr_err("%s(): server weight less than zero\n", __func__);
 		return -ERANGE;
@@ -1183,7 +1178,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 
 		ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
 		if (ret < 0)
-			goto err;
+			return ret;
 		__ip_vs_update_dest(svc, dest, udest, 1);
 	} else {
 		/*
@@ -1192,9 +1187,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 		ret = ip_vs_new_dest(svc, udest);
 	}
 
-err:
-	LeaveFunction(2);
-
 	return ret;
 }
 
@@ -1209,8 +1201,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	union nf_inet_addr daddr;
 	__be16 dport = udest->port;
 
-	EnterFunction(2);
-
 	if (udest->weight < 0) {
 		pr_err("%s(): server weight less than zero\n", __func__);
 		return -ERANGE;
@@ -1242,7 +1232,6 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	}
 
 	__ip_vs_update_dest(svc, dest, udest, 0);
-	LeaveFunction(2);
 
 	return 0;
 }
@@ -1317,8 +1306,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	struct ip_vs_dest *dest;
 	__be16 dport = udest->port;
 
-	EnterFunction(2);
-
 	/* We use function that requires RCU lock */
 	rcu_read_lock();
 	dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
@@ -1339,8 +1326,6 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
 	 */
 	__ip_vs_del_dest(svc->ipvs, dest, false);
 
-	LeaveFunction(2);
-
 	return 0;
 }
 
@@ -1746,7 +1731,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
 	struct netns_ipvs *ipvs;
 	struct net *net;
 
-	EnterFunction(2);
 	/* Check for "full" addressed entries */
 	mutex_lock(&__ip_vs_mutex);
 	list_for_each_entry(net, net_list, exit_list) {
@@ -1754,7 +1738,6 @@ void ip_vs_service_nets_cleanup(struct list_head *net_list)
 		ip_vs_flush(ipvs, true);
 	}
 	mutex_unlock(&__ip_vs_mutex);
-	LeaveFunction(2);
 }
 
 /* Put all references for device (dst_cache) */
@@ -1792,7 +1775,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	if (event != NETDEV_DOWN || !ipvs)
 		return NOTIFY_DONE;
 	IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
-	EnterFunction(2);
 	mutex_lock(&__ip_vs_mutex);
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
@@ -1821,7 +1803,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
 	}
 	spin_unlock_bh(&ipvs->dest_trash_lock);
 	mutex_unlock(&__ip_vs_mutex);
-	LeaveFunction(2);
 	return NOTIFY_DONE;
 }
 
@@ -4537,8 +4518,6 @@ int __init ip_vs_control_init(void)
 	int idx;
 	int ret;
 
-	EnterFunction(2);
-
 	/* Initialize svc_table, ip_vs_svc_fwm_table */
 	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
 		INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
@@ -4551,15 +4530,12 @@ int __init ip_vs_control_init(void)
 	if (ret < 0)
 		return ret;
 
-	LeaveFunction(2);
 	return 0;
 }
 
 
 void ip_vs_control_cleanup(void)
 {
-	EnterFunction(2);
 	unregister_netdevice_notifier(&ip_vs_dst_notifier);
 	/* relying on common rcu_barrier() in ip_vs_cleanup() */
-	LeaveFunction(2);
 }
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index d4fe7bb4f853..264f2f87a437 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1582,13 +1582,11 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
 	struct kvec	iov;
 	int		len;
 
-	EnterFunction(7);
 	iov.iov_base     = (void *)buffer;
 	iov.iov_len      = length;
 
 	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
 
-	LeaveFunction(7);
 	return len;
 }
 
@@ -1614,15 +1612,12 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
 	struct kvec		iov = {buffer, buflen};
 	int			len;
 
-	EnterFunction(7);
-
 	/* Receive a packet */
 	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, buflen);
 	len = sock_recvmsg(sock, &msg, MSG_DONTWAIT);
 	if (len < 0)
 		return len;
 
-	LeaveFunction(7);
 	return len;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 99c349c0d968..feb1d7fcb09f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -706,8 +706,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct iphdr  *iph = ip_hdr(skb);
 
-	EnterFunction(10);
-
 	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
 			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
 		goto tx_error;
@@ -719,12 +717,10 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
  tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 
@@ -735,8 +731,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	struct ipv6hdr *iph = ipv6_hdr(skb);
 
-	EnterFunction(10);
-
 	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
 				  &iph->daddr, NULL,
 				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
@@ -747,12 +741,10 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
  tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 #endif
@@ -768,8 +760,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rtable *rt;		/* Route to the other host */
 	int local, rc, was_input;
 
-	EnterFunction(10);
-
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 		__be16 _pt, *p;
@@ -839,12 +829,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 
-	LeaveFunction(10);
 	return rc;
 
   tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 
@@ -856,8 +844,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	struct rt6_info *rt;		/* Route to the other host */
 	int local, rc;
 
-	EnterFunction(10);
-
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
 		__be16 _pt, *p;
@@ -927,11 +913,9 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 
-	LeaveFunction(10);
 	return rc;
 
 tx_error:
-	LeaveFunction(10);
 	kfree_skb(skb);
 	return NF_STOLEN;
 }
@@ -1149,8 +1133,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int tun_type, gso_type;
 	int tun_flags;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
 				   IP_VS_RT_MODE_LOCAL |
 				   IP_VS_RT_MODE_NON_LOCAL |
@@ -1199,7 +1181,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 					 &next_protocol, NULL, &dsfield,
 					 &ttl, dfp);
 	if (IS_ERR(skb))
-		goto tx_error;
+		return NF_STOLEN;
 
 	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1267,14 +1249,10 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	else if (ret == NF_DROP)
 		kfree_skb(skb);
 
-	LeaveFunction(10);
-
 	return NF_STOLEN;
 
   tx_error:
-	if (!IS_ERR(skb))
-		kfree_skb(skb);
-	LeaveFunction(10);
+	kfree_skb(skb);
 	return NF_STOLEN;
 }
 
@@ -1298,8 +1276,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int tun_type, gso_type;
 	int tun_flags;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
 				      &cp->daddr.in6,
 				      &saddr, ipvsh, 1,
@@ -1347,7 +1323,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 					 &next_protocol, &payload_len,
 					 &dsfield, &ttl, NULL);
 	if (IS_ERR(skb))
-		goto tx_error;
+		return NF_STOLEN;
 
 	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
@@ -1414,14 +1390,10 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	else if (ret == NF_DROP)
 		kfree_skb(skb);
 
-	LeaveFunction(10);
-
 	return NF_STOLEN;
 
 tx_error:
-	if (!IS_ERR(skb))
-		kfree_skb(skb);
-	LeaveFunction(10);
+	kfree_skb(skb);
 	return NF_STOLEN;
 }
 #endif
@@ -1437,8 +1409,6 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	int local;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
 				   IP_VS_RT_MODE_LOCAL |
 				   IP_VS_RT_MODE_NON_LOCAL |
@@ -1455,12 +1425,10 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
   tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 
@@ -1471,8 +1439,6 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 {
 	int local;
 
-	EnterFunction(10);
-
 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
 				      &cp->daddr.in6,
 				      NULL, ipvsh, 0,
@@ -1489,12 +1455,10 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
 
-	LeaveFunction(10);
 	return NF_STOLEN;
 
 tx_error:
 	kfree_skb(skb);
-	LeaveFunction(10);
 	return NF_STOLEN;
 }
 #endif
@@ -1514,8 +1478,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int local;
 	int rt_mode, was_input;
 
-	EnterFunction(10);
-
 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
 	   forwarded directly here, because there is no need to
 	   translate address/port back */
@@ -1526,7 +1488,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
 		atomic_inc(&cp->in_pkts);
-		goto out;
+		return rc;
 	}
 
 	/*
@@ -1582,14 +1544,11 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->ignore_df = 1;
 
-	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
-	goto out;
+	return ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
 
   tx_error:
 	kfree_skb(skb);
 	rc = NF_STOLEN;
-  out:
-	LeaveFunction(10);
 	return rc;
 }
 
@@ -1604,8 +1563,6 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	int local;
 	int rt_mode;
 
-	EnterFunction(10);
-
 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
 	   forwarded directly here, because there is no need to
 	   translate address/port back */
@@ -1616,7 +1573,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 			rc = NF_ACCEPT;
 		/* do not touch skb anymore */
 		atomic_inc(&cp->in_pkts);
-		goto out;
+		return rc;
 	}
 
 	/*
@@ -1671,14 +1628,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* Another hack: avoid icmp_send in ip_fragment */
 	skb->ignore_df = 1;
 
-	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
-	goto out;
+	return ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
 
 tx_error:
 	kfree_skb(skb);
 	rc = NF_STOLEN;
-out:
-	LeaveFunction(10);
 	return rc;
 }
 #endif
-- 
cgit v1.2.3


From c7d15aaa105a9484b5385e5c391ea5203347d6b0 Mon Sep 17 00:00:00 2001
From: Simon Horman <horms@kernel.org>
Date: Mon, 17 Apr 2023 17:10:48 +0200
Subject: ipvs: Correct spelling in comments

Correct some spelling errors flagged by codespell and found by inspection.

Signed-off-by: Simon Horman <horms@kernel.org>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a3adc246ee31..ff406ef4fd4a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -584,7 +584,7 @@ struct ip_vs_conn {
 	spinlock_t              lock;           /* lock for state transition */
 	volatile __u16          state;          /* state info */
 	volatile __u16          old_state;      /* old state, to be used for
-						 * state transition triggerd
+						 * state transition triggered
 						 * synchronization
 						 */
 	__u32			fwmark;		/* Fire wall mark from skb */
@@ -635,7 +635,7 @@ struct ip_vs_service_user_kern {
 	u16			protocol;
 	union nf_inet_addr	addr;		/* virtual ip address */
 	__be16			port;
-	u32			fwmark;		/* firwall mark of service */
+	u32			fwmark;		/* firewall mark of service */
 
 	/* virtual service options */
 	char			*sched_name;
@@ -1036,7 +1036,7 @@ struct netns_ipvs {
 	struct ipvs_sync_daemon_cfg	bcfg;	/* Backup Configuration */
 	/* net name space ptr */
 	struct net		*net;            /* Needed by timer routines */
-	/* Number of heterogeneous destinations, needed becaus heterogeneous
+	/* Number of heterogeneous destinations, needed because heterogeneous
 	 * are not supported when synchronization is enabled.
 	 */
 	unsigned int		mixed_address_family_dests;
-- 
cgit v1.2.3


From b9703ed44ffbfba85c103b9de01886a225e14b38 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 21 Apr 2023 00:34:31 +0200
Subject: netfilter: nf_tables: support for adding new devices to an existing
 netdev chain

This patch allows users to add devices to an existing netdev chain.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |   6 ++
 net/netfilter/nf_tables_api.c     | 217 ++++++++++++++++++++++++--------------
 2 files changed, 142 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 58a4d217faaf..3ed21d2d5659 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1609,6 +1609,8 @@ struct nft_trans_chain {
 	struct nft_stats __percpu	*stats;
 	u8				policy;
 	u32				chain_id;
+	struct nft_base_chain		*basechain;
+	struct list_head		hook_list;
 };
 
 #define nft_trans_chain_update(trans)	\
@@ -1621,6 +1623,10 @@ struct nft_trans_chain {
 	(((struct nft_trans_chain *)trans->data)->policy)
 #define nft_trans_chain_id(trans)	\
 	(((struct nft_trans_chain *)trans->data)->chain_id)
+#define nft_trans_basechain(trans)	\
+	(((struct nft_trans_chain *)trans->data)->basechain)
+#define nft_trans_chain_hooks(trans)	\
+	(((struct nft_trans_chain *)trans->data)->hook_list)
 
 struct nft_trans_table {
 	bool				update;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 77975b4b0fdc..f8d8cace0c7d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1582,7 +1582,8 @@ nla_put_failure:
 }
 
 static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
-				   const struct nft_base_chain *basechain)
+				   const struct nft_base_chain *basechain,
+				   const struct list_head *hook_list)
 {
 	const struct nf_hook_ops *ops = &basechain->ops;
 	struct nft_hook *hook, *first = NULL;
@@ -1599,7 +1600,11 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family,
 
 	if (nft_base_chain_netdev(family, ops->hooknum)) {
 		nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
-		list_for_each_entry(hook, &basechain->hook_list, list) {
+
+		if (!hook_list)
+			hook_list = &basechain->hook_list;
+
+		list_for_each_entry(hook, hook_list, list) {
 			if (!first)
 				first = hook;
 
@@ -1624,7 +1629,8 @@ nla_put_failure:
 static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 				     u32 portid, u32 seq, int event, u32 flags,
 				     int family, const struct nft_table *table,
-				     const struct nft_chain *chain)
+				     const struct nft_chain *chain,
+				     const struct list_head *hook_list)
 {
 	struct nlmsghdr *nlh;
 
@@ -1649,7 +1655,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
 		const struct nft_base_chain *basechain = nft_base_chain(chain);
 		struct nft_stats __percpu *stats;
 
-		if (nft_dump_basechain_hook(skb, family, basechain))
+		if (nft_dump_basechain_hook(skb, family, basechain, hook_list))
 			goto nla_put_failure;
 
 		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
@@ -1684,7 +1690,8 @@ nla_put_failure:
 	return -1;
 }
 
-static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
+static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
+				   const struct list_head *hook_list)
 {
 	struct nftables_pernet *nft_net;
 	struct sk_buff *skb;
@@ -1704,7 +1711,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event)
 
 	err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq,
 					event, flags, ctx->family, ctx->table,
-					ctx->chain);
+					ctx->chain, hook_list);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
@@ -1750,7 +1757,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 						      NFT_MSG_NEWCHAIN,
 						      NLM_F_MULTI,
 						      table->family, table,
-						      chain) < 0)
+						      chain, NULL) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -1804,7 +1811,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
 
 	err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
 					info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
-					0, family, table, chain);
+					0, family, table, chain, NULL);
 	if (err < 0)
 		goto err_fill_chain_info;
 
@@ -2048,9 +2055,10 @@ static int nft_chain_parse_netdev(struct net *net,
 }
 
 static int nft_chain_parse_hook(struct net *net,
+				struct nft_base_chain *basechain,
 				const struct nlattr * const nla[],
 				struct nft_chain_hook *hook, u8 family,
-				struct netlink_ext_ack *extack, bool autoload)
+				struct netlink_ext_ack *extack)
 {
 	struct nftables_pernet *nft_net = nft_pernet(net);
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
@@ -2066,31 +2074,46 @@ static int nft_chain_parse_hook(struct net *net,
 	if (err < 0)
 		return err;
 
-	if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
-	    ha[NFTA_HOOK_PRIORITY] == NULL)
-		return -EINVAL;
+	if (!basechain) {
+		if (!ha[NFTA_HOOK_HOOKNUM] ||
+		    !ha[NFTA_HOOK_PRIORITY])
+			return -EINVAL;
 
-	hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
-	hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+		hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 
-	type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
-	if (!type)
-		return -EOPNOTSUPP;
+		type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
+		if (!type)
+			return -EOPNOTSUPP;
 
-	if (nla[NFTA_CHAIN_TYPE]) {
-		type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
-						   family, autoload);
-		if (IS_ERR(type)) {
-			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
-			return PTR_ERR(type);
+		if (nla[NFTA_CHAIN_TYPE]) {
+			type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+							   family, true);
+			if (IS_ERR(type)) {
+				NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
+				return PTR_ERR(type);
+			}
 		}
-	}
-	if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
-		return -EOPNOTSUPP;
+		if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
+			return -EOPNOTSUPP;
 
-	if (type->type == NFT_CHAIN_T_NAT &&
-	    hook->priority <= NF_IP_PRI_CONNTRACK)
-		return -EOPNOTSUPP;
+		if (type->type == NFT_CHAIN_T_NAT &&
+		    hook->priority <= NF_IP_PRI_CONNTRACK)
+			return -EOPNOTSUPP;
+	} else {
+		if (ha[NFTA_HOOK_HOOKNUM]) {
+			hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+			if (hook->num != basechain->ops.hooknum)
+				return -EOPNOTSUPP;
+		}
+		if (ha[NFTA_HOOK_PRIORITY]) {
+			hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
+			if (hook->priority != basechain->ops.priority)
+				return -EOPNOTSUPP;
+		}
+
+		type = basechain->type;
+	}
 
 	if (!try_module_get(type->owner)) {
 		if (nla[NFTA_CHAIN_TYPE])
@@ -2184,12 +2207,8 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
 		list_splice_init(&hook->list, &basechain->hook_list);
 		list_for_each_entry(h, &basechain->hook_list, list)
 			nft_basechain_hook_init(&h->ops, family, hook, chain);
-
-		basechain->ops.hooknum	= hook->num;
-		basechain->ops.priority	= hook->priority;
-	} else {
-		nft_basechain_hook_init(&basechain->ops, family, hook, chain);
 	}
+	nft_basechain_hook_init(&basechain->ops, family, hook, chain);
 
 	chain->flags |= NFT_CHAIN_BASE | flags;
 	basechain->policy = NF_ACCEPT;
@@ -2239,13 +2258,13 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nft_stats __percpu *stats = NULL;
-		struct nft_chain_hook hook;
+		struct nft_chain_hook hook = {};
 
 		if (flags & NFT_CHAIN_BINDING)
 			return -EOPNOTSUPP;
 
-		err = nft_chain_parse_hook(net, nla, &hook, family, extack,
-					   true);
+		err = nft_chain_parse_hook(net, NULL, nla, &hook, family,
+					   extack);
 		if (err < 0)
 			return err;
 
@@ -2359,65 +2378,57 @@ err_destroy_chain:
 	return err;
 }
 
-static bool nft_hook_list_equal(struct list_head *hook_list1,
-				struct list_head *hook_list2)
-{
-	struct nft_hook *hook;
-	int n = 0, m = 0;
-
-	n = 0;
-	list_for_each_entry(hook, hook_list2, list) {
-		if (!nft_hook_list_find(hook_list1, hook))
-			return false;
-
-		n++;
-	}
-	list_for_each_entry(hook, hook_list1, list)
-		m++;
-
-	return n == m;
-}
-
 static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			      u32 flags, const struct nlattr *attr,
 			      struct netlink_ext_ack *extack)
 {
 	const struct nlattr * const *nla = ctx->nla;
+	struct nft_base_chain *basechain = NULL;
 	struct nft_table *table = ctx->table;
 	struct nft_chain *chain = ctx->chain;
-	struct nft_base_chain *basechain;
+	struct nft_chain_hook hook = {};
 	struct nft_stats *stats = NULL;
-	struct nft_chain_hook hook;
+	struct nft_hook *h, *next;
 	struct nf_hook_ops *ops;
 	struct nft_trans *trans;
+	bool unregister = false;
 	int err;
 
 	if (chain->flags ^ flags)
 		return -EOPNOTSUPP;
 
+	INIT_LIST_HEAD(&hook.list);
+
 	if (nla[NFTA_CHAIN_HOOK]) {
 		if (!nft_is_base_chain(chain)) {
 			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
-		err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
-					   extack, false);
+
+		basechain = nft_base_chain(chain);
+		err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
+					   ctx->family, extack);
 		if (err < 0)
 			return err;
 
-		basechain = nft_base_chain(chain);
 		if (basechain->type != hook.type) {
 			nft_chain_release_hook(&hook);
 			NL_SET_BAD_ATTR(extack, attr);
 			return -EEXIST;
 		}
 
-		if (nft_base_chain_netdev(ctx->family, hook.num)) {
-			if (!nft_hook_list_equal(&basechain->hook_list,
-						 &hook.list)) {
-				nft_chain_release_hook(&hook);
-				NL_SET_BAD_ATTR(extack, attr);
-				return -EEXIST;
+		if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
+			list_for_each_entry_safe(h, next, &hook.list, list) {
+				h->ops.pf	= basechain->ops.pf;
+				h->ops.hooknum	= basechain->ops.hooknum;
+				h->ops.priority	= basechain->ops.priority;
+				h->ops.priv	= basechain->ops.priv;
+				h->ops.hook	= basechain->ops.hook;
+
+				if (nft_hook_list_find(&basechain->hook_list, h)) {
+					list_del(&h->list);
+					kfree(h);
+				}
 			}
 		} else {
 			ops = &basechain->ops;
@@ -2428,7 +2439,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 				return -EEXIST;
 			}
 		}
-		nft_chain_release_hook(&hook);
 	}
 
 	if (nla[NFTA_CHAIN_HANDLE] &&
@@ -2439,24 +2449,43 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 					  nla[NFTA_CHAIN_NAME], genmask);
 		if (!IS_ERR(chain2)) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
-			return -EEXIST;
+			err = -EEXIST;
+			goto err_hooks;
 		}
 	}
 
 	if (nla[NFTA_CHAIN_COUNTERS]) {
-		if (!nft_is_base_chain(chain))
-			return -EOPNOTSUPP;
+		if (!nft_is_base_chain(chain)) {
+			err = -EOPNOTSUPP;
+			goto err_hooks;
+		}
 
 		stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
-		if (IS_ERR(stats))
-			return PTR_ERR(stats);
+		if (IS_ERR(stats)) {
+			err = PTR_ERR(stats);
+			goto err_hooks;
+		}
 	}
 
+	if (!(table->flags & NFT_TABLE_F_DORMANT) &&
+	    nft_is_base_chain(chain) &&
+	    !list_empty(&hook.list)) {
+		basechain = nft_base_chain(chain);
+		ops = &basechain->ops;
+
+		if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
+			err = nft_netdev_register_hooks(ctx->net, &hook.list);
+			if (err < 0)
+				goto err_hooks;
+		}
+	}
+
+	unregister = true;
 	err = -ENOMEM;
 	trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
 				sizeof(struct nft_trans_chain));
 	if (trans == NULL)
-		goto err;
+		goto err_trans;
 
 	nft_trans_chain_stats(trans) = stats;
 	nft_trans_chain_update(trans) = true;
@@ -2475,7 +2504,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 		err = -ENOMEM;
 		name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
 		if (!name)
-			goto err;
+			goto err_trans;
 
 		err = -EEXIST;
 		list_for_each_entry(tmp, &nft_net->commit_list, list) {
@@ -2486,18 +2515,35 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 			    strcmp(name, nft_trans_chain_name(tmp)) == 0) {
 				NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
 				kfree(name);
-				goto err;
+				goto err_trans;
 			}
 		}
 
 		nft_trans_chain_name(trans) = name;
 	}
+
+	nft_trans_basechain(trans) = basechain;
+	INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
+	list_splice(&hook.list, &nft_trans_chain_hooks(trans));
+
 	nft_trans_commit_list_add_tail(ctx->net, trans);
 
 	return 0;
-err:
+
+err_trans:
 	free_percpu(stats);
 	kfree(trans);
+err_hooks:
+	if (nla[NFTA_CHAIN_HOOK]) {
+		list_for_each_entry_safe(h, next, &hook.list, list) {
+			if (unregister)
+				nf_unregister_net_hook(ctx->net, &h->ops);
+			list_del(&h->list);
+			kfree_rcu(h, rcu);
+		}
+		module_put(hook.type->owner);
+	}
+
 	return err;
 }
 
@@ -9244,19 +9290,22 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
 				nft_chain_commit_update(trans);
-				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN,
+						       &nft_trans_chain_hooks(trans));
+				list_splice(&nft_trans_chain_hooks(trans),
+					    &nft_trans_basechain(trans)->hook_list);
 				/* trans destroyed after rcu grace period */
 			} else {
 				nft_chain_commit_drop_policy(trans);
 				nft_clear(net, trans->ctx.chain);
-				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+				nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN, NULL);
 				nft_trans_destroy(trans);
 			}
 			break;
 		case NFT_MSG_DELCHAIN:
 		case NFT_MSG_DESTROYCHAIN:
 			nft_chain_del(trans->ctx.chain);
-			nf_tables_chain_notify(&trans->ctx, trans->msg_type);
+			nf_tables_chain_notify(&trans->ctx, trans->msg_type, NULL);
 			nf_tables_unregister_hook(trans->ctx.net,
 						  trans->ctx.table,
 						  trans->ctx.chain);
@@ -9423,7 +9472,10 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		nf_tables_table_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_NEWCHAIN:
-		nf_tables_chain_destroy(&trans->ctx);
+		if (nft_trans_chain_update(trans))
+			nft_hooks_destroy(&nft_trans_chain_hooks(trans));
+		else
+			nf_tables_chain_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_NEWRULE:
 		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
@@ -9486,6 +9538,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
 			break;
 		case NFT_MSG_NEWCHAIN:
 			if (nft_trans_chain_update(trans)) {
+				nft_netdev_unregister_hooks(net,
+							    &nft_trans_chain_hooks(trans),
+							    true);
 				free_percpu(nft_trans_chain_stats(trans));
 				kfree(nft_trans_chain_name(trans));
 				nft_trans_destroy(trans);
-- 
cgit v1.2.3


From e0416e7d33361d2ad0bf9f007428346579ac854a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 21 Apr 2023 23:03:46 +0100
Subject: rxrpc: Fix potential race in error handling in afs_make_call()

If the rxrpc call set up by afs_make_call() receives an error whilst it is
transmitting the request, there's the possibility that it may get to the
point where the rxrpc call is ended (after the error_kill_call label) just
as the call is queued for async processing.

This could manifest itself as call->rxcall being seen as NULL in
afs_deliver_to_call() when it tries to lock the call.

Fix this by splitting rxrpc_kernel_end_call() into a function to shut down
an rxrpc call and a function to release the caller's reference and calling
the latter only when we get to afs_put_call().

Reported-by: Jeffrey Altman <jaltman@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: kafs-testing+fedora36_64checkkafs-build-306@auristor.com
cc: Marc Dionne <marc.dionne@auristor.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: linux-afs@lists.infradead.org
cc: netdev@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/rxrpc.rst | 17 ++++++++++++-----
 fs/afs/rxrpc.c                     |  9 ++++-----
 include/net/af_rxrpc.h             |  3 ++-
 net/rxrpc/af_rxrpc.c               | 37 +++++++++++++++++++++++++------------
 net/rxrpc/rxperf.c                 |  3 ++-
 5 files changed, 45 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst
index ec1323d92c96..e807e18ba32a 100644
--- a/Documentation/networking/rxrpc.rst
+++ b/Documentation/networking/rxrpc.rst
@@ -848,14 +848,21 @@ The kernel interface functions are as follows:
      returned.  The caller now holds a reference on this and it must be
      properly ended.
 
- (#) End a client call::
+ (#) Shut down a client call::
 
-	void rxrpc_kernel_end_call(struct socket *sock,
+	void rxrpc_kernel_shutdown_call(struct socket *sock,
+					struct rxrpc_call *call);
+
+     This is used to shut down a previously begun call.  The user_call_ID is
+     expunged from AF_RXRPC's knowledge and will not be seen again in
+     association with the specified call.
+
+ (#) Release the ref on a client call::
+
+	void rxrpc_kernel_put_call(struct socket *sock,
 				   struct rxrpc_call *call);
 
-     This is used to end a previously begun call.  The user_call_ID is expunged
-     from AF_RXRPC's knowledge and will not be seen again in association with
-     the specified call.
+     This is used to release the caller's ref on an rxrpc call.
 
  (#) Send data through a call::
 
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 7817e2b860e5..e08b850c3e6d 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -179,7 +179,8 @@ void afs_put_call(struct afs_call *call)
 		ASSERT(call->type->name != NULL);
 
 		if (call->rxcall) {
-			rxrpc_kernel_end_call(net->socket, call->rxcall);
+			rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
+			rxrpc_kernel_put_call(net->socket, call->rxcall);
 			call->rxcall = NULL;
 		}
 		if (call->type->destructor)
@@ -420,10 +421,8 @@ error_kill_call:
 	 * The call, however, might be queued on afs_async_calls and we need to
 	 * make sure we don't get any more notifications that might requeue it.
 	 */
-	if (call->rxcall) {
-		rxrpc_kernel_end_call(call->net->socket, call->rxcall);
-		call->rxcall = NULL;
-	}
+	if (call->rxcall)
+		rxrpc_kernel_shutdown_call(call->net->socket, call->rxcall);
 	if (call->async) {
 		if (cancel_work_sync(&call->async_work))
 			afs_put_call(call);
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index ba717eac0229..01a35e113ab9 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -57,7 +57,8 @@ int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *,
 			   struct iov_iter *, size_t *, bool, u32 *, u16 *);
 bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *,
 			     u32, int, enum rxrpc_abort_reason);
-void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *);
+void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call);
+void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call);
 void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *,
 			   struct sockaddr_rxrpc *);
 bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *);
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 102f5cbff91a..c32b164206f9 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -342,31 +342,44 @@ static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall,
 }
 
 /**
- * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using
+ * rxrpc_kernel_shutdown_call - Allow a kernel service to shut down a call it was using
  * @sock: The socket the call is on
  * @call: The call to end
  *
- * Allow a kernel service to end a call it was using.  The call must be
+ * Allow a kernel service to shut down a call it was using.  The call must be
  * complete before this is called (the call should be aborted if necessary).
  */
-void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call)
+void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call)
 {
 	_enter("%d{%d}", call->debug_id, refcount_read(&call->ref));
 
 	mutex_lock(&call->user_mutex);
-	rxrpc_release_call(rxrpc_sk(sock->sk), call);
-
-	/* Make sure we're not going to call back into a kernel service */
-	if (call->notify_rx) {
-		spin_lock(&call->notify_lock);
-		call->notify_rx = rxrpc_dummy_notify_rx;
-		spin_unlock(&call->notify_lock);
+	if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) {
+		rxrpc_release_call(rxrpc_sk(sock->sk), call);
+
+		/* Make sure we're not going to call back into a kernel service */
+		if (call->notify_rx) {
+			spin_lock(&call->notify_lock);
+			call->notify_rx = rxrpc_dummy_notify_rx;
+			spin_unlock(&call->notify_lock);
+		}
 	}
-
 	mutex_unlock(&call->user_mutex);
+}
+EXPORT_SYMBOL(rxrpc_kernel_shutdown_call);
+
+/**
+ * rxrpc_kernel_put_call - Release a reference to a call
+ * @sock: The socket the call is on
+ * @call: The call to put
+ *
+ * Drop the application's ref on an rxrpc call.
+ */
+void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call)
+{
 	rxrpc_put_call(call, rxrpc_call_put_kernel);
 }
-EXPORT_SYMBOL(rxrpc_kernel_end_call);
+EXPORT_SYMBOL(rxrpc_kernel_put_call);
 
 /**
  * rxrpc_kernel_check_life - Check to see whether a call is still alive
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index 4a2e90015ca7..085e7892d310 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -342,7 +342,8 @@ static void rxperf_deliver_to_call(struct work_struct *work)
 call_complete:
 	rxperf_set_call_complete(call, ret, remote_abort);
 	/* The call may have been requeued */
-	rxrpc_kernel_end_call(rxperf_socket, call->rxcall);
+	rxrpc_kernel_shutdown_call(rxperf_socket, call->rxcall);
+	rxrpc_kernel_put_call(rxperf_socket, call->rxcall);
 	cancel_work(&call->work);
 	kfree(call);
 }
-- 
cgit v1.2.3


From 6d26d985eeda89faedabbcf6607c37454b9691b0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 22 Apr 2023 09:35:44 +0200
Subject: bpf: fix link failure with NETFILTER=y INET=n

Explicitly check if NETFILTER_BPF_LINK is enabled, else configs
that have NETFILTER=y but CONFIG_INET=n fail to link:

> kernel/bpf/syscall.o: undefined reference to `netfilter_prog_ops'
> kernel/bpf/verifier.o: undefined reference to `netfilter_verifier_ops'

Fixes: fd9c663b9ad6 ("bpf: minimal support for programs hooked into netfilter framework")
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202304220903.fRZTJtxe-lkp@intel.com/
Signed-off-by: Florian Westphal <fw@strlen.de>
Link: https://lore.kernel.org/r/20230422073544.17634-1-fw@strlen.de
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 39a999abb0ce..fc0d6f32c687 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -79,7 +79,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 #endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 	      void *, void *)
-#ifdef CONFIG_NETFILTER
+#ifdef CONFIG_NETFILTER_BPF_LINK
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
-- 
cgit v1.2.3


From 87eff2ec57b6d68d294013d8dd21e839a1175e3a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Apr 2023 09:43:57 +0000
Subject: net: optimize napi_threaded_poll() vs RPS/RFS

We use napi_threaded_poll() in order to reduce our softirq dependency.

As a follow-up to 821eba962d95 ("net: optimize napi_schedule_rps()"), we
can further remove the need to fire NET_RX_SOFTIRQ whenever RPS/RFS are
used.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 net/core/dev.c            | 12 ++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a6a3e9457d6c..08fbd4622ccf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3194,7 +3194,10 @@ struct softnet_data {
 #ifdef CONFIG_RPS
 	struct softnet_data	*rps_ipi_list;
 #endif
+
 	bool			in_net_rx_action;
+	bool			in_napi_threaded_poll;
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 	struct sd_flow_limit __rcu *flow_limit;
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 7d9ec23f97c6..735096d42c1d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4603,10 +4603,10 @@ static void napi_schedule_rps(struct softnet_data *sd)
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 		mysd->rps_ipi_list = sd;
 
-		/* If not called from net_rx_action()
+		/* If not called from net_rx_action() or napi_threaded_poll()
 		 * we have to raise NET_RX_SOFTIRQ.
 		 */
-		if (!mysd->in_net_rx_action)
+		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
 			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 		return;
 	}
@@ -6631,11 +6631,19 @@ static int napi_threaded_poll(void *data)
 
 			local_bh_disable();
 			sd = this_cpu_ptr(&softnet_data);
+			sd->in_napi_threaded_poll = true;
 
 			have = netpoll_poll_lock(napi);
 			__napi_poll(napi, &repoll);
 			netpoll_poll_unlock(have);
 
+			sd->in_napi_threaded_poll = false;
+			barrier();
+
+			if (sd_has_rps_ipi_waiting(sd)) {
+				local_irq_disable();
+				net_rps_action_and_irq_enable(sd);
+			}
 			skb_defer_free_flush(sd);
 			local_bh_enable();
 
-- 
cgit v1.2.3


From f90615ada0b1e21a9d93ff89b04549fd7a92c92b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 21 Apr 2023 01:55:53 +0300
Subject: net: vlan: don't adjust MAC header in __vlan_insert_inner_tag()
 unless set

This is a preparatory change for the deletion of skb_reset_mac_header(skb)
from __dev_queue_xmit(). After that deletion, skb_mac_header(skb) will
no longer be set in TX paths, from which __vlan_insert_inner_tag() can
still be called (perhaps indirectly).

If we don't make this change, then an unset MAC header (equal to ~0U)
will wrongly appear to be set after being adjusted by VLAN_HLEN.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 6864b89ef868..90b76d63c11c 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -351,7 +351,8 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
 	/* Move the mac header sans proto to the beginning of the new header. */
 	if (likely(mac_len > ETH_TLEN))
 		memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
-	skb->mac_header -= VLAN_HLEN;
+	if (skb_mac_header_was_set(skb))
+		skb->mac_header -= VLAN_HLEN;
 
 	veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);
 
-- 
cgit v1.2.3


From 1f5020acb33f926030f62563c86dffca35c7b701 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 21 Apr 2023 01:55:54 +0300
Subject: net: vlan: introduce skb_vlan_eth_hdr()

Similar to skb_eth_hdr() introduced in commit 96cc4b69581d ("macvlan: do
not assume mac_header is set in macvlan_broadcast()"), let's introduce a
skb_vlan_eth_hdr() helper which can be used in TX-only code paths to get
to the VLAN header based on skb->data rather than based on the
skb_mac_header(skb).

We also consolidate the drivers that dereference skb->data to go through
this helper.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c      |  3 +--
 drivers/net/ethernet/emulex/benet/be_main.c          |  2 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c      |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c          |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c        |  2 +-
 drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c |  2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c       |  4 ++--
 drivers/net/ethernet/sfc/tx_tso.c                    |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c    |  7 ++-----
 drivers/staging/gdm724x/gdm_lte.c                    |  4 ++--
 include/linux/if_vlan.h                              | 12 ++++++++++--
 net/batman-adv/soft-interface.c                      |  2 +-
 12 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 12083b9679b5..6ea5521074d3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1935,8 +1935,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb,
 
 		/* Skip VLAN tag if present */
 		if (ether_type == ETH_P_8021Q) {
-			struct vlan_ethhdr *vhdr =
-				(struct vlan_ethhdr *)skb->data;
+			struct vlan_ethhdr *vhdr = skb_vlan_eth_hdr(skb);
 
 			ether_type = ntohs(vhdr->h_vlan_encapsulated_proto);
 		}
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index aed1b622f51f..7e408bcc88de 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1124,7 +1124,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
 						  struct be_wrb_params
 						  *wrb_params)
 {
-	struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+	struct vlan_ethhdr *veh = skb_vlan_eth_hdr(skb);
 	unsigned int eth_hdr_len;
 	struct iphdr *ip;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 5caea154362f..7356ad965487 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1532,7 +1532,7 @@ static int hns3_handle_vtags(struct hns3_enet_ring *tx_ring,
 	if (unlikely(rc < 0))
 		return rc;
 
-	vhdr = (struct vlan_ethhdr *)skb->data;
+	vhdr = skb_vlan_eth_hdr(skb);
 	vhdr->h_vlan_TCI |= cpu_to_be16((skb->priority << VLAN_PRIO_SHIFT)
 					 & VLAN_PRIO_MASK);
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index c8c2cbaa0ede..8b8bf4880faa 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3063,7 +3063,7 @@ static inline int i40e_tx_prepare_vlan_flags(struct sk_buff *skb,
 			rc = skb_cow_head(skb, 0);
 			if (rc < 0)
 				return rc;
-			vhdr = (struct vlan_ethhdr *)skb->data;
+			vhdr = skb_vlan_eth_hdr(skb);
 			vhdr->h_vlan_TCI = htons(tx_flags >>
 						 I40E_TX_FLAGS_VLAN_SHIFT);
 		} else {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f2604fc05991..e961ef4bbf4d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8798,7 +8798,7 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
 
 			if (skb_cow_head(skb, 0))
 				goto out_drop;
-			vhdr = (struct vlan_ethhdr *)skb->data;
+			vhdr = skb_vlan_eth_hdr(skb);
 			vhdr->h_vlan_TCI = htons(tx_flags >>
 						 IXGBE_TX_FLAGS_VLAN_SHIFT);
 		} else {
diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 59d0dd862fd1..1d1e183d3a8b 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -1854,7 +1854,7 @@ netxen_tso_check(struct net_device *netdev,
 
 	if (protocol == cpu_to_be16(ETH_P_8021Q)) {
 
-		vh = (struct vlan_ethhdr *)skb->data;
+		vh = skb_vlan_eth_hdr(skb);
 		protocol = vh->h_vlan_encapsulated_proto;
 		flags = FLAGS_VLAN_TAGGED;
 
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
index 92930a055cbc..41894d154013 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c
@@ -318,7 +318,7 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter,
 
 	if (adapter->flags & QLCNIC_VLAN_FILTERING) {
 		if (protocol == ETH_P_8021Q) {
-			vh = (struct vlan_ethhdr *)skb->data;
+			vh = skb_vlan_eth_hdr(skb);
 			vlan_id = ntohs(vh->h_vlan_TCI);
 		} else if (skb_vlan_tag_present(skb)) {
 			vlan_id = skb_vlan_tag_get(skb);
@@ -468,7 +468,7 @@ static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter,
 	u32 producer = tx_ring->producer;
 
 	if (protocol == ETH_P_8021Q) {
-		vh = (struct vlan_ethhdr *)skb->data;
+		vh = skb_vlan_eth_hdr(skb);
 		flags = QLCNIC_FLAGS_VLAN_TAGGED;
 		vlan_tci = ntohs(vh->h_vlan_TCI);
 		protocol = ntohs(vh->h_vlan_encapsulated_proto);
diff --git a/drivers/net/ethernet/sfc/tx_tso.c b/drivers/net/ethernet/sfc/tx_tso.c
index 898e5c61d908..d381d8164f07 100644
--- a/drivers/net/ethernet/sfc/tx_tso.c
+++ b/drivers/net/ethernet/sfc/tx_tso.c
@@ -147,7 +147,7 @@ static __be16 efx_tso_check_protocol(struct sk_buff *skb)
 	EFX_WARN_ON_ONCE_PARANOID(((struct ethhdr *)skb->data)->h_proto !=
 				  protocol);
 	if (protocol == htons(ETH_P_8021Q)) {
-		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+		struct vlan_ethhdr *veh = skb_vlan_eth_hdr(skb);
 
 		protocol = veh->h_vlan_encapsulated_proto;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index f116e4ae293b..0fca81507a77 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4569,13 +4569,10 @@ dma_map_err:
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-	struct vlan_ethhdr *veth;
-	__be16 vlan_proto;
+	struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb);
+	__be16 vlan_proto = veth->h_vlan_proto;
 	u16 vlanid;
 
-	veth = (struct vlan_ethhdr *)skb->data;
-	vlan_proto = veth->h_vlan_proto;
-
 	if ((vlan_proto == htons(ETH_P_8021Q) &&
 	     dev->features & NETIF_F_HW_VLAN_CTAG_RX) ||
 	    (vlan_proto == htons(ETH_P_8021AD) &&
diff --git a/drivers/staging/gdm724x/gdm_lte.c b/drivers/staging/gdm724x/gdm_lte.c
index 671ee8843c88..5703a9ddb6d0 100644
--- a/drivers/staging/gdm724x/gdm_lte.c
+++ b/drivers/staging/gdm724x/gdm_lte.c
@@ -349,7 +349,7 @@ static s32 gdm_lte_tx_nic_type(struct net_device *dev, struct sk_buff *skb)
 	/* Get ethernet protocol */
 	eth = (struct ethhdr *)skb->data;
 	if (ntohs(eth->h_proto) == ETH_P_8021Q) {
-		vlan_eth = (struct vlan_ethhdr *)skb->data;
+		vlan_eth = skb_vlan_eth_hdr(skb);
 		mac_proto = ntohs(vlan_eth->h_vlan_encapsulated_proto);
 		network_data = skb->data + VLAN_ETH_HLEN;
 		nic_type |= NIC_TYPE_F_VLAN;
@@ -435,7 +435,7 @@ static netdev_tx_t gdm_lte_tx(struct sk_buff *skb, struct net_device *dev)
 	 * driver based on the NIC mac
 	 */
 	if (nic_type & NIC_TYPE_F_VLAN) {
-		struct vlan_ethhdr *vlan_eth = (struct vlan_ethhdr *)skb->data;
+		struct vlan_ethhdr *vlan_eth = skb_vlan_eth_hdr(skb);
 
 		nic->vlan_id = ntohs(vlan_eth->h_vlan_TCI) & VLAN_VID_MASK;
 		data_buf = skb->data + (VLAN_ETH_HLEN - ETH_HLEN);
diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 90b76d63c11c..3698f2b391cd 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -62,6 +62,14 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
 	return (struct vlan_ethhdr *)skb_mac_header(skb);
 }
 
+/* Prefer this version in TX path, instead of
+ * skb_reset_mac_header() + vlan_eth_hdr()
+ */
+static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
+{
+	return (struct vlan_ethhdr *)skb->data;
+}
+
 #define VLAN_PRIO_MASK		0xe000 /* Priority Code Point */
 #define VLAN_PRIO_SHIFT		13
 #define VLAN_CFI_MASK		0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
@@ -529,7 +537,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
  */
 static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
 {
-	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data;
+	struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb);
 
 	if (!eth_type_vlan(veth->h_vlan_proto))
 		return -EINVAL;
@@ -713,7 +721,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
 		if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
 			return false;
 
-		veh = (struct vlan_ethhdr *)skb->data;
+		veh = skb_vlan_eth_hdr(skb);
 		protocol = veh->h_vlan_encapsulated_proto;
 	}
 
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 125f4628687c..d3fdf82282af 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -439,7 +439,7 @@ void batadv_interface_rx(struct net_device *soft_iface,
 		if (!pskb_may_pull(skb, VLAN_ETH_HLEN))
 			goto dropped;
 
-		vhdr = (struct vlan_ethhdr *)skb->data;
+		vhdr = skb_vlan_eth_hdr(skb);
 
 		/* drop batman-in-batman packets to prevent loops */
 		if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
-- 
cgit v1.2.3


From 0bcf2e4aca6c29a07555b713f2fb461dc38d5977 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 21 Apr 2023 01:56:01 +0300
Subject: net: dsa: tag_ocelot: call only the relevant portion of
 __skb_vlan_pop() on TX

ocelot_xmit_get_vlan_info() calls __skb_vlan_pop() as the most
appropriate helper I could find which strips away a VLAN header.
That's all I need it to do, but __skb_vlan_pop() has more logic, which
will become incompatible with the future revert of commit 6d1ccff62780
("net: reset mac header in dev_start_xmit()").

Namely, it performs a sanity check on skb_mac_header(), which will stop
being set after the above revert, so it will return an error instead of
removing the VLAN tag.

ocelot_xmit_get_vlan_info() gets called in 2 circumstances:

(1) the port is under a VLAN-aware bridge and the bridge sends
    VLAN-tagged packets

(2) the port is under a VLAN-aware bridge and somebody else (an 8021q
    upper) sends VLAN-tagged packets (using a VID that isn't in the
    bridge vlan tables)

In case (1), there is actually no bug to defend against, because
br_dev_xmit() calls skb_reset_mac_header() and things continue to work.

However, in case (2), illustrated using the commands below, it can be
seen that our intervention is needed, since __skb_vlan_pop() complains:

$ ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up
$ ip link set $eth master br0 && ip link set $eth up
$ ip link add link $eth name $eth.100 type vlan id 100 && ip link set $eth.100 up
$ ip addr add 192.168.100.1/24 dev $eth.100

I could fend off the checks in __skb_vlan_pop() with some
skb_mac_header_was_set() calls, but seeing how few callers of
__skb_vlan_pop() there are from TX paths, that seems rather
unproductive.

As an alternative solution, extract the bare minimum logic to strip a
VLAN header, and move it to a new helper named vlan_remove_tag(), close
to the definition of vlan_insert_tag(). Document it appropriately and
make ocelot_xmit_get_vlan_info() call this smaller helper instead.

Seeing that it doesn't appear illegal to test skb->protocol in the TX
path, I guess it would be good for vlan_remove_tag() to also absorb
the vlan_set_encap_proto() function call.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 21 +++++++++++++++++++++
 net/core/skbuff.c       |  8 +-------
 net/dsa/tag_ocelot.c    |  2 +-
 3 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3698f2b391cd..0f40f379d75c 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -685,6 +685,27 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
 		skb->protocol = htons(ETH_P_802_2);
 }
 
+/**
+ * vlan_remove_tag - remove outer VLAN tag from payload
+ * @skb: skbuff to remove tag from
+ * @vlan_tci: buffer to store value
+ *
+ * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
+ * pointing at the MAC header.
+ *
+ * Returns a new pointer to skb->data, or NULL on failure to pull.
+ */
+static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
+{
+	struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+
+	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
+
+	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+	vlan_set_encap_proto(skb, vhdr);
+	return __skb_pull(skb, VLAN_HLEN);
+}
+
 /**
  * skb_vlan_tagged - check if skb is vlan tagged.
  * @skb: skbuff to query
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 304a966164d8..c7c141f6fc14 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5996,7 +5996,6 @@ EXPORT_SYMBOL(skb_ensure_writable);
  */
 int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 {
-	struct vlan_hdr *vhdr;
 	int offset = skb->data - skb_mac_header(skb);
 	int err;
 
@@ -6012,13 +6011,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
 
 	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
 
-	vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
-	*vlan_tci = ntohs(vhdr->h_vlan_TCI);
-
-	memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
-	__skb_pull(skb, VLAN_HLEN);
+	vlan_remove_tag(skb, vlan_tci);
 
-	vlan_set_encap_proto(skb, vhdr);
 	skb->mac_header += VLAN_HLEN;
 
 	if (skb_network_offset(skb) < ETH_HLEN)
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
index 73ee09de1a3a..20bf7074d5a6 100644
--- a/net/dsa/tag_ocelot.c
+++ b/net/dsa/tag_ocelot.c
@@ -30,7 +30,7 @@ static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
 	br_vlan_get_proto(br, &proto);
 
 	if (ntohs(hdr->h_vlan_proto) == proto) {
-		__skb_vlan_pop(skb, &tci);
+		vlan_remove_tag(skb, &tci);
 		*vlan_tci = tci;
 	} else {
 		rcu_read_lock();
-- 
cgit v1.2.3


From a80d2c545ded86d0350b9a870735565d8b749786 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 13 Feb 2023 14:28:55 -0800
Subject: Bluetooth: MGMT: Use BIT macro when defining bitfields

This makes use of BIT macro when defining bitfields which makes it
clearer what bit it is toggling.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/mgmt.h | 80 ++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index e18a927669c0..a5801649f619 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -91,26 +91,26 @@ struct mgmt_rp_read_index_list {
 #define MGMT_MAX_NAME_LENGTH		(HCI_MAX_NAME_LENGTH + 1)
 #define MGMT_MAX_SHORT_NAME_LENGTH	(HCI_MAX_SHORT_NAME_LENGTH + 1)
 
-#define MGMT_SETTING_POWERED		0x00000001
-#define MGMT_SETTING_CONNECTABLE	0x00000002
-#define MGMT_SETTING_FAST_CONNECTABLE	0x00000004
-#define MGMT_SETTING_DISCOVERABLE	0x00000008
-#define MGMT_SETTING_BONDABLE		0x00000010
-#define MGMT_SETTING_LINK_SECURITY	0x00000020
-#define MGMT_SETTING_SSP		0x00000040
-#define MGMT_SETTING_BREDR		0x00000080
-#define MGMT_SETTING_HS			0x00000100
-#define MGMT_SETTING_LE			0x00000200
-#define MGMT_SETTING_ADVERTISING	0x00000400
-#define MGMT_SETTING_SECURE_CONN	0x00000800
-#define MGMT_SETTING_DEBUG_KEYS		0x00001000
-#define MGMT_SETTING_PRIVACY		0x00002000
-#define MGMT_SETTING_CONFIGURATION	0x00004000
-#define MGMT_SETTING_STATIC_ADDRESS	0x00008000
-#define MGMT_SETTING_PHY_CONFIGURATION	0x00010000
-#define MGMT_SETTING_WIDEBAND_SPEECH	0x00020000
-#define MGMT_SETTING_CIS_CENTRAL	0x00040000
-#define MGMT_SETTING_CIS_PERIPHERAL	0x00080000
+#define MGMT_SETTING_POWERED		BIT(0)
+#define MGMT_SETTING_CONNECTABLE	BIT(1)
+#define MGMT_SETTING_FAST_CONNECTABLE	BIT(2)
+#define MGMT_SETTING_DISCOVERABLE	BIT(3)
+#define MGMT_SETTING_BONDABLE		BIT(4)
+#define MGMT_SETTING_LINK_SECURITY	BIT(5)
+#define MGMT_SETTING_SSP		BIT(6)
+#define MGMT_SETTING_BREDR		BIT(7)
+#define MGMT_SETTING_HS			BIT(8)
+#define MGMT_SETTING_LE			BIT(9)
+#define MGMT_SETTING_ADVERTISING	BIT(10)
+#define MGMT_SETTING_SECURE_CONN	BIT(11)
+#define MGMT_SETTING_DEBUG_KEYS		BIT(12)
+#define MGMT_SETTING_PRIVACY		BIT(13)
+#define MGMT_SETTING_CONFIGURATION	BIT(14)
+#define MGMT_SETTING_STATIC_ADDRESS	BIT(15)
+#define MGMT_SETTING_PHY_CONFIGURATION	BIT(16)
+#define MGMT_SETTING_WIDEBAND_SPEECH	BIT(17)
+#define MGMT_SETTING_CIS_CENTRAL	BIT(18)
+#define MGMT_SETTING_CIS_PERIPHERAL	BIT(19)
 
 #define MGMT_OP_READ_INFO		0x0004
 #define MGMT_READ_INFO_SIZE		0
@@ -635,21 +635,21 @@ struct mgmt_rp_get_phy_configuration {
 } __packed;
 #define MGMT_GET_PHY_CONFIGURATION_SIZE	0
 
-#define MGMT_PHY_BR_1M_1SLOT	0x00000001
-#define MGMT_PHY_BR_1M_3SLOT	0x00000002
-#define MGMT_PHY_BR_1M_5SLOT	0x00000004
-#define MGMT_PHY_EDR_2M_1SLOT	0x00000008
-#define MGMT_PHY_EDR_2M_3SLOT	0x00000010
-#define MGMT_PHY_EDR_2M_5SLOT	0x00000020
-#define MGMT_PHY_EDR_3M_1SLOT	0x00000040
-#define MGMT_PHY_EDR_3M_3SLOT	0x00000080
-#define MGMT_PHY_EDR_3M_5SLOT	0x00000100
-#define MGMT_PHY_LE_1M_TX		0x00000200
-#define MGMT_PHY_LE_1M_RX		0x00000400
-#define MGMT_PHY_LE_2M_TX		0x00000800
-#define MGMT_PHY_LE_2M_RX		0x00001000
-#define MGMT_PHY_LE_CODED_TX	0x00002000
-#define MGMT_PHY_LE_CODED_RX	0x00004000
+#define MGMT_PHY_BR_1M_1SLOT		BIT(0)
+#define MGMT_PHY_BR_1M_3SLOT		BIT(1)
+#define MGMT_PHY_BR_1M_5SLOT		BIT(2)
+#define MGMT_PHY_EDR_2M_1SLOT		BIT(3)
+#define MGMT_PHY_EDR_2M_3SLOT		BIT(4)
+#define MGMT_PHY_EDR_2M_5SLOT		BIT(5)
+#define MGMT_PHY_EDR_3M_1SLOT		BIT(6)
+#define MGMT_PHY_EDR_3M_3SLOT		BIT(7)
+#define MGMT_PHY_EDR_3M_5SLOT		BIT(8)
+#define MGMT_PHY_LE_1M_TX		BIT(9)
+#define MGMT_PHY_LE_1M_RX		BIT(10)
+#define MGMT_PHY_LE_2M_TX		BIT(11)
+#define MGMT_PHY_LE_2M_RX		BIT(12)
+#define MGMT_PHY_LE_CODED_TX		BIT(13)
+#define MGMT_PHY_LE_CODED_RX		BIT(14)
 
 #define MGMT_PHY_BREDR_MASK (MGMT_PHY_BR_1M_1SLOT | MGMT_PHY_BR_1M_3SLOT | \
 			     MGMT_PHY_BR_1M_5SLOT | MGMT_PHY_EDR_2M_1SLOT | \
@@ -974,11 +974,11 @@ struct mgmt_ev_auth_failed {
 	__u8	status;
 } __packed;
 
-#define MGMT_DEV_FOUND_CONFIRM_NAME		0x01
-#define MGMT_DEV_FOUND_LEGACY_PAIRING		0x02
-#define MGMT_DEV_FOUND_NOT_CONNECTABLE		0x04
-#define MGMT_DEV_FOUND_INITIATED_CONN		0x08
-#define MGMT_DEV_FOUND_NAME_REQUEST_FAILED	0x10
+#define MGMT_DEV_FOUND_CONFIRM_NAME		BIT(0)
+#define MGMT_DEV_FOUND_LEGACY_PAIRING		BIT(1)
+#define MGMT_DEV_FOUND_NOT_CONNECTABLE		BIT(2)
+#define MGMT_DEV_FOUND_INITIATED_CONN		BIT(3)
+#define MGMT_DEV_FOUND_NAME_REQUEST_FAILED	BIT(4)
 
 #define MGMT_EV_DEVICE_FOUND		0x0012
 struct mgmt_ev_device_found {
-- 
cgit v1.2.3


From d4b20f0b8491bfb3238ebf7df4b13243189620d2 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 24 Feb 2023 15:27:33 -0800
Subject: Bluetooth: hci_core: Make hci_conn_hash_add append to the list

This makes hci_conn_hash_add append to the tail of the conn_hash so that
the list order matches the order in which connections are created. This
is required if the controller attempts to match the order of ACL with
CIS, which uses append logic when programming the CIS ids on the CIG.

The result of this change affects Create CIS:

Before:

< HCI Command: LE Create Connected Isochronous Stream (0x08|0x0064) plen 9
        Number of CIS: 2
        CIS Handle: 2560
        ACL Handle: 3586
        CIS Handle: 2561
        ACL Handle: 3585

After:

< HCI Command: LE Create Connected Isochronous Stream (0x08|0x0064) plen 9
        Number of CIS: 2
        CIS Handle: 2560
        ACL Handle: 3585
        CIS Handle: 2561
        ACL Handle: 3586

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index d5311ceb21c6..e22e45fbe8db 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -979,7 +979,7 @@ static inline bool hci_conn_sc_enabled(struct hci_conn *conn)
 static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
-	list_add_rcu(&c->list, &h->list);
+	list_add_tail_rcu(&c->list, &h->list);
 	switch (c->type) {
 	case ACL_LINK:
 		h->acl_num++;
-- 
cgit v1.2.3


From b8b23001b8025a61f0979578884a74faa825023e Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 8 Mar 2023 16:16:31 -0800
Subject: Bluetooth: L2CAP: Delay identity address updates

This delays the identity address updates to give userspace time to
process the new address; otherwise there is a risk that userspace
creates a duplicate device if the MGMT event is delayed for some
reason.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/l2cap.h | 2 +-
 net/bluetooth/l2cap_core.c    | 7 +++----
 net/bluetooth/smp.c           | 9 ++++++++-
 3 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 2f766e3437ce..cf393e72d6ed 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -694,7 +694,7 @@ struct l2cap_conn {
 	struct sk_buff_head	pending_rx;
 	struct work_struct	pending_rx_work;
 
-	struct work_struct	id_addr_update_work;
+	struct delayed_work	id_addr_timer;
 
 	__u8			disc_reason;
 
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 55a7226233f9..5cc95fd17f7d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -745,7 +745,7 @@ EXPORT_SYMBOL_GPL(l2cap_chan_list);
 static void l2cap_conn_update_id_addr(struct work_struct *work)
 {
 	struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
-					       id_addr_update_work);
+					       id_addr_timer.work);
 	struct hci_conn *hcon = conn->hcon;
 	struct l2cap_chan *chan;
 
@@ -1907,8 +1907,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
 	if (work_pending(&conn->pending_rx_work))
 		cancel_work_sync(&conn->pending_rx_work);
 
-	if (work_pending(&conn->id_addr_update_work))
-		cancel_work_sync(&conn->id_addr_update_work);
+	cancel_delayed_work_sync(&conn->id_addr_timer);
 
 	l2cap_unregister_all_users(conn);
 
@@ -7874,7 +7873,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
 
 	skb_queue_head_init(&conn->pending_rx);
 	INIT_WORK(&conn->pending_rx_work, process_pending_rx);
-	INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr);
+	INIT_DELAYED_WORK(&conn->id_addr_timer, l2cap_conn_update_id_addr);
 
 	conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
 
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 70663229b3cc..f1a9fc0012f0 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -58,6 +58,8 @@
 
 #define SMP_TIMEOUT	msecs_to_jiffies(30000)
 
+#define ID_ADDR_TIMEOUT	msecs_to_jiffies(200)
+
 #define AUTH_REQ_MASK(dev)	(hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
 				 0x3f : 0x07)
 #define KEY_DIST_MASK		0x07
@@ -1067,7 +1069,12 @@ static void smp_notify_keys(struct l2cap_conn *conn)
 		if (hcon->type == LE_LINK) {
 			bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
 			hcon->dst_type = smp->remote_irk->addr_type;
-			queue_work(hdev->workqueue, &conn->id_addr_update_work);
+			/* Use a short delay to make sure the new address is
+			 * propagated _before_ the channels.
+			 */
+			queue_delayed_work(hdev->workqueue,
+					   &conn->id_addr_timer,
+					   ID_ADDR_TIMEOUT);
 		}
 	}
 
-- 
cgit v1.2.3


From 8194f1ef5a815aea815a91daf2c721eab2674f1f Mon Sep 17 00:00:00 2001
From: Vasily Khoruzhick <anarsoul@gmail.com>
Date: Tue, 7 Mar 2023 23:17:30 +0100
Subject: Bluetooth: Add new quirk for broken local ext features page 2

Some adapters (e.g. RTL8723CS) advertise that they have more than
2 pages for local ext features, but they don't support any features
declared in these pages. RTL8723CS reports max_page = 2 and declares
support for sync train and secure connection, but it responds with
either garbage or with error in status on corresponding commands.

Signed-off-by: Vasily Khoruzhick <anarsoul@gmail.com>
Signed-off-by: Bastian Germann <bage@debian.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 7 +++++++
 net/bluetooth/hci_event.c   | 9 +++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 400f8a7d0c3f..997107bfc0b1 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -294,6 +294,13 @@ enum {
 	 * during the hdev->setup vendor callback.
 	 */
 	HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG,
+
+	/* When this quirk is set, max_page for local extended features
+	 * is set to 1, even if controller reports higher number. Some
+	 * controllers (e.g. RTL8723CS) report more pages, but they
+	 * don't actually support features declared there.
+	 */
+	HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
 };
 
 /* HCI device flags */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index e87c928c9e17..51f13518dba9 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -886,8 +886,13 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data,
 	if (rp->status)
 		return rp->status;
 
-	if (hdev->max_page < rp->max_page)
-		hdev->max_page = rp->max_page;
+	if (hdev->max_page < rp->max_page) {
+		if (test_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
+			     &hdev->quirks))
+			bt_dev_warn(hdev, "broken local ext features page 2");
+		else
+			hdev->max_page = rp->max_page;
+	}
 
 	if (rp->page < HCI_MAX_PAGES)
 		memcpy(hdev->features[rp->page], rp->features, 8);
-- 
cgit v1.2.3


From 29f93a687f3c435e94d026ee1fc8ad18ce56b7fb Mon Sep 17 00:00:00 2001
From: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Date: Thu, 16 Mar 2023 22:52:11 +0530
Subject: serdev: Replace all instances of ENOTSUPP with EOPNOTSUPP

This replaces all instances of ENOTSUPP with EOPNOTSUPP since ENOTSUPP
is not a standard error code. This will help maintain consistency in
error codes when new serdev APIs are added.

Signed-off-by: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/tty/serdev/core.c           | 6 +++---
 drivers/tty/serdev/serdev-ttyport.c | 4 ++--
 include/linux/serdev.h              | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index 678014253b7b..078fe64625cb 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -366,7 +366,7 @@ int serdev_device_set_parity(struct serdev_device *serdev,
 	struct serdev_controller *ctrl = serdev->ctrl;
 
 	if (!ctrl || !ctrl->ops->set_parity)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return ctrl->ops->set_parity(ctrl, parity);
 }
@@ -388,7 +388,7 @@ int serdev_device_get_tiocm(struct serdev_device *serdev)
 	struct serdev_controller *ctrl = serdev->ctrl;
 
 	if (!ctrl || !ctrl->ops->get_tiocm)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return ctrl->ops->get_tiocm(ctrl);
 }
@@ -399,7 +399,7 @@ int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
 	struct serdev_controller *ctrl = serdev->ctrl;
 
 	if (!ctrl || !ctrl->ops->set_tiocm)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return ctrl->ops->set_tiocm(ctrl, set, clear);
 }
diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index d367803e2044..f26ff82723f1 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -231,7 +231,7 @@ static int ttyport_get_tiocm(struct serdev_controller *ctrl)
 	struct tty_struct *tty = serport->tty;
 
 	if (!tty->ops->tiocmget)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return tty->ops->tiocmget(tty);
 }
@@ -242,7 +242,7 @@ static int ttyport_set_tiocm(struct serdev_controller *ctrl, unsigned int set, u
 	struct tty_struct *tty = serport->tty;
 
 	if (!tty->ops->tiocmset)
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	return tty->ops->tiocmset(tty, set, clear);
 }
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index 5f6bfe4f6d95..babedb13785b 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -250,11 +250,11 @@ static inline int serdev_device_write_buf(struct serdev_device *serdev,
 static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {}
 static inline int serdev_device_get_tiocm(struct serdev_device *serdev)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
 				      size_t count, unsigned long timeout)
-- 
cgit v1.2.3


From 8eaf839e4ac4feadf06e03eeff34059795450712 Mon Sep 17 00:00:00 2001
From: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Date: Thu, 16 Mar 2023 22:52:12 +0530
Subject: serdev: Add method to assert break signal over tty UART port

Adds serdev_device_break_ctl() and an implementation for ttyport.
This function simply calls the break_ctl in tty layer, which can
assert a break signal over the UART-TX line, if the tty, the
underlying platform and the UART peripheral support this operation.

Signed-off-by: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/tty/serdev/core.c           | 11 +++++++++++
 drivers/tty/serdev/serdev-ttyport.c | 12 ++++++++++++
 include/linux/serdev.h              |  6 ++++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c
index 078fe64625cb..e7d663901c07 100644
--- a/drivers/tty/serdev/core.c
+++ b/drivers/tty/serdev/core.c
@@ -405,6 +405,17 @@ int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
 }
 EXPORT_SYMBOL_GPL(serdev_device_set_tiocm);
 
+int serdev_device_break_ctl(struct serdev_device *serdev, int break_state)
+{
+	struct serdev_controller *ctrl = serdev->ctrl;
+
+	if (!ctrl || !ctrl->ops->break_ctl)
+		return -EOPNOTSUPP;
+
+	return ctrl->ops->break_ctl(ctrl, break_state);
+}
+EXPORT_SYMBOL_GPL(serdev_device_break_ctl);
+
 static int serdev_drv_probe(struct device *dev)
 {
 	const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver);
diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c
index f26ff82723f1..8033ef19669c 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -247,6 +247,17 @@ static int ttyport_set_tiocm(struct serdev_controller *ctrl, unsigned int set, u
 	return tty->ops->tiocmset(tty, set, clear);
 }
 
+static int ttyport_break_ctl(struct serdev_controller *ctrl, unsigned int break_state)
+{
+	struct serport *serport = serdev_controller_get_drvdata(ctrl);
+	struct tty_struct *tty = serport->tty;
+
+	if (!tty->ops->break_ctl)
+		return -EOPNOTSUPP;
+
+	return tty->ops->break_ctl(tty, break_state);
+}
+
 static const struct serdev_controller_ops ctrl_ops = {
 	.write_buf = ttyport_write_buf,
 	.write_flush = ttyport_write_flush,
@@ -259,6 +270,7 @@ static const struct serdev_controller_ops ctrl_ops = {
 	.wait_until_sent = ttyport_wait_until_sent,
 	.get_tiocm = ttyport_get_tiocm,
 	.set_tiocm = ttyport_set_tiocm,
+	.break_ctl = ttyport_break_ctl,
 };
 
 struct device *serdev_tty_port_register(struct tty_port *port,
diff --git a/include/linux/serdev.h b/include/linux/serdev.h
index babedb13785b..f5f97fa25e8a 100644
--- a/include/linux/serdev.h
+++ b/include/linux/serdev.h
@@ -93,6 +93,7 @@ struct serdev_controller_ops {
 	void (*wait_until_sent)(struct serdev_controller *, long);
 	int (*get_tiocm)(struct serdev_controller *);
 	int (*set_tiocm)(struct serdev_controller *, unsigned int, unsigned int);
+	int (*break_ctl)(struct serdev_controller *ctrl, unsigned int break_state);
 };
 
 /**
@@ -203,6 +204,7 @@ int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_
 void serdev_device_wait_until_sent(struct serdev_device *, long);
 int serdev_device_get_tiocm(struct serdev_device *);
 int serdev_device_set_tiocm(struct serdev_device *, int, int);
+int serdev_device_break_ctl(struct serdev_device *serdev, int break_state);
 void serdev_device_write_wakeup(struct serdev_device *);
 int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, long);
 void serdev_device_write_flush(struct serdev_device *);
@@ -256,6 +258,10 @@ static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set,
 {
 	return -EOPNOTSUPP;
 }
+static inline int serdev_device_break_ctl(struct serdev_device *serdev, int break_state)
+{
+	return -EOPNOTSUPP;
+}
 static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
 				      size_t count, unsigned long timeout)
 {
-- 
cgit v1.2.3


From 9695ef876fd122cb7bbc04a4a93b8727d2e36bda Mon Sep 17 00:00:00 2001
From: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Date: Thu, 30 Mar 2023 09:58:23 -0700
Subject: Bluetooth: Add support for hci devcoredump

Add devcoredump APIs to hci core so that drivers only have to provide
the dump skbs instead of managing the synchronization and timeouts.

The devcoredump APIs should be used in the following manner:
 - hci_devcoredump_init is called to allocate the dump.
 - hci_devcoredump_append is called to append any skbs with dump data
   OR hci_devcoredump_append_pattern is called to insert a pattern.
 - hci_devcoredump_complete is called when all dump packets have been
   sent OR hci_devcoredump_abort is called to indicate an error and
   cancel an ongoing dump collection.

The high-level APIs just prepare some skbs with the appropriate data and
queue them for the dump to process. Packets that are part of the crashdump
can be intercepted in the driver in interrupt context and forwarded
directly to the devcoredump APIs.

Internally, there are 5 states for the dump: idle, active, complete,
abort and timeout. A devcoredump will only be in active state after it
has been initialized. Once active, it accepts data to be appended,
patterns to be inserted (i.e. memset) and a completion event or an abort
event to generate a devcoredump. The timeout is initialized at the same
time the dump is initialized (defaulting to 10s) and will be cleared
either when the timeout occurs or the dump is complete or aborted.

Signed-off-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Manish Mandlik <mmandlik@google.com>
Reviewed-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/coredump.h | 116 +++++++++
 include/net/bluetooth/hci_core.h |  14 +
 net/bluetooth/Makefile           |   2 +
 net/bluetooth/coredump.c         | 535 +++++++++++++++++++++++++++++++++++++++
 net/bluetooth/hci_core.c         |   1 +
 net/bluetooth/hci_sync.c         |   2 +
 6 files changed, 670 insertions(+)
 create mode 100644 include/net/bluetooth/coredump.h
 create mode 100644 net/bluetooth/coredump.c

(limited to 'include')

diff --git a/include/net/bluetooth/coredump.h b/include/net/bluetooth/coredump.h
new file mode 100644
index 000000000000..72f51b587a04
--- /dev/null
+++ b/include/net/bluetooth/coredump.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Google Corporation
+ */
+
+#ifndef __COREDUMP_H
+#define __COREDUMP_H
+
+#define DEVCOREDUMP_TIMEOUT	msecs_to_jiffies(10000)	/* 10 sec */
+
+typedef void (*coredump_t)(struct hci_dev *hdev);
+typedef void (*dmp_hdr_t)(struct hci_dev *hdev, struct sk_buff *skb);
+typedef void (*notify_change_t)(struct hci_dev *hdev, int state);
+
+/* struct hci_devcoredump - Devcoredump state
+ *
+ * @supported: Indicates if FW dump collection is supported by driver
+ * @state: Current state of dump collection
+ * @timeout: Indicates a timeout for collecting the devcoredump
+ *
+ * @alloc_size: Total size of the dump
+ * @head: Start of the dump
+ * @tail: Pointer to current end of dump
+ * @end: head + alloc_size for easy comparisons
+ *
+ * @dump_q: Dump queue for state machine to process
+ * @dump_rx: Devcoredump state machine work
+ * @dump_timeout: Devcoredump timeout work
+ *
+ * @coredump: Called from the driver's .coredump() function.
+ * @dmp_hdr: Create a dump header to identify controller/fw/driver info
+ * @notify_change: Notify driver when devcoredump state has changed
+ */
+struct hci_devcoredump {
+	bool		supported;
+
+	enum devcoredump_state {
+		HCI_DEVCOREDUMP_IDLE,
+		HCI_DEVCOREDUMP_ACTIVE,
+		HCI_DEVCOREDUMP_DONE,
+		HCI_DEVCOREDUMP_ABORT,
+		HCI_DEVCOREDUMP_TIMEOUT,
+	} state;
+
+	unsigned long	timeout;
+
+	size_t		alloc_size;
+	char		*head;
+	char		*tail;
+	char		*end;
+
+	struct sk_buff_head	dump_q;
+	struct work_struct	dump_rx;
+	struct delayed_work	dump_timeout;
+
+	coredump_t		coredump;
+	dmp_hdr_t		dmp_hdr;
+	notify_change_t		notify_change;
+};
+
+#ifdef CONFIG_DEV_COREDUMP
+
+void hci_devcd_reset(struct hci_dev *hdev);
+void hci_devcd_rx(struct work_struct *work);
+void hci_devcd_timeout(struct work_struct *work);
+
+int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+		       dmp_hdr_t dmp_hdr, notify_change_t notify_change);
+int hci_devcd_init(struct hci_dev *hdev, u32 dump_size);
+int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb);
+int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len);
+int hci_devcd_complete(struct hci_dev *hdev);
+int hci_devcd_abort(struct hci_dev *hdev);
+
+#else
+
+static inline void hci_devcd_reset(struct hci_dev *hdev) {}
+static inline void hci_devcd_rx(struct work_struct *work) {}
+static inline void hci_devcd_timeout(struct work_struct *work) {}
+
+static inline int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+				     dmp_hdr_t dmp_hdr,
+				     notify_change_t notify_change)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_init(struct hci_dev *hdev, u32 dump_size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_append_pattern(struct hci_dev *hdev,
+					   u8 pattern, u32 len)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_complete(struct hci_dev *hdev)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int hci_devcd_abort(struct hci_dev *hdev)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
+#endif /* __COREDUMP_H */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index e22e45fbe8db..a461335beacf 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -32,6 +32,7 @@
 #include <net/bluetooth/hci.h>
 #include <net/bluetooth/hci_sync.h>
 #include <net/bluetooth/hci_sock.h>
+#include <net/bluetooth/coredump.h>
 
 /* HCI priority */
 #define HCI_PRIO_MAX	7
@@ -590,6 +591,10 @@ struct hci_dev {
 	const char		*fw_info;
 	struct dentry		*debugfs;
 
+#ifdef CONFIG_DEV_COREDUMP
+	struct hci_devcoredump	dump;
+#endif
+
 	struct device		dev;
 
 	struct rfkill		*rfkill;
@@ -1497,6 +1502,15 @@ static inline void hci_set_aosp_capable(struct hci_dev *hdev)
 #endif
 }
 
+static inline void hci_devcd_setup(struct hci_dev *hdev)
+{
+#ifdef CONFIG_DEV_COREDUMP
+	INIT_WORK(&hdev->dump.dump_rx, hci_devcd_rx);
+	INIT_DELAYED_WORK(&hdev->dump.dump_timeout, hci_devcd_timeout);
+	skb_queue_head_init(&hdev->dump.dump_q);
+#endif
+}
+
 int hci_dev_open(__u16 dev);
 int hci_dev_close(__u16 dev);
 int hci_dev_do_close(struct hci_dev *hdev);
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 0e7b7db42750..141ac1fda0bf 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -17,6 +17,8 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
 	ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o hci_codec.o \
 	eir.o hci_sync.o
 
+bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o
+
 bluetooth-$(CONFIG_BT_BREDR) += sco.o
 bluetooth-$(CONFIG_BT_LE) += iso.o
 bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c
new file mode 100644
index 000000000000..08fa98505454
--- /dev/null
+++ b/net/bluetooth/coredump.c
@@ -0,0 +1,535 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Google Corporation
+ */
+
+#include <linux/devcoredump.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+enum hci_devcoredump_pkt_type {
+	HCI_DEVCOREDUMP_PKT_INIT,
+	HCI_DEVCOREDUMP_PKT_SKB,
+	HCI_DEVCOREDUMP_PKT_PATTERN,
+	HCI_DEVCOREDUMP_PKT_COMPLETE,
+	HCI_DEVCOREDUMP_PKT_ABORT,
+};
+
+struct hci_devcoredump_skb_cb {
+	u16 pkt_type;
+};
+
+struct hci_devcoredump_skb_pattern {
+	u8 pattern;
+	u32 len;
+} __packed;
+
+#define hci_dmp_cb(skb)	((struct hci_devcoredump_skb_cb *)((skb)->cb))
+
+#define DBG_UNEXPECTED_STATE() \
+	bt_dev_dbg(hdev, \
+		   "Unexpected packet (%d) for state (%d). ", \
+		   hci_dmp_cb(skb)->pkt_type, hdev->dump.state)
+
+#define MAX_DEVCOREDUMP_HDR_SIZE	512	/* bytes */
+
+static int hci_devcd_update_hdr_state(char *buf, size_t size, int state)
+{
+	int len = 0;
+
+	if (!buf)
+		return 0;
+
+	len = scnprintf(buf, size, "Bluetooth devcoredump\nState: %d\n", state);
+
+	return len + 1; /* scnprintf adds \0 at the end upon state rewrite */
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_update_state(struct hci_dev *hdev, int state)
+{
+	bt_dev_dbg(hdev, "Updating devcoredump state from %d to %d.",
+		   hdev->dump.state, state);
+
+	hdev->dump.state = state;
+
+	return hci_devcd_update_hdr_state(hdev->dump.head,
+					  hdev->dump.alloc_size, state);
+}
+
+static int hci_devcd_mkheader(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	char dump_start[] = "--- Start dump ---\n";
+	char hdr[80];
+	int hdr_len;
+
+	hdr_len = hci_devcd_update_hdr_state(hdr, sizeof(hdr),
+					     HCI_DEVCOREDUMP_IDLE);
+	skb_put_data(skb, hdr, hdr_len);
+
+	if (hdev->dump.dmp_hdr)
+		hdev->dump.dmp_hdr(hdev, skb);
+
+	skb_put_data(skb, dump_start, strlen(dump_start));
+
+	return skb->len;
+}
+
+/* Do not call with hci_dev_lock since this calls driver code. */
+static void hci_devcd_notify(struct hci_dev *hdev, int state)
+{
+	if (hdev->dump.notify_change)
+		hdev->dump.notify_change(hdev, state);
+}
+
+/* Call with hci_dev_lock only. */
+void hci_devcd_reset(struct hci_dev *hdev)
+{
+	hdev->dump.head = NULL;
+	hdev->dump.tail = NULL;
+	hdev->dump.alloc_size = 0;
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+	cancel_delayed_work(&hdev->dump.dump_timeout);
+	skb_queue_purge(&hdev->dump.dump_q);
+}
+
+/* Call with hci_dev_lock only. */
+static void hci_devcd_free(struct hci_dev *hdev)
+{
+	if (hdev->dump.head)
+		vfree(hdev->dump.head);
+
+	hci_devcd_reset(hdev);
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_alloc(struct hci_dev *hdev, u32 size)
+{
+	hdev->dump.head = vmalloc(size);
+	if (!hdev->dump.head)
+		return -ENOMEM;
+
+	hdev->dump.alloc_size = size;
+	hdev->dump.tail = hdev->dump.head;
+	hdev->dump.end = hdev->dump.head + size;
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+	return 0;
+}
+
+/* Call with hci_dev_lock only. */
+static bool hci_devcd_copy(struct hci_dev *hdev, char *buf, u32 size)
+{
+	if (hdev->dump.tail + size > hdev->dump.end)
+		return false;
+
+	memcpy(hdev->dump.tail, buf, size);
+	hdev->dump.tail += size;
+
+	return true;
+}
+
+/* Call with hci_dev_lock only. */
+static bool hci_devcd_memset(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+	if (hdev->dump.tail + len > hdev->dump.end)
+		return false;
+
+	memset(hdev->dump.tail, pattern, len);
+	hdev->dump.tail += len;
+
+	return true;
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_prepare(struct hci_dev *hdev, u32 dump_size)
+{
+	struct sk_buff *skb;
+	int dump_hdr_size;
+	int err = 0;
+
+	skb = alloc_skb(MAX_DEVCOREDUMP_HDR_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	dump_hdr_size = hci_devcd_mkheader(hdev, skb);
+
+	if (hci_devcd_alloc(hdev, dump_hdr_size + dump_size)) {
+		err = -ENOMEM;
+		goto hdr_free;
+	}
+
+	/* Insert the device header */
+	if (!hci_devcd_copy(hdev, skb->data, skb->len)) {
+		bt_dev_err(hdev, "Failed to insert header");
+		hci_devcd_free(hdev);
+
+		err = -ENOMEM;
+		goto hdr_free;
+	}
+
+hdr_free:
+	kfree_skb(skb);
+
+	return err;
+}
+
+static void hci_devcd_handle_pkt_init(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	u32 *dump_size;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_IDLE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	if (skb->len != sizeof(*dump_size)) {
+		bt_dev_dbg(hdev, "Invalid dump init pkt");
+		return;
+	}
+
+	dump_size = skb_pull_data(skb, sizeof(*dump_size));
+	if (!*dump_size) {
+		bt_dev_err(hdev, "Zero size dump init pkt");
+		return;
+	}
+
+	if (hci_devcd_prepare(hdev, *dump_size)) {
+		bt_dev_err(hdev, "Failed to prepare for dump");
+		return;
+	}
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ACTIVE);
+	queue_delayed_work(hdev->workqueue, &hdev->dump.dump_timeout,
+			   hdev->dump.timeout);
+}
+
+static void hci_devcd_handle_pkt_skb(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	if (!hci_devcd_copy(hdev, skb->data, skb->len))
+		bt_dev_dbg(hdev, "Failed to insert skb");
+}
+
+static void hci_devcd_handle_pkt_pattern(struct hci_dev *hdev,
+					 struct sk_buff *skb)
+{
+	struct hci_devcoredump_skb_pattern *pattern;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	if (skb->len != sizeof(*pattern)) {
+		bt_dev_dbg(hdev, "Invalid pattern skb");
+		return;
+	}
+
+	pattern = skb_pull_data(skb, sizeof(*pattern));
+
+	if (!hci_devcd_memset(hdev, pattern->pattern, pattern->len))
+		bt_dev_dbg(hdev, "Failed to set pattern");
+}
+
+static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev,
+					  struct sk_buff *skb)
+{
+	u32 dump_size;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_DONE);
+	dump_size = hdev->dump.tail - hdev->dump.head;
+
+	bt_dev_dbg(hdev, "complete with size %u (expect %zu)", dump_size,
+		   hdev->dump.alloc_size);
+
+	dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL);
+}
+
+static void hci_devcd_handle_pkt_abort(struct hci_dev *hdev,
+				       struct sk_buff *skb)
+{
+	u32 dump_size;
+
+	if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+		DBG_UNEXPECTED_STATE();
+		return;
+	}
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ABORT);
+	dump_size = hdev->dump.tail - hdev->dump.head;
+
+	bt_dev_dbg(hdev, "aborted with size %u (expect %zu)", dump_size,
+		   hdev->dump.alloc_size);
+
+	/* Emit a devcoredump with the available data */
+	dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL);
+}
+
+/* Bluetooth devcoredump state machine.
+ *
+ * Devcoredump states:
+ *
+ *      HCI_DEVCOREDUMP_IDLE: The default state.
+ *
+ *      HCI_DEVCOREDUMP_ACTIVE: A devcoredump will be in this state once it has
+ *              been initialized using hci_devcd_init(). Once active, the driver
+ *              can append data using hci_devcd_append() or insert a pattern
+ *              using hci_devcd_append_pattern().
+ *
+ *      HCI_DEVCOREDUMP_DONE: Once the dump collection is complete, the
+ *              driver can signal the completion using hci_devcd_complete(). A
+ *              devcoredump is generated indicating the completion event and
+ *              then the state machine is reset to the default state.
+ *
+ *      HCI_DEVCOREDUMP_ABORT: The driver can cancel ongoing dump collection in
+ *              case of any error using hci_devcd_abort(). A devcoredump is
+ *              still generated with the available data indicating the abort
+ *              event and then the state machine is reset to the default state.
+ *
+ *      HCI_DEVCOREDUMP_TIMEOUT: A timeout timer of DEVCOREDUMP_TIMEOUT (10
+ *              sec) is started during devcoredump initialization. Once the
+ *              timeout occurs, the driver is notified, a devcoredump is
+ *              generated with the available data indicating the timeout event
+ *              and then the state machine is reset to the default state.
+ *
+ * The driver must register using hci_devcd_register() before using the hci
+ * devcoredump APIs.
+ */
+void hci_devcd_rx(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev, dump.dump_rx);
+	struct sk_buff *skb;
+	int start_state;
+
+	while ((skb = skb_dequeue(&hdev->dump.dump_q))) {
+		/* Return if timeout occurs. The timeout handler function
+		 * hci_devcd_timeout() will report the available dump data.
+		 */
+		if (hdev->dump.state == HCI_DEVCOREDUMP_TIMEOUT) {
+			kfree_skb(skb);
+			return;
+		}
+
+		hci_dev_lock(hdev);
+		start_state = hdev->dump.state;
+
+		switch (hci_dmp_cb(skb)->pkt_type) {
+		case HCI_DEVCOREDUMP_PKT_INIT:
+			hci_devcd_handle_pkt_init(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_SKB:
+			hci_devcd_handle_pkt_skb(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_PATTERN:
+			hci_devcd_handle_pkt_pattern(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_COMPLETE:
+			hci_devcd_handle_pkt_complete(hdev, skb);
+			break;
+
+		case HCI_DEVCOREDUMP_PKT_ABORT:
+			hci_devcd_handle_pkt_abort(hdev, skb);
+			break;
+
+		default:
+			bt_dev_dbg(hdev, "Unknown packet (%d) for state (%d). ",
+				   hci_dmp_cb(skb)->pkt_type, hdev->dump.state);
+			break;
+		}
+
+		hci_dev_unlock(hdev);
+		kfree_skb(skb);
+
+		/* Notify the driver about any state changes before resetting
+		 * the state machine
+		 */
+		if (start_state != hdev->dump.state)
+			hci_devcd_notify(hdev, hdev->dump.state);
+
+		/* Reset the state machine if the devcoredump is complete */
+		hci_dev_lock(hdev);
+		if (hdev->dump.state == HCI_DEVCOREDUMP_DONE ||
+		    hdev->dump.state == HCI_DEVCOREDUMP_ABORT)
+			hci_devcd_reset(hdev);
+		hci_dev_unlock(hdev);
+	}
+}
+EXPORT_SYMBOL(hci_devcd_rx);
+
+void hci_devcd_timeout(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    dump.dump_timeout.work);
+	u32 dump_size;
+
+	hci_devcd_notify(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+	hci_dev_lock(hdev);
+
+	cancel_work(&hdev->dump.dump_rx);
+
+	hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+	dump_size = hdev->dump.tail - hdev->dump.head;
+	bt_dev_dbg(hdev, "timeout with size %u (expect %zu)", dump_size,
+		   hdev->dump.alloc_size);
+
+	/* Emit a devcoredump with the available data */
+	dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL);
+
+	hci_devcd_reset(hdev);
+
+	hci_dev_unlock(hdev);
+}
+EXPORT_SYMBOL(hci_devcd_timeout);
+
+int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+		       dmp_hdr_t dmp_hdr, notify_change_t notify_change)
+{
+	/* Driver must implement coredump() and dmp_hdr() functions for
+	 * bluetooth devcoredump. The coredump() should trigger a coredump
+	 * event on the controller when the device's coredump sysfs entry is
+	 * written to. The dmp_hdr() should create a dump header to identify
+	 * the controller/fw/driver info.
+	 */
+	if (!coredump || !dmp_hdr)
+		return -EINVAL;
+
+	hci_dev_lock(hdev);
+	hdev->dump.coredump = coredump;
+	hdev->dump.dmp_hdr = dmp_hdr;
+	hdev->dump.notify_change = notify_change;
+	hdev->dump.supported = true;
+	hdev->dump.timeout = DEVCOREDUMP_TIMEOUT;
+	hci_dev_unlock(hdev);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_register);
+
+static inline bool hci_devcd_enabled(struct hci_dev *hdev)
+{
+	return hdev->dump.supported;
+}
+
+int hci_devcd_init(struct hci_dev *hdev, u32 dump_size)
+{
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(sizeof(dump_size), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_INIT;
+	skb_put_data(skb, &dump_size, sizeof(dump_size));
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_init);
+
+int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	if (!skb)
+		return -ENOMEM;
+
+	if (!hci_devcd_enabled(hdev)) {
+		kfree_skb(skb);
+		return -EOPNOTSUPP;
+	}
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_SKB;
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append);
+
+int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+	struct hci_devcoredump_skb_pattern p;
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(sizeof(p), GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	p.pattern = pattern;
+	p.len = len;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_PATTERN;
+	skb_put_data(skb, &p, sizeof(p));
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append_pattern);
+
+int hci_devcd_complete(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_COMPLETE;
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_complete);
+
+int hci_devcd_abort(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	skb = alloc_skb(0, GFP_ATOMIC);
+	if (!skb)
+		return -ENOMEM;
+
+	hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_ABORT;
+
+	skb_queue_tail(&hdev->dump.dump_q, skb);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_abort);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 334e308451f5..393b317ae68f 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2544,6 +2544,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
 	INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout);
 
+	hci_devcd_setup(hdev);
 	hci_request_setup(hdev);
 
 	hci_init_sysfs(hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 31231f0e4a28..f21497ebc814 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4727,6 +4727,8 @@ int hci_dev_open_sync(struct hci_dev *hdev)
 		goto done;
 	}
 
+	hci_devcd_reset(hdev);
+
 	set_bit(HCI_RUNNING, &hdev->flags);
 	hci_sock_dev_event(hdev, HCI_DEV_OPEN);
 
-- 
cgit v1.2.3


From 0fe8c8d071343fa9278980ce4b6f8e6ea24a2ed1 Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Fri, 31 Mar 2023 18:38:01 +0300
Subject: Bluetooth: Split bt_iso_qos into dedicated structures

Split bt_iso_qos into dedicated unicast and broadcast
structures and add additional broadcast parameters.

Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections")
Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |  43 +++++++---
 include/net/bluetooth/hci_core.h  |   9 ++-
 net/bluetooth/hci_conn.c          | 162 ++++++++++++++++++++------------------
 net/bluetooth/hci_event.c         |  33 ++++----
 net/bluetooth/iso.c               | 125 +++++++++++++++++++++++------
 5 files changed, 237 insertions(+), 135 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index bcc5a4cd2c17..1b4230cd42a3 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (C) 2000-2001 Qualcomm Incorporated
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -171,23 +172,39 @@ struct bt_iso_io_qos {
 	__u8  rtn;
 };
 
-struct bt_iso_qos {
-	union {
-		__u8  cig;
-		__u8  big;
-	};
-	union {
-		__u8  cis;
-		__u8  bis;
-	};
-	union {
-		__u8  sca;
-		__u8  sync_interval;
-	};
+struct bt_iso_ucast_qos {
+	__u8  cig;
+	__u8  cis;
+	__u8  sca;
+	__u8  packing;
+	__u8  framing;
+	struct bt_iso_io_qos in;
+	struct bt_iso_io_qos out;
+};
+
+struct bt_iso_bcast_qos {
+	__u8  big;
+	__u8  bis;
+	__u8  sync_interval;
 	__u8  packing;
 	__u8  framing;
 	struct bt_iso_io_qos in;
 	struct bt_iso_io_qos out;
+	__u8  encryption;
+	__u8  bcode[16];
+	__u8  options;
+	__u16 skip;
+	__u16 sync_timeout;
+	__u8  sync_cte_type;
+	__u8  mse;
+	__u16 timeout;
+};
+
+struct bt_iso_qos {
+	union {
+		struct bt_iso_ucast_qos ucast;
+		struct bt_iso_bcast_qos bcast;
+	};
 };
 
 #define BT_ISO_PHY_1M		0x01
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index a461335beacf..f11689284112 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -1096,7 +1097,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
 		if (bacmp(&c->dst, ba) || c->type != ISO_LINK)
 			continue;
 
-		if (c->iso_qos.big == big && c->iso_qos.bis == bis) {
+		if (c->iso_qos.bcast.big == big && c->iso_qos.bcast.bis == bis) {
 			rcu_read_unlock();
 			return c;
 		}
@@ -1205,7 +1206,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev,
 		if (c->type != ISO_LINK)
 			continue;
 
-		if (handle == c->iso_qos.cig) {
+		if (handle == c->iso_qos.ucast.cig) {
 			rcu_read_unlock();
 			return c;
 		}
@@ -1228,7 +1229,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
 		if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK)
 			continue;
 
-		if (handle == c->iso_qos.big) {
+		if (handle == c->iso_qos.bcast.big) {
 			rcu_read_unlock();
 			return c;
 		}
@@ -1337,7 +1338,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 dst_type, struct bt_iso_qos *qos,
 				 __u8 data_len, __u8 *data);
 int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
-		       __u8 sid);
+		       __u8 sid, struct bt_iso_qos *qos);
 int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos,
 			   __u16 sync_handle, __u8 num_bis, __u8 bis[]);
 int hci_conn_check_link_mode(struct hci_conn *conn);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 8455ba141ee6..5672b4924572 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -795,8 +796,8 @@ static void bis_list(struct hci_conn *conn, void *data)
 	if (bacmp(&conn->dst, BDADDR_ANY))
 		return;
 
-	if (d->big != conn->iso_qos.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
-	    d->bis != conn->iso_qos.bis)
+	if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
+	    d->bis != conn->iso_qos.bcast.bis)
 		return;
 
 	d->count++;
@@ -916,10 +917,10 @@ static void bis_cleanup(struct hci_conn *conn)
 		if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags))
 			return;
 
-		hci_le_terminate_big(hdev, conn->iso_qos.big,
-				     conn->iso_qos.bis);
+		hci_le_terminate_big(hdev, conn->iso_qos.bcast.big,
+				     conn->iso_qos.bcast.bis);
 	} else {
-		hci_le_big_terminate(hdev, conn->iso_qos.big,
+		hci_le_big_terminate(hdev, conn->iso_qos.bcast.big,
 				     conn->sync_handle);
 	}
 }
@@ -959,7 +960,7 @@ static void cis_cleanup(struct hci_conn *conn)
 	struct iso_list_data d;
 
 	memset(&d, 0, sizeof(d));
-	d.cig = conn->iso_qos.cig;
+	d.cig = conn->iso_qos.ucast.cig;
 
 	/* Check if ISO connection is a CIS and remove CIG if there are
 	 * no other connections using it.
@@ -968,7 +969,7 @@ static void cis_cleanup(struct hci_conn *conn)
 	if (d.count)
 		return;
 
-	hci_le_remove_cig(hdev, conn->iso_qos.cig);
+	hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig);
 }
 
 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
@@ -1411,7 +1412,7 @@ static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
 	struct iso_list_data data;
 
 	/* Allocate a BIG if not set */
-	if (qos->big == BT_ISO_QOS_BIG_UNSET) {
+	if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) {
 		for (data.big = 0x00; data.big < 0xef; data.big++) {
 			data.count = 0;
 			data.bis = 0xff;
@@ -1426,7 +1427,7 @@ static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
 			return -EADDRNOTAVAIL;
 
 		/* Update BIG */
-		qos->big = data.big;
+		qos->bcast.big = data.big;
 	}
 
 	return 0;
@@ -1437,7 +1438,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
 	struct iso_list_data data;
 
 	/* Allocate BIS if not set */
-	if (qos->bis == BT_ISO_QOS_BIS_UNSET) {
+	if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) {
 		/* Find an unused adv set to advertise BIS, skip instance 0x00
 		 * since it is reserved as general purpose set.
 		 */
@@ -1455,7 +1456,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
 			return -EADDRNOTAVAIL;
 
 		/* Update BIS */
-		qos->bis = data.bis;
+		qos->bcast.bis = data.bis;
 	}
 
 	return 0;
@@ -1484,8 +1485,8 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 	if (err)
 		return ERR_PTR(err);
 
-	data.big = qos->big;
-	data.bis = qos->bis;
+	data.big = qos->bcast.big;
+	data.bis = qos->bcast.bis;
 	data.count = 0;
 
 	/* Check if there is already a matching BIG/BIS */
@@ -1493,7 +1494,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 	if (data.count)
 		return ERR_PTR(-EADDRINUSE);
 
-	conn = hci_conn_hash_lookup_bis(hdev, dst, qos->big, qos->bis);
+	conn = hci_conn_hash_lookup_bis(hdev, dst, qos->bcast.big, qos->bcast.bis);
 	if (conn)
 		return ERR_PTR(-EADDRINUSE);
 
@@ -1648,13 +1649,13 @@ static void cis_add(struct iso_list_data *d, struct bt_iso_qos *qos)
 {
 	struct hci_cis_params *cis = &d->pdu.cis[d->pdu.cp.num_cis];
 
-	cis->cis_id = qos->cis;
-	cis->c_sdu  = cpu_to_le16(qos->out.sdu);
-	cis->p_sdu  = cpu_to_le16(qos->in.sdu);
-	cis->c_phy  = qos->out.phy ? qos->out.phy : qos->in.phy;
-	cis->p_phy  = qos->in.phy ? qos->in.phy : qos->out.phy;
-	cis->c_rtn  = qos->out.rtn;
-	cis->p_rtn  = qos->in.rtn;
+	cis->cis_id = qos->ucast.cis;
+	cis->c_sdu  = cpu_to_le16(qos->ucast.out.sdu);
+	cis->p_sdu  = cpu_to_le16(qos->ucast.in.sdu);
+	cis->c_phy  = qos->ucast.out.phy ? qos->ucast.out.phy : qos->ucast.in.phy;
+	cis->p_phy  = qos->ucast.in.phy ? qos->ucast.in.phy : qos->ucast.out.phy;
+	cis->c_rtn  = qos->ucast.out.rtn;
+	cis->p_rtn  = qos->ucast.in.rtn;
 
 	d->pdu.cp.num_cis++;
 }
@@ -1667,8 +1668,8 @@ static void cis_list(struct hci_conn *conn, void *data)
 	if (!bacmp(&conn->dst, BDADDR_ANY))
 		return;
 
-	if (d->cig != conn->iso_qos.cig || d->cis == BT_ISO_QOS_CIS_UNSET ||
-	    d->cis != conn->iso_qos.cis)
+	if (d->cig != conn->iso_qos.ucast.cig || d->cis == BT_ISO_QOS_CIS_UNSET ||
+	    d->cis != conn->iso_qos.ucast.cis)
 		return;
 
 	d->count++;
@@ -1687,17 +1688,18 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
 
 	memset(&cp, 0, sizeof(cp));
 
-	cp.handle = qos->big;
-	cp.adv_handle = qos->bis;
+	cp.handle = qos->bcast.big;
+	cp.adv_handle = qos->bcast.bis;
 	cp.num_bis  = 0x01;
-	hci_cpu_to_le24(qos->out.interval, cp.bis.sdu_interval);
-	cp.bis.sdu = cpu_to_le16(qos->out.sdu);
-	cp.bis.latency =  cpu_to_le16(qos->out.latency);
-	cp.bis.rtn  = qos->out.rtn;
-	cp.bis.phy  = qos->out.phy;
-	cp.bis.packing = qos->packing;
-	cp.bis.framing = qos->framing;
-	cp.bis.encryption = 0x00;
+	hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval);
+	cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu);
+	cp.bis.latency =  cpu_to_le16(qos->bcast.out.latency);
+	cp.bis.rtn  = qos->bcast.out.rtn;
+	cp.bis.phy  = qos->bcast.out.phy;
+	cp.bis.packing = qos->bcast.packing;
+	cp.bis.framing = qos->bcast.framing;
+	cp.bis.encryption = qos->bcast.encryption;
+	memcpy(cp.bis.bcode, qos->bcast.bcode, sizeof(cp.bis.bcode));
 	memset(&cp.bis.bcode, 0, sizeof(cp.bis.bcode));
 
 	return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp);
@@ -1711,7 +1713,7 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 	memset(&data, 0, sizeof(data));
 
 	/* Allocate a CIG if not set */
-	if (qos->cig == BT_ISO_QOS_CIG_UNSET) {
+	if (qos->ucast.cig == BT_ISO_QOS_CIG_UNSET) {
 		for (data.cig = 0x00; data.cig < 0xff; data.cig++) {
 			data.count = 0;
 			data.cis = 0xff;
@@ -1731,22 +1733,22 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 			return false;
 
 		/* Update CIG */
-		qos->cig = data.cig;
+		qos->ucast.cig = data.cig;
 	}
 
-	data.pdu.cp.cig_id = qos->cig;
-	hci_cpu_to_le24(qos->out.interval, data.pdu.cp.c_interval);
-	hci_cpu_to_le24(qos->in.interval, data.pdu.cp.p_interval);
-	data.pdu.cp.sca = qos->sca;
-	data.pdu.cp.packing = qos->packing;
-	data.pdu.cp.framing = qos->framing;
-	data.pdu.cp.c_latency = cpu_to_le16(qos->out.latency);
-	data.pdu.cp.p_latency = cpu_to_le16(qos->in.latency);
+	data.pdu.cp.cig_id = qos->ucast.cig;
+	hci_cpu_to_le24(qos->ucast.out.interval, data.pdu.cp.c_interval);
+	hci_cpu_to_le24(qos->ucast.in.interval, data.pdu.cp.p_interval);
+	data.pdu.cp.sca = qos->ucast.sca;
+	data.pdu.cp.packing = qos->ucast.packing;
+	data.pdu.cp.framing = qos->ucast.framing;
+	data.pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency);
+	data.pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency);
 
-	if (qos->cis != BT_ISO_QOS_CIS_UNSET) {
+	if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) {
 		data.count = 0;
-		data.cig = qos->cig;
-		data.cis = qos->cis;
+		data.cig = qos->ucast.cig;
+		data.cis = qos->ucast.cis;
 
 		hci_conn_hash_list_state(hdev, cis_list, ISO_LINK, BT_BOUND,
 					 &data);
@@ -1757,7 +1759,7 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 	}
 
 	/* Reprogram all CIS(s) with the same CIG */
-	for (data.cig = qos->cig, data.cis = 0x00; data.cis < 0x11;
+	for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0x11;
 	     data.cis++) {
 		data.count = 0;
 
@@ -1767,14 +1769,14 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
 			continue;
 
 		/* Allocate a CIS if not set */
-		if (qos->cis == BT_ISO_QOS_CIS_UNSET) {
+		if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) {
 			/* Update CIS */
-			qos->cis = data.cis;
+			qos->ucast.cis = data.cis;
 			cis_add(&data, qos);
 		}
 	}
 
-	if (qos->cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis)
+	if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET || !data.pdu.cp.num_cis)
 		return false;
 
 	if (hci_send_cmd(hdev, HCI_OP_LE_SET_CIG_PARAMS,
@@ -1809,32 +1811,32 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 		return cis;
 
 	/* Update LINK PHYs according to QoS preference */
-	cis->le_tx_phy = qos->out.phy;
-	cis->le_rx_phy = qos->in.phy;
+	cis->le_tx_phy = qos->ucast.out.phy;
+	cis->le_rx_phy = qos->ucast.in.phy;
 
 	/* If output interval is not set use the input interval as it cannot be
 	 * 0x000000.
 	 */
-	if (!qos->out.interval)
-		qos->out.interval = qos->in.interval;
+	if (!qos->ucast.out.interval)
+		qos->ucast.out.interval = qos->ucast.in.interval;
 
 	/* If input interval is not set use the output interval as it cannot be
 	 * 0x000000.
 	 */
-	if (!qos->in.interval)
-		qos->in.interval = qos->out.interval;
+	if (!qos->ucast.in.interval)
+		qos->ucast.in.interval = qos->ucast.out.interval;
 
 	/* If output latency is not set use the input latency as it cannot be
 	 * 0x0000.
 	 */
-	if (!qos->out.latency)
-		qos->out.latency = qos->in.latency;
+	if (!qos->ucast.out.latency)
+		qos->ucast.out.latency = qos->ucast.in.latency;
 
 	/* If input latency is not set use the output latency as it cannot be
 	 * 0x0000.
 	 */
-	if (!qos->in.latency)
-		qos->in.latency = qos->out.latency;
+	if (!qos->ucast.in.latency)
+		qos->ucast.in.latency = qos->ucast.out.latency;
 
 	if (!hci_le_set_cig_params(cis, qos)) {
 		hci_conn_drop(cis);
@@ -1854,7 +1856,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
 
 	memset(&cmd, 0, sizeof(cmd));
 
-	if (conn->iso_qos.out.sdu) {
+	if (conn->iso_qos.ucast.out.sdu) {
 		cmd.handle = cpu_to_le16(conn->handle);
 		cmd.direction = 0x00; /* Input (Host to Controller) */
 		cmd.path = 0x00; /* HCI path if enabled */
@@ -1865,7 +1867,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
 			return false;
 	}
 
-	if (conn->iso_qos.in.sdu) {
+	if (conn->iso_qos.ucast.in.sdu) {
 		cmd.handle = cpu_to_le16(conn->handle);
 		cmd.direction = 0x01; /* Output (Controller to Host) */
 		cmd.path = 0x00; /* HCI path if enabled */
@@ -1892,7 +1894,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 	cmd.cis[0].acl_handle = cpu_to_le16(conn->link->handle);
 	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
 	cmd.cp.num_cis++;
-	cig = conn->iso_qos.cig;
+	cig = conn->iso_qos.ucast.cig;
 
 	hci_dev_lock(hdev);
 
@@ -1902,7 +1904,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
 
 		if (conn == data || conn->type != ISO_LINK ||
-		    conn->state == BT_CONNECTED || conn->iso_qos.cig != cig)
+		    conn->state == BT_CONNECTED || conn->iso_qos.ucast.cig != cig)
 			continue;
 
 		/* Check if all CIS(s) belonging to a CIG are ready */
@@ -2002,8 +2004,8 @@ static void hci_bind_bis(struct hci_conn *conn,
 			 struct bt_iso_qos *qos)
 {
 	/* Update LINK PHYs according to QoS preference */
-	conn->le_tx_phy = qos->out.phy;
-	conn->le_tx_phy = qos->out.phy;
+	conn->le_tx_phy = qos->bcast.out.phy;
+	conn->le_tx_phy = qos->bcast.out.phy;
 	conn->iso_qos = *qos;
 	conn->state = BT_BOUND;
 }
@@ -2016,16 +2018,16 @@ static int create_big_sync(struct hci_dev *hdev, void *data)
 	u32 flags = 0;
 	int err;
 
-	if (qos->out.phy == 0x02)
+	if (qos->bcast.out.phy == 0x02)
 		flags |= MGMT_ADV_FLAG_SEC_2M;
 
 	/* Align intervals */
-	interval = qos->out.interval / 1250;
+	interval = qos->bcast.out.interval / 1250;
 
-	if (qos->bis)
-		sync_interval = qos->sync_interval * 1600;
+	if (qos->bcast.bis)
+		sync_interval = qos->bcast.sync_interval * 1600;
 
-	err = hci_start_per_adv_sync(hdev, qos->bis, conn->le_per_adv_data_len,
+	err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->le_per_adv_data_len,
 				     conn->le_per_adv_data, flags, interval,
 				     interval, sync_interval);
 	if (err)
@@ -2062,7 +2064,7 @@ static int create_pa_sync(struct hci_dev *hdev, void *data)
 }
 
 int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
-		       __u8 sid)
+		       __u8 sid, struct bt_iso_qos *qos)
 {
 	struct hci_cp_le_pa_create_sync *cp;
 
@@ -2075,9 +2077,13 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type,
 		return -ENOMEM;
 	}
 
+	cp->options = qos->bcast.options;
 	cp->sid = sid;
 	cp->addr_type = dst_type;
 	bacpy(&cp->addr, dst);
+	cp->skip = cpu_to_le16(qos->bcast.skip);
+	cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+	cp->sync_cte_type = qos->bcast.sync_cte_type;
 
 	/* Queue start pa_create_sync and scan */
 	return hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete);
@@ -2100,8 +2106,12 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct bt_iso_qos *qos,
 		return err;
 
 	memset(&pdu, 0, sizeof(pdu));
-	pdu.cp.handle = qos->big;
+	pdu.cp.handle = qos->bcast.big;
 	pdu.cp.sync_handle = cpu_to_le16(sync_handle);
+	pdu.cp.encryption = qos->bcast.encryption;
+	memcpy(pdu.cp.bcode, qos->bcast.bcode, sizeof(pdu.cp.bcode));
+	pdu.cp.mse = qos->bcast.mse;
+	pdu.cp.timeout = cpu_to_le16(qos->bcast.timeout);
 	pdu.cp.num_bis = num_bis;
 	memcpy(pdu.bis, bis, num_bis);
 
@@ -2151,7 +2161,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 		return ERR_PTR(err);
 	}
 
-	hci_iso_qos_setup(hdev, conn, &qos->out,
+	hci_iso_qos_setup(hdev, conn, &qos->bcast.out,
 			  conn->le_tx_phy ? conn->le_tx_phy :
 			  hdev->le_tx_def_phys);
 
@@ -2177,9 +2187,9 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 	if (IS_ERR(le))
 		return le;
 
-	hci_iso_qos_setup(hdev, le, &qos->out,
+	hci_iso_qos_setup(hdev, le, &qos->ucast.out,
 			  le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys);
-	hci_iso_qos_setup(hdev, le, &qos->in,
+	hci_iso_qos_setup(hdev, le, &qos->ucast.in,
 			  le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
 
 	cis = hci_bind_cis(hdev, dst, dst_type, qos);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 51f13518dba9..0e0a93cc1218 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1,6 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+   Copyright 2023 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -3833,7 +3834,7 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		if (conn->type != ISO_LINK || conn->iso_qos.cig != rp->cig_id ||
+		if (conn->type != ISO_LINK || conn->iso_qos.ucast.cig != rp->cig_id ||
 		    conn->state == BT_CONNECTED)
 			continue;
 
@@ -3890,7 +3891,7 @@ static u8 hci_cc_le_setup_iso_path(struct hci_dev *hdev, void *data,
 	/* Input (Host to Controller) */
 	case 0x00:
 		/* Only confirm connection if output only */
-		if (conn->iso_qos.out.sdu && !conn->iso_qos.in.sdu)
+		if (conn->iso_qos.ucast.out.sdu && !conn->iso_qos.ucast.in.sdu)
 			hci_connect_cfm(conn, rp->status);
 		break;
 	/* Output (Controller to Host) */
@@ -6818,15 +6819,15 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data,
 		memset(&interval, 0, sizeof(interval));
 
 		memcpy(&interval, ev->c_latency, sizeof(ev->c_latency));
-		conn->iso_qos.in.interval = le32_to_cpu(interval);
+		conn->iso_qos.ucast.in.interval = le32_to_cpu(interval);
 		memcpy(&interval, ev->p_latency, sizeof(ev->p_latency));
-		conn->iso_qos.out.interval = le32_to_cpu(interval);
-		conn->iso_qos.in.latency = le16_to_cpu(ev->interval);
-		conn->iso_qos.out.latency = le16_to_cpu(ev->interval);
-		conn->iso_qos.in.sdu = le16_to_cpu(ev->c_mtu);
-		conn->iso_qos.out.sdu = le16_to_cpu(ev->p_mtu);
-		conn->iso_qos.in.phy = ev->c_phy;
-		conn->iso_qos.out.phy = ev->p_phy;
+		conn->iso_qos.ucast.out.interval = le32_to_cpu(interval);
+		conn->iso_qos.ucast.in.latency = le16_to_cpu(ev->interval);
+		conn->iso_qos.ucast.out.latency = le16_to_cpu(ev->interval);
+		conn->iso_qos.ucast.in.sdu = le16_to_cpu(ev->c_mtu);
+		conn->iso_qos.ucast.out.sdu = le16_to_cpu(ev->p_mtu);
+		conn->iso_qos.ucast.in.phy = ev->c_phy;
+		conn->iso_qos.ucast.out.phy = ev->p_phy;
 	}
 
 	if (!ev->status) {
@@ -6900,8 +6901,8 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
 		cis->handle = cis_handle;
 	}
 
-	cis->iso_qos.cig = ev->cig_id;
-	cis->iso_qos.cis = ev->cis_id;
+	cis->iso_qos.ucast.cig = ev->cig_id;
+	cis->iso_qos.ucast.cis = ev->cis_id;
 
 	if (!(flags & HCI_PROTO_DEFER)) {
 		hci_le_accept_cis(hdev, ev->cis_handle);
@@ -6988,13 +6989,13 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 			bis->handle = handle;
 		}
 
-		bis->iso_qos.big = ev->handle;
+		bis->iso_qos.bcast.big = ev->handle;
 		memset(&interval, 0, sizeof(interval));
 		memcpy(&interval, ev->latency, sizeof(ev->latency));
-		bis->iso_qos.in.interval = le32_to_cpu(interval);
+		bis->iso_qos.bcast.in.interval = le32_to_cpu(interval);
 		/* Convert ISO Interval (1.25 ms slots) to latency (ms) */
-		bis->iso_qos.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
-		bis->iso_qos.in.sdu = le16_to_cpu(ev->max_pdu);
+		bis->iso_qos.bcast.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
+		bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu);
 
 		hci_iso_setup_path(bis);
 	}
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 8d136a730163..74117df03a3f 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -3,6 +3,7 @@
  * BlueZ - Bluetooth protocol stack for Linux
  *
  * Copyright (C) 2022 Intel Corporation
+ * Copyright 2023 NXP
  */
 
 #include <linux/module.h>
@@ -59,11 +60,17 @@ struct iso_pinfo {
 	__u16			sync_handle;
 	__u32			flags;
 	struct bt_iso_qos	qos;
+	bool			qos_user_set;
 	__u8			base_len;
 	__u8			base[BASE_MAX_LENGTH];
 	struct iso_conn		*conn;
 };
 
+static struct bt_iso_qos default_qos;
+
+static bool check_ucast_qos(struct bt_iso_qos *qos);
+static bool check_bcast_qos(struct bt_iso_qos *qos);
+
 /* ---- ISO timers ---- */
 #define ISO_CONN_TIMEOUT	(HZ * 40)
 #define ISO_DISCONN_TIMEOUT	(HZ * 2)
@@ -264,8 +271,15 @@ static int iso_connect_bis(struct sock *sk)
 		goto unlock;
 	}
 
+	/* Fail if user set invalid QoS */
+	if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+		iso_pi(sk)->qos = default_qos;
+		err = -EINVAL;
+		goto unlock;
+	}
+
 	/* Fail if out PHYs are marked as disabled */
-	if (!iso_pi(sk)->qos.out.phy) {
+	if (!iso_pi(sk)->qos.bcast.out.phy) {
 		err = -EINVAL;
 		goto unlock;
 	}
@@ -336,8 +350,15 @@ static int iso_connect_cis(struct sock *sk)
 		goto unlock;
 	}
 
+	/* Fail if user set invalid QoS */
+	if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) {
+		iso_pi(sk)->qos = default_qos;
+		err = -EINVAL;
+		goto unlock;
+	}
+
 	/* Fail if either PHYs are marked as disabled */
-	if (!iso_pi(sk)->qos.in.phy && !iso_pi(sk)->qos.out.phy) {
+	if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) {
 		err = -EINVAL;
 		goto unlock;
 	}
@@ -417,7 +438,7 @@ static int iso_send_frame(struct sock *sk, struct sk_buff *skb)
 
 	BT_DBG("sk %p len %d", sk, skb->len);
 
-	if (skb->len > qos->out.sdu)
+	if (skb->len > qos->ucast.out.sdu)
 		return -EMSGSIZE;
 
 	len = skb->len;
@@ -680,13 +701,23 @@ static struct proto iso_proto = {
 }
 
 static struct bt_iso_qos default_qos = {
-	.cig		= BT_ISO_QOS_CIG_UNSET,
-	.cis		= BT_ISO_QOS_CIS_UNSET,
-	.sca		= 0x00,
-	.packing	= 0x00,
-	.framing	= 0x00,
-	.in		= DEFAULT_IO_QOS,
-	.out		= DEFAULT_IO_QOS,
+	.bcast = {
+		.big			= BT_ISO_QOS_BIG_UNSET,
+		.bis			= BT_ISO_QOS_BIS_UNSET,
+		.sync_interval		= 0x00,
+		.packing		= 0x00,
+		.framing		= 0x00,
+		.in			= DEFAULT_IO_QOS,
+		.out			= DEFAULT_IO_QOS,
+		.encryption		= 0x00,
+		.bcode			= {0x00},
+		.options		= 0x00,
+		.skip			= 0x0000,
+		.sync_timeout		= 0x4000,
+		.sync_cte_type		= 0x00,
+		.mse			= 0x00,
+		.timeout		= 0x4000,
+	},
 };
 
 static struct sock *iso_sock_alloc(struct net *net, struct socket *sock,
@@ -893,9 +924,15 @@ static int iso_listen_bis(struct sock *sk)
 	if (!hdev)
 		return -EHOSTUNREACH;
 
+	/* Fail if user set invalid QoS */
+	if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+		iso_pi(sk)->qos = default_qos;
+		return -EINVAL;
+	}
+
 	err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst,
 				 le_addr_type(iso_pi(sk)->dst_type),
-				 iso_pi(sk)->bc_sid);
+				 iso_pi(sk)->bc_sid, &iso_pi(sk)->qos);
 
 	hci_dev_put(hdev);
 
@@ -1154,21 +1191,62 @@ static bool check_io_qos(struct bt_iso_io_qos *qos)
 	return true;
 }
 
-static bool check_qos(struct bt_iso_qos *qos)
+static bool check_ucast_qos(struct bt_iso_qos *qos)
 {
-	if (qos->sca > 0x07)
+	if (qos->ucast.sca > 0x07)
 		return false;
 
-	if (qos->packing > 0x01)
+	if (qos->ucast.packing > 0x01)
 		return false;
 
-	if (qos->framing > 0x01)
+	if (qos->ucast.framing > 0x01)
 		return false;
 
-	if (!check_io_qos(&qos->in))
+	if (!check_io_qos(&qos->ucast.in))
 		return false;
 
-	if (!check_io_qos(&qos->out))
+	if (!check_io_qos(&qos->ucast.out))
+		return false;
+
+	return true;
+}
+
+static bool check_bcast_qos(struct bt_iso_qos *qos)
+{
+	if (qos->bcast.sync_interval > 0x07)
+		return false;
+
+	if (qos->bcast.packing > 0x01)
+		return false;
+
+	if (qos->bcast.framing > 0x01)
+		return false;
+
+	if (!check_io_qos(&qos->bcast.in))
+		return false;
+
+	if (!check_io_qos(&qos->bcast.out))
+		return false;
+
+	if (qos->bcast.encryption > 0x01)
+		return false;
+
+	if (qos->bcast.options > 0x07)
+		return false;
+
+	if (qos->bcast.skip > 0x01f3)
+		return false;
+
+	if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000)
+		return false;
+
+	if (qos->bcast.sync_cte_type > 0x1f)
+		return false;
+
+	if (qos->bcast.mse > 0x1f)
+		return false;
+
+	if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000)
 		return false;
 
 	return true;
@@ -1179,7 +1257,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 {
 	struct sock *sk = sock->sk;
 	int len, err = 0;
-	struct bt_iso_qos qos;
+	struct bt_iso_qos qos = default_qos;
 	u32 opt;
 
 	BT_DBG("sk %p", sk);
@@ -1212,24 +1290,19 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 		}
 
 		len = min_t(unsigned int, sizeof(qos), optlen);
-		if (len != sizeof(qos)) {
-			err = -EINVAL;
-			break;
-		}
-
-		memset(&qos, 0, sizeof(qos));
 
 		if (copy_from_sockptr(&qos, optval, len)) {
 			err = -EFAULT;
 			break;
 		}
 
-		if (!check_qos(&qos)) {
+		if (len == sizeof(qos.ucast) && !check_ucast_qos(&qos)) {
 			err = -EINVAL;
 			break;
 		}
 
 		iso_pi(sk)->qos = qos;
+		iso_pi(sk)->qos_user_set = true;
 
 		break;
 
@@ -1419,7 +1492,7 @@ static bool iso_match_big(struct sock *sk, void *data)
 {
 	struct hci_evt_le_big_sync_estabilished *ev = data;
 
-	return ev->handle == iso_pi(sk)->qos.big;
+	return ev->handle == iso_pi(sk)->qos.bcast.big;
 }
 
 static void iso_conn_ready(struct iso_conn *conn)
-- 
cgit v1.2.3


From 288c90224eec55d13e786844b7954ef060752089 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 19 Dec 2022 13:37:02 -0800
Subject: Bluetooth: Enable all supported LE PHY by default

This enables 2M and Coded PHY by default if they are marked as supported
in the LE features bits.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  4 ++++
 net/bluetooth/hci_sync.c         | 28 ++++++++++++++++++++++++----
 net/bluetooth/mgmt.c             |  4 ++--
 3 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f11689284112..827e67159523 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1683,9 +1683,13 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
 #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \
 		      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M))
 
+#define le_2m_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_2M))
+
 #define scan_2m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_2M) || \
 		      ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M))
 
+#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED))
+
 #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \
 			 ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED))
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index f21497ebc814..00017f75cd41 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4414,18 +4414,38 @@ static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev)
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
 }
 
-/* Set Default PHY parameters if command is supported */
+/* Set Default PHY parameters if the command is supported, enabling all
+ * supported PHYs according to the LE Features bits.
+ */
 static int hci_le_set_default_phy_sync(struct hci_dev *hdev)
 {
 	struct hci_cp_le_set_default_phy cp;
 
-	if (!(hdev->commands[35] & 0x20))
+	if (!(hdev->commands[35] & 0x20)) {
+		/* If the command is not supported it means only 1M PHY is
+		 * supported.
+		 */
+		hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+		hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
 		return 0;
+	}
 
 	memset(&cp, 0, sizeof(cp));
 	cp.all_phys = 0x00;
-	cp.tx_phys = hdev->le_tx_def_phys;
-	cp.rx_phys = hdev->le_rx_def_phys;
+	cp.tx_phys = HCI_LE_SET_PHY_1M;
+	cp.rx_phys = HCI_LE_SET_PHY_1M;
+
+	/* Enables 2M PHY if supported */
+	if (le_2m_capable(hdev)) {
+		cp.tx_phys |= HCI_LE_SET_PHY_2M;
+		cp.rx_phys |= HCI_LE_SET_PHY_2M;
+	}
+
+	/* Enables Coded PHY if supported */
+	if (le_coded_capable(hdev)) {
+		cp.tx_phys |= HCI_LE_SET_PHY_CODED;
+		cp.rx_phys |= HCI_LE_SET_PHY_CODED;
+	}
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY,
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 249dc6777fb4..5f8c144c84b8 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -8393,10 +8393,10 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
 		flags |= MGMT_ADV_FLAG_HW_OFFLOAD;
 		flags |= MGMT_ADV_FLAG_CAN_SET_TX_POWER;
 
-		if (hdev->le_features[1] & HCI_LE_PHY_2M)
+		if (le_2m_capable(hdev))
 			flags |= MGMT_ADV_FLAG_SEC_2M;
 
-		if (hdev->le_features[1] & HCI_LE_PHY_CODED)
+		if (le_coded_capable(hdev))
 			flags |= MGMT_ADV_FLAG_SEC_CODED;
 	}
 
-- 
cgit v1.2.3


From 06149746e7203d5ffe2d6faf9799ee36203aa8b8 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 11 Apr 2023 16:02:22 -0700
Subject: Bluetooth: hci_conn: Add support for linking multiple hcon

Linking multiple hcon is required for some configurations that have
multiple CIS with the same peer. This is now covered by iso-tester in the
following test cases:

    ISO AC 6(i) - Success
    ISO AC 7(i) - Success
    ISO AC 8(i) - Success
    ISO AC 9(i) - Success
    ISO AC 11(i) - Success

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  14 +++-
 net/bluetooth/hci_conn.c         | 155 ++++++++++++++++++++++++++++-----------
 net/bluetooth/hci_event.c        |  92 +++++++++++------------
 net/bluetooth/iso.c              |   8 +-
 4 files changed, 172 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 827e67159523..4fe1e71cb9d8 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -770,7 +770,10 @@ struct hci_conn {
 	void		*iso_data;
 	struct amp_mgr	*amp_mgr;
 
-	struct hci_conn	*link;
+	struct list_head link_list;
+	struct hci_conn	*parent;
+	struct hci_link *link;
+
 	struct bt_codec codec;
 
 	void (*connect_cfm_cb)	(struct hci_conn *conn, u8 status);
@@ -780,6 +783,11 @@ struct hci_conn {
 	void (*cleanup)(struct hci_conn *conn);
 };
 
+struct hci_link {
+	struct list_head list;
+	struct hci_conn *conn;
+};
+
 struct hci_chan {
 	struct list_head list;
 	__u16 handle;
@@ -1383,12 +1391,14 @@ static inline void hci_conn_put(struct hci_conn *conn)
 	put_device(&conn->dev);
 }
 
-static inline void hci_conn_hold(struct hci_conn *conn)
+static inline struct hci_conn *hci_conn_hold(struct hci_conn *conn)
 {
 	BT_DBG("hcon %p orig refcnt %d", conn, atomic_read(&conn->refcnt));
 
 	atomic_inc(&conn->refcnt);
 	cancel_delayed_work(&conn->disc_work);
+
+	return conn;
 }
 
 static inline void hci_conn_drop(struct hci_conn *conn)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 01e0b7255174..d8466abbb36a 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -330,8 +330,11 @@ static void hci_add_sco(struct hci_conn *conn, __u16 handle)
 static bool find_next_esco_param(struct hci_conn *conn,
 				 const struct sco_param *esco_param, int size)
 {
+	if (!conn->parent)
+		return false;
+
 	for (; conn->attempt <= size; conn->attempt++) {
-		if (lmp_esco_2m_capable(conn->link) ||
+		if (lmp_esco_2m_capable(conn->parent) ||
 		    (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3))
 			break;
 		BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported",
@@ -461,7 +464,7 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data)
 		break;
 
 	case BT_CODEC_CVSD:
-		if (lmp_esco_capable(conn->link)) {
+		if (conn->parent && lmp_esco_capable(conn->parent)) {
 			if (!find_next_esco_param(conn, esco_param_cvsd,
 						  ARRAY_SIZE(esco_param_cvsd)))
 				return -EINVAL;
@@ -531,7 +534,7 @@ static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 		param = &esco_param_msbc[conn->attempt - 1];
 		break;
 	case SCO_AIRMODE_CVSD:
-		if (lmp_esco_capable(conn->link)) {
+		if (conn->parent && lmp_esco_capable(conn->parent)) {
 			if (!find_next_esco_param(conn, esco_param_cvsd,
 						  ARRAY_SIZE(esco_param_cvsd)))
 				return false;
@@ -637,21 +640,22 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
 /* Device _must_ be locked */
 void hci_sco_setup(struct hci_conn *conn, __u8 status)
 {
-	struct hci_conn *sco = conn->link;
+	struct hci_link *link;
 
-	if (!sco)
+	link = list_first_entry_or_null(&conn->link_list, struct hci_link, list);
+	if (!link || !link->conn)
 		return;
 
 	BT_DBG("hcon %p", conn);
 
 	if (!status) {
 		if (lmp_esco_capable(conn->hdev))
-			hci_setup_sync(sco, conn->handle);
+			hci_setup_sync(link->conn, conn->handle);
 		else
-			hci_add_sco(sco, conn->handle);
+			hci_add_sco(link->conn, conn->handle);
 	} else {
-		hci_connect_cfm(sco, status);
-		hci_conn_del(sco);
+		hci_connect_cfm(link->conn, status);
+		hci_conn_del(link->conn);
 	}
 }
 
@@ -1042,6 +1046,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	skb_queue_head_init(&conn->data_q);
 
 	INIT_LIST_HEAD(&conn->chan_list);
+	INIT_LIST_HEAD(&conn->link_list);
 
 	INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout);
 	INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept);
@@ -1069,15 +1074,39 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	return conn;
 }
 
-static bool hci_conn_unlink(struct hci_conn *conn)
+static void hci_conn_unlink(struct hci_conn *conn)
 {
+	struct hci_dev *hdev = conn->hdev;
+
+	bt_dev_dbg(hdev, "hcon %p", conn);
+
+	if (!conn->parent) {
+		struct hci_link *link, *t;
+
+		list_for_each_entry_safe(link, t, &conn->link_list, list)
+			hci_conn_unlink(link->conn);
+
+		return;
+	}
+
 	if (!conn->link)
-		return false;
+		return;
+
+	hci_conn_put(conn->parent);
+	conn->parent = NULL;
 
-	conn->link->link = NULL;
+	list_del_rcu(&conn->link->list);
+	synchronize_rcu();
+
+	kfree(conn->link);
 	conn->link = NULL;
 
-	return true;
+	/* Due to race, SCO connection might be not established
+	 * yet at this point. Delete it now, otherwise it is
+	 * possible for it to be stuck and can't be deleted.
+	 */
+	if (conn->handle == HCI_CONN_HANDLE_UNSET)
+		hci_conn_del(conn);
 }
 
 int hci_conn_del(struct hci_conn *conn)
@@ -1091,18 +1120,7 @@ int hci_conn_del(struct hci_conn *conn)
 	cancel_delayed_work_sync(&conn->idle_work);
 
 	if (conn->type == ACL_LINK) {
-		struct hci_conn *link = conn->link;
-
-		if (link) {
-			hci_conn_unlink(conn);
-			/* Due to race, SCO connection might be not established
-			 * yet at this point. Delete it now, otherwise it is
-			 * possible for it to be stuck and can't be deleted.
-			 */
-			if (link->handle == HCI_CONN_HANDLE_UNSET)
-				hci_conn_del(link);
-		}
-
+		hci_conn_unlink(conn);
 		/* Unacked frames */
 		hdev->acl_cnt += conn->sent;
 	} else if (conn->type == LE_LINK) {
@@ -1113,7 +1131,7 @@ int hci_conn_del(struct hci_conn *conn)
 		else
 			hdev->acl_cnt += conn->sent;
 	} else {
-		struct hci_conn *acl = conn->link;
+		struct hci_conn *acl = conn->parent;
 
 		if (acl) {
 			hci_conn_unlink(conn);
@@ -1600,11 +1618,40 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 	return acl;
 }
 
+static struct hci_link *hci_conn_link(struct hci_conn *parent,
+				      struct hci_conn *conn)
+{
+	struct hci_dev *hdev = parent->hdev;
+	struct hci_link *link;
+
+	bt_dev_dbg(hdev, "parent %p hcon %p", parent, conn);
+
+	if (conn->link)
+		return conn->link;
+
+	if (conn->parent)
+		return NULL;
+
+	link = kzalloc(sizeof(*link), GFP_KERNEL);
+	if (!link)
+		return NULL;
+
+	link->conn = hci_conn_hold(conn);
+	conn->link = link;
+	conn->parent = hci_conn_get(parent);
+
+	/* Use list_add_tail_rcu to append to the list */
+	list_add_tail_rcu(&link->list, &parent->link_list);
+
+	return link;
+}
+
 struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 				 __u16 setting, struct bt_codec *codec)
 {
 	struct hci_conn *acl;
 	struct hci_conn *sco;
+	struct hci_link *link;
 
 	acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING,
 			      CONN_REASON_SCO_CONNECT);
@@ -1620,10 +1667,12 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 		}
 	}
 
-	acl->link = sco;
-	sco->link = acl;
-
-	hci_conn_hold(sco);
+	link = hci_conn_link(acl, sco);
+	if (!link) {
+		hci_conn_drop(acl);
+		hci_conn_drop(sco);
+		return NULL;
+	}
 
 	sco->setting = setting;
 	sco->codec = *codec;
@@ -1890,7 +1939,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 	u8 cig;
 
 	memset(&cmd, 0, sizeof(cmd));
-	cmd.cis[0].acl_handle = cpu_to_le16(conn->link->handle);
+	cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle);
 	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
 	cmd.cp.num_cis++;
 	cig = conn->iso_qos.ucast.cig;
@@ -1903,11 +1952,12 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
 
 		if (conn == data || conn->type != ISO_LINK ||
-		    conn->state == BT_CONNECTED || conn->iso_qos.ucast.cig != cig)
+		    conn->state == BT_CONNECTED ||
+		    conn->iso_qos.ucast.cig != cig)
 			continue;
 
 		/* Check if all CIS(s) belonging to a CIG are ready */
-		if (!conn->link || conn->link->state != BT_CONNECTED ||
+		if (!conn->parent || conn->parent->state != BT_CONNECTED ||
 		    conn->state != BT_CONNECT) {
 			cmd.cp.num_cis = 0;
 			break;
@@ -1924,7 +1974,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 		 * command have been generated, the Controller shall return the
 		 * error code Command Disallowed (0x0C).
 		 */
-		cis->acl_handle = cpu_to_le16(conn->link->handle);
+		cis->acl_handle = cpu_to_le16(conn->parent->handle);
 		cis->cis_handle = cpu_to_le16(conn->handle);
 		cmd.cp.num_cis++;
 	}
@@ -1943,15 +1993,33 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 int hci_le_create_cis(struct hci_conn *conn)
 {
 	struct hci_conn *cis;
+	struct hci_link *link, *t;
 	struct hci_dev *hdev = conn->hdev;
 	int err;
 
+	bt_dev_dbg(hdev, "hcon %p", conn);
+
 	switch (conn->type) {
 	case LE_LINK:
-		if (!conn->link || conn->state != BT_CONNECTED)
+		if (conn->state != BT_CONNECTED || list_empty(&conn->link_list))
 			return -EINVAL;
-		cis = conn->link;
-		break;
+
+		cis = NULL;
+
+		/* hci_conn_link uses list_add_tail_rcu so the list is in
+		 * the same order as the connections are requested.
+		 */
+		list_for_each_entry_safe(link, t, &conn->link_list, list) {
+			if (link->conn->state == BT_BOUND) {
+				err = hci_le_create_cis(link->conn);
+				if (err)
+					return err;
+
+				cis = link->conn;
+			}
+		}
+
+		return cis ? 0 : -EINVAL;
 	case ISO_LINK:
 		cis = conn;
 		break;
@@ -2172,6 +2240,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 {
 	struct hci_conn *le;
 	struct hci_conn *cis;
+	struct hci_link *link;
 
 	if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
 		le = hci_connect_le(hdev, dst, dst_type, false,
@@ -2197,16 +2266,18 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
 		return cis;
 	}
 
-	le->link = cis;
-	cis->link = le;
-
-	hci_conn_hold(cis);
+	link = hci_conn_link(le, cis);
+	if (!link) {
+		hci_conn_drop(le);
+		hci_conn_drop(cis);
+		return NULL;
+	}
 
 	/* If LE is already connected and CIS handle is already set proceed to
 	 * Create CIS immediately.
 	 */
 	if (le->state == BT_CONNECTED && cis->handle != HCI_CONN_HANDLE_UNSET)
-		hci_le_create_cis(le);
+		hci_le_create_cis(cis);
 
 	return cis;
 }
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 0e0a93cc1218..d00ef6e3fc45 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2345,7 +2345,8 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
 {
 	struct hci_cp_add_sco *cp;
-	struct hci_conn *acl, *sco;
+	struct hci_conn *acl;
+	struct hci_link *link;
 	__u16 handle;
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
@@ -2365,12 +2366,13 @@ static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
 
 	acl = hci_conn_hash_lookup_handle(hdev, handle);
 	if (acl) {
-		sco = acl->link;
-		if (sco) {
-			sco->state = BT_CLOSED;
+		link = list_first_entry_or_null(&acl->link_list,
+						struct hci_link, list);
+		if (link && link->conn) {
+			link->conn->state = BT_CLOSED;
 
-			hci_connect_cfm(sco, status);
-			hci_conn_del(sco);
+			hci_connect_cfm(link->conn, status);
+			hci_conn_del(link->conn);
 		}
 	}
 
@@ -2637,74 +2639,61 @@ static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
 	hci_dev_unlock(hdev);
 }
 
-static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+static void hci_setup_sync_conn_status(struct hci_dev *hdev, __u16 handle,
+				       __u8 status)
 {
-	struct hci_cp_setup_sync_conn *cp;
-	struct hci_conn *acl, *sco;
-	__u16 handle;
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", status);
-
-	if (!status)
-		return;
-
-	cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
-	if (!cp)
-		return;
+	struct hci_conn *acl;
+	struct hci_link *link;
 
-	handle = __le16_to_cpu(cp->handle);
-
-	bt_dev_dbg(hdev, "handle 0x%4.4x", handle);
+	bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", handle, status);
 
 	hci_dev_lock(hdev);
 
 	acl = hci_conn_hash_lookup_handle(hdev, handle);
 	if (acl) {
-		sco = acl->link;
-		if (sco) {
-			sco->state = BT_CLOSED;
+		link = list_first_entry_or_null(&acl->link_list,
+						struct hci_link, list);
+		if (link && link->conn) {
+			link->conn->state = BT_CLOSED;
 
-			hci_connect_cfm(sco, status);
-			hci_conn_del(sco);
+			hci_connect_cfm(link->conn, status);
+			hci_conn_del(link->conn);
 		}
 	}
 
 	hci_dev_unlock(hdev);
 }
 
-static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
 {
-	struct hci_cp_enhanced_setup_sync_conn *cp;
-	struct hci_conn *acl, *sco;
-	__u16 handle;
+	struct hci_cp_setup_sync_conn *cp;
 
 	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
 	if (!status)
 		return;
 
-	cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN);
+	cp = hci_sent_cmd_data(hdev, HCI_OP_SETUP_SYNC_CONN);
 	if (!cp)
 		return;
 
-	handle = __le16_to_cpu(cp->handle);
+	hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
+}
 
-	bt_dev_dbg(hdev, "handle 0x%4.4x", handle);
+static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+{
+	struct hci_cp_enhanced_setup_sync_conn *cp;
 
-	hci_dev_lock(hdev);
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
 
-	acl = hci_conn_hash_lookup_handle(hdev, handle);
-	if (acl) {
-		sco = acl->link;
-		if (sco) {
-			sco->state = BT_CLOSED;
+	if (!status)
+		return;
 
-			hci_connect_cfm(sco, status);
-			hci_conn_del(sco);
-		}
-	}
+	cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN);
+	if (!cp)
+		return;
 
-	hci_dev_unlock(hdev);
+	hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
 }
 
 static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
@@ -3834,19 +3823,20 @@ static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		if (conn->type != ISO_LINK || conn->iso_qos.ucast.cig != rp->cig_id ||
+		if (conn->type != ISO_LINK ||
+		    conn->iso_qos.ucast.cig != rp->cig_id ||
 		    conn->state == BT_CONNECTED)
 			continue;
 
 		conn->handle = __le16_to_cpu(rp->handle[i++]);
 
-		bt_dev_dbg(hdev, "%p handle 0x%4.4x link %p", conn,
-			   conn->handle, conn->link);
+		bt_dev_dbg(hdev, "%p handle 0x%4.4x parent %p", conn,
+			   conn->handle, conn->parent);
 
 		/* Create CIS if LE is already connected */
-		if (conn->link && conn->link->state == BT_CONNECTED) {
+		if (conn->parent && conn->parent->state == BT_CONNECTED) {
 			rcu_read_unlock();
-			hci_le_create_cis(conn->link);
+			hci_le_create_cis(conn);
 			rcu_read_lock();
 		}
 
@@ -5031,7 +5021,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
 		if (conn->out) {
 			conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
 					(hdev->esco_type & EDR_ESCO_MASK);
-			if (hci_setup_sync(conn, conn->link->handle))
+			if (hci_setup_sync(conn, conn->parent->handle))
 				goto unlock;
 		}
 		fallthrough;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 74117df03a3f..34d55a85d8f6 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1657,8 +1657,12 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
 
 		/* Check if LE link has failed */
 		if (status) {
-			if (hcon->link)
-				iso_conn_del(hcon->link, bt_to_errno(status));
+			struct hci_link *link, *t;
+
+			list_for_each_entry_safe(link, t, &hcon->link_list,
+						 list)
+				iso_conn_del(link->conn, bt_to_errno(status));
+
 			return;
 		}
 
-- 
cgit v1.2.3


From c14516faede33c2c31da45cf950d55dbff42962e Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 11 Apr 2023 16:14:25 -0700
Subject: Bluetooth: hci_conn: Fix not matching by CIS ID

This fixes the CIS lookup matching only by address, which prevents creating
a new hcon when the upper layer is requesting a specific CIS ID.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 12 +++++++++++-
 net/bluetooth/hci_conn.c         |  3 ++-
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 4fe1e71cb9d8..a6c8aee2f256 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1180,7 +1180,9 @@ static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev,
 
 static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
 							bdaddr_t *ba,
-							__u8 ba_type)
+							__u8 ba_type,
+							__u8 cig,
+							__u8 id)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
 	struct hci_conn  *c;
@@ -1191,6 +1193,14 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
 		if (c->type != ISO_LINK)
 			continue;
 
+		/* Match CIG ID if set */
+		if (cig != BT_ISO_QOS_CIG_UNSET && cig != c->iso_qos.ucast.cig)
+			continue;
+
+		/* Match CIS ID if set */
+		if (id != BT_ISO_QOS_CIS_UNSET && id != c->iso_qos.ucast.cis)
+			continue;
+
 		if (ba_type == c->dst_type && !bacmp(&c->dst, ba)) {
 			rcu_read_unlock();
 			return c;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index d8466abbb36a..c215e983e287 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1841,7 +1841,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 {
 	struct hci_conn *cis;
 
-	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type);
+	cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
+				       qos->ucast.cis);
 	if (!cis) {
 		cis = hci_conn_add(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
 		if (!cis)
-- 
cgit v1.2.3


From c09b80be6ffc338634b2f5f8cfa12b6843410834 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Wed, 12 Apr 2023 17:45:51 -0700
Subject: Bluetooth: hci_conn: Fix not waiting for HCI_EVT_LE_CIS_ESTABLISHED

When submitting HCI_OP_LE_CREATE_CIS the code shall wait for
HCI_EVT_LE_CIS_ESTABLISHED, thus enforcing the serialization of
HCI_OP_LE_CREATE_CIS, as the Core spec does not allow sending them in
parallel:

  BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2566:

  If the Host issues this command before all the HCI_LE_CIS_Established
  events from the previous use of the command have been generated, the
  Controller shall return the error code Command Disallowed (0x0C).

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 ++
 net/bluetooth/hci_conn.c         | 58 +----------------------------------
 net/bluetooth/hci_sync.c         | 65 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 17f5a4c32f36..f61b249787fc 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -122,6 +122,8 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason);
 
 int hci_le_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn);
 
+int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn);
+
 int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle);
 
 int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index c215e983e287..640b951bf40a 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1932,63 +1932,7 @@ bool hci_iso_setup_path(struct hci_conn *conn)
 
 static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
 {
-	struct {
-		struct hci_cp_le_create_cis cp;
-		struct hci_cis cis[0x1f];
-	} cmd;
-	struct hci_conn *conn = data;
-	u8 cig;
-
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle);
-	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
-	cmd.cp.num_cis++;
-	cig = conn->iso_qos.ucast.cig;
-
-	hci_dev_lock(hdev);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
-
-		if (conn == data || conn->type != ISO_LINK ||
-		    conn->state == BT_CONNECTED ||
-		    conn->iso_qos.ucast.cig != cig)
-			continue;
-
-		/* Check if all CIS(s) belonging to a CIG are ready */
-		if (!conn->parent || conn->parent->state != BT_CONNECTED ||
-		    conn->state != BT_CONNECT) {
-			cmd.cp.num_cis = 0;
-			break;
-		}
-
-		/* Group all CIS with state BT_CONNECT since the spec don't
-		 * allow to send them individually:
-		 *
-		 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
-		 * page 2566:
-		 *
-		 * If the Host issues this command before all the
-		 * HCI_LE_CIS_Established events from the previous use of the
-		 * command have been generated, the Controller shall return the
-		 * error code Command Disallowed (0x0C).
-		 */
-		cis->acl_handle = cpu_to_le16(conn->parent->handle);
-		cis->cis_handle = cpu_to_le16(conn->handle);
-		cmd.cp.num_cis++;
-	}
-
-	rcu_read_unlock();
-
-	hci_dev_unlock(hdev);
-
-	if (!cmd.cp.num_cis)
-		return 0;
-
-	return hci_send_cmd(hdev, HCI_OP_LE_CREATE_CIS, sizeof(cmd.cp) +
-			    sizeof(cmd.cis[0]) * cmd.cp.num_cis, &cmd);
+	return hci_le_create_cis_sync(hdev, data);
 }
 
 int hci_le_create_cis(struct hci_conn *conn)
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 00017f75cd41..6f060d00a70a 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6137,6 +6137,71 @@ done:
 	return err;
 }
 
+int hci_le_create_cis_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	struct {
+		struct hci_cp_le_create_cis cp;
+		struct hci_cis cis[0x1f];
+	} cmd;
+	u8 cig;
+	struct hci_conn *hcon = conn;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cis[0].acl_handle = cpu_to_le16(conn->parent->handle);
+	cmd.cis[0].cis_handle = cpu_to_le16(conn->handle);
+	cmd.cp.num_cis++;
+	cig = conn->iso_qos.ucast.cig;
+
+	hci_dev_lock(hdev);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
+
+		if (conn == hcon || conn->type != ISO_LINK ||
+		    conn->state == BT_CONNECTED ||
+		    conn->iso_qos.ucast.cig != cig)
+			continue;
+
+		/* Check if all CIS(s) belonging to a CIG are ready */
+		if (!conn->parent || conn->parent->state != BT_CONNECTED ||
+		    conn->state != BT_CONNECT) {
+			cmd.cp.num_cis = 0;
+			break;
+		}
+
+		/* Group all CIS with state BT_CONNECT since the spec doesn't
+		 * allow sending them individually:
+		 *
+		 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+		 * page 2566:
+		 *
+		 * If the Host issues this command before all the
+		 * HCI_LE_CIS_Established events from the previous use of the
+		 * command have been generated, the Controller shall return the
+		 * error code Command Disallowed (0x0C).
+		 */
+		cis->acl_handle = cpu_to_le16(conn->parent->handle);
+		cis->cis_handle = cpu_to_le16(conn->handle);
+		cmd.cp.num_cis++;
+	}
+
+	rcu_read_unlock();
+
+	hci_dev_unlock(hdev);
+
+	if (!cmd.cp.num_cis)
+		return 0;
+
+	/* Wait for HCI_LE_CIS_Established (conn is a stale loop cursor) */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS,
+					sizeof(cmd.cp) + sizeof(cmd.cis[0]) *
+					cmd.cp.num_cis, &cmd,
+					HCI_EVT_LE_CIS_ESTABLISHED,
+					hcon->conn_timeout, NULL);
+}
+
 int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
 {
 	struct hci_cp_le_remove_cig cp;
-- 
cgit v1.2.3


From 91b6d02ddcd113352bdd895990b252065c596de7 Mon Sep 17 00:00:00 2001
From: Raul Cheleguini <raul.cheleguini@gmail.com>
Date: Thu, 23 Mar 2023 10:45:39 -0300
Subject: Bluetooth: Add new quirk for broken set random RPA timeout for
 ATS2851

The ATS2851 based controller advertises support for command "LE Set
Resolvable Private Address Timeout" but does not actually implement it,
impeding the controller initialization.

Add the quirk HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT to unblock the controller
initialization.

< HCI Command: LE Set Resolvable Private... (0x08|0x002e) plen 2
        Timeout: 900 seconds
> HCI Event: Command Status (0x0f) plen 4
      LE Set Resolvable Private Address Timeout (0x08|0x002e) ncmd 1
        Status: Unknown HCI Command (0x01)

Co-developed-by: imoc <wzj9912@gmail.com>
Signed-off-by: imoc <wzj9912@gmail.com>
Signed-off-by: Raul Cheleguini <raul.cheleguini@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c   | 1 +
 include/net/bluetooth/hci.h | 8 ++++++++
 net/bluetooth/hci_sync.c    | 6 +++++-
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 2303b0a66323..3aa189b1986d 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -4149,6 +4149,7 @@ static int btusb_probe(struct usb_interface *intf,
 		/* Support is advertised, but not implemented */
 		set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks);
 		set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks);
+		set_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks);
 		set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks);
 	}
 
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 997107bfc0b1..07df96c47ef4 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -301,6 +301,14 @@ enum {
 	 * don't actually support features declared there.
 	 */
 	HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2,
+
+	/*
+	 * When this quirk is set, the HCI_OP_LE_SET_RPA_TIMEOUT command is
+	 * skipped during initialization. This is required for the Actions
+	 * Semiconductor ATS2851 based controllers, which erroneously claim
+	 * to support it.
+	 */
+	HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT,
 };
 
 /* HCI device flags */
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 6f060d00a70a..771aaa808967 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4093,7 +4093,8 @@ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev)
 {
 	__le16 timeout = cpu_to_le16(hdev->rpa_timeout);
 
-	if (!(hdev->commands[35] & 0x04))
+	if (!(hdev->commands[35] & 0x04) ||
+	    test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks))
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT,
@@ -4553,6 +4554,9 @@ static const struct {
 			 "HCI Set Event Filter command not supported."),
 	HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN,
 			 "HCI Enhanced Setup Synchronous Connection command is "
+			 "advertised, but not supported."),
+	HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT,
+			 "HCI LE Set Random Private Address Timeout command is "
 			 "advertised, but not supported.")
 };
 
-- 
cgit v1.2.3


From d883a4669a1def6d121ccf5e64ad28260d1c9531 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 21 Apr 2023 11:37:55 -0700
Subject: Bluetooth: hci_sync: Only allow hci_cmd_sync_queue if running

This makes sure hci_cmd_sync_queue only queues new work if HCI_RUNNING
has been set, otherwise there is a risk of commands being sent while
turning off.

Because hci_cmd_sync_queue can no longer queue work while HCI_RUNNING is
not set, it cannot be used to power on adapters, so hci_cmd_sync_submit
is introduced instead, which bypasses the HCI_RUNNING check and thus
behaves like the old implementation.

Link: https://lore.kernel.org/all/CAB4PzUpDMvdc8j2MdeSAy1KkAE-D3woprCwAdYWeOc-3v3c9Sw@mail.gmail.com/
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  2 ++
 net/bluetooth/hci_sync.c         | 25 +++++++++++++++++++++++--
 net/bluetooth/mgmt.c             | 12 ++++++++----
 3 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index f61b249787fc..2495be4d8b82 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -41,6 +41,8 @@ void hci_cmd_sync_clear(struct hci_dev *hdev);
 void hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
 void __hci_cmd_sync_cancel(struct hci_dev *hdev, int err);
 
+int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			void *data, hci_cmd_sync_work_destroy_t destroy);
 int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 		       void *data, hci_cmd_sync_work_destroy_t destroy);
 
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 771aaa808967..647a8ce54062 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -684,8 +684,12 @@ void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
 }
 EXPORT_SYMBOL(hci_cmd_sync_cancel);
 
-int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
-		       void *data, hci_cmd_sync_work_destroy_t destroy)
+/* Submit HCI command to be run as cmd_sync_work:
+ *
+ * - hdev must _not_ be unregistered
+ */
+int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			void *data, hci_cmd_sync_work_destroy_t destroy)
 {
 	struct hci_cmd_sync_work_entry *entry;
 
@@ -708,6 +712,23 @@ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
 
 	return 0;
 }
+EXPORT_SYMBOL(hci_cmd_sync_submit);
+
+/* Queue HCI command:
+ *
+ * - hdev must be running
+ */
+int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+		       void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	/* Only queue command if hdev is running which means it had been opened
+	 * and is either on init phase or is already up.
+	 */
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		return -ENETDOWN;
+
+	return hci_cmd_sync_submit(hdev, func, data, destroy);
+}
 EXPORT_SYMBOL(hci_cmd_sync_queue);
 
 int hci_update_eir_sync(struct hci_dev *hdev)
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 13c745876b39..f7b2d0971f24 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1400,11 +1400,15 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
 	}
 
 	/* Cancel potentially blocking sync operation before power off */
-	if (cp->val == 0x00)
+	if (cp->val == 0x00) {
 		__hci_cmd_sync_cancel(hdev, -EHOSTDOWN);
-
-	err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
-				 mgmt_set_powered_complete);
+		err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
+					 mgmt_set_powered_complete);
+	} else {
+		/* Use hci_cmd_sync_submit since hdev might not be running */
+		err = hci_cmd_sync_submit(hdev, set_powered_sync, cmd,
+					  mgmt_set_powered_complete);
+	}
 
 	if (err < 0)
 		mgmt_pending_remove(cmd);
-- 
cgit v1.2.3