From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2024 12:12:18 +0000 Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds dev->max_pacing_offload_horizon expressing this timing wheel horizon in nsec units. This is a read-only attribute. Unless a driver sets it, dev->max_pacing_offload_horizon is zero. v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 ) v3: added yaml doc (also per Jakub feedback) Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..506ba9c80e83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; -- cgit v1.2.3 From f26080d47007df2ee90e65b7d390207ff3a588af Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 3 Oct 2024 12:12:19 +0000 Subject: net_sched: sch_fq: add the ability to offload pacing Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. 
This patch adds the TCA_FQ_OFFLOAD_HORIZON attribute to the FQ packet scheduler. Its value is capped by the device max_pacing_offload_horizon, added in the prior patch. It allows FQ to let packets within the pacing offload horizon be delivered to the device, which will handle the needed delay without host involvement. Signed-off-by: Jeffrey Ji Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a3cd0c2dc995..25a9a47001cd 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -836,6 +836,8 @@ enum { TCA_FQ_WEIGHTS, /* Weights for each band */ + TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */ + __TCA_FQ_MAX }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 19a49af5a9e5..aeabf45c9200 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -111,6 +111,7 @@ struct fq_perband_flows { struct fq_sched_data { /* Read mostly cache line */ + u64 offload_horizon; u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -299,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q, } /* Fast path can be used if : - * 1) Packet tstamp is in the past. + * 1) Packet tstamp is in the past, or within the pacing offload horizon. 
* 2) FQ qlen == 0 OR * (no flow is currently eligible for transmit, * AND fast path queue has less than 8 packets) @@ -314,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, const struct fq_sched_data *q = qdisc_priv(sch); const struct sock *sk; - if (fq_skb_cb(skb)->time_to_send > now) + if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) return false; if (sch->q.qlen != 0) { @@ -595,15 +596,18 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) unsigned long sample; struct rb_node *p; - if (q->time_next_delayed_flow > now) + if (q->time_next_delayed_flow > now + q->offload_horizon) return; /* Update unthrottle latency EWMA. * This is cheap and can help diagnosing timer/latency problems. */ sample = (unsigned long)(now - q->time_next_delayed_flow); - q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; - q->unthrottle_latency_ns += sample >> 3; + if ((long)sample > 0) { + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + } + now += q->offload_horizon; q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { @@ -687,7 +691,7 @@ begin: u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); - if (now < time_next_packet) { + if (now + q->offload_horizon < time_next_packet) { head->first = f->next; f->time_next_packet = time_next_packet; fq_flow_set_throttled(q, f); @@ -925,6 +929,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)), [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)), + [TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 }, }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ @@ -1100,6 +1105,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->horizon_drop, nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); + 
if (tb[TCA_FQ_OFFLOAD_HORIZON]) { + u64 offload_horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]); + + if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) { + WRITE_ONCE(q->offload_horizon, offload_horizon); + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon"); + err = -EINVAL; + } + } if (!err) { sch_tree_unlock(sch); @@ -1183,6 +1199,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) .bands = FQ_BANDS, }; struct nlattr *opts; + u64 offload_horizon; u64 ce_threshold; s32 weights[3]; u64 horizon; @@ -1199,6 +1216,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) horizon = READ_ONCE(q->horizon); do_div(horizon, NSEC_PER_USEC); + offload_horizon = READ_ONCE(q->offload_horizon); + do_div(offload_horizon, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, @@ -1224,6 +1244,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_TIMER_SLACK, READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) || nla_put_u8(skb, TCA_FQ_HORIZON_DROP, READ_ONCE(q->horizon_drop))) goto nla_put_failure; -- cgit v1.2.3 From 4436df478860bb5da1864df2cd20f281a210f139 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Fri, 7 Jun 2024 18:19:12 +0200 Subject: batman-adv: Add flex array to struct batadv_tvlv_tt_data The "struct batadv_tvlv_tt_data" uses a dynamically sized set of trailing elements. Specifically, it uses an array of structures of type "batadv_tvlv_tt_vlan_data". So, use the preferred way in the kernel declaring a flexible array [1]. At the same time, prepare for the coming implementation by GCC and Clang of the __counted_by attribute. 
Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). In this case, it is important to note that the attribute used is specifically __counted_by_be since variable "num_vlan" is of type __be16. The following change to the "batadv_tt_tvlv_ogm_handler_v1" function: - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); - tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan); + tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data + + flex_size); is intended to prevent the compiler from generating an "out-of-bounds" notification due to the __counted_by attribute. The compiler can do a pointer calculation using the vlan_data flexible array memory, or in other words, this may be calculated as an array offset, since it is the same as: &tt_data->vlan_data[num_vlan] Therefore, we go past the end of the array. In other "multiple trailing flexible array" situations, this has been solved by addressing from the base pointer, since the compiler either knows the full allocation size or it knows nothing about it (this case, since it came from a "void *" function argument). The order in which the structure batadv_tvlv_tt_data and the structure batadv_tvlv_tt_vlan_data are defined must be swapped to avoid an incomplete type error. Also, avoid the open-coded arithmetic in memory allocator functions [2] using the "struct_size" macro and use the "flex_array_size" helper to clarify some calculations, when possible. Moreover, the new structure member also allows us to avoid the open-coded arithmetic on pointers in some situations. Take advantage of this. This code was detected with the help of Coccinelle, and audited and modified manually. 
Link: https://www.kernel.org/doc/html/next/process/deprecated.html#zero-length-and-one-element-arrays [1] Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [2] Reviewed-by: Kees Cook Signed-off-by: Erick Archer Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 29 ++++++++++++---------- net/batman-adv/translation-table.c | 49 ++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 42 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 6e25753015df..439132a819ea 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -9,6 +9,7 @@ #include #include +#include #include /** @@ -592,19 +593,6 @@ struct batadv_tvlv_gateway_data { __be32 bandwidth_up; }; -/** - * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container - * @flags: translation table flags (see batadv_tt_data_flags) - * @ttvn: translation table version number - * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by - * one batadv_tvlv_tt_vlan_data object per announced vlan - */ -struct batadv_tvlv_tt_data { - __u8 flags; - __u8 ttvn; - __be16 num_vlan; -}; - /** * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through * the tt tvlv container @@ -618,6 +606,21 @@ struct batadv_tvlv_tt_vlan_data { __u16 reserved; }; +/** + * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container + * @flags: translation table flags (see batadv_tt_data_flags) + * @ttvn: translation table version number + * @num_vlan: number of announced VLANs. 
In the TVLV this struct is followed by + * one batadv_tvlv_tt_vlan_data object per announced vlan + * @vlan_data: array of batadv_tvlv_tt_vlan_data objects + */ +struct batadv_tvlv_tt_data { + __u8 flags; + __u8 ttvn; + __be16 num_vlan; + struct batadv_tvlv_tt_vlan_data vlan_data[] __counted_by_be(num_vlan); +}; + /** * struct batadv_tvlv_tt_change - translation table diff data * @flags: status indicators concerning the non-mesh client (see diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 2243cec18ecc..6815d1262feb 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -856,8 +857,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, num_entries += atomic_read(&vlan->tt.num_entries); } - change_offset = sizeof(**tt_data); - change_offset += num_vlan * sizeof(*tt_vlan); + change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) @@ -876,7 +876,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn); (*tt_data)->num_vlan = htons(num_vlan); - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); + tt_vlan = (*tt_data)->vlan_data; hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); @@ -936,8 +936,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, total_entries += vlan_entries; } - change_offset = sizeof(**tt_data); - change_offset += num_vlan * sizeof(*tt_vlan); + change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) @@ -956,7 +955,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, (*tt_data)->ttvn = 
atomic_read(&bat_priv->tt.vn); (*tt_data)->num_vlan = htons(num_vlan); - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); + tt_vlan = (*tt_data)->vlan_data; hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) @@ -2916,7 +2915,6 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; - struct batadv_tvlv_tt_vlan_data *tt_vlan_req; struct batadv_hard_iface *primary_if; bool ret = false; int i, size; @@ -2932,7 +2930,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, if (!tt_req_node) goto out; - size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan; + size = struct_size(tvlv_tt_data, vlan_data, num_vlan); tvlv_tt_data = kzalloc(size, GFP_ATOMIC); if (!tvlv_tt_data) goto out; @@ -2944,12 +2942,10 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, /* send all the CRCs within the request. 
This is needed by intermediate * nodes to ensure they have the correct table before replying */ - tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1); for (i = 0; i < num_vlan; i++) { - tt_vlan_req->vid = tt_vlan->vid; - tt_vlan_req->crc = tt_vlan->crc; + tvlv_tt_data->vlan_data[i].vid = tt_vlan->vid; + tvlv_tt_data->vlan_data[i].crc = tt_vlan->crc; - tt_vlan_req++; tt_vlan++; } @@ -3001,7 +2997,6 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_orig_node *res_dst_orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; - struct batadv_tvlv_tt_vlan_data *tt_vlan; bool ret = false, full_table; u8 orig_ttvn, req_ttvn; u16 tvlv_len; @@ -3024,10 +3019,9 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); /* this node doesn't have the requested data */ if (orig_ttvn != req_ttvn || - !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan, + !batadv_tt_global_check_crc(req_dst_orig_node, tt_data->vlan_data, ntohs(tt_data->num_vlan))) goto out; @@ -3370,7 +3364,6 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; u8 *tvlv_ptr = (u8 *)tt_data; - u16 change_offset; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", @@ -3383,10 +3376,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->tt_lock); - change_offset = sizeof(struct batadv_tvlv_tt_vlan_data); - change_offset *= ntohs(tt_data->num_vlan); - change_offset += sizeof(*tt_data); - tvlv_ptr += change_offset; + tvlv_ptr += struct_size(tt_data, vlan_data, ntohs(tt_data->num_vlan)); tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr; if (tt_data->flags & 
BATADV_TT_FULL_TABLE) { @@ -3985,10 +3975,10 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, u8 flags, void *tvlv_value, u16 tvlv_value_len) { - struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; + size_t flex_size; if (tvlv_value_len < sizeof(*tt_data)) return; @@ -3998,17 +3988,18 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, num_vlan = ntohs(tt_data->num_vlan); - if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan) + flex_size = flex_array_size(tt_data, vlan_data, num_vlan); + if (tvlv_value_len < flex_size) return; - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); - tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan); - tvlv_value_len -= sizeof(*tt_vlan) * num_vlan; + tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data + + flex_size); + tvlv_value_len -= flex_size; num_entries = batadv_tt_entries(tvlv_value_len); - batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change, - num_entries, tt_data->ttvn); + batadv_tt_update_orig(bat_priv, orig, tt_data->vlan_data, num_vlan, + tt_change, num_entries, tt_data->ttvn); } /** @@ -4039,8 +4030,8 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); - tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data); - tt_vlan_len *= ntohs(tt_data->num_vlan); + tt_vlan_len = flex_array_size(tt_data, vlan_data, + ntohs(tt_data->num_vlan)); if (tvlv_value_len < tt_vlan_len) return NET_RX_SUCCESS; -- cgit v1.2.3 From 83134ef4609388f6b9ca31a384f531155196c2a7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Oct 2024 12:13:31 +0200 Subject: netkit: Add option for scrubbing skb meta data Jordan reported that when running Cilium with netkit in per-endpoint-routes mode, network policy misclassifies traffic. 
In this direct routing mode of Cilium which is used in case of GKE/EKS/AKS, the Pod's BPF program to enforce policy sits on the netkit primary device's egress side. The issue here is that in case of netkit's netkit_prep_forward(), it will clear meta data such as skb->mark and skb->priority before executing the BPF program. Thus, identity data stored in there from earlier BPF programs (e.g. from tcx ingress on the physical device) gets cleared instead of being made available for the primary's program to process. While for traffic egressing the Pod via the peer device this might be desired, this is different for the primary one where compared to tcx egress on the host veth this information would be available. To address this, add a new parameter for the device orchestration to allow control of skb->mark and skb->priority scrubbing, to make the two accessible from BPF (and eventually leave it up to the program to scrub). By default, the current behavior is retained. For netkit peer this also enables the use case where applications could cooperate/signal intent to the BPF program. Note that struct netkit has a 4 byte hole between policy and bundle which is used here, in other words, struct netkit's first cacheline content used in fast-path does not get moved around. 
Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Reported-by: Jordan Rife Signed-off-by: Daniel Borkmann Cc: Nikolay Aleksandrov Link: https://github.com/cilium/cilium/issues/34042 Acked-by: Jakub Kicinski Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20241004101335.117711-1-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- drivers/net/netkit.c | 68 +++++++++++++++++++++++++++++++++++--------- include/uapi/linux/if_link.h | 15 ++++++++++ 2 files changed, 70 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 059269557d92..fba2c734f0ec 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -20,6 +20,7 @@ struct netkit { struct net_device __rcu *peer; struct bpf_mprog_entry __rcu *active; enum netkit_action policy; + enum netkit_scrub scrub; struct bpf_mprog_bundle bundle; /* Needed in slow-path */ @@ -50,12 +51,24 @@ netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, return ret; } -static void netkit_prep_forward(struct sk_buff *skb, bool xnet) +static void netkit_xnet(struct sk_buff *skb) { - skb_scrub_packet(skb, xnet); skb->priority = 0; + skb->mark = 0; +} + +static void netkit_prep_forward(struct sk_buff *skb, + bool xnet, bool xnet_scrub) +{ + skb_scrub_packet(skb, false); nf_skip_egress(skb, true); skb_reset_mac_header(skb); + if (!xnet) + return; + ipvs_reset(skb); + skb_clear_tstamp(skb); + if (xnet_scrub) + netkit_xnet(skb); } static struct netkit *netkit_priv(const struct net_device *dev) @@ -80,7 +93,8 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) !pskb_may_pull(skb, ETH_HLEN) || skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; - netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)), + nk->scrub); eth_skb_pkt_type(skb, peer); skb->dev = peer; entry = rcu_dereference(nk->active); @@ -332,8 +346,10 @@ static 
int netkit_new_link(struct net *src_net, struct net_device *dev, struct netlink_ext_ack *extack) { struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr; - enum netkit_action default_prim = NETKIT_PASS; - enum netkit_action default_peer = NETKIT_PASS; + enum netkit_action policy_prim = NETKIT_PASS; + enum netkit_action policy_peer = NETKIT_PASS; + enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; + enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; enum netkit_mode mode = NETKIT_L3; unsigned char ifname_assign_type; struct ifinfomsg *ifmp = NULL; @@ -362,17 +378,21 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, return err; tbp = peer_tb; } + if (data[IFLA_NETKIT_SCRUB]) + scrub_prim = nla_get_u32(data[IFLA_NETKIT_SCRUB]); + if (data[IFLA_NETKIT_PEER_SCRUB]) + scrub_peer = nla_get_u32(data[IFLA_NETKIT_PEER_SCRUB]); if (data[IFLA_NETKIT_POLICY]) { attr = data[IFLA_NETKIT_POLICY]; - default_prim = nla_get_u32(attr); - err = netkit_check_policy(default_prim, attr, extack); + policy_prim = nla_get_u32(attr); + err = netkit_check_policy(policy_prim, attr, extack); if (err < 0) return err; } if (data[IFLA_NETKIT_PEER_POLICY]) { attr = data[IFLA_NETKIT_PEER_POLICY]; - default_peer = nla_get_u32(attr); - err = netkit_check_policy(default_peer, attr, extack); + policy_peer = nla_get_u32(attr); + err = netkit_check_policy(policy_peer, attr, extack); if (err < 0) return err; } @@ -409,7 +429,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, nk = netkit_priv(peer); nk->primary = false; - nk->policy = default_peer; + nk->policy = policy_peer; + nk->scrub = scrub_peer; nk->mode = mode; bpf_mprog_bundle_init(&nk->bundle); @@ -434,7 +455,8 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev, nk = netkit_priv(dev); nk->primary = true; - nk->policy = default_prim; + nk->policy = policy_prim; + nk->scrub = scrub_prim; nk->mode = mode; bpf_mprog_bundle_init(&nk->bundle); @@ -874,6 +896,18 @@ static int 
netkit_change_link(struct net_device *dev, struct nlattr *tb[], return -EACCES; } + if (data[IFLA_NETKIT_SCRUB]) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB], + "netkit scrubbing cannot be changed after device creation"); + return -EACCES; + } + + if (data[IFLA_NETKIT_PEER_SCRUB]) { + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB], + "netkit scrubbing cannot be changed after device creation"); + return -EACCES; + } + if (data[IFLA_NETKIT_PEER_INFO]) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO], "netkit peer info cannot be changed after device creation"); @@ -908,8 +942,10 @@ static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */ - nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_SCRUB */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */ nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ + nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ 0; } @@ -924,11 +960,15 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub)) + return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_PEER_SCRUB, nk->scrub)) + return -EMSGSIZE; } return 0; @@ -936,9 +976,11 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) }, - [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, [IFLA_NETKIT_MODE] = { .type = NLA_U32 }, + [IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, + [IFLA_NETKIT_SCRUB] = 
NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), + [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..2acc7687e017 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1292,6 +1292,19 @@ enum netkit_mode { NETKIT_L3, }; +/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to + * the BPF program if attached. This also means the latter can + * consume the two fields if they were populated earlier. + * + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before + * invoking the attached BPF program when the peer device resides + * in a different network namespace. This is the default behavior. + */ +enum netkit_scrub { + NETKIT_SCRUB_NONE, + NETKIT_SCRUB_DEFAULT, +}; + enum { IFLA_NETKIT_UNSPEC, IFLA_NETKIT_PEER_INFO, @@ -1299,6 +1312,8 @@ enum { IFLA_NETKIT_POLICY, IFLA_NETKIT_PEER_POLICY, IFLA_NETKIT_MODE, + IFLA_NETKIT_SCRUB, + IFLA_NETKIT_PEER_SCRUB, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3 From 20503272422693d793b84f88bf23fe4e955d3a33 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 6 Oct 2024 08:17:58 +0100 Subject: ptp: Add support for the AMZNC10C 'vmclock' device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vmclock device addresses the problem of live migration with precision clocks. The tolerances of a hardware counter (e.g. TSC) are typically around ±50PPM. A guest will use NTP/PTP/PPS to discipline that counter against an external source of 'real' time, and track the precise frequency of the counter as it changes with environmental conditions. When a guest is live migrated, anything it knows about the frequency of the underlying counter becomes invalid. 
It may move from a host where the counter is running at -50PPM of its nominal frequency, to a host where it runs at +50PPM. There will also be a step change in the value of the counter, as the correctness of its absolute value at migration is limited by the accuracy of the source and destination host's time synchronization. In its simplest form, the device merely advertises a 'disruption_marker' which indicates that the guest should throw away any NTP synchronization it thinks it has, and start again. Because the shared memory region can be exposed all the way to userspace through the /dev/vmclock0 node, applications can still use time from a fast vDSO 'system call', and check the disruption marker to be sure that their timestamp is indeed truthful. The structure also allows for the precise time, as known by the host, to be exposed directly to guests so that they don't have to wait for NTP to resync from scratch. The PTP driver consumes this information if present. Like the KVM PTP clock, this PTP driver can convert TSC-based cross timestamps into KVM clock values. Unlike the KVM PTP clock, it does so only when such is actually helpful. The values and fields are based on the nascent virtio-rtc specification, and the intent is that a version (hopefully precisely this version) of this structure will be included as an optional part of that spec. In the meantime, this driver supports the simple ACPI form of the device which is being shipped in certain commercial hypervisors (and submitted for inclusion in QEMU). Signed-off-by: David Woodhouse Acked-by: Richard Cochran Signed-off-by: David S. 
Miller --- MAINTAINERS | 7 + drivers/ptp/Kconfig | 13 + drivers/ptp/Makefile | 1 + drivers/ptp/ptp_vmclock.c | 615 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vmclock-abi.h | 182 ++++++++++++ 5 files changed, 818 insertions(+) create mode 100644 drivers/ptp/ptp_vmclock.c create mode 100644 include/uapi/linux/vmclock-abi.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index af635dc60cfe..1389704c7d8d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18683,6 +18683,13 @@ S: Maintained F: drivers/ptp/ptp_vclock.c F: net/ethtool/phc_vclocks.c +PTP VMCLOCK SUPPORT +M: David Woodhouse +L: netdev@vger.kernel.org +S: Maintained +F: drivers/ptp/ptp_vmclock.c +F: include/uapi/linux/vmclock-abi.h + PTRACE SUPPORT M: Oleg Nesterov S: Maintained diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 604541dcb320..e98c9767e0ef 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -131,6 +131,19 @@ config PTP_1588_CLOCK_KVM To compile this driver as a module, choose M here: the module will be called ptp_kvm. +config PTP_1588_CLOCK_VMCLOCK + tristate "Virtual machine PTP clock" + depends on X86_TSC || ARM_ARCH_TIMER + depends on PTP_1588_CLOCK && ACPI && ARCH_SUPPORTS_INT128 + default y + help + This driver adds support for using a virtual precision clock + advertised by the hypervisor. This clock is only useful in virtual + machines where such a device is present. + + To compile this driver as a module, choose M here: the module + will be called ptp_vmclock. 
+ config PTP_1588_CLOCK_IDT82P33 tristate "IDT 82P33xxx PTP clock" depends on PTP_1588_CLOCK && I2C diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 68bf02078053..01b5cd91eb61 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o +obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK) += ptp_vmclock.o obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp-qoriq.o ptp-qoriq-y += ptp_qoriq.o ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o diff --git a/drivers/ptp/ptp_vmclock.c b/drivers/ptp/ptp_vmclock.c new file mode 100644 index 000000000000..cdca8a3ad1aa --- /dev/null +++ b/drivers/ptp/ptp_vmclock.c @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtual PTP 1588 clock for use with LM-safe VMclock device. + * + * Copyright © 2024 Amazon.com, Inc. or its affiliates. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_X86 +#include +#include +#endif + +#ifdef CONFIG_KVM_GUEST +#define SUPPORT_KVMCLOCK +#endif + +static DEFINE_IDA(vmclock_ida); + +ACPI_MODULE_NAME("vmclock"); + +struct vmclock_state { + struct resource res; + struct vmclock_abi *clk; + struct miscdevice miscdev; + struct ptp_clock_info ptp_clock_info; + struct ptp_clock *ptp_clock; + enum clocksource_ids cs_id, sys_cs_id; + int index; + char *name; +}; + +#define VMCLOCK_MAX_WAIT ms_to_ktime(100) + +/* Require at least the flags field to be present. All else can be optional. 
*/ +#define VMCLOCK_MIN_SIZE offsetof(struct vmclock_abi, pad) + +#define VMCLOCK_FIELD_PRESENT(_c, _f) \ + (le32_to_cpu((_c)->size) >= (offsetof(struct vmclock_abi, _f) + \ + sizeof((_c)->_f))) + +/* + * Multiply a 64-bit count by a 64-bit tick 'period' in units of seconds >> 64 + * and add the fractional second part of the reference time. + * + * The result is a 128-bit value, the top 64 bits of which are seconds, and + * the low 64 bits are (seconds >> 64). + */ +static uint64_t mul_u64_u64_shr_add_u64(uint64_t *res_hi, uint64_t delta, + uint64_t period, uint8_t shift, + uint64_t frac_sec) +{ + unsigned __int128 res = (unsigned __int128)delta * period; + + res >>= shift; + res += frac_sec; + *res_hi = res >> 64; + return (uint64_t)res; +} + +static bool tai_adjust(struct vmclock_abi *clk, uint64_t *sec) +{ + if (likely(clk->time_type == VMCLOCK_TIME_UTC)) + return true; + + if (clk->time_type == VMCLOCK_TIME_TAI && + (le64_to_cpu(clk->flags) & VMCLOCK_FLAG_TAI_OFFSET_VALID)) { + if (sec) + *sec += (int16_t)le16_to_cpu(clk->tai_offset_sec); + return true; + } + return false; +} + +static int vmclock_get_crosststamp(struct vmclock_state *st, + struct ptp_system_timestamp *sts, + struct system_counterval_t *system_counter, + struct timespec64 *tspec) +{ + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + struct system_time_snapshot systime_snapshot; + uint64_t cycle, delta, seq, frac_sec; + +#ifdef CONFIG_X86 + /* + * We'd expect the hypervisor to know this and to report the clock + * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid. + */ + if (check_tsc_unstable()) + return -EINVAL; +#endif + + while (1) { + seq = le32_to_cpu(st->clk->seq_count) & ~1ULL; + + /* + * This pairs with a write barrier in the hypervisor + * which populates this structure. + */ + virt_rmb(); + + if (st->clk->clock_status == VMCLOCK_STATUS_UNRELIABLE) + return -EINVAL; + + /* + * When invoked for gettimex64(), fill in the pre/post system + * times. 
The simple case is when system time is based on the + * same counter as st->cs_id, in which case all three times + * will be derived from the *same* counter value. + * + * If the system isn't using the same counter, then the value + * from ktime_get_snapshot() will still be used as pre_ts, and + * ptp_read_system_postts() is called to populate postts after + * calling get_cycles(). + * + * The conversion to timespec64 happens further down, outside + * the seq_count loop. + */ + if (sts) { + ktime_get_snapshot(&systime_snapshot); + if (systime_snapshot.cs_id == st->cs_id) { + cycle = systime_snapshot.cycles; + } else { + cycle = get_cycles(); + ptp_read_system_postts(sts); + } + } else { + cycle = get_cycles(); + } + + delta = cycle - le64_to_cpu(st->clk->counter_value); + + frac_sec = mul_u64_u64_shr_add_u64(&tspec->tv_sec, delta, + le64_to_cpu(st->clk->counter_period_frac_sec), + st->clk->counter_period_shift, + le64_to_cpu(st->clk->time_frac_sec)); + tspec->tv_nsec = mul_u64_u64_shr(frac_sec, NSEC_PER_SEC, 64); + tspec->tv_sec += le64_to_cpu(st->clk->time_sec); + + if (!tai_adjust(st->clk, &tspec->tv_sec)) + return -EINVAL; + + /* + * This pairs with a write barrier in the hypervisor + * which populates this structure. + */ + virt_rmb(); + if (seq == le32_to_cpu(st->clk->seq_count)) + break; + + if (ktime_after(ktime_get(), deadline)) + return -ETIMEDOUT; + } + + if (system_counter) { + system_counter->cycles = cycle; + system_counter->cs_id = st->cs_id; + } + + if (sts) { + sts->pre_ts = ktime_to_timespec64(systime_snapshot.real); + if (systime_snapshot.cs_id == st->cs_id) + sts->post_ts = sts->pre_ts; + } + + return 0; +} + +#ifdef SUPPORT_KVMCLOCK +/* + * In the case where the system is using the KVM clock for timekeeping, convert + * the TSC value into a KVM clock time in order to return a paired reading that + * get_device_system_crosststamp() can cope with. 
+ */ +static int vmclock_get_crosststamp_kvmclock(struct vmclock_state *st, + struct ptp_system_timestamp *sts, + struct system_counterval_t *system_counter, + struct timespec64 *tspec) +{ + struct pvclock_vcpu_time_info *pvti = this_cpu_pvti(); + unsigned int pvti_ver; + int ret; + + preempt_disable_notrace(); + + do { + pvti_ver = pvclock_read_begin(pvti); + + ret = vmclock_get_crosststamp(st, sts, system_counter, tspec); + if (ret) + break; + + system_counter->cycles = __pvclock_read_cycles(pvti, + system_counter->cycles); + system_counter->cs_id = CSID_X86_KVM_CLK; + + /* + * This retry should never really happen; if the TSC is + * stable and reliable enough across vCPUS that it is sane + * for the hypervisor to expose a VMCLOCK device which uses + * it as the reference counter, then the KVM clock should be + * in 'master clock mode' and basically never changed. But + * the KVM clock is a fickle and often broken thing, so do + * it "properly" just in case. + */ + } while (pvclock_read_retry(pvti, pvti_ver)); + + preempt_enable_notrace(); + + return ret; +} +#endif + +static int ptp_vmclock_get_time_fn(ktime_t *device_time, + struct system_counterval_t *system_counter, + void *ctx) +{ + struct vmclock_state *st = ctx; + struct timespec64 tspec; + int ret; + +#ifdef SUPPORT_KVMCLOCK + if (READ_ONCE(st->sys_cs_id) == CSID_X86_KVM_CLK) + ret = vmclock_get_crosststamp_kvmclock(st, NULL, system_counter, + &tspec); + else +#endif + ret = vmclock_get_crosststamp(st, NULL, system_counter, &tspec); + + if (!ret) + *device_time = timespec64_to_ktime(tspec); + + return ret; +} + +static int ptp_vmclock_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + struct vmclock_state *st = container_of(ptp, struct vmclock_state, + ptp_clock_info); + int ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, st, + NULL, xtstamp); +#ifdef SUPPORT_KVMCLOCK + /* + * On x86, the KVM clock may be used for the system time. 
We can + * actually convert a TSC reading to that, and return a paired + * timestamp that get_device_system_crosststamp() *can* handle. + */ + if (ret == -ENODEV) { + struct system_time_snapshot systime_snapshot; + + ktime_get_snapshot(&systime_snapshot); + + if (systime_snapshot.cs_id == CSID_X86_TSC || + systime_snapshot.cs_id == CSID_X86_KVM_CLK) { + WRITE_ONCE(st->sys_cs_id, systime_snapshot.cs_id); + ret = get_device_system_crosststamp(ptp_vmclock_get_time_fn, + st, NULL, xtstamp); + } + } +#endif + return ret; +} + +/* + * PTP clock operations + */ + +static int ptp_vmclock_adjfine(struct ptp_clock_info *ptp, long delta) +{ + return -EOPNOTSUPP; +} + +static int ptp_vmclock_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + return -EOPNOTSUPP; +} + +static int ptp_vmclock_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + return -EOPNOTSUPP; +} + +static int ptp_vmclock_gettimex(struct ptp_clock_info *ptp, struct timespec64 *ts, + struct ptp_system_timestamp *sts) +{ + struct vmclock_state *st = container_of(ptp, struct vmclock_state, + ptp_clock_info); + + return vmclock_get_crosststamp(st, sts, NULL, ts); +} + +static int ptp_vmclock_enable(struct ptp_clock_info *ptp, + struct ptp_clock_request *rq, int on) +{ + return -EOPNOTSUPP; +} + +static const struct ptp_clock_info ptp_vmclock_info = { + .owner = THIS_MODULE, + .max_adj = 0, + .n_ext_ts = 0, + .n_pins = 0, + .pps = 0, + .adjfine = ptp_vmclock_adjfine, + .adjtime = ptp_vmclock_adjtime, + .gettimex64 = ptp_vmclock_gettimex, + .settime64 = ptp_vmclock_settime, + .enable = ptp_vmclock_enable, + .getcrosststamp = ptp_vmclock_getcrosststamp, +}; + +static struct ptp_clock *vmclock_ptp_register(struct device *dev, + struct vmclock_state *st) +{ + enum clocksource_ids cs_id; + + if (IS_ENABLED(CONFIG_ARM64) && + st->clk->counter_id == VMCLOCK_COUNTER_ARM_VCNT) { + /* Can we check it's the virtual counter? 
*/ + cs_id = CSID_ARM_ARCH_COUNTER; + } else if (IS_ENABLED(CONFIG_X86) && + st->clk->counter_id == VMCLOCK_COUNTER_X86_TSC) { + cs_id = CSID_X86_TSC; + } else { + return NULL; + } + + /* Only UTC, or TAI with offset */ + if (!tai_adjust(st->clk, NULL)) { + dev_info(dev, "vmclock does not provide unambiguous UTC\n"); + return NULL; + } + + st->sys_cs_id = cs_id; + st->cs_id = cs_id; + st->ptp_clock_info = ptp_vmclock_info; + strscpy(st->ptp_clock_info.name, st->name); + + return ptp_clock_register(&st->ptp_clock_info, dev); +} + +static int vmclock_miscdev_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct vmclock_state *st = container_of(fp->private_data, + struct vmclock_state, miscdev); + + if ((vma->vm_flags & (VM_READ|VM_WRITE)) != VM_READ) + return -EROFS; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE || vma->vm_pgoff) + return -EINVAL; + + if (io_remap_pfn_range(vma, vma->vm_start, + st->res.start >> PAGE_SHIFT, PAGE_SIZE, + vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static ssize_t vmclock_miscdev_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vmclock_state *st = container_of(fp->private_data, + struct vmclock_state, miscdev); + ktime_t deadline = ktime_add(ktime_get(), VMCLOCK_MAX_WAIT); + size_t max_count; + uint32_t seq; + + if (*ppos >= PAGE_SIZE) + return 0; + + max_count = PAGE_SIZE - *ppos; + if (count > max_count) + count = max_count; + + while (1) { + seq = le32_to_cpu(st->clk->seq_count) & ~1U; + /* Pairs with hypervisor wmb */ + virt_rmb(); + + if (copy_to_user(buf, ((char *)st->clk) + *ppos, count)) + return -EFAULT; + + /* Pairs with hypervisor wmb */ + virt_rmb(); + if (seq == le32_to_cpu(st->clk->seq_count)) + break; + + if (ktime_after(ktime_get(), deadline)) + return -ETIMEDOUT; + } + + *ppos += count; + return count; +} + +static const struct file_operations vmclock_miscdev_fops = { + .mmap = vmclock_miscdev_mmap, + .read = vmclock_miscdev_read, +}; + +/* module operations */ + 
+static void vmclock_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct vmclock_state *st = dev_get_drvdata(dev); + + if (st->ptp_clock) + ptp_clock_unregister(st->ptp_clock); + + if (st->miscdev.minor != MISC_DYNAMIC_MINOR) + misc_deregister(&st->miscdev); +} + +static acpi_status vmclock_acpi_resources(struct acpi_resource *ares, void *data) +{ + struct vmclock_state *st = data; + struct resource_win win; + struct resource *res = &win.res; + + if (ares->type == ACPI_RESOURCE_TYPE_END_TAG) + return AE_OK; + + /* There can be only one */ + if (resource_type(&st->res) == IORESOURCE_MEM) + return AE_ERROR; + + if (acpi_dev_resource_memory(ares, res) || + acpi_dev_resource_address_space(ares, &win)) { + + if (resource_type(res) != IORESOURCE_MEM || + resource_size(res) < sizeof(st->clk)) + return AE_ERROR; + + st->res = *res; + return AE_OK; + } + + return AE_ERROR; +} + +static int vmclock_probe_acpi(struct device *dev, struct vmclock_state *st) +{ + struct acpi_device *adev = ACPI_COMPANION(dev); + acpi_status status; + + /* + * This should never happen as this function is only called when + * has_acpi_companion(dev) is true, but the logic is sufficiently + * complex that Coverity can't see the tautology. 
+ */ + if (!adev) + return -ENODEV; + + status = acpi_walk_resources(adev->handle, METHOD_NAME__CRS, + vmclock_acpi_resources, st); + if (ACPI_FAILURE(status) || resource_type(&st->res) != IORESOURCE_MEM) { + dev_err(dev, "failed to get resources\n"); + return -ENODEV; + } + + return 0; +} + +static void vmclock_put_idx(void *data) +{ + struct vmclock_state *st = data; + + ida_free(&vmclock_ida, st->index); +} + +static int vmclock_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct vmclock_state *st; + int ret; + + st = devm_kzalloc(dev, sizeof(*st), GFP_KERNEL); + if (!st) + return -ENOMEM; + + if (has_acpi_companion(dev)) + ret = vmclock_probe_acpi(dev, st); + else + ret = -EINVAL; /* Only ACPI for now */ + + if (ret) { + dev_info(dev, "Failed to obtain physical address: %d\n", ret); + goto out; + } + + if (resource_size(&st->res) < VMCLOCK_MIN_SIZE) { + dev_info(dev, "Region too small (0x%llx)\n", + resource_size(&st->res)); + ret = -EINVAL; + goto out; + } + st->clk = devm_memremap(dev, st->res.start, resource_size(&st->res), + MEMREMAP_WB | MEMREMAP_DEC); + if (IS_ERR(st->clk)) { + ret = PTR_ERR(st->clk); + dev_info(dev, "failed to map shared memory\n"); + st->clk = NULL; + goto out; + } + + if (le32_to_cpu(st->clk->magic) != VMCLOCK_MAGIC || + le32_to_cpu(st->clk->size) > resource_size(&st->res) || + le16_to_cpu(st->clk->version) != 1) { + dev_info(dev, "vmclock magic fields invalid\n"); + ret = -EINVAL; + goto out; + } + + ret = ida_alloc(&vmclock_ida, GFP_KERNEL); + if (ret < 0) + goto out; + + st->index = ret; + ret = devm_add_action_or_reset(&pdev->dev, vmclock_put_idx, st); + if (ret) + goto out; + + st->name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "vmclock%d", st->index); + if (!st->name) { + ret = -ENOMEM; + goto out; + } + + /* + * If the structure is big enough, it can be mapped to userspace. 
+ * Theoretically a guest OS even using larger pages could still + * use 4KiB PTEs to map smaller MMIO regions like this, but let's + * cross that bridge if/when we come to it. + */ + if (le32_to_cpu(st->clk->size) >= PAGE_SIZE) { + st->miscdev.minor = MISC_DYNAMIC_MINOR; + st->miscdev.fops = &vmclock_miscdev_fops; + st->miscdev.name = st->name; + + ret = misc_register(&st->miscdev); + if (ret) + goto out; + } + + /* If there is valid clock information, register a PTP clock */ + if (VMCLOCK_FIELD_PRESENT(st->clk, time_frac_sec)) { + /* Can return a silent NULL, or an error. */ + st->ptp_clock = vmclock_ptp_register(dev, st); + if (IS_ERR(st->ptp_clock)) { + ret = PTR_ERR(st->ptp_clock); + st->ptp_clock = NULL; + vmclock_remove(pdev); + goto out; + } + } + + if (!st->miscdev.minor && !st->ptp_clock) { + /* Neither miscdev nor PTP registered */ + dev_info(dev, "vmclock: Neither miscdev nor PTP available; not registering\n"); + ret = -ENODEV; + goto out; + } + + dev_info(dev, "%s: registered %s%s%s\n", st->name, + st->miscdev.minor ? "miscdev" : "", + (st->miscdev.minor && st->ptp_clock) ? ", " : "", + st->ptp_clock ? 
"PTP" : ""); + + dev_set_drvdata(dev, st); + + out: + return ret; +} + +static const struct acpi_device_id vmclock_acpi_ids[] = { + { "AMZNC10C", 0 }, + {} +}; +MODULE_DEVICE_TABLE(acpi, vmclock_acpi_ids); + +static struct platform_driver vmclock_platform_driver = { + .probe = vmclock_probe, + .remove_new = vmclock_remove, + .driver = { + .name = "vmclock", + .acpi_match_table = vmclock_acpi_ids, + }, +}; + +module_platform_driver(vmclock_platform_driver) + +MODULE_AUTHOR("David Woodhouse "); +MODULE_DESCRIPTION("PTP clock using VMCLOCK"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/vmclock-abi.h b/include/uapi/linux/vmclock-abi.h new file mode 100644 index 000000000000..2d99b29ac44a --- /dev/null +++ b/include/uapi/linux/vmclock-abi.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ + +/* + * This structure provides a vDSO-style clock to VM guests, exposing the + * relationship (or lack thereof) between the CPU clock (TSC, timebase, arch + * counter, etc.) and real time. It is designed to address the problem of + * live migration, which other clock enlightenments do not. + * + * When a guest is live migrated, this affects the clock in two ways. + * + * First, even between identical hosts the actual frequency of the underlying + * counter will change within the tolerances of its specification (typically + * ±50PPM, or 4 seconds a day). This frequency also varies over time on the + * same host, but can be tracked by NTP as it generally varies slowly. With + * live migration there is a step change in the frequency, with no warning. + * + * Second, there may be a step change in the value of the counter itself, as + * its accuracy is limited by the precision of the NTP synchronization on the + * source and destination hosts. + * + * So any calibration (NTP, PTP, etc.) which the guest has done on the source + * host before migration is invalid, and needs to be redone on the new host. 
+ * + * In its most basic mode, this structure provides only an indication to the + * guest that live migration has occurred. This allows the guest to know that + * its clock is invalid and take remedial action. For applications that need + * reliable accurate timestamps (e.g. distributed databases), the structure + * can be mapped all the way to userspace. This allows the application to see + * directly for itself that the clock is disrupted and take appropriate + * action, even when using a vDSO-style method to get the time instead of a + * system call. + * + * In its more advanced mode, this structure can also be used to expose the + * precise relationship of the CPU counter to real time, as calibrated by the + * host. This means that userspace applications can have accurate time + * immediately after live migration, rather than having to pause operations + * and wait for NTP to recover. This mode does, of course, rely on the + * counter being reliable and consistent across CPUs. + * + * Note that this must be true UTC, never with smeared leap seconds. If a + * guest wishes to construct a smeared clock, it can do so. Presenting a + * smeared clock through this interface would be problematic because it + * actually messes with the apparent counter *period*. A linear smearing + * of 1 ms per second would effectively tweak the counter period by 1000PPM + * at the start/end of the smearing period, while a sinusoidal smear would + * basically be impossible to represent. + * + * This structure is offered with the intent that it be adopted into the + * nascent virtio-rtc standard, as a virtio-rtc that does not address the live + * migration problem seems a little less than fit for purpose. For that + * reason, certain fields use precisely the same numeric definitions as in + * the virtio-rtc proposal. 
The structure can also be exposed through an ACPI + * device with the CID "VMCLOCK", modelled on the "VMGENID" device except for + * the fact that it uses a real _CRS to convey the address of the structure + * (which should be a full page, to allow for mapping directly to userspace). + */ + +#ifndef __VMCLOCK_ABI_H__ +#define __VMCLOCK_ABI_H__ + +#include + +struct vmclock_abi { + /* CONSTANT FIELDS */ + __le32 magic; +#define VMCLOCK_MAGIC 0x4b4c4356 /* "VCLK" */ + __le32 size; /* Size of region containing this structure */ + __le16 version; /* 1 */ + __u8 counter_id; /* Matches VIRTIO_RTC_COUNTER_xxx except INVALID */ +#define VMCLOCK_COUNTER_ARM_VCNT 0 +#define VMCLOCK_COUNTER_X86_TSC 1 +#define VMCLOCK_COUNTER_INVALID 0xff + __u8 time_type; /* Matches VIRTIO_RTC_TYPE_xxx */ +#define VMCLOCK_TIME_UTC 0 /* Since 1970-01-01 00:00:00z */ +#define VMCLOCK_TIME_TAI 1 /* Since 1970-01-01 00:00:00z */ +#define VMCLOCK_TIME_MONOTONIC 2 /* Since undefined epoch */ +#define VMCLOCK_TIME_INVALID_SMEARED 3 /* Not supported */ +#define VMCLOCK_TIME_INVALID_MAYBE_SMEARED 4 /* Not supported */ + + /* NON-CONSTANT FIELDS PROTECTED BY SEQCOUNT LOCK */ + __le32 seq_count; /* Low bit means an update is in progress */ + /* + * This field changes to another non-repeating value when the CPU + * counter is disrupted, for example on live migration. This lets + * the guest know that it should discard any calibration it has + * performed of the counter against external sources (NTP/PTP/etc.). + */ + __le64 disruption_marker; + __le64 flags; + /* Indicates that the tai_offset_sec field is valid */ +#define VMCLOCK_FLAG_TAI_OFFSET_VALID (1 << 0) + /* + * Optionally used to notify guests of pending maintenance events. + * A guest which provides latency-sensitive services may wish to + * remove itself from service if an event is coming up. Two flags + * indicate the approximate imminence of the event. 
+ */ +#define VMCLOCK_FLAG_DISRUPTION_SOON (1 << 1) /* About a day */ +#define VMCLOCK_FLAG_DISRUPTION_IMMINENT (1 << 2) /* About an hour */ +#define VMCLOCK_FLAG_PERIOD_ESTERROR_VALID (1 << 3) +#define VMCLOCK_FLAG_PERIOD_MAXERROR_VALID (1 << 4) +#define VMCLOCK_FLAG_TIME_ESTERROR_VALID (1 << 5) +#define VMCLOCK_FLAG_TIME_MAXERROR_VALID (1 << 6) + /* + * If the MONOTONIC flag is set then (other than leap seconds) it is + * guaranteed that the time calculated according to this structure at + * any given moment shall never appear to be later than the time + * calculated via the structure at any *later* moment. + * + * In particular, a timestamp based on a counter reading taken + * immediately after setting the low bit of seq_count (and the + * associated memory barrier), using the previously-valid time and + * period fields, shall never be later than a timestamp based on + * a counter reading taken immediately before *clearing* the low + * bit again after the update, using the about-to-be-valid fields. + */ +#define VMCLOCK_FLAG_TIME_MONOTONIC (1 << 7) + + __u8 pad[2]; + __u8 clock_status; +#define VMCLOCK_STATUS_UNKNOWN 0 +#define VMCLOCK_STATUS_INITIALIZING 1 +#define VMCLOCK_STATUS_SYNCHRONIZED 2 +#define VMCLOCK_STATUS_FREERUNNING 3 +#define VMCLOCK_STATUS_UNRELIABLE 4 + + /* + * The time exposed through this device is never smeared. This field + * corresponds to the 'subtype' field in virtio-rtc, which indicates + * the smearing method. However in this case it provides a *hint* to + * the guest operating system, such that *if* the guest OS wants to + * provide its users with an alternative clock which does not follow + * UTC, it may do so in a fashion consistent with the other systems + * in the nearby environment. 
+ */ + __u8 leap_second_smearing_hint; /* Matches VIRTIO_RTC_SUBTYPE_xxx */ +#define VMCLOCK_SMEARING_STRICT 0 +#define VMCLOCK_SMEARING_NOON_LINEAR 1 +#define VMCLOCK_SMEARING_UTC_SLS 2 + __le16 tai_offset_sec; /* Actually two's complement signed */ + __u8 leap_indicator; + /* + * This field is based on the VIRTIO_RTC_LEAP_xxx values as defined + * in the current draft of virtio-rtc, but since smearing cannot be + * used with the shared memory device, some values are not used. + * + * The _POST_POS and _POST_NEG values allow the guest to perform + * its own smearing during the day or so after a leap second when + * such smearing may need to continue being applied for a leap + * second which is now theoretically "historical". + */ +#define VMCLOCK_LEAP_NONE 0x00 /* No known nearby leap second */ +#define VMCLOCK_LEAP_PRE_POS 0x01 /* Positive leap second at EOM */ +#define VMCLOCK_LEAP_PRE_NEG 0x02 /* Negative leap second at EOM */ +#define VMCLOCK_LEAP_POS 0x03 /* Set during 23:59:60 second */ +#define VMCLOCK_LEAP_POST_POS 0x04 +#define VMCLOCK_LEAP_POST_NEG 0x05 + + /* Bit shift for counter_period_frac_sec and its error rate */ + __u8 counter_period_shift; + /* + * Paired values of counter and UTC at a given point in time. + */ + __le64 counter_value; + /* + * Counter period, and error margin of same. The unit of these + * fields is 1/2^(64 + counter_period_shift) of a second. + */ + __le64 counter_period_frac_sec; + __le64 counter_period_esterror_rate_frac_sec; + __le64 counter_period_maxerror_rate_frac_sec; + + /* + * Time according to time_type field above. 
+ */ + __le64 time_sec; /* Seconds since time_type epoch */ + __le64 time_frac_sec; /* Units of 1/2^64 of a second */ + __le64 time_esterror_nanosec; + __le64 time_maxerror_nanosec; +}; + +#endif /* __VMCLOCK_ABI_H__ */ -- cgit v1.2.3 From 80c549cd1ab0241a7af262690a0ff9991fc74ec5 Mon Sep 17 00:00:00 2001 From: Alexander Zubkov Date: Tue, 8 Oct 2024 18:27:57 +0200 Subject: Fix misspelling of "accept*" in net Several files have "accept*" misspelled as "accpet*" in the comments. Fix all such occurrences. Signed-off-by: Alexander Zubkov Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241008162756.22618-2-green@qrator.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c | 4 ++-- drivers/net/ethernet/natsemi/ns83820.c | 2 +- include/uapi/linux/udp.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c index 455a54708be4..96fd31d75dfd 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_main.c @@ -342,8 +342,8 @@ static struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, { struct sk_buff *skb; - /* Allocate space for cpl_pass_accpet_req which will be synthesized by - * driver. Once driver synthesizes cpl_pass_accpet_req the skb will go + /* Allocate space for cpl_pass_accept_req which will be synthesized by + * driver. Once driver synthesizes cpl_pass_accept_req the skb will go * through the regular cpl_pass_accept_req processing in TOM. 
*/ skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 998586872599..bea969dfa536 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -2090,7 +2090,7 @@ static int ns83820_init_one(struct pci_dev *pci_dev, */ /* Ramit : 1024 DMA is not a good idea, it ends up banging * some DELL and COMPAQ SMP systems - * Turn on ALP, only we are accpeting Jumbo Packets */ + * Turn on ALP, only we are accepting Jumbo Packets */ writel(RXCFG_AEP | RXCFG_ARP | RXCFG_AIRL | RXCFG_RX_FD | RXCFG_STRIPCRC //| RXCFG_ALP diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index 1a0fe8b151fb..d85d671deed3 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -31,7 +31,7 @@ struct udphdr { #define UDP_CORK 1 /* Never send partially complete segments */ #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ -#define UDP_NO_CHECK6_RX 102 /* Disable accpeting checksum for UDP6 */ +#define UDP_NO_CHECK6_RX 102 /* Disable accepting checksum for UDP6 */ #define UDP_SEGMENT 103 /* Set GSO segmentation size */ #define UDP_GRO 104 /* This socket can receive UDP GRO packets */ -- cgit v1.2.3 From 04e65df94b3112a1b319b6deb5bab83fd740bc7d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:48 +0200 Subject: netlink: spec: add shaper YAML spec Define the user-space visible interface to query, configure and delete network shapers via yaml definition. Add dummy implementations for the relevant NL callbacks. set() and delete() operations touch a single shaper creating/updating or deleting it. The group() operation creates a shaper's group, nesting multiple input shapers under the specified output shaper. 
Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/7a33a1ff370bdbcd0cd3f909575c912cd56f41da.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/net_shaper.yaml | 274 ++++++++++++++++++++++++++++ MAINTAINERS | 1 + include/uapi/linux/net_shaper.h | 78 ++++++++ net/Kconfig | 3 + net/Makefile | 1 + net/shaper/Makefile | 8 + net/shaper/shaper.c | 55 ++++++ net/shaper/shaper_nl_gen.c | 125 +++++++++++++ net/shaper/shaper_nl_gen.h | 34 ++++ 9 files changed, 579 insertions(+) create mode 100644 Documentation/netlink/specs/net_shaper.yaml create mode 100644 include/uapi/linux/net_shaper.h create mode 100644 net/shaper/Makefile create mode 100644 net/shaper/shaper.c create mode 100644 net/shaper/shaper_nl_gen.c create mode 100644 net/shaper/shaper_nl_gen.h (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml new file mode 100644 index 000000000000..618fc09932ff --- /dev/null +++ b/Documentation/netlink/specs/net_shaper.yaml @@ -0,0 +1,274 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +name: net-shaper + +doc: | + Networking HW rate limiting configuration. + + This API allows configuring HW shapers available on the network + devices at different levels (queues, network device) and allows + arbitrary manipulation of the scheduling tree of the involved + shapers. + + Each @shaper is identified within the given device, by a @handle, + comprising both a @scope and an @id. + + Depending on the @scope value, the shapers are attached to specific + HW objects (queues, devices) or, for @node scope, represent a + scheduling group, that can be placed in an arbitrary location of + the scheduling tree. 
+ + Shapers can be created with two different operations: the @set + operation, to create and update a single "attached" shaper, and + the @group operation, to create and update a scheduling + group. Only the @group operation can create @node scope shapers. + + Existing shapers can be deleted/reset via the @delete operation. + + The user can query the running configuration via the @get operation. + +definitions: + - + type: enum + name: scope + doc: Defines the shaper @id interpretation. + render-max: true + entries: + - name: unspec + doc: The scope is not specified. + - + name: netdev + doc: The main shaper for the given network device. + - + name: queue + doc: | + The shaper is attached to the given device queue, + the @id represents the queue number. + - + name: node + doc: | + The shaper allows grouping of queues or other + node shapers; can be nested in either @netdev + shapers or other @node shapers, allowing placement + in any location of the scheduling tree, except + leaves and root. + - + type: enum + name: metric + doc: Different metric supported by the shaper. + entries: + - + name: bps + doc: Shaper operates on a bits per second basis. + - + name: pps + doc: Shaper operates on a packets per second basis. + +attribute-sets: + - + name: net-shaper + attributes: + - + name: handle + type: nest + nested-attributes: handle + doc: Unique identifier for the given shaper inside the owning device. + - + name: metric + type: u32 + enum: metric + doc: Metric used by the given shaper for bw-min, bw-max and burst. + - + name: bw-min + type: uint + doc: Guaranteed bandwidth for the given shaper. + - + name: bw-max + type: uint + doc: Maximum bandwidth for the given shaper or 0 when unlimited. + - + name: burst + type: uint + doc: | + Maximum burst-size for shaping. Should not be interpreted + as a quantum. + - + name: priority + type: u32 + doc: | + Scheduling priority for the given shaper. The priority + scheduling is applied to sibling shapers. 
+ - + name: weight + type: u32 + doc: | + Relative weight for round robin scheduling of the + given shaper. + The scheduling is applied to all sibling shapers + with the same priority. + - + name: ifindex + type: u32 + doc: Interface index owning the specified shaper. + - + name: parent + type: nest + nested-attributes: handle + doc: | + Identifier for the parent of the affected shaper. + Only needed for @group operation. + - + name: leaves + type: nest + multi-attr: true + nested-attributes: leaf-info + doc: | + Describes a set of leaves shapers for a @group operation. + - + name: handle + attributes: + - + name: scope + type: u32 + enum: scope + doc: Defines the shaper @id interpretation. + - + name: id + type: u32 + doc: | + Numeric identifier of a shaper. The id semantic depends on + the scope. For @queue scope it's the queue id and for @node + scope it's the node identifier. + - + name: leaf-info + subset-of: net-shaper + attributes: + - + name: handle + - + name: priority + - + name: weight + +operations: + list: + - + name: get + doc: | + Get information about a shaper for a given device. + attribute-set: net-shaper + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: &ns-binding + - ifindex + - handle + reply: + attributes: &ns-attrs + - ifindex + - parent + - handle + - metric + - bw-min + - bw-max + - burst + - priority + - weight + + dump: + pre: net-shaper-nl-pre-dumpit + post: net-shaper-nl-post-dumpit + request: + attributes: + - ifindex + reply: + attributes: *ns-attrs + - + name: set + doc: | + Create or update the specified shaper. + The set operation can't be used to create a @node scope shaper, + use the @group operation instead. 
+ attribute-set: net-shaper + flags: [ admin-perm ] + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: + - ifindex + - handle + - metric + - bw-min + - bw-max + - burst + - priority + - weight + + - + name: delete + doc: | + Clear (remove) the specified shaper. When deleting + a @node shaper, reattach all the node's leaves to the + deleted node's parent. + If, after the removal, the parent shaper has no more + leaves and the parent shaper scope is @node, the parent + node is deleted, recursively. + When deleting a @queue shaper or a @netdev shaper, + the shaper disappears from the hierarchy, but the + queue/device can still send traffic: it has an implicit + node with infinite bandwidth. The queue's implicit node + feeds an implicit RR node at the root of the hierarchy. + attribute-set: net-shaper + flags: [ admin-perm ] + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: *ns-binding + + - + name: group + doc: | + Create or update a scheduling group, attaching the specified + @leaves shapers under the specified node identified by @handle. + The @leaves shapers scope must be @queue and the node shaper + scope must be either @node or @netdev. + When the node shaper has @node scope, if the @handle @id is not + specified, a new shaper of such scope is created, otherwise the + specified node must already exist. + When updating an existing node shaper, the specified @leaves are + added to the existing node; such node will also retain any preexisting + leaves. + The @parent handle for a new node shaper defaults to the parent + of all the leaves, provided all the leaves share the same parent. + Otherwise @parent handle must be specified. + The user can optionally provide shaping attributes for the node + shaper.
+ The operation is atomic, on failure no change is applied to + the device shaping configuration, otherwise the @node shaper + full identifier, comprising @binding and @handle, is provided + as the reply. + attribute-set: net-shaper + flags: [ admin-perm ] + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: + - ifindex + - parent + - handle + - metric + - bw-min + - bw-max + - burst + - priority + - weight + - leaves + reply: + attributes: *ns-binding diff --git a/MAINTAINERS b/MAINTAINERS index 1389704c7d8d..2927b44dda25 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16116,6 +16116,7 @@ F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h F: include/uapi/linux/ethtool_netlink.h F: include/uapi/linux/if_* +F: include/uapi/linux/net_shaper.h F: include/uapi/linux/netdev* F: tools/testing/selftests/drivers/net/ X: Documentation/devicetree/bindings/net/bluetooth/ diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h new file mode 100644 index 000000000000..9e3fa63618ee --- /dev/null +++ b/include/uapi/linux/net_shaper.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/net_shaper.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_NET_SHAPER_H +#define _UAPI_LINUX_NET_SHAPER_H + +#define NET_SHAPER_FAMILY_NAME "net-shaper" +#define NET_SHAPER_FAMILY_VERSION 1 + +/** + * enum net_shaper_scope - Defines the shaper @id interpretation. + * @NET_SHAPER_SCOPE_UNSPEC: The scope is not specified. + * @NET_SHAPER_SCOPE_NETDEV: The main shaper for the given network device. + * @NET_SHAPER_SCOPE_QUEUE: The shaper is attached to the given device queue, + * the @id represents the queue number. 
+ * @NET_SHAPER_SCOPE_NODE: The shaper allows grouping of queues or other node + * shapers; can be nested in either @netdev shapers or other @node shapers, + * allowing placement in any location of the scheduling tree, except leaves + * and root. + */ +enum net_shaper_scope { + NET_SHAPER_SCOPE_UNSPEC, + NET_SHAPER_SCOPE_NETDEV, + NET_SHAPER_SCOPE_QUEUE, + NET_SHAPER_SCOPE_NODE, + + /* private: */ + __NET_SHAPER_SCOPE_MAX, + NET_SHAPER_SCOPE_MAX = (__NET_SHAPER_SCOPE_MAX - 1) +}; + +/** + * enum net_shaper_metric - Different metric supported by the shaper. + * @NET_SHAPER_METRIC_BPS: Shaper operates on a bits per second basis. + * @NET_SHAPER_METRIC_PPS: Shaper operates on a packets per second basis. + */ +enum net_shaper_metric { + NET_SHAPER_METRIC_BPS, + NET_SHAPER_METRIC_PPS, +}; + +enum { + NET_SHAPER_A_HANDLE = 1, + NET_SHAPER_A_METRIC, + NET_SHAPER_A_BW_MIN, + NET_SHAPER_A_BW_MAX, + NET_SHAPER_A_BURST, + NET_SHAPER_A_PRIORITY, + NET_SHAPER_A_WEIGHT, + NET_SHAPER_A_IFINDEX, + NET_SHAPER_A_PARENT, + NET_SHAPER_A_LEAVES, + + __NET_SHAPER_A_MAX, + NET_SHAPER_A_MAX = (__NET_SHAPER_A_MAX - 1) +}; + +enum { + NET_SHAPER_A_HANDLE_SCOPE = 1, + NET_SHAPER_A_HANDLE_ID, + + __NET_SHAPER_A_HANDLE_MAX, + NET_SHAPER_A_HANDLE_MAX = (__NET_SHAPER_A_HANDLE_MAX - 1) +}; + +enum { + NET_SHAPER_CMD_GET = 1, + NET_SHAPER_CMD_SET, + NET_SHAPER_CMD_DELETE, + NET_SHAPER_CMD_GROUP, + + __NET_SHAPER_CMD_MAX, + NET_SHAPER_CMD_MAX = (__NET_SHAPER_CMD_MAX - 1) +}; + +#endif /* _UAPI_LINUX_NET_SHAPER_H */ diff --git a/net/Kconfig b/net/Kconfig index a629f92dc86b..c3fca69a7c83 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -72,6 +72,9 @@ config NET_DEVMEM depends on GENERIC_ALLOCATOR depends on PAGE_POOL +config NET_SHAPER + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/Makefile b/net/Makefile index 65bb8c72a35e..60ed5190eda8 100644 --- a/net/Makefile +++ b/net/Makefile @@ -79,3 +79,4 @@ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ 
obj-$(CONFIG_MCTP) += mctp/ obj-$(CONFIG_NET_HANDSHAKE) += handshake/ +obj-$(CONFIG_NET_SHAPER) += shaper/ diff --git a/net/shaper/Makefile b/net/shaper/Makefile new file mode 100644 index 000000000000..54af7169a331 --- /dev/null +++ b/net/shaper/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the net shaper infrastructure. +# +# Copyright (c) 2024, Red Hat, Inc. +# + +obj-y += shaper.o shaper_nl_gen.o diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c new file mode 100644 index 000000000000..a1b20888f502 --- /dev/null +++ b/net/shaper/shaper.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include + +#include "shaper_nl_gen.h" + +int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +void net_shaper_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ +} + +int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +static int __init shaper_init(void) +{ + return genl_register_family(&net_shaper_nl_family); +} + +subsys_initcall(shaper_init); diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c new file mode 100644 index 000000000000..34185c5989e6 --- /dev/null +++ b/net/shaper/shaper_nl_gen.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, 
auto-generated from: */ +/* Documentation/netlink/specs/net_shaper.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "shaper_nl_gen.h" + +#include + +/* Common nested types */ +const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = { + [NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), + [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, }, +}; + +const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = { + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, }, + [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, }, +}; + +/* NET_SHAPER_CMD_GET - do */ +static const struct nla_policy net_shaper_get_do_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), +}; + +/* NET_SHAPER_CMD_GET - dump */ +static const struct nla_policy net_shaper_get_dump_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, +}; + +/* NET_SHAPER_CMD_SET - do */ +static const struct nla_policy net_shaper_set_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_METRIC] = NLA_POLICY_MAX(NLA_U32, 1), + [NET_SHAPER_A_BW_MIN] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BURST] = { .type = NLA_UINT, }, + [NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, }, + [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, }, +}; + +/* NET_SHAPER_CMD_DELETE - do */ +static const struct nla_policy net_shaper_delete_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), +}; + +/* NET_SHAPER_CMD_GROUP - do */ +static const struct nla_policy 
net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_PARENT] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_METRIC] = NLA_POLICY_MAX(NLA_U32, 1), + [NET_SHAPER_A_BW_MIN] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BURST] = { .type = NLA_UINT, }, + [NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, }, + [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, }, + [NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy), +}; + +/* Ops table for net_shaper */ +static const struct genl_split_ops net_shaper_nl_ops[] = { + { + .cmd = NET_SHAPER_CMD_GET, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_get_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_get_do_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_GET, + .start = net_shaper_nl_pre_dumpit, + .dumpit = net_shaper_nl_get_dumpit, + .done = net_shaper_nl_post_dumpit, + .policy = net_shaper_get_dump_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NET_SHAPER_CMD_SET, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_set_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_set_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_DELETE, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_delete_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_delete_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_GROUP, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_group_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_group_nl_policy, + .maxattr = 
NET_SHAPER_A_LEAVES, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, +}; + +struct genl_family net_shaper_nl_family __ro_after_init = { + .name = NET_SHAPER_FAMILY_NAME, + .version = NET_SHAPER_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = net_shaper_nl_ops, + .n_split_ops = ARRAY_SIZE(net_shaper_nl_ops), +}; diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h new file mode 100644 index 000000000000..016cb6f3187b --- /dev/null +++ b/net/shaper/shaper_nl_gen.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/net_shaper.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_NET_SHAPER_GEN_H +#define _LINUX_NET_SHAPER_GEN_H + +#include +#include + +#include + +/* Common nested types */ +extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]; +extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1]; + +int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb); +int net_shaper_nl_post_dumpit(struct netlink_callback *cb); + +int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info); + +extern struct genl_family net_shaper_nl_family; + +#endif /* _LINUX_NET_SHAPER_GEN_H */ -- cgit v1.2.3 From 14bba9285aedefb99647d716b0f61bf32081e387 Mon Sep 17 00:00:00 2001 From: Paolo 
Abeni Date: Wed, 9 Oct 2024 10:09:54 +0200 Subject: netlink: spec: add shaper introspection support Allow the user-space to fine-grain query the shaping features supported by the NIC on each domain. Reviewed-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/3ddd10e450e3fe7d4b944c0d0b886d4483529ee6.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/net_shaper.yaml | 88 +++++++++++++++++++++++++++++ include/uapi/linux/net_shaper.h | 17 ++++++ net/shaper/shaper.c | 32 +++++++++++ net/shaper/shaper_nl_gen.c | 29 ++++++++++ net/shaper/shaper_nl_gen.h | 10 ++++ 5 files changed, 176 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml index 618fc09932ff..8ebad0d02904 100644 --- a/Documentation/netlink/specs/net_shaper.yaml +++ b/Documentation/netlink/specs/net_shaper.yaml @@ -26,6 +26,11 @@ doc: | The user can query the running configuration via the @get operation. + Different devices can provide different feature sets, e.g. with no + support for complex scheduling hierarchy, or for some shaping + parameters. The user can introspect the HW capabilities via the + @cap-get operation. + definitions: - type: enum @@ -148,6 +153,53 @@ attribute-sets: name: priority - name: weight + - + name: caps + attributes: + - + name: ifindex + type: u32 + doc: Interface index queried for shapers capabilities. + - + name: scope + type: u32 + enum: scope + doc: The scope to which the queried capabilities apply. + - + name: support-metric-bps + type: flag + doc: The device accepts 'bps' metric for bw-min, bw-max and burst. + - + name: support-metric-pps + type: flag + doc: The device accepts 'pps' metric for bw-min, bw-max and burst. + - + name: support-nesting + type: flag + doc: | + The device supports nesting shaper belonging to this scope + below 'node' scoped shapers. 
Only 'queue' and 'node' + scope can have flag 'support-nesting'. + - + name: support-bw-min + type: flag + doc: The device supports a minimum guaranteed B/W. + - + name: support-bw-max + type: flag + doc: The device supports maximum B/W shaping. + - + name: support-burst + type: flag + doc: The device supports a maximum burst size. + - + name: support-priority + type: flag + doc: The device supports priority scheduling. + - + name: support-weight + type: flag + doc: The device supports weighted round robin scheduling. operations: list: @@ -272,3 +324,39 @@ operations: - leaves reply: attributes: *ns-binding + + - + name: cap-get + doc: | + Get the shaper capabilities supported by the given device + for the specified scope. + attribute-set: caps + + do: + pre: net-shaper-nl-cap-pre-doit + post: net-shaper-nl-cap-post-doit + request: + attributes: + - ifindex + - scope + reply: + attributes: &cap-attrs + - ifindex + - scope + - support-metric-bps + - support-metric-pps + - support-nesting + - support-bw-min + - support-bw-max + - support-burst + - support-priority + - support-weight + + dump: + pre: net-shaper-nl-cap-pre-dumpit + post: net-shaper-nl-cap-post-dumpit + request: + attributes: + - ifindex + reply: + attributes: *cap-attrs diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h index 9e3fa63618ee..d8834b59f7d7 100644 --- a/include/uapi/linux/net_shaper.h +++ b/include/uapi/linux/net_shaper.h @@ -65,11 +65,28 @@ enum { NET_SHAPER_A_HANDLE_MAX = (__NET_SHAPER_A_HANDLE_MAX - 1) }; +enum { + NET_SHAPER_A_CAPS_IFINDEX = 1, + NET_SHAPER_A_CAPS_SCOPE, + NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS, + NET_SHAPER_A_CAPS_SUPPORT_METRIC_PPS, + NET_SHAPER_A_CAPS_SUPPORT_NESTING, + NET_SHAPER_A_CAPS_SUPPORT_BW_MIN, + NET_SHAPER_A_CAPS_SUPPORT_BW_MAX, + NET_SHAPER_A_CAPS_SUPPORT_BURST, + NET_SHAPER_A_CAPS_SUPPORT_PRIORITY, + NET_SHAPER_A_CAPS_SUPPORT_WEIGHT, + + __NET_SHAPER_A_CAPS_MAX, + NET_SHAPER_A_CAPS_MAX = (__NET_SHAPER_A_CAPS_MAX - 1) +}; + enum { 
NET_SHAPER_CMD_GET = 1, NET_SHAPER_CMD_SET, NET_SHAPER_CMD_DELETE, NET_SHAPER_CMD_GROUP, + NET_SHAPER_CMD_CAP_GET, __NET_SHAPER_CMD_MAX, NET_SHAPER_CMD_MAX = (__NET_SHAPER_CMD_MAX - 1) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 85ad172833fc..92c8da046391 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -598,6 +598,27 @@ int net_shaper_nl_post_dumpit(struct netlink_callback *cb) return 0; } +int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ +} + +int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_binding *binding; @@ -1126,6 +1147,17 @@ free_msg: goto free_leaves; } +int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return 0; +} + +int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return 0; +} + static void net_shaper_flush(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 34185c5989e6..204c8ae8c7b1 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -65,6 +65,17 @@ static const struct nla_policy net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES + [NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy), }; +/* NET_SHAPER_CMD_CAP_GET - do */ +static const struct nla_policy net_shaper_cap_get_do_nl_policy[NET_SHAPER_A_CAPS_SCOPE + 1] = { + [NET_SHAPER_A_CAPS_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_CAPS_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), +}; + +/* 
NET_SHAPER_CMD_CAP_GET - dump */ +static const struct nla_policy net_shaper_cap_get_dump_nl_policy[NET_SHAPER_A_CAPS_IFINDEX + 1] = { + [NET_SHAPER_A_CAPS_IFINDEX] = { .type = NLA_U32, }, +}; + /* Ops table for net_shaper */ static const struct genl_split_ops net_shaper_nl_ops[] = { { @@ -112,6 +123,24 @@ static const struct genl_split_ops net_shaper_nl_ops[] = { .maxattr = NET_SHAPER_A_LEAVES, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NET_SHAPER_CMD_CAP_GET, + .pre_doit = net_shaper_nl_cap_pre_doit, + .doit = net_shaper_nl_cap_get_doit, + .post_doit = net_shaper_nl_cap_post_doit, + .policy = net_shaper_cap_get_do_nl_policy, + .maxattr = NET_SHAPER_A_CAPS_SCOPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_CAP_GET, + .start = net_shaper_nl_cap_pre_dumpit, + .dumpit = net_shaper_nl_cap_get_dumpit, + .done = net_shaper_nl_cap_post_dumpit, + .policy = net_shaper_cap_get_dump_nl_policy, + .maxattr = NET_SHAPER_A_CAPS_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, }; struct genl_family net_shaper_nl_family __ro_after_init = { diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index 016cb6f3187b..cb7f9026fc23 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -17,17 +17,27 @@ extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGH int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +void +net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_pre_dumpit(struct netlink_callback *cb); +int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb); int net_shaper_nl_post_dumpit(struct netlink_callback *cb); +int 
net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb); int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); extern struct genl_family net_shaper_nl_family; -- cgit v1.2.3 From 516010460011ae74ac3b7383cf90ed27e2711cd6 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:57 +0000 Subject: netdev-genl: Dump napi_defer_hard_irqs Support dumping defer_hard_irqs for a NAPI ID. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 8 ++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 4 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 08412c279297..585e87ec3c16 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -248,6 +248,13 @@ attribute-sets: threaded mode. If NAPI is not in threaded mode (i.e. uses normal softirq context), the attribute will be absent. type: u32 + - + name: defer-hard-irqs + doc: The number of consecutive empty polls before IRQ deferral ends + and hardware IRQs are re-enabled. 
+ type: u32 + checks: + max: s32-max - name: queue attributes: @@ -636,6 +643,7 @@ operations: - ifindex - irq - pid + - defer-hard-irqs dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7c308f04e7a0..13dc0b027e86 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -122,6 +122,7 @@ enum { NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, + NETDEV_A_NAPI_DEFER_HARD_IRQS, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 358cba248796..f98e5d1d0d21 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -189,6 +190,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, goto nla_put_failure; } + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7c308f04e7a0..13dc0b027e86 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -122,6 +122,7 @@ enum { NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, + NETDEV_A_NAPI_DEFER_HARD_IRQS, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 0137891e74576f77a7901718dc0ce08ca074ae74 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:59 +0000 Subject: netdev-genl: Dump gro_flush_timeout Support dumping gro_flush_timeout for a NAPI ID. 
Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-5-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 9 +++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 4 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 585e87ec3c16..7b47454c51dd 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -255,6 +255,14 @@ attribute-sets: type: u32 checks: max: s32-max + - + name: gro-flush-timeout + doc: The timeout, in nanoseconds, of when to trigger the NAPI watchdog + timer which schedules NAPI processing. Additionally, a non-zero + value will also prevent GRO from flushing recent super-frames at + the end of a NAPI cycle. This may add receive latency in exchange + for reducing the number of frames processed by the network stack. 
+ type: uint - name: queue attributes: @@ -644,6 +652,7 @@ operations: - irq - pid - defer-hard-irqs + - gro-flush-timeout dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 13dc0b027e86..cacd33359c76 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -123,6 +123,7 @@ enum { NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, + NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index f98e5d1d0d21..ac19f2e6cfbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -195,6 +196,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 13dc0b027e86..cacd33359c76 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -123,6 +123,7 @@ enum { NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, + NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 1287c1ae0fc227e5acef11a539eb4e75646e31c7 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:45:01 +0000 Subject: netdev-genl: Support setting per-NAPI config values Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout. 
Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-7-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 11 ++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl-gen.c | 18 +++++++++++++ net/core/netdev-genl-gen.h | 1 + net/core/netdev-genl.c | 45 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 6 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 7b47454c51dd..f9cb97d6106c 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -693,6 +693,17 @@ operations: reply: attributes: - id + - + name: napi-set + doc: Set configurable NAPI instance settings. + attribute-set: napi + flags: [ admin-perm ] + do: + request: + attributes: + - id + - defer-hard-irqs + - gro-flush-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index cacd33359c76..e3ebb49f60d2 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -201,6 +201,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_NAPI_SET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index b28424ae06d5..e197bd84997c 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -22,6 +22,10 @@ static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = .max = 2147483647ULL, }; +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = 2147483647ULL, +}; + /* Common nested types */ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, 
&netdev_a_page_pool_id_range), @@ -87,6 +91,13 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), }; +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -171,6 +182,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 8cda334fd042..e09dd7539ff2 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index ac19f2e6cfbe..b49c3b4e5fbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -303,6 +303,51 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 gro_flush_timeout = 0; + u32 
defer = 0; + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rtnl_lock(); + + napi = napi_by_id(napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + rtnl_unlock(); + + return err; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index cacd33359c76..e3ebb49f60d2 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -201,6 +201,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_NAPI_SET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From 3607798ad9bdef35ad08489a8239390fccaac6b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:42 +0200 Subject: wifi: cfg80211: add option for vif allowed radios This allows users to prevent a vif from affecting radios other than the configured ones. This can be useful in cases where e.g. an AP is running on one radio, and triggering a scan on another radio should not disturb it. Changing the allowed radios list for a vif is supported, but only while it is down. 
While it is possible to achieve the same by always explicitly specifying a frequency list for scan requests and ensuring that the wrong channel/band is never accidentally set on an unrelated interface, this change makes multi-radio wiphy setups a lot easier to deal with for CLI users. By itself, this patch only enforces the radio mask for scanning requests and remain-on-channel. Follow-up changes build on this to limit configured frequencies. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/eefcb218780f71a1549875d149f1196486762756.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 +++++++++++ include/uapi/linux/nl80211.h | 5 ++++ net/wireless/core.c | 2 ++ net/wireless/nl80211.c | 60 ++++++++++++++++++++++++++++++++++++++------ net/wireless/scan.c | 10 +++++--- net/wireless/util.c | 29 +++++++++++++++++++++ 6 files changed, 109 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c8ce5c2e14f4..95d05e67e69a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6221,6 +6221,7 @@ enum ieee80211_ap_reg_power { * entered. * @links.cac_time_ms: CAC time in ms * @valid_links: bitmap describing what elements of @links are valid + * @radio_mask: Bitmask of radios that this interface is allowed to operate on. 
*/ struct wireless_dev { struct wiphy *wiphy; @@ -6333,6 +6334,8 @@ struct wireless_dev { unsigned int cac_time_ms; } links[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links; + + u32 radio_mask; }; static inline const u8 *wdev_address(struct wireless_dev *wdev) @@ -6518,6 +6521,17 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef); +/** + * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel + * + * @wdev: the wireless device + * @chan: channel to check + * + * Return: whether or not the wdev may use the channel + */ +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan); + /** * ieee80211_get_response_rate - get basic rate for a given rate * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f97f5adc8d51..d31ccee99cc7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2868,6 +2868,9 @@ enum nl80211_commands { * nested item, it contains attributes defined in * &enum nl80211_if_combination_attrs. * + * @NL80211_ATTR_VIF_RADIO_MASK: Bitmask of allowed radios (u32). + * A value of 0 means all radios. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3416,6 +3419,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_RADIOS, NL80211_ATTR_WIPHY_INTERFACE_COMBINATIONS, + NL80211_ATTR_VIF_RADIO_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index 4c8d8f167409..93d62a1d3a45 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1430,6 +1430,8 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; + wdev->radio_mask = BIT(wdev->wiphy->n_radio) - 1; + if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fb35c03af34c..a330347dd7a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -829,6 +829,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG }, + [NL80211_ATTR_VIF_RADIO_MASK] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -3996,7 +3997,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ (cfg80211_rdev_list_generation << 2)) || - nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) + nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr) || + nla_put_u32(msg, NL80211_ATTR_VIF_RADIO_MASK, wdev->radio_mask)) goto nla_put_failure; if (rdev->ops->get_channel && !wdev->valid_links) { @@ -4312,6 +4314,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, 
return -EOPNOTSUPP; } +static int nl80211_parse_vif_radio_mask(struct genl_info *info, + u32 *radio_mask) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct nlattr *attr = info->attrs[NL80211_ATTR_VIF_RADIO_MASK]; + u32 mask, allowed; + + if (!attr) { + *radio_mask = 0; + return 0; + } + + allowed = BIT(rdev->wiphy.n_radio) - 1; + mask = nla_get_u32(attr); + if (mask & ~allowed) + return -EINVAL; + if (!mask) + mask = allowed; + *radio_mask = mask; + + return 1; +} + static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4319,6 +4344,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) int err; enum nl80211_iftype otype, ntype; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u32 radio_mask = 0; bool change = false; memset(&params, 0, sizeof(params)); @@ -4332,8 +4359,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MESH_ID]) { - struct wireless_dev *wdev = dev->ieee80211_ptr; - if (ntype != NL80211_IFTYPE_MESH_POINT) return -EINVAL; if (otype != NL80211_IFTYPE_MESH_POINT) @@ -4364,6 +4389,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (err > 0) change = true; + err = nl80211_parse_vif_radio_mask(info, &radio_mask); + if (err < 0) + return err; + if (err && netif_running(dev)) + return -EBUSY; + if (change) err = cfg80211_change_iface(rdev, dev, ntype, &params); else @@ -4372,11 +4403,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (!err && params.use_4addr != -1) dev->ieee80211_ptr->use_4addr = params.use_4addr; - if (change && !err) { - struct wireless_dev *wdev = dev->ieee80211_ptr; + if (radio_mask) + wdev->radio_mask = radio_mask; + if (change && !err) nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE); - } return err; } @@ 
-4387,6 +4418,7 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) struct vif_params params; struct wireless_dev *wdev; struct sk_buff *msg; + u32 radio_mask; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; @@ -4424,6 +4456,10 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) if (err < 0) return err; + err = nl80211_parse_vif_radio_mask(info, &radio_mask); + if (err < 0) + return err; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -4465,6 +4501,9 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) break; } + if (radio_mask) + wdev->radio_mask = radio_mask; + if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0, rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) { nlmsg_free(msg); @@ -9156,6 +9195,9 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, lockdep_assert_wiphy(wdev->wiphy); + if (!cfg80211_wdev_channel_allowed(wdev, chan)) + return false; + if (!cfg80211_beaconing_iface_active(wdev)) return true; @@ -9368,7 +9410,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } /* ignore disabled channels */ - if (chan->flags & IEEE80211_CHAN_DISABLED) + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(wdev, chan)) continue; request->channels[i] = chan; @@ -9388,7 +9431,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; - if (chan->flags & IEEE80211_CHAN_DISABLED) + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(wdev, chan)) continue; request->channels[i] = chan; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8ba618f4734f..8e3d46bf4836 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -956,7 +956,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) struct ieee80211_channel *chan = 
ieee80211_get_channel(&rdev->wiphy, ap->center_freq); - if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) continue; for (i = 0; i < rdev_req->n_channels; i++) { @@ -3515,9 +3516,12 @@ int cfg80211_wext_siwscan(struct net_device *dev, continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + struct ieee80211_channel *chan; + /* ignore disabled channels */ - if (wiphy->bands[band]->channels[j].flags & - IEEE80211_CHAN_DISABLED) + chan = &wiphy->bands[band]->channels[j]; + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(creq->wdev, chan)) continue; /* If we have a wireless request structure and the diff --git a/net/wireless/util.c b/net/wireless/util.c index 93a9c32418a6..040d62051eb9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2923,3 +2923,32 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, return true; } EXPORT_SYMBOL(cfg80211_radio_chandef_valid); + +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + struct wiphy *wiphy = wdev->wiphy; + const struct wiphy_radio *radio; + struct cfg80211_chan_def chandef; + u32 radio_mask; + int i; + + radio_mask = wdev->radio_mask; + if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1) + return true; + + cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20); + for (i = 0; i < wiphy->n_radio; i++) { + if (!(radio_mask & BIT(i))) + continue; + + radio = &wiphy->radio[i]; + if (!cfg80211_radio_chandef_valid(radio, &chandef)) + continue; + + return true; + } + + return false; +} +EXPORT_SYMBOL(cfg80211_wdev_channel_allowed); -- cgit v1.2.3 From ebda716ea4da03326ac4d0a71604d18aa8a2e695 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:45 +0200 Subject: wifi: cfg80211: report per wiphy radio antenna mask With multi-radio devices, each radio typically gets a fixed 
set of antennas. In order to be able to disable specific antennas for some radios, user space needs to know which antenna mask bits are assigned to which radio. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e0a26afa2c88eaa188ec96ec6d17ecac4e827641.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95d05e67e69a..3100733f3e23 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5434,6 +5434,8 @@ struct wiphy_radio_freq_range { * @iface_combinations: Valid interface combinations array, should not * list single interface types. * @n_iface_combinations: number of entries in @iface_combinations array. + * + * @antenna_mask: bitmask of antennas connected to this radio. */ struct wiphy_radio { const struct wiphy_radio_freq_range *freq_range; @@ -5441,6 +5443,8 @@ struct wiphy_radio { const struct ieee80211_iface_combination *iface_combinations; int n_iface_combinations; + + u32 antenna_mask; }; #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d31ccee99cc7..1b8827f920ff 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8036,6 +8036,8 @@ enum nl80211_ap_settings_flags { * @NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION: Supported interface * combination for this radio. Attribute may be present multiple times * and contains attributes defined in &enum nl80211_if_combination_attrs. + * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas + * connected to this radio. 
* * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute @@ -8046,6 +8048,7 @@ enum nl80211_wiphy_radio_attrs { NL80211_WIPHY_RADIO_ATTR_INDEX, NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE, NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION, + NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, /* keep last */ __NL80211_WIPHY_RADIO_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a330347dd7a3..aa78f18dd454 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2431,6 +2431,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx)) goto nla_put_failure; + if (r->antenna_mask && + nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, + r->antenna_mask)) + goto nla_put_failure; + for (i = 0; i < r->n_freq_range; i++) { const struct wiphy_radio_freq_range *range = &r->freq_range[i]; -- cgit v1.2.3 From a77e527b470cc38754c730bce1483711f643bb60 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:49 +0200 Subject: wifi: cfg80211: add monitor SKIP_TX flag This can be used to indicate that the user is not interested in receiving locally sent packets on the monitor interface. 
Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/f0c20f832eadd36c71fba9a2a16ba57d78389b6c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 1 + 3 files changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5feb93ba0400..8f9853b1a5d1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2267,6 +2267,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering * @MONITOR_FLAG_COOK_FRAMES: report frames after processing * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address + * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ enum monitor_flags { MONITOR_FLAG_CHANGED = BIT(__NL80211_MNTR_FLAG_INVALID), @@ -2276,6 +2277,7 @@ enum monitor_flags { MONITOR_FLAG_OTHER_BSS = BIT(NL80211_MNTR_FLAG_OTHER_BSS), MONITOR_FLAG_COOK_FRAMES = BIT(NL80211_MNTR_FLAG_COOK_FRAMES), MONITOR_FLAG_ACTIVE = BIT(NL80211_MNTR_FLAG_ACTIVE), + MONITOR_FLAG_SKIP_TX = BIT(NL80211_MNTR_FLAG_SKIP_TX), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1b8827f920ff..6d11437596b9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4703,6 +4703,7 @@ enum nl80211_survey_info { * overrides all other flags. * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address * and ACK incoming unicast packets. 
+ * @NL80211_MNTR_FLAG_SKIP_TX: do not pass local tx packets * * @__NL80211_MNTR_FLAG_AFTER_LAST: internal use * @NL80211_MNTR_FLAG_MAX: highest possible monitor flag @@ -4715,6 +4716,7 @@ enum nl80211_mntr_flags { NL80211_MNTR_FLAG_OTHER_BSS, NL80211_MNTR_FLAG_COOK_FRAMES, NL80211_MNTR_FLAG_ACTIVE, + NL80211_MNTR_FLAG_SKIP_TX, /* keep last */ __NL80211_MNTR_FLAG_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84015f56e93a..4a8c3b6d49d1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4206,6 +4206,7 @@ static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = { [NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG }, [NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG }, [NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_SKIP_TX] = { .type = NLA_FLAG }, }; static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) -- cgit v1.2.3 From 1ddf9916ac09313128e40d6581cef889c0b4ce84 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:42 +0200 Subject: xfrm: Add support for per cpu xfrm state handling. Currently all flows for a certain SA must be processed by the same cpu to avoid packet reordering and lock contention of the xfrm state lock. To get rid of this limitation, the IETF standardized per cpu SAs in RFC 9611. This patch implements the xfrm part of it. We add the cpu as a lookup key for xfrm states and a config option to generate acquire messages for each cpu. With that, we can have on each cpu a SA with identical traffic selector so that flows can be processed in parallel on all cpus. 
Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 5 ++-- include/uapi/linux/xfrm.h | 2 ++ net/key/af_key.c | 7 +++--- net/xfrm/xfrm_compat.c | 6 +++-- net/xfrm/xfrm_state.c | 58 ++++++++++++++++++++++++++++++++++++++--------- net/xfrm/xfrm_user.c | 56 +++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 112 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a0bdd58f401c..f5275618e744 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -188,6 +188,7 @@ struct xfrm_state { refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -1684,7 +1685,7 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1796,7 +1797,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index f28701500714..d73a97e3030a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -322,6 +322,7 @@ enum xfrm_attr_type_t { XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ XFRMA_SA_DIR, /* __u8 */ XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in 
seconds for NAT keepalive */ + XFRMA_SA_PCPU, /* __u32 */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ @@ -437,6 +438,7 @@ struct xfrm_userpolicy_info { #define XFRM_POLICY_LOCALOK 1 /* Allow user to override global policy */ /* Automatically expand selector to include matching ICMP payloads. */ #define XFRM_POLICY_ICMP 2 +#define XFRM_POLICY_CPU_ACQUIRE 4 __u8 share; }; diff --git a/net/key/af_key.c b/net/key/af_key.c index f79fb99271ed..c56bb4f451e6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1354,7 +1354,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (hdr->sadb_msg_seq) { - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; @@ -1362,7 +1362,8 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (!x) - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, + proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; @@ -1417,7 +1418,7 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x == NULL) return 0; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 91357ccaf4af..5b9ee63e30b6 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -132,6 +132,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = { [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, + [XFRMA_SA_PCPU] = { .type 
= NLA_U32 }, }; static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, @@ -282,9 +283,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_MTIMER_THRESH: case XFRMA_SA_DIR: case XFRMA_NAT_KEEPALIVE_INTERVAL: + case XFRMA_SA_PCPU: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -439,7 +441,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 37478d36a8df..ebef07b80afa 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -679,6 +679,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) x->lft.hard_packet_limit = XFRM_INF; x->replay_maxage = 0; x->replay_maxdiff = 0; + x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); } return x; @@ -1155,6 +1156,12 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, struct xfrm_state **best, int *acq_in_progress, int *error) { + /* We need the cpu id just as a lookup key, + * we don't require it to be stable. + */ + unsigned int pcpu_id = get_cpu(); + put_cpu(); + /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. 
@@ -1174,13 +1181,18 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, &fl->u.__fl_common)) return; + if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id) + return; + if (!*best || + ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) || (*best)->km.dying > x->km.dying || ((*best)->km.dying == x->km.dying && (*best)->curlft.add_time < x->curlft.add_time)) *best = x; } else if (x->km.state == XFRM_STATE_ACQ) { - *acq_in_progress = 1; + if (!*best || x->pcpu_num == pcpu_id) + *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if ((!x->sel.family || @@ -1209,6 +1221,13 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; + unsigned int pcpu_id; + + /* We need the cpu id just as a lookup key, + * we don't require it to be stable. + */ + pcpu_id = get_cpu(); + put_cpu(); to_put = NULL; @@ -1282,7 +1301,10 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, } found: - x = best; + if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || + (best && (best->pcpu_num == pcpu_id))) + x = best; + if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup_all(net, mark, daddr, @@ -1314,6 +1336,8 @@ found: xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); x->if_id = if_id; + if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best) + x->pcpu_num = pcpu_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { @@ -1392,6 +1416,11 @@ found: x = NULL; error = -ESRCH; } + + /* Use the already installed 'fallback' while the CPU-specific + * SA acquire is handled*/ + if (best) + x = best; } out: if (x) { @@ -1524,12 +1553,14 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; u32 if_id 
= xnew->if_id; + u32 cpu_id = xnew->pcpu_num; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && + x->pcpu_num == cpu_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) @@ -1552,7 +1583,7 @@ EXPORT_SYMBOL(xfrm_state_insert); static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, - u32 reqid, u32 if_id, u8 proto, + u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) @@ -1569,6 +1600,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || + x->pcpu_num != pcpu_num || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; @@ -1602,6 +1634,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, break; } + x->pcpu_num = pcpu_num; x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; x->props.family = family; @@ -1630,7 +1663,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, return x; } -static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_add(struct xfrm_state *x) { @@ -1656,7 +1689,7 @@ int xfrm_state_add(struct xfrm_state *x) } if (use_spi && x->km.seq) { - x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); + x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num); if (x1 && ((x1->id.proto != x->id.proto) || !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; @@ -1666,7 +1699,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && !x1) x1 = 
__find_acq_core(net, &x->mark, family, x->props.mode, - x->props.reqid, x->if_id, x->id.proto, + x->props.reqid, x->if_id, x->pcpu_num, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); @@ -1791,6 +1824,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->pcpu_num = orig->pcpu_num; x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; @@ -2066,13 +2100,14 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, - u32 if_id, u8 proto, const xfrm_address_t *daddr, + u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); + x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num, + proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; @@ -2207,7 +2242,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, /* Silly enough, but I'm lazy to build resolution list */ -static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; @@ -2215,6 +2250,7 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && + x->pcpu_num == pcpu_num && x->km.state == XFRM_STATE_ACQ) { xfrm_state_hold(x); return x; @@ -2224,12 +2260,12 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s 
return NULL; } -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_find_acq_byseq(net, mark, seq); + x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e3b8ce89831a..e4d448950d05 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -460,6 +460,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, } } + if (!sa_dir && attrs[XFRMA_SA_PCPU]) { + NL_SET_ERR_MSG(extack, "SA_PCPU only supported with SA_DIR"); + err = -EINVAL; + goto out; + } + out: return err; } @@ -841,6 +847,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, x->nat_keepalive_interval = nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]); + if (attrs[XFRMA_SA_PCPU]) { + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + if (x->pcpu_num >= num_possible_cpus()) + goto error; + } + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; @@ -1296,6 +1308,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } + if (x->pcpu_num != UINT_MAX) { + ret = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (ret) + goto out; + } if (x->dir) ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -1700,6 +1717,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, u32 mark; struct xfrm_mark m; u32 if_id = 0; + u32 pcpu_num = UINT_MAX; p = nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); @@ -1716,8 +1734,16 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_SA_PCPU]) { + pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + if (pcpu_num >= num_possible_cpus()) { + 
err = -EINVAL; + goto out_noput; + } + } + if (p->info.seq) { - x = xfrm_find_acq_byseq(net, mark, p->info.seq); + x = xfrm_find_acq_byseq(net, mark, p->info.seq, pcpu_num); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { xfrm_state_put(x); x = NULL; @@ -1726,7 +1752,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, - if_id, p->info.id.proto, daddr, + if_id, pcpu_num, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; @@ -2526,7 +2552,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ + nla_total_size(4) /* XFRM_AE_ETHR */ - + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */ + + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */ + + nla_total_size(4); /* XFRMA_SA_PCPU */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -2582,6 +2609,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) goto out_cancel; + if (x->pcpu_num != UINT_MAX) + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -2852,6 +2881,13 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_mark_get(attrs, &mark); + if (attrs[XFRMA_SA_PCPU]) { + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + err = -EINVAL; + if (x->pcpu_num >= num_possible_cpus()) + goto free_state; + } + err = verify_newpolicy_info(&ua->policy, extack); if (err) goto free_state; @@ -3182,6 +3218,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, }; 
EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3348,7 +3385,8 @@ static inline unsigned int xfrm_expire_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + nla_total_size(sizeof(struct xfrm_mark)) + - nla_total_size(sizeof_field(struct xfrm_state, dir)); + nla_total_size(sizeof_field(struct xfrm_state, dir)) + + nla_total_size(4); /* XFRMA_SA_PCPU */ } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -3374,6 +3412,11 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) return err; + if (x->pcpu_num != UINT_MAX) { + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (err) + return err; + } if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -3481,6 +3524,8 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) } if (x->if_id) l += nla_total_size(sizeof(x->if_id)); + if (x->pcpu_num) + l += nla_total_size(sizeof(x->pcpu_num)); /* Must count x->lastused as it may become non-zero behind our back. 
*/ l += nla_total_size_64bit(sizeof(u64)); @@ -3587,6 +3632,7 @@ static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x, + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(xfrm_user_sec_ctx_size(x->security)) + + nla_total_size(4) /* XFRMA_SA_PCPU */ + userpolicy_type_attrsize(); } @@ -3623,6 +3669,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); + if (!err && x->pcpu_num != UINT_MAX) + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); if (err) { nlmsg_cancel(skb, nlh); return err; -- cgit v1.2.3 From a1afb959add1fad43cb337448c244ed70bac3109 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 30 Oct 2024 09:11:56 +0100 Subject: dpll: add clock quality level attribute and op In order to allow driver expose quality level of the clock it is running, introduce a new netlink attr with enum to carry it to the userspace. Also, introduce an op the dpll netlink code calls into the driver to obtain the value. Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20241030081157.966604-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/dpll.yaml | 41 +++++++++++++++++++++++++++++++++++ drivers/dpll/dpll_netlink.c | 24 ++++++++++++++++++++ include/linux/dpll.h | 4 ++++ include/uapi/linux/dpll.h | 24 ++++++++++++++++++++ 4 files changed, 93 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index f2894ca35de8..8feefeae5376 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -85,6 +85,36 @@ definitions: This may happen for example if dpll device was previously locked on an input pin of type PIN_TYPE_SYNCE_ETH_PORT. 
render-max: true + - + type: enum + name: clock-quality-level + doc: | + level of quality of a clock device. This mainly applies when + the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. + The current list is defined according to the table 11-7 contained + in ITU-T G.8264/Y.1364 document. One may extend this list freely + by other ITU-T defined clock qualities, or different ones defined + by another standardization body (for those, please use + different prefix). + entries: + - + name: itu-opt1-prc + value: 1 + - + name: itu-opt1-ssu-a + - + name: itu-opt1-ssu-b + - + name: itu-opt1-eec1 + - + name: itu-opt1-prtc + - + name: itu-opt1-eprtc + - + name: itu-opt1-eeec + - + name: itu-opt1-eprc + render-max: true - type: const name: temp-divider @@ -252,6 +282,17 @@ attribute-sets: name: lock-status-error type: u32 enum: lock-status-error + - + name: clock-quality-level + type: u32 + enum: clock-quality-level + multi-attr: true + doc: | + Level of quality of a clock device. This mainly applies when + the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. This could + be put to message multiple times to indicate possible parallel + quality levels (e.g. one specified by ITU option 1 and another + one specified by option 2). 
- name: pin enum-name: dpll_a_pin diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index fc0280dcddd1..c130f87147fa 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -169,6 +169,27 @@ dpll_msg_add_temp(struct sk_buff *msg, struct dpll_device *dpll, return 0; } +static int +dpll_msg_add_clock_quality_level(struct sk_buff *msg, struct dpll_device *dpll, + struct netlink_ext_ack *extack) +{ + const struct dpll_device_ops *ops = dpll_device_ops(dpll); + DECLARE_BITMAP(qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) = { 0 }; + enum dpll_clock_quality_level ql; + int ret; + + if (!ops->clock_quality_level_get) + return 0; + ret = ops->clock_quality_level_get(dpll, dpll_priv(dpll), qls, extack); + if (ret) + return ret; + for_each_set_bit(ql, qls, DPLL_CLOCK_QUALITY_LEVEL_MAX) + if (nla_put_u32(msg, DPLL_A_CLOCK_QUALITY_LEVEL, ql)) + return -EMSGSIZE; + + return 0; +} + static int dpll_msg_add_pin_prio(struct sk_buff *msg, struct dpll_pin *pin, struct dpll_pin_ref *ref, @@ -557,6 +578,9 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg, if (ret) return ret; ret = dpll_msg_add_lock_status(msg, dpll, extack); + if (ret) + return ret; + ret = dpll_msg_add_clock_quality_level(msg, dpll, extack); if (ret) return ret; ret = dpll_msg_add_mode(msg, dpll, extack); diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 81f7b623d0ba..5e4f9ab1cf75 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -26,6 +26,10 @@ struct dpll_device_ops { struct netlink_ext_ack *extack); int (*temp_get)(const struct dpll_device *dpll, void *dpll_priv, s32 *temp, struct netlink_ext_ack *extack); + int (*clock_quality_level_get)(const struct dpll_device *dpll, + void *dpll_priv, + unsigned long *qls, + struct netlink_ext_ack *extack); }; struct dpll_pin_ops { diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index b0654ade7b7e..2b7ec2da4bcc 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h 
@@ -79,6 +79,29 @@ enum dpll_lock_status_error { DPLL_LOCK_STATUS_ERROR_MAX = (__DPLL_LOCK_STATUS_ERROR_MAX - 1) }; +/** + * enum dpll_clock_quality_level - level of quality of a clock device. This + * mainly applies when the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. The + * current list is defined according to the table 11-7 contained in ITU-T + * G.8264/Y.1364 document. One may extend this list freely by other ITU-T + * defined clock qualities, or different ones defined by another + * standardization body (for those, please use different prefix). + */ +enum dpll_clock_quality_level { + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRC = 1, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_SSU_A, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_SSU_B, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EEC1, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRTC, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRTC, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EEEC, + DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_EPRC, + + /* private: */ + __DPLL_CLOCK_QUALITY_LEVEL_MAX, + DPLL_CLOCK_QUALITY_LEVEL_MAX = (__DPLL_CLOCK_QUALITY_LEVEL_MAX - 1) +}; + #define DPLL_TEMP_DIVIDER 1000 /** @@ -180,6 +203,7 @@ enum dpll_a { DPLL_A_TEMP, DPLL_A_TYPE, DPLL_A_LOCK_STATUS_ERROR, + DPLL_A_CLOCK_QUALITY_LEVEL, __DPLL_A_MAX, DPLL_A_MAX = (__DPLL_A_MAX - 1) -- cgit v1.2.3 From 43d3487035e9a86fad952de4240a518614240d43 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 29 Oct 2024 15:55:35 -0600 Subject: UAPI: ethtool: Use __struct_group() in struct ethtool_link_settings Use the `__struct_group()` helper to create a new tagged `struct ethtool_link_settings_hdr`. This structure groups together all the members of the flexible `struct ethtool_link_settings` except the flexible array. As a result, the array is effectively separated from the rest of the members without modifying the memory layout of the flexible structure. This new tagged struct will be used to fix problematic declarations of middle-flex-arrays in composite structs[1]. 
[1] https://git.kernel.org/linus/d88cabfd9abc Signed-off-by: Gustavo A. R. Silva Link: https://patch.msgid.link/9e9fb0bd72e5ba1e916acbb4995b1e358b86a689.1730238285.git.gustavoars@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index c405ed63acfa..fc1f54b065f9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2511,21 +2511,24 @@ enum ethtool_reset_flags { * autonegotiation; 0 if unknown or not applicable. Read-only. */ struct ethtool_link_settings { - __u32 cmd; - __u32 speed; - __u8 duplex; - __u8 port; - __u8 phy_address; - __u8 autoneg; - __u8 mdio_support; - __u8 eth_tp_mdix; - __u8 eth_tp_mdix_ctrl; - __s8 link_mode_masks_nwords; - __u8 transceiver; - __u8 master_slave_cfg; - __u8 master_slave_state; - __u8 rate_matching; - __u32 reserved[7]; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(ethtool_link_settings_hdr, hdr, /* no attrs */, + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u8 transceiver; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 rate_matching; + __u32 reserved[7]; + ); __u32 link_mode_masks[]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; -- cgit v1.2.3 From 690e50dd69ee48e43e0f7c42396487da1b81be14 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 3 Nov 2024 08:53:14 -0800 Subject: tools: ynl-gen: de-kdocify enums with no doc for entries Sometimes the names of the enum entries are self-explanatory or come from standards. Forcing authors to write trivial kdoc for each of such entries seems unreasonable, but kdoc would complain about undocumented entries. 
Detect enums which only have documentation for the entire type and no documentation for entries. Render their doc as a plain comment. Link: https://patch.msgid.link/20241103165314.1631237-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/dpll.h | 14 +++++++------- tools/net/ynl/lib/nlspec.py | 3 +++ tools/net/ynl/ynl-gen-c.py | 14 +++++++++----- 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 2b7ec2da4bcc..bf97d4b6d51f 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -79,13 +79,13 @@ enum dpll_lock_status_error { DPLL_LOCK_STATUS_ERROR_MAX = (__DPLL_LOCK_STATUS_ERROR_MAX - 1) }; -/** - * enum dpll_clock_quality_level - level of quality of a clock device. This - * mainly applies when the dpll lock-status is DPLL_LOCK_STATUS_HOLDOVER. The - * current list is defined according to the table 11-7 contained in ITU-T - * G.8264/Y.1364 document. One may extend this list freely by other ITU-T - * defined clock qualities, or different ones defined by another - * standardization body (for those, please use different prefix). +/* + * level of quality of a clock device. This mainly applies when the dpll + * lock-status is DPLL_LOCK_STATUS_HOLDOVER. The current list is defined + * according to the table 11-7 contained in ITU-T G.8264/Y.1364 document. One + * may extend this list freely by other ITU-T defined clock qualities, or + * different ones defined by another standardization body (for those, please + * use different prefix). 
*/ enum dpll_clock_quality_level { DPLL_CLOCK_QUALITY_LEVEL_ITU_OPT1_PRC = 1, diff --git a/tools/net/ynl/lib/nlspec.py b/tools/net/ynl/lib/nlspec.py index b6d6f8aef423..a745739655ad 100644 --- a/tools/net/ynl/lib/nlspec.py +++ b/tools/net/ynl/lib/nlspec.py @@ -131,6 +131,9 @@ class SpecEnumSet(SpecElement): def has_doc(self): if 'doc' in self.yaml: return True + return self.has_entry_doc() + + def has_entry_doc(self): for entry in self.entries.values(): if entry.has_doc(): return True diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index aa22eb092475..c48b69071111 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2437,11 +2437,15 @@ def render_uapi(family, cw): enum = family.consts[const['name']] if enum.has_doc(): - cw.p('/**') - doc = '' - if 'doc' in enum: - doc = ' - ' + enum['doc'] - cw.write_doc_line(enum.enum_name + doc) + if enum.has_entry_doc(): + cw.p('/**') + doc = '' + if 'doc' in enum: + doc = ' - ' + enum['doc'] + cw.write_doc_line(enum.enum_name + doc) + else: + cw.p('/*') + cw.write_doc_line(enum['doc'], indent=False) for entry in enum.entries.values(): if entry.has_doc(): doc = '@' + entry.c_name + ': ' + entry['doc'] -- cgit v1.2.3 From 84bfbfbbd32aee136afea4b6bf82581dce79c305 Mon Sep 17 00:00:00 2001 From: Maurice Lambert Date: Sun, 3 Nov 2024 23:39:50 +0100 Subject: netlink: typographical error in nlmsg_type constants definition This commit fixes a typographical error in the netlink nlmsg_type constant definitions in include/uapi/linux/rtnetlink.h at line 177. The definition is RTM_NEWNVLAN RTM_NEWVLAN instead of RTM_NEWVLAN RTM_NEWVLAN.
Signed-off-by: Maurice Lambert Fixes: 8dcea187088b ("net: bridge: vlan: add rtm definitions and dump support") Link: https://patch.msgid.link/20241103223950.230300-1-mauricelambert434@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 3b687d20c9ed..db7254d52d93 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -174,7 +174,7 @@ enum { #define RTM_GETLINKPROP RTM_GETLINKPROP RTM_NEWVLAN = 112, -#define RTM_NEWNVLAN RTM_NEWVLAN +#define RTM_NEWVLAN RTM_NEWVLAN RTM_DELVLAN, #define RTM_DELVLAN RTM_DELVLAN RTM_GETVLAN, -- cgit v1.2.3 From 9907cda95fcbf44141b1292faab89cf8ec542f22 Mon Sep 17 00:00:00 2001 From: Juraj Šarinay Date: Sun, 3 Nov 2024 13:45:25 +0100 Subject: net: nfc: Propagate ISO14443 type A target ATS to userspace via netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 20-byte field ats to struct nfc_target and expose it as NFC_ATTR_TARGET_ATS via the netlink interface. The payload contains 'historical bytes' that help to distinguish cards from one another. The information is commonly used to assemble an emulated ATR similar to that reported by smart cards with contacts. Add a 20-byte field target_ats to struct nci_dev to hold the payload obtained in nci_rf_intf_activated_ntf_packet() and copy it over to nfc_target.ats in nci_activate_target(). The approach is similar to the handling of 'general bytes' within ATR_RES. Replace the hard-coded size of rats_res within struct activation_params_nfca_poll_iso_dep by the equal constant NFC_ATS_MAXSIZE now defined in nfc.h. Within NCI, the information corresponds to the 'RATS Response' activation parameter that omits the initial length byte TL.
This loses no information and is consistent with our handling of SENSB_RES that also drops the first (constant) byte. Tested with nxp_nci_i2c on a few type A targets including an ICAO 9303 compliant passport. I refrain from the corresponding change to digital_in_recv_ats() to have the few drivers based on digital.h fill nfc_target.ats, as I have no way to test it. That class of drivers appear not to set NFC_ATTR_TARGET_SENSB_RES either. Consider a separate patch to propagate (all) the parameters. Signed-off-by: Juraj Šarinay Link: https://patch.msgid.link/20241103124525.8392-1-juraj@sarinay.com Signed-off-by: Paolo Abeni --- include/net/nfc/nci.h | 2 +- include/net/nfc/nci_core.h | 4 ++++ include/net/nfc/nfc.h | 4 ++++ include/uapi/linux/nfc.h | 3 +++ net/nfc/nci/core.c | 13 ++++++++++++- net/nfc/nci/ntf.c | 32 +++++++++++++++++++++++++++++++- net/nfc/netlink.c | 5 +++++ 7 files changed, 60 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index dc36519d16aa..09efcaed7c3f 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -475,7 +475,7 @@ struct nci_rf_discover_ntf { #define NCI_OP_RF_INTF_ACTIVATED_NTF nci_opcode_pack(NCI_GID_RF_MGMT, 0x05) struct activation_params_nfca_poll_iso_dep { __u8 rats_res_len; - __u8 rats_res[20]; + __u8 rats_res[NFC_ATS_MAXSIZE]; }; struct activation_params_nfcb_poll_iso_dep { diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index ea8595651c38..e180bdf2f82b 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -265,6 +265,10 @@ struct nci_dev { /* stored during intf_activated_ntf */ __u8 remote_gb[NFC_MAX_GT_LEN]; __u8 remote_gb_len; + + /* stored during intf_activated_ntf */ + __u8 target_ats[NFC_ATS_MAXSIZE]; + __u8 target_ats_len; }; /* ----- NCI Devices ----- */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 3a3781838c67..127e6c7d910d 100644 --- a/include/net/nfc/nfc.h +++ 
b/include/net/nfc/nfc.h @@ -86,6 +86,8 @@ struct nfc_ops { * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. + * @ats_len: length of Answer To Select in bytes + * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; @@ -105,6 +107,8 @@ struct nfc_target { u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; + u8 ats_len; + u8 ats[NFC_ATS_MAXSIZE]; }; /** diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 4fa4e979e948..2f5b4be25261 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -164,6 +164,7 @@ enum nfc_commands { * @NFC_ATTR_VENDOR_SUBCMD: Vendor specific sub command * @NFC_ATTR_VENDOR_DATA: Vendor specific data, to be optionally passed * to a vendor specific command implementation + * @NFC_ATTR_TARGET_ATS: ISO 14443 type A target Answer To Select */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -198,6 +199,7 @@ enum nfc_attrs { NFC_ATTR_VENDOR_ID, NFC_ATTR_VENDOR_SUBCMD, NFC_ATTR_VENDOR_DATA, + NFC_ATTR_TARGET_ATS, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; @@ -225,6 +227,7 @@ enum nfc_sdp_attr { #define NFC_GB_MAXSIZE 48 #define NFC_FIRMWARE_NAME_MAXSIZE 32 #define NFC_ISO15693_UID_MAXSIZE 8 +#define NFC_ATS_MAXSIZE 20 /* NFC protocols */ #define NFC_PROTO_JEWEL 1 diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index f456a5911e7d..1ec5955fe469 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -757,6 +757,14 @@ int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) } EXPORT_SYMBOL(nci_core_conn_close); +static void nci_set_target_ats(struct nfc_target *target, struct nci_dev *ndev) +{ + if (ndev->target_ats_len > 0) { + target->ats_len = ndev->target_ats_len; + memcpy(target->ats, ndev->target_ats, target->ats_len); + } +} + static int 
nci_set_local_general_bytes(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); @@ -939,8 +947,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT)); } - if (!rc) + if (!rc) { ndev->target_active_prot = protocol; + if (protocol == NFC_PROTO_ISO14443) + nci_set_target_ats(target, ndev); + } return rc; } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 994a0a1efb58..a818eff27e6b 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -402,7 +402,7 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: nfca_poll = &ntf->activation_params.nfca_poll_iso_dep; - nfca_poll->rats_res_len = min_t(__u8, *data++, 20); + nfca_poll->rats_res_len = min_t(__u8, *data++, NFC_ATS_MAXSIZE); pr_debug("rats_res_len %d\n", nfca_poll->rats_res_len); if (nfca_poll->rats_res_len > 0) { memcpy(nfca_poll->rats_res, @@ -531,6 +531,28 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, return NCI_STATUS_OK; } +static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, + const struct nci_rf_intf_activated_ntf *ntf) +{ + ndev->target_ats_len = 0; + + if (ntf->activation_params_len <= 0) + return NCI_STATUS_OK; + + if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > NFC_ATS_MAXSIZE) { + pr_debug("ATS too long\n"); + return NCI_STATUS_RF_PROTOCOL_ERROR; + } + + if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > 0) { + ndev->target_ats_len = ntf->activation_params.nfca_poll_iso_dep.rats_res_len; + memcpy(ndev->target_ats, ntf->activation_params.nfca_poll_iso_dep.rats_res, + ndev->target_ats_len); + } + + return NCI_STATUS_OK; +} + static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { @@ -660,6 +682,14 @@ exit: if (err != NCI_STATUS_OK) pr_err("unable to store general bytes\n"); } + + /* store ATS to be reported later in nci_activate_target */ 
+ if (ntf.rf_interface == NCI_RF_INTERFACE_ISO_DEP && + ntf.activation_rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) { + err = nci_store_ats_nfc_iso_dep(ndev, &ntf); + if (err != NCI_STATUS_OK) + pr_err("unable to store ATS\n"); + } } if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) { diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index dd2ce73a24fb..6a40b8d0350d 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -96,6 +96,11 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, goto nla_put_failure; } + if (target->ats_len > 0 && + nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, + target->ats)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From 580db513b4a9d52f306580015a1872eea0a0894e Mon Sep 17 00:00:00 2001 From: Khang Nguyen Date: Tue, 5 Nov 2024 14:19:15 +0700 Subject: net: mctp: Expose transport binding identifier via IFLA attribute MCTP control protocol implementations are transport binding dependent. Endpoint discovery is mandatory based on transport binding. Message timing requirements are specified in each respective transport binding specification. However, we currently have no means to get this information from MCTP links. Add an IFLA_MCTP_PHYS_BINDING netlink link attribute, which represents the transport type using the DMTF DSP0239-defined type numbers, returned as part of RTM_GETLINK data. We get an IFLA_MCTP_PHYS_BINDING attribute for each MCTP link, for example: - 0x00 (unspec) for loopback interface; - 0x01 (SMBus/I2C) for mctpi2c%d interfaces; and - 0x05 (serial) for mctpserial%d interfaces.
Signed-off-by: Khang Nguyen Reviewed-by: Matt Johnston Link: https://patch.msgid.link/20241105071915.821871-1-khangng@os.amperecomputing.com Signed-off-by: Jakub Kicinski --- drivers/net/mctp/mctp-i2c.c | 3 ++- drivers/net/mctp/mctp-i3c.c | 2 +- drivers/net/mctp/mctp-serial.c | 5 +++-- include/net/mctp.h | 18 ++++++++++++++++++ include/net/mctpdevice.h | 4 +++- include/uapi/linux/if_link.h | 1 + net/mctp/device.c | 12 +++++++++--- 7 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index e70fb6687994..d2b3f5a59141 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -880,7 +880,8 @@ static int mctp_i2c_add_netdev(struct mctp_i2c_client *mcli, goto err; } - rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops); + rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops, + MCTP_PHYS_BINDING_SMBUS); if (rc < 0) { dev_err(&mcli->client->dev, "register netdev \"%s\" failed %d\n", diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index 1bc87a062686..9adad59b8676 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -607,7 +607,7 @@ __must_hold(&busdevs_lock) goto err_free_uninit; } - rc = mctp_register_netdev(ndev, NULL); + rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_I3C); if (rc < 0) { dev_warn(&ndev->dev, "netdev register failed: %d\n", rc); goto err_free_netdev; diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c index e63720ec3238..26c9a33fd636 100644 --- a/drivers/net/mctp/mctp-serial.c +++ b/drivers/net/mctp/mctp-serial.c @@ -23,6 +23,7 @@ #include #include +#include #include #define MCTP_SERIAL_MTU 68 /* base mtu (64) + mctp header */ @@ -470,7 +471,7 @@ static int mctp_serial_open(struct tty_struct *tty) spin_lock_init(&dev->lock); INIT_WORK(&dev->tx_work, mctp_serial_tx_work); - rc = register_netdev(ndev); + rc = mctp_register_netdev(ndev, NULL, 
MCTP_PHYS_BINDING_SERIAL); if (rc) goto free_netdev; @@ -492,7 +493,7 @@ static void mctp_serial_close(struct tty_struct *tty) struct mctp_serial *dev = tty->disc_data; int idx = dev->idx; - unregister_netdev(dev->netdev); + mctp_unregister_netdev(dev->netdev); ida_free(&mctp_serial_ida, idx); } diff --git a/include/net/mctp.h b/include/net/mctp.h index 28d59ae94ca3..1ecbff7116f6 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -298,4 +298,22 @@ void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); +/* MCTP IDs and Codes from DMTF specification + * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf + */ +enum mctp_phys_binding { + MCTP_PHYS_BINDING_UNSPEC = 0x00, + MCTP_PHYS_BINDING_SMBUS = 0x01, + MCTP_PHYS_BINDING_PCIE_VDM = 0x02, + MCTP_PHYS_BINDING_USB = 0x03, + MCTP_PHYS_BINDING_KCS = 0x04, + MCTP_PHYS_BINDING_SERIAL = 0x05, + MCTP_PHYS_BINDING_I3C = 0x06, + MCTP_PHYS_BINDING_MMBI = 0x07, + MCTP_PHYS_BINDING_PCC = 0x08, + MCTP_PHYS_BINDING_UCIE = 0x09, + MCTP_PHYS_BINDING_VENDOR = 0xFF, +}; + #endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 5c0d04b5c12c..957d9ef924c5 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -22,6 +22,7 @@ struct mctp_dev { refcount_t refs; unsigned int net; + enum mctp_phys_binding binding; const struct mctp_netdev_ops *ops; @@ -44,7 +45,8 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops); + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding); void mctp_unregister_netdev(struct net_device *dev); void mctp_dev_hold(struct mctp_dev *mdev); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 
8516c1ccd57a..2575e0cd9b48 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1958,6 +1958,7 @@ struct ifla_rmnet_flags { enum { IFLA_MCTP_UNSPEC, IFLA_MCTP_NET, + IFLA_MCTP_PHYS_BINDING, __IFLA_MCTP_MAX, }; diff --git a/net/mctp/device.c b/net/mctp/device.c index 3d75b919995d..26ce34b7e88e 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -371,6 +371,8 @@ static int mctp_fill_link_af(struct sk_buff *skb, return -ENODATA; if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) return -EMSGSIZE; + if (nla_put_u8(skb, IFLA_MCTP_PHYS_BINDING, mdev->binding)) + return -EMSGSIZE; return 0; } @@ -385,6 +387,7 @@ static size_t mctp_get_link_af_size(const struct net_device *dev, if (!mdev) return 0; ret = nla_total_size(4); /* IFLA_MCTP_NET */ + ret += nla_total_size(1); /* IFLA_MCTP_PHYS_BINDING */ mctp_dev_put(mdev); return ret; } @@ -480,7 +483,8 @@ static int mctp_dev_notify(struct notifier_block *this, unsigned long event, } static int mctp_register_netdevice(struct net_device *dev, - const struct mctp_netdev_ops *ops) + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding) { struct mctp_dev *mdev; @@ -489,17 +493,19 @@ static int mctp_register_netdevice(struct net_device *dev, return PTR_ERR(mdev); mdev->ops = ops; + mdev->binding = binding; return register_netdevice(dev); } int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops) + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding) { int rc; rtnl_lock(); - rc = mctp_register_netdevice(dev, ops); + rc = mctp_register_netdevice(dev, ops, binding); rtnl_unlock(); return rc; -- cgit v1.2.3 From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:31 +0000 Subject: net: Add napi_struct parameter irq_suspend_timeout Add a per-NAPI IRQ suspension parameter, which can be get/set with netdev-genl. 
This patch doesn't change any behavior but prepares the code for other changes in the following commits which use irq_suspend_timeout as a timeout for IRQ suspension. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 7 +++++++ include/linux/netdevice.h | 2 ++ include/uapi/linux/netdev.h | 1 + net/core/dev.c | 2 ++ net/core/dev.h | 25 +++++++++++++++++++++++++ net/core/netdev-genl-gen.c | 5 +++-- net/core/netdev-genl.c | 12 ++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f9cb97d6106c..cbb544bd6c84 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -263,6 +263,11 @@ attribute-sets: the end of a NAPI cycle. This may add receive latency in exchange for reducing the number of frames processed by the network stack. 
type: uint + - + name: irq-suspend-timeout + doc: The timeout, in nanoseconds, of how long to suspend irq + processing, if event polling finds events + type: uint - name: queue attributes: @@ -653,6 +658,7 @@ operations: - pid - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout dump: request: attributes: @@ -704,6 +710,7 @@ operations: - id - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df4483598628..0aae346d919e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -348,6 +348,7 @@ struct gro_list { */ struct napi_config { u64 gro_flush_timeout; + u64 irq_suspend_timeout; u32 defer_hard_irqs; unsigned int napi_id; }; @@ -384,6 +385,7 @@ struct napi_struct { struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; + unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 6a31152e4606..4d910872963f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. It will be saved to the config * in napi_disable. 
@@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; n->config->napi_id = n->napi_id; napi_hash_del(n); } diff --git a/net/core/dev.h b/net/core/dev.h index 7881bced70a9..d043dee25a68 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, netdev->napi_config[i].gro_flush_timeout = timeout; } +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 21de7e10be16..a89cbd8d87c3 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] 
= NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; /* Ops table for netdev */ @@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b49c3b4e5fbe..765ce7c9d73b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; @@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) @@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { + u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; @@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) napi_set_defer_hard_irqs(napi, defer); } + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + if 
(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From a12143e6084c502fc3cfaa8b717bffc8c14cf806 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 14 Nov 2024 22:07:51 +0100 Subject: netfilter: bitwise: rename some boolean operation functions In the next patch we add support for doing AND, OR and XOR operations directly in the kernel, so rename some functions and an enum constant related to mask-and-xor boolean operations. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 10 +++++++--- net/netfilter/nft_bitwise.c | 34 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9e9079321380..487542234ccd 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -564,16 +564,20 @@ enum nft_immediate_attributes { /** * enum nft_bitwise_ops - nf_tables bitwise operations * - * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and - * XOR boolean operations + * @NFT_BITWISE_MASK_XOR: mask-and-xor operation used to implement NOT, AND, OR + * and XOR boolean operations * @NFT_BITWISE_LSHIFT: left-shift operation * @NFT_BITWISE_RSHIFT: right-shift operation */ enum nft_bitwise_ops { - NFT_BITWISE_BOOL, + NFT_BITWISE_MASK_XOR, 
NFT_BITWISE_LSHIFT, NFT_BITWISE_RSHIFT, }; +/* + * Old name for NFT_BITWISE_MASK_XOR. Retained for backwards-compatibility. + */ +#define NFT_BITWISE_BOOL NFT_BITWISE_MASK_XOR /** * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 7de95674fd8c..7f6a4f800537 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -25,8 +25,8 @@ struct nft_bitwise { struct nft_data data; }; -static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, - const struct nft_bitwise *priv) +static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) { unsigned int i; @@ -68,8 +68,8 @@ void nft_bitwise_eval(const struct nft_expr *expr, u32 *dst = &regs->data[priv->dreg]; switch (priv->op) { - case NFT_BITWISE_BOOL: - nft_bitwise_eval_bool(dst, src, priv); + case NFT_BITWISE_MASK_XOR: + nft_bitwise_eval_mask_xor(dst, src, priv); break; case NFT_BITWISE_LSHIFT: nft_bitwise_eval_lshift(dst, src, priv); @@ -90,8 +90,8 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; -static int nft_bitwise_init_bool(struct nft_bitwise *priv, - const struct nlattr *const tb[]) +static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { struct nft_data_desc mask = { .type = NFT_DATA_VALUE, @@ -185,7 +185,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { - case NFT_BITWISE_BOOL: + case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: break; @@ -193,12 +193,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } } else { - priv->op = NFT_BITWISE_BOOL; + priv->op = NFT_BITWISE_MASK_XOR; } switch(priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_init_bool(priv, tb); + case
NFT_BITWISE_MASK_XOR: + err = nft_bitwise_init_mask_xor(priv, tb); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: @@ -209,8 +209,8 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return err; } -static int nft_bitwise_dump_bool(struct sk_buff *skb, - const struct nft_bitwise *priv) +static int nft_bitwise_dump_mask_xor(struct sk_buff *skb, + const struct nft_bitwise *priv) { if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) @@ -248,8 +248,8 @@ static int nft_bitwise_dump(struct sk_buff *skb, return -1; switch (priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_dump_bool(skb, priv); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_dump_mask_xor(skb, priv); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: @@ -269,7 +269,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; - if (priv->op != NFT_BITWISE_BOOL) + if (priv->op != NFT_BITWISE_MASK_XOR) return -EOPNOTSUPP; if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || @@ -406,7 +406,7 @@ nft_bitwise_fast_dump(struct sk_buff *skb, return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32)))) return -1; - if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL))) + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR))) return -1; data.data[0] = priv->mask; @@ -501,7 +501,7 @@ nft_bitwise_select_ops(const struct nft_ctx *ctx, return &nft_bitwise_ops; if (tb[NFTA_BITWISE_OP] && - ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL) + ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR) return &nft_bitwise_ops; return &nft_bitwise_fast_ops; -- cgit v1.2.3 From b0ccf4f53d968e794a4ea579d5135cc1aaf1a53f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 14 Nov 2024 22:08:13 +0100 Subject: netfilter: bitwise: add support for doing AND, OR and XOR directly Hitherto, these operations 
have been converted in user space to mask-and-xor operations on one register and two immediate values, and it is the latter which have been evaluated by the kernel. We add support for evaluating these operations directly in kernel space on one register and either an immediate value or a second register. Pablo made a few changes to the original patch: - EINVAL if NFTA_BITWISE_SREG2 is used with fast version. - Allow _AND,_OR,_XOR with _DATA != sizeof(u32) - Dump _SREG2 or _DATA with _AND,_OR,_XOR Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 8 ++ net/netfilter/nft_bitwise.c | 134 ++++++++++++++++++++++++++++--- 2 files changed, 131 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 487542234ccd..49c944e78463 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -568,11 +568,17 @@ enum nft_immediate_attributes { * and XOR boolean operations * @NFT_BITWISE_LSHIFT: left-shift operation * @NFT_BITWISE_RSHIFT: right-shift operation + * @NFT_BITWISE_AND: and operation + * @NFT_BITWISE_OR: or operation + * @NFT_BITWISE_XOR: xor operation */ enum nft_bitwise_ops { NFT_BITWISE_MASK_XOR, NFT_BITWISE_LSHIFT, NFT_BITWISE_RSHIFT, + NFT_BITWISE_AND, + NFT_BITWISE_OR, + NFT_BITWISE_XOR, }; /* * Old name for NFT_BITWISE_MASK_XOR. Retained for backwards-compatibility. @@ -590,6 +596,7 @@ enum nft_bitwise_ops { * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops) * @NFTA_BITWISE_DATA: argument for non-boolean operations * (NLA_NESTED: nft_data_attributes) + * @NFTA_BITWISE_SREG2: second source register (NLA_U32: nft_registers) * * The bitwise expression supports boolean and shift operations. 
It implements * the boolean operations by performing the following operation: @@ -613,6 +620,7 @@ enum nft_bitwise_attributes { NFTA_BITWISE_XOR, NFTA_BITWISE_OP, NFTA_BITWISE_DATA, + NFTA_BITWISE_SREG2, __NFTA_BITWISE_MAX }; #define NFTA_BITWISE_MAX (__NFTA_BITWISE_MAX - 1) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 7f6a4f800537..d550910aabec 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -17,6 +17,7 @@ struct nft_bitwise { u8 sreg; + u8 sreg2; u8 dreg; enum nft_bitwise_ops op:8; u8 len; @@ -60,28 +61,72 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, } } +static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] & src2[i]; +} + +static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] | src2[i]; +} + +static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] ^ src2[i]; +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = &regs->data[priv->sreg]; + const u32 *src = &regs->data[priv->sreg], *src2; u32 *dst = &regs->data[priv->dreg]; - switch (priv->op) { - case NFT_BITWISE_MASK_XOR: + if (priv->op == NFT_BITWISE_MASK_XOR) { nft_bitwise_eval_mask_xor(dst, src, priv); - break; - case NFT_BITWISE_LSHIFT: + return; + } + if (priv->op == NFT_BITWISE_LSHIFT) { nft_bitwise_eval_lshift(dst, src, priv); - break; - case NFT_BITWISE_RSHIFT: + return; + } + if (priv->op == NFT_BITWISE_RSHIFT)
{ nft_bitwise_eval_rshift(dst, src, priv); - break; + return; + } + + src2 = priv->sreg2 ? &regs->data[priv->sreg2] : priv->data.data; + + if (priv->op == NFT_BITWISE_AND) { + nft_bitwise_eval_and(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_OR) { + nft_bitwise_eval_or(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_XOR) { + nft_bitwise_eval_xor(dst, src, src2, priv); + return; } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_SREG2] = { .type = NLA_U32 }, [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, @@ -105,7 +150,8 @@ static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, }; int err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || @@ -139,7 +185,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, int err; if (tb[NFTA_BITWISE_MASK] || - tb[NFTA_BITWISE_XOR]) + tb[NFTA_BITWISE_XOR] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_DATA]) @@ -157,6 +204,41 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, return 0; } +static int nft_bitwise_init_bool(const struct nft_ctx *ctx, + struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) || + (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2])) + return -EINVAL; + + if (tb[NFTA_BITWISE_DATA]) { + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = priv->len, + }; + + err = nft_data_init(NULL, &priv->data, &desc, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + } else { + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2], + &priv->sreg2, priv->len); + if (err < 0) + return
err; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -188,6 +270,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: break; default: return -EOPNOTSUPP; @@ -204,6 +289,11 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, case NFT_BITWISE_RSHIFT: err = nft_bitwise_init_shift(priv, tb); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_init_bool(ctx, priv, tb); + break; } return err; @@ -232,6 +322,21 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (priv->sreg2) { + if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2)) + return -1; + } else { + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + } + + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { @@ -255,6 +360,11 @@ static int nft_bitwise_dump(struct sk_buff *skb, case NFT_BITWISE_RSHIFT: err = nft_bitwise_dump_shift(skb, priv); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_dump_bool(skb, priv); + break; } return err; @@ -299,6 +409,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track, track->regs[priv->dreg].bitwise && track->regs[priv->dreg].bitwise->ops == expr->ops && priv->sreg == bitwise->sreg && + priv->sreg2 == bitwise->sreg2 && priv->dreg == bitwise->dreg && priv->op == bitwise->op && priv->len == bitwise->len && @@ -375,7 +486,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) 
return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || -- cgit v1.2.3 From ebda123fe703f492d7d557a4da00888ddec4779e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 Nov 2024 12:43:04 -0800 Subject: Revert "UAPI: ethtool: Use __struct_group() in struct ethtool_link_settings" This reverts commit 43d3487035e9a86fad952de4240a518614240d43. We cannot use tagged struct groups in UAPI because C++ will throw syntax errors even under "extern C". Signed-off-by: Kees Cook Link: https://patch.msgid.link/20241115204308.3821419-2-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index fc1f54b065f9..c405ed63acfa 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2511,24 +2511,21 @@ enum ethtool_reset_flags { * autonegotiation; 0 if unknown or not applicable. Read-only. */ struct ethtool_link_settings { - /* New members MUST be added within the __struct_group() macro below. 
*/ - __struct_group(ethtool_link_settings_hdr, hdr, /* no attrs */, - __u32 cmd; - __u32 speed; - __u8 duplex; - __u8 port; - __u8 phy_address; - __u8 autoneg; - __u8 mdio_support; - __u8 eth_tp_mdix; - __u8 eth_tp_mdix_ctrl; - __s8 link_mode_masks_nwords; - __u8 transceiver; - __u8 master_slave_cfg; - __u8 master_slave_state; - __u8 rate_matching; - __u32 reserved[7]; - ); + __u32 cmd; + __u32 speed; + __u8 duplex; + __u8 port; + __u8 phy_address; + __u8 autoneg; + __u8 mdio_support; + __u8 eth_tp_mdix; + __u8 eth_tp_mdix_ctrl; + __s8 link_mode_masks_nwords; + __u8 transceiver; + __u8 master_slave_cfg; + __u8 master_slave_state; + __u8 rate_matching; + __u32 reserved[7]; __u32 link_mode_masks[]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; -- cgit v1.2.3 From 96c677fca54a28fcfea4dbab9c1f2530bd0a08d1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 Nov 2024 12:43:05 -0800 Subject: UAPI: ethtool: Avoid flex-array in struct ethtool_link_settings struct ethtool_link_settings tends to be used as a header for other structures that have trailing bytes[1], but has a trailing flexible array itself. Using this overlapped with other structures leads to ambiguous object sizing in the compiler, so we want to avoid such situations (which have caused real bugs in the past). Detecting this can be done with -Wflex-array-member-not-at-end, which will need to be enabled globally. Using a tagged struct_group() to create a new ethtool_link_settings_hdr structure isn't possible as it seems we cannot use the tagged variant of struct_group() due to syntax issues from C++'s perspective (even within "extern C")[2]. Instead, we can just leave the offending member defined in UAPI and remove it from the kernel's view of the structure, as Linux doesn't actually use this member at all. There is also no change in size since it was already a flexible array that didn't contribute to size returned by any use of sizeof(). 
Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/lkml/20241109100213.262a2fa0@kernel.org/ [2] Link: https://lore.kernel.org/lkml/0bc2809fe2a6c11dd4c8a9a10d9bd65cccdb559b.1730238285.git.gustavoars@kernel.org/ [1] Signed-off-by: Kees Cook Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241115204308.3821419-3-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index c405ed63acfa..7e1b3820f91f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2526,12 +2526,19 @@ struct ethtool_link_settings { __u8 master_slave_state; __u8 rate_matching; __u32 reserved[7]; +#ifndef __KERNEL__ + /* Linux builds with -Wflex-array-member-not-at-end but does + * not use the "link_mode_masks" member. Leave it defined for + * userspace for now, and when userspace wants to start using + * -Wfamnae, we'll need a new solution. + */ __u32 link_mode_masks[]; /* layout of link_mode_masks fields: * __u32 map_supported[link_mode_masks_nwords]; * __u32 map_advertising[link_mode_masks_nwords]; * __u32 map_lp_advertising[link_mode_masks_nwords]; */ +#endif }; /** -- cgit v1.2.3