From 53c2d5b14a82f6e7f0f8089083972df20e66a354 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Sat, 1 Oct 2022 10:00:45 +0800 Subject: RDMA/core: return -EOPNOTSUPP for ODP unsupported device ib_reg_mr(3), which is used to register an MR with specific access flags for a specific HCA, sets errno when something goes wrong. So, here we should return the specific -EOPNOTSUPP when the requested ODP access flag is unsupported by the HCA (such as RXE). Signed-off-by: Li Zhijian Link: https://lore.kernel.org/r/20221001020045.8324-1-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 975d6e9efbcb..a1f4d53a4bb6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4334,7 +4334,7 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) - return -EINVAL; + return -EOPNOTSUPP; return 0; } -- cgit v1.2.3 From 7ac7bfe746d8faddbd79abed526ee67f46d8867c Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Sun, 9 Oct 2022 16:10:47 +0800 Subject: RDMA/opa_vnic: fix spelling typo in comment Fix spelling typo in comment. Reported-by: k2ci Signed-off-by: Jiangshan Yi Link: https://lore.kernel.org/r/20221009081047.2643471-1-13667453960@163.com Signed-off-by: Leon Romanovsky --- include/rdma/opa_vnic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index f3d5377b217a..d297f084001a 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -51,7 +51,7 @@ static inline void *opa_vnic_dev_priv(const struct net_device *dev) return oparn->dev_priv; } -/* opa_vnic skb meta data structrue */ +/* opa_vnic skb meta data structure */ struct opa_vnic_skb_mdata { u8 vl; u8 entropy; -- cgit v1.2.3 From 5c20311d76cbaeb7ed2ecf9c8b8322f8fc4a7ae3 Mon Sep 17 00:00:00 2001 From: Leonid Ravich Date: Wed, 9 Nov 2022 11:57:17 +0200 Subject: IB/mad: Don't call a function that might sleep while in atomic context Tracepoints are not allowed to sleep; as such, the following splat is generated due to the call to ib_query_pkey() in atomic context. WARNING: CPU: 0 PID: 1888000 at kernel/trace/ring_buffer.c:2492 rb_commit+0xc1/0x220 CPU: 0 PID: 1888000 Comm: kworker/u9:0 Kdump: loaded Tainted: G OE --------- - - 4.18.0-305.3.1.el8.x86_64 #1 Hardware name: Red Hat KVM, BIOS 1.13.0-2.module_el8.3.0+555+a55c8938 04/01/2014 Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core] RIP: 0010:rb_commit+0xc1/0x220 RSP: 0000:ffffa8ac80f9bca0 EFLAGS: 00010202 RAX: ffff8951c7c01300 RBX: ffff8951c7c14a00 RCX: 0000000000000246 RDX: ffff8951c707c000 RSI: ffff8951c707c57c RDI: ffff8951c7c14a00 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: ffff8951c7c01300 R11: 0000000000000001 R12: 0000000000000246 R13: 0000000000000000 R14: ffffffff964c70c0 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8951fbc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f20e8f39010 CR3: 000000002ca10005 CR4: 0000000000170ef0 Call Trace: ring_buffer_unlock_commit+0x1d/0xa0 trace_buffer_unlock_commit_regs+0x3b/0x1b0 trace_event_buffer_commit+0x67/0x1d0 trace_event_raw_event_ib_mad_recv_done_handler+0x11c/0x160 [ib_core] ib_mad_recv_done+0x48b/0xc10 [ib_core] ?
trace_event_raw_event_cq_poll+0x6f/0xb0 [ib_core] __ib_process_cq+0x91/0x1c0 [ib_core] ib_cq_poll_work+0x26/0x80 [ib_core] process_one_work+0x1a7/0x360 ? create_worker+0x1a0/0x1a0 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x116/0x130 ? kthread_flush_work_fn+0x10/0x10 ret_from_fork+0x35/0x40 ---[ end trace 78ba8509d3830a16 ]--- Fixes: 821bf1de45a1 ("IB/MAD: Add recv path trace point") Signed-off-by: Leonid Ravich Link: https://lore.kernel.org/r/Y2t5feomyznrVj7V@leonid-Inspiron-3421 Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/mad.c | 5 ----- include/trace/events/ib_mad.h | 13 ++++--------- 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 1893aa613ad7..674344eb8e2f 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -59,9 +59,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, struct ib_mad_qp_info *qp_info, struct trace_event_raw_ib_mad_send_template *entry) { - u16 pkey; - struct ib_device *dev = qp_info->port_priv->device; - u32 pnum = qp_info->port_priv->port_num; struct ib_ud_wr *wr = &mad_send_wr->send_wr; struct rdma_ah_attr attr = {}; @@ -69,8 +66,6 @@ static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, /* These are common */ entry->sl = attr.sl; - ib_query_pkey(dev, pnum, wr->pkey_index, &pkey); - entry->pkey = pkey; entry->rqpn = wr->remote_qpn; entry->rqkey = wr->remote_qkey; entry->dlid = rdma_ah_get_dlid(&attr); diff --git a/include/trace/events/ib_mad.h b/include/trace/events/ib_mad.h index 59363a083ecb..d92691c78cff 100644 --- a/include/trace/events/ib_mad.h +++ b/include/trace/events/ib_mad.h @@ -49,7 +49,6 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, __field(int, retries_left) __field(int, max_retries) __field(int, retry) - __field(u16, pkey) ), TP_fast_assign( @@ -89,7 +88,7 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, "hdr : base_ver 0x%x class 0x%x class_ver 0x%x " \ "method 0x%x status 0x%x class_specific 0x%x tid 0x%llx " \ "attr_id 0x%x attr_mod 0x%x => dlid 0x%08x sl %d "\ - "pkey 0x%x rpqn 0x%x rqpkey 0x%x", + "rpqn 0x%x rqpkey 0x%x", __entry->dev_index, __entry->port_num, __entry->qp_num, __entry->agent_priv, be64_to_cpu(__entry->wrtid), __entry->retries_left, __entry->max_retries, @@ -100,7 +99,7 @@ DECLARE_EVENT_CLASS(ib_mad_send_template, be16_to_cpu(__entry->class_specific), be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id), be32_to_cpu(__entry->attr_mod), - be32_to_cpu(__entry->dlid), __entry->sl, __entry->pkey, + be32_to_cpu(__entry->dlid), __entry->sl, __entry->rqpn, __entry->rqkey ) ); @@ -204,7 +203,6 @@ TRACE_EVENT(ib_mad_recv_done_handler, __field(u16, wc_status) __field(u32, slid) __field(u32, dev_index) - __field(u16, pkey) ), TP_fast_assign( @@ -224,9 +222,6 @@ TRACE_EVENT(ib_mad_recv_done_handler, __entry->slid = wc->slid; __entry->src_qp = wc->src_qp; __entry->sl = wc->sl; - ib_query_pkey(qp_info->port_priv->device, - qp_info->port_priv->port_num, - wc->pkey_index, &__entry->pkey); __entry->wc_status = wc->status; ), @@ -234,7 +229,7 @@ TRACE_EVENT(ib_mad_recv_done_handler, "base_ver 0x%02x class 0x%02x class_ver 0x%02x " \ "method 0x%02x status 0x%04x class_specific 0x%04x " \ "tid 0x%016llx attr_id 0x%04x attr_mod 0x%08x " \ - "slid 0x%08x src QP%d, sl %d pkey 0x%04x", + "slid 0x%08x src QP%d, sl %d", __entry->dev_index, __entry->port_num, __entry->qp_num, __entry->wc_status, __entry->length, @@ -244,7 +239,7 @@ 
TRACE_EVENT(ib_mad_recv_done_handler, be16_to_cpu(__entry->class_specific), be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id), be32_to_cpu(__entry->attr_mod), - __entry->slid, __entry->src_qp, __entry->sl, __entry->pkey + __entry->slid, __entry->src_qp, __entry->sl ) ); -- cgit v1.2.3 From 0266a177631d4c6b963b5b12dd986a8c5abdbf06 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:30 -0700 Subject: RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA). Co-developed-by: Ajay Sharma Signed-off-by: Ajay Sharma Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-13-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- MAINTAINERS | 9 + drivers/infiniband/Kconfig | 1 + drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/mana/Kconfig | 10 + drivers/infiniband/hw/mana/Makefile | 4 + drivers/infiniband/hw/mana/cq.c | 79 +++++ drivers/infiniband/hw/mana/device.c | 117 +++++++ drivers/infiniband/hw/mana/main.c | 521 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 162 ++++++++++ drivers/infiniband/hw/mana/mr.c | 198 ++++++++++++ drivers/infiniband/hw/mana/qp.c | 506 +++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/wq.c | 115 +++++++ include/net/mana/mana.h | 3 + include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + include/uapi/rdma/mana-abi.h | 66 ++++ 15 files changed, 1793 insertions(+) create mode 100644 drivers/infiniband/hw/mana/Kconfig create mode 100644 drivers/infiniband/hw/mana/Makefile create mode 100644 drivers/infiniband/hw/mana/cq.c create mode 100644 drivers/infiniband/hw/mana/device.c create mode 100644 drivers/infiniband/hw/mana/main.c create mode 100644 drivers/infiniband/hw/mana/mana_ib.h create mode 100644 drivers/infiniband/hw/mana/mr.c create mode 100644 drivers/infiniband/hw/mana/qp.c create mode 100644 drivers/infiniband/hw/mana/wq.c create mode 100644 include/uapi/rdma/mana-abi.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 441a65d41eb4..4db8e4e02c05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13669,6 +13669,15 @@ F: drivers/scsi/smartpqi/smartpqi*.[ch] F: include/linux/cciss*.h F: include/uapi/linux/cciss*.h +MICROSOFT MANA RDMA DRIVER +M: Long Li +M: Ajay Sharma +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/mana/ +F: include/net/mana +F: include/uapi/rdma/mana-abi.h + MICROSOFT SURFACE AGGREGATOR TABLET-MODE SWITCH M: Maximilian Luz L: platform-driver-x86@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index aa36ac618e72..ccc874478f0b 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,7 @@ source "drivers/infiniband/hw/erdma/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" +source "drivers/infiniband/hw/mana/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" source "drivers/infiniband/hw/mthca/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 6b3a88046125..1211f4317a9f 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_QIB) += qib/ obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ obj-$(CONFIG_INFINIBAND_EFA) += efa/ obj-$(CONFIG_INFINIBAND_IRDMA) += irdma/ +obj-$(CONFIG_MANA_INFINIBAND) += mana/ obj-$(CONFIG_MLX4_INFINIBAND) += 
mlx4/ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ diff --git a/drivers/infiniband/hw/mana/Kconfig b/drivers/infiniband/hw/mana/Kconfig new file mode 100644 index 000000000000..546640657bac --- /dev/null +++ b/drivers/infiniband/hw/mana/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config MANA_INFINIBAND + tristate "Microsoft Azure Network Adapter support" + depends on NETDEVICES && ETHERNET && PCI && MICROSOFT_MANA + help + This driver provides low-level RDMA support for Microsoft Azure + Network Adapter (MANA). MANA supports RDMA features that can be used + for workloads (e.g. DPDK, MPI etc) that uses RDMA verbs to directly + access hardware from user-mode processes in Microsoft Azure cloud + environment. diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile new file mode 100644 index 000000000000..88655fe5e398 --- /dev/null +++ b/drivers/infiniband/hw/mana/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o + +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c new file mode 100644 index 000000000000..d141cab8a1e6 --- /dev/null +++ b/drivers/infiniband/hw/mana/cq.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_create_cq ucmd = {}; + struct mana_ib_dev *mdev; + int err; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy from udata for create cq, %d\n", err); + return err; + } + + if (attr->cqe > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } + + cq->cqe = attr->cqe; + cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + err = PTR_ERR(cq->umem); + ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", + err); + return err; + } + + err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region); + if (err) { + ibdev_dbg(ibdev, + "Failed to create dma region for create cq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, cq->gdma_region); + + /* + * The CQ ID is not known at this time. 
The ID is generated at create_qp + */ + + return 0; + +err_release_umem: + ib_umem_release(cq->umem); + return err; +} + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); + ib_umem_release(cq->umem); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c new file mode 100644 index 000000000000..d4541b8707e4 --- /dev/null +++ b/drivers/infiniband/hw/mana/device.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" +#include + +MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(NET_MANA); + +static const struct ib_device_ops mana_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MANA, + .uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION, + + .alloc_pd = mana_ib_alloc_pd, + .alloc_ucontext = mana_ib_alloc_ucontext, + .create_cq = mana_ib_create_cq, + .create_qp = mana_ib_create_qp, + .create_rwq_ind_table = mana_ib_create_rwq_ind_table, + .create_wq = mana_ib_create_wq, + .dealloc_pd = mana_ib_dealloc_pd, + .dealloc_ucontext = mana_ib_dealloc_ucontext, + .dereg_mr = mana_ib_dereg_mr, + .destroy_cq = mana_ib_destroy_cq, + .destroy_qp = mana_ib_destroy_qp, + .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, + .destroy_wq = mana_ib_destroy_wq, + .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_port_immutable = mana_ib_get_port_immutable, + .mmap = mana_ib_mmap, + .modify_qp = mana_ib_modify_qp, + .modify_wq = mana_ib_modify_wq, + .query_device = mana_ib_query_device, + .query_gid = mana_ib_query_gid, + .query_port = mana_ib_query_port, + .reg_user_mr = mana_ib_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mana_ib_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mana_ib_rwq_ind_table, + ib_ind_table), +}; + +static int mana_ib_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mana_adev *madev = container_of(adev, struct mana_adev, adev); + struct gdma_dev *mdev = madev->mdev; + struct mana_context *mc; + struct mana_ib_dev *dev; + int ret; + + mc = mdev->driver_data; + + dev = ib_alloc_device(mana_ib_dev, ib_dev); + if (!dev) + return -ENOMEM; + + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops); + + dev->ib_dev.phys_port_cnt = mc->num_ports; + + ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, + mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); + + dev->gdma_dev = mdev; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + + /* + * num_comp_vectors needs to set to the max MSIX index + * when interrupts and event queues are implemented + */ + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dev.parent = mdev->gdma_context->dev; + + ret = ib_register_device(&dev->ib_dev, "mana_%d", + mdev->gdma_context->dev); + if (ret) { + ib_dealloc_device(&dev->ib_dev); + return ret; + } + + dev_set_drvdata(&adev->dev, dev); + + return 0; +} + +static void mana_ib_remove(struct auxiliary_device *adev) +{ + struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + + ib_unregister_device(&dev->ib_dev); 
+ ib_dealloc_device(&dev->ib_dev); +} + +static const struct auxiliary_device_id mana_id_table[] = { + { + .name = "mana.rdma", + }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mana_id_table); + +static struct auxiliary_driver mana_driver = { + .name = "rdma", + .probe = mana_ib_probe, + .remove = mana_ib_remove, + .id_table = mana_id_table, +}; + +module_auxiliary_driver(mana_driver); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c new file mode 100644 index 000000000000..8b3bc302d6f3 --- /dev/null +++ b/drivers/infiniband/hw/mana/main.c @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port) +{ + struct gdma_dev *gd = dev->gdma_dev; + struct mana_port_context *mpc; + struct net_device *ndev; + struct mana_context *mc; + + mc = gd->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count--; + WARN_ON(pd->vport_use_count < 0); + + if (!pd->vport_use_count) + mana_uncfg_vport(mpc); + + mutex_unlock(&pd->vport_mutex); +} + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, + u32 doorbell_id) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + int err; + + mc = mdev->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count++; + if (pd->vport_use_count > 1) { + ibdev_dbg(&dev->ib_dev, + "Skip as this PD is already configured vport\n"); + mutex_unlock(&pd->vport_mutex); + return 0; + } + + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id); + if (err) { + pd->vport_use_count--; + mutex_unlock(&pd->vport_mutex); + + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err); + return err; + } + + mutex_unlock(&pd->vport_mutex); + + pd->tx_shortform_allowed = mpc->tx_shortform_allowed; + pd->tx_vp_offset = mpc->tx_vp_offset; + + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", + mpc->port_handle, pd->pdn, doorbell_id); + + return 0; +} + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_create_pd_resp resp = {}; + struct gdma_create_pd_req req = {}; + enum gdma_pd_flags flags = 0; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), + sizeof(resp)); + + req.flags = flags; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to get pd_id err %d status %u\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + pd->pd_handle = resp.pd_handle; + pd->pdn = resp.pd_id; + ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n", + pd->pd_handle, pd->pdn); + + mutex_init(&pd->vport_mutex); + pd->vport_use_count = 0; + return 0; +} + +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_destory_pd_resp resp = {}; + struct gdma_destroy_pd_req req = {}; 
+ struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req), + sizeof(resp)); + + req.pd_handle = pd->pd_handle; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to destroy pd_handle 0x%llx err %d status %u", + pd->pd_handle, err, resp.hdr.status); + if (!err) + err = -EPROTO; + } + + return err; +} + +static int mana_gd_destroy_doorbell_page(struct gdma_context *gc, + int doorbell_page) +{ + struct gdma_destroy_resource_range_req req = {}; + struct gdma_resp_hdr resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.allocated_resources = doorbell_page; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.status) { + dev_err(gc->dev, + "Failed to destroy doorbell page: ret %d, 0x%x\n", + err, resp.status); + return err ?: -EPROTO; + } + + return 0; +} + +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, + int *doorbell_page) +{ + struct gdma_allocate_resource_range_req req = {}; + struct gdma_allocate_resource_range_resp resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.alignment = 1; + + /* Have GDMA start searching from 0 */ + req.allocated_resources = 0; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, + "Failed to allocate doorbell page: ret %d, 0x%x\n", + err, resp.hdr.status); + return err ?: -EPROTO; + } + + *doorbell_page = resp.allocated_resources; + + return 0; +} + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata) +{ + struct mana_ib_ucontext *ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + struct gdma_dev *dev; + int doorbell_page; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + dev = mdev->gdma_dev; + gc = dev->gdma_context; + + /* Allocate a doorbell page index */ + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page); + if (ret) { + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret); + return ret; + } + + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page); + + ucontext->doorbell = doorbell_page; + + return 0; +} + +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell); + if (ret) + ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); +} + +static int +mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, + struct gdma_context *gc, + struct gdma_create_dma_region_req *create_req, + size_t num_pages, mana_handle_t *gdma_region) +{ + struct gdma_create_dma_region_resp create_resp = 
{}; + unsigned int create_req_msg_size; + int err; + + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages); + create_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, create_req_msg_size, create_req, + sizeof(create_resp), &create_resp); + if (err || create_resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, create_resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + *gdma_region = create_resp.dma_region_handle; + ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n", + *gdma_region); + + return 0; +} + +static int +mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context *gc, + struct gdma_dma_region_add_pages_req *add_req, + unsigned int num_pages, u32 expected_status) +{ + unsigned int add_req_msg_size = + struct_size(add_req, page_addr_list, num_pages); + struct gdma_general_resp add_resp = {}; + int err; + + mana_gd_init_req_hdr(&add_req->hdr, GDMA_DMA_REGION_ADD_PAGES, + add_req_msg_size, sizeof(add_resp)); + add_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, add_req_msg_size, add_req, + sizeof(add_resp), &add_resp); + if (err || add_resp.hdr.status != expected_status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, add_resp.hdr.status); + + if (!err) + err = -EPROTO; + + return err; + } + + return 0; +} + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region) +{ + struct gdma_dma_region_add_pages_req *add_req = NULL; + size_t num_pages_processed = 0, num_pages_to_handle; + struct gdma_create_dma_region_req *create_req; + unsigned int create_req_msg_size; + struct hw_channel_context *hwc; + struct ib_block_iter biter; + size_t max_pgs_add_cmd = 0; + size_t max_pgs_create_cmd; + struct gdma_context *gc; + size_t num_pages_total; + struct gdma_dev *mdev; + unsigned long page_sz; + unsigned int tail = 0; + u64 *page_addr_list; + void *request_buf; + int err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + hwc = gc->hwc.driver_data; + + /* Hardware requires dma region to align to chosen page size */ + page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0); + if (!page_sz) { + ibdev_dbg(&dev->ib_dev, "failed to find page size.\n"); + return -ENOMEM; + } + num_pages_total = ib_umem_num_dma_blocks(umem, page_sz); + + max_pgs_create_cmd = + (hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64); + num_pages_to_handle = + min_t(size_t, num_pages_total, max_pgs_create_cmd); + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages_to_handle); + + request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL); + if (!request_buf) + return -ENOMEM; + + create_req = request_buf; + mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION, + create_req_msg_size, + sizeof(struct gdma_create_dma_region_resp)); + + create_req->length = umem->length; + create_req->offset_in_page = umem->address & (page_sz - 1); + create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->page_count = num_pages_total; + + ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", + umem->length, num_pages_total); + + ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n", + page_sz, create_req->offset_in_page); + + ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u", + num_pages_to_handle, create_req->gdma_page_type); + + page_addr_list = create_req->page_addr_list; + rdma_umem_for_each_dma_block(umem, 
&biter, page_sz) { + page_addr_list[tail++] = rdma_block_iter_dma_address(&biter); + if (tail < num_pages_to_handle) + continue; + + if (!num_pages_processed) { + /* First create message */ + err = mana_ib_gd_first_dma_region(dev, gc, create_req, + tail, gdma_region); + if (err) + goto out; + + max_pgs_add_cmd = (hwc->max_req_msg_size - + sizeof(*add_req)) / sizeof(u64); + + add_req = request_buf; + add_req->dma_region_handle = *gdma_region; + add_req->reserved3 = 0; + page_addr_list = add_req->page_addr_list; + } else { + /* Subsequent create messages */ + u32 expected_s = 0; + + if (num_pages_processed + num_pages_to_handle < + num_pages_total) + expected_s = GDMA_STATUS_MORE_ENTRIES; + + err = mana_ib_gd_add_dma_region(dev, gc, add_req, tail, + expected_s); + if (err) + break; + } + + num_pages_processed += tail; + tail = 0; + + /* The remaining pages to create */ + num_pages_to_handle = + min_t(size_t, + num_pages_total - num_pages_processed, + max_pgs_add_cmd); + } + + if (err) + mana_ib_gd_destroy_dma_region(dev, *gdma_region); + +out: + kfree(request_buf); + return err; +} + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + + gc = mdev->gdma_context; + ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region); + + return mana_gd_destroy_dma_region(gc, gdma_region); +} + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + phys_addr_t pfn; + pgprot_t prot; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + if (vma->vm_pgoff != 0) { + ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff); + return -EINVAL; + } + + /* Map to the page indexed by ucontext->doorbell */ + pfn = (gc->phys_db_page_base + + gc->db_page_size * mana_ucontext->doorbell) >> + PAGE_SHIFT; + prot = pgprot_writecombine(vma->vm_page_prot); + + ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + NULL); + if (ret) + ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); + else + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", + pfn, gc->db_page_size, ret); + + return ret; +} + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + /* + * This version only support RAW_PACKET + * other values need to be filled for other types + */ + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + props->max_qp = MANA_MAX_NUM_QUEUES; + props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE; + + /* + * max_cqe could be potentially much bigger. 
+ * As this version of driver only support RAW QP, set it to the same + * value as max_qp_wr + */ + props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE; + + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->max_mr = MANA_IB_MAX_MR; + props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES; + props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES; + + return 0; +} + +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + /* This version doesn't return port properties */ + return 0; +} + +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid) +{ + /* This version doesn't return GID properties */ + return 0; +} + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h new file mode 100644 index 000000000000..502cc8672eef --- /dev/null +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. + */ + +#ifndef _MANA_IB_H_ +#define _MANA_IB_H_ + +#include +#include +#include +#include +#include + +#include + +#define PAGE_SZ_BM \ + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ + SZ_512K | SZ_1M | SZ_2M) + +/* MANA doesn't have any limit for MR size */ +#define MANA_IB_MAX_MR_SIZE U64_MAX + +/* + * The hardware limit of number of MRs is greater than maximum number of MRs + * that can possibly represent in 24 bits + */ +#define MANA_IB_MAX_MR 0xFFFFFFu + +struct mana_ib_dev { + struct ib_device ib_dev; + struct gdma_dev *gdma_dev; +}; + +struct mana_ib_wq { + struct ib_wq ibwq; + struct ib_umem *umem; + int wqe; + u32 wq_buf_size; + u64 gdma_region; + u64 id; + mana_handle_t rx_object; +}; + +struct mana_ib_pd { + struct ib_pd ibpd; + u32 pdn; + mana_handle_t pd_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + bool tx_shortform_allowed; + u32 tx_vp_offset; +}; + +struct mana_ib_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + mana_handle_t mr_handle; +}; + +struct mana_ib_cq { + struct ib_cq ibcq; + struct ib_umem *umem; + int cqe; + u64 gdma_region; + u64 id; +}; + +struct mana_ib_qp { + struct ib_qp ibqp; + + /* Work queue info */ + struct ib_umem *sq_umem; + int sqe; + u64 sq_gdma_region; + u64 sq_id; + mana_handle_t tx_object; + + /* The port on the IB device, starting with 1 */ + u32 port; +}; + +struct mana_ib_ucontext { + struct ib_ucontext ibucontext; + u32 doorbell; +}; + +struct mana_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_ind_table; +}; + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region); + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, + mana_handle_t gdma_region); + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata); + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl); + +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags); + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, 
int access_flags, + struct ib_udata *udata); + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); + +int mana_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id, + struct mana_ib_pd *pd, u32 doorbell_id); +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port); + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata); +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma); + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable); +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw); +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props); +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid); + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); + +#endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c new file mode 100644 index 000000000000..a56236cdd9ee --- /dev/null +++ b/drivers/infiniband/hw/mana/mr.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +#define VALID_MR_FLAGS \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) + +static enum gdma_mr_access_flags +mana_ib_verbs_to_gdma_access_flags(int access_flags) +{ + enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ; + + if (access_flags & IB_ACCESS_LOCAL_WRITE) + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_WRITE) + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + flags |= GDMA_ACCESS_FLAG_REMOTE_READ; + + return flags; +} + +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, + struct gdma_create_mr_params *mr_params) +{ + struct gdma_create_mr_response resp = {}; + struct gdma_create_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), + sizeof(resp)); + req.pd_handle = mr_params->pd_handle; + req.mr_type = mr_params->mr_type; + + switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GVA: + req.gva.dma_region_handle = mr_params->gva.dma_region_handle; + req.gva.virtual_address = mr_params->gva.virtual_address; + req.gva.access_flags = mr_params->gva.access_flags; + break; + + default: + ibdev_dbg(&dev->ib_dev, + "invalid param (GDMA_MR_TYPE) passed, type %d\n", + req.mr_type); + return -EINVAL; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + mr->ibmr.lkey = resp.lkey; + mr->ibmr.rkey = resp.rkey; + mr->mr_handle = resp.mr_handle; + + return 0; +} + +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, + gdma_obj_handle_t mr_handle) +{ + struct gdma_destroy_mr_response resp = {}; + struct gdma_destroy_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req), + sizeof(resp)); + + req.mr_handle = mr_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + return err; + } + + return 0; +} + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params mr_params = {}; + struct ib_device *ibdev = ibpd->device; + gdma_obj_handle_t dma_region_handle; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + ibdev_dbg(ibdev, + "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", + start, iova, length, access_flags); + + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(ibdev, start, length, access_flags); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + ibdev_dbg(ibdev, + "Failed to get umem for register user-mr, %d\n", err); + goto err_free; + } + + err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto 
err_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region %llx\n", err, + dma_region_handle); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. + */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context, + dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); + struct ib_device *ibdev = ibmr->device; + struct mana_ib_dev *dev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + err = mana_ib_gd_destroy_mr(dev, mr->mr_handle); + if (err) + return err; + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c new file mode 100644 index 000000000000..ea15ec77e321 --- /dev/null +++ b/drivers/infiniband/hw/mana/qp.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, + struct net_device *ndev, + mana_handle_t default_rxobj, + mana_handle_t ind_table[], + u32 log_ind_tbl_size, u32 rx_hash_key_len, + u8 *rx_hash_key) +{ + struct mana_port_context *mpc = netdev_priv(ndev); + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + mana_handle_t *req_indir_tab; + struct gdma_context *gc; + struct gdma_dev *mdev; + u32 req_buf_size; + int i, err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + + req_buf_size = + sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req = kzalloc(req_buf_size, GFP_KERNEL); + if (!req) + return -ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = mpc->port_handle; + req->rx_enable = 1; + req->update_default_rxobj = 1; + req->default_rxobj = default_rxobj; + req->hdr.dev_id = mdev->dev_id; + + /* If there are more than 1 entries in indirection table, enable RSS */ + if (log_ind_tbl_size) + req->rss_enable = true; + + req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; + req->indir_tab_offset = sizeof(*req); + req->update_indir_tab = true; + + req_indir_tab = (mana_handle_t *)(req + 1); + /* The ind table passed to the hardware must have + * MANA_INDIRECT_TABLE_SIZE entries. 
Adjust the verb + * ind_table to MANA_INDIRECT_TABLE_SIZE if required + */ + ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, + req_indir_tab[i]); + } + + req->update_hashkey = true; + if (rx_hash_key_len) + memcpy(req->hashkey, rx_hash_key, rx_hash_key_len); + else + netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE); + + ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n", + req->vport, default_rxobj); + + err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp); + if (err) { + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + netdev_err(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = -EPROTO; + goto out; + } + + netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n", + mpc->port_handle, log_ind_tbl_size); + +out: + kfree(req); + return err; +} + +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; + struct mana_ib_create_qp_rss_resp resp = {}; + struct mana_ib_create_qp_rss ucmd = {}; + struct gdma_dev *gd = mdev->gdma_dev; + mana_handle_t *mana_ind_table; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_cq *cq; + struct mana_ib_wq *wq; + unsigned int ind_tbl_size; + struct ib_cq *ibcq; + struct ib_wq *ibwq; + int i = 0; + u32 port; + int ret; + + mc = gd->driver_data; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy from udata for create rss-qp, err %d\n", + ret); + return ret; + } + + if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_wr %d exceeding limit\n", + attr->cap.max_recv_wr); + return -EINVAL; + } + + if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_sge %d exceeding limit\n", + attr->cap.max_recv_sge); + return -EINVAL; + } + + ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size; + if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) { + ibdev_dbg(&mdev->ib_dev, + "Indirect table size %d exceeding limit\n", + ind_tbl_size); + return -EINVAL; + } + + if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) { + ibdev_dbg(&mdev->ib_dev, + "RX Hash function is not supported, %d\n", + ucmd.rx_hash_function); + return -EINVAL; + } + + /* IB ports start with 1, MANA start with 0 */ + port = ucmd.port; + if (port < 1 || port > mc->num_ports) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n", + port); + return -EINVAL; + } + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + + ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n", + ucmd.rx_hash_function, port); + + mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t), + GFP_KERNEL); + if (!mana_ind_table) { + ret = -ENOMEM; + goto fail; + } + + qp->port = port; + + for (i = 0; i < ind_tbl_size; i++) { + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + + ibwq = ind_tbl->ind_tbl[i]; 
+ wq = container_of(ibwq, struct mana_ib_wq, ibwq); + + ibcq = ibwq->cq; + cq = container_of(ibcq, struct mana_ib_cq, ibcq); + + wq_spec.gdma_region = wq->gdma_region; + wq_spec.queue_size = wq->wq_buf_size; + + cq_spec.gdma_region = cq->gdma_region; + cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, + &wq_spec, &cq_spec, &wq->rx_object); + if (ret) + goto fail; + + /* The GDMA regions are now owned by the WQ object */ + wq->gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_region = GDMA_INVALID_DMA_REGION; + + wq->id = wq_spec.queue_index; + cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", + ret, wq->rx_object, wq->id, cq->id); + + resp.entries[i].cqid = cq->id; + resp.entries[i].wqid = wq->id; + + mana_ind_table[i] = wq->rx_object; + } + resp.num_entries = i; + + ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object, + mana_ind_table, + ind_tbl->log_ind_tbl_size, + ucmd.rx_hash_key_len, + ucmd.rx_hash_key); + if (ret) + goto fail; + + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy to udata create rss-qp, %d\n", + ret); + goto fail; + } + + kfree(mana_ind_table); + + return 0; + +fail: + while (i-- > 0) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + kfree(mana_ind_table); + + return ret; +} + +static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_cq *send_cq = + container_of(attr->send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_ucontext *mana_ucontext = + rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + struct mana_ib_create_qp_resp resp = {}; + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_ib_create_qp ucmd = {}; + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct ib_umem *umem; + int err; + u32 port; + + mc = gd->driver_data; + + if (!mana_ucontext || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata create qp-raw, %d\n", err); + return err; + } + + /* IB ports start with 1, MANA Ethernet ports start with 0 */ + port = ucmd.port; + if (ucmd.port > mc->num_ports) + return -EINVAL; + + if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_wr %d exceeding limit\n", + attr->cap.max_send_wr); + return -EINVAL; + } + + if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_sge %d exceeding limit\n", + attr->cap.max_send_sge); + return -EINVAL; + } + + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc); + + err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell); + if (err) + return -ENODEV; + + qp->port = port; + + ibdev_dbg(&mdev->ib_dev, "ucmd 
sq_buf_addr 0x%llx port %u\n", + ucmd.sq_buf_addr, ucmd.port); + + umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create qp-raw, err %d\n", + err); + goto err_free_vport; + } + qp->sq_umem = umem; + + err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem, + &qp->sq_gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create qp-raw, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, qp->sq_gdma_region); + + /* Create a WQ on the same port handle used by the Ethernet */ + wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.queue_size = ucmd.sq_buf_size; + + cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, + &cq_spec, &qp->tx_object); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create wq for create raw-qp, err %d\n", + err); + goto err_destroy_dma_region; + } + + /* The GDMA regions are now owned by the WQ object */ + qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + + qp->sq_id = wq_spec.queue_index; + send_cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, + qp->tx_object, qp->sq_id, send_cq->id); + + resp.sqid = qp->sq_id; + resp.cqid = send_cq->id; + resp.tx_vp_offset = pd->tx_vp_offset; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy udata for create qp-raw, %d\n", + err); + goto err_destroy_wq_obj; + } + + return 0; + +err_destroy_wq_obj: + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + +err_destroy_dma_region: + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + +err_release_umem: + ib_umem_release(umem); + +err_free_vport: + mana_ib_uncfg_vport(mdev, pd, port - 1); + + return err; +} + +int mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + switch (attr->qp_type) { + case IB_QPT_RAW_PACKET: + /* When rwq_ind_tbl is used, it's for creating WQs for RSS */ + if (attr->rwq_ind_tbl) + return mana_ib_create_qp_rss(ibqp, ibqp->pd, attr, + udata); + + return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + default: + /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ + ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", + attr->qp_type); + } + + return -EINVAL; +} + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + /* modify_qp is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, + struct ib_rwq_ind_table *ind_tbl, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_wq *wq; + struct ib_wq *ibwq; + int i; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct 
mana_ib_wq, ibwq); + ibdev_dbg(&mdev->ib_dev, "destroying wq->rx_object %llu\n", + wq->rx_object); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + return 0; +} + +static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct ib_pd *ibpd = qp->ibqp.pd; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_pd *pd; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + pd = container_of(ibpd, struct mana_ib_pd, ibpd); + + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + + if (qp->sq_umem) { + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + ib_umem_release(qp->sq_umem); + } + + mana_ib_uncfg_vport(mdev, pd, qp->port - 1); + + return 0; +} + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + switch (ibqp->qp_type) { + case IB_QPT_RAW_PACKET: + if (ibqp->rwq_ind_tbl) + return mana_ib_destroy_qp_rss(qp, ibqp->rwq_ind_tbl, + udata); + + return mana_ib_destroy_qp_raw(qp, udata); + + default: + ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", + ibqp->qp_type); + } + + return -ENOENT; +} diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c new file mode 100644 index 000000000000..372d361510e0 --- /dev/null +++ b/drivers/infiniband/hw/mana/wq.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_create_wq ucmd = {}; + struct mana_ib_wq *wq; + struct ib_umem *umem; + int err; + + if (udata->inlen < sizeof(ucmd)) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata for create wq, %d\n", err); + return ERR_PTR(err); + } + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return ERR_PTR(-ENOMEM); + + ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); + + umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create wq, err %d\n", err); + goto err_free_wq; + } + + wq->umem = umem; + wq->wqe = init_attr->max_wr; + wq->wq_buf_size = ucmd.wq_buf_size; + wq->rx_object = INVALID_MANA_HANDLE; + + err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create wq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, wq->gdma_region); + + /* WQ ID is returned at wq_create time, doesn't know the value yet */ + + return &wq->ibwq; + +err_release_umem: + ib_umem_release(umem); + +err_free_wq: + kfree(wq); + + return ERR_PTR(err); +} + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + /* modify_wq is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +int 
mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) +{ + struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq, ibwq); + struct ib_device *ib_dev = ibwq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); + ib_umem_release(wq->umem); + + kfree(wq); + + return 0; +} + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 713a8f8cca9a..20212ffeefb9 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -412,6 +412,9 @@ int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); extern const struct ethtool_ops mana_ethtool_ops; +/* A CQ can be created not associated with any EQ */ +#define GDMA_CQ_NO_EQ 0xffff + struct mana_obj_spec { u32 queue_index; u64 gdma_region; diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 7dd56210226f..e0c25537fd2e 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -251,6 +251,7 @@ enum rdma_driver_id { RDMA_DRIVER_EFA, RDMA_DRIVER_SIW, RDMA_DRIVER_ERDMA, + RDMA_DRIVER_MANA, }; enum ib_uverbs_gid_type { diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h new file mode 100644 index 000000000000..5fcb31b37fb9 --- /dev/null +++ b/include/uapi/rdma/mana-abi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#ifndef MANA_ABI_USER_H +#define MANA_ABI_USER_H + +#include +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MANA_IB_UVERBS_ABI_VERSION 1 + +struct mana_ib_create_cq { + __aligned_u64 buf_addr; +}; + +struct mana_ib_create_qp { + __aligned_u64 sq_buf_addr; + __u32 sq_buf_size; + __u32 port; +}; + +struct mana_ib_create_qp_resp { + __u32 sqid; + __u32 cqid; + __u32 tx_vp_offset; + __u32 reserved; +}; + +struct mana_ib_create_wq { + __aligned_u64 wq_buf_addr; + __u32 wq_buf_size; + __u32 reserved; +}; + +/* RX Hash function flags */ +enum mana_ib_rx_hash_function_flags { + MANA_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +struct mana_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 reserved[7]; + __u32 rx_hash_key_len; + __u8 rx_hash_key[40]; + __u32 port; +}; + +struct rss_resp_entry { + __u32 cqid; + __u32 wqid; +}; + +struct mana_ib_create_qp_rss_resp { + __aligned_u64 num_entries; + struct rss_resp_entry entries[64]; +}; + +#endif -- cgit v1.2.3 From 3574cfdca28543e2e8db649297cd6659ea8e4bb8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 11 Nov 2022 11:55:29 +0200 Subject: RDMA/mana: Remove redefinition of basic u64 type gdma_obj_handle_t is no more than redefinition of basic u64 type. Remove such obfuscation. 
Link: https://lore.kernel.org/r/3c1e821279e6a165d058655d2343722d6650e776.1668160486.git.leonro@nvidia.com Acked-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mr.c | 5 ++-- drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +-- include/net/mana/gdma.h | 31 +++++++++++-------------- 3 files changed, 17 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index a56236cdd9ee..351207c60eb6 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -73,8 +73,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, return 0; } -static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, - gdma_obj_handle_t mr_handle) +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle) { struct gdma_destroy_mr_response resp = {}; struct gdma_destroy_mr_request req = {}; @@ -108,9 +107,9 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); struct gdma_create_mr_params mr_params = {}; struct ib_device *ibdev = ibpd->device; - gdma_obj_handle_t dma_region_handle; struct mana_ib_dev *dev; struct mana_ib_mr *mr; + u64 dma_region_handle; int err; dev = container_of(ibdev, struct mana_ib_dev, ib_dev); diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 46a7d1e6ece9..69224ff8efb6 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -671,8 +671,7 @@ free_q: return err; } -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle) +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) { struct gdma_destroy_dma_region_req req = {}; struct gdma_general_resp resp = {}; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 221adc96340c..a9fdae14d24c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -65,8 +65,6 @@ enum { GDMA_DEVICE_MANA = 2, }; -typedef u64 gdma_obj_handle_t; - struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -200,7 +198,7 @@ struct gdma_mem_info { u64 length; /* Allocated by the PF driver */ - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; #define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 @@ -624,7 +622,7 @@ struct gdma_create_queue_req { u32 reserved1; u32 pdid; u32 doolbell_id; - gdma_obj_handle_t gdma_region; + u64 gdma_region; u32 reserved2; u32 queue_size; u32 log2_throttle_limit; @@ -699,14 +697,14 @@ struct gdma_create_dma_region_req { struct gdma_create_dma_region_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ /* GDMA_DMA_REGION_ADD_PAGES */ struct gdma_dma_region_add_pages_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u32 page_addr_list_len; u32 reserved3; @@ -718,7 +716,7 @@ struct gdma_dma_region_add_pages_req { struct gdma_destroy_dma_region_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ enum gdma_pd_flags { @@ -733,14 +731,14 @@ struct gdma_create_pd_req { struct gdma_create_pd_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; u32 pd_id; u32 reserved; };/* HW DATA */ struct gdma_destroy_pd_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 
pd_handle; };/* HW DATA */ struct gdma_destory_pd_resp { @@ -756,11 +754,11 @@ enum gdma_mr_type { }; struct gdma_create_mr_params { - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -769,13 +767,13 @@ struct gdma_create_mr_params { struct gdma_create_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; u32 reserved_1; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -786,14 +784,14 @@ struct gdma_create_mr_request { struct gdma_create_mr_response { struct gdma_resp_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; u32 lkey; u32 rkey; };/* HW DATA */ struct gdma_destroy_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; };/* HW DATA */ struct gdma_destroy_mr_response { @@ -827,7 +825,6 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi); int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle); +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); #endif /* _GDMA_H */ -- cgit v1.2.3 From 0c5e259b06a8efc69f929ad777ea49281bb58e37 Mon Sep 17 00:00:00 2001 From: Luoyouming Date: Tue, 8 Nov 2022 21:38:47 +0800 Subject: RDMA/hns: Fix incorrect sge nums calculation The user usually configures the number of sges with the max_send_sge parameter when creating a QP, and configures the maximum size of inline data that can be sent with max_inline_data. Inline data is sent by filling sges. The expected behavior is: 1) when the sge space cannot hold the inline data, the sge space is expanded to accommodate all of the inline data; 2) when the sge space is large enough for the inline data, the upper limit of inline data is raised so that users can send larger inline payloads. Case one is currently not implemented: sending inline data larger than the sge space fails with an insufficient-sge-space error, so this part of the code is reimplemented according to the expected rules. The sge num is now calculated as the maximum of the value derived from max_send_sge and the value needed to hold max_inline_data, which solves the problem.
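To make the new rule concrete, here is a simplified sketch of the accounting, condensed from the hns_roce_qp.c hunk below (the helper name is illustrative and merges the patch's two helpers; HNS_ROCE_SGE_IN_WQE, HNS_ROCE_SGE_SIZE, roundup_pow_of_two() and max() are the driver's and the kernel's):

  /*
   * Extended sge demand is the larger of what max_send_sge asks for
   * beyond the standard WQE sges and what max_inline_data needs.
   */
  static u32 sketch_ext_sge_cnt(bool is_ud_or_gsi, u32 max_send_sge,
                                u32 max_inline_data)
  {
          u32 std_sge = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE;
          u32 from_sge = max_send_sge > std_sge ?
                         max_send_sge - std_sge : (is_ud_or_gsi ? 1 : 0);
          u32 from_inl = max_inline_data ?
                         roundup_pow_of_two(max_inline_data) /
                         HNS_ROCE_SGE_SIZE : 0;

          /* Inline data that fits in the standard sges needs no extension */
          if (!is_ud_or_gsi && from_inl <= HNS_ROCE_SGE_IN_WQE)
                  from_inl = 0;

          return max(from_sge, from_inl);
  }

On top of this, the actual driver code rounds the resulting count up to a power of two and clamps sq.max_gs against the hardware limit in caps.max_sq_sg, as the hunk below shows.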
Fixes: 05201e01be93 ("RDMA/hns: Refactor process of setting extended sge") Fixes: 30b707886aeb ("RDMA/hns: Support inline data in extented sge space for RC") Link: https://lore.kernel.org/r/20221108133847.2304539-3-xuhaoyue1@hisilicon.com Signed-off-by: Luoyouming Signed-off-by: Haoyue Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +--- drivers/infiniband/hw/hns/hns_roce_main.c | 18 ++++- drivers/infiniband/hw/hns/hns_roce_qp.c | 107 +++++++++++++++++++++++----- include/uapi/rdma/hns-abi.h | 15 ++++ 5 files changed, 125 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 723e55a7de8d..f701cc86896b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -202,6 +202,7 @@ struct hns_roce_ucontext { struct list_head page_list; struct mutex page_mutex; struct hns_user_mmap_entry *db_mmap_entry; + u32 config; }; struct hns_roce_pd { @@ -334,6 +335,7 @@ struct hns_roce_wq { u32 head; u32 tail; void __iomem *db_reg; + u32 ext_sge_cnt; }; struct hns_roce_sge { @@ -635,6 +637,7 @@ struct hns_roce_qp { struct list_head rq_node; /* all recv qps are on a list */ struct list_head sq_node; /* all send qps are on a list */ struct hns_user_mmap_entry *dwqe_mmap_entry; + u32 config; }; struct hns_roce_ib_iboe { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dcb59c05edfd..939811867249 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -188,14 +188,6 @@ static void set_atomic_seg(const struct ib_send_wr *wr, hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); } -static unsigned int get_std_sge_num(struct hns_roce_qp *qp) -{ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return 0; - - return HNS_ROCE_SGE_IN_WQE; -} - static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_idx, u32 msg_len) @@ -203,14 +195,12 @@ static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev; unsigned int left_len_in_pg; unsigned int idx = *sge_idx; - unsigned int std_sge_num; unsigned int i = 0; unsigned int len; void *addr; void *dseg; - std_sge_num = get_std_sge_num(qp); - if (msg_len > (qp->sq.max_gs - std_sge_num) * HNS_ROCE_SGE_SIZE) { + if (msg_len > qp->sq.ext_sge_cnt * HNS_ROCE_SGE_SIZE) { ibdev_err(ibdev, "no enough extended sge space for inline data.\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index dcf89689a4c6..8ba68ac12388 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -354,10 +354,11 @@ static int hns_roce_alloc_uar_entry(struct ib_ucontext *uctx) static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - int ret; struct hns_roce_ucontext *context = to_hr_ucontext(uctx); - struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); + struct hns_roce_ib_alloc_ucontext_resp resp = {}; + struct hns_roce_ib_alloc_ucontext ucmd = {}; + int ret; if (!hr_dev->active) return -EAGAIN; @@ -365,6 +366,19 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, resp.qp_tab_size = hr_dev->caps.num_qps; 
resp.srq_tab_size = hr_dev->caps.num_srqs; + ret = ib_copy_from_udata(&ucmd, udata, + min(udata->inlen, sizeof(ucmd))); + if (ret) + return ret; + + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + context->config = ucmd.config & HNS_ROCE_EXSGE_FLAGS; + + if (context->config & HNS_ROCE_EXSGE_FLAGS) { + resp.config |= HNS_ROCE_RSP_EXSGE_FLAGS; + resp.max_inline_data = hr_dev->caps.max_sq_inline; + } + ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index f0bd82a18069..0ae335fb205c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -476,38 +476,109 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, return 0; } -static u32 get_wqe_ext_sge_cnt(struct hns_roce_qp *qp) +static u32 get_max_inline_data(struct hns_roce_dev *hr_dev, + struct ib_qp_cap *cap) { - /* GSI/UD QP only has extended sge */ - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_UD) - return qp->sq.max_gs; - - if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) - return qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE; + if (cap->max_inline_data) { + cap->max_inline_data = roundup_pow_of_two(cap->max_inline_data); + return min(cap->max_inline_data, + hr_dev->caps.max_sq_inline); + } return 0; } +static void update_inline_data(struct hns_roce_qp *hr_qp, + struct ib_qp_cap *cap) +{ + u32 sge_num = hr_qp->sq.ext_sge_cnt; + + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + if (!(hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD)) + sge_num = max((u32)HNS_ROCE_SGE_IN_WQE, sge_num); + + cap->max_inline_data = max(cap->max_inline_data, + sge_num * HNS_ROCE_SGE_SIZE); + } + + hr_qp->max_inline_data = cap->max_inline_data; +} + +static u32 get_sge_num_from_max_send_sge(bool is_ud_or_gsi, + u32 max_send_sge) +{ + unsigned int std_sge_num; + unsigned int min_sge; + + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + min_sge = is_ud_or_gsi ? 1 : 0; + return max_send_sge > std_sge_num ? (max_send_sge - std_sge_num) : + min_sge; +} + +static unsigned int get_sge_num_from_max_inl_data(bool is_ud_or_gsi, + u32 max_inline_data) +{ + unsigned int inline_sge; + + inline_sge = roundup_pow_of_two(max_inline_data) / HNS_ROCE_SGE_SIZE; + + /* + * if max_inline_data less than + * HNS_ROCE_SGE_IN_WQE * HNS_ROCE_SGE_SIZE, + * In addition to ud's mode, no need to extend sge. + */ + if (!is_ud_or_gsi && inline_sge <= HNS_ROCE_SGE_IN_WQE) + inline_sge = 0; + + return inline_sge; +} + static void set_ext_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt, struct hns_roce_qp *hr_qp, struct ib_qp_cap *cap) { + bool is_ud_or_gsi = (hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD); + unsigned int std_sge_num; + u32 inline_ext_sge = 0; + u32 ext_wqe_sge_cnt; u32 total_sge_cnt; - u32 wqe_sge_cnt; + + cap->max_inline_data = get_max_inline_data(hr_dev, cap); hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT; + std_sge_num = is_ud_or_gsi ? 0 : HNS_ROCE_SGE_IN_WQE; + ext_wqe_sge_cnt = get_sge_num_from_max_send_sge(is_ud_or_gsi, + cap->max_send_sge); - hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + if (hr_qp->config & HNS_ROCE_EXSGE_FLAGS) { + inline_ext_sge = max(ext_wqe_sge_cnt, + get_sge_num_from_max_inl_data(is_ud_or_gsi, + cap->max_inline_data)); + hr_qp->sq.ext_sge_cnt = inline_ext_sge ? 
+ roundup_pow_of_two(inline_ext_sge) : 0; - wqe_sge_cnt = get_wqe_ext_sge_cnt(hr_qp); + hr_qp->sq.max_gs = max(1U, (hr_qp->sq.ext_sge_cnt + std_sge_num)); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + + ext_wqe_sge_cnt = hr_qp->sq.ext_sge_cnt; + } else { + hr_qp->sq.max_gs = max(1U, cap->max_send_sge); + hr_qp->sq.max_gs = min(hr_qp->sq.max_gs, hr_dev->caps.max_sq_sg); + hr_qp->sq.ext_sge_cnt = hr_qp->sq.max_gs; + } /* If the number of extended sge is not zero, they MUST use the * space of HNS_HW_PAGE_SIZE at least. */ - if (wqe_sge_cnt) { - total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * wqe_sge_cnt); + if (ext_wqe_sge_cnt) { + total_sge_cnt = roundup_pow_of_two(sq_wqe_cnt * ext_wqe_sge_cnt); hr_qp->sge.sge_cnt = max(total_sge_cnt, (u32)HNS_HW_PAGE_SIZE / HNS_ROCE_SGE_SIZE); } + + update_inline_data(hr_qp, cap); } static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev, @@ -556,6 +627,7 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift = ucmd->log_sq_stride; hr_qp->sq.wqe_cnt = cnt; + cap->max_send_sge = hr_qp->sq.max_gs; return 0; } @@ -986,13 +1058,9 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_ib_create_qp *ucmd) { struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_ucontext *uctx; int ret; - if (init_attr->cap.max_inline_data > hr_dev->caps.max_sq_inline) - init_attr->cap.max_inline_data = hr_dev->caps.max_sq_inline; - - hr_qp->max_inline_data = init_attr->cap.max_inline_data; - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -1015,12 +1083,17 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, return ret; } + uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, + ibucontext); + hr_qp->config = uctx->config; ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd); if (ret) ibdev_err(ibdev, "failed to set user SQ size, ret = %d.\n", ret); } else { + if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) + hr_qp->config = HNS_ROCE_EXSGE_FLAGS; ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp); if (ret) ibdev_err(ibdev, diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index f6fde06db4b4..745790ce3c26 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -85,11 +85,26 @@ struct hns_roce_ib_create_qp_resp { __aligned_u64 dwqe_mmap_key; }; +enum { + HNS_ROCE_EXSGE_FLAGS = 1 << 0, +}; + +enum { + HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0, +}; + struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 cqe_size; __u32 srq_tab_size; __u32 reserved; + __u32 config; + __u32 max_inline_data; +}; + +struct hns_roce_ib_alloc_ucontext { + __u32 config; + __u32 reserved; }; struct hns_roce_ib_alloc_pd_resp { -- cgit v1.2.3 From 09f530f0c6d6689eee5e690c6d98f495fcc3a0f9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 23 Nov 2022 20:27:14 -0400 Subject: RDMA: Add netdevice_tracker to ib_device_set_netdev() This will cause an informative backtrace to print if the user of ib_device_set_netdev() isn't careful about tearing down the ibdevice before its netdevice parent is destroyed. For example: unregister_netdevice: waiting for vlan0 to become free. Usage count = 2 leaked reference.
ib_device_set_netdev+0x266/0x730 siw_newlink+0x4e0/0xfd0 nldev_newlink+0x35c/0x5c0 rdma_nl_rcv_msg+0x36d/0x690 rdma_nl_rcv+0x2ee/0x430 netlink_unicast+0x543/0x7f0 netlink_sendmsg+0x918/0xe20 sock_sendmsg+0xcf/0x120 ____sys_sendmsg+0x70d/0x8b0 ___sys_sendmsg+0x11d/0x1b0 __sys_sendmsg+0xfa/0x1d0 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd This will help debug the issues syzkaller is seeing. Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-a7c81b3842ce+e5-netdev_tracker_jgg@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 6 ++++-- include/rdma/ib_verbs.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3409c55ea88b..ff35cebb25e2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2159,14 +2159,16 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, return 0; } + if (old_ndev) + netdev_tracker_free(ndev, &pdata->netdev_tracker); if (ndev) - dev_hold(ndev); + netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); rcu_assign_pointer(pdata->netdev, ndev); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); if (old_ndev) - dev_put(old_ndev); + __dev_put(old_ndev); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1f4d53a4bb6..77dd9148815b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2203,6 +2203,7 @@ struct ib_port_data { struct ib_port_cache cache; struct net_device __rcu *netdev; + netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; -- cgit v1.2.3 From efa2afc3969e166702fd2ae3cfb1a7a195ef3533 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:05 +0000 Subject: RDMA: Extend RDMA user ABI to support atomic write 1) Define new atomic write request/completion in userspace. 2) Define new atomic write capability in userspace. Link: https://lore.kernel.org/r/1669905432-14-2-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_verbs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 43672cb1fd57..237814815544 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -466,6 +466,7 @@ enum ib_uverbs_wc_opcode { IB_UVERBS_WC_BIND_MW = 5, IB_UVERBS_WC_LOCAL_INV = 6, IB_UVERBS_WC_TSO = 7, + IB_UVERBS_WC_ATOMIC_WRITE = 9, }; struct ib_uverbs_wc { @@ -784,6 +785,7 @@ enum ib_uverbs_wr_opcode { IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + IB_UVERBS_WR_ATOMIC_WRITE = 15, /* Review enum ib_wr_opcode before modifying this */ }; @@ -1331,6 +1333,8 @@ enum ib_uverbs_device_cap_flags { /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. 
*/ IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, + /* Atomic write attributes */ + IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, }; enum ib_uverbs_raw_packet_caps { -- cgit v1.2.3 From 3ff81e827b8d5cea36ff374a11c200b4306f45d2 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:06 +0000 Subject: RDMA: Extend RDMA kernel ABI to support atomic write 1) Define the new atomic write request/completion in the kernel. 2) Define the new atomic write capability in the kernel. 3) Define the new atomic write opcode for the RC service in the packet format. Link: https://lore.kernel.org/r/1669905432-14-3-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/rdma/ib_pack.h | 2 ++ include/rdma/ib_verbs.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index a9162f25beaf..f932d164af63 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -84,6 +84,7 @@ enum { /* opcode 0x15 is reserved */ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + IB_OPCODE_ATOMIC_WRITE = 0x1D, /* real constants follow -- see comment about above IB_OPCODE() macro for more details */ @@ -112,6 +113,7 @@ enum { IB_OPCODE(RC, FETCH_ADD), IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + IB_OPCODE(RC, ATOMIC_WRITE), /* UC */ IB_OPCODE(UC, SEND_FIRST), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 77dd9148815b..df6bb26ba0be 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -270,6 +270,7 @@ enum ib_device_cap_flags { /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, + IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { @@ -982,6 +983,7 @@ enum ib_wc_opcode { IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, + IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, @@ -1325,6 +1327,7 @@ enum ib_wr_opcode { IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, -- cgit v1.2.3 From c2d939002934fa9d7b802f196b069963b46da194 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 1 Dec 2022 14:37:07 +0000 Subject: RDMA/rxe: Extend rxe user ABI to support atomic write Define an atomic_wr array to store an 8-byte value.
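For illustration, a hypothetical provider-side helper showing how the 8-byte payload might be staged through the new flex array (only the rxe_send_wqe/rxe_dma_info layout comes from this ABI; the helper itself is a sketch, not code from this series):

  /* Stage the 8-byte atomic write payload into the WQE's dma info. */
  static void stage_atomic_write_payload(struct rxe_send_wqe *wqe,
                                         const __u64 *payload)
  {
          /* atomic_wr is the new __u8 flex array; the payload is exactly 8 bytes */
          memcpy(wqe->dma.atomic_wr, payload, sizeof(*payload));
  }

Using a flex array member of the existing union keeps the payload in the same place the inline data and sge list already live, so the WQE size is unchanged.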
Link: https://lore.kernel.org/r/1669905432-14-4-git-send-email-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 73f679dfd2df..d20d1ecf046f 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -146,6 +146,7 @@ struct rxe_dma_info { __u32 reserved; union { __DECLARE_FLEX_ARRAY(__u8, inline_data); + __DECLARE_FLEX_ARRAY(__u8, atomic_wr); __DECLARE_FLEX_ARRAY(struct rxe_sge, sge); }; }; -- cgit v1.2.3 From 0c17da492dc6c33cc5b99633adb4bd7b2587153c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:52 +0800 Subject: RDMA: Extend RDMA user ABI to support flush This commit extends the RDMA user ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing RDMA user ABI. Link: https://lore.kernel.org/r/20221206130201.30986-2-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_ioctl_verbs.h | 2 ++ include/uapi/rdma/ib_user_verbs.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index e0c25537fd2e..d7c5aaa32744 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -57,6 +57,8 @@ enum ib_uverbs_access_flags { IB_UVERBS_ACCESS_ZERO_BASED = 1 << 5, IB_UVERBS_ACCESS_ON_DEMAND = 1 << 6, IB_UVERBS_ACCESS_HUGETLB = 1 << 7, + IB_UVERBS_ACCESS_FLUSH_GLOBAL = 1 << 8, + IB_UVERBS_ACCESS_FLUSH_PERSISTENT = 1 << 9, IB_UVERBS_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_OPTIONAL_FIRST, IB_UVERBS_ACCESS_OPTIONAL_RANGE = diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 237814815544..e16650f0c85d 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -105,6 +105,18 @@ enum { IB_USER_VERBS_EX_CMD_MODIFY_CQ }; +/* see IBA A19.4.1.1 Placement Types */ +enum ib_placement_type { + IB_FLUSH_GLOBAL = 1U << 0, + IB_FLUSH_PERSISTENT = 1U << 1, +}; + +/* see IBA A19.4.1.2 Selectivity Level */ +enum ib_selectivity_level { + IB_FLUSH_RANGE = 0, + IB_FLUSH_MR, +}; + /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to @@ -466,6 +478,7 @@ enum ib_uverbs_wc_opcode { IB_UVERBS_WC_BIND_MW = 5, IB_UVERBS_WC_LOCAL_INV = 6, IB_UVERBS_WC_TSO = 7, + IB_UVERBS_WC_FLUSH = 8, IB_UVERBS_WC_ATOMIC_WRITE = 9, }; @@ -785,6 +798,7 @@ enum ib_uverbs_wr_opcode { IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + IB_UVERBS_WR_FLUSH = 14, IB_UVERBS_WR_ATOMIC_WRITE = 15, /* Review enum ib_wr_opcode before modifying this */ }; @@ -1333,6 +1347,9 @@ enum ib_uverbs_device_cap_flags { /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. 
*/ IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, + /* Flush placement types */ + IB_UVERBS_DEVICE_FLUSH_GLOBAL = 1ULL << 38, + IB_UVERBS_DEVICE_FLUSH_PERSISTENT = 1ULL << 39, /* Atomic write attributes */ IB_UVERBS_DEVICE_ATOMIC_WRITE = 1ULL << 40, }; -- cgit v1.2.3 From 208e3a134b50d95ea3962d7a37b4d8a8f5368376 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:53 +0800 Subject: RDMA: Extend RDMA kernel verbs ABI to support flush This commit extends the RDMA kernel verbs ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing RDMA kernel verbs ABI. It adds the new FLUSH attributes/capabilities to the device/HCA and the new FLUSH access flags to memory regions. Users can register flush access flags with ibv_reg_mr(3); only access flags that are also supported by the device's capabilities can be registered successfully, and a successfully registered MR is flushable. Similarly, a flushable MR should also have one or both of the GLOBAL_VISIBILITY and PERSISTENT attributes/capabilities, like the device/HCA. Link: https://lore.kernel.org/r/20221206130201.30986-3-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/rdma/ib_pack.h | 3 +++ include/rdma/ib_verbs.h | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index f932d164af63..b8c56d7dc35d 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -84,6 +84,7 @@ enum { /* opcode 0x15 is reserved */ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16, IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17, + IB_OPCODE_FLUSH = 0x1C, IB_OPCODE_ATOMIC_WRITE = 0x1D, /* real constants follow -- see comment about above IB_OPCODE() @@ -113,6 +114,7 @@ enum { IB_OPCODE(RC, FETCH_ADD), IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE), IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE), + IB_OPCODE(RC, FLUSH), IB_OPCODE(RC, ATOMIC_WRITE), /* UC */ @@ -151,6 +153,7 @@ enum { IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE), IB_OPCODE(RD, COMPARE_SWAP), IB_OPCODE(RD, FETCH_ADD), + IB_OPCODE(RD, FLUSH), /* UD */ IB_OPCODE(UD, SEND_ONLY), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df6bb26ba0be..a9a429172c0a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -270,6 +270,9 @@ enum ib_device_cap_flags { /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, + /* Placement type attributes */ + IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, + IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; @@ -987,6 +990,7 @@ enum ib_wc_opcode { IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, + IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV).
@@ -1327,6 +1331,7 @@ enum ib_wr_opcode { IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ @@ -1461,10 +1466,12 @@ enum ib_access_flags { IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, + IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, + IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = - ((IB_ACCESS_HUGETLB << 1) - 1) | IB_ACCESS_OPTIONAL, + ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* @@ -4325,6 +4332,8 @@ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { + u64 device_cap = ib_dev->attrs.device_cap_flags; + /* * Local write permission is required if remote write or * remote atomic permission is also requested. @@ -4339,6 +4348,13 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; + + if ((flags & IB_ACCESS_FLUSH_GLOBAL && + !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || + (flags & IB_ACCESS_FLUSH_PERSISTENT && + !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) + return -EOPNOTSUPP; + return 0; } -- cgit v1.2.3 From 668ce52d5eef477c0def757610768a1a3ccc9785 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 6 Dec 2022 21:01:54 +0800 Subject: RDMA/rxe: Extend rxe user ABI to support flush This commit extends the rxe user ABI to support the flush operation defined in IBA A19.4.1. These changes are backward compatible with the existing rxe user ABI. The user API requests a flush by filling in this structure. Link: https://lore.kernel.org/r/20221206130201.30986-4-lizhijian@fujitsu.com Reviewed-by: Zhu Yanjun Signed-off-by: Li Zhijian Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index d20d1ecf046f..bb092fccb813 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -82,6 +82,13 @@ struct rxe_send_wr { __u32 invalidate_rkey; } ex; union { + struct { + __aligned_u64 remote_addr; + __u32 length; + __u32 rkey; + __u8 type; + __u8 level; + } flush; struct { __aligned_u64 remote_addr; __u32 rkey; -- cgit v1.2.3
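As a closing illustration of how the flush pieces of this series fit together, here is a hypothetical user-side sketch of filling the new fields (the struct layout follows the rdma_user_rxe.h hunk above; remote_addr, length and rkey are caller-supplied placeholders, the IB_UVERBS_WR_FLUSH and IB_FLUSH_* values come from the ib_user_verbs.h changes earlier in this series, and the assumption that the uverbs opcode is carried through rxe_send_wr.opcode unchanged is the author's usual rxe convention, not something this hunk shows):

  /* Request a persistent, MR-scoped flush of [remote_addr, remote_addr + length). */
  struct rxe_send_wr wr = {};

  wr.opcode = IB_UVERBS_WR_FLUSH;         /* assumed pass-through opcode */
  wr.wr.flush.remote_addr = remote_addr;
  wr.wr.flush.length = length;
  wr.wr.flush.rkey = rkey;
  wr.wr.flush.type = IB_FLUSH_PERSISTENT; /* placement type, IBA A19.4.1.1 */
  wr.wr.flush.level = IB_FLUSH_MR;        /* selectivity level, IBA A19.4.1.2 */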