From 9062ff405b49769c04f00373de2c9cefab91b600 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Jul 2023 03:55:40 -0700 Subject: vfio/pci: Extend VFIO_DEVICE_GET_PCI_HOT_RESET_INFO for vfio device cdev This allows VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl use the iommufd_ctx of the cdev device to check the ownership of the other affected devices. When VFIO_DEVICE_GET_PCI_HOT_RESET_INFO is called on an IOMMUFD managed device, the new flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is reported to indicate the values returned are IOMMUFD devids rather than group IDs as used when accessing vfio devices through the conventional vfio group interface. Additionally the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED will be reported in this mode if all of the devices affected by the hot-reset are owned by either virtue of being directly bound to the same iommufd context as the calling device, or implicitly owned via a shared IOMMU group. Suggested-by: Jason Gunthorpe Suggested-by: Alex Williamson Reviewed-by: Jason Gunthorpe Tested-by: Yanting Jiang Tested-by: Zhenzhong Duan Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20230718105542.4138-9-yi.l.liu@intel.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 20c804bdc09c..e680720ddddc 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -677,11 +677,57 @@ enum { * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) * + * This command is used to query the affected devices in the hot reset for + * a given device. + * + * This command always reports the segment, bus, and devfn information for + * each affected device, and selectively reports the group_id or devid per + * the way how the calling device is opened. + * + * - If the calling device is opened via the traditional group/container + * API, group_id is reported. User should check if it has owned all + * the affected devices and provides a set of group fds to prove the + * ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl. + * + * - If the calling device is opened as a cdev, devid is reported. + * Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this + * data type. All the affected devices should be represented in + * the dev_set, ex. bound to a vfio driver, and also be owned by + * this interface which is determined by the following conditions: + * 1) Has a valid devid within the iommufd_ctx of the calling device. + * Ownership cannot be determined across separate iommufd_ctx and + * the cdev calling conventions do not support a proof-of-ownership + * model as provided in the legacy group interface. In this case + * valid devid with value greater than zero is provided in the return + * structure. + * 2) Does not have a valid devid within the iommufd_ctx of the calling + * device, but belongs to the same IOMMU group as the calling device + * or another opened device that has a valid devid within the + * iommufd_ctx of the calling device. This provides implicit ownership + * for devices within the same DMA isolation context. In this case + * the devid value of VFIO_PCI_DEVID_OWNED is provided in the return + * structure. + * + * A devid value of VFIO_PCI_DEVID_NOT_OWNED is provided in the return + * structure for affected devices where device is NOT represented in the + * dev_set or ownership is not available. Such devices prevent the use + * of VFIO_DEVICE_PCI_HOT_RESET ioctl outside of the proof-of-ownership + * calling conventions (ie. via legacy group accessed devices). Flag + * VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the + * affected devices are represented in the dev_set and also owned by + * the user. This flag is available only when + * flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved. + * * Return: 0 on success, -errno on failure: * -enospc = insufficient buffer, -enodev = unsupported for device. */ struct vfio_pci_dependent_device { - __u32 group_id; + union { + __u32 group_id; + __u32 devid; +#define VFIO_PCI_DEVID_OWNED 0 +#define VFIO_PCI_DEVID_NOT_OWNED -1 + }; __u16 segment; __u8 bus; __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ @@ -690,6 +736,8 @@ struct vfio_pci_dependent_device { struct vfio_pci_hot_reset_info { __u32 argsz; __u32 flags; +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID (1 << 0) +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED (1 << 1) __u32 count; struct vfio_pci_dependent_device devices[]; }; -- cgit v1.2.3 From 71791b9246c7339f0d151e853546acc653522e2f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Jul 2023 03:55:42 -0700 Subject: vfio/pci: Allow passing zero-length fd array in VFIO_DEVICE_PCI_HOT_RESET This is the way user to invoke hot-reset for the devices opened by cdev interface. User should check the flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED in the output of VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl before doing hot-reset for cdev devices. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Tested-by: Yanting Jiang Tested-by: Zhenzhong Duan Signed-off-by: Jason Gunthorpe Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20230718105542.4138-11-yi.l.liu@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 61 ++++++++++++++++++++++++++++++++-------- include/uapi/linux/vfio.h | 21 ++++++++++++++ 2 files changed, 71 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f4a0ea01559e..65cbada3ec13 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -181,7 +181,8 @@ no_mmap: struct vfio_pci_group_info; static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, - struct vfio_pci_group_info *groups); + struct vfio_pci_group_info *groups, + struct iommufd_ctx *iommufd_ctx); /* * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND @@ -1329,8 +1330,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev, if (ret) return ret; - /* Somewhere between 1 and count is OK */ - if (!array_count || array_count > count) + if (array_count > count) return -EINVAL; group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL); @@ -1379,7 +1379,7 @@ vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev, info.count = array_count; info.files = files; - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL); hot_reset_release: for (file_idx--; file_idx >= 0; file_idx--) @@ -1402,13 +1402,21 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, if (hdr.argsz < minsz || hdr.flags) return -EINVAL; + /* zero-length array is only for cdev opened devices */ + if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev)) + return -EINVAL; + /* Can we do a slot or bus reset or neither? */ if (!pci_probe_reset_slot(vdev->pdev->slot)) slot = true; else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg); + if (hdr.count) + return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg); + + return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, + vfio_iommufd_device_ictx(&vdev->vdev)); } static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, @@ -2376,13 +2384,16 @@ const struct pci_error_handlers vfio_pci_core_err_handlers = { }; EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); -static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, +static bool vfio_dev_in_groups(struct vfio_device *vdev, struct vfio_pci_group_info *groups) { unsigned int i; + if (!groups) + return false; + for (i = 0; i < groups->count; i++) - if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) + if (vfio_file_has_dev(groups->files[i], vdev)) return true; return false; } @@ -2458,7 +2469,8 @@ unwind: * get each memory_lock. */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, - struct vfio_pci_group_info *groups) + struct vfio_pci_group_info *groups, + struct iommufd_ctx *iommufd_ctx) { struct vfio_pci_core_device *cur_mem; struct vfio_pci_core_device *cur_vma; @@ -2488,11 +2500,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_unlock; list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + bool owned; + /* - * Test whether all the affected devices are contained by the - * set of groups provided by the user. + * Test whether all the affected devices can be reset by the + * user. + * + * If called from a group opened device and the user provides + * a set of groups, all the devices in the dev_set should be + * contained by the set of groups provided by the user. + * + * If called from a cdev opened device and the user provides + * a zero-length array, all the devices in the dev_set must + * be bound to the same iommufd_ctx as the input iommufd_ctx. + * If there is any device that has not been bound to any + * iommufd_ctx yet, check if its iommu_group has any device + * bound to the input iommufd_ctx. Such devices can be + * considered owned by the input iommufd_ctx as the device + * cannot be owned by another iommufd_ctx when its iommu_group + * is owned. + * + * Otherwise, reset is not allowed. */ - if (!vfio_dev_in_groups(cur_vma, groups)) { + if (iommufd_ctx) { + int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + iommufd_ctx); + + owned = (devid > 0 || devid == -ENOENT); + } else { + owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + } + + if (!owned) { ret = -EINVAL; goto err_undo; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index e680720ddddc..4c3d548e9c96 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -717,6 +717,9 @@ enum { * affected devices are represented in the dev_set and also owned by * the user. This flag is available only when * flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved. + * When set, user could invoke VFIO_DEVICE_PCI_HOT_RESET with a zero + * length fd array on the calling device as the ownership is validated + * by iommufd_ctx. * * Return: 0 on success, -errno on failure: * -enospc = insufficient buffer, -enodev = unsupported for device. @@ -748,6 +751,24 @@ struct vfio_pci_hot_reset_info { * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13, * struct vfio_pci_hot_reset) * + * A PCI hot reset results in either a bus or slot reset which may affect + * other devices sharing the bus/slot. The calling user must have + * ownership of the full set of affected devices as determined by the + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl. + * + * When called on a device file descriptor acquired through the vfio + * group interface, the user is required to provide proof of ownership + * of those affected devices via the group_fds array in struct + * vfio_pci_hot_reset. + * + * When called on a direct cdev opened vfio device, the flags field of + * struct vfio_pci_hot_reset_info reports the ownership status of the + * affected devices and this ioctl must be called with an empty group_fds + * array. See above INFO ioctl definition for ownership requirements. + * + * Mixed usage of legacy groups and cdevs across the set of affected + * devices is not supported. + * * Return: 0 on success, -errno on failure. */ struct vfio_pci_hot_reset { -- cgit v1.2.3 From dcc31ea60b422f9868c39607059d6e37cee6cefa Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Jul 2023 06:55:30 -0700 Subject: kvm/vfio: Accept vfio device file from userspace This defines KVM_DEV_VFIO_FILE* and make alias with KVM_DEV_VFIO_GROUP*. Old userspace uses KVM_DEV_VFIO_GROUP* works as well. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Terrence Xu Tested-by: Nicolin Chen Tested-by: Matthew Rosato Tested-by: Yanting Jiang Tested-by: Shameer Kolothum Tested-by: Zhenzhong Duan Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20230718135551.6592-6-yi.l.liu@intel.com Signed-off-by: Alex Williamson --- Documentation/virt/kvm/devices/vfio.rst | 47 ++++++++++++++++++++++----------- include/uapi/linux/kvm.h | 13 ++++++--- virt/kvm/vfio.c | 12 ++++----- 3 files changed, 47 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/devices/vfio.rst b/Documentation/virt/kvm/devices/vfio.rst index 08b544212638..c549143bb891 100644 --- a/Documentation/virt/kvm/devices/vfio.rst +++ b/Documentation/virt/kvm/devices/vfio.rst @@ -9,22 +9,34 @@ Device types supported: - KVM_DEV_TYPE_VFIO Only one VFIO instance may be created per VM. The created device -tracks VFIO groups in use by the VM and features of those groups -important to the correctness and acceleration of the VM. As groups -are enabled and disabled for use by the VM, KVM should be updated -about their presence. When registered with KVM, a reference to the -VFIO-group is held by KVM. +tracks VFIO files (group or device) in use by the VM and features +of those groups/devices important to the correctness and acceleration +of the VM. As groups/devices are enabled and disabled for use by the +VM, KVM should be updated about their presence. When registered with +KVM, a reference to the VFIO file is held by KVM. Groups: - KVM_DEV_VFIO_GROUP - -KVM_DEV_VFIO_GROUP attributes: - KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. + KVM_DEV_VFIO_FILE + alias: KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_FILE attributes: + KVM_DEV_VFIO_FILE_ADD: Add a VFIO file (group/device) to VFIO-KVM device + tracking + + kvm_device_attr.addr points to an int32_t file descriptor for the + VFIO file. + + KVM_DEV_VFIO_FILE_DEL: Remove a VFIO file (group/device) from VFIO-KVM + device tracking + + kvm_device_attr.addr points to an int32_t file descriptor for the + VFIO file. + +KVM_DEV_VFIO_GROUP (legacy kvm device group restricted to the handling of VFIO group fd): + KVM_DEV_VFIO_GROUP_ADD: same as KVM_DEV_VFIO_FILE_ADD for group fd only + + KVM_DEV_VFIO_GROUP_DEL: same as KVM_DEV_VFIO_FILE_DEL for group fd only + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table allocated by sPAPR KVM. kvm_device_attr.addr points to a struct:: @@ -40,7 +52,10 @@ KVM_DEV_VFIO_GROUP attributes: - @tablefd is a file descriptor for a TCE table allocated via KVM_CREATE_SPAPR_TCE. -The GROUP_ADD operation above should be invoked prior to accessing the +The FILE/GROUP_ADD operation above should be invoked prior to accessing the device file descriptor via VFIO_GROUP_GET_DEVICE_FD in order to support drivers which require a kvm pointer to be set in their .open_device() -callback. +callback. It is the same for device file descriptor via character device +open which gets device access via VFIO_DEVICE_BIND_IOMMUFD. For such file +descriptors, FILE_ADD should be invoked before VFIO_DEVICE_BIND_IOMMUFD +to support the drivers mentioned in prior sentence as well. diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f089ab290978..13065dd96132 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1418,9 +1418,16 @@ struct kvm_device_attr { __u64 addr; /* userspace address of attr data */ }; -#define KVM_DEV_VFIO_GROUP 1 -#define KVM_DEV_VFIO_GROUP_ADD 1 -#define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_FILE 1 + +#define KVM_DEV_VFIO_FILE_ADD 1 +#define KVM_DEV_VFIO_FILE_DEL 2 + +/* KVM_DEV_VFIO_GROUP aliases are for compile time uapi compatibility */ +#define KVM_DEV_VFIO_GROUP KVM_DEV_VFIO_FILE + +#define KVM_DEV_VFIO_GROUP_ADD KVM_DEV_VFIO_FILE_ADD +#define KVM_DEV_VFIO_GROUP_DEL KVM_DEV_VFIO_FILE_DEL #define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 8f7fa07e8170..07cb5f44b2a2 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -286,12 +286,12 @@ static int kvm_vfio_set_file(struct kvm_device *dev, long attr, int32_t fd; switch (attr) { - case KVM_DEV_VFIO_GROUP_ADD: + case KVM_DEV_VFIO_FILE_ADD: if (get_user(fd, argp)) return -EFAULT; return kvm_vfio_file_add(dev, fd); - case KVM_DEV_VFIO_GROUP_DEL: + case KVM_DEV_VFIO_FILE_DEL: if (get_user(fd, argp)) return -EFAULT; return kvm_vfio_file_del(dev, fd); @@ -309,7 +309,7 @@ static int kvm_vfio_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { - case KVM_DEV_VFIO_GROUP: + case KVM_DEV_VFIO_FILE: return kvm_vfio_set_file(dev, attr->attr, u64_to_user_ptr(attr->addr)); } @@ -321,10 +321,10 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { - case KVM_DEV_VFIO_GROUP: + case KVM_DEV_VFIO_FILE: switch (attr->attr) { - case KVM_DEV_VFIO_GROUP_ADD: - case KVM_DEV_VFIO_GROUP_DEL: + case KVM_DEV_VFIO_FILE_ADD: + case KVM_DEV_VFIO_FILE_DEL: #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: #endif -- cgit v1.2.3 From 5fcc26969a164e6a3154bb68badd6712716eb954 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Jul 2023 06:55:47 -0700 Subject: vfio: Add VFIO_DEVICE_BIND_IOMMUFD This adds ioctl for userspace to bind device cdev fd to iommufd. VFIO_DEVICE_BIND_IOMMUFD: bind device to an iommufd, hence gain DMA control provided by the iommufd. open_device op is called after bind_iommufd op. Tested-by: Nicolin Chen Tested-by: Matthew Rosato Tested-by: Yanting Jiang Tested-by: Shameer Kolothum Tested-by: Terrence Xu Tested-by: Zhenzhong Duan Signed-off-by: Yi Liu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230718135551.6592-23-yi.l.liu@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 107 +++++++++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 13 ++++++ drivers/vfio/vfio_main.c | 5 +++ include/linux/vfio.h | 5 ++- include/uapi/linux/vfio.h | 27 ++++++++++++ 5 files changed, 155 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index bf1032d00107..f40784dd5561 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -3,6 +3,7 @@ * Copyright (c) 2023 Intel Corporation. */ #include +#include #include "vfio.h" @@ -45,6 +46,112 @@ err_put_registration: return ret; } +static void vfio_df_get_kvm_safe(struct vfio_device_file *df) +{ + spin_lock(&df->kvm_ref_lock); + vfio_device_get_kvm_safe(df->device, df->kvm); + spin_unlock(&df->kvm_ref_lock); +} + +long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, + struct vfio_device_bind_iommufd __user *arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_bind_iommufd bind; + unsigned long minsz; + int ret; + + static_assert(__same_type(arg->out_devid, df->devid)); + + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); + + if (copy_from_user(&bind, arg, minsz)) + return -EFAULT; + + if (bind.argsz < minsz || bind.flags || bind.iommufd < 0) + return -EINVAL; + + /* BIND_IOMMUFD only allowed for cdev fds */ + if (df->group) + return -EINVAL; + + ret = vfio_device_block_group(device); + if (ret) + return ret; + + mutex_lock(&device->dev_set->lock); + /* one device cannot be bound twice */ + if (df->access_granted) { + ret = -EINVAL; + goto out_unlock; + } + + df->iommufd = iommufd_ctx_from_fd(bind.iommufd); + if (IS_ERR(df->iommufd)) { + ret = PTR_ERR(df->iommufd); + df->iommufd = NULL; + goto out_unlock; + } + + /* + * Before the device open, get the KVM pointer currently + * associated with the device file (if there is) and obtain + * a reference. This reference is held until device closed. + * Save the pointer in the device for use by drivers. + */ + vfio_df_get_kvm_safe(df); + + ret = vfio_df_open(df); + if (ret) + goto out_put_kvm; + + ret = copy_to_user(&arg->out_devid, &df->devid, + sizeof(df->devid)) ? -EFAULT : 0; + if (ret) + goto out_close_device; + + device->cdev_opened = true; + /* + * Paired with smp_load_acquire() in vfio_device_fops::ioctl/ + * read/write/mmap + */ + smp_store_release(&df->access_granted, true); + mutex_unlock(&device->dev_set->lock); + return 0; + +out_close_device: + vfio_df_close(df); +out_put_kvm: + vfio_device_put_kvm(device); + iommufd_ctx_put(df->iommufd); + df->iommufd = NULL; +out_unlock: + mutex_unlock(&device->dev_set->lock); + vfio_device_unblock_group(device); + return ret; +} + +void vfio_df_unbind_iommufd(struct vfio_device_file *df) +{ + struct vfio_device *device = df->device; + + /* + * In the time of close, there is no contention with another one + * changing this flag. So read df->access_granted without lock + * and no smp_load_acquire() is ok. + */ + if (!df->access_granted) + return; + + mutex_lock(&device->dev_set->lock); + vfio_df_close(df); + vfio_device_put_kvm(device); + iommufd_ctx_put(df->iommufd); + device->cdev_opened = false; + mutex_unlock(&device->dev_set->lock); + vfio_device_unblock_group(device); +} + static char *vfio_device_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index c2aa65382592..b6d4ba1ef2b8 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -287,6 +287,9 @@ static inline void vfio_device_del(struct vfio_device *device) } int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep); +long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, + struct vfio_device_bind_iommufd __user *arg); +void vfio_df_unbind_iommufd(struct vfio_device_file *df); int vfio_cdev_init(struct class *device_class); void vfio_cdev_cleanup(void); #else @@ -310,6 +313,16 @@ static inline int vfio_device_fops_cdev_open(struct inode *inode, return 0; } +static inline long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, + struct vfio_device_bind_iommufd __user *arg) +{ + return -ENOTTY; +} + +static inline void vfio_df_unbind_iommufd(struct vfio_device_file *df) +{ +} + static inline int vfio_cdev_init(struct class *device_class) { return 0; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a2744cb64c6d..9fdf93ff17cf 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -575,6 +575,8 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) if (df->group) vfio_df_group_close(df); + else + vfio_df_unbind_iommufd(df); vfio_device_put_registration(device); @@ -1149,6 +1151,9 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, void __user *uptr = (void __user *)arg; int ret; + if (cmd == VFIO_DEVICE_BIND_IOMMUFD) + return vfio_df_ioctl_bind_iommufd(df, uptr); + /* Paired with smp_store_release() following vfio_df_open() */ if (!smp_load_acquire(&df->access_granted)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e0069f26488d..d6228c839c44 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -64,8 +64,9 @@ struct vfio_device { void (*put_kvm)(struct kvm *kvm); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; - bool iommufd_attached; + u8 iommufd_attached:1; #endif + u8 cdev_opened:1; }; /** @@ -168,7 +169,7 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) static inline bool vfio_device_cdev_opened(struct vfio_device *device) { - return false; + return device->cdev_opened; } /** diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 4c3d548e9c96..098946b23e86 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -897,6 +897,33 @@ struct vfio_device_feature { #define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17) +/* + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, + * struct vfio_device_bind_iommufd) + * @argsz: User filled size of this data. + * @flags: Must be 0. + * @iommufd: iommufd to bind. + * @out_devid: The device id generated by this bind. devid is a handle for + * this device/iommufd bond and can be used in IOMMUFD commands. + * + * Bind a vfio_device to the specified iommufd. + * + * User is restricted from accessing the device before the binding operation + * is completed. Only allowed on cdev fds. + * + * Unbind is automatically conducted when device fd is closed. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_bind_iommufd { + __u32 argsz; + __u32 flags; + __s32 iommufd; + __u32 out_devid; +}; + +#define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) + /* * Provide support for setting a PCI VF Token, which is used as a shared * secret between PF and VF drivers. This feature may only be set on a -- cgit v1.2.3 From b290a05fd858281fcac6fe94ec76a46d5396b9e5 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Jul 2023 06:55:48 -0700 Subject: vfio: Add VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT This adds ioctl for userspace to attach device cdev fd to and detach from IOAS/hw_pagetable managed by iommufd. VFIO_DEVICE_ATTACH_IOMMUFD_PT: attach vfio device to IOAS or hw_pagetable managed by iommufd. Attach can be undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. VFIO_DEVICE_DETACH_IOMMUFD_PT: detach vfio device from the current attached IOAS or hw_pagetable managed by iommufd. Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Tested-by: Matthew Rosato Tested-by: Yanting Jiang Tested-by: Shameer Kolothum Tested-by: Terrence Xu Tested-by: Zhenzhong Duan Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20230718135551.6592-24-yi.l.liu@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 5 ++++ drivers/vfio/vfio_main.c | 15 +++++++++++- include/uapi/linux/vfio.h | 44 +++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index f40784dd5561..e75da0a70d1f 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -152,6 +152,64 @@ void vfio_df_unbind_iommufd(struct vfio_device_file *df) vfio_device_unblock_group(device); } +int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, + struct vfio_device_attach_iommufd_pt __user *arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_attach_iommufd_pt attach; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id); + + if (copy_from_user(&attach, arg, minsz)) + return -EFAULT; + + if (attach.argsz < minsz || attach.flags) + return -EINVAL; + + mutex_lock(&device->dev_set->lock); + ret = device->ops->attach_ioas(device, &attach.pt_id); + if (ret) + goto out_unlock; + + if (copy_to_user(&arg->pt_id, &attach.pt_id, sizeof(attach.pt_id))) { + ret = -EFAULT; + goto out_detach; + } + mutex_unlock(&device->dev_set->lock); + + return 0; + +out_detach: + device->ops->detach_ioas(device); +out_unlock: + mutex_unlock(&device->dev_set->lock); + return ret; +} + +int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, + struct vfio_device_detach_iommufd_pt __user *arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_detach_iommufd_pt detach; + unsigned long minsz; + + minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags); + + if (copy_from_user(&detach, arg, minsz)) + return -EFAULT; + + if (detach.argsz < minsz || detach.flags) + return -EINVAL; + + mutex_lock(&device->dev_set->lock); + device->ops->detach_ioas(device); + mutex_unlock(&device->dev_set->lock); + + return 0; +} + static char *vfio_device_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index b6d4ba1ef2b8..8353774a5d07 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -266,6 +266,11 @@ vfio_iommufd_compat_attach_ioas(struct vfio_device *device, } #endif +int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, + struct vfio_device_attach_iommufd_pt __user *arg); +int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, + struct vfio_device_detach_iommufd_pt __user *arg); + #if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) void vfio_init_device_cdev(struct vfio_device *device); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9fdf93ff17cf..ba1d84afe081 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1162,6 +1162,19 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, if (ret) return ret; + /* cdev only ioctls */ + if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) { + switch (cmd) { + case VFIO_DEVICE_ATTACH_IOMMUFD_PT: + ret = vfio_df_ioctl_attach_pt(df, uptr); + goto out; + + case VFIO_DEVICE_DETACH_IOMMUFD_PT: + ret = vfio_df_ioctl_detach_pt(df, uptr); + goto out; + } + } + switch (cmd) { case VFIO_DEVICE_FEATURE: ret = vfio_ioctl_device_feature(device, uptr); @@ -1174,7 +1187,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = device->ops->ioctl(device, cmd, arg); break; } - +out: vfio_device_pm_runtime_put(device); return ret; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 098946b23e86..fa06e3eb4955 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -924,6 +924,50 @@ struct vfio_device_bind_iommufd { #define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) +/* + * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, + * struct vfio_device_attach_iommufd_pt) + * @argsz: User filled size of this data. + * @flags: Must be 0. + * @pt_id: Input the target id which can represent an ioas or a hwpt + * allocated via iommufd subsystem. + * Output the input ioas id or the attached hwpt id which could + * be the specified hwpt itself or a hwpt automatically created + * for the specified ioas by kernel during the attachment. + * + * Associate the device with an address space within the bound iommufd. + * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only + * allowed on cdev fds. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_attach_iommufd_pt { + __u32 argsz; + __u32 flags; + __u32 pt_id; +}; + +#define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) + +/* + * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, + * struct vfio_device_detach_iommufd_pt) + * @argsz: User filled size of this data. + * @flags: Must be 0. + * + * Remove the association of the device and its current associated address + * space. After it, the device should be in a blocking DMA state. This is only + * allowed on cdev fds. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_detach_iommufd_pt { + __u32 argsz; + __u32 flags; +}; + +#define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) + /* * Provide support for setting a PCI VF Token, which is used as a shared * secret between PF and VF drivers. This feature may only be set on a -- cgit v1.2.3 From a881b496941f02fe620c5708a4af68762b24c33d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 9 Aug 2023 16:31:44 -0400 Subject: vfio: align capability structures The VFIO_DEVICE_GET_INFO, VFIO_DEVICE_GET_REGION_INFO, and VFIO_IOMMU_GET_INFO ioctls fill in an info struct followed by capability structs: +------+---------+---------+-----+ | info | caps[0] | caps[1] | ... | +------+---------+---------+-----+ Both the info and capability struct sizes are not always multiples of sizeof(u64), leaving u64 fields in later capability structs misaligned. Userspace applications currently need to handle misalignment manually in order to support CPU architectures and programming languages with strict alignment requirements. Make life easier for userspace by ensuring alignment in the kernel. This is done by padding info struct definitions and by copying out zeroes after capability structs that are not aligned. The new layout is as follows: +------+---------+---+---------+-----+ | info | caps[0] | 0 | caps[1] | ... | +------+---------+---+---------+-----+ In this example caps[0] has a size that is not multiples of sizeof(u64), so zero padding is added to align the subsequent structure. Adding zero padding between structs does not break the uapi. The memory layout is specified by the info.cap_offset and caps[i].next fields filled in by the kernel. Applications use these field values to locate structs and are therefore unaffected by the addition of zero padding. Note that code that copies out info structs with padding is updated to always zero the struct and copy out as many bytes as userspace requested. This makes the code shorter and avoids potential information leaks by ensuring padding is initialized. Originally-by: Alex Williamson Signed-off-by: Stefan Hajnoczi Reviewed-by: Kevin Tian Acked-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230809203144.2880050-1-stefanha@redhat.com Signed-off-by: Alex Williamson --- drivers/iommu/iommufd/vfio_compat.c | 2 ++ drivers/vfio/pci/vfio_pci_core.c | 11 ++--------- drivers/vfio/vfio_iommu_type1.c | 11 ++--------- drivers/vfio/vfio_main.c | 6 ++++++ include/uapi/linux/vfio.h | 2 ++ 5 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index fe02517c73cc..6c810bf80f99 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -483,6 +483,8 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, rc = cap_size; goto out_put; } + cap_size = ALIGN(cap_size, sizeof(u64)); + if (last_cap && info.argsz >= total_cap_size && put_user(total_cap_size, &last_cap->next)) { rc = -EFAULT; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 65cbada3ec13..1929103ee59a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -958,24 +958,17 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, struct vfio_device_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); - struct vfio_device_info info; + struct vfio_device_info info = {}; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; int ret; - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; - } + minsz = min_t(size_t, info.argsz, sizeof(info)); info.flags = VFIO_DEVICE_FLAGS_PCI; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index d662aa9d1b4b..eacd6ec04de5 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2762,27 +2762,20 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, unsigned long arg) { - struct vfio_iommu_type1_info info; + struct vfio_iommu_type1_info info = {}; unsigned long minsz; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; int ret; minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; /* output, no-recopy necessary */ - } + minsz = min_t(size_t, info.argsz, sizeof(info)); mutex_lock(&iommu->lock); info.flags = VFIO_IOMMU_INFO_PGSIZES; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5417778f2b6b..cfad824d9aa2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1409,6 +1409,9 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, void *buf; struct vfio_info_cap_header *header, *tmp; + /* Ensure that the next capability struct will be aligned */ + size = ALIGN(size, sizeof(u64)); + buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); if (!buf) { kfree(caps->buf); @@ -1442,6 +1445,9 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) struct vfio_info_cap_header *tmp; void *buf = (void *)caps->buf; + /* Capability structs should start with proper alignment */ + WARN_ON(!IS_ALIGNED(offset, sizeof(u64))); + for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) tmp->next += offset; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index fa06e3eb4955..f9c6f3e2cf6e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -217,6 +217,7 @@ struct vfio_device_info { __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ + __u32 pad; }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -1444,6 +1445,7 @@ struct vfio_iommu_type1_info { #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ __u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ + __u32 pad; }; /* -- cgit v1.2.3