From 0ada768517dafa1504ef5986ba04f118b7436960 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:27 +0200 Subject: RDMA/mlx5: Delete declaration of already removed function The implementation of mlx5_core_page_fault_resume() was removed in commit d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA"). This patch removes declaration too. Fixes: d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 54299251d40d..b6f5839f129a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -939,10 +939,6 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, size_t sz); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, - u32 wq_num, u8 type, int error); -#endif int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 7527a7b157d1191b23562ed70154ae93bd65f845 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 17 Jan 2019 20:14:15 +0200 Subject: IB/core: Simplify rdma cgroup registration RDMA cgroup registration routine always returns success, so simplify function to be void and run clang formatter over whole CONFIG_CGROUP_RDMA art of core_priv.h. This reduces unwinding error path for regular registration and future net namespace change functionality for rdma device. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Acked-by: Tejun Heo Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cgroup.c | 5 ++--- drivers/infiniband/core/core_priv.h | 17 +++++++++++------ drivers/infiniband/core/device.c | 8 +------- include/linux/cgroup_rdma.h | 2 +- kernel/cgroup/rdma.c | 5 +---- 5 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c index 126ac5f99db7..388fd04e5f63 100644 --- a/drivers/infiniband/core/cgroup.c +++ b/drivers/infiniband/core/cgroup.c @@ -21,12 +21,11 @@ * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. - * Returns 0 on success or otherwise failure code. */ -int ib_device_register_rdmacg(struct ib_device *device) +void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; - return rdmacg_register_device(&device->cg_device); + rdmacg_register_device(&device->cg_device); } /** diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index aca75c74e451..42a49982f66e 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -115,7 +115,7 @@ void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); #ifdef CONFIG_CGROUP_RDMA -int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_register_rdmacg(struct ib_device *device); void ib_device_unregister_rdmacg(struct ib_device *device); int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, @@ -126,21 +126,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); #else -static inline int ib_device_register_rdmacg(struct ib_device *device) -{ return 0; } +static inline void ib_device_register_rdmacg(struct ib_device *device) +{ +} static inline void ib_device_unregister_rdmacg(struct ib_device *device) -{ } +{ +} static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ return 0; } +{ + return 0; +} static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ } +{ +} #endif static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 4a9aa6d10c5e..200431c540f2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -599,12 +599,7 @@ int ib_register_device(struct ib_device *device, const char *name) device->index = __dev_new_index(); - ret = ib_device_register_rdmacg(device); - if (ret) { - dev_warn(&device->dev, - "Couldn't register device with rdma cgroup\n"); - goto dev_cleanup; - } + ib_device_register_rdmacg(device); ret = ib_device_register_sysfs(device); if (ret) { @@ -627,7 +622,6 @@ int ib_register_device(struct ib_device *device, const char *name) cg_cleanup: ib_device_unregister_rdmacg(device); -dev_cleanup: cleanup_device(device); out: mutex_unlock(&device_mutex); diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index e94290b29e99..ef1bae2983f3 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -39,7 +39,7 @@ struct rdmacg_device { * APIs for RDMA/IB stack to publish when a device wants to * participate in resource accounting */ -int rdmacg_register_device(struct rdmacg_device *device); +void rdmacg_register_device(struct rdmacg_device *device); void rdmacg_unregister_device(struct rdmacg_device *device); /* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index d3bbb757ee49..1d75ae7f1cb7 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge); * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. */ -int rdmacg_register_device(struct rdmacg_device *device) +void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); @@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device) mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); - return 0; } EXPORT_SYMBOL(rdmacg_register_device); -- cgit v1.2.3 From 534fd7aac56a7994d16032f32123def9923e339f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 13 Jan 2019 16:01:17 +0200 Subject: IB/mlx5: Manage indirection mkey upon DEVX flow for ODP Manage indirection mkey upon DEVX flow to support ODP. To support a page fault event on the indirection mkey it needs to be part of the device mkey radix tree. Both the creation and the deletion flows for a DEVX object which is indirection mkey were adapted to handle that. Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 89 +++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +++ include/linux/mlx5/driver.h | 1 + 4 files changed, 96 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index b7ff2138ac2a..bbf9a26d8fa6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -17,12 +17,18 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +enum devx_obj_flags { + DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; u64 obj_id; u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; + u32 flags; + struct mlx5_ib_devx_mr devx_mr; }; struct devx_umem { @@ -1011,6 +1017,36 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, } } +static int devx_handle_mkey_indirect(struct devx_obj *obj, + struct mlx5_ib_dev *dev, + void *in, void *out) +{ + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; + struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr; + unsigned long flags; + struct mlx5_core_mkey *mkey; + void *mkc; + u8 key; + int err; + + mkey = &devx_mr->mmkey; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + key = MLX5_GET(mkc, mkc, mkey_7_0); + mkey->key = mlx5_idx_to_mkey( + MLX5_GET(create_mkey_out, out, mkey_index)) | key; + mkey->type = MLX5_MKEY_INDIRECT_DEVX; + mkey->iova = MLX5_GET64(mkc, mkc, start_addr); + mkey->size = MLX5_GET64(mkc, mkc, len); + mkey->pd = MLX5_GET(mkc, mkc, pd); + devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), + mkey); + write_unlock_irqrestore(&table->lock, flags); + return err; +} + static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, struct devx_obj *obj, void *in, int in_len) @@ -1030,13 +1066,45 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, access_mode |= MLX5_GET(mkc, mkc, access_mode_4_2) << 2; if (access_mode == MLX5_MKC_ACCESS_MODE_KLMS || - access_mode == MLX5_MKC_ACCESS_MODE_KSM) + access_mode == MLX5_MKC_ACCESS_MODE_KSM) { + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + obj->flags |= DEVX_OBJ_FLAGS_INDIRECT_MKEY; return 0; + } MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); return 0; } +static void devx_free_indirect_mkey(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct devx_obj, devx_mr.rcu)); +} + +/* This function to delete from the radix tree needs to be called before + * destroying the underlying mkey. Otherwise a race might occur in case that + * other thread will get the same mkey before this one will be deleted, + * in that case it will fail via inserting to the tree its own data. + * + * Note: + * An error in the destroy is not expected unless there is some other indirect + * mkey which points to this one. In a kernel cleanup flow it will be just + * destroyed in the iterative destruction call. In a user flow, in case + * the application didn't close in the expected order it's its own problem, + * the mkey won't be part of the tree, in both cases the kernel is safe. + */ +static void devx_cleanup_mkey(struct devx_obj *obj) +{ + struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table; + struct mlx5_core_mkey *del_mkey; + unsigned long flags; + + write_lock_irqsave(&table->lock, flags); + del_mkey = radix_tree_delete(&table->tree, + mlx5_base_mkey(obj->devx_mr.mmkey.key)); + write_unlock_irqrestore(&table->lock, flags); +} + static int devx_obj_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why) { @@ -1044,10 +1112,21 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, struct devx_obj *obj = uobject->object; int ret; + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) + devx_cleanup_mkey(obj); + ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { + struct mlx5_ib_dev *dev = to_mdev(uobject->context->device); + + call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu, + devx_free_indirect_mkey); + return ret; + } + kfree(obj); return ret; } @@ -1108,6 +1187,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { + err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out); + if (err) + goto obj_destroy; + } + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); if (err) goto obj_destroy; @@ -1116,6 +1201,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( return 0; obj_destroy: + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) + devx_cleanup_mkey(obj); mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 61064b7171fc..ae00f994673b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5724,6 +5724,7 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + srcu_barrier(&dev->mr_srcu); cleanup_srcu_struct(&dev->mr_srcu); drain_workqueue(dev->advise_mr_wq); destroy_workqueue(dev->advise_mr_wq); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b0a37ca2a714..819207190343 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -602,6 +602,12 @@ struct mlx5_ib_mw { int ndescs; }; +struct mlx5_ib_devx_mr { + struct mlx5_core_mkey mmkey; + int ndescs; + struct rcu_head rcu; +}; + struct mlx5_ib_umr_context { struct ib_cqe cqe; enum ib_wc_status status; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b6f5839f129a..619d6fee96a1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -364,6 +364,7 @@ struct mlx5_core_sig_ctx { enum { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, + MLX5_MKEY_INDIRECT_DEVX, }; struct mlx5_core_mkey { -- cgit v1.2.3 From 70f8a3ca68d3e1f3344d959981ca55d5f6ec77f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Feb 2019 09:59:15 -0800 Subject: mm: make mm->pinned_vm an atomic64 counter Taking a sleeping lock to _only_ increment a variable is quite the overkill, and pretty much all users do this. Furthermore, some drivers (ie: infiniband and scif) that need pinned semantics can go to quite some trouble to actually delay via workqueue (un)accounting for pinned pages when not possible to acquire it. By making the counter atomic we no longer need to hold the mmap_sem and can simply some code around it for pinned_vm users. The counter is 64-bit such that we need not worry about overflows such as rdma user input controlled from userspace. Reviewed-by: Ira Weiny Reviewed-by: Christoph Lameter Reviewed-by: Daniel Jordan Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 12 ++++++------ drivers/infiniband/hw/hfi1/user_pages.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++-- drivers/infiniband/hw/usnic/usnic_uiom.c | 8 ++++---- drivers/misc/mic/scif/scif_rma.c | 6 +++--- fs/proc/task_mmu.c | 2 +- include/linux/mm_types.h | 2 +- kernel/events/core.c | 8 ++++---- kernel/fork.c | 2 +- mm/debug.c | 5 +++-- 10 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 1efe0a74e06b..678abe1afcba 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -166,13 +166,13 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; down_write(&mm->mmap_sem); - if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || - (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { + new_pinned = atomic64_read(&mm->pinned_vm) + npages; + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { up_write(&mm->mmap_sem); ret = -ENOMEM; goto out; } - mm->pinned_vm = new_pinned; + atomic64_set(&mm->pinned_vm, new_pinned); up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -234,7 +234,7 @@ umem_release: __ib_umem_release(context->device, umem, 0); vma: down_write(&mm->mmap_sem); - mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); up_write(&mm->mmap_sem); out: if (vma_list) @@ -263,7 +263,7 @@ static void ib_umem_release_defer(struct work_struct *work) struct ib_umem *umem = container_of(work, struct ib_umem, work); down_write(&umem->owning_mm->mmap_sem); - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); up_write(&umem->owning_mm->mmap_sem); __ib_umem_release_tail(umem); @@ -302,7 +302,7 @@ void ib_umem_release(struct ib_umem *umem) } else { down_write(&umem->owning_mm->mmap_sem); } - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); up_write(&umem->owning_mm->mmap_sem); __ib_umem_release_tail(umem); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index e341e6dcc388..40a6e434190f 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -92,7 +92,7 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, size = DIV_ROUND_UP(size, PAGE_SIZE); down_read(&mm->mmap_sem); - pinned = mm->pinned_vm; + pinned = atomic64_read(&mm->pinned_vm); up_read(&mm->mmap_sem); /* First, check the absolute limit against all pinned pages. */ @@ -112,7 +112,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np return ret; down_write(&mm->mmap_sem); - mm->pinned_vm += ret; + atomic64_add(ret, &mm->pinned_vm); up_write(&mm->mmap_sem); return ret; @@ -131,7 +131,7 @@ void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, if (mm) { /* during close after signal, mm can be NULL */ down_write(&mm->mmap_sem); - mm->pinned_vm -= npages; + atomic64_sub(npages, &mm->pinned_vm); up_write(&mm->mmap_sem); } } diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 075f09fb7ce3..c6c81022d313 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -75,7 +75,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->pinned_vm += num_pages; + atomic64_add(num_pages, ¤t->mm->pinned_vm); ret = 0; goto bail; @@ -156,7 +156,7 @@ void qib_release_user_pages(struct page **p, size_t num_pages) __qib_release_user_pages(p, num_pages, 1); if (current->mm) { - current->mm->pinned_vm -= num_pages; + atomic64_sub(num_pages, ¤t->mm->pinned_vm); up_write(¤t->mm->mmap_sem); } } diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index ce01a59fccc4..854436a2b437 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -129,7 +129,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, uiomr->owning_mm = mm = current->mm; down_write(&mm->mmap_sem); - locked = npages + current->mm->pinned_vm; + locked = npages + atomic64_read(¤t->mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { @@ -187,7 +187,7 @@ out: if (ret < 0) usnic_uiom_put_pages(chunk_list, 0); else { - mm->pinned_vm = locked; + atomic64_set(&mm->pinned_vm, locked); mmgrab(uiomr->owning_mm); } @@ -441,7 +441,7 @@ static void usnic_uiom_release_defer(struct work_struct *work) container_of(work, struct usnic_uiom_reg, work); down_write(&uiomr->owning_mm->mmap_sem); - uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm); up_write(&uiomr->owning_mm->mmap_sem); __usnic_uiom_release_tail(uiomr); @@ -469,7 +469,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, } else { down_write(&uiomr->owning_mm->mmap_sem); } - uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm); up_write(&uiomr->owning_mm->mmap_sem); __usnic_uiom_release_tail(uiomr); diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index 749321eb91ae..2448368f181e 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -285,7 +285,7 @@ __scif_dec_pinned_vm_lock(struct mm_struct *mm, } else { down_write(&mm->mmap_sem); } - mm->pinned_vm -= nr_pages; + atomic64_sub(nr_pages, &mm->pinned_vm); up_write(&mm->mmap_sem); return 0; } @@ -299,7 +299,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, return 0; locked = nr_pages; - locked += mm->pinned_vm; + locked += atomic64_read(&mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { dev_err(scif_info.mdev.this_device, @@ -307,7 +307,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, locked, lock_limit); return -ENOMEM; } - mm->pinned_vm = locked; + atomic64_set(&mm->pinned_vm, locked); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..d2902962244d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); - SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2c471a2c43fa..acea2ea2d6c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long pinned_vm; /* Refcount permanently increased */ + atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ede6918050..29e9f2473656 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5459,7 +5459,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); @@ -5532,7 +5532,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5680,7 +5680,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5721,7 +5721,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..85e08c379a9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -981,7 +981,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); diff --git a/mm/debug.c b/mm/debug.c index 0abb987dad9b..7d13941a72f9 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -135,7 +135,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -166,7 +166,8 @@ void dump_mm(const struct mm_struct *mm) mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, + atomic64_read(&mm->pinned_vm), + mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, -- cgit v1.2.3 From d901b2760dc6cd5fbbf2eac31d71d94baa6c4aef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Jan 2019 11:40:21 -0700 Subject: lib/scatterlist: Provide a DMA page iterator Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o backing pages") introduced the sg_page_iter_dma_address() function without providing a way to use it in the general case. If the sg_dma_len() is not equal to the sg length callers cannot safely use the for_each_sg_page/sg_page_iter_dma_address combination. Resolve this API mistake by providing a DMA specific iterator, for_each_sg_dma_page(), that uses the right length so sg_page_iter_dma_address() works as expected with all sglists. A new iterator type is introduced to provide compile-time safety against wrongly mixing accessors and iterators. Acked-by: Christoph Hellwig (for scatterlist) Acked-by: Thomas Hellstrom Acked-by: Sakari Ailus (ipu3-cio2) Signed-off-by: Jason Gunthorpe --- .clang-format | 1 + drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 8 ++++- drivers/media/pci/intel/ipu3/ipu3-cio2.c | 4 +-- include/linux/scatterlist.h | 49 ++++++++++++++++++++++++------ lib/scatterlist.c | 26 ++++++++++++++++ 5 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/.clang-format b/.clang-format index bc2ffb2a0b53..335ce29ab813 100644 --- a/.clang-format +++ b/.clang-format @@ -240,6 +240,7 @@ ForEachMacros: - 'for_each_set_bit' - 'for_each_set_bit_from' - 'for_each_sg' + - 'for_each_sg_dma_page' - 'for_each_sg_page' - 'for_each_sibling_event' - '__for_each_thread' diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c index 31786b200afc..a3357ff7540d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c @@ -311,7 +311,13 @@ static dma_addr_t __vmw_piter_dma_addr(struct vmw_piter *viter) static dma_addr_t __vmw_piter_sg_addr(struct vmw_piter *viter) { - return sg_page_iter_dma_address(&viter->iter); + /* + * FIXME: This driver wrongly mixes DMA and CPU SG list iteration and + * needs revision. See + * https://lore.kernel.org/lkml/20190104223531.GA1705@ziepe.ca/ + */ + return sg_page_iter_dma_address( + container_of(&viter->iter, struct sg_dma_page_iter, base)); } diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2.c b/drivers/media/pci/intel/ipu3/ipu3-cio2.c index cdb79ae2d8dc..9fbfbda74171 100644 --- a/drivers/media/pci/intel/ipu3/ipu3-cio2.c +++ b/drivers/media/pci/intel/ipu3/ipu3-cio2.c @@ -846,7 +846,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb) unsigned int pages = DIV_ROUND_UP(vb->planes[0].length, CIO2_PAGE_SIZE); unsigned int lops = DIV_ROUND_UP(pages + 1, entries_per_page); struct sg_table *sg; - struct sg_page_iter sg_iter; + struct sg_dma_page_iter sg_iter; int i, j; if (lops <= 0 || lops > CIO2_MAX_LOPS) { @@ -873,7 +873,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb) b->offset = sg->sgl->offset; i = j = 0; - for_each_sg_page(sg->sgl, &sg_iter, sg->nents, 0) { + for_each_sg_dma_page (sg->sgl, &sg_iter, sg->nents, 0) { if (!pages--) break; b->lop[i][j] = sg_page_iter_dma_address(&sg_iter) >> PAGE_SHIFT; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b96f0d0b5b8f..b4be960c7e5d 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -339,12 +339,12 @@ int sg_alloc_table_chained(struct sg_table *table, int nents, /* * sg page iterator * - * Iterates over sg entries page-by-page. On each successful iteration, - * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter) - * to get the current page and its dma address. @piter->sg will point to the - * sg holding this page and @piter->sg_pgoffset to the page's page offset - * within the sg. The iteration will stop either when a maximum number of sg - * entries was reached or a terminating sg (sg_last(sg) == true) was reached. + * Iterates over sg entries page-by-page. On each successful iteration, you + * can call sg_page_iter_page(@piter) to get the current page and its dma + * address. @piter->sg will point to the sg holding this page and + * @piter->sg_pgoffset to the page's page offset within the sg. The iteration + * will stop either when a maximum number of sg entries was reached or a + * terminating sg (sg_last(sg) == true) was reached. */ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ @@ -356,7 +356,19 @@ struct sg_page_iter { * next step */ }; +/* + * sg page iterator for DMA addresses + * + * This is the same as sg_page_iter however you can call + * sg_page_iter_dma_address(@dma_iter) to get the page's DMA + * address. sg_page_iter_page() cannot be called on this iterator. + */ +struct sg_dma_page_iter { + struct sg_page_iter base; +}; + bool __sg_page_iter_next(struct sg_page_iter *piter); +bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); @@ -372,11 +384,13 @@ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. - * @piter: page iterator holding the page + * @dma_iter: page iterator holding the page */ -static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) +static inline dma_addr_t +sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { - return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT); + return sg_dma_address(dma_iter->base.sg) + + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** @@ -385,11 +399,28 @@ static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset + * + * Callers may use sg_page_iter_page() to get each page pointer. */ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) +/** + * for_each_sg_dma_page - iterate over the pages of the given sg list + * @sglist: sglist to iterate over + * @dma_iter: page iterator to hold current page + * @dma_nents: maximum number of sg entries to iterate over, this is the value + * returned from dma_map_sg + * @pgoffset: starting page offset + * + * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + */ +#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ + for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ + pgoffset); \ + __sg_page_iter_dma_next(dma_iter);) + /* * Mapping sg iterator * diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 9ba349e775ef..739dc9fe2c55 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -625,6 +625,32 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) } EXPORT_SYMBOL(__sg_page_iter_next); +static int sg_dma_page_count(struct scatterlist *sg) +{ + return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT; +} + +bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter) +{ + struct sg_page_iter *piter = &dma_iter->base; + + if (!piter->__nents || !piter->sg) + return false; + + piter->sg_pgoffset += piter->__pg_advance; + piter->__pg_advance = 1; + + while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) { + piter->sg_pgoffset -= sg_dma_page_count(piter->sg); + piter->sg = sg_next(piter->sg); + if (!--piter->__nents || !piter->sg) + return false; + } + + return true; +} +EXPORT_SYMBOL(__sg_page_iter_dma_next); + /** * sg_miter_start - start mapping iteration over a sg list * @miter: sg mapping iter to be started -- cgit v1.2.3