From 6d192b4f2d644d15d9a9f1d33dab05af936f6540 Mon Sep 17 00:00:00 2001 From: Jonathan Cavitt Date: Tue, 24 Mar 2026 15:29:37 +0000 Subject: drm/xe/xe_pagefault: Disallow writes to read-only VMAs The page fault handler should reject write/atomic access to read only VMAs. Add code to handle this in xe_pagefault_service after the VMA lookup. v2: - Apply max line length (Matthew) Fixes: fb544b844508 ("drm/xe: Implement xe_pagefault_queue_work") Signed-off-by: Jonathan Cavitt Suggested-by: Matthew Brost Cc: Shuicheng Lin Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20260324152935.72444-7-jonathan.cavitt@intel.com (cherry picked from commit 714ee6754ac5fa3dc078856a196a6b124cd797a0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pagefault.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_pagefault.c b/drivers/gpu/drm/xe/xe_pagefault.c index 6bee53d6ffc3..922a4f3344b1 100644 --- a/drivers/gpu/drm/xe/xe_pagefault.c +++ b/drivers/gpu/drm/xe/xe_pagefault.c @@ -187,6 +187,12 @@ static int xe_pagefault_service(struct xe_pagefault *pf) goto unlock_vm; } + if (xe_vma_read_only(vma) && + pf->consumer.access_type != XE_PAGEFAULT_ACCESS_TYPE_READ) { + err = -EPERM; + goto unlock_vm; + } + atomic = xe_pagefault_access_is_atomic(pf->consumer.access_type); if (xe_vma_is_cpu_addr_mirror(vma)) -- cgit v1.2.3 From 7b9a3a910e96f20b101b0df6b1f5c77b023a4097 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 26 Mar 2026 18:38:38 +0530 Subject: drm/xe/madvise: Accept canonical GPU addresses in xe_vm_madvise_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace passes canonical (sign-extended) GPU addresses where bits 63:48 mirror bit 47. The internal GPUVM uses non-canonical form (upper bits zeroed), so passing raw canonical addresses into GPUVM lookups causes mismatches for addresses above 128TiB. Strip the sign extension with xe_device_uncanonicalize_addr() at the top of xe_vm_madvise_ioctl(). Non-canonical addresses are unaffected. Fixes: ada7486c5668 ("drm/xe: Implement madvise ioctl for xe") Suggested-by: Matthew Brost Cc: Thomas Hellström Reviewed-by: Matthew Brost Signed-off-by: Himal Prasad Ghimiray Signed-off-by: Arvind Yadav Signed-off-by: Matthew Brost Link: https://patch.msgid.link/20260326130843.3545241-13-arvind.yadav@intel.com (cherry picked from commit 05c8b1cdc54036465ea457a0501a8c2f9409fce7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_vm_madvise.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c b/drivers/gpu/drm/xe/xe_vm_madvise.c index bc39a9a9790c..b4086129a364 100644 --- a/drivers/gpu/drm/xe/xe_vm_madvise.c +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c @@ -408,8 +408,15 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil struct xe_device *xe = to_xe_device(dev); struct xe_file *xef = to_xe_file(file); struct drm_xe_madvise *args = data; - struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start, - .range = args->range, }; + struct xe_vmas_in_madvise_range madvise_range = { + /* + * Userspace may pass canonical (sign-extended) addresses. + * Strip the sign extension to get the internal non-canonical + * form used by the GPUVM, matching xe_vm_bind_ioctl() behavior. + */ + .addr = xe_device_uncanonicalize_addr(xe, args->start), + .range = args->range, + }; struct xe_madvise_details details; struct xe_vm *vm; struct drm_exec exec; @@ -439,7 +446,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil if (err) goto unlock_vm; - err = xe_vm_alloc_madvise_vma(vm, args->start, args->range); + err = xe_vm_alloc_madvise_vma(vm, madvise_range.addr, args->range); if (err) goto madv_fini; @@ -482,7 +489,8 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil madvise_funcs[attr_type](xe, vm, madvise_range.vmas, madvise_range.num_vmas, args, &details); - err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + args->range); + err = xe_vm_invalidate_madvise_range(vm, madvise_range.addr, + madvise_range.addr + args->range); if (madvise_range.has_svm_userptr_vmas) xe_svm_notifier_unlock(vm); -- cgit v1.2.3 From e2628e670bb0923fcdc00828bfcd67b26a7df020 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 24 Mar 2026 08:37:20 -0700 Subject: drm/xe/pxp: Clean up termination status on failure If the PXP HW termination fails during PXP start, the normal completion code won't be called, so the termination will remain uncomplete. To avoid unnecessary waits, mark the termination as completed from the error path. Note that we already do this if the termination fails when handling a termination irq from the HW. Fixes: f8caa80154c4 ("drm/xe/pxp: Add PXP queue tracking and session start") Signed-off-by: Daniele Ceraolo Spurio Cc: Alan Previn Teres Alexis Cc: Julia Filipchuk Reviewed-by: Julia Filipchuk Link: https://patch.msgid.link/20260324153718.3155504-7-daniele.ceraolospurio@intel.com (cherry picked from commit 5d9e708d2a69ab1f64a17aec810cd7c70c5b9fab) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pxp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/xe/xe_pxp.c b/drivers/gpu/drm/xe/xe_pxp.c index d61446bf9c19..782281dbe9e0 100644 --- a/drivers/gpu/drm/xe/xe_pxp.c +++ b/drivers/gpu/drm/xe/xe_pxp.c @@ -583,6 +583,7 @@ wait_for_idle: drm_err(&pxp->xe->drm, "PXP termination failed before start\n"); mutex_lock(&pxp->mutex); pxp->status = XE_PXP_ERROR; + complete_all(&pxp->termination); goto out_unlock; } -- cgit v1.2.3 From 4fed244954c2dc9aafa333d08f66b14345225e03 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 24 Mar 2026 08:37:21 -0700 Subject: drm/xe/pxp: Remove incorrect handling of impossible state during suspend The default case of the PXP suspend switch is incorrectly exiting without releasing the lock. However, this case is impossible to hit because we're switching on an enum and all the valid enum values have their own cases. Therefore, we can just get rid of the default case and rely on the compiler to warn us if a new enum value is added and we forget to add it to the switch. Fixes: 51462211f4a9 ("drm/xe/pxp: add PXP PM support") Signed-off-by: Daniele Ceraolo Spurio Cc: Alan Previn Teres Alexis Cc: Julia Filipchuk Reviewed-by: Julia Filipchuk Link: https://patch.msgid.link/20260324153718.3155504-8-daniele.ceraolospurio@intel.com (cherry picked from commit f1b5a77fc9b6a90cd9a5e3db9d4c73ae1edfcfac) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pxp.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_pxp.c b/drivers/gpu/drm/xe/xe_pxp.c index 782281dbe9e0..fa82d606953e 100644 --- a/drivers/gpu/drm/xe/xe_pxp.c +++ b/drivers/gpu/drm/xe/xe_pxp.c @@ -871,11 +871,6 @@ wait_for_activation: pxp->key_instance++; needs_queue_inval = true; break; - default: - drm_err(&pxp->xe->drm, "unexpected state during PXP suspend: %u", - pxp->status); - ret = -EIO; - goto out; } /* @@ -900,7 +895,6 @@ wait_for_activation: pxp->last_suspend_key_instance = pxp->key_instance; -out: return ret; } -- cgit v1.2.3 From 76903b2057c8677c2c006e87fede15f496555dc0 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 24 Mar 2026 08:37:22 -0700 Subject: drm/xe/pxp: Clear restart flag in pxp_start after jumping back If we don't clear the flag we'll keep jumping back at the beginning of the function once we reach the end. Fixes: ccd3c6820a90 ("drm/xe/pxp: Decouple queue addition from PXP start") Signed-off-by: Daniele Ceraolo Spurio Cc: Julia Filipchuk Reviewed-by: Julia Filipchuk Link: https://patch.msgid.link/20260324153718.3155504-9-daniele.ceraolospurio@intel.com (cherry picked from commit 0850ec7bb2459602351639dccf7a68a03c9d1ee0) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pxp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_pxp.c b/drivers/gpu/drm/xe/xe_pxp.c index fa82d606953e..088a1ab75f70 100644 --- a/drivers/gpu/drm/xe/xe_pxp.c +++ b/drivers/gpu/drm/xe/xe_pxp.c @@ -512,7 +512,7 @@ static int __exec_queue_add(struct xe_pxp *pxp, struct xe_exec_queue *q) static int pxp_start(struct xe_pxp *pxp, u8 type) { int ret = 0; - bool restart = false; + bool restart; if (!xe_pxp_is_enabled(pxp)) return -ENODEV; @@ -541,6 +541,8 @@ wait_for_idle: msecs_to_jiffies(PXP_ACTIVATION_TIMEOUT_MS))) return -ETIMEDOUT; + restart = false; + mutex_lock(&pxp->mutex); /* If PXP is not already active, turn it on */ -- cgit v1.2.3 From e3fb579872a8d9cf264c52710d5839de3afa6fc1 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Tue, 24 Mar 2026 08:37:23 -0700 Subject: drm/xe/pxp: Don't allow PXP on older PTL GSC FWs On PTL, older GSC FWs have a bug that can cause them to crash during PXP invalidation events, which leads to a complete loss of power management on the media GT. Therefore, we can't use PXP on FWs that have this bug, which was fixed in PTL GSC build 1396. Fixes: b1dcec9bd8a1 ("drm/xe/ptl: Enable PXP for PTL") Signed-off-by: Daniele Ceraolo Spurio Cc: Julia Filipchuk Reviewed-by: Julia Filipchuk Acked-by: Rodrigo Vivi Link: https://patch.msgid.link/20260324153718.3155504-10-daniele.ceraolospurio@intel.com (cherry picked from commit 6eb04caaa972934c9b6cea0e0c29e466bf9a346f) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_pxp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/gpu/drm/xe/xe_pxp.c b/drivers/gpu/drm/xe/xe_pxp.c index 088a1ab75f70..872217f30375 100644 --- a/drivers/gpu/drm/xe/xe_pxp.c +++ b/drivers/gpu/drm/xe/xe_pxp.c @@ -380,6 +380,18 @@ int xe_pxp_init(struct xe_device *xe) return 0; } + /* + * On PTL, older GSC FWs have a bug that can cause them to crash during + * PXP invalidation events, which leads to a complete loss of power + * management on the media GT. Therefore, we can't use PXP on FWs that + * have this bug, which was fixed in PTL GSC build 1396. + */ + if (xe->info.platform == XE_PANTHERLAKE && + gt->uc.gsc.fw.versions.found[XE_UC_FW_VER_RELEASE].build < 1396) { + drm_info(&xe->drm, "PXP requires PTL GSC build 1396 or newer\n"); + return 0; + } + pxp = drmm_kzalloc(&xe->drm, sizeof(struct xe_pxp), GFP_KERNEL); if (!pxp) { err = -ENOMEM; -- cgit v1.2.3 From bce7cd6db2386e7a2b49ad3c09df0beabddfa3c4 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 26 Feb 2026 17:52:25 -0800 Subject: drm/xe: Disable garbage collector work item on SVM close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an SVM is closed, the garbage collector work item must be stopped synchronously and any future queuing must be prevented. Replace flush_work() with disable_work_sync() to ensure both conditions are met. Fixes: 63f6e480d115 ("drm/xe: Add SVM garbage collector") Cc: stable@vger.kernel.org Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patch.msgid.link/20260227015225.3081787-1-matthew.brost@intel.com (cherry picked from commit 2247feb9badca5a4774df9a437bfc44fba4f22de) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index ca67d0bdbfac..c6b92d4ea6f4 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -903,7 +903,7 @@ int xe_svm_init(struct xe_vm *vm) void xe_svm_close(struct xe_vm *vm) { xe_assert(vm->xe, xe_vm_is_closed(vm)); - flush_work(&vm->svm.garbage_collector.work); + disable_work_sync(&vm->svm.garbage_collector.work); xe_svm_put_pagemaps(vm); drm_pagemap_release_owner(&vm->svm.peer); } -- cgit v1.2.3 From 56b7432b7e8e6ae1b289cb405d16db4150ef193b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Thu, 26 Mar 2026 14:01:15 -0700 Subject: drm/xe: Avoid memory allocations in xe_device_declare_wedged() xe_device_declare_wedged() runs in the DMA-fence signaling path, where GFP_KERNEL memory allocations are not allowed. However, registering xe_device_wedged_fini via drmm_add_action_or_reset() triggers a GFP_KERNEL allocation. Fix this by deferring the registration of xe_device_wedged_fini until late in the driver load sequence. Additionally, drop the wedged PM reference only if the device is actually wedged in xe_device_wedged_fini. Fixes: 452bca0edbd0 ("drm/xe: Don't suspend device upon wedge") Signed-off-by: Matthew Brost Reviewed-by: Rodrigo Vivi Link: https://patch.msgid.link/20260326210116.202585-2-matthew.brost@intel.com (cherry picked from commit b08ceb443866808b881b12d4183008d214d816c1) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/xe/xe_device.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c index 52ee24e9cd3a..3eb06b27db7e 100644 --- a/drivers/gpu/drm/xe/xe_device.c +++ b/drivers/gpu/drm/xe/xe_device.c @@ -837,6 +837,14 @@ static void detect_preproduction_hw(struct xe_device *xe) } } +static void xe_device_wedged_fini(struct drm_device *drm, void *arg) +{ + struct xe_device *xe = arg; + + if (atomic_read(&xe->wedged.flag)) + xe_pm_runtime_put(xe); +} + int xe_device_probe(struct xe_device *xe) { struct xe_tile *tile; @@ -1013,6 +1021,10 @@ int xe_device_probe(struct xe_device *xe) detect_preproduction_hw(xe); + err = drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe); + if (err) + goto err_unregister_display; + return devm_add_action_or_reset(xe->drm.dev, xe_device_sanitize, xe); err_unregister_display: @@ -1216,13 +1228,6 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address) return address & GENMASK_ULL(xe->info.va_bits - 1, 0); } -static void xe_device_wedged_fini(struct drm_device *drm, void *arg) -{ - struct xe_device *xe = arg; - - xe_pm_runtime_put(xe); -} - /** * DOC: Xe Device Wedging * @@ -1300,15 +1305,9 @@ void xe_device_declare_wedged(struct xe_device *xe) return; } - xe_pm_runtime_get_noresume(xe); - - if (drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe)) { - drm_err(&xe->drm, "Failed to register xe_device_wedged_fini clean-up. Although device is wedged.\n"); - return; - } - if (!atomic_xchg(&xe->wedged.flag, 1)) { xe->needs_flr_on_fini = true; + xe_pm_runtime_get_noresume(xe); drm_err(&xe->drm, "CRITICAL: Xe has declared device %s as wedged.\n" "IOCTLs and executions are blocked. Only a rebind may clear the failure\n" -- cgit v1.2.3