summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>2026-03-19 16:15:33 +0100
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2026-03-19 16:15:33 +0100
commit7e2dc8ed7862ac622b5a59953b679de97001dc83 (patch)
treed2d2cf61a22f5a6404000ee007c5e80bc2d9eca9 /arch/arm64
parenta7e8c9cc3a13baf3dcf9734dd55609aa7ff9a1a0 (diff)
parent4a2b0ed2ac7abe9743e1559d212075a0ebac96b3 (diff)
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/include/asm/kvm_host.h3
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h10
-rw-r--r--arch/arm64/kernel/cpufeature.c9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c2
-rw-r--r--arch/arm64/kvm/mmu.c12
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c34
-rw-r--r--arch/arm64/kvm/vgic/vgic-v2.c4
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c12
-rw-r--r--arch/arm64/kvm/vgic/vgic.c6
-rw-r--r--arch/arm64/mm/contpte.c53
-rw-r--r--arch/arm64/mm/mmap.c6
11 files changed, 108 insertions, 43 deletions
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ac7f970c7883..59f25b85be2b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -760,6 +760,9 @@ struct kvm_host_data {
/* Number of debug breakpoints/watchpoints for this CPU (minus 1) */
unsigned int debug_brps;
unsigned int debug_wrps;
+
+ /* Last vgic_irq part of the AP list recorded in an LR */
+ struct vgic_irq *last_lr_irq;
};
struct kvm_host_psci_config {
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index ea6f5458ae2e..6dae631c7132 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -50,11 +50,11 @@
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _PAGE_KERNEL (PROT_NORMAL)
-#define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
-#define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY)
-#define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN)
-#define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
+#define _PAGE_KERNEL (PROT_NORMAL | PTE_DIRTY)
+#define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY | PTE_DIRTY)
+#define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY | PTE_DIRTY)
+#define _PAGE_KERNEL_EXEC ((PROT_NORMAL & ~PTE_PXN) | PTE_DIRTY)
+#define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT | PTE_DIRTY)
#define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
#define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c840a93b9ef9..f5bbcdf2795c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2336,6 +2336,15 @@ static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry,
if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY))
return true;
+ /*
+ * pKVM prevents late onlining of CPUs. This means that whatever
+ * state the capability is in after deprivilege cannot be affected
+ * by a new CPU booting -- this is garanteed to be a CPU we have
+ * already seen, and the cap is therefore unchanged.
+ */
+ if (system_capabilities_finalized() && is_protected_kvm_enabled())
+ return cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR);
+
if (is_kernel_in_hyp_mode())
res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2);
else
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 49db32f3ddf7..ece04bb10ab0 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -516,7 +516,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
- if (!range_included(&cur, range))
+ if (!range_included(&cur, range) && level < KVM_PGTABLE_LAST_LEVEL)
continue;
*range = cur;
return 0;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2caa97f87890..9f22d60b64db 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1753,14 +1753,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
/*
- * Both the canonical IPA and fault IPA must be hugepage-aligned to
- * ensure we find the right PFN and lay down the mapping in the right
- * place.
+ * Both the canonical IPA and fault IPA must be aligned to the
+ * mapping size to ensure we find the right PFN and lay down the
+ * mapping in the right place.
*/
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
- fault_ipa &= ~(vma_pagesize - 1);
- ipa &= ~(vma_pagesize - 1);
- }
+ fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
+ ipa = ALIGN_DOWN(ipa, vma_pagesize);
gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index dc9f9db31026..6ed01c7faa91 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -140,26 +140,9 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
goto out_unlock;
}
- kvm_for_each_vcpu(i, vcpu, kvm) {
- ret = vgic_allocate_private_irqs_locked(vcpu, type);
- if (ret)
- break;
- }
-
- if (ret) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- kfree(vgic_cpu->private_irqs);
- vgic_cpu->private_irqs = NULL;
- }
-
- goto out_unlock;
- }
-
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
-
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
@@ -176,6 +159,23 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = vgic_allocate_private_irqs_locked(vcpu, type);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ kfree(vgic_cpu->private_irqs);
+ vgic_cpu->private_irqs = NULL;
+ }
+
+ kvm->arch.vgic.vgic_model = 0;
+ goto out_unlock;
+ }
+
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 585491fbda80..cafa3cb32bda 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -115,7 +115,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr);
- struct vgic_irq *irq;
+ struct vgic_irq *irq = *host_data_ptr(last_lr_irq);
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -123,7 +123,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]);
/* See the GICv3 equivalent for the EOIcount handling rationale */
- list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ list_for_each_entry_continue(irq, &vgic_cpu->ap_list_head, ap_list) {
u32 lr;
if (!eoicount) {
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 1d6dd1b545bd..db2f220c29f7 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -148,7 +148,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr);
- struct vgic_irq *irq;
+ struct vgic_irq *irq = *host_data_ptr(last_lr_irq);
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -158,12 +158,12 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
/*
* EOIMode=0: use EOIcount to emulate deactivation. We are
* guaranteed to deactivate in reverse order of the activation, so
- * just pick one active interrupt after the other in the ap_list,
- * and replay the deactivation as if the CPU was doing it. We also
- * rely on priority drop to have taken place, and the list to be
- * sorted by priority.
+ * just pick one active interrupt after the other in the tail part
+ * of the ap_list, past the LRs, and replay the deactivation as if
+ * the CPU was doing it. We also rely on priority drop to have taken
+ * place, and the list to be sorted by priority.
*/
- list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ list_for_each_entry_continue(irq, &vgic_cpu->ap_list_head, ap_list) {
u64 lr;
/*
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 430aa98888fd..e22b79cfff96 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -814,6 +814,9 @@ retry:
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
{
+ if (!*host_data_ptr(last_lr_irq))
+ return;
+
if (kvm_vgic_global_state.type == VGIC_V2)
vgic_v2_fold_lr_state(vcpu);
else
@@ -960,10 +963,13 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
if (irqs_outside_lrs(&als))
vgic_sort_ap_list(vcpu);
+ *host_data_ptr(last_lr_irq) = NULL;
+
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
scoped_guard(raw_spinlock, &irq->irq_lock) {
if (likely(vgic_target_oracle(irq) == vcpu)) {
vgic_populate_lr(vcpu, irq, count++);
+ *host_data_ptr(last_lr_irq) = irq;
}
}
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 589bcf878938..cb6cd58deda9 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -581,6 +581,27 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
+static bool contpte_all_subptes_match_access_flags(pte_t *ptep, pte_t entry)
+{
+ pte_t *cont_ptep = contpte_align_down(ptep);
+ /*
+ * PFNs differ per sub-PTE. Match only bits consumed by
+ * __ptep_set_access_flags(): AF, DIRTY and write permission.
+ */
+ const pteval_t cmp_mask = PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
+ pteval_t entry_cmp = pte_val(entry) & cmp_mask;
+ int i;
+
+ for (i = 0; i < CONT_PTES; i++) {
+ pteval_t pte_cmp = pte_val(__ptep_get(cont_ptep + i)) & cmp_mask;
+
+ if (pte_cmp != entry_cmp)
+ return false;
+ }
+
+ return true;
+}
+
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty)
@@ -590,14 +611,38 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
int i;
/*
- * Gather the access/dirty bits for the contiguous range. If nothing has
- * changed, its a noop.
+ * Check whether all sub-PTEs in the CONT block already match the
+ * requested access flags/write permission, using raw per-PTE values
+ * rather than the gathered ptep_get() view.
+ *
+ * __ptep_set_access_flags() can update AF, dirty and write
+ * permission, but only to make the mapping more permissive.
+ *
+ * ptep_get() gathers AF/dirty state across the whole CONT block,
+ * which is correct for a CPU with FEAT_HAFDBS. But page-table
+ * walkers that evaluate each descriptor individually (e.g. a CPU
+ * without DBM support, or an SMMU without HTTU, or with HA/HD
+ * disabled in CD.TCR) can keep faulting on the target sub-PTE if
+ * only a sibling has been updated. Gathering can therefore cause
+ * false no-ops when only a sibling has been updated:
+ * - write faults: target still has PTE_RDONLY (needs PTE_RDONLY cleared)
+ * - read faults: target still lacks PTE_AF
+ *
+ * Per Arm ARM (DDI 0487) D8.7.1, any sub-PTE in a CONT range may
+ * become the effective cached translation, so all entries must have
+ * consistent attributes. Check the full CONT block before returning
+ * no-op, and when any sub-PTE mismatches, proceed to update the whole
+ * range.
*/
- orig_pte = pte_mknoncont(ptep_get(ptep));
- if (pte_val(orig_pte) == pte_val(entry))
+ if (contpte_all_subptes_match_access_flags(ptep, entry))
return 0;
/*
+ * Use raw target pte (not gathered) for write-bit unfold decision.
+ */
+ orig_pte = pte_mknoncont(__ptep_get(ptep));
+
+ /*
* We can fix up access/dirty bits without having to unfold the contig
* range. But if the write bit is changing, we must unfold.
*/
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 75f343009b4b..92b2f5097a96 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -91,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
/* Short circuit GCS to avoid bloating the table. */
if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
- prot = gcs_page_prot;
+ /* Honour mprotect(PROT_NONE) on shadow stack mappings */
+ if (vm_flags & VM_ACCESS_FLAGS)
+ prot = gcs_page_prot;
+ else
+ prot = pgprot_val(protection_map[VM_NONE]);
} else {
prot = pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);