author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2026-03-19 16:15:33 +0100
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2026-03-19 16:15:33 +0100
commit    7e2dc8ed7862ac622b5a59953b679de97001dc83
tree      d2d2cf61a22f5a6404000ee007c5e80bc2d9eca9
parent    a7e8c9cc3a13baf3dcf9734dd55609aa7ff9a1a0
parent    4a2b0ed2ac7abe9743e1559d212075a0ebac96b3
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'arch')
-rw-r--r-- arch/arm64/include/asm/kvm_host.h | 3
-rw-r--r-- arch/arm64/include/asm/pgtable-prot.h | 10
-rw-r--r-- arch/arm64/kernel/cpufeature.c | 9
-rw-r--r-- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2
-rw-r--r-- arch/arm64/kvm/mmu.c | 12
-rw-r--r-- arch/arm64/kvm/vgic/vgic-init.c | 34
-rw-r--r-- arch/arm64/kvm/vgic/vgic-v2.c | 4
-rw-r--r-- arch/arm64/kvm/vgic/vgic-v3.c | 12
-rw-r--r-- arch/arm64/kvm/vgic/vgic.c | 6
-rw-r--r-- arch/arm64/mm/contpte.c | 53
-rw-r--r-- arch/arm64/mm/mmap.c | 6
-rw-r--r-- arch/parisc/include/asm/pgtable.h | 2
-rw-r--r-- arch/parisc/kernel/head.S | 7
-rw-r--r-- arch/parisc/kernel/setup.c | 20
-rw-r--r-- arch/powerpc/include/asm/uaccess.h | 2
-rw-r--r-- arch/powerpc/kexec/core.c | 17
-rw-r--r-- arch/powerpc/kexec/file_load_64.c | 14
-rw-r--r-- arch/powerpc/net/bpf_jit_comp.c | 30
-rw-r--r-- arch/powerpc/net/bpf_jit_comp64.c | 101
-rw-r--r-- arch/powerpc/perf/callchain.c | 5
-rw-r--r-- arch/powerpc/perf/callchain_32.c | 1
-rw-r--r-- arch/powerpc/perf/callchain_64.c | 1
-rw-r--r-- arch/powerpc/platforms/83xx/km83xx.c | 4
-rw-r--r-- arch/powerpc/platforms/pseries/msi.c | 2
-rw-r--r-- arch/s390/include/asm/processor.h | 2
-rw-r--r-- arch/s390/lib/xor.c | 5
-rw-r--r-- arch/s390/mm/pfault.c | 4
-rw-r--r-- arch/x86/include/asm/kvm_host.h | 3
-rw-r--r-- arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r-- arch/x86/kernel/apic/apic.c | 6
-rw-r--r-- arch/x86/kvm/svm/avic.c | 9
-rw-r--r-- arch/x86/kvm/svm/svm.c | 9
-rw-r--r-- arch/x86/kvm/vmx/nested.c | 22
33 files changed, 314 insertions, 104 deletions
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index ac7f970c7883..59f25b85be2b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -760,6 +760,9 @@ struct kvm_host_data {
/* Number of debug breakpoints/watchpoints for this CPU (minus 1) */
unsigned int debug_brps;
unsigned int debug_wrps;
+
+ /* Last vgic_irq part of the AP list recorded in an LR */
+ struct vgic_irq *last_lr_irq;
};
struct kvm_host_psci_config {
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index ea6f5458ae2e..6dae631c7132 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -50,11 +50,11 @@
#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
-#define _PAGE_KERNEL (PROT_NORMAL)
-#define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
-#define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY)
-#define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN)
-#define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
+#define _PAGE_KERNEL (PROT_NORMAL | PTE_DIRTY)
+#define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY | PTE_DIRTY)
+#define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY | PTE_DIRTY)
+#define _PAGE_KERNEL_EXEC ((PROT_NORMAL & ~PTE_PXN) | PTE_DIRTY)
+#define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT | PTE_DIRTY)
#define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
#define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c840a93b9ef9..f5bbcdf2795c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2336,6 +2336,15 @@ static bool can_trap_icv_dir_el1(const struct arm64_cpu_capabilities *entry,
if (this_cpu_has_cap(ARM64_HAS_GICV5_LEGACY))
return true;
+ /*
+ * pKVM prevents late onlining of CPUs. This means that whatever
+ * state the capability is in after deprivilege cannot be affected
+ * by a new CPU booting -- this is guaranteed to be a CPU we have
+ * already seen, and the cap is therefore unchanged.
+ */
+ if (system_capabilities_finalized() && is_protected_kvm_enabled())
+ return cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR);
+
if (is_kernel_in_hyp_mode())
res.a1 = read_sysreg_s(SYS_ICH_VTR_EL2);
else
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 49db32f3ddf7..ece04bb10ab0 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -516,7 +516,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
granule = kvm_granule_size(level);
cur.start = ALIGN_DOWN(addr, granule);
cur.end = cur.start + granule;
- if (!range_included(&cur, range))
+ if (!range_included(&cur, range) && level < KVM_PGTABLE_LAST_LEVEL)
continue;
*range = cur;
return 0;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 2caa97f87890..9f22d60b64db 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1753,14 +1753,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
/*
- * Both the canonical IPA and fault IPA must be hugepage-aligned to
- * ensure we find the right PFN and lay down the mapping in the right
- * place.
+ * Both the canonical IPA and fault IPA must be aligned to the
+ * mapping size to ensure we find the right PFN and lay down the
+ * mapping in the right place.
*/
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
- fault_ipa &= ~(vma_pagesize - 1);
- ipa &= ~(vma_pagesize - 1);
- }
+ fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
+ ipa = ALIGN_DOWN(ipa, vma_pagesize);
gfn = ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index dc9f9db31026..6ed01c7faa91 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -140,26 +140,9 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
goto out_unlock;
}
- kvm_for_each_vcpu(i, vcpu, kvm) {
- ret = vgic_allocate_private_irqs_locked(vcpu, type);
- if (ret)
- break;
- }
-
- if (ret) {
- kvm_for_each_vcpu(i, vcpu, kvm) {
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- kfree(vgic_cpu->private_irqs);
- vgic_cpu->private_irqs = NULL;
- }
-
- goto out_unlock;
- }
-
kvm->arch.vgic.in_kernel = true;
kvm->arch.vgic.vgic_model = type;
kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
-
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
@@ -176,6 +159,23 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ ret = vgic_allocate_private_irqs_locked(vcpu, type);
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ kfree(vgic_cpu->private_irqs);
+ vgic_cpu->private_irqs = NULL;
+ }
+
+ kvm->arch.vgic.vgic_model = 0;
+ goto out_unlock;
+ }
+
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 585491fbda80..cafa3cb32bda 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -115,7 +115,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr);
- struct vgic_irq *irq;
+ struct vgic_irq *irq = *host_data_ptr(last_lr_irq);
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -123,7 +123,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]);
/* See the GICv3 equivalent for the EOIcount handling rationale */
- list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ list_for_each_entry_continue(irq, &vgic_cpu->ap_list_head, ap_list) {
u32 lr;
if (!eoicount) {
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 1d6dd1b545bd..db2f220c29f7 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -148,7 +148,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr);
- struct vgic_irq *irq;
+ struct vgic_irq *irq = *host_data_ptr(last_lr_irq);
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
@@ -158,12 +158,12 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
/*
* EOIMode=0: use EOIcount to emulate deactivation. We are
* guaranteed to deactivate in reverse order of the activation, so
- * just pick one active interrupt after the other in the ap_list,
- * and replay the deactivation as if the CPU was doing it. We also
- * rely on priority drop to have taken place, and the list to be
- * sorted by priority.
+ * just pick one active interrupt after the other in the tail part
+ * of the ap_list, past the LRs, and replay the deactivation as if
+ * the CPU was doing it. We also rely on priority drop to have taken
+ * place, and the list to be sorted by priority.
*/
- list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ list_for_each_entry_continue(irq, &vgic_cpu->ap_list_head, ap_list) {
u64 lr;
/*
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 430aa98888fd..e22b79cfff96 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -814,6 +814,9 @@ retry:
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
{
+ if (!*host_data_ptr(last_lr_irq))
+ return;
+
if (kvm_vgic_global_state.type == VGIC_V2)
vgic_v2_fold_lr_state(vcpu);
else
@@ -960,10 +963,13 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
if (irqs_outside_lrs(&als))
vgic_sort_ap_list(vcpu);
+ *host_data_ptr(last_lr_irq) = NULL;
+
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
scoped_guard(raw_spinlock, &irq->irq_lock) {
if (likely(vgic_target_oracle(irq) == vcpu)) {
vgic_populate_lr(vcpu, irq, count++);
+ *host_data_ptr(last_lr_irq) = irq;
}
}
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 589bcf878938..cb6cd58deda9 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -581,6 +581,27 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
+static bool contpte_all_subptes_match_access_flags(pte_t *ptep, pte_t entry)
+{
+ pte_t *cont_ptep = contpte_align_down(ptep);
+ /*
+ * PFNs differ per sub-PTE. Match only bits consumed by
+ * __ptep_set_access_flags(): AF, DIRTY and write permission.
+ */
+ const pteval_t cmp_mask = PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
+ pteval_t entry_cmp = pte_val(entry) & cmp_mask;
+ int i;
+
+ for (i = 0; i < CONT_PTES; i++) {
+ pteval_t pte_cmp = pte_val(__ptep_get(cont_ptep + i)) & cmp_mask;
+
+ if (pte_cmp != entry_cmp)
+ return false;
+ }
+
+ return true;
+}
+
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty)
@@ -590,14 +611,38 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
int i;
/*
- * Gather the access/dirty bits for the contiguous range. If nothing has
- * changed, its a noop.
+ * Check whether all sub-PTEs in the CONT block already match the
+ * requested access flags/write permission, using raw per-PTE values
+ * rather than the gathered ptep_get() view.
+ *
+ * __ptep_set_access_flags() can update AF, dirty and write
+ * permission, but only to make the mapping more permissive.
+ *
+ * ptep_get() gathers AF/dirty state across the whole CONT block,
+ * which is correct for a CPU with FEAT_HAFDBS. But page-table
+ * walkers that evaluate each descriptor individually (e.g. a CPU
+ * without DBM support, or an SMMU without HTTU, or with HA/HD
+ * disabled in CD.TCR) can keep faulting on the target sub-PTE
+ * when only a sibling has been updated. Gathering can therefore
+ * cause false no-ops:
+ * - write faults: target still has PTE_RDONLY (needs PTE_RDONLY cleared)
+ * - read faults: target still lacks PTE_AF
+ *
+ * Per Arm ARM (DDI 0487) D8.7.1, any sub-PTE in a CONT range may
+ * become the effective cached translation, so all entries must have
+ * consistent attributes. Check the full CONT block before returning
+ * no-op, and when any sub-PTE mismatches, proceed to update the whole
+ * range.
*/
- orig_pte = pte_mknoncont(ptep_get(ptep));
- if (pte_val(orig_pte) == pte_val(entry))
+ if (contpte_all_subptes_match_access_flags(ptep, entry))
return 0;
/*
+ * Use raw target pte (not gathered) for write-bit unfold decision.
+ */
+ orig_pte = pte_mknoncont(__ptep_get(ptep));
+
+ /*
* We can fix up access/dirty bits without having to unfold the contig
* range. But if the write bit is changing, we must unfold.
*/
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 75f343009b4b..92b2f5097a96 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -91,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
/* Short circuit GCS to avoid bloating the table. */
if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
- prot = gcs_page_prot;
+ /* Honour mprotect(PROT_NONE) on shadow stack mappings */
+ if (vm_flags & VM_ACCESS_FLAGS)
+ prot = gcs_page_prot;
+ else
+ prot = pgprot_val(protection_map[VM_NONE]);
} else {
prot = pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 2c139a4dbf4b..17afe7a59edf 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -85,7 +85,7 @@ extern void __update_cache(pte_t pte);
printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, (unsigned long)pgd_val(e))
/* This is the size of the initially mapped kernel memory */
-#if defined(CONFIG_64BIT)
+#if defined(CONFIG_64BIT) || defined(CONFIG_KALLSYMS)
#define KERNEL_INITIAL_ORDER 26 /* 1<<26 = 64MB */
#else
#define KERNEL_INITIAL_ORDER 25 /* 1<<25 = 32MB */
diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S
index 96e0264ac961..9188c8d87437 100644
--- a/arch/parisc/kernel/head.S
+++ b/arch/parisc/kernel/head.S
@@ -56,6 +56,7 @@ ENTRY(parisc_kernel_start)
.import __bss_start,data
.import __bss_stop,data
+ .import _end,data
load32 PA(__bss_start),%r3
load32 PA(__bss_stop),%r4
@@ -149,7 +150,11 @@ $cpu_ok:
* everything ... it will get remapped correctly later */
ldo 0+_PAGE_KERNEL_RWX(%r0),%r3 /* Hardwired 0 phys addr start */
load32 (1<<(KERNEL_INITIAL_ORDER-PAGE_SHIFT)),%r11 /* PFN count */
- load32 PA(pg0),%r1
+ load32 PA(_end),%r1
+ SHRREG %r1,PAGE_SHIFT,%r1 /* %r1 is PFN count for _end symbol */
+ cmpb,<<,n %r11,%r1,1f
+ copy %r1,%r11 /* %r1 PFN count smaller than %r11 */
+1: load32 PA(pg0),%r1
$pgt_fill_loop:
STREGM %r3,ASM_PTE_ENTRY_SIZE(%r1)
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index ace483b6f19a..d3e17a7a8901 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -120,14 +120,6 @@ void __init setup_arch(char **cmdline_p)
#endif
printk(KERN_CONT ".\n");
- /*
- * Check if initial kernel page mappings are sufficient.
- * panic early if not, else we may access kernel functions
- * and variables which can't be reached.
- */
- if (__pa((unsigned long) &_end) >= KERNEL_INITIAL_SIZE)
- panic("KERNEL_INITIAL_ORDER too small!");
-
#ifdef CONFIG_64BIT
if(parisc_narrow_firmware) {
printk(KERN_INFO "Kernel is using PDC in 32-bit mode.\n");
@@ -279,6 +271,18 @@ void __init start_parisc(void)
int ret, cpunum;
struct pdc_coproc_cfg coproc_cfg;
+ /*
+ * Check if initial kernel page mapping is sufficient.
+ * Print warning if not, because we may access kernel functions and
+ * variables which can't be reached yet through the initial mappings.
+ * Note that the panic() and printk() functions are not functional
+ * yet, so we need to use direct iodc() firmware calls instead.
+ */
+ const char warn1[] = "CRITICAL: Kernel may crash because "
+ "KERNEL_INITIAL_ORDER is too small.\n";
+ if (__pa((unsigned long) &_end) >= KERNEL_INITIAL_SIZE)
+ pdc_iodc_print(warn1, sizeof(warn1) - 1);
+
/* check QEMU/SeaBIOS marker in PAGE0 */
running_on_qemu = (memcmp(&PAGE0->pad0, "SeaBIOS", 8) == 0);
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 3e622e647d62..f77c503ecc10 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -253,7 +253,7 @@ __gus_failed: \
".section .fixup,\"ax\"\n" \
"4: li %0,%3\n" \
" li %1,0\n" \
- " li %1+1,0\n" \
+ " li %L1,0\n" \
" b 3b\n" \
".previous\n" \
EX_TABLE(1b, 4b) \
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 104c05520bf0..dc44f11be353 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -23,6 +23,7 @@
#include <asm/firmware.h>
#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG)
+#define __be_word __PASTE(__be, BITS_PER_LONG)
#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
@@ -146,25 +147,25 @@ int __init overlaps_crashkernel(unsigned long start, unsigned long size)
}
/* Values we need to export to the second kernel via the device tree. */
-static phys_addr_t crashk_base;
-static phys_addr_t crashk_size;
-static unsigned long long mem_limit;
+static __be_word crashk_base;
+static __be_word crashk_size;
+static __be_word mem_limit;
static struct property crashk_base_prop = {
.name = "linux,crashkernel-base",
- .length = sizeof(phys_addr_t),
+ .length = sizeof(__be_word),
.value = &crashk_base
};
static struct property crashk_size_prop = {
.name = "linux,crashkernel-size",
- .length = sizeof(phys_addr_t),
+ .length = sizeof(__be_word),
.value = &crashk_size,
};
static struct property memory_limit_prop = {
.name = "linux,memory-limit",
- .length = sizeof(unsigned long long),
+ .length = sizeof(__be_word),
.value = &mem_limit,
};
@@ -193,11 +194,11 @@ static void __init export_crashk_values(struct device_node *node)
}
#endif /* CONFIG_CRASH_RESERVE */
-static phys_addr_t kernel_end;
+static __be_word kernel_end;
static struct property kernel_end_prop = {
.name = "linux,kernel-end",
- .length = sizeof(phys_addr_t),
+ .length = sizeof(__be_word),
.value = &kernel_end,
};
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index e7ef8b2a2554..5f6d50e4c3d4 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -450,6 +450,11 @@ static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
kbuf->buffer = headers;
kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
kbuf->bufsz = headers_sz;
+
+ /*
+ * Account for extra space required to accommodate additional memory
+ * ranges in elfcorehdr due to memory hotplug events.
+ */
kbuf->memsz = headers_sz + kdump_extra_elfcorehdr_size(cmem);
kbuf->top_down = false;
@@ -460,7 +465,14 @@ static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
}
image->elf_load_addr = kbuf->mem;
- image->elf_headers_sz = headers_sz;
+
+ /*
+ * If CONFIG_CRASH_HOTPLUG is enabled, the elfcorehdr kexec segment
+ * memsz can be larger than bufsz. Always initialize elf_headers_sz
+ * with memsz. This ensures the correct size is reserved for elfcorehdr
+ * memory in the FDT prepared for kdump.
+ */
+ image->elf_headers_sz = kbuf->memsz;
image->elf_headers = headers;
out:
kfree(cmem);
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index e199976e410a..0d2d2190af9a 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -437,7 +437,7 @@ void bpf_jit_free(struct bpf_prog *fp)
bool bpf_jit_supports_kfunc_call(void)
{
- return true;
+ return IS_ENABLED(CONFIG_PPC64);
}
bool bpf_jit_supports_arena(void)
@@ -722,9 +722,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
* retval_off [ return value ]
* [ reg argN ]
* [ ... ]
- * regs_off [ reg_arg1 ] prog ctx context
- * nregs_off [ args count ]
- * ip_off [ traced function ]
+ * regs_off [ reg_arg1 ] prog_ctx
+ * nregs_off [ args count ] ((u64 *)prog_ctx)[-1]
+ * ip_off [ traced function ] ((u64 *)prog_ctx)[-2]
* [ ... ]
* run_ctx_off [ bpf_tramp_run_ctx ]
* [ reg argN ]
@@ -824,7 +824,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
bpf_trampoline_save_args(image, ctx, func_frame_offset, nr_regs, regs_off);
- /* Save our return address */
+ /* Save our LR/return address */
EMIT(PPC_RAW_MFLR(_R3));
if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE))
EMIT(PPC_RAW_STL(_R3, _R1, alt_lr_off));
@@ -832,24 +832,34 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
EMIT(PPC_RAW_STL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF));
/*
- * Save ip address of the traced function.
- * We could recover this from LR, but we will need to address for OOL trampoline,
- * and optional GEP area.
+ * Derive IP address of the traced function.
+ * In case of CONFIG_PPC_FTRACE_OUT_OF_LINE or BPF program, LR points to the instruction
+ * after the 'bl' instruction in the OOL stub. Refer to ftrace_init_ool_stub() and
+ * bpf_arch_text_poke() for OOL stub of kernel functions and bpf programs respectively.
+ * Relevant stub sequence:
+ *
+ * bl <tramp>
+ * LR (R3) => mtlr r0
+ * b <func_addr+4>
+ *
+ * Recover kernel function/bpf program address from the unconditional
+ * branch instruction at the end of OOL stub.
*/
if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) || flags & BPF_TRAMP_F_IP_ARG) {
EMIT(PPC_RAW_LWZ(_R4, _R3, 4));
EMIT(PPC_RAW_SLWI(_R4, _R4, 6));
EMIT(PPC_RAW_SRAWI(_R4, _R4, 6));
EMIT(PPC_RAW_ADD(_R3, _R3, _R4));
- EMIT(PPC_RAW_ADDI(_R3, _R3, 4));
}
if (flags & BPF_TRAMP_F_IP_ARG)
EMIT(PPC_RAW_STL(_R3, _R1, ip_off));
- if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE))
+ if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) {
/* Fake our LR for unwind */
+ EMIT(PPC_RAW_ADDI(_R3, _R3, 4));
EMIT(PPC_RAW_STL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF));
+ }
/* Save function arg count -- see bpf_get_func_arg_cnt() */
EMIT(PPC_RAW_LI(_R3, nr_regs));
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1fe37128c876..de99f9b354ab 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -319,6 +319,83 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *
return 0;
}
+static int zero_extend(u32 *image, struct codegen_context *ctx, u32 src_reg, u32 dst_reg, u32 size)
+{
+ switch (size) {
+ case 1:
+ /* zero-extend 8 bits into 64 bits */
+ EMIT(PPC_RAW_RLDICL(dst_reg, src_reg, 0, 56));
+ return 0;
+ case 2:
+ /* zero-extend 16 bits into 64 bits */
+ EMIT(PPC_RAW_RLDICL(dst_reg, src_reg, 0, 48));
+ return 0;
+ case 4:
+ /* zero-extend 32 bits into 64 bits */
+ EMIT(PPC_RAW_RLDICL(dst_reg, src_reg, 0, 32));
+ fallthrough;
+ case 8:
+ /* Nothing to do */
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+static int sign_extend(u32 *image, struct codegen_context *ctx, u32 src_reg, u32 dst_reg, u32 size)
+{
+ switch (size) {
+ case 1:
+ /* sign-extend 8 bits into 64 bits */
+ EMIT(PPC_RAW_EXTSB(dst_reg, src_reg));
+ return 0;
+ case 2:
+ /* sign-extend 16 bits into 64 bits */
+ EMIT(PPC_RAW_EXTSH(dst_reg, src_reg));
+ return 0;
+ case 4:
+ /* sign-extend 32 bits into 64 bits */
+ EMIT(PPC_RAW_EXTSW(dst_reg, src_reg));
+ fallthrough;
+ case 8:
+ /* Nothing to do */
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+/*
+ * Handle powerpc ABI expectations from caller:
+ * - Unsigned arguments are zero-extended.
+ * - Signed arguments are sign-extended.
+ */
+static int prepare_for_kfunc_call(const struct bpf_prog *fp, u32 *image,
+ struct codegen_context *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct btf_func_model *m = bpf_jit_find_kfunc_model(fp, insn);
+ int i;
+
+ if (!m)
+ return -1;
+
+ for (i = 0; i < m->nr_args; i++) {
+ /* Note that BPF ABI only allows up to 5 args for kfuncs */
+ u32 reg = bpf_to_ppc(BPF_REG_1 + i), size = m->arg_size[i];
+
+ if (!(m->arg_flags[i] & BTF_FMODEL_SIGNED_ARG)) {
+ if (zero_extend(image, ctx, reg, reg, size))
+ return -1;
+ } else {
+ if (sign_extend(image, ctx, reg, reg, size))
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out)
{
/*
@@ -931,14 +1008,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
/* special mov32 for zext */
EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 0, 31));
break;
- } else if (off == 8) {
- EMIT(PPC_RAW_EXTSB(dst_reg, src_reg));
- } else if (off == 16) {
- EMIT(PPC_RAW_EXTSH(dst_reg, src_reg));
- } else if (off == 32) {
- EMIT(PPC_RAW_EXTSW(dst_reg, src_reg));
- } else if (dst_reg != src_reg)
- EMIT(PPC_RAW_MR(dst_reg, src_reg));
+ }
+ if (off == 0) {
+ /* MOV */
+ if (dst_reg != src_reg)
+ EMIT(PPC_RAW_MR(dst_reg, src_reg));
+ } else {
+ /* MOVSX: dst = (s8,s16,s32)src (off = 8,16,32) */
+ if (sign_extend(image, ctx, src_reg, dst_reg, off / 8))
+ return -1;
+ }
goto bpf_alu32_trunc;
case BPF_ALU | BPF_MOV | BPF_K: /* (u32) dst = imm */
case BPF_ALU64 | BPF_MOV | BPF_K: /* dst = (s64) imm */
@@ -1395,6 +1474,12 @@ emit_clear:
if (ret < 0)
return ret;
+ /* Take care of powerpc ABI requirements before kfunc call */
+ if (insn[i].src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ if (prepare_for_kfunc_call(fp, image, ctx, &insn[i]))
+ return -1;
+ }
+
ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr);
if (ret)
return ret;
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 26aa26482c9a..992cc5c98214 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -103,6 +103,11 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
void
perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
+ perf_callchain_store(entry, perf_arch_instruction_pointer(regs));
+
+ if (!current->mm)
+ return;
+
if (!is_32bit_task())
perf_callchain_user_64(entry, regs);
else
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
index ddcc2d8aa64a..0de21c5d272c 100644
--- a/arch/powerpc/perf/callchain_32.c
+++ b/arch/powerpc/perf/callchain_32.c
@@ -142,7 +142,6 @@ void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
next_ip = perf_arch_instruction_pointer(regs);
lr = regs->link;
sp = regs->gpr[1];
- perf_callchain_store(entry, next_ip);
while (entry->nr < entry->max_stack) {
fp = (unsigned int __user *) (unsigned long) sp;
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index 115d1c105e8a..30fb61c5f0cb 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -77,7 +77,6 @@ void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
next_ip = perf_arch_instruction_pointer(regs);
lr = regs->link;
sp = regs->gpr[1];
- perf_callchain_store(entry, next_ip);
while (entry->nr < entry->max_stack) {
fp = (unsigned long __user *) sp;
diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c
index 2b5d187d9b62..9ef8fb39dd1b 100644
--- a/arch/powerpc/platforms/83xx/km83xx.c
+++ b/arch/powerpc/platforms/83xx/km83xx.c
@@ -155,8 +155,8 @@ machine_device_initcall(mpc83xx_km, mpc83xx_declare_of_platform_devices);
/* list of the supported boards */
static char *board[] __initdata = {
- "Keymile,KMETER1",
- "Keymile,kmpbec8321",
+ "keymile,KMETER1",
+ "keymile,kmpbec8321",
NULL
};
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index edc30cda5dbc..56f17296545a 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -605,7 +605,7 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
&pseries_msi_irq_chip, pseries_dev);
}
- pseries_dev->msi_used++;
+ pseries_dev->msi_used += nr_irqs;
return 0;
out:
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 3affba95845b..86076f27c2d9 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -158,7 +158,7 @@ static __always_inline void __stackleak_poison(unsigned long erase_low,
" j 4f\n"
"3: mvc 8(1,%[addr]),0(%[addr])\n"
"4:"
- : [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp)
+ : [addr] "+&a" (erase_low), [count] "+&a" (count), [tmp] "=&a" (tmp)
: [poison] "d" (poison)
: "memory", "cc"
);
diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
index 1721b73b7803..81c0235c0466 100644
--- a/arch/s390/lib/xor.c
+++ b/arch/s390/lib/xor.c
@@ -28,8 +28,8 @@ static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
" j 3f\n"
"2: xc 0(1,%1),0(%2)\n"
"3:"
- : : "d" (bytes), "a" (p1), "a" (p2)
- : "0", "cc", "memory");
+ : "+d" (bytes), "+a" (p1), "+a" (p2)
+ : : "0", "cc", "memory");
}
static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1,
@@ -96,7 +96,6 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
const unsigned long * __restrict p5)
{
asm volatile(
- " larl 1,2f\n"
" aghi %0,-1\n"
" jm 6f\n"
" srlg 0,%0,8\n"
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
index 2f829448c719..6ecd6b0a22a8 100644
--- a/arch/s390/mm/pfault.c
+++ b/arch/s390/mm/pfault.c
@@ -62,7 +62,7 @@ int __pfault_init(void)
"0: nopr %%r7\n"
EX_TABLE(0b, 0b)
: [rc] "+d" (rc)
- : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
+ : [refbk] "a" (virt_to_phys(&pfault_init_refbk)), "m" (pfault_init_refbk)
: "cc");
return rc;
}
@@ -84,7 +84,7 @@ void __pfault_fini(void)
"0: nopr %%r7\n"
EX_TABLE(0b, 0b)
:
- : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
+ : [refbk] "a" (virt_to_phys(&pfault_fini_refbk)), "m" (pfault_fini_refbk)
: "cc");
}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c27b3e5f60c2..8b8f7bb5cc82 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2470,7 +2470,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
KVM_X86_QUIRK_SLOT_ZAP_ALL | \
KVM_X86_QUIRK_STUFF_FEATURE_MSRS | \
- KVM_X86_QUIRK_IGNORE_GUEST_PAT)
+ KVM_X86_QUIRK_IGNORE_GUEST_PAT | \
+ KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM)
#define KVM_X86_CONDITIONAL_QUIRKS \
(KVM_X86_QUIRK_CD_NW_CLEARED | \
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 1208932e5cc3..774eb6989ef9 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -476,6 +476,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8)
#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9)
+#define KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM (1 << 10)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d93f87f29d03..961714e6adae 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1894,6 +1894,7 @@ void __init check_x2apic(void)
static inline void try_to_enable_x2apic(int remap_mode) { }
static inline void __x2apic_enable(void) { }
+static inline void __x2apic_disable(void) { }
#endif /* !CONFIG_X86_X2APIC */
void __init enable_IR_x2apic(void)
@@ -2456,6 +2457,11 @@ static void lapic_resume(void *data)
if (x2apic_mode) {
__x2apic_enable();
} else {
+ if (x2apic_enabled()) {
+ pr_warn_once("x2apic: re-enabled by firmware during resume. Disabling\n");
+ __x2apic_disable();
+ }
+
/*
* Make sure the APICBASE points to the right address
*
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 0f6c8596719b..b87398dff73f 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -189,12 +189,12 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
-
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
-
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
/*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
@@ -226,6 +226,9 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+
/*
* If running nested and the guest uses its own MSR bitmap, there
* is no need to update L0's msr bitmap
@@ -368,7 +371,7 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
- if (kvm_apicv_activated(svm->vcpu.kvm))
+ if (kvm_vcpu_apicv_active(&svm->vcpu))
avic_activate_vmcb(svm);
else
avic_deactivate_vmcb(svm);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a58548b35b85..1b07f665e0af 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1032,8 +1032,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
- if (!kvm_vcpu_apicv_active(vcpu))
- svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+ svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
@@ -1141,7 +1140,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
svm_clr_intercept(svm, INTERCEPT_PAUSE);
}
- if (kvm_vcpu_apicv_active(vcpu))
+ if (enable_apicv && irqchip_in_kernel(vcpu->kvm))
avic_init_vmcb(svm, vmcb);
if (vnmi)
@@ -2603,9 +2602,11 @@ static int dr_interception(struct kvm_vcpu *vcpu)
static int cr8_write_interception(struct kvm_vcpu *vcpu)
{
+ u8 cr8_prev = kvm_get_cr8(vcpu);
int r;
- u8 cr8_prev = kvm_get_cr8(vcpu);
+ WARN_ON_ONCE(kvm_vcpu_apicv_active(vcpu));
+
/* instruction emulation calls kvm_set_cr8() */
r = cr_interception(vcpu);
if (lapic_in_kernel(vcpu))
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6137e5307d0f..89ef3551e35c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3292,10 +3292,24 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
return -EINVAL;
- if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
- (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
- CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
- return -EINVAL;
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ u64 debugctl = vmcs12->guest_ia32_debugctl;
+
+ /*
+ * FREEZE_IN_SMM is not virtualized, but allow L1 to set it in
+ * vmcs12's DEBUGCTL under a quirk for backwards compatibility.
+ * Note that the quirk only relaxes the consistency check. The
+ * vmcc02 bit is still under the control of the host. In
+ * vmcs02 bit is still under the control of the host. In
+ * then L1 has no say in the matter.
+ */
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM))
+ debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM;
+
+ if (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+ CC(!vmx_is_valid_debugctl(vcpu, debugctl, false)))
+ return -EINVAL;
+ }
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))