diff options
Diffstat (limited to 'arch')
60 files changed, 337 insertions, 267 deletions
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 83e03abbb2ca..8cbd1e96fd50 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -264,19 +264,33 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count) typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, pgprot_t *prot); int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); +void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot); -#define ioremap_prot ioremap_prot +static inline void __iomem *ioremap_prot(phys_addr_t phys, size_t size, + pgprot_t user_prot) +{ + pgprot_t prot; + ptdesc_t user_prot_val = pgprot_val(user_prot); + + if (WARN_ON_ONCE(!(user_prot_val & PTE_USER))) + return NULL; -#define _PAGE_IOREMAP PROT_DEVICE_nGnRE + prot = __pgprot_modify(PAGE_KERNEL, PTE_ATTRINDX_MASK, + user_prot_val & PTE_ATTRINDX_MASK); + return __ioremap_prot(phys, size, prot); +} +#define ioremap_prot ioremap_prot +#define ioremap(addr, size) \ + __ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) + __ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) #define ioremap_np(addr, size) \ - ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) + __ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) #define ioremap_encrypted(addr, size) \ - ioremap_prot((addr), (size), PAGE_KERNEL) + __ioremap_prot((addr), (size), PAGE_KERNEL) /* * io{read,write}{16,32,64}be() macros @@ -297,7 +311,7 @@ static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size) if (pfn_is_map_memory(__phys_to_pfn(addr))) return (void __iomem *)__phys_to_virt(addr); - return ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); + return __ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); } /* diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5d5a3bbdb95e..2ca264b3db5f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1616,7 +1616,8 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP)) #define kvm_has_s1poe(k) \ - (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) + (system_supports_poe() && \ + kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) #define kvm_has_ras(k) \ (kvm_has_feat((k), ID_AA64PFR0_EL1, RAS, IMP)) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 905c658057a4..091544e6af44 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -397,6 +397,8 @@ int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu); int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu); void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val); +u16 get_asid_by_regime(struct kvm_vcpu *vcpu, enum trans_regime regime); + #define vncr_fixmap(c) \ ({ \ u32 __c = (c); \ diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index d27e8872fe3c..2b32639160de 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -164,9 +164,6 @@ static inline bool __pure lpa2_is_enabled(void) #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) -#define PAGE_GCS __pgprot(_PAGE_GCS) -#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) - #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a2d65d7d6aae..1416e652612b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -31,19 +31,11 @@ */ #define __TLBI_0(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op "\n" \ - ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op, \ - ARM64_WORKAROUND_REPEAT_TLBI, \ - CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : ) #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE \ - "tlbi " #op ", %0\n" \ - ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op ", %0", \ - ARM64_WORKAROUND_REPEAT_TLBI, \ - CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ - : : "r" (arg)) + "tlbi " #op ", %x0\n" \ + : : "rZ" (arg)) #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg) @@ -181,6 +173,34 @@ static inline unsigned long get_trans_granule(void) (__pages >> (5 * (scale) + 1)) - 1; \ }) +#define __repeat_tlbi_sync(op, arg...) \ +do { \ + if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) \ + break; \ + __tlbi(op, ##arg); \ + dsb(ish); \ +} while (0) + +/* + * Complete broadcast TLB maintenance issued by the host which invalidates + * stage 1 information in the host's own translation regime. + */ +static inline void __tlbi_sync_s1ish(void) +{ + dsb(ish); + __repeat_tlbi_sync(vale1is, 0); +} + +/* + * Complete broadcast TLB maintenance issued by hyp code which invalidates + * stage 1 translation information in any translation regime. + */ +static inline void __tlbi_sync_s1ish_hyp(void) +{ + dsb(ish); + __repeat_tlbi_sync(vale2is, 0); +} + /* * TLB Invalidation * ================ @@ -279,7 +299,7 @@ static inline void flush_tlb_all(void) { dsb(ishst); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } @@ -291,7 +311,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) asid = __TLBI_VADDR(0, ASID(mm)); __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); - dsb(ish); + __tlbi_sync_s1ish(); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } @@ -345,20 +365,11 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { flush_tlb_page_nosync(vma, uaddr); - dsb(ish); + __tlbi_sync_s1ish(); } static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { - /* - * TLB flush deferral is not required on systems which are affected by - * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation - * will have two consecutive TLBI instructions with a dsb(ish) in between - * defeating the purpose (i.e save overall 'dsb ish' cost). - */ - if (alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) - return false; - return true; } @@ -374,7 +385,7 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - dsb(ish); + __tlbi_sync_s1ish(); } /* @@ -509,7 +520,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, { __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, last_level, tlb_level); - dsb(ish); + __tlbi_sync_s1ish(); } static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, @@ -557,7 +568,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end dsb(ishst); __flush_tlb_range_op(vaale1is, start, pages, stride, 0, TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } @@ -571,7 +582,7 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) dsb(ishst); __tlbi(vaae1is, addr); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index af90128cfed5..a9d884fd1d00 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -377,7 +377,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot = __acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, prot); + return __ioremap_prot(phys, size, prot); } /* diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 4a609e9b65de..b9d4998c97ef 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -37,7 +37,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) * We pick the reserved-ASID to minimise the impact. */ __tlbi(aside1is, __TLBI_VADDR(0, 0)); - dsb(ish); + __tlbi_sync_s1ish(); } ret = caches_clean_inval_user_pou(start, start + chunk); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3fe1faab0362..b32f13358fbb 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -400,16 +400,25 @@ static inline int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) { /* - * Abort call on counterless CPU or when interrupts are - * disabled - can lead to deadlock in smp sync call. + * Abort call on counterless CPU. */ if (!cpu_has_amu_feat(cpu)) return -EOPNOTSUPP; - if (WARN_ON_ONCE(irqs_disabled())) - return -EPERM; - - smp_call_function_single(cpu, func, val, 1); + if (irqs_disabled()) { + /* + * When IRQs are disabled (tick path: sched_tick -> + * topology_scale_freq_tick or cppc_scale_freq_tick), only local + * CPU counter reads are allowed. Remote CPU counter read would + * require smp_call_function_single() which is unsafe with IRQs + * disabled. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + return -EPERM; + func(val); + } else { + smp_call_function_single(cpu, func, val, 1); + } return 0; } diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 4f803fd1c99a..7d1f22fd490b 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -21,7 +21,6 @@ menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" select KVM_COMMON select KVM_GENERIC_HARDWARE_ENABLING - select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 29f0326f7e00..410ffd41fd73 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -358,7 +358,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_IOEVENTFD: case KVM_CAP_USER_MEMORY: - case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c index 885bd5bb2f41..6588ea251ed7 100644 --- a/arch/arm64/kvm/at.c +++ b/arch/arm64/kvm/at.c @@ -540,31 +540,8 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0); wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG); - if (wr->nG) { - u64 asid_ttbr, tcr; - - switch (wi->regime) { - case TR_EL10: - tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); - asid_ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL1) : - vcpu_read_sys_reg(vcpu, TTBR0_EL1)); - break; - case TR_EL20: - tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); - asid_ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL2) : - vcpu_read_sys_reg(vcpu, TTBR0_EL2)); - break; - default: - BUG(); - } - - wr->asid = FIELD_GET(TTBR_ASID_MASK, asid_ttbr); - if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || - !(tcr & TCR_ASID16)) - wr->asid &= GENMASK(7, 0); - } + if (wr->nG) + wr->asid = get_asid_by_regime(vcpu, wi->regime); return 0; diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index ae8391baebc3..218976287d3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -271,7 +271,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) */ dsb(ishst); __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); } diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 8e29d7734a15..2f029bfe4755 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -342,6 +342,7 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { hyp_vm->kvm.arch.flags = host_arch_flags; + hyp_vm->kvm.arch.flags &= ~BIT_ULL(KVM_ARCH_FLAG_ID_REGS_INITIALIZED); bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, @@ -391,7 +392,7 @@ static void unpin_host_sve_state(struct pkvm_hyp_vcpu *hyp_vcpu) if (!vcpu_has_feature(&hyp_vcpu->vcpu, KVM_ARM_VCPU_SVE)) return; - sve_state = kern_hyp_va(hyp_vcpu->vcpu.arch.sve_state); + sve_state = hyp_vcpu->vcpu.arch.sve_state; hyp_unpin_shared_mem(sve_state, sve_state + vcpu_sve_state_size(&hyp_vcpu->vcpu)); } @@ -471,6 +472,35 @@ err: return ret; } +static int vm_copy_id_regs(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); + const struct kvm *host_kvm = hyp_vm->host_kvm; + struct kvm *kvm = &hyp_vm->kvm; + + if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &host_kvm->arch.flags)) + return -EINVAL; + + if (test_and_set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return 0; + + memcpy(kvm->arch.id_regs, host_kvm->arch.id_regs, sizeof(kvm->arch.id_regs)); + + return 0; +} + +static int pkvm_vcpu_init_sysregs(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + int ret = 0; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + else + ret = vm_copy_id_regs(hyp_vcpu); + + return ret; +} + static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, struct pkvm_hyp_vm *hyp_vm, struct kvm_vcpu *host_vcpu) @@ -490,8 +520,9 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; - if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) - kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + ret = pkvm_vcpu_init_sysregs(hyp_vcpu); + if (ret) + goto done; ret = pkvm_vcpu_init_traps(hyp_vcpu); if (ret) diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 48da9ca9763f..3dc1ce0d27fe 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -169,7 +169,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -226,7 +226,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -240,7 +240,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -266,5 +266,5 @@ void __kvm_flush_vm_context(void) /* Same remark as in enter_vmid_context() */ dsb(ish); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0e4ddd28ef5d..9b480f947da2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -501,7 +501,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, *unmapped += granule; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); mm_ops->put_page(ctx->ptep); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index ec2569818629..35855dadfb1b 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -115,7 +115,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -176,7 +176,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -192,7 +192,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -217,7 +217,7 @@ void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } /* @@ -358,7 +358,7 @@ int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) default: ret = -EINVAL; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); if (mmu) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 070a01e53fcb..ec2eee857208 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1754,14 +1754,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } /* - * Both the canonical IPA and fault IPA must be hugepage-aligned to - * ensure we find the right PFN and lay down the mapping in the right - * place. + * Both the canonical IPA and fault IPA must be aligned to the + * mapping size to ensure we find the right PFN and lay down the + * mapping in the right place. */ - if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) { - fault_ipa &= ~(vma_pagesize - 1); - ipa &= ~(vma_pagesize - 1); - } + fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize); + ipa = ALIGN_DOWN(ipa, vma_pagesize); gfn = ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 620126d1f0dc..12c9f6e8dfda 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -854,6 +854,33 @@ int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2) return kvm_inject_nested_sync(vcpu, esr_el2); } +u16 get_asid_by_regime(struct kvm_vcpu *vcpu, enum trans_regime regime) +{ + enum vcpu_sysreg ttbr_elx; + u64 tcr; + u16 asid; + + switch (regime) { + case TR_EL10: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); + ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL1 : TTBR0_EL1; + break; + case TR_EL20: + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); + ttbr_elx = (tcr & TCR_A1) ? TTBR1_EL2 : TTBR0_EL2; + break; + default: + BUG(); + } + + asid = FIELD_GET(TTBRx_EL1_ASID, vcpu_read_sys_reg(vcpu, ttbr_elx)); + if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || + !(tcr & TCR_ASID16)) + asid &= GENMASK(7, 0); + + return asid; +} + static void invalidate_vncr(struct vncr_tlb *vt) { vt->valid = false; @@ -1154,9 +1181,6 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) { int i; - if (!kvm->arch.nested_mmus_size) - return; - for (i = 0; i < kvm->arch.nested_mmus_size; i++) { struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; @@ -1336,20 +1360,8 @@ static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu) if (read_vncr_el2(vcpu) != vt->gva) return false; - if (vt->wr.nG) { - u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); - u64 ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL2) : - vcpu_read_sys_reg(vcpu, TTBR0_EL2)); - u16 asid; - - asid = FIELD_GET(TTBR_ASID_MASK, ttbr); - if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || - !(tcr & TCR_ASID16)) - asid &= GENMASK(7, 0); - - return asid == vt->wr.asid; - } + if (vt->wr.nG) + return get_asid_by_regime(vcpu, TR_EL20) == vt->wr.asid; return true; } @@ -1452,21 +1464,8 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu) if (read_vncr_el2(vcpu) != vt->gva) return; - if (vt->wr.nG) { - u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); - u64 ttbr = ((tcr & TCR_A1) ? - vcpu_read_sys_reg(vcpu, TTBR1_EL2) : - vcpu_read_sys_reg(vcpu, TTBR0_EL2)); - u16 asid; - - asid = FIELD_GET(TTBR_ASID_MASK, ttbr); - if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) || - !(tcr & TCR_ASID16)) - asid &= GENMASK(7, 0); - - if (asid != vt->wr.asid) - return; - } + if (vt->wr.nG && get_asid_by_regime(vcpu, TR_EL20) != vt->wr.asid) + return; vt->cpu = smp_processor_id(); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index a7cd0badc20c..1b4cacb6e918 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1816,6 +1816,9 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, ID_AA64MMFR3_EL1_SCTLRX | ID_AA64MMFR3_EL1_S1POE | ID_AA64MMFR3_EL1_S1PIE; + + if (!system_supports_poe()) + val &= ~ID_AA64MMFR3_EL1_S1POE; break; case SYS_ID_MMFR4_EL1: val &= ~ID_MMFR4_EL1_CCIDX; diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c index d02341303899..e278e060e78a 100644 --- a/arch/arm64/lib/delay.c +++ b/arch/arm64/lib/delay.c @@ -32,7 +32,11 @@ static inline unsigned long xloops_to_cycles(unsigned long xloops) * Note that userspace cannot change the offset behind our back either, * as the vcpu mutex is held as long as KVM_RUN is in progress. */ -#define __delay_cycles() __arch_counter_get_cntvct_stable() +static cycles_t notrace __delay_cycles(void) +{ + guard(preempt_notrace)(); + return __arch_counter_get_cntvct_stable(); +} void __delay(unsigned long cycles) { diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index b12cbed9b5ad..6eef48ef6278 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -14,8 +14,8 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) return 0; } -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - pgprot_t pgprot) +void __iomem *__ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t pgprot) { unsigned long last_addr = phys_addr + size - 1; @@ -39,7 +39,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, return generic_ioremap_prot(phys_addr, size, pgprot); } -EXPORT_SYMBOL(ioremap_prot); +EXPORT_SYMBOL(__ioremap_prot); /* * Must be called after early_fixmap_init diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 08ee177432c2..92b2f5097a96 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -34,6 +34,8 @@ static pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC }; +static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO; + /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. @@ -73,9 +75,11 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } - if (lpa2_is_enabled()) + if (lpa2_is_enabled()) { for (int i = 0; i < ARRAY_SIZE(protection_map); i++) pgprot_val(protection_map[i]) &= ~PTE_SHARED; + gcs_page_prot &= ~PTE_SHARED; + } return 0; } @@ -87,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { - prot = _PAGE_GCS_RO; + /* Honour mprotect(PROT_NONE) on shadow stack mappings */ + if (vm_flags & VM_ACCESS_FLAGS) + prot = gcs_page_prot; + else + prot = pgprot_val(protection_map[VM_NONE]); } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 356d33c7a4ae..adf84962d579 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2119,7 +2119,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) extable_offset = round_up(prog_size + PLT_TARGET_SIZE, extable_align); image_size = extable_offset + extable_size; ro_header = bpf_jit_binary_pack_alloc(image_size, &ro_image_ptr, - sizeof(u32), &header, &image_ptr, + sizeof(u64), &header, &image_ptr, jit_fill_hole); if (!ro_header) { prog = orig_prog; diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ed4f724db774..8e5213609975 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -28,7 +28,6 @@ config KVM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING - select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO select VIRT_XFER_TO_GUEST_WORK select SCHED_INFO diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index 63fd40530aa9..5282158b8122 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -118,7 +118,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: - case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_IOEVENTFD: case KVM_CAP_MP_STATE: diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index cc13cc35f208..b1b9a1d67758 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_MMIO - select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING select HAVE_KVM_READONLY_MEM help diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index b0fb92fda4d4..29d9f630edfb 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1035,7 +1035,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_READONLY_MEM: - case KVM_CAP_SYNC_MMU: case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c9a2d50ff1b0..9a0d1c1aca6c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -38,7 +38,6 @@ config KVM_BOOK3S_64_HANDLER config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO - select KVM_GENERIC_MMU_NOTIFIER config KVM_BOOK3S_HV_POSSIBLE bool @@ -81,7 +80,6 @@ config KVM_BOOK3S_64_HV tristate "KVM for POWER7 and later using hypervisor mode in host" depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE - select KVM_GENERIC_MMU_NOTIFIER select KVM_BOOK3S_HV_PMU select CMA help @@ -203,7 +201,6 @@ config KVM_E500V2 depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO - select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500 guest kernels in virtual machines on E500v2 host processors. @@ -220,7 +217,6 @@ config KVM_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV - select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500MC/E5500/E6500 guest kernels in virtual machines on E500MC/E5500/E6500 host processors. diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9a89a6d98f97..00302399fc37 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -623,12 +623,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && kvmppc_hv_ops->enable_nested && !kvmppc_hv_ops->enable_nested(NULL)); break; -#endif - case KVM_CAP_SYNC_MMU: - BUILD_BUG_ON(!IS_ENABLED(CONFIG_KVM_GENERIC_MMU_NOTIFIER)); - r = 1; - break; -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: r = hv_enabled; break; diff --git a/arch/riscv/boot/dts/microchip/mpfs.dtsi b/arch/riscv/boot/dts/microchip/mpfs.dtsi index 5c2963e269b8..a0ffedc2d344 100644 --- a/arch/riscv/boot/dts/microchip/mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/mpfs.dtsi @@ -428,6 +428,7 @@ clocks = <&clkcfg CLK_CAN0>, <&clkcfg CLK_MSSPLL3>; interrupt-parent = <&plic>; interrupts = <56>; + resets = <&mss_top_sysreg CLK_CAN0>; status = "disabled"; }; @@ -437,6 +438,7 @@ clocks = <&clkcfg CLK_CAN1>, <&clkcfg CLK_MSSPLL3>; interrupt-parent = <&plic>; interrupts = <57>; + resets = <&mss_top_sysreg CLK_CAN1>; status = "disabled"; }; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 77379f77840a..ec2cee0a39e0 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,7 +30,6 @@ config KVM select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO select VIRT_XFER_TO_GUEST_WORK - select KVM_GENERIC_MMU_NOTIFIER select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS help diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 7cbd2340c190..58bce57dc55b 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -181,7 +181,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_IOEVENTFD: case KVM_CAP_USER_MEMORY: - case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_READONLY_MEM: diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h index 09f763b9eb40..32536ee34aa0 100644 --- a/arch/s390/include/asm/idle.h +++ b/arch/s390/include/asm/idle.h @@ -19,9 +19,9 @@ struct s390_idle_data { unsigned long mt_cycles_enter[8]; }; +DECLARE_PER_CPU(struct s390_idle_data, s390_idle); + extern struct device_attribute dev_attr_idle_count; extern struct device_attribute dev_attr_idle_time_us; -void psw_idle(struct s390_idle_data *data, unsigned long psw_mask); - #endif /* _S390_IDLE_H */ diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 9d25fb35a042..b1db75d14e9d 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,6 +2,12 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H +#include <asm/lowcore.h> +#include <asm/cpu_mf.h> +#include <asm/idle.h> + +DECLARE_PER_CPU(u64, mt_cycles[8]); + static inline void update_timer_sys(void) { struct lowcore *lc = get_lowcore(); @@ -20,4 +26,32 @@ static inline void update_timer_mcck(void) lc->last_update_timer = lc->mcck_enter_timer; } +static inline void update_timer_idle(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + struct lowcore *lc = get_lowcore(); + u64 cycles_new[8]; + int i, mtid; + + mtid = smp_cpu_mtid; + if (mtid) { + stcctm(MT_DIAG, mtid, cycles_new); + for (i = 0; i < mtid; i++) + __this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } + /* + * This is a bit subtle: Forward last_update_clock so it excludes idle + * time. For correct steal time calculation in do_account_vtime() add + * passed wall time before idle_enter to steal_timer: + * During the passed wall time before idle_enter CPU time may have + * been accounted to system, hardirq, softirq, etc. lowcore fields. + * The accounted CPU times will be subtracted again from steal_timer + * when accumulated steal time is calculated in do_account_vtime(). + */ + lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; + lc->last_update_clock = lc->int_clock; + lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; + lc->last_update_timer = lc->sys_enter_timer; +} + #endif /* _S390_VTIME_H */ diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index dd55cc6bbc28..fb67b4abe68c 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -56,8 +56,6 @@ long sys_s390_pci_mmio_write(unsigned long, const void __user *, size_t); long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t); long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user *return_code, unsigned long flags); -DECLARE_PER_CPU(u64, mt_cycles[8]); - unsigned long stack_alloc(void); void stack_free(unsigned long stack); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 39cb8d0ae348..1f1b06b6b4ef 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -15,37 +15,22 @@ #include <trace/events/power.h> #include <asm/cpu_mf.h> #include <asm/cputime.h> +#include <asm/idle.h> #include <asm/nmi.h> #include <asm/smp.h> -#include "entry.h" -static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); +DEFINE_PER_CPU(struct s390_idle_data, s390_idle); void account_idle_time_irq(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); - struct lowcore *lc = get_lowcore(); unsigned long idle_time; - u64 cycles_new[8]; - int i; - if (smp_cpu_mtid) { - stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); - for (i = 0; i < smp_cpu_mtid; i++) - this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); - } - - idle_time = lc->int_clock - idle->clock_idle_enter; - - lc->steal_timer += idle->clock_idle_enter - lc->last_update_clock; - lc->last_update_clock = lc->int_clock; - - lc->system_timer += lc->last_update_timer - idle->timer_idle_enter; - lc->last_update_timer = lc->sys_enter_timer; + idle_time = get_lowcore()->int_clock - idle->clock_idle_enter; /* Account time spent with enabled wait psw loaded as idle time. */ - WRITE_ONCE(idle->idle_time, READ_ONCE(idle->idle_time) + idle_time); - WRITE_ONCE(idle->idle_count, READ_ONCE(idle->idle_count) + 1); + __atomic64_add(idle_time, &idle->idle_time); + __atomic64_add_const(1, &idle->idle_count); account_idle_time(cputime_to_nsecs(idle_time)); } diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index dcdc7e274848..049c557c452f 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2377,7 +2377,7 @@ void __init setup_ipl(void) atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); } -void s390_reset_system(void) +void __no_stack_protector s390_reset_system(void) { /* Disable prefixing */ set_prefix(0); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index f81723bc8856..7fdf960191d3 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -146,6 +146,12 @@ void noinstr do_io_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); bool from_idle; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) { + update_timer_idle(); + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); + } + irq_enter_rcu(); if (user_mode(regs)) { @@ -154,7 +160,6 @@ void noinstr do_io_irq(struct pt_regs *regs) current->thread.last_break = regs->last_break; } - from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -171,9 +176,6 @@ void noinstr do_io_irq(struct pt_regs *regs) set_irq_regs(old_regs); irqentry_exit(regs, state); - - if (from_idle) - regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); } void noinstr do_ext_irq(struct pt_regs *regs) @@ -182,6 +184,12 @@ void noinstr do_ext_irq(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); bool from_idle; + from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); + if (from_idle) { + update_timer_idle(); + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); + } + irq_enter_rcu(); if (user_mode(regs)) { @@ -194,7 +202,6 @@ void noinstr do_ext_irq(struct pt_regs *regs) regs->int_parm = get_lowcore()->ext_params; regs->int_parm_long = get_lowcore()->ext_params2; - from_idle = test_and_clear_cpu_flag(CIF_ENABLED_WAIT); if (from_idle) account_idle_time_irq(); @@ -203,9 +210,6 @@ void noinstr do_ext_irq(struct pt_regs *regs) irq_exit_rcu(); set_irq_regs(old_regs); irqentry_exit(regs, state); - - if (from_idle) - regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); } static void show_msi_interrupt(struct seq_file *p, int irq) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 234a0ba30510..bf48744d0912 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -48,8 +48,7 @@ static inline void set_vtimer(u64 expires) static inline int virt_timer_forward(u64 elapsed) { - BUG_ON(!irqs_disabled()); - + lockdep_assert_irqs_disabled(); if (list_empty(&virt_timer_list)) return 0; elapsed = atomic64_add_return(elapsed, &virt_timer_elapsed); @@ -137,23 +136,16 @@ static int do_account_vtime(struct task_struct *tsk) lc->system_timer += timer; /* Update MT utilization calculation */ - if (smp_cpu_mtid && - time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) + if (smp_cpu_mtid && time_after64(jiffies_64, __this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); /* Calculate cputime delta */ - user = update_tsk_timer(&tsk->thread.user_timer, - READ_ONCE(lc->user_timer)); - guest = update_tsk_timer(&tsk->thread.guest_timer, - READ_ONCE(lc->guest_timer)); - system = update_tsk_timer(&tsk->thread.system_timer, - READ_ONCE(lc->system_timer)); - hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, - READ_ONCE(lc->hardirq_timer)); - softirq = update_tsk_timer(&tsk->thread.softirq_timer, - READ_ONCE(lc->softirq_timer)); - lc->steal_timer += - clock - user - guest - system - hardirq - softirq; + user = update_tsk_timer(&tsk->thread.user_timer, lc->user_timer); + guest = update_tsk_timer(&tsk->thread.guest_timer, lc->guest_timer); + system = update_tsk_timer(&tsk->thread.system_timer, lc->system_timer); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, lc->hardirq_timer); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, lc->softirq_timer); + lc->steal_timer += clock - user - guest - system - hardirq - softirq; /* Push account value */ if (user) { @@ -225,10 +217,6 @@ static u64 vtime_delta(void) return timer - lc->last_update_timer; } -/* - * Update process times based on virtual cpu times stored by entry.S - * to the lowcore fields user_timer, system_timer & steal_clock. - */ void vtime_account_kernel(struct task_struct *tsk) { struct lowcore *lc = get_lowcore(); @@ -238,27 +226,17 @@ void vtime_account_kernel(struct task_struct *tsk) lc->guest_timer += delta; else lc->system_timer += delta; - - virt_timer_forward(delta); } EXPORT_SYMBOL_GPL(vtime_account_kernel); void vtime_account_softirq(struct task_struct *tsk) { - u64 delta = vtime_delta(); - - get_lowcore()->softirq_timer += delta; - - virt_timer_forward(delta); + get_lowcore()->softirq_timer += vtime_delta(); } void vtime_account_hardirq(struct task_struct *tsk) { - u64 delta = vtime_delta(); - - get_lowcore()->hardirq_timer += delta; - - virt_timer_forward(delta); + get_lowcore()->hardirq_timer += vtime_delta(); } /* diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 917ac740513e..5b835bc6a194 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,9 +28,7 @@ config KVM select HAVE_KVM_INVALID_WAKEUPS select HAVE_KVM_NO_POLL select KVM_VFIO - select MMU_NOTIFIER select VIRT_XFER_TO_GUEST_WORK - select KVM_GENERIC_MMU_NOTIFIER select KVM_MMU_LOCKLESS_AGING help Support hosting paravirtualized guest machines using the SIE diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7a175d86cef0..bc7d6fa66eaf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -601,7 +601,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_S390_PSW: case KVM_CAP_S390_GMAP: - case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_S390_UCONTROL case KVM_CAP_S390_UCONTROL: #endif diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c index 2f829448c719..6ecd6b0a22a8 100644 --- a/arch/s390/mm/pfault.c +++ b/arch/s390/mm/pfault.c @@ -62,7 +62,7 @@ int __pfault_init(void) "0: nopr %%r7\n" EX_TABLE(0b, 0b) : [rc] "+d" (rc) - : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : [refbk] "a" (virt_to_phys(&pfault_init_refbk)), "m" (pfault_init_refbk) : "cc"); return rc; } @@ -84,7 +84,7 @@ void __pfault_fini(void) "0: nopr %%r7\n" EX_TABLE(0b, 0b) : - : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : [refbk] "a" (virt_to_phys(&pfault_fini_refbk)), "m" (pfault_fini_refbk) : "cc"); } diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c index 46ef88bc9c26..7613ab0ffb89 100644 --- a/arch/sparc/kernel/iommu.c +++ b/arch/sparc/kernel/iommu.c @@ -312,6 +312,8 @@ static dma_addr_t dma_4u_map_phys(struct device *dev, phys_addr_t phys, if (direction != DMA_TO_DEVICE) iopte_protection |= IOPTE_WRITE; + phys &= IO_PAGE_MASK; + for (i = 0; i < npages; i++, base++, phys += IO_PAGE_SIZE) iopte_val(*base) = iopte_protection | phys; diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 440284cc804e..61f14b4c8f90 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -410,6 +410,8 @@ static dma_addr_t dma_4v_map_phys(struct device *dev, phys_addr_t phys, iommu_batch_start(dev, prot, entry); + phys &= IO_PAGE_MASK; + for (i = 0; i < npages; i++, phys += IO_PAGE_SIZE) { long err = iommu_batch_add(phys, mask); if (unlikely(err < 0L)) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 012b2bcaa8a0..20fc33300a95 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -69,11 +69,11 @@ struct io_thread_req { }; -static struct io_thread_req * (*irq_req_buffer)[]; +static struct io_thread_req **irq_req_buffer; static struct io_thread_req *irq_remainder; static int irq_remainder_size; -static struct io_thread_req * (*io_req_buffer)[]; +static struct io_thread_req **io_req_buffer; static struct io_thread_req *io_remainder; static int io_remainder_size; @@ -398,7 +398,7 @@ static int thread_fd = -1; static int bulk_req_safe_read( int fd, - struct io_thread_req * (*request_buffer)[], + struct io_thread_req **request_buffer, struct io_thread_req **remainder, int *remainder_size, int max_recs @@ -465,7 +465,7 @@ static irqreturn_t ubd_intr(int irq, void *dev) &irq_remainder, &irq_remainder_size, UBD_REQ_BUFFER_SIZE)) >= 0) { for (i = 0; i < len / sizeof(struct io_thread_req *); i++) - ubd_end_request((*irq_req_buffer)[i]); + ubd_end_request(irq_req_buffer[i]); } if (len < 0 && len != -EAGAIN) @@ -1512,7 +1512,7 @@ void *io_thread(void *arg) } for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { - struct io_thread_req *req = (*io_req_buffer)[count]; + struct io_thread_req *req = io_req_buffer[count]; int i; io_count++; diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index a9b72997103d..88c757ac8ccd 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -160,8 +160,6 @@ void __init fred_complete_exception_setup(void) static noinstr void fred_extint(struct pt_regs *regs) { unsigned int vector = regs->fred_ss.vector; - unsigned int index = array_index_nospec(vector - FIRST_SYSTEM_VECTOR, - NR_SYSTEM_VECTORS); if (WARN_ON_ONCE(vector < FIRST_EXTERNAL_VECTOR)) return; @@ -170,7 +168,8 @@ static noinstr void fred_extint(struct pt_regs *regs) irqentry_state_t state = irqentry_enter(regs); instrumentation_begin(); - sysvec_table[index](regs); + sysvec_table[array_index_nospec(vector - FIRST_SYSTEM_VECTOR, + NR_SYSTEM_VECTORS)](regs); instrumentation_end(); irqentry_exit(regs, state); } else { diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 5ed6e0b7e715..0a1d08136cc1 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -6497,6 +6497,32 @@ static struct intel_uncore_type gnr_uncore_ubox = { .attr_update = uncore_alias_groups, }; +static struct uncore_event_desc gnr_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0, "event=0x05,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1, "event=0x06,umask=0xcf"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0, "event=0x05,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1, "event=0x06,umask=0xf0"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type gnr_uncore_imc = { + SPR_UNCORE_MMIO_COMMON_FORMAT(), + .name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .event_descs = gnr_uncore_imc_events, +}; + static struct intel_uncore_type gnr_uncore_pciex8 = { SPR_UNCORE_PCI_COMMON_FORMAT(), .name = "pciex8", @@ -6544,7 +6570,7 @@ static struct intel_uncore_type *gnr_uncores[UNCORE_GNR_NUM_UNCORE_TYPES] = { NULL, &spr_uncore_pcu, &gnr_uncore_ubox, - &spr_uncore_imc, + &gnr_uncore_imc, NULL, &gnr_uncore_upi, NULL, diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 9b4e04690e1a..80c1696d8d59 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -7,7 +7,7 @@ #include <linux/objtool.h> #include <asm/asm.h> -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct bug_entry; extern void __WARN_trap(struct bug_entry *bug, ...); #endif @@ -137,7 +137,7 @@ do { \ #ifdef HAVE_ARCH_BUG_FORMAT_ARGS -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <linux/static_call_types.h> DECLARE_STATIC_CALL(WARN_trap, __WARN_trap); @@ -153,7 +153,7 @@ struct arch_va_list { struct sysv_va_list args; }; extern void *__warn_args(struct arch_va_list *args, struct pt_regs *regs); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #define __WARN_bug_entry(flags, format) ({ \ struct bug_entry *bug; \ diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index c40b9ebc1fb4..ab3fbbd947ed 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -111,6 +111,12 @@ extern bhi_thunk __bhi_args_end[]; struct pt_regs; +#ifdef CONFIG_CALL_PADDING +#define CFI_OFFSET (CONFIG_FUNCTION_PADDING_CFI+5) +#else +#define CFI_OFFSET 5 +#endif + #ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall @@ -119,11 +125,9 @@ static inline int cfi_get_offset(void) { switch (cfi_mode) { case CFI_FINEIBT: - return 16; + return /* fineibt_prefix_size */ 16; case CFI_KCFI: - if (IS_ENABLED(CONFIG_CALL_PADDING)) - return 16; - return 5; + return CFI_OFFSET; default: return 0; } diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index a1193e9d65f2..462754b0bf8a 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -77,7 +77,7 @@ static __always_inline void native_local_irq_restore(unsigned long flags) #endif #ifndef CONFIG_PARAVIRT -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Used in the idle loop; sti takes one instruction cycle * to complete: @@ -95,7 +95,7 @@ static __always_inline void halt(void) { native_halt(); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 9d38ae744a2e..a7294656ad90 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -68,7 +68,7 @@ * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the * CFI symbol layout changes. * - * Without CALL_THUNKS: + * Without CALL_PADDING: * * .align FUNCTION_ALIGNMENT * __cfi_##name: @@ -77,7 +77,7 @@ * .long __kcfi_typeid_##name * name: * - * With CALL_THUNKS: + * With CALL_PADDING: * * .align FUNCTION_ALIGNMENT * __cfi_##name: diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index c55058f3d75e..409981468cba 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -20,7 +20,7 @@ #define PER_CPU_VAR(var) __percpu(var)__percpu_rel -#else /* !__ASSEMBLY__: */ +#else /* !__ASSEMBLER__: */ #include <linux/args.h> #include <linux/bits.h> diff --git a/arch/x86/include/asm/runtime-const.h b/arch/x86/include/asm/runtime-const.h index e5a13dc8816e..4cd94fdcb45e 100644 --- a/arch/x86/include/asm/runtime-const.h +++ b/arch/x86/include/asm/runtime-const.h @@ -6,7 +6,7 @@ #error "Cannot use runtime-const infrastructure from modules" #endif -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro RUNTIME_CONST_PTR sym reg movq $0x0123456789abcdef, %\reg @@ -16,7 +16,7 @@ .popsection .endm -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ #define runtime_const_ptr(sym) ({ \ typeof(sym) __ret; \ @@ -74,5 +74,5 @@ static inline void runtime_const_fixup(void (*fn)(void *, unsigned long), } } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 869b88061801..3f24cc472ce9 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -25,6 +25,8 @@ extern int ibt_selftest_noendbr(void); void handle_invalid_op(struct pt_regs *regs); #endif +noinstr bool handle_bug(struct pt_regs *regs); + static inline int get_si_code(unsigned long condition) { if (condition & DR_STEP) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a888ae0f01fb..e87da25d1236 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1182,7 +1182,7 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16); + poison_cfi(addr - CFI_OFFSET); } } @@ -1389,6 +1389,8 @@ extern u8 fineibt_preamble_end[]; #define fineibt_preamble_ud 0x13 #define fineibt_preamble_hash 5 +#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE) + /* * <fineibt_caller_start>: * 0: b8 78 56 34 12 mov $0x12345678, %eax @@ -1634,7 +1636,7 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) * have determined there are no indirect calls to it and we * don't need no CFI either. */ - if (!is_endbr(addr + 16)) + if (!is_endbr(addr + CFI_OFFSET)) continue; hash = decode_preamble_hash(addr, &arity); @@ -1642,6 +1644,15 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) addr, addr, 5, addr)) return -EINVAL; + /* + * FineIBT relies on being at func-16, so if the preamble is + * actually larger than that, place it the tail end. + * + * NOTE: this is possible with things like DEBUG_CALL_THUNKS + * and DEBUG_FORCE_FUNCTION_ALIGN_64B. + */ + addr += CFI_OFFSET - fineibt_prefix_size; + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); @@ -1664,10 +1675,10 @@ static void cfi_rewrite_endbr(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - if (!exact_endbr(addr + 16)) + if (!exact_endbr(addr + CFI_OFFSET)) continue; - poison_endbr(addr + 16); + poison_endbr(addr + CFI_OFFSET); } } @@ -1772,7 +1783,8 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, if (FINEIBT_WARN(fineibt_preamble_size, 20) || FINEIBT_WARN(fineibt_preamble_bhi + fineibt_bhi1_size, 20) || FINEIBT_WARN(fineibt_caller_size, 14) || - FINEIBT_WARN(fineibt_paranoid_size, 20)) + FINEIBT_WARN(fineibt_paranoid_size, 20) || + WARN_ON_ONCE(CFI_OFFSET < fineibt_prefix_size)) return; if (cfi_mode == CFI_AUTO) { @@ -1886,6 +1898,11 @@ static void poison_cfi(void *addr) switch (cfi_mode) { case CFI_FINEIBT: /* + * FineIBT preamble is at func-16. + */ + addr += CFI_OFFSET - fineibt_prefix_size; + + /* * FineIBT prefix should start with an ENDBR. */ if (!is_endbr(addr)) @@ -1923,8 +1940,6 @@ static void poison_cfi(void *addr) } } -#define fineibt_prefix_size (fineibt_preamble_size - ENDBR_INSN_SIZE) - /* * When regs->ip points to a 0xD6 byte in the FineIBT preamble, * return true and fill out target and type. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5a6a772e0a6c..4dbff8ef9b1c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -397,7 +397,7 @@ static inline void handle_invalid_op(struct pt_regs *regs) ILL_ILLOPN, error_get_trap_addr(regs)); } -static noinstr bool handle_bug(struct pt_regs *regs) +noinstr bool handle_bug(struct pt_regs *regs) { unsigned long addr = regs->ip; bool handled = false; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d916bd766c94..801bf9e520db 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM_X86 def_tristate KVM if (KVM_INTEL != n || KVM_AMD != n) select KVM_COMMON - select KVM_GENERIC_MMU_NOTIFIER select KVM_ELIDE_TLB_FLUSH_IF_YOUNG select KVM_MMU_LOCKLESS_AGING select HAVE_KVM_IRQCHIP diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3fb64905d190..a03530795707 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4805,7 +4805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: - case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 2fdc1f1f5adb..6b9ff1c6cafa 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -411,14 +411,11 @@ void __init early_fixup_exception(struct pt_regs *regs, int trapnr) return; if (trapnr == X86_TRAP_UD) { - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { - /* Skip the ud2. */ - regs->ip += LEN_UD2; + if (handle_bug(regs)) return; - } /* - * If this was a BUG and report_bug returns or if this + * If this was a BUG and handle_bug returns or if this * was just a normal #UD, we want to continue onward and * crash. */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8f10080e6fe3..e9b78040d703 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -438,17 +438,8 @@ static void emit_kcfi(u8 **pprog, u32 hash) EMIT1_off32(0xb8, hash); /* movl $hash, %eax */ #ifdef CONFIG_CALL_PADDING - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); - EMIT1(0x90); + for (int i = 0; i < CONFIG_FUNCTION_PADDING_CFI; i++) + EMIT1(0x90); #endif EMIT_ENDBR(); |
