| author | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2026-03-19 16:08:51 +0100 |
|---|---|---|
| committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2026-03-19 16:08:51 +0100 |
| commit | 1c6d58b8a03b656bfe4c6930a7d6052782c0bc89 | |
| tree | 70991c25d188f5b1cd6f5d84f9f352ec2fa2effa /arch | |
| parent | 744943ac89cd209aec9414dd751c53528d5757e7 | |
| parent | 4aea1dc4cad17cd146072e13b1fd404f32b8b3ef | |
Merge v6.18.19 (linux-rolling-lts)
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'arch')
28 files changed, 320 insertions, 108 deletions
```diff
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index a64a26aaceba..a03f73bef87c 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -50,11 +50,11 @@
 #define _PAGE_DEFAULT		(_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))
 
-#define _PAGE_KERNEL		(PROT_NORMAL)
-#define _PAGE_KERNEL_RO		((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
-#define _PAGE_KERNEL_ROX	((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY)
-#define _PAGE_KERNEL_EXEC	(PROT_NORMAL & ~PTE_PXN)
-#define _PAGE_KERNEL_EXEC_CONT	((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)
+#define _PAGE_KERNEL		(PROT_NORMAL | PTE_DIRTY)
+#define _PAGE_KERNEL_RO		((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY | PTE_DIRTY)
+#define _PAGE_KERNEL_ROX	((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY | PTE_DIRTY)
+#define _PAGE_KERNEL_EXEC	((PROT_NORMAL & ~PTE_PXN) | PTE_DIRTY)
+#define _PAGE_KERNEL_EXEC_CONT	((PROT_NORMAL & ~PTE_PXN) | PTE_CONT | PTE_DIRTY)
 
 #define _PAGE_SHARED		(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
 #define _PAGE_SHARED_EXEC	(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 49db32f3ddf7..ece04bb10ab0 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -516,7 +516,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
 		granule = kvm_granule_size(level);
 		cur.start = ALIGN_DOWN(addr, granule);
 		cur.end = cur.start + granule;
-		if (!range_included(&cur, range))
+		if (!range_included(&cur, range) && level < KVM_PGTABLE_LAST_LEVEL)
 			continue;
 		*range = cur;
 		return 0;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..0d38dc72dfc6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1712,14 +1712,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	}
 
 	/*
-	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
-	 * ensure we find the right PFN and lay down the mapping in the right
-	 * place.
+	 * Both the canonical IPA and fault IPA must be aligned to the
+	 * mapping size to ensure we find the right PFN and lay down the
+	 * mapping in the right place.
 	 */
-	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
-		fault_ipa &= ~(vma_pagesize - 1);
-		ipa &= ~(vma_pagesize - 1);
-	}
+	fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
+	ipa = ALIGN_DOWN(ipa, vma_pagesize);
 
 	gfn = ipa >> PAGE_SHIFT;
 	mte_allowed = kvm_vma_mte_allowed(vma);
```
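The mmu.c hunk replaces the explicit PMD/PUD masking with unconditional `ALIGN_DOWN()` calls, which behave identically for any power-of-two mapping size. A minimal user-space sketch of that equivalence — the macro here is a stand-in for illustration, not the kernel's definition:

```c
#include <assert.h>
#include <stdint.h>

/* Stand-in for the kernel's ALIGN_DOWN(); for power-of-two 'a' it
 * reduces to the old open-coded "x &= ~(size - 1)" masking. */
#define MODEL_ALIGN_DOWN(x, a)	((x) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t pmd_size = 1ULL << 21;	/* 2 MiB */
	uint64_t pud_size = 1ULL << 30;	/* 1 GiB */

	assert(MODEL_ALIGN_DOWN(0x40300123ULL, pmd_size) == 0x40200000ULL);
	assert(MODEL_ALIGN_DOWN(0x7feedbeefULL, pud_size) == 0x7c0000000ULL);
	return 0;
}
```

For `vma_pagesize == PAGE_SIZE` the operation degenerates to page alignment, so the unconditional form covers every case the old `if` handled and more.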
```diff
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index da62edbc1205..30fa88e49be4 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -140,26 +140,9 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 		goto out_unlock;
 	}
 
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		ret = vgic_allocate_private_irqs_locked(vcpu, type);
-		if (ret)
-			break;
-	}
-
-	if (ret) {
-		kvm_for_each_vcpu(i, vcpu, kvm) {
-			struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
-			kfree(vgic_cpu->private_irqs);
-			vgic_cpu->private_irqs = NULL;
-		}
-
-		goto out_unlock;
-	}
-
 	kvm->arch.vgic.in_kernel = true;
 	kvm->arch.vgic.vgic_model = type;
 	kvm->arch.vgic.implementation_rev = KVM_VGIC_IMP_REV_LATEST;
-	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
 
 	aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
@@ -176,6 +159,23 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
 	kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
 
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		ret = vgic_allocate_private_irqs_locked(vcpu, type);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+			kfree(vgic_cpu->private_irqs);
+			vgic_cpu->private_irqs = NULL;
+		}
+
+		kvm->arch.vgic.vgic_model = 0;
+		goto out_unlock;
+	}
+
 	if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
 		kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index c0557945939c..29024e20e876 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -581,6 +581,27 @@ void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);
 
+static bool contpte_all_subptes_match_access_flags(pte_t *ptep, pte_t entry)
+{
+	pte_t *cont_ptep = contpte_align_down(ptep);
+	/*
+	 * PFNs differ per sub-PTE. Match only bits consumed by
+	 * __ptep_set_access_flags(): AF, DIRTY and write permission.
+	 */
+	const pteval_t cmp_mask = PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;
+	pteval_t entry_cmp = pte_val(entry) & cmp_mask;
+	int i;
+
+	for (i = 0; i < CONT_PTES; i++) {
+		pteval_t pte_cmp = pte_val(__ptep_get(cont_ptep + i)) & cmp_mask;
+
+		if (pte_cmp != entry_cmp)
+			return false;
+	}
+
+	return true;
+}
+
 int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
 				  unsigned long addr, pte_t *ptep,
 				  pte_t entry, int dirty)
@@ -590,14 +611,38 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
 	int i;
 
 	/*
-	 * Gather the access/dirty bits for the contiguous range. If nothing has
-	 * changed, its a noop.
+	 * Check whether all sub-PTEs in the CONT block already match the
+	 * requested access flags/write permission, using raw per-PTE values
+	 * rather than the gathered ptep_get() view.
+	 *
+	 * __ptep_set_access_flags() can update AF, dirty and write
+	 * permission, but only to make the mapping more permissive.
+	 *
+	 * ptep_get() gathers AF/dirty state across the whole CONT block,
+	 * which is correct for a CPU with FEAT_HAFDBS. But page-table
+	 * walkers that evaluate each descriptor individually (e.g. a CPU
+	 * without DBM support, or an SMMU without HTTU, or with HA/HD
+	 * disabled in CD.TCR) can keep faulting on the target sub-PTE if
+	 * only a sibling has been updated. Gathering can therefore cause
+	 * false no-ops when only a sibling has been updated:
+	 *   - write faults: target still has PTE_RDONLY (needs PTE_RDONLY cleared)
+	 *   - read faults: target still lacks PTE_AF
+	 *
+	 * Per Arm ARM (DDI 0487) D8.7.1, any sub-PTE in a CONT range may
+	 * become the effective cached translation, so all entries must have
+	 * consistent attributes. Check the full CONT block before returning
+	 * no-op, and when any sub-PTE mismatches, proceed to update the whole
+	 * range.
 	 */
-	orig_pte = pte_mknoncont(ptep_get(ptep));
-	if (pte_val(orig_pte) == pte_val(entry))
+	if (contpte_all_subptes_match_access_flags(ptep, entry))
 		return 0;
 
 	/*
+	 * Use raw target pte (not gathered) for write-bit unfold decision.
+	 */
+	orig_pte = pte_mknoncont(__ptep_get(ptep));
+
+	/*
 	 * We can fix up access/dirty bits without having to unfold the contig
 	 * range. But if the write bit is changing, we must unfold.
 	 */
```
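The long comment in the contpte hunk is the heart of that fix: `ptep_get()` ORs the access/dirty state of all sub-PTEs together, so a flag set on a sibling can make the target entry look already up to date. A toy model of the resulting false no-op — the bit position and block size are assumptions for the sketch, not arm64's real encoding:

```c
#include <stdio.h>
#include <stdint.h>

#define PTE_AF		(1u << 10)	/* assumed bit position */
#define CONT_PTES	16

/* Model of the gathered view: OR the AF bits of the whole block. */
static uint32_t gathered_view(const uint32_t *block)
{
	uint32_t v = block[0];

	for (int i = 1; i < CONT_PTES; i++)
		v |= block[i] & PTE_AF;
	return v;
}

int main(void)
{
	uint32_t block[CONT_PTES] = { 0 };

	block[3] |= PTE_AF;			/* only a sibling was marked accessed */

	uint32_t wanted = block[7] | PTE_AF;	/* fault wants AF on entry 7 */

	/* Gathered comparison concludes "nothing to do" -- the false no-op... */
	printf("gathered looks up to date: %d\n",
	       (gathered_view(block) & PTE_AF) == (wanted & PTE_AF));
	/* ...while the sub-PTE the hardware keeps faulting on still lacks AF. */
	printf("target sub-PTE has AF:     %d\n", !!(block[7] & PTE_AF));
	return 0;
}
```

The new per-sub-PTE check compares each raw descriptor instead, so a walker that honors descriptors individually (no FEAT_HAFDBS, or an SMMU without HTTU) cannot be starved.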
```diff
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 75f343009b4b..92b2f5097a96 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -91,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 
 	/* Short circuit GCS to avoid bloating the table. */
 	if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
-		prot = gcs_page_prot;
+		/* Honour mprotect(PROT_NONE) on shadow stack mappings */
+		if (vm_flags & VM_ACCESS_FLAGS)
+			prot = gcs_page_prot;
+		else
+			prot = pgprot_val(protection_map[VM_NONE]);
 	} else {
 		prot = pgprot_val(protection_map[vm_flags &
 				  (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 2c139a4dbf4b..17afe7a59edf 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -85,7 +85,7 @@ extern void __update_cache(pte_t pte);
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, (unsigned long)pgd_val(e))
 
 /* This is the size of the initially mapped kernel memory */
-#if defined(CONFIG_64BIT)
+#if defined(CONFIG_64BIT) || defined(CONFIG_KALLSYMS)
 #define KERNEL_INITIAL_ORDER	26	/* 1<<26 = 64MB */
 #else
 #define KERNEL_INITIAL_ORDER	25	/* 1<<25 = 32MB */
diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S
index 96e0264ac961..9188c8d87437 100644
--- a/arch/parisc/kernel/head.S
+++ b/arch/parisc/kernel/head.S
@@ -56,6 +56,7 @@ ENTRY(parisc_kernel_start)
 
 	.import __bss_start,data
 	.import __bss_stop,data
+	.import __end,data
 
 	load32	PA(__bss_start),%r3
 	load32	PA(__bss_stop),%r4
@@ -149,7 +150,11 @@ $cpu_ok:
 	 * everything ... it will get remapped correctly later */
 	ldo		0+_PAGE_KERNEL_RWX(%r0),%r3 /* Hardwired 0 phys addr start */
 	load32		(1<<(KERNEL_INITIAL_ORDER-PAGE_SHIFT)),%r11 /* PFN count */
-	load32		PA(pg0),%r1
+	load32		PA(_end),%r1
+	SHRREG		%r1,PAGE_SHIFT,%r1 /* %r1 is PFN count for _end symbol */
+	cmpb,<<,n	%r11,%r1,1f
+	copy		%r1,%r11	/* %r1 PFN count smaller than %r11 */
+1:	load32		PA(pg0),%r1
 
 $pgt_fill_loop:
 	STREGM		%r3,ASM_PTE_ENTRY_SIZE(%r1)
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index ace483b6f19a..d3e17a7a8901 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -120,14 +120,6 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	printk(KERN_CONT ".\n");
 
-	/*
-	 * Check if initial kernel page mappings are sufficient.
-	 * panic early if not, else we may access kernel functions
-	 * and variables which can't be reached.
-	 */
-	if (__pa((unsigned long) &_end) >= KERNEL_INITIAL_SIZE)
-		panic("KERNEL_INITIAL_ORDER too small!");
-
 #ifdef CONFIG_64BIT
 	if(parisc_narrow_firmware) {
 		printk(KERN_INFO "Kernel is using PDC in 32-bit mode.\n");
@@ -279,6 +271,18 @@ void __init start_parisc(void)
 	int ret, cpunum;
 	struct pdc_coproc_cfg coproc_cfg;
 
+	/*
+	 * Check if initial kernel page mapping is sufficient.
+	 * Print warning if not, because we may access kernel functions and
+	 * variables which can't be reached yet through the initial mappings.
+	 * Note that the panic() and printk() functions are not functional
+	 * yet, so we need to use direct iodc() firmware calls instead.
+	 */
+	const char warn1[] = "CRITICAL: Kernel may crash because "
+			"KERNEL_INITIAL_ORDER is too small.\n";
+	if (__pa((unsigned long) &_end) >= KERNEL_INITIAL_SIZE)
+		pdc_iodc_print(warn1, sizeof(warn1) - 1);
+
 	/* check QEMU/SeaBIOS marker in PAGE0 */
 	running_on_qemu = (memcmp(&PAGE0->pad0, "SeaBIOS", 8) == 0);
```
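The head.S change stops pre-mapping the whole KERNEL_INITIAL_ORDER window when the kernel image ends before it: the loop count becomes the smaller of the two PFN counts. A C sketch of that clamp, mirroring the `cmpb,<< / copy` pair that keeps the smaller count in %r11 — the function and parameter names are invented for illustration:

```c
/*
 * Sketch of the new head.S loop bound: pre-map up to _end, but never
 * more than the KERNEL_INITIAL_ORDER window.
 */
static unsigned long pfns_to_premap(unsigned long pa_of_end,
				    unsigned int kernel_initial_order,
				    unsigned int page_shift)
{
	unsigned long window_pfns = 1UL << (kernel_initial_order - page_shift);
	unsigned long end_pfns = pa_of_end >> page_shift;

	return end_pfns < window_pfns ? end_pfns : window_pfns;
}
```

With 4 KiB pages and the new 64 MiB order the loop is capped at 16384 entries, and at fewer when `_end` falls earlier — which is why the bigger window for KALLSYMS builds does not cost extra mapping work on small kernels.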
```diff
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 3987a5c33558..929f7050c73a 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -253,7 +253,7 @@ __gus_failed:						\
 	".section .fixup,\"ax\"\n"			\
 	"4:	li %0,%3\n"				\
 	"	li %1,0\n"				\
-	"	li %1+1,0\n"				\
+	"	li %L1,0\n"				\
 	"	b 3b\n"					\
 	".previous\n"					\
 	EX_TABLE(1b, 4b)				\
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index d1a2d755381c..f86a6fc11e91 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -22,6 +22,9 @@
 #include <asm/setup.h>
 #include <asm/firmware.h>
 
+#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
+#define __be_word	__PASTE(__be, BITS_PER_LONG)
+
 #ifdef CONFIG_CRASH_DUMP
 void machine_crash_shutdown(struct pt_regs *regs)
 {
@@ -136,37 +139,28 @@ int __init overlaps_crashkernel(unsigned long start, unsigned long size)
 }
 
 /* Values we need to export to the second kernel via the device tree. */
-static phys_addr_t kernel_end;
-static phys_addr_t crashk_base;
-static phys_addr_t crashk_size;
-static unsigned long long mem_limit;
-
-static struct property kernel_end_prop = {
-	.name = "linux,kernel-end",
-	.length = sizeof(phys_addr_t),
-	.value = &kernel_end,
-};
+static __be_word crashk_base;
+static __be_word crashk_size;
+static __be_word mem_limit;
 
 static struct property crashk_base_prop = {
 	.name = "linux,crashkernel-base",
-	.length = sizeof(phys_addr_t),
+	.length = sizeof(__be_word),
 	.value = &crashk_base
 };
 
 static struct property crashk_size_prop = {
 	.name = "linux,crashkernel-size",
-	.length = sizeof(phys_addr_t),
+	.length = sizeof(__be_word),
 	.value = &crashk_size,
 };
 
 static struct property memory_limit_prop = {
 	.name = "linux,memory-limit",
-	.length = sizeof(unsigned long long),
+	.length = sizeof(__be_word),
 	.value = &mem_limit,
 };
 
-#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)
-
 static void __init export_crashk_values(struct device_node *node)
 {
 	/* There might be existing crash kernel properties, but we can't
@@ -190,6 +184,15 @@ static void __init export_crashk_values(struct device_node *node)
 		mem_limit = cpu_to_be_ulong(memory_limit);
 		of_update_property(node, &memory_limit_prop);
 	}
 }
+#endif /* CONFIG_CRASH_RESERVE */
+
+static __be_word kernel_end;
+
+static struct property kernel_end_prop = {
+	.name = "linux,kernel-end",
+	.length = sizeof(__be_word),
+	.value = &kernel_end,
+};
 
 static int __init kexec_setup(void)
 {
@@ -200,16 +203,17 @@ static int __init kexec_setup(void)
 		return -ENOENT;
 
 	/* remove any stale properties so ours can be found */
-	of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL));
+	of_remove_property(node, of_find_property(node, kernel_end_prop.name,
+						  NULL));
 
 	/* information needed by userspace when using default_machine_kexec */
 	kernel_end = cpu_to_be_ulong(__pa(_end));
 	of_add_property(node, &kernel_end_prop);
 
+#ifdef CONFIG_CRASH_RESERVE
 	export_crashk_values(node);
-
+#endif
 	of_node_put(node);
 	return 0;
 }
 late_initcall(kexec_setup);
-#endif /* CONFIG_CRASH_RESERVE */
```
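The two new macros use the kernel's `__PASTE()` token-pasting helper so the exported properties are always fixed-width big-endian words of the native long size. A standalone model of the expansion — the helpers are re-declared locally, and a little-endian 64-bit host is assumed so `cpu_to_be64()` can be modeled with a byte swap (on a big-endian build it would be an identity):

```c
#include <stdint.h>
#include <stdio.h>

/* Local re-declarations of the kernel's two-level paste helpers. */
#define ___PASTE(a, b)	a##b
#define __PASTE(a, b)	___PASTE(a, b)

#define BITS_PER_LONG	64			/* assumed 64-bit build */
#define cpu_to_be64(x)	__builtin_bswap64(x)	/* little-endian host assumed */

/* Expands to cpu_to_be64 on this configuration. */
#define cpu_to_be_ulong	__PASTE(cpu_to_be, BITS_PER_LONG)

int main(void)
{
	uint64_t v = 0x1122334455667788ULL;

	printf("%016llx\n", (unsigned long long)cpu_to_be_ulong(v));
	return 0;
}
```

The two-level helper matters: the outer macro expands `BITS_PER_LONG` to `64` first, and only then does the inner `##` glue it onto `cpu_to_be`.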
```diff
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index e7ef8b2a2554..5f6d50e4c3d4 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -450,6 +450,11 @@ static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
 	kbuf->buffer = headers;
 	kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
 	kbuf->bufsz = headers_sz;
+
+	/*
+	 * Account for extra space required to accommodate additional memory
+	 * ranges in elfcorehdr due to memory hotplug events.
+	 */
 	kbuf->memsz = headers_sz + kdump_extra_elfcorehdr_size(cmem);
 	kbuf->top_down = false;
 
@@ -460,7 +465,14 @@ static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
 	}
 
 	image->elf_load_addr = kbuf->mem;
-	image->elf_headers_sz = headers_sz;
+
+	/*
+	 * If CONFIG_CRASH_HOTPLUG is enabled, the elfcorehdr kexec segment
+	 * memsz can be larger than bufsz. Always initialize elf_headers_sz
+	 * with memsz. This ensures the correct size is reserved for elfcorehdr
+	 * memory in the FDT prepared for kdump.
+	 */
+	image->elf_headers_sz = kbuf->memsz;
 	image->elf_headers = headers;
 out:
 	kfree(cmem);
```
```diff
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 21f7f26a5e2f..189ef7b72081 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -437,7 +437,7 @@ void bpf_jit_free(struct bpf_prog *fp)
 
 bool bpf_jit_supports_kfunc_call(void)
 {
-	return true;
+	return IS_ENABLED(CONFIG_PPC64);
 }
 
 bool bpf_jit_supports_arena(void)
@@ -722,9 +722,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 	 * retval_off		[ return value      ]
 	 *			[ reg argN          ]
 	 *			[ ...               ]
-	 * regs_off		[ reg_arg1          ] prog ctx context
-	 * nregs_off		[ args count        ]
-	 * ip_off		[ traced function   ]
+	 * regs_off		[ reg_arg1          ] prog_ctx
+	 * nregs_off		[ args count        ] ((u64 *)prog_ctx)[-1]
+	 * ip_off		[ traced function   ] ((u64 *)prog_ctx)[-2]
 	 *			[ ...               ]
 	 * run_ctx_off		[ bpf_tramp_run_ctx ]
 	 *			[ reg argN          ]
@@ -824,7 +824,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 
 	bpf_trampoline_save_args(image, ctx, func_frame_offset, nr_regs, regs_off);
 
-	/* Save our return address */
+	/* Save our LR/return address */
 	EMIT(PPC_RAW_MFLR(_R3));
 	if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE))
 		EMIT(PPC_RAW_STL(_R3, _R1, alt_lr_off));
@@ -832,24 +832,34 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
 		EMIT(PPC_RAW_STL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF));
 
 	/*
-	 * Save ip address of the traced function.
-	 * We could recover this from LR, but we will need to address for OOL trampoline,
-	 * and optional GEP area.
+	 * Derive IP address of the traced function.
+	 * In case of CONFIG_PPC_FTRACE_OUT_OF_LINE or BPF program, LR points to the instruction
+	 * after the 'bl' instruction in the OOL stub. Refer to ftrace_init_ool_stub() and
+	 * bpf_arch_text_poke() for OOL stub of kernel functions and bpf programs respectively.
+	 * Relevant stub sequence:
+	 *
+	 *		bl	<tramp>
+	 *	LR (R3) =>	mtlr	r0
+	 *			b	<func_addr+4>
+	 *
+	 * Recover kernel function/bpf program address from the unconditional
+	 * branch instruction at the end of OOL stub.
 	 */
 	if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE) || flags & BPF_TRAMP_F_IP_ARG) {
 		EMIT(PPC_RAW_LWZ(_R4, _R3, 4));
 		EMIT(PPC_RAW_SLWI(_R4, _R4, 6));
 		EMIT(PPC_RAW_SRAWI(_R4, _R4, 6));
 		EMIT(PPC_RAW_ADD(_R3, _R3, _R4));
-		EMIT(PPC_RAW_ADDI(_R3, _R3, 4));
 	}
 
 	if (flags & BPF_TRAMP_F_IP_ARG)
 		EMIT(PPC_RAW_STL(_R3, _R1, ip_off));
 
-	if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE))
+	if (IS_ENABLED(CONFIG_PPC_FTRACE_OUT_OF_LINE)) {
 		/* Fake our LR for unwind */
+		EMIT(PPC_RAW_ADDI(_R3, _R3, 4));
 		EMIT(PPC_RAW_STL(_R3, _R1, bpf_frame_size + PPC_LR_STKOFF));
+	}
 
 	/* Save function arg count -- see bpf_get_func_arg_cnt() */
 	EMIT(PPC_RAW_LI(_R3, nr_regs));
```
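In the trampoline, the word at LR+4 is the stub's unconditional `b`; the slwi/srawi pair sign-extends its low 26 bits (the LI field shifted left 2, with AA=LK=0), and the add recovers the traced function's address. A C model of just that field extraction — it assumes the usual two's-complement arithmetic right shift of signed integers:

```c
#include <stdint.h>
#include <stdio.h>

/*
 * Model of the slwi 6 / srawi 6 pair: sign-extend the low 26 bits of a
 * PowerPC I-form branch ("b target" with AA=LK=0), which hold the
 * byte offset LI << 2. target = address_of_branch + branch_offset(insn).
 */
static int64_t branch_offset(uint32_t insn)
{
	return (int32_t)(insn << 6) >> 6;
}

int main(void)
{
	uint32_t b_fwd  = 0x48000010;	/* "b .+0x10" (opcode 18, LI=4)  */
	uint32_t b_back = 0x4bfffff0;	/* "b .-0x10" (negative offset)  */

	printf("%+lld %+lld\n", (long long)branch_offset(b_fwd),
	       (long long)branch_offset(b_back));
	return 0;
}
```

This is also why the fix moves `addi _R3, _R3, 4` under the OOL branch: the add yields `func_addr` (LR plus the offset measured from LR+4 lands one instruction short of the branch target), and only the faked unwind LR needs the extra +4.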
```diff
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 1fe37128c876..de99f9b354ab 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -319,6 +319,83 @@ int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *
 	return 0;
 }
 
+static int zero_extend(u32 *image, struct codegen_context *ctx, u32 src_reg, u32 dst_reg, u32 size)
+{
+	switch (size) {
+	case 1:
+		/* zero-extend 8 bits into 64 bits */
+		EMIT(PPC_RAW_RLDICL(dst_reg, src_reg, 0, 56));
+		return 0;
+	case 2:
+		/* zero-extend 16 bits into 64 bits */
+		EMIT(PPC_RAW_RLDICL(dst_reg, src_reg, 0, 48));
+		return 0;
+	case 4:
+		/* zero-extend 32 bits into 64 bits */
+		EMIT(PPC_RAW_RLDICL(dst_reg, src_reg, 0, 32));
+		fallthrough;
+	case 8:
+		/* Nothing to do */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+static int sign_extend(u32 *image, struct codegen_context *ctx, u32 src_reg, u32 dst_reg, u32 size)
+{
+	switch (size) {
+	case 1:
+		/* sign-extend 8 bits into 64 bits */
+		EMIT(PPC_RAW_EXTSB(dst_reg, src_reg));
+		return 0;
+	case 2:
+		/* sign-extend 16 bits into 64 bits */
+		EMIT(PPC_RAW_EXTSH(dst_reg, src_reg));
+		return 0;
+	case 4:
+		/* sign-extend 32 bits into 64 bits */
+		EMIT(PPC_RAW_EXTSW(dst_reg, src_reg));
+		fallthrough;
+	case 8:
+		/* Nothing to do */
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Handle powerpc ABI expectations from caller:
+ * - Unsigned arguments are zero-extended.
+ * - Signed arguments are sign-extended.
+ */
+static int prepare_for_kfunc_call(const struct bpf_prog *fp, u32 *image,
+				  struct codegen_context *ctx,
+				  const struct bpf_insn *insn)
+{
+	const struct btf_func_model *m = bpf_jit_find_kfunc_model(fp, insn);
+	int i;
+
+	if (!m)
+		return -1;
+
+	for (i = 0; i < m->nr_args; i++) {
+		/* Note that BPF ABI only allows up to 5 args for kfuncs */
+		u32 reg = bpf_to_ppc(BPF_REG_1 + i), size = m->arg_size[i];
+
+		if (!(m->arg_flags[i] & BTF_FMODEL_SIGNED_ARG)) {
+			if (zero_extend(image, ctx, reg, reg, size))
+				return -1;
+		} else {
+			if (sign_extend(image, ctx, reg, reg, size))
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
 static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 out)
 {
 	/*
@@ -931,14 +1008,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code
 				/* special mov32 for zext */
 				EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 0, 31));
 				break;
-			} else if (off == 8) {
-				EMIT(PPC_RAW_EXTSB(dst_reg, src_reg));
-			} else if (off == 16) {
-				EMIT(PPC_RAW_EXTSH(dst_reg, src_reg));
-			} else if (off == 32) {
-				EMIT(PPC_RAW_EXTSW(dst_reg, src_reg));
-			} else if (dst_reg != src_reg)
-				EMIT(PPC_RAW_MR(dst_reg, src_reg));
+			}
+			if (off == 0) {
+				/* MOV */
+				if (dst_reg != src_reg)
+					EMIT(PPC_RAW_MR(dst_reg, src_reg));
+			} else {
+				/* MOVSX: dst = (s8,s16,s32)src (off = 8,16,32) */
+				if (sign_extend(image, ctx, src_reg, dst_reg, off / 8))
+					return -1;
+			}
 			goto bpf_alu32_trunc;
 		case BPF_ALU | BPF_MOV | BPF_K: /* (u32) dst = imm */
 		case BPF_ALU64 | BPF_MOV | BPF_K: /* dst = (s64) imm */
@@ -1395,6 +1474,12 @@ emit_clear:
 			if (ret < 0)
 				return ret;
 
+			/* Take care of powerpc ABI requirements before kfunc call */
+			if (insn[i].src_reg == BPF_PSEUDO_KFUNC_CALL) {
+				if (prepare_for_kfunc_call(fp, image, ctx, &insn[i]))
+					return -1;
+			}
+
 			ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr);
 			if (ret)
 				return ret;
```
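`rldicl` with SH=0 and MB=m simply clears the top m bits of the register, which is why `zero_extend()` above needs just one instruction per width. A C model of the instruction's rotate-and-clear-left semantics — a sketch of its effect, not the JIT itself:

```c
#include <assert.h>
#include <stdint.h>

/* rldicl ra,rs,sh,mb: rotate rs left by sh, keep the low 64-mb bits. */
static uint64_t model_rldicl(uint64_t rs, unsigned int sh, unsigned int mb)
{
	uint64_t rot = sh ? (rs << sh) | (rs >> (64 - sh)) : rs;
	uint64_t mask = mb ? (~0ULL >> mb) : ~0ULL;

	return rot & mask;
}

int main(void)
{
	uint64_t x = 0xffffffffffffff80ULL;	/* (s8)-128 in a 64-bit reg */

	assert(model_rldicl(x, 0, 56) == 0x80);		/* zero-extend 8  */
	assert(model_rldicl(x, 0, 48) == 0xff80);	/* zero-extend 16 */
	assert(model_rldicl(x, 0, 32) == 0xffffff80);	/* zero-extend 32 */
	return 0;
}
```

Sign extension has dedicated extsb/extsh/extsw instructions, so `sign_extend()` does not need the rotate trick.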
```diff
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 26aa26482c9a..992cc5c98214 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -103,6 +103,11 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 void
 perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
+	perf_callchain_store(entry, perf_arch_instruction_pointer(regs));
+
+	if (!current->mm)
+		return;
+
 	if (!is_32bit_task())
 		perf_callchain_user_64(entry, regs);
 	else
diff --git a/arch/powerpc/perf/callchain_32.c b/arch/powerpc/perf/callchain_32.c
index ddcc2d8aa64a..0de21c5d272c 100644
--- a/arch/powerpc/perf/callchain_32.c
+++ b/arch/powerpc/perf/callchain_32.c
@@ -142,7 +142,6 @@ void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 	next_ip = perf_arch_instruction_pointer(regs);
 	lr = regs->link;
 	sp = regs->gpr[1];
-	perf_callchain_store(entry, next_ip);
 
 	while (entry->nr < entry->max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c
index 115d1c105e8a..30fb61c5f0cb 100644
--- a/arch/powerpc/perf/callchain_64.c
+++ b/arch/powerpc/perf/callchain_64.c
@@ -77,7 +77,6 @@ void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 	next_ip = perf_arch_instruction_pointer(regs);
 	lr = regs->link;
 	sp = regs->gpr[1];
-	perf_callchain_store(entry, next_ip);
 
 	while (entry->nr < entry->max_stack) {
 		fp = (unsigned long __user *) sp;
diff --git a/arch/powerpc/platforms/83xx/km83xx.c b/arch/powerpc/platforms/83xx/km83xx.c
index 2b5d187d9b62..9ef8fb39dd1b 100644
--- a/arch/powerpc/platforms/83xx/km83xx.c
+++ b/arch/powerpc/platforms/83xx/km83xx.c
@@ -155,8 +155,8 @@ machine_device_initcall(mpc83xx_km, mpc83xx_declare_of_platform_devices);
 
 /* list of the supported boards */
 static char *board[] __initdata = {
-	"Keymile,KMETER1",
-	"Keymile,kmpbec8321",
+	"keymile,KMETER1",
+	"keymile,kmpbec8321",
 	NULL
 };
```
```diff
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index edc30cda5dbc..56f17296545a 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -605,7 +605,7 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
 					      &pseries_msi_irq_chip, pseries_dev);
 	}
 
-	pseries_dev->msi_used++;
+	pseries_dev->msi_used += nr_irqs;
 	return 0;
 
 out:
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 93e1034485d7..70010bba27e7 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -164,7 +164,7 @@ static __always_inline void __stackleak_poison(unsigned long erase_low,
 		"	j	4f\n"
 		"3:	mvc	8(1,%[addr]),0(%[addr])\n"
 		"4:"
-		: [addr] "+&a" (erase_low), [count] "+&d" (count), [tmp] "=&a" (tmp)
+		: [addr] "+&a" (erase_low), [count] "+&a" (count), [tmp] "=&a" (tmp)
 		: [poison] "d" (poison)
 		: "memory", "cc"
 		);
diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
index 1721b73b7803..81c0235c0466 100644
--- a/arch/s390/lib/xor.c
+++ b/arch/s390/lib/xor.c
@@ -28,8 +28,8 @@ static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
 		"	j	3f\n"
 		"2:	xc	0(1,%1),0(%2)\n"
 		"3:"
-		: : "d" (bytes), "a" (p1), "a" (p2)
-		: "0", "cc", "memory");
+		: "+d" (bytes), "+a" (p1), "+a" (p2)
+		: : "0", "cc", "memory");
 }
 
 static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1,
@@ -96,7 +96,6 @@ static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
 			  const unsigned long * __restrict p5)
 {
 	asm volatile(
-		"	larl	1,2f\n"
 		"	aghi	%0,-1\n"
 		"	jm	6f\n"
 		"	srlg	0,%0,8\n"
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
index e6175d75e4b0..ea2a224459b6 100644
--- a/arch/s390/mm/pfault.c
+++ b/arch/s390/mm/pfault.c
@@ -62,7 +62,7 @@ int __pfault_init(void)
 		"0:	nopr	%%r7\n"
 		EX_TABLE(0b, 0b)
 		: [rc] "+d" (rc)
-		: [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
+		: [refbk] "a" (virt_to_phys(&pfault_init_refbk)), "m" (pfault_init_refbk)
 		: "cc");
 	return rc;
 }
@@ -84,7 +84,7 @@ void __pfault_fini(void)
 		"0:	nopr	%%r7\n"
 		EX_TABLE(0b, 0b)
 		:
-		: [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
+		: [refbk] "a" (virt_to_phys(&pfault_fini_refbk)), "m" (pfault_fini_refbk)
 		: "cc");
 }
```
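The xor.c change is an inline-asm constraint fix: the XC loop advances `bytes`, `p1` and `p2`, so they must be declared as read-write outputs (`"+"`) rather than plain inputs, or the compiler is free to assume the registers still hold their old values afterwards. A minimal illustration of the same rule, deliberately written in x86-64 rather than s390 so it is easy to try — a demo of the constraint, not the s390 code:

```c
#include <stdio.h>

/*
 * The asm body modifies 'count', so the operand is declared "+r"
 * (read-write). Declaring it as a plain input "r" would be the same
 * bug the s390 patch fixes: the compiler could reuse the stale
 * pre-asm value of the register afterwards.
 */
static unsigned long drain(unsigned long count)
{
#if defined(__x86_64__)
	/* count must be non-zero on entry */
	asm volatile("1:	dec	%0\n\t"
		     "	jnz	1b"
		     : "+r" (count)
		     :
		     : "cc");
#else
	count = 0;	/* portable stand-in so the sketch still runs */
#endif
	return count;	/* always 0: the compiler knows count changed */
}

int main(void)
{
	printf("%lu\n", drain(5));
	return 0;
}
```

The pfault.c hunks fix a different class of operand bug: the facility expects a physical address, so the virtual address of the refbk must go through `virt_to_phys()` before being handed to the asm.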
```diff
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0f2f9f1552a4..03866683ac33 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2473,7 +2473,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
 	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |	\
 	 KVM_X86_QUIRK_SLOT_ZAP_ALL |		\
 	 KVM_X86_QUIRK_STUFF_FEATURE_MSRS |	\
-	 KVM_X86_QUIRK_IGNORE_GUEST_PAT)
+	 KVM_X86_QUIRK_IGNORE_GUEST_PAT |	\
+	 KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM)
 
 #define KVM_X86_CONDITIONAL_QUIRKS \
 	(KVM_X86_QUIRK_CD_NW_CLEARED |		\
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 1584926cd1f4..42e4e835a7b4 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -476,6 +476,7 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_SLOT_ZAP_ALL		(1 << 7)
 #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS	(1 << 8)
 #define KVM_X86_QUIRK_IGNORE_GUEST_PAT		(1 << 9)
+#define KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM	(1 << 10)
 
 #define KVM_STATE_NESTED_FORMAT_VMX	0
 #define KVM_STATE_NESTED_FORMAT_SVM	1
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 680d305589a3..aa1b0ef5e931 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1890,6 +1890,7 @@ void __init check_x2apic(void)
 
 static inline void try_to_enable_x2apic(int remap_mode) { }
 static inline void __x2apic_enable(void) { }
+static inline void __x2apic_disable(void) { }
 #endif /* !CONFIG_X86_X2APIC */
 
 void __init enable_IR_x2apic(void)
@@ -2452,6 +2453,11 @@ static void lapic_resume(void)
 	if (x2apic_mode) {
 		__x2apic_enable();
 	} else {
+		if (x2apic_enabled()) {
+			pr_warn_once("x2apic: re-enabled by firmware during resume. Disabling\n");
+			__x2apic_disable();
+		}
+
 		/*
 		 * Make sure the APICBASE points to the right address
 		 *
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index fef00546c885..ba6e9485e824 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -158,15 +158,34 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm,
 	svm->x2avic_msrs_intercepted = intercept;
 }
 
+static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu)
+{
+	u32 arch_max;
+
+	if (x2avic_enabled && apic_x2apic_mode(vcpu->arch.apic))
+		arch_max = X2AVIC_MAX_PHYSICAL_ID;
+	else
+		arch_max = AVIC_MAX_PHYSICAL_ID;
+
+	/*
+	 * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID
+	 * plus one, so the max possible APIC ID is one less than that.
+	 */
+	return min(vcpu->kvm->arch.max_vcpu_ids - 1, arch_max);
+}
+
 static void avic_activate_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb *vmcb = svm->vmcb01.ptr;
+	struct kvm_vcpu *vcpu = &svm->vcpu;
 
 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
-
+	vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu);
 	vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
 
+	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
+
 	/*
 	 * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
 	 * accesses, while interrupt injection to a running vCPU can be
@@ -176,7 +195,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
 	 */
 	if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
 		vmcb->control.int_ctl |= X2APIC_MODE_MASK;
-		vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
+
 		/* Disabling MSR intercept for x2APIC registers */
 		avic_set_x2apic_msr_interception(svm, false);
 	} else {
@@ -186,8 +205,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
 		 */
 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
 
-		/* For xAVIC and hybrid-xAVIC modes */
-		vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
 		/* Enabling MSR intercept for x2APIC registers */
 		avic_set_x2apic_msr_interception(svm, true);
 	}
@@ -200,6 +217,9 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
 	vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK);
 	vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK;
 
+	if (!sev_es_guest(svm->vcpu.kvm))
+		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+
 	/*
 	 * If running nested and the guest uses its own MSR bitmap, there
 	 * is no need to update L0's msr bitmap
@@ -321,7 +341,7 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
 	vmcb->control.avic_physical_id = __sme_set(__pa(kvm_svm->avic_physical_id_table));
 	vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE;
 
-	if (kvm_apicv_activated(svm->vcpu.kvm))
+	if (kvm_vcpu_apicv_active(&svm->vcpu))
 		avic_activate_vmcb(svm);
 	else
 		avic_deactivate_vmcb(svm);
```
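`avic_get_max_physical_id()` clamps the per-VM maximum APIC ID to the architectural AVIC/x2AVIC table limits instead of always programming the architectural maximum. A self-contained model of that clamp — the two limit values are assumptions for the sketch (taken as 0xff for xAVIC and 0x1ff for x2AVIC):

```c
#include <stdio.h>

#define AVIC_MAX_PHYSICAL_ID	0xffu	/* assumed xAVIC limit  */
#define X2AVIC_MAX_PHYSICAL_ID	0x1ffu	/* assumed x2AVIC limit */

/* max_vcpu_ids is "highest APIC ID + 1", so subtract one before clamping. */
static unsigned int max_physical_id(unsigned int max_vcpu_ids, int x2apic)
{
	unsigned int arch_max = x2apic ? X2AVIC_MAX_PHYSICAL_ID
				       : AVIC_MAX_PHYSICAL_ID;
	unsigned int want = max_vcpu_ids - 1;

	return want < arch_max ? want : arch_max;
}

int main(void)
{
	printf("%#x\n", max_physical_id(8, 0));	   /* 0x7: small VM      */
	printf("%#x\n", max_physical_id(4096, 1)); /* 0x1ff: arch-clamped */
	return 0;
}
```

Keeping the programmed maximum tight means the hardware never scans physical-ID table entries beyond what the VM can actually use.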
```diff
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index eed104207a11..939b94418554 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1032,8 +1032,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
 	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
 	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
 	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
-	if (!kvm_vcpu_apicv_active(vcpu))
-		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
+	svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
 
 	set_dr_intercepts(svm);
 
@@ -1141,7 +1140,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
 		svm_clr_intercept(svm, INTERCEPT_PAUSE);
 	}
 
-	if (kvm_vcpu_apicv_active(vcpu))
+	if (enable_apicv && irqchip_in_kernel(vcpu->kvm))
 		avic_init_vmcb(svm, vmcb);
 
 	if (vnmi)
@@ -2598,9 +2597,11 @@ static int dr_interception(struct kvm_vcpu *vcpu)
 
 static int cr8_write_interception(struct kvm_vcpu *vcpu)
 {
+	u8 cr8_prev = kvm_get_cr8(vcpu);
 	int r;
-	u8 cr8_prev = kvm_get_cr8(vcpu);
 
+	WARN_ON_ONCE(kvm_vcpu_apicv_active(vcpu));
+
 	/* instruction emulation calls kvm_set_cr8() */
 	r = cr_interception(vcpu);
 	if (lapic_in_kernel(vcpu))
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1725c6a94f99..36574b6b637d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3262,10 +3262,24 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
 	if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
 		return -EINVAL;
 
-	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
-	    (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
-	     CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false))))
-		return -EINVAL;
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+		u64 debugctl = vmcs12->guest_ia32_debugctl;
+
+		/*
+		 * FREEZE_IN_SMM is not virtualized, but allow L1 to set it in
+		 * vmcs12's DEBUGCTL under a quirk for backwards compatibility.
+		 * Note that the quirk only relaxes the consistency check. The
+		 * vmcs02 bit is still under the control of the host. In
+		 * particular, if a host administrator decides to clear the bit,
+		 * then L1 has no say in the matter.
+		 */
+		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM))
+			debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM;
+
+		if (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
+		    CC(!vmx_is_valid_debugctl(vcpu, debugctl, false)))
+			return -EINVAL;
+	}
 
 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
 	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
```
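The quirk is purely a relaxation of the consistency check: the bit is masked out of the value being validated, never out of the state that will actually be programmed. A compact model of that distinction — the bit position is an assumption for the sketch (DEBUGCTL's freeze-while-SMM control is commonly bit 14):

```c
#include <stdio.h>
#include <stdint.h>

#define DEBUGCTLMSR_FREEZE_IN_SMM	(1ULL << 14)	/* assumed bit */

/* valid_bits: what the host permits the guest to load into DEBUGCTL. */
static int debugctl_ok(uint64_t guest_debugctl, uint64_t valid_bits, int quirk)
{
	uint64_t check = guest_debugctl;

	/* The quirk only relaxes the *check*; vmcs02 stays host-controlled. */
	if (quirk)
		check &= ~DEBUGCTLMSR_FREEZE_IN_SMM;

	return (check & ~valid_bits) == 0;
}

int main(void)
{
	/* L1 sets FREEZE_IN_SMM; the host does not list it as valid. */
	uint64_t dbgctl = DEBUGCTLMSR_FREEZE_IN_SMM;

	printf("no quirk: %d\n", debugctl_ok(dbgctl, 0, 0));	/* rejected */
	printf("quirk:    %d\n", debugctl_ok(dbgctl, 0, 1));	/* accepted */
	return 0;
}
```

So with the quirk enabled, an older L1 that sets the bit no longer fails nested VM-entry, but the effective vmcs02 value remains whatever the host decided.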
