113 files changed, 3309 insertions, 1729 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 3b621c3ae67c..79cdc1b177f1 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8524,7 +8524,7 @@ Therefore, the ioctl must be called *before* reading the content of
 the dirty pages.
 
 The dirty ring can get full. When it happens, the KVM_RUN of the
-vcpu will return with exit reason KVM_EXIT_DIRTY_LOG_FULL.
+vcpu will return with exit reason KVM_EXIT_DIRTY_RING_FULL.
 
 The dirty ring interface has a major difference comparing to the
 KVM_GET_DIRTY_LOG interface in that, when reading the dirty ring from
diff --git a/Documentation/virt/kvm/x86/errata.rst b/Documentation/virt/kvm/x86/errata.rst
index 37c79362a48f..a9cf0e004651 100644
--- a/Documentation/virt/kvm/x86/errata.rst
+++ b/Documentation/virt/kvm/x86/errata.rst
@@ -48,7 +48,14 @@ versus "has_error_code", i.e. KVM's ABI follows AMD behavior.
 
 Nested virtualization features
 ------------------------------
-TBD
+On AMD CPUs, when GIF is cleared, #DB exceptions or traps due to a breakpoint
+register match are ignored and discarded by the CPU. The CPU relies on the VMM
+to fully virtualize this behavior, even when vGIF is enabled for the guest
+(i.e. vGIF=0 does not cause the CPU to drop #DBs when the guest is running).
+KVM does not virtualize this behavior as the complexity is unjustified given
+the rarity of the use case. One way to handle this would be for KVM to
+intercept the #DB, temporarily disable the breakpoint, single-step over the
+instruction, then re-enable the breakpoint.
 
 x2APIC
 ------
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 052bf0d4d0b0..e7140c19771b 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	return r;
 }
 
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+				  unsigned long arg)
+{
+	return -ENOIOCTLCMD;
+}
+
 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
 
diff --git a/arch/loongarch/include/asm/kvm_eiointc.h b/arch/loongarch/include/asm/kvm_eiointc.h
index a3a40aba8acf..8b7a2fa3f7f8 100644
--- a/arch/loongarch/include/asm/kvm_eiointc.h
+++ b/arch/loongarch/include/asm/kvm_eiointc.h
@@ -10,10 +10,7 @@
 #define EIOINTC_IRQS			256
 #define EIOINTC_ROUTE_MAX_VCPUS		256
-#define EIOINTC_IRQS_U8_NUMS		(EIOINTC_IRQS / 8)
-#define EIOINTC_IRQS_U16_NUMS		(EIOINTC_IRQS_U8_NUMS / 2)
-#define EIOINTC_IRQS_U32_NUMS		(EIOINTC_IRQS_U8_NUMS / 4)
-#define EIOINTC_IRQS_U64_NUMS		(EIOINTC_IRQS_U8_NUMS / 8)
+#define EIOINTC_IRQS_U64_NUMS		(EIOINTC_IRQS / 64)
 /* map to ipnum per 32 irqs */
 #define EIOINTC_IRQS_NODETYPE_COUNT	16
 
@@ -64,54 +61,18 @@ struct loongarch_eiointc {
 	uint32_t status;
 
 	/* hardware state */
-	union nodetype {
-		u64 reg_u64[EIOINTC_IRQS_NODETYPE_COUNT / 4];
-		u32 reg_u32[EIOINTC_IRQS_NODETYPE_COUNT / 2];
-		u16 reg_u16[EIOINTC_IRQS_NODETYPE_COUNT];
-		u8 reg_u8[EIOINTC_IRQS_NODETYPE_COUNT * 2];
-	} nodetype;
+	u64 nodetype[EIOINTC_IRQS_NODETYPE_COUNT / 4];
 
 	/* one bit shows the state of one irq */
-	union bounce {
-		u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
-		u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
-		u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
-		u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
-	} bounce;
-
-	union isr {
-		u64 reg_u64[EIOINTC_IRQS_U64_NUMS];
-		u32 reg_u32[EIOINTC_IRQS_U32_NUMS];
-		u16 reg_u16[EIOINTC_IRQS_U16_NUMS];
-		u8 reg_u8[EIOINTC_IRQS_U8_NUMS];
-	} isr;
-	union coreisr {
-		u64 reg_u64[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS];
-		u32
reg_u32[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U32_NUMS]; - u16 reg_u16[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U16_NUMS]; - u8 reg_u8[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U8_NUMS]; - } coreisr; - union enable { - u64 reg_u64[EIOINTC_IRQS_U64_NUMS]; - u32 reg_u32[EIOINTC_IRQS_U32_NUMS]; - u16 reg_u16[EIOINTC_IRQS_U16_NUMS]; - u8 reg_u8[EIOINTC_IRQS_U8_NUMS]; - } enable; + u64 bounce[EIOINTC_IRQS_U64_NUMS]; + u64 isr[EIOINTC_IRQS_U64_NUMS]; + u64 coreisr[EIOINTC_ROUTE_MAX_VCPUS][EIOINTC_IRQS_U64_NUMS]; + u64 enable[EIOINTC_IRQS_U64_NUMS]; /* use one byte to config ipmap for 32 irqs at once */ - union ipmap { - u64 reg_u64; - u32 reg_u32[EIOINTC_IRQS_U32_NUMS / 4]; - u16 reg_u16[EIOINTC_IRQS_U16_NUMS / 4]; - u8 reg_u8[EIOINTC_IRQS_U8_NUMS / 4]; - } ipmap; + u64 ipmap; /* use one byte to config coremap for one irq */ - union coremap { - u64 reg_u64[EIOINTC_IRQS / 8]; - u32 reg_u32[EIOINTC_IRQS / 4]; - u16 reg_u16[EIOINTC_IRQS / 2]; - u8 reg_u8[EIOINTC_IRQS]; - } coremap; + u64 coremap[EIOINTC_IRQS / 8]; DECLARE_BITMAP(sw_coreisr[EIOINTC_ROUTE_MAX_VCPUS][LOONGSON_IP_NUM], EIOINTC_IRQS); uint8_t sw_coremap[EIOINTC_IRQS]; diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 0cecbd038bb3..e4fe5b8e8149 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -126,6 +126,8 @@ struct kvm_arch { struct kvm_phyid_map *phyid_map; /* Enabled PV features */ unsigned long pv_features; + /* Supported KVM features */ + unsigned long kvm_features; s64 time_offset; struct kvm_context __percpu *vmcs; @@ -293,6 +295,12 @@ static inline int kvm_get_pmu_num(struct kvm_vcpu_arch *arch) return (arch->cpucfg[6] & CPUCFG6_PMNUM) >> CPUCFG6_PMNUM_SHIFT; } +/* Check whether KVM support this feature (VMM may disable it) */ +static inline bool kvm_vm_support(struct kvm_arch *arch, int feature) +{ + return !!(arch->kvm_features & BIT_ULL(feature)); +} + bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu); /* Debug: dump vcpu state */ diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h index f1efd7cfbc20..3784ab4ccdb5 100644 --- a/arch/loongarch/include/asm/kvm_vcpu.h +++ b/arch/loongarch/include/asm/kvm_vcpu.h @@ -15,6 +15,7 @@ #define CPU_PMU (_ULCAST_(1) << 10) #define CPU_TIMER (_ULCAST_(1) << 11) #define CPU_IPI (_ULCAST_(1) << 12) +#define CPU_AVEC (_ULCAST_(1) << 14) /* Controlled by 0x52 guest exception VIP aligned to estat bit 5~12 */ #define CPU_IP0 (_ULCAST_(1)) diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 3de03cb864b2..58a4a3b6b035 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -511,6 +511,8 @@ #define CSR_GCFG_GPERF_SHIFT 24 #define CSR_GCFG_GPERF_WIDTH 3 #define CSR_GCFG_GPERF (_ULCAST_(0x7) << CSR_GCFG_GPERF_SHIFT) +#define CSR_GCFG_GPMP_SHIFT 23 +#define CSR_GCFG_GPMP (_ULCAST_(0x1) << CSR_GCFG_GPMP_SHIFT) #define CSR_GCFG_GCI_SHIFT 20 #define CSR_GCFG_GCI_WIDTH 2 #define CSR_GCFG_GCI (_ULCAST_(0x3) << CSR_GCFG_GCI_SHIFT) diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h index 57ba1a563bb1..de6c3f18e40a 100644 --- a/arch/loongarch/include/uapi/asm/kvm.h +++ b/arch/loongarch/include/uapi/asm/kvm.h @@ -104,6 +104,7 @@ struct kvm_fpu { #define KVM_LOONGARCH_VM_FEAT_PV_IPI 6 #define KVM_LOONGARCH_VM_FEAT_PV_STEALTIME 7 #define KVM_LOONGARCH_VM_FEAT_PTW 8 +#define KVM_LOONGARCH_VM_FEAT_MSGINT 9 /* Device Control API on vcpu fd */ #define 
KVM_LOONGARCH_VCPU_CPUCFG 0 diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ae64bbdf83a7..ed4f724db774 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_MSI select HAVE_KVM_READONLY_MEM - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/loongarch/kvm/intc/eiointc.c b/arch/loongarch/kvm/intc/eiointc.c index a1cc116b4dac..29886876143f 100644 --- a/arch/loongarch/kvm/intc/eiointc.c +++ b/arch/loongarch/kvm/intc/eiointc.c @@ -13,19 +13,19 @@ static void eiointc_set_sw_coreisr(struct loongarch_eiointc *s) struct kvm_vcpu *vcpu; for (irq = 0; irq < EIOINTC_IRQS; irq++) { - ipnum = s->ipmap.reg_u8[irq / 32]; + ipnum = (s->ipmap >> (irq / 32 * 8)) & 0xff; if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) { ipnum = count_trailing_zeros(ipnum); ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0; } - cpuid = s->coremap.reg_u8[irq]; + cpuid = ((u8 *)s->coremap)[irq]; vcpu = kvm_get_vcpu_by_cpuid(s->kvm, cpuid); if (!vcpu) continue; cpu = vcpu->vcpu_id; - if (test_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu])) + if (test_bit(irq, (unsigned long *)s->coreisr[cpu])) __set_bit(irq, s->sw_coreisr[cpu][ipnum]); else __clear_bit(irq, s->sw_coreisr[cpu][ipnum]); @@ -38,7 +38,7 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) struct kvm_vcpu *vcpu; struct kvm_interrupt vcpu_irq; - ipnum = s->ipmap.reg_u8[irq / 32]; + ipnum = (s->ipmap >> (irq / 32 * 8)) & 0xff; if (!(s->status & BIT(EIOINTC_ENABLE_INT_ENCODE))) { ipnum = count_trailing_zeros(ipnum); ipnum = (ipnum >= 0 && ipnum < 4) ? ipnum : 0; @@ -53,13 +53,13 @@ static void eiointc_update_irq(struct loongarch_eiointc *s, int irq, int level) if (level) { /* if not enable return false */ - if (!test_bit(irq, (unsigned long *)s->enable.reg_u32)) + if (!test_bit(irq, (unsigned long *)s->enable)) return; - __set_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]); + __set_bit(irq, (unsigned long *)s->coreisr[cpu]); found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS); __set_bit(irq, s->sw_coreisr[cpu][ipnum]); } else { - __clear_bit(irq, (unsigned long *)s->coreisr.reg_u32[cpu]); + __clear_bit(irq, (unsigned long *)s->coreisr[cpu]); __clear_bit(irq, s->sw_coreisr[cpu][ipnum]); found = find_first_bit(s->sw_coreisr[cpu][ipnum], EIOINTC_IRQS); } @@ -94,7 +94,7 @@ static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, if (s->sw_coremap[irq + i] == cpu) continue; - if (notify && test_bit(irq + i, (unsigned long *)s->isr.reg_u8)) { + if (notify && test_bit(irq + i, (unsigned long *)s->isr)) { /* lower irq at old cpu and raise irq at new cpu */ eiointc_update_irq(s, irq + i, 0); s->sw_coremap[irq + i] = cpu; @@ -108,7 +108,7 @@ static inline void eiointc_update_sw_coremap(struct loongarch_eiointc *s, void eiointc_set_irq(struct loongarch_eiointc *s, int irq, int level) { unsigned long flags; - unsigned long *isr = (unsigned long *)s->isr.reg_u8; + unsigned long *isr = (unsigned long *)s->isr; spin_lock_irqsave(&s->lock, flags); level ? __set_bit(irq, isr) : __clear_bit(irq, isr); @@ -127,27 +127,27 @@ static int loongarch_eiointc_read(struct kvm_vcpu *vcpu, struct loongarch_eioint switch (offset) { case EIOINTC_NODETYPE_START ... 
EIOINTC_NODETYPE_END: index = (offset - EIOINTC_NODETYPE_START) >> 3; - data = s->nodetype.reg_u64[index]; + data = s->nodetype[index]; break; case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: index = (offset - EIOINTC_IPMAP_START) >> 3; - data = s->ipmap.reg_u64; + data = s->ipmap; break; case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: index = (offset - EIOINTC_ENABLE_START) >> 3; - data = s->enable.reg_u64[index]; + data = s->enable[index]; break; case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: index = (offset - EIOINTC_BOUNCE_START) >> 3; - data = s->bounce.reg_u64[index]; + data = s->bounce[index]; break; case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: index = (offset - EIOINTC_COREISR_START) >> 3; - data = s->coreisr.reg_u64[vcpu->vcpu_id][index]; + data = s->coreisr[vcpu->vcpu_id][index]; break; case EIOINTC_COREMAP_START ... EIOINTC_COREMAP_END: index = (offset - EIOINTC_COREMAP_START) >> 3; - data = s->coremap.reg_u64[index]; + data = s->coremap[index]; break; default: ret = -EINVAL; @@ -223,26 +223,26 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, switch (offset) { case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: index = (offset - EIOINTC_NODETYPE_START) >> 3; - old = s->nodetype.reg_u64[index]; - s->nodetype.reg_u64[index] = (old & ~mask) | data; + old = s->nodetype[index]; + s->nodetype[index] = (old & ~mask) | data; break; case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: /* * ipmap cannot be set at runtime, can be set only at the beginning * of irqchip driver, need not update upper irq level */ - old = s->ipmap.reg_u64; - s->ipmap.reg_u64 = (old & ~mask) | data; + old = s->ipmap; + s->ipmap = (old & ~mask) | data; break; case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: index = (offset - EIOINTC_ENABLE_START) >> 3; - old = s->enable.reg_u64[index]; - s->enable.reg_u64[index] = (old & ~mask) | data; + old = s->enable[index]; + s->enable[index] = (old & ~mask) | data; /* * 1: enable irq. * update irq when isr is set. */ - data = s->enable.reg_u64[index] & ~old & s->isr.reg_u64[index]; + data = s->enable[index] & ~old & s->isr[index]; while (data) { irq = __ffs(data); eiointc_update_irq(s, irq + index * 64, 1); @@ -252,7 +252,7 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, * 0: disable irq. * update irq when isr is set. */ - data = ~s->enable.reg_u64[index] & old & s->isr.reg_u64[index]; + data = ~s->enable[index] & old & s->isr[index]; while (data) { irq = __ffs(data); eiointc_update_irq(s, irq + index * 64, 0); @@ -262,16 +262,16 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: /* do not emulate hw bounced irq routing */ index = (offset - EIOINTC_BOUNCE_START) >> 3; - old = s->bounce.reg_u64[index]; - s->bounce.reg_u64[index] = (old & ~mask) | data; + old = s->bounce[index]; + s->bounce[index] = (old & ~mask) | data; break; case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: index = (offset - EIOINTC_COREISR_START) >> 3; /* use attrs to get current cpu index */ cpu = vcpu->vcpu_id; - old = s->coreisr.reg_u64[cpu][index]; + old = s->coreisr[cpu][index]; /* write 1 to clear interrupt */ - s->coreisr.reg_u64[cpu][index] = old & ~data; + s->coreisr[cpu][index] = old & ~data; data &= old; while (data) { irq = __ffs(data); @@ -281,9 +281,9 @@ static int loongarch_eiointc_write(struct kvm_vcpu *vcpu, break; case EIOINTC_COREMAP_START ... 
EIOINTC_COREMAP_END: index = (offset - EIOINTC_COREMAP_START) >> 3; - old = s->coremap.reg_u64[index]; - s->coremap.reg_u64[index] = (old & ~mask) | data; - data = s->coremap.reg_u64[index]; + old = s->coremap[index]; + s->coremap[index] = (old & ~mask) | data; + data = s->coremap[index]; eiointc_update_sw_coremap(s, index * 8, data, sizeof(data), true); break; default: @@ -451,10 +451,10 @@ static int kvm_eiointc_ctrl_access(struct kvm_device *dev, break; case KVM_DEV_LOONGARCH_EXTIOI_CTRL_LOAD_FINISHED: eiointc_set_sw_coreisr(s); - for (i = 0; i < (EIOINTC_IRQS / 4); i++) { - start_irq = i * 4; + for (i = 0; i < (EIOINTC_IRQS / 8); i++) { + start_irq = i * 8; eiointc_update_sw_coremap(s, start_irq, - s->coremap.reg_u32[i], sizeof(u32), false); + s->coremap[i], sizeof(u64), false); } break; default: @@ -481,34 +481,34 @@ static int kvm_eiointc_regs_access(struct kvm_device *dev, switch (addr) { case EIOINTC_NODETYPE_START ... EIOINTC_NODETYPE_END: offset = (addr - EIOINTC_NODETYPE_START) / 4; - p = &s->nodetype.reg_u32[offset]; + p = s->nodetype + offset * 4; break; case EIOINTC_IPMAP_START ... EIOINTC_IPMAP_END: offset = (addr - EIOINTC_IPMAP_START) / 4; - p = &s->ipmap.reg_u32[offset]; + p = &s->ipmap + offset * 4; break; case EIOINTC_ENABLE_START ... EIOINTC_ENABLE_END: offset = (addr - EIOINTC_ENABLE_START) / 4; - p = &s->enable.reg_u32[offset]; + p = s->enable + offset * 4; break; case EIOINTC_BOUNCE_START ... EIOINTC_BOUNCE_END: offset = (addr - EIOINTC_BOUNCE_START) / 4; - p = &s->bounce.reg_u32[offset]; + p = s->bounce + offset * 4; break; case EIOINTC_ISR_START ... EIOINTC_ISR_END: offset = (addr - EIOINTC_ISR_START) / 4; - p = &s->isr.reg_u32[offset]; + p = s->isr + offset * 4; break; case EIOINTC_COREISR_START ... EIOINTC_COREISR_END: if (cpu >= s->num_cpu) return -EINVAL; offset = (addr - EIOINTC_COREISR_START) / 4; - p = &s->coreisr.reg_u32[cpu][offset]; + p = s->coreisr[cpu] + offset * 4; break; case EIOINTC_COREMAP_START ... 
EIOINTC_COREMAP_END: offset = (addr - EIOINTC_COREMAP_START) / 4; - p = &s->coremap.reg_u32[offset]; + p = s->coremap + offset * 4; break; default: kvm_err("%s: unknown eiointc register, addr = %d\n", __func__, addr); diff --git a/arch/loongarch/kvm/interrupt.c b/arch/loongarch/kvm/interrupt.c index 8462083f0301..a6d42d399a59 100644 --- a/arch/loongarch/kvm/interrupt.c +++ b/arch/loongarch/kvm/interrupt.c @@ -21,6 +21,7 @@ static unsigned int priority_to_irq[EXCCODE_INT_NUM] = { [INT_HWI5] = CPU_IP5, [INT_HWI6] = CPU_IP6, [INT_HWI7] = CPU_IP7, + [INT_AVEC] = CPU_AVEC, }; static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) @@ -31,6 +32,11 @@ static int kvm_irq_deliver(struct kvm_vcpu *vcpu, unsigned int priority) if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; + if (cpu_has_msgint && (priority == INT_AVEC)) { + set_gcsr_estat(irq); + return 1; + } + switch (priority) { case INT_TI: case INT_IPI: @@ -58,6 +64,11 @@ static int kvm_irq_clear(struct kvm_vcpu *vcpu, unsigned int priority) if (priority < EXCCODE_INT_NUM) irq = priority_to_irq[priority]; + if (cpu_has_msgint && (priority == INT_AVEC)) { + clear_gcsr_estat(irq); + return 1; + } + switch (priority) { case INT_TI: case INT_IPI: @@ -83,10 +94,10 @@ void kvm_deliver_intr(struct kvm_vcpu *vcpu) unsigned long *pending = &vcpu->arch.irq_pending; unsigned long *pending_clr = &vcpu->arch.irq_clear; - for_each_set_bit(priority, pending_clr, INT_IPI + 1) + for_each_set_bit(priority, pending_clr, EXCCODE_INT_NUM) kvm_irq_clear(vcpu, priority); - for_each_set_bit(priority, pending, INT_IPI + 1) + for_each_set_bit(priority, pending, EXCCODE_INT_NUM) kvm_irq_deliver(vcpu, priority); } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 1245a6b35896..6d833599ef2e 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -659,8 +659,7 @@ static int _kvm_get_cpucfg_mask(int id, u64 *v) *v = GENMASK(31, 0); return 0; case LOONGARCH_CPUCFG1: - /* CPUCFG1_MSGINT is not supported by KVM */ - *v = GENMASK(25, 0); + *v = GENMASK(26, 0); return 0; case LOONGARCH_CPUCFG2: /* CPUCFG2 features unconditionally supported by KVM */ @@ -728,6 +727,10 @@ static int kvm_check_cpucfg(int id, u64 val) return -EINVAL; switch (id) { + case LOONGARCH_CPUCFG1: + if ((val & CPUCFG1_MSGINT) && !cpu_has_msgint) + return -EINVAL; + return 0; case LOONGARCH_CPUCFG2: if (!(val & CPUCFG2_LLFTP)) /* Guests must have a constant timer */ @@ -1473,8 +1476,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; @@ -1657,6 +1660,12 @@ static int _kvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN2); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_LLBCTL); + if (cpu_has_msgint) { + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR0); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR1); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR2); + kvm_restore_hw_gcsr(csr, LOONGARCH_CSR_ISR3); + } /* Restore Root.GINTC from unused Guest.GINTC register */ write_csr_gintc(csr->csrs[LOONGARCH_CSR_GINTC]); @@ -1746,6 +1755,12 @@ static int _kvm_vcpu_put(struct kvm_vcpu *vcpu, int cpu) kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN1); kvm_save_hw_gcsr(csr, 
LOONGARCH_CSR_DMWIN2); kvm_save_hw_gcsr(csr, LOONGARCH_CSR_DMWIN3); + if (cpu_has_msgint) { + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR0); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR1); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR2); + kvm_save_hw_gcsr(csr, LOONGARCH_CSR_ISR3); + } vcpu->arch.aux_inuse |= KVM_LARCH_SWCSR_LATEST; diff --git a/arch/loongarch/kvm/vm.c b/arch/loongarch/kvm/vm.c index a49b1c1a3dd1..194ccbcdc3b3 100644 --- a/arch/loongarch/kvm/vm.c +++ b/arch/loongarch/kvm/vm.c @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include <asm/kvm_mmu.h> #include <asm/kvm_vcpu.h> +#include <asm/kvm_csr.h> #include <asm/kvm_eiointc.h> #include <asm/kvm_pch_pic.h> @@ -24,6 +25,23 @@ const struct kvm_stats_header kvm_vm_stats_header = { sizeof(kvm_vm_stats_desc), }; +static void kvm_vm_init_features(struct kvm *kvm) +{ + unsigned long val; + + val = read_csr_gcfg(); + if (val & CSR_GCFG_GPMP) + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PMU); + + /* Enable all PV features by default */ + kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); + kvm->arch.kvm_features = BIT(KVM_LOONGARCH_VM_FEAT_PV_IPI); + if (kvm_pvtime_supported()) { + kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm->arch.kvm_features |= BIT(KVM_LOONGARCH_VM_FEAT_PV_STEALTIME); + } +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int i; @@ -42,11 +60,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) spin_lock_init(&kvm->arch.phyid_map_lock); kvm_init_vmcs(kvm); - - /* Enable all PV features by default */ - kvm->arch.pv_features = BIT(KVM_FEATURE_IPI); - if (kvm_pvtime_supported()) - kvm->arch.pv_features |= BIT(KVM_FEATURE_STEAL_TIME); + kvm_vm_init_features(kvm); /* * cpu_vabits means user address space only (a half of total). @@ -136,18 +150,18 @@ static int kvm_vm_feature_has_attr(struct kvm *kvm, struct kvm_device_attr *attr if (cpu_has_lbt_mips) return 0; return -ENXIO; - case KVM_LOONGARCH_VM_FEAT_PMU: - if (cpu_has_pmp) + case KVM_LOONGARCH_VM_FEAT_PTW: + if (cpu_has_ptw) return 0; return -ENXIO; - case KVM_LOONGARCH_VM_FEAT_PV_IPI: - return 0; - case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME: - if (kvm_pvtime_supported()) + case KVM_LOONGARCH_VM_FEAT_MSGINT: + if (cpu_has_msgint) return 0; return -ENXIO; - case KVM_LOONGARCH_VM_FEAT_PTW: - if (cpu_has_ptw) + case KVM_LOONGARCH_VM_FEAT_PMU: + case KVM_LOONGARCH_VM_FEAT_PV_IPI: + case KVM_LOONGARCH_VM_FEAT_PV_STEALTIME: + if (kvm_vm_support(&kvm->arch, attr->attr)) return 0; return -ENXIO; default: diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index ab57221fa4dd..cc13cc35f208 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a75587018f44..b0fb92fda4d4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 2f2702c867f7..c9a2d50ff1b0 100644 --- a/arch/powerpc/kvm/Kconfig +++ 
b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select KVM_COMMON - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select HAVE_KVM_IRQ_BYPASS diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2ba057171ebe..9a89a6d98f97 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index c50328212917..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI - select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_READONLY_MEM select HAVE_KVM_DIRTY_RING_ACQ_REL select KVM_COMMON diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 9f07a3177a28..a55a95da54d0 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..96d16028e8b7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..8c4caa5f2fcd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4091a776e37a..c2f1281b32d8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -338,6 +338,7 @@ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ #define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ +#define X86_FEATURE_EFER_LMSLE_MBZ (13*32+20) /* EFER.LMSLE must be zero */ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ @@ -499,6 +500,12 @@ #define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on 
exit-to-userspace, see VMSCAPE bug */ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO (21*32+17) /* + * Clear CPU buffers before VM-Enter if the vCPU + * can access host MMIO (ignored for all intents + * and purposes if CLEAR_CPU_BUF_VM is set). + */ +#define X86_FEATURE_X2AVIC_EXT (21*32+18) /* AMD SVM x2AVIC support for 4k vCPUs */ /* * BUG word(s) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index f00c09ffe6a9..6b6d472baa0b 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -5,7 +5,7 @@ #include <linux/threads.h> typedef struct { -#if IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) u8 kvm_cpu_l1tf_flush_l1d; #endif unsigned int __nmi_count; /* arch dependent */ @@ -68,7 +68,7 @@ extern u64 arch_irq_stat(void); DECLARE_PER_CPU_CACHE_HOT(u16, __softirq_pending); #define local_softirq_pending_ref __softirq_pending -#if IS_ENABLED(CONFIG_KVM_INTEL) +#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) /* * This function is called from noinstr interrupt contexts * and must be inlined to not get instrumentation. diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index fdf178443f85..de709fb5bd76 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -128,6 +128,7 @@ KVM_X86_OP(enable_smi_window) KVM_X86_OP_OPTIONAL(dev_get_attr) KVM_X86_OP_OPTIONAL(mem_enc_ioctl) KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl) +KVM_X86_OP_OPTIONAL(vcpu_mem_enc_unlocked_ioctl) KVM_X86_OP_OPTIONAL(mem_enc_register_region) KVM_X86_OP_OPTIONAL(mem_enc_unregister_region) KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48598d017d6f..5a3bfa293e8b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1055,9 +1055,6 @@ struct kvm_vcpu_arch { /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; - /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ - bool l1tf_flush_l1d; - /* Host CPU on which VM-entry was most recently attempted */ int last_vmentry_cpu; @@ -1456,8 +1453,6 @@ struct kvm_arch { bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; - struct delayed_work kvmclock_update_work; - struct delayed_work kvmclock_sync_work; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; @@ -1848,15 +1843,15 @@ struct kvm_x86_ops { void *external_spt); /* Update the external page table from spte getting set. */ int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - kvm_pfn_t pfn_for_gfn); + u64 mirror_spte); /* Update external page tables for page table about to be freed. */ int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, void *external_spt); /* Update external page table from spte getting removed, and flush TLB. 
*/ - int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, - kvm_pfn_t pfn_for_gfn); + void (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level, + u64 mirror_spte); bool (*has_wbinvd_exit)(void); @@ -1914,6 +1909,7 @@ struct kvm_x86_ops { int (*dev_get_attr)(u32 group, u64 attr, u64 *val); int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); + int (*vcpu_mem_enc_unlocked_ioctl)(struct kvm_vcpu *vcpu, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); @@ -2143,6 +2139,11 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); * the gfn, i.e. retrying the instruction will hit a * !PRESENT fault, which results in a new shadow page * and sends KVM back to square one. + * + * EMULTYPE_SKIP_SOFT_INT - Set in combination with EMULTYPE_SKIP to only skip + * an instruction if it could generate a given software + * interrupt, which must be encoded via + * EMULTYPE_SET_SOFT_INT_VECTOR(). */ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) @@ -2153,6 +2154,10 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) #define EMULTYPE_WRITE_PF_TO_SP (1 << 8) +#define EMULTYPE_SKIP_SOFT_INT (1 << 9) + +#define EMULTYPE_SET_SOFT_INT_VECTOR(v) ((u32)((v) & 0xff) << 16) +#define EMULTYPE_GET_SOFT_INT_VECTOR(e) (((e) >> 16) & 0xff) static inline bool kvm_can_emulate_event_vectoring(int emul_type) { @@ -2167,6 +2172,7 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa); +void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -2378,7 +2384,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); -void kvm_user_return_msr_update_cache(unsigned int index, u64 val); u64 kvm_get_user_return_msr(unsigned int slot); static inline bool kvm_is_supported_user_return_msr(u32 msr) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 08ed5a2e46a5..a6526c5be5ca 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -308,24 +308,26 @@ * CFLAGS.ZF. * Note: Only the memory operand variant of VERW clears the CPU buffers. */ -.macro __CLEAR_CPU_BUFFERS feature #ifdef CONFIG_X86_64 - ALTERNATIVE "", "verw x86_verw_sel(%rip)", \feature +#define VERW verw x86_verw_sel(%rip) #else - /* - * In 32bit mode, the memory operand must be a %cs reference. The data - * segments may not be usable (vm86 mode), and the stack segment may not - * be flat (ESPFIX32). - */ - ALTERNATIVE "", "verw %cs:x86_verw_sel", \feature +/* + * In 32bit mode, the memory operand must be a %cs reference. The data segments + * may not be usable (vm86 mode), and the stack segment may not be flat (ESPFIX32). 
+ */ +#define VERW verw %cs:x86_verw_sel #endif -.endm -#define CLEAR_CPU_BUFFERS \ - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF +/* + * Provide a stringified VERW macro for simple usage, and a non-stringified + * VERW macro for use in more elaborate sequences, e.g. to encode a conditional + * VERW within an ALTERNATIVE. + */ +#define __CLEAR_CPU_BUFFERS __stringify(VERW) -#define VM_CLEAR_CPU_BUFFERS \ - __CLEAR_CPU_BUFFERS X86_FEATURE_CLEAR_CPU_BUF_VM +/* If necessary, emit VERW on exit-to-userspace to clear CPU buffers. */ +#define CLEAR_CPU_BUFFERS \ + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF #ifdef CONFIG_X86_64 .macro CLEAR_BRANCH_HISTORY @@ -580,8 +582,6 @@ DECLARE_STATIC_KEY_FALSE(cpu_buf_idle_clear); DECLARE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -DECLARE_STATIC_KEY_FALSE(cpu_buf_vm_clear); - extern u16 x86_verw_sel; #include <asm/segment.h> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 17f6c3fedeee..e69b6d0dedcf 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -279,7 +279,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; -#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) +#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(11, 0) /* * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as @@ -289,11 +289,14 @@ enum avic_ipi_failure_cause { /* * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511). + * With X86_FEATURE_X2AVIC_EXT, the max index is increased to 0xfff (4095). */ #define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL +#define X2AVIC_4K_MAX_PHYSICAL_ID 0xFFFUL static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID); static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID); +static_assert((X2AVIC_4K_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_4K_MAX_PHYSICAL_ID); #define SVM_SEV_FEAT_SNP_ACTIVE BIT(0) #define SVM_SEV_FEAT_RESTRICTED_INJECTION BIT(3) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d420c9c066d4..7ceff6583652 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -502,6 +502,7 @@ struct kvm_sync_regs { /* vendor-specific groups and attributes for system fd */ #define KVM_X86_GRP_SEV 1 # define KVM_X86_SEV_VMSA_FEATURES 0 +# define KVM_X86_SNP_POLICY_BITS 1 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d7fa03bf51b4..70bb48ab46d7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -192,14 +192,6 @@ EXPORT_SYMBOL_GPL(cpu_buf_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* - * Controls CPU Fill buffer clear before VMenter. This is a subset of - * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only - * mitigation is required. - */ -DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); -EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); - #undef pr_fmt #define pr_fmt(fmt) "mitigations: " fmt @@ -489,8 +481,8 @@ static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; /* - * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing - * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to + * userspace *and* on entry to KVM guests. 
*/ static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; @@ -536,6 +528,7 @@ static void __init mds_apply_mitigation(void) if (mds_mitigation == MDS_MITIGATION_FULL || mds_mitigation == MDS_MITIGATION_VMWERV) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) cpu_smt_disable(false); @@ -647,6 +640,7 @@ static void __init taa_apply_mitigation(void) * present on host, enable the mitigation for UCODE_NEEDED as well. */ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) cpu_smt_disable(false); @@ -748,9 +742,9 @@ static void __init mmio_apply_mitigation(void) */ if (verw_clear_cpu_buf_mitigation_selected) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - static_branch_disable(&cpu_buf_vm_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); } else { - static_branch_enable(&cpu_buf_vm_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO); } /* @@ -839,8 +833,10 @@ static void __init rfds_update_mitigation(void) static void __init rfds_apply_mitigation(void) { - if (rfds_mitigation == RFDS_MITIGATION_VERW) + if (rfds_mitigation == RFDS_MITIGATION_VERW) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + } } static __init int rfds_parse_cmdline(char *str) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index caa4dc885c21..aa7f21f5f46b 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -49,6 +49,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 }, { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52524e0ca97f..d563a948318b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1135,6 +1135,7 @@ void kvm_set_cpu_caps(void) F(AMD_STIBP), F(AMD_STIBP_ALWAYS_ON), F(AMD_IBRS_SAME_MODE), + PASSTHROUGH_F(EFER_LMSLE_MBZ), F(AMD_PSFD), F(AMD_IBPB_RET), ); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4e3da5b497b8..c8e292e9a24d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -81,9 +81,8 @@ */ /* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define DstShift 1 +#define ByteOp (1<<0) /* 8-bit operands. */ +#define DstShift 1 /* Destination operand type at bits 1-5 */ #define ImplicitOps (OpImplicit << DstShift) #define DstReg (OpReg << DstShift) #define DstMem (OpMem << DstShift) @@ -95,8 +94,7 @@ #define DstDX (OpDX << DstShift) #define DstAccLo (OpAccLo << DstShift) #define DstMask (OpMask << DstShift) -/* Source operand type. 
*/ -#define SrcShift 6 +#define SrcShift 6 /* Source operand type at bits 6-10 */ #define SrcNone (OpNone << SrcShift) #define SrcReg (OpReg << SrcShift) #define SrcMem (OpMem << SrcShift) @@ -119,10 +117,10 @@ #define SrcAccHi (OpAccHi << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) -#define MemAbs (1<<12) /* Memory operand is absolute displacement */ +#define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ #define Stack (1<<14) /* Stack instruction (push/pop) */ -#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */ +#define GroupMask (7<<15) /* Group mechanisms, at bits 15-17 */ #define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ @@ -131,11 +129,8 @@ #define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */ #define ModeDual (7<<15) /* Different instruction for 32/64 bit */ #define Sse (1<<18) /* SSE Vector instruction */ -/* Generic ModRM decode. */ -#define ModRM (1<<19) -/* Destination is only written; never read. */ -#define Mov (1<<20) -/* Misc flags */ +#define ModRM (1<<19) /* Generic ModRM decode. */ +#define Mov (1<<20) /* Destination is only written; never read. */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ @@ -143,11 +138,11 @@ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ -#define No64 (1<<28) +#define No64 (1<<28) /* Instruction generates #UD in 64-bit mode */ #define PageTable (1 << 29) /* instruction used to write page table */ #define NotImpl (1 << 30) /* instruction is not implemented */ -/* Source 2 operand type */ -#define Src2Shift (31) +#define Avx ((u64)1 << 31) /* Instruction uses VEX prefix */ +#define Src2Shift (32) /* Source 2 operand type at bits 32-36 */ #define Src2None (OpNone << Src2Shift) #define Src2Mem (OpMem << Src2Shift) #define Src2CL (OpCL << Src2Shift) @@ -161,12 +156,13 @@ #define Src2FS (OpFS << Src2Shift) #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) +/* free: 37-39 */ #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ -#define AlignMask ((u64)7 << 41) +#define AlignMask ((u64)3 << 41) /* Memory alignment requirement at bits 41-42 */ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ -#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ -#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */ +#define Aligned16 ((u64)3 << 41) /* Aligned to 16 byte boundary (e.g. 
FXSAVE) */ +/* free: 43-44 */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ @@ -243,6 +239,13 @@ enum x86_transfer_type { X86_TRANSFER_TASK_SWITCH, }; +enum rex_bits { + REX_B = 1, + REX_X = 2, + REX_R = 4, + REX_W = 8, +}; + static void writeback_registers(struct x86_emulate_ctxt *ctxt) { unsigned long dirty = ctxt->regs_dirty; @@ -622,7 +625,6 @@ static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) switch (alignment) { case Unaligned: - case Avx: return 1; case Aligned16: return 16; @@ -924,7 +926,7 @@ static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, int byteop) { void *p; - int highbyte_regs = (ctxt->rex_prefix == 0) && byteop; + int highbyte_regs = (ctxt->rex_prefix == REX_NONE) && byteop; if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; @@ -1030,6 +1032,7 @@ static void fetch_register_operand(struct operand *op) op->val = *(u64 *)op->addr.reg; break; } + op->orig_val = op->val; } static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1075,17 +1078,17 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op) +static void __decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op, int reg) { - unsigned int reg; - - if (ctxt->d & ModRM) - reg = ctxt->modrm_reg; - else - reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); - - if (ctxt->d & Sse) { + if ((ctxt->d & Avx) && ctxt->op_bytes == 32) { + op->type = OP_YMM; + op->bytes = 32; + op->addr.xmm = reg; + kvm_read_avx_reg(reg, &op->vec_val2); + return; + } + if (ctxt->d & (Avx|Sse)) { op->type = OP_XMM; op->bytes = 16; op->addr.xmm = reg; @@ -1103,9 +1106,20 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp); - fetch_register_operand(op); - op->orig_val = op->val; +} + +static void decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op) +{ + unsigned int reg; + + if (ctxt->d & ModRM) + reg = ctxt->modrm_reg; + else + reg = (ctxt->b & 7) | (ctxt->rex_bits & REX_B ? 8 : 0); + + __decode_register_operand(ctxt, op, reg); } static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) @@ -1122,9 +1136,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ - index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ - base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ + ctxt->modrm_reg = (ctxt->rex_bits & REX_R ? 8 : 0); + index_reg = (ctxt->rex_bits & REX_X ? 8 : 0); + base_reg = (ctxt->rex_bits & REX_B ? 8 : 0); ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; @@ -1132,24 +1146,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { - op->type = OP_REG; - op->bytes = (ctxt->d & ByteOp) ? 
1 : ctxt->op_bytes; - op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, - ctxt->d & ByteOp); - if (ctxt->d & Sse) { - op->type = OP_XMM; - op->bytes = 16; - op->addr.xmm = ctxt->modrm_rm; - kvm_read_sse_reg(ctxt->modrm_rm, &op->vec_val); - return rc; - } - if (ctxt->d & Mmx) { - op->type = OP_MM; - op->bytes = 8; - op->addr.mm = ctxt->modrm_rm & 7; - return rc; - } - fetch_register_operand(op); + __decode_register_operand(ctxt, op, ctxt->modrm_rm); return rc; } @@ -1783,7 +1780,15 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) op->data, op->bytes * op->count); case OP_XMM: - kvm_write_sse_reg(op->addr.xmm, &op->vec_val); + if (!(ctxt->d & Avx)) { + kvm_write_sse_reg(op->addr.xmm, &op->vec_val); + break; + } + /* full YMM write but with high bytes cleared */ + memset(op->valptr + 16, 0, 16); + fallthrough; + case OP_YMM: + kvm_write_avx_reg(op->addr.xmm, &op->vec_val2); break; case OP_MM: kvm_write_mmx_reg(op->addr.mm, &op->mm_val); @@ -2466,7 +2471,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(&cs, &ss); - if ((ctxt->rex_prefix & 0x8) != 0x0) + if (ctxt->rex_bits & REX_W) usermode = X86EMUL_MODE_PROT64; else usermode = X86EMUL_MODE_PROT32; @@ -3958,6 +3963,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static const struct opcode ud = I(SrcNone, emulate_ud); + static const struct opcode group7_rm0[] = { N, I(SrcNone | Priv | EmulateOnUD, em_hypercall), @@ -4114,7 +4121,7 @@ static const struct group_dual group15 = { { } }; static const struct gprefix pfx_0f_6f_0f_7f = { - I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), + I(Mmx, em_mov), I(Sse | Avx | Aligned, em_mov), N, I(Sse | Avx | Unaligned, em_mov), }; static const struct instr_dual instr_dual_0f_2b = { @@ -4133,8 +4140,8 @@ static const struct gprefix pfx_0f_28_0f_29 = { I(Aligned, em_mov), I(Aligned, em_mov), N, N, }; -static const struct gprefix pfx_0f_e7 = { - N, I(Sse, em_mov), N, N, +static const struct gprefix pfx_0f_e7_0f_38_2a = { + N, I(Sse | Avx, em_mov), N, N, }; static const struct escape escape_d9 = { { @@ -4347,8 +4354,8 @@ static const struct opcode twobyte_table[256] = { DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_10_0f_11), - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_10_0f_11), + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_10_0f_11), + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_10_0f_11), N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 4 * prefetch + 4 * reserved NOP */ D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4364,9 +4371,9 @@ static const struct opcode twobyte_table[256] = { IIP(ModRM | SrcMem | Priv | Op3264 | NoMod, em_dr_write, dr_write, check_dr_write), N, N, N, N, - GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29), - GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29), - N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b), + GP(ModRM | DstReg | SrcMem | Mov | Sse | Avx, &pfx_0f_28_0f_29), + GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_28_0f_29), + N, GP(ModRM | DstMem | SrcReg | Mov | Sse | Avx, &pfx_0f_2b), N, N, N, N, /* 0x30 - 0x3F */ II(ImplicitOps | Priv, em_wrmsr, wrmsr), @@ -4431,7 +4438,7 @@ static const struct opcode twobyte_table[256] = { /* 0xD0 - 0xDF */ N, N, N, N, N, N, 
N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7), + N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N @@ -4458,8 +4465,13 @@ static const struct gprefix three_byte_0f_38_f1 = { * byte. */ static const struct opcode opcode_map_0f_38[256] = { - /* 0x00 - 0x7f */ - X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), + /* 0x00 - 0x1f */ + X16(N), X16(N), + /* 0x20 - 0x2f */ + X8(N), + X2(N), GP(SrcReg | DstMem | ModRM | Mov | Aligned, &pfx_0f_e7_0f_38_2a), N, N, N, N, N, + /* 0x30 - 0x7f */ + X16(N), X16(N), X16(N), X16(N), X16(N), /* 0x80 - 0xef */ X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), /* 0xf0 - 0xf1 */ @@ -4618,14 +4630,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); - op->orig_val = op->val; break; case OpAccLo: op->type = OP_REG; op->bytes = (ctxt->d & ByteOp) ? 2 : ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); fetch_register_operand(op); - op->orig_val = op->val; break; case OpAccHi: if (ctxt->d & ByteOp) { @@ -4636,7 +4646,6 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = ctxt->op_bytes; op->addr.reg = reg_rmw(ctxt, VCPU_REGS_RDX); fetch_register_operand(op); - op->orig_val = op->val; break; case OpDI: op->type = OP_MEM; @@ -4755,12 +4764,87 @@ done: return rc; } +static int x86_decode_avx(struct x86_emulate_ctxt *ctxt, + u8 vex_1st, u8 vex_2nd, struct opcode *opcode) +{ + u8 vex_3rd, map, pp, l, v; + int rc = X86EMUL_CONTINUE; + + if (ctxt->rep_prefix || ctxt->op_prefix || ctxt->rex_prefix) + goto ud; + + if (vex_1st == 0xc5) { + /* Expand RVVVVlpp to VEX3 format */ + vex_3rd = vex_2nd & ~0x80; /* VVVVlpp from VEX2, w=0 */ + vex_2nd = (vex_2nd & 0x80) | 0x61; /* R from VEX2, X=1 B=1 mmmmm=00001 */ + } else { + vex_3rd = insn_fetch(u8, ctxt); + } + + /* vex_2nd = RXBmmmmm, vex_3rd = wVVVVlpp. Fix polarity */ + vex_2nd ^= 0xE0; /* binary 11100000 */ + vex_3rd ^= 0x78; /* binary 01111000 */ + + ctxt->rex_prefix = REX_PREFIX; + ctxt->rex_bits = (vex_2nd & 0xE0) >> 5; /* RXB */ + ctxt->rex_bits |= (vex_3rd & 0x80) >> 4; /* w */ + if (ctxt->rex_bits && ctxt->mode != X86EMUL_MODE_PROT64) + goto ud; + + map = vex_2nd & 0x1f; + v = (vex_3rd >> 3) & 0xf; + l = vex_3rd & 0x4; + pp = vex_3rd & 0x3; + + ctxt->b = insn_fetch(u8, ctxt); + switch (map) { + case 1: + ctxt->opcode_len = 2; + *opcode = twobyte_table[ctxt->b]; + break; + case 2: + ctxt->opcode_len = 3; + *opcode = opcode_map_0f_38[ctxt->b]; + break; + case 3: + /* no 0f 3a instructions are supported yet */ + return X86EMUL_UNHANDLEABLE; + default: + goto ud; + } + + /* + * No three operand instructions are supported yet; those that + * *are* marked with the Avx flag reserve the VVVV flag. 
+ */ + if (v) + goto ud; + + if (l) + ctxt->op_bytes = 32; + else + ctxt->op_bytes = 16; + + switch (pp) { + case 0: break; + case 1: ctxt->op_prefix = true; break; + case 2: ctxt->rep_prefix = 0xf3; break; + case 3: ctxt->rep_prefix = 0xf2; break; + } + +done: + return rc; +ud: + *opcode = ud; + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int emulation_type) { int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; - bool op_prefix = false; + bool vex_prefix = false; bool has_seg_override = false; struct opcode opcode; u16 dummy; @@ -4812,7 +4896,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int for (;;) { switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ - op_prefix = true; + ctxt->op_prefix = true; /* switch between 2/4 bytes */ ctxt->op_bytes = def_op_bytes ^ 6; break; @@ -4851,7 +4935,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) goto done_prefixes; - ctxt->rex_prefix = ctxt->b; + ctxt->rex_prefix = REX_PREFIX; + ctxt->rex_bits = ctxt->b & 0xf; continue; case 0xf0: /* LOCK */ ctxt->lock_prefix = 1; @@ -4865,20 +4950,33 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int } /* Any legacy prefix after a REX prefix nullifies its effect. */ - - ctxt->rex_prefix = 0; + ctxt->rex_prefix = REX_NONE; + ctxt->rex_bits = 0; } done_prefixes: /* REX prefix. */ - if (ctxt->rex_prefix & 8) - ctxt->op_bytes = 8; /* REX.W */ + if (ctxt->rex_bits & REX_W) + ctxt->op_bytes = 8; /* Opcode byte(s). */ - opcode = opcode_table[ctxt->b]; - /* Two-byte opcode? */ - if (ctxt->b == 0x0f) { + if (ctxt->b == 0xc4 || ctxt->b == 0xc5) { + /* VEX or LDS/LES */ + u8 vex_2nd = insn_fetch(u8, ctxt); + if (mode != X86EMUL_MODE_PROT64 && (vex_2nd & 0xc0) != 0xc0) { + opcode = opcode_table[ctxt->b]; + ctxt->modrm = vex_2nd; + /* the Mod/RM byte has been fetched already! */ + goto done_modrm; + } + + vex_prefix = true; + rc = x86_decode_avx(ctxt, ctxt->b, vex_2nd, &opcode); + if (rc != X86EMUL_CONTINUE) + goto done; + } else if (ctxt->b == 0x0f) { + /* Two- or three-byte opcode */ ctxt->opcode_len = 2; ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; @@ -4889,18 +4987,16 @@ done_prefixes: ctxt->b = insn_fetch(u8, ctxt); opcode = opcode_map_0f_38[ctxt->b]; } + } else { + /* Opcode byte(s). */ + opcode = opcode_table[ctxt->b]; } - ctxt->d = opcode.flags; - if (ctxt->d & ModRM) + if (opcode.flags & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); - /* vex-prefix instructions are not implemented */ - if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && - (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) { - ctxt->d = NotImpl; - } - +done_modrm: + ctxt->d = opcode.flags; while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: @@ -4919,9 +5015,9 @@ done_prefixes: opcode = opcode.u.group[goffset]; break; case Prefix: - if (ctxt->rep_prefix && op_prefix) + if (ctxt->rep_prefix && ctxt->op_prefix) return EMULATION_FAILED; - simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; + simd_prefix = ctxt->op_prefix ? 
0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; case 0x66: opcode = opcode.u.gprefix->pfx_66; break; @@ -4966,6 +5062,19 @@ done_prefixes: if (ctxt->d == 0) return EMULATION_FAILED; + if (unlikely(vex_prefix)) { + /* + * Only specifically marked instructions support VEX. Since many + * instructions support it but are not annotated, return not implemented + * rather than #UD. + */ + if (!(ctxt->d & Avx)) + return EMULATION_FAILED; + + if (!(ctxt->d & AlignMask)) + ctxt->d |= Unaligned; + } + ctxt->execute = opcode.u.execute; /* @@ -5036,8 +5145,10 @@ done_prefixes: if ((ctxt->d & No16) && ctxt->op_bytes == 2) ctxt->op_bytes = 4; - if (ctxt->d & Sse) - ctxt->op_bytes = 16; + if (vex_prefix) + ; + else if (ctxt->d & Sse) + ctxt->op_bytes = 16, ctxt->d &= ~Avx; else if (ctxt->d & Mmx) ctxt->op_bytes = 8; } @@ -5137,8 +5248,10 @@ void init_decode_cache(struct x86_emulate_ctxt *ctxt) { /* Clear fields that are set conditionally but read without a guard. */ ctxt->rip_relative = false; - ctxt->rex_prefix = 0; + ctxt->rex_prefix = REX_NONE; + ctxt->rex_bits = 0; ctxt->lock_prefix = 0; + ctxt->op_prefix = false; ctxt->rep_prefix = 0; ctxt->regs_valid = 0; ctxt->regs_dirty = 0; @@ -5168,20 +5281,34 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts) } if (unlikely(ctxt->d & - (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { + (No64|Undefined|Avx|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + if ((ctxt->d & (Avx|Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + if (ctxt->d & Avx) { + u64 xcr = 0; + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSXSAVE) + || ops->get_xcr(ctxt, 0, &xcr) + || !(xcr & XFEATURE_MASK_YMM)) { + rc = emulate_ud(ctxt); + goto done; + } + } else if (ctxt->d & Sse) { + if (!(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) { + rc = emulate_ud(ctxt); + goto done; + } + } + + if ((ctxt->d & (Avx|Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { rc = emulate_nm(ctxt); goto done; } diff --git a/arch/x86/kvm/fpu.h b/arch/x86/kvm/fpu.h index 3ba12888bf66..f898781b6a06 100644 --- a/arch/x86/kvm/fpu.h +++ b/arch/x86/kvm/fpu.h @@ -15,6 +15,58 @@ typedef u32 __attribute__((vector_size(16))) sse128_t; #define sse128_l3(x) ({ __sse128_u t; t.vec = x; t.as_u32[3]; }) #define sse128(lo, hi) ({ __sse128_u t; t.as_u64[0] = lo; t.as_u64[1] = hi; t.vec; }) +typedef u32 __attribute__((vector_size(32))) avx256_t; + +static inline void _kvm_read_avx_reg(int reg, avx256_t *data) +{ + switch (reg) { + case 0: asm("vmovdqa %%ymm0, %0" : "=m"(*data)); break; + case 1: asm("vmovdqa %%ymm1, %0" : "=m"(*data)); break; + case 2: asm("vmovdqa %%ymm2, %0" : "=m"(*data)); break; + case 3: asm("vmovdqa %%ymm3, %0" : "=m"(*data)); break; + case 4: asm("vmovdqa %%ymm4, %0" : "=m"(*data)); break; + case 5: asm("vmovdqa %%ymm5, %0" : "=m"(*data)); break; + case 6: asm("vmovdqa %%ymm6, %0" : "=m"(*data)); break; + case 7: asm("vmovdqa %%ymm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("vmovdqa %%ymm8, %0" : "=m"(*data)); break; + case 9: asm("vmovdqa %%ymm9, %0" : "=m"(*data)); break; + case 10: asm("vmovdqa %%ymm10, %0" : "=m"(*data)); break; 
+ case 11: asm("vmovdqa %%ymm11, %0" : "=m"(*data)); break; + case 12: asm("vmovdqa %%ymm12, %0" : "=m"(*data)); break; + case 13: asm("vmovdqa %%ymm13, %0" : "=m"(*data)); break; + case 14: asm("vmovdqa %%ymm14, %0" : "=m"(*data)); break; + case 15: asm("vmovdqa %%ymm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } +} + +static inline void _kvm_write_avx_reg(int reg, const avx256_t *data) +{ + switch (reg) { + case 0: asm("vmovdqa %0, %%ymm0" : : "m"(*data)); break; + case 1: asm("vmovdqa %0, %%ymm1" : : "m"(*data)); break; + case 2: asm("vmovdqa %0, %%ymm2" : : "m"(*data)); break; + case 3: asm("vmovdqa %0, %%ymm3" : : "m"(*data)); break; + case 4: asm("vmovdqa %0, %%ymm4" : : "m"(*data)); break; + case 5: asm("vmovdqa %0, %%ymm5" : : "m"(*data)); break; + case 6: asm("vmovdqa %0, %%ymm6" : : "m"(*data)); break; + case 7: asm("vmovdqa %0, %%ymm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("vmovdqa %0, %%ymm8" : : "m"(*data)); break; + case 9: asm("vmovdqa %0, %%ymm9" : : "m"(*data)); break; + case 10: asm("vmovdqa %0, %%ymm10" : : "m"(*data)); break; + case 11: asm("vmovdqa %0, %%ymm11" : : "m"(*data)); break; + case 12: asm("vmovdqa %0, %%ymm12" : : "m"(*data)); break; + case 13: asm("vmovdqa %0, %%ymm13" : : "m"(*data)); break; + case 14: asm("vmovdqa %0, %%ymm14" : : "m"(*data)); break; + case 15: asm("vmovdqa %0, %%ymm15" : : "m"(*data)); break; +#endif + default: BUG(); + } +} + static inline void _kvm_read_sse_reg(int reg, sse128_t *data) { switch (reg) { @@ -109,6 +161,20 @@ static inline void kvm_fpu_put(void) fpregs_unlock(); } +static inline void kvm_read_avx_reg(int reg, avx256_t *data) +{ + kvm_fpu_get(); + _kvm_read_avx_reg(reg, data); + kvm_fpu_put(); +} + +static inline void kvm_write_avx_reg(int reg, const avx256_t *data) +{ + kvm_fpu_get(); + _kvm_write_avx_reg(reg, data); + kvm_fpu_put(); +} + static inline void kvm_read_sse_reg(int reg, sse128_t *data) { kvm_fpu_get(); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 38595ecb990d..de92292eb1f5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1568,7 +1568,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) * only, there can be valuable data in the rest which needs * to be preserved e.g. on migration. */ - if (__put_user(0, (u32 __user *)addr)) + if (put_user(0, (u32 __user *)addr)) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index 7b5ddb787a25..fb3dab4b5a53 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -237,6 +237,7 @@ struct x86_emulate_ops { bool (*is_smm)(struct x86_emulate_ctxt *ctxt); int (*leave_smm)(struct x86_emulate_ctxt *ctxt); void (*triple_fault)(struct x86_emulate_ctxt *ctxt); + int (*get_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr); int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr); gva_t (*get_untagged_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, @@ -248,7 +249,7 @@ struct x86_emulate_ops { /* Type, address-of, and value of an instruction's operand. 
*/ struct operand { - enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_MM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type; unsigned int bytes; unsigned int count; union { @@ -267,11 +268,12 @@ struct operand { union { unsigned long val; u64 val64; - char valptr[sizeof(sse128_t)]; + char valptr[sizeof(avx256_t)]; sse128_t vec_val; + avx256_t vec_val2; u64 mm_val; void *data; - }; + } __aligned(32); }; #define X86_MAX_INSTRUCTION_LENGTH 15 @@ -317,6 +319,14 @@ typedef void (*fastop_t)(struct fastop *); #define NR_EMULATOR_GPRS 8 #endif +/* + * Distinguish between no prefix, REX, or in the future REX2. + */ +enum rex_type { + REX_NONE, + REX_PREFIX, +}; + struct x86_emulate_ctxt { void *vcpu; const struct x86_emulate_ops *ops; @@ -348,6 +358,7 @@ struct x86_emulate_ctxt { u8 opcode_len; u8 b; u8 intercept; + bool op_prefix; u8 op_bytes; u8 ad_bytes; union { @@ -357,7 +368,8 @@ struct x86_emulate_ctxt { int (*check_perm)(struct x86_emulate_ctxt *ctxt); bool rip_relative; - u8 rex_prefix; + enum rex_type rex_prefix; + u8 rex_bits; u8 lock_prefix; u8 rep_prefix; /* bitmaps of registers in _regs[] that can be read */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0ae7f913d782..1597dd0b0cc6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2126,23 +2126,41 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg) static void advance_periodic_target_expiration(struct kvm_lapic *apic) { + struct kvm_timer *ktimer = &apic->lapic_timer; ktime_t now = ktime_get(); u64 tscl = rdtsc(); ktime_t delta; /* - * Synchronize both deadlines to the same time source or - * differences in the periods (caused by differences in the - * underlying clocks or numerical approximation errors) will - * cause the two to drift apart over time as the errors - * accumulate. + * Use kernel time as the time source for both the hrtimer deadline and + * TSC-based deadline so that they stay synchronized. Computing each + * deadline independently will cause the two deadlines to drift apart + * over time as differences in the periods accumulate, e.g. due to + * differences in the underlying clocks or numerical approximation errors. */ - apic->lapic_timer.target_expiration = - ktime_add_ns(apic->lapic_timer.target_expiration, - apic->lapic_timer.period); - delta = ktime_sub(apic->lapic_timer.target_expiration, now); - apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + - nsec_to_cycles(apic->vcpu, delta); + ktimer->target_expiration = ktime_add_ns(ktimer->target_expiration, + ktimer->period); + + /* + * If the new expiration is in the past, e.g. because userspace stopped + * running the VM for an extended duration, then force the expiration + * to "now" and don't try to play catch-up with the missed events. KVM + * will only deliver a single interrupt regardless of how many events + * are pending, i.e. restarting the timer with an expiration in the + * past will do nothing more than waste host cycles, and can even lead + * to a hard lockup in extreme cases. + */ + if (ktime_before(ktimer->target_expiration, now)) + ktimer->target_expiration = now; + + /* + * Note, ensuring the expiration isn't in the past also prevents delta + * from going negative, which could cause the TSC deadline to become + * excessively large due to it an unsigned value. 
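+	 * E.g. an expiration a mere 1 usec in the past would yield delta = -1000 ns,
+	 * which the conversion to an unsigned nanosecond count turns into a value
+	 * close to 2^64, and thus a TSC deadline the guest's TSC will never reach.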
+ */ + delta = ktime_sub(ktimer->target_expiration, now); + ktimer->tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, delta); } static void start_sw_period(struct kvm_lapic *apic) @@ -2970,9 +2988,9 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) apic_timer_expired(apic, true); - if (lapic_is_periodic(apic)) { + if (lapic_is_periodic(apic) && !WARN_ON_ONCE(!apic->lapic_timer.period)) { advance_periodic_target_expiration(apic); - hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + hrtimer_set_expires(&ktimer->timer, ktimer->target_expiration); return HRTIMER_RESTART; } else return HRTIMER_NORESTART; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f63074048ec6..830f46145692 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -235,8 +235,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return -(u32)fault & errcode; } -bool kvm_mmu_may_ignore_guest_pat(struct kvm *kvm); - int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); @@ -257,8 +255,7 @@ extern bool tdp_mmu_enabled; #define tdp_mmu_enabled false #endif -bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa); -int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level); +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn); static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) { diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 667d66cf76d5..02c450686b4a 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4859,7 +4859,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); @@ -4924,7 +4924,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) return direct_page_fault(vcpu, fault); } -int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) +static int kvm_tdp_page_prefault(struct kvm_vcpu *vcpu, gpa_t gpa, + u64 error_code, u8 *level) { int r; @@ -4966,7 +4967,6 @@ int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level return -EIO; } } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_map_page); long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) @@ -5002,7 +5002,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. 
*/ - r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); + r = kvm_tdp_page_prefault(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r; @@ -5014,6 +5014,86 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, return min(range->size, end - range->gpa); } +#ifdef CONFIG_KVM_GUEST_MEMFD +static void kvm_assert_gmem_invalidate_lock_held(struct kvm_memory_slot *slot) +{ +#ifdef CONFIG_PROVE_LOCKING + if (WARN_ON_ONCE(!kvm_slot_has_gmem(slot)) || + WARN_ON_ONCE(!slot->gmem.file) || + WARN_ON_ONCE(!file_count(slot->gmem.file))) + return; + + lockdep_assert_held(&file_inode(slot->gmem.file)->i_mapping->invalidate_lock); +#endif +} + +int kvm_tdp_mmu_map_private_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) +{ + struct kvm_page_fault fault = { + .addr = gfn_to_gpa(gfn), + .error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS, + .prefetch = true, + .is_tdp = true, + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(vcpu->kvm), + + .max_level = PG_LEVEL_4K, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + .is_private = true, + + .gfn = gfn, + .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn), + .pfn = pfn, + .map_writable = true, + }; + struct kvm *kvm = vcpu->kvm; + int r; + + lockdep_assert_held(&kvm->slots_lock); + + /* + * Mapping a pre-determined private pfn is intended only for use when + * populating a guest_memfd instance. Assert that the slot is backed + * by guest_memfd and that the gmem instance's invalidate_lock is held. + */ + kvm_assert_gmem_invalidate_lock_held(fault.slot); + + if (KVM_BUG_ON(!tdp_mmu_enabled, kvm)) + return -EIO; + + if (kvm_gfn_is_write_tracked(kvm, fault.slot, fault.gfn)) + return -EPERM; + + r = kvm_mmu_reload(vcpu); + if (r) + return r; + + r = mmu_topup_memory_caches(vcpu, false); + if (r) + return r; + + do { + if (signal_pending(current)) + return -EINTR; + + if (kvm_test_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + + cond_resched(); + + guard(read_lock)(&kvm->mmu_lock); + + r = kvm_tdp_mmu_map(vcpu, &fault); + } while (r == RET_PF_RETRY); + + if (r != RET_PF_FIXED) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn); +#endif + static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; @@ -5997,7 +6077,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) out: return r; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { @@ -6863,6 +6942,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range); static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -7204,7 +7284,6 @@ restart: return need_tlb_flush; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) @@ -7364,6 +7443,9 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + if (!enable_mmio_caching) + return; + gen &= MMIO_SPTE_GEN_MASK; /* diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index ed5c01df21ba..73cdcbccc89e 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -39,16 +39,6 @@ #define INVALID_PAE_ROOT 0 #define IS_VALID_PAE_ROOT(x) (!!(x)) -static inline hpa_t kvm_mmu_get_dummy_root(void) -{ - return my_zero_pfn(0) << PAGE_SHIFT; -} - 
-static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) -{ - return is_zero_pfn(shadow_page >> PAGE_SHIFT); -} - typedef u64 __rcu *tdp_ptep_t; struct kvm_mmu_page { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ed762bb4b007..901cd2bd40b8 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -402,7 +402,7 @@ retry_walk: goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); - if (unlikely(__get_user(pte, ptep_user))) + if (unlikely(get_user(pte, ptep_user))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 37647afde7d3..85a0473809b0 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -292,7 +292,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } - if (static_branch_unlikely(&cpu_buf_vm_clear) && + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && !kvm_vcpu_can_access_host_mmio(vcpu) && kvm_is_mmio_pfn(pfn, &is_host_mmio)) kvm_track_host_mmio_mapping(vcpu); diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 3133f066927e..91ce29fd6f1b 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -246,6 +246,16 @@ static inline int spte_index(u64 *sptep) */ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; +static inline hpa_t kvm_mmu_get_dummy_root(void) +{ + return my_zero_pfn(0) << PAGE_SHIFT; +} + +static inline bool kvm_mmu_is_dummy_root(hpa_t shadow_page) +{ + return is_zero_pfn(shadow_page >> PAGE_SHIFT); +} + static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) { struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index c5734ca5c17d..9c26038f6b77 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -362,9 +362,6 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp) static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, int level) { - kvm_pfn_t old_pfn = spte_to_pfn(old_spte); - int ret; - /* * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external * PTs are removed in a special order, involving free_external_spt(). @@ -377,9 +374,8 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte, /* Zapping leaf spte is allowed only when write lock is held. */ lockdep_assert_held_write(&kvm->mmu_lock); - /* Because write lock is held, operation should success. */ - ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn); - KVM_BUG_ON(ret, kvm); + + kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte); } /** @@ -519,7 +515,6 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp bool was_present = is_shadow_present_pte(old_spte); bool is_present = is_shadow_present_pte(new_spte); bool is_leaf = is_present && is_last_spte(new_spte, level); - kvm_pfn_t new_pfn = spte_to_pfn(new_spte); int ret = 0; KVM_BUG_ON(was_present, kvm); @@ -538,7 +533,7 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp * external page table, or leaf. 
*/ if (is_leaf) { - ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn); + ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte); } else { void *external_spt = get_external_spt(gfn, new_spte, level); @@ -1273,6 +1268,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) struct kvm_mmu_page *sp; int ret = RET_PF_RETRY; + KVM_MMU_WARN_ON(!root || root->role.invalid); + kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); @@ -1939,13 +1936,16 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, * * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ -static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - struct kvm_mmu_page *root) +int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, + int *root_level) { + struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); struct tdp_iter iter; gfn_t gfn = addr >> PAGE_SHIFT; int leaf = -1; + *root_level = vcpu->arch.mmu->root_role.level; + for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; @@ -1954,36 +1954,6 @@ static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, return leaf; } -int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, - int *root_level) -{ - struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); - *root_level = vcpu->arch.mmu->root_role.level; - - return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); -} - -bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) -{ - struct kvm *kvm = vcpu->kvm; - bool is_direct = kvm_is_addr_direct(kvm, gpa); - hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : - vcpu->arch.mmu->mirror_root_hpa; - u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; - int leaf; - - lockdep_assert_held(&kvm->mmu_lock); - rcu_read_lock(); - leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); - rcu_read_unlock(); - if (leaf < 0) - return false; - - spte = sptes[leaf]; - return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); - /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index fef00546c885..6b77b2033208 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -106,7 +106,7 @@ static u32 next_vm_id = 0; static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); static bool x2avic_enabled; - +static u32 x2avic_max_physical_id; static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) @@ -158,12 +158,40 @@ static void avic_set_x2apic_msr_interception(struct vcpu_svm *svm, svm->x2avic_msrs_intercepted = intercept; } +static u32 __avic_get_max_physical_id(struct kvm *kvm, struct kvm_vcpu *vcpu) +{ + u32 arch_max; + + /* + * Return the largest size (x2APIC) when querying without a vCPU, e.g. + * to allocate the per-VM table.. + */ + if (x2avic_enabled && (!vcpu || apic_x2apic_mode(vcpu->arch.apic))) + arch_max = x2avic_max_physical_id; + else + arch_max = AVIC_MAX_PHYSICAL_ID; + + /* + * Despite its name, KVM_CAP_MAX_VCPU_ID represents the maximum APIC ID + * plus one, so the max possible APIC ID is one less than that. 
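+	 * E.g. a userspace limit of KVM_CAP_MAX_VCPU_ID = 256 permits APIC IDs
+	 * 0-255, so the largest physical ID that needs a table entry is 255,
+	 * i.e. min(256 - 1, arch_max).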
+ */ + return min(kvm->arch.max_vcpu_ids - 1, arch_max); +} + +static u32 avic_get_max_physical_id(struct kvm_vcpu *vcpu) +{ + return __avic_get_max_physical_id(vcpu->kvm, vcpu); +} + static void avic_activate_vmcb(struct vcpu_svm *svm) { struct vmcb *vmcb = svm->vmcb01.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); + vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; + vmcb->control.avic_physical_id |= avic_get_max_physical_id(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; @@ -176,7 +204,7 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { vmcb->control.int_ctl |= X2APIC_MODE_MASK; - vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; + /* Disabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, false); } else { @@ -186,8 +214,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm) */ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); - /* For xAVIC and hybrid-xAVIC modes */ - vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; /* Enabling MSR intercept for x2APIC registers */ avic_set_x2apic_msr_interception(svm, true); } @@ -247,6 +273,30 @@ static int avic_ga_log_notifier(u32 ga_tag) return 0; } +static int avic_get_physical_id_table_order(struct kvm *kvm) +{ + /* Provision for the maximum physical ID supported in x2avic mode */ + return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64)); +} + +int avic_alloc_physical_id_table(struct kvm *kvm) +{ + struct kvm_svm *kvm_svm = to_kvm_svm(kvm); + + if (!irqchip_in_kernel(kvm) || !enable_apicv) + return 0; + + if (kvm_svm->avic_physical_id_table) + return 0; + + kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, + avic_get_physical_id_table_order(kvm)); + if (!kvm_svm->avic_physical_id_table) + return -ENOMEM; + + return 0; +} + void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -256,7 +306,8 @@ void avic_vm_destroy(struct kvm *kvm) return; free_page((unsigned long)kvm_svm->avic_logical_id_table); - free_page((unsigned long)kvm_svm->avic_physical_id_table); + free_pages((unsigned long)kvm_svm->avic_physical_id_table, + avic_get_physical_id_table_order(kvm)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_del(&kvm_svm->hnode); @@ -274,10 +325,6 @@ int avic_vm_init(struct kvm *kvm) if (!enable_apicv) return 0; - kvm_svm->avic_physical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!kvm_svm->avic_physical_id_table) - goto free_avic; - kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!kvm_svm->avic_logical_id_table) goto free_avic; @@ -342,7 +389,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) * fully initialized AVIC. 
*/ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || - (id > X2AVIC_MAX_PHYSICAL_ID)) { + (id > x2avic_max_physical_id)) { kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_TOO_BIG); vcpu->arch.apic->apicv_active = false; return 0; @@ -562,7 +609,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) u32 icrh = svm->vmcb->control.exit_info_1 >> 32; u32 icrl = svm->vmcb->control.exit_info_1; u32 id = svm->vmcb->control.exit_info_2 >> 32; - u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; + u32 index = svm->vmcb->control.exit_info_2 & AVIC_PHYSICAL_MAX_INDEX_MASK; struct kvm_lapic *apic = vcpu->arch.apic; trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); @@ -962,7 +1009,8 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; - if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; /* @@ -1024,7 +1072,8 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action) lockdep_assert_preemption_disabled(); - if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= PAGE_SIZE)) + if (WARN_ON_ONCE(vcpu->vcpu_id * sizeof(entry) >= + PAGE_SIZE << avic_get_physical_id_table_order(vcpu->kvm))) return; /* @@ -1226,10 +1275,15 @@ bool __init avic_hardware_setup(void) /* AVIC is a prerequisite for x2AVIC. */ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); - if (x2avic_enabled) - pr_info("x2AVIC enabled\n"); - else + if (x2avic_enabled) { + if (cpu_feature_enabled(X86_FEATURE_X2AVIC_EXT)) + x2avic_max_physical_id = X2AVIC_4K_MAX_PHYSICAL_ID; + else + x2avic_max_physical_id = X2AVIC_MAX_PHYSICAL_ID; + pr_info("x2AVIC enabled (max %u vCPUs)\n", x2avic_max_physical_id + 1); + } else { svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; + } /* * Disable IPI virtualization for AMD Family 17h CPUs (Zen1 and Zen2) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index da6e80b3ac35..c81005b24522 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -613,6 +613,7 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 struct kvm_vcpu *vcpu = &svm->vcpu; nested_vmcb02_compute_g_pat(svm); + vmcb_mark_dirty(vmcb02, VMCB_NPT); /* Load the nested guest state */ if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) { @@ -751,6 +752,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; + vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP); /* * Stash vmcb02's counter if the guest hasn't moved past the guilty @@ -1430,16 +1432,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm) case SVM_EXIT_IOIO: vmexit = nested_svm_intercept_ioio(svm); break; - case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: { - if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) - vmexit = NESTED_EXIT_DONE; - break; - } - case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: { - if (vmcb12_is_intercept(&svm->nested.ctl, exit_code)) - vmexit = NESTED_EXIT_DONE; - break; - } case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { /* * Host-intercepted exceptions have been checked already in diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0835c664fbfd..f59c65abe3cf 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -65,20 +65,24 @@ module_param_named(ciphertext_hiding_asids, nr_ciphertext_hiding_asids, uint, 04 #define AP_RESET_HOLD_NAE_EVENT 1 #define AP_RESET_HOLD_MSR_PROTO 2 -/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ -#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) -#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) -#define SNP_POLICY_MASK_SMT BIT_ULL(16) -#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) -#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) -#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) - -#define SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ - SNP_POLICY_MASK_API_MAJOR | \ - SNP_POLICY_MASK_SMT | \ - SNP_POLICY_MASK_RSVD_MBO | \ - SNP_POLICY_MASK_DEBUG | \ - SNP_POLICY_MASK_SINGLE_SOCKET) +/* + * SEV-SNP policy bits that can be supported by KVM. These include policy bits + * that have implementation support within KVM or policy bits that do not + * require implementation support within KVM to enforce the policy. + */ +#define KVM_SNP_POLICY_MASK_VALID (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET | \ + SNP_POLICY_MASK_CXL_ALLOW | \ + SNP_POLICY_MASK_MEM_AES_256_XTS | \ + SNP_POLICY_MASK_RAPL_DIS | \ + SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM | \ + SNP_POLICY_MASK_PAGE_SWAP_DISABLE) + +static u64 snp_supported_policy_bits __ro_after_init; #define INITIAL_VMSA_GPA 0xFFFFFFFFF000 @@ -2143,6 +2147,10 @@ int sev_dev_get_attr(u32 group, u64 attr, u64 *val) *val = sev_supported_vmsa_features; return 0; + case KVM_X86_SNP_POLICY_BITS: + *val = snp_supported_policy_bits; + return 0; + default: return -ENXIO; } @@ -2207,7 +2215,7 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.flags) return -EINVAL; - if (params.policy & ~SNP_POLICY_MASK_VALID) + if (params.policy & ~snp_supported_policy_bits) return -EINVAL; /* Check for policy bits that must be set */ @@ -3100,8 +3108,11 @@ out: else if (sev_snp_supported) sev_snp_supported = is_sev_snp_initialized(); - if (sev_snp_supported) + if (sev_snp_supported) { + snp_supported_policy_bits = sev_get_snp_policy_bits() & + KVM_SNP_POLICY_MASK_VALID; nr_ciphertext_hiding_asids = init_args.max_snp_asid; + } /* * If ciphertext hiding is enabled, the joint SEV-ES/SEV-SNP @@ -5085,10 +5096,10 @@ struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu) /* Check if the SEV policy allows debugging */ if (sev_snp_guest(vcpu->kvm)) { - if (!(sev->policy & SNP_POLICY_DEBUG)) + if (!(sev->policy & SNP_POLICY_MASK_DEBUG)) return NULL; } else { - if (sev->policy & SEV_POLICY_NODBG) + if (sev->policy & SEV_POLICY_MASK_NODBG) return NULL; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9d29b2e7e855..f56c2d895011 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -272,6 +272,7 @@ static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) } static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, + int emul_type, bool commit_side_effects) { struct vcpu_svm *svm = to_svm(vcpu); @@ -293,7 +294,7 @@ static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu, if (unlikely(!commit_side_effects)) old_rflags = svm->vmcb->save.rflags; - if 
(!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) + if (!kvm_emulate_instruction(vcpu, emul_type)) return 0; if (unlikely(!commit_side_effects)) @@ -311,11 +312,13 @@ done: static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - return __svm_skip_emulated_instruction(vcpu, true); + return __svm_skip_emulated_instruction(vcpu, EMULTYPE_SKIP, true); } -static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) +static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu, u8 vector) { + const int emul_type = EMULTYPE_SKIP | EMULTYPE_SKIP_SOFT_INT | + EMULTYPE_SET_SOFT_INT_VECTOR(vector); unsigned long rip, old_rip = kvm_rip_read(vcpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -331,7 +334,7 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu) * in use, the skip must not commit any side effects such as clearing * the interrupt shadow or RFLAGS.RF. */ - if (!__svm_skip_emulated_instruction(vcpu, !nrips)) + if (!__svm_skip_emulated_instruction(vcpu, emul_type, !nrips)) return -EIO; rip = kvm_rip_read(vcpu); @@ -367,7 +370,7 @@ static void svm_inject_exception(struct kvm_vcpu *vcpu) kvm_deliver_exception_payload(vcpu, ex); if (kvm_exception_is_soft(ex->vector) && - svm_update_soft_interrupt_rip(vcpu)) + svm_update_soft_interrupt_rip(vcpu, ex->vector)) return; svm->vmcb->control.event_inj = ex->vector @@ -1198,6 +1201,11 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) svm->vmcb = target_vmcb->ptr; } +static int svm_vcpu_precreate(struct kvm *kvm) +{ + return avic_alloc_physical_id_table(kvm); +} + static int svm_vcpu_create(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm; @@ -3442,13 +3450,8 @@ static bool svm_check_exit_valid(u64 exit_code) static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) { - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_code; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, exit_code); return 0; } @@ -3633,11 +3636,12 @@ static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { + struct kvm_queued_interrupt *intr = &vcpu->arch.interrupt; struct vcpu_svm *svm = to_svm(vcpu); u32 type; - if (vcpu->arch.interrupt.soft) { - if (svm_update_soft_interrupt_rip(vcpu)) + if (intr->soft) { + if (svm_update_soft_interrupt_rip(vcpu, intr->nr)) return; type = SVM_EVTINJ_TYPE_SOFT; @@ -3645,12 +3649,10 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) type = SVM_EVTINJ_TYPE_INTR; } - trace_kvm_inj_virq(vcpu->arch.interrupt.nr, - vcpu->arch.interrupt.soft, reinjected); + trace_kvm_inj_virq(intr->nr, intr->soft, reinjected); ++vcpu->stat.irq_injections; - svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | - SVM_EVTINJ_VALID | type; + svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type; } void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, @@ -4251,7 +4253,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); - kvm_load_guest_xsave_state(vcpu); /* * Hardware only context switches DEBUGCTL if LBR virtualization is @@ -4294,7 +4295,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) 
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) update_debugctlmsr(vcpu->arch.host_debugctl); - kvm_load_host_xsave_state(vcpu); stgi(); /* Any pending NMI will happen here */ @@ -4326,14 +4326,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; - /* - * We need to handle MC intercepts here before the vcpu has a chance to - * change the physical cpu - */ - if (unlikely(svm->vmcb->control.exit_code == - SVM_EXIT_EXCP_BASE + MC_VECTOR)) - svm_handle_mce(vcpu); - trace_kvm_exit(vcpu, KVM_ISA_SVM); svm_complete_interrupts(vcpu); @@ -4526,31 +4518,45 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, case SVM_EXIT_WRITE_CR0: { unsigned long cr0, val; - if (info->intercept == x86_intercept_cr_write) + /* + * Adjust the exit code accordingly if a CR other than CR0 is + * being written, and skip straight to the common handling as + * only CR0 has an additional selective intercept. + */ + if (info->intercept == x86_intercept_cr_write && info->modrm_reg) { icpt_info.exit_code += info->modrm_reg; - - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || - info->intercept == x86_intercept_clts) break; + } - if (!(vmcb12_is_intercept(&svm->nested.ctl, - INTERCEPT_SELECTIVE_CR0))) + /* + * Convert the exit_code to SVM_EXIT_CR0_SEL_WRITE if a + * selective CR0 intercept is triggered (the common logic will + * treat the selective intercept as being enabled). Note, the + * unconditional intercept has higher priority, i.e. this is + * only relevant if *only* the selective intercept is enabled. + */ + if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_CR0_WRITE) || + !(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))) break; - cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; - val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; + /* CLTS never triggers INTERCEPT_SELECTIVE_CR0 */ + if (info->intercept == x86_intercept_clts) + break; + /* LMSW always triggers INTERCEPT_SELECTIVE_CR0 */ if (info->intercept == x86_intercept_lmsw) { - cr0 &= 0xfUL; - val &= 0xfUL; - /* lmsw can't clear PE - catch this here */ - if (cr0 & X86_CR0_PE) - val |= X86_CR0_PE; + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + break; } + /* + * MOV-to-CR0 only triggers INTERCEPT_SELECTIVE_CR0 if any bit + * other than SVM_CR0_SELECTIVE_MASK is changed. 
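+		 * E.g. a write that flips only CR0.TS and/or CR0.MP (the bits in
+		 * SVM_CR0_SELECTIVE_MASK) does not trigger the selective intercept,
+		 * whereas changing any other bit, e.g. CR0.CD or CR0.NW, does.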
+ */ + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; if (cr0 ^ val) icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; - break; } case SVM_EXIT_READ_DR0: @@ -4622,8 +4628,16 @@ out: static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) + switch (to_svm(vcpu)->vmcb->control.exit_code) { + case SVM_EXIT_EXCP_BASE + MC_VECTOR: + svm_handle_mce(vcpu); + break; + case SVM_EXIT_INTR: vcpu->arch.at_instruction_boundary = true; + break; + default: + break; + } } static void svm_setup_mce(struct kvm_vcpu *vcpu) @@ -5012,6 +5026,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = { .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu, .has_emulated_msr = svm_has_emulated_msr, + .vcpu_precreate = svm_vcpu_precreate, .vcpu_create = svm_vcpu_create, .vcpu_free = svm_vcpu_free, .vcpu_reset = svm_vcpu_reset, @@ -5316,7 +5331,9 @@ static __init int svm_hardware_setup(void) if (nested) { pr_info("Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + kvm_enable_efer_bits(EFER_SVME); + if (!boot_cpu_has(X86_FEATURE_EFER_LMSLE_MBZ)) + kvm_enable_efer_bits(EFER_LMSLE); r = nested_svm_init_msrpm_merge_offsets(); if (r) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index dd78e6402345..9e151dbdef25 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -117,9 +117,6 @@ struct kvm_sev_info { cpumask_var_t have_run_cpus; /* CPUs that have done VMRUN for this VM. */ }; -#define SEV_POLICY_NODBG BIT_ULL(0) -#define SNP_POLICY_DEBUG BIT_ULL(19) - struct kvm_svm { struct kvm kvm; @@ -807,6 +804,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops; bool __init avic_hardware_setup(void); void avic_hardware_unsetup(void); +int avic_alloc_physical_id_table(struct kvm *kvm); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb); diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 235c4af6b692..3392bcadfb89 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -52,11 +52,23 @@ * there must not be any returns or indirect branches between this code * and vmentry. */ - movl SVM_spec_ctrl(%_ASM_DI), %eax - cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax +#ifdef CONFIG_X86_64 + mov SVM_spec_ctrl(%rdi), %rdx + cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx + je 801b + movl %edx, %eax + shr $32, %rdx +#else + mov SVM_spec_ctrl(%edi), %eax + mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx + xor %eax, %ecx + mov SVM_spec_ctrl + 4(%edi), %edx + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %esi + xor %edx, %esi + or %esi, %ecx je 801b +#endif mov $MSR_IA32_SPEC_CTRL, %ecx - xor %edx, %edx wrmsr jmp 801b .endm @@ -81,17 +93,31 @@ jnz 998f rdmsr movl %eax, SVM_spec_ctrl(%_ASM_DI) + movl %edx, SVM_spec_ctrl + 4(%_ASM_DI) 998: - /* Now restore the host value of the MSR if different from the guest's. 
*/ - movl PER_CPU_VAR(x86_spec_ctrl_current), %eax - cmp SVM_spec_ctrl(%_ASM_DI), %eax +#ifdef CONFIG_X86_64 + mov PER_CPU_VAR(x86_spec_ctrl_current), %rdx + cmp SVM_spec_ctrl(%rdi), %rdx je 901b - xor %edx, %edx + movl %edx, %eax + shr $32, %rdx +#else + mov PER_CPU_VAR(x86_spec_ctrl_current), %eax + mov SVM_spec_ctrl(%edi), %esi + xor %eax, %esi + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edx + mov SVM_spec_ctrl + 4(%edi), %edi + xor %edx, %edi + or %edi, %esi + je 901b +#endif wrmsr jmp 901b .endm +#define SVM_CLEAR_CPU_BUFFERS \ + ALTERNATIVE "", __CLEAR_CPU_BUFFERS, X86_FEATURE_CLEAR_CPU_BUF_VM /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode @@ -134,7 +160,7 @@ SYM_FUNC_START(__svm_vcpu_run) mov %_ASM_ARG1, %_ASM_DI .endif - /* Clobbers RAX, RCX, RDX. */ + /* Clobbers RAX, RCX, RDX (and ESI on 32-bit), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL /* @@ -170,7 +196,7 @@ SYM_FUNC_START(__svm_vcpu_run) mov VCPU_RDI(%_ASM_DI), %_ASM_DI /* Clobbers EFLAGS.ZF */ - VM_CLEAR_CPU_BUFFERS + SVM_CLEAR_CPU_BUFFERS /* Enter guest mode */ 3: vmrun %_ASM_AX @@ -211,7 +237,10 @@ SYM_FUNC_START(__svm_vcpu_run) /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT - /* Clobbers RAX, RCX, RDX. */ + /* + * Clobbers RAX, RCX, RDX (and ESI, EDI on 32-bit), consumes RDI (@svm) + * and RSP (pointer to @spec_ctrl_intercepted). + */ RESTORE_HOST_SPEC_CTRL /* @@ -331,7 +360,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov %rdi, SEV_ES_RDI (%rdx) mov %rsi, SEV_ES_RSI (%rdx) - /* Clobbers RAX, RCX, RDX (@hostsa). */ + /* Clobbers RAX, RCX, and RDX (@hostsa), consumes RDI (@svm). */ RESTORE_GUEST_SPEC_CTRL /* Get svm->current_vmcb->pa into RAX. */ @@ -339,7 +368,7 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) mov KVM_VMCB_pa(%rax), %rax /* Clobbers EFLAGS.ZF */ - VM_CLEAR_CPU_BUFFERS + SVM_CLEAR_CPU_BUFFERS /* Enter guest mode */ 1: vmrun %rax diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 0eb2773b2ae2..a46ccd670785 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -831,6 +831,14 @@ static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp) return tdx_vcpu_ioctl(vcpu, argp); } +static int vt_vcpu_mem_enc_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + if (!is_td_vcpu(vcpu)) + return -EINVAL; + + return tdx_vcpu_unlocked_ioctl(vcpu, argp); +} + static int vt_gmem_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, bool is_private) { @@ -1005,6 +1013,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl), + .vcpu_mem_enc_unlocked_ioctl = vt_op_tdx_only(vcpu_mem_enc_unlocked_ioctl), .gmem_max_mapping_level = vt_op_tdx_only(gmem_max_mapping_level) }; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bcea087b642f..40777278eabb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -23,8 +23,8 @@ static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); +static bool __ro_after_init warn_on_missed_cc; +module_param(warn_on_missed_cc, bool, 0444); #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -555,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, if (CC(!page_address_valid(vcpu, 
vmcs12->virtual_apic_page_addr))) return -EINVAL; + if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) + return -EINVAL; + return 0; } @@ -761,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } @@ -780,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } @@ -2296,15 +2299,6 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) return; vmx->nested.vmcs02_initialized = true; - /* - * We don't care what the EPTP value is we just need to guarantee - * it's valid so we don't get a false positive when doing early - * consistency checks. - */ - if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, - construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); - if (vmx->ve_info) vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); @@ -2749,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); } vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( @@ -2961,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, } } + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && + CC(!vmcs12->tsc_multiplier)) + return -EINVAL; + return 0; } @@ -3078,6 +3076,38 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; + u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; + + /* + * Don't bother with the consistency checks if KVM isn't configured to + * WARN on missed consistency checks, as KVM needs to rely on hardware + * to fully detect an illegal vTPR vs. TRP Threshold combination due to + * the vTPR being writable by L1 at all times (it's an in-memory value, + * not a VMCS field). I.e. even if the check passes now, it might fail + * at the actual VM-Enter. + * + * Keying off the module param also allows treating an invalid vAPIC + * mapping as a consistency check failure without increasing the risk + * of breaking a "real" VM. 
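+	 * As a worked example of the check below: with TPR threshold = 5 and the
+	 * virtual-APIC TPR register at 0x30 (vtpr = 3 after the shift), the
+	 * combination is inconsistent and the VM-Enter fails; a TPR of 0x50 or
+	 * higher satisfies the check.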
+ */ + if (!warn_on_missed_cc) + return 0; + + if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + (CC(!vapic) || + CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -3333,84 +3363,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return 0; } -static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; - bool vm_fail; - - if (!nested_early_check) - return 0; - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - - preempt_disable(); - - vmx_prepare_switch_to_guest(vcpu); - - /* - * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, - * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. - * there is no need to preserve other bits or save/restore the field. - */ - vmcs_writel(GUEST_RFLAGS, 0); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - __vmx_vcpu_run_flags(vmx)); - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - if (vm_fail) { - u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); - - preempt_enable(); - - trace_kvm_nested_vmenter_failed( - "early hardware check VM-instruction error: ", error); - WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; - } - - /* - * VMExit clears RFLAGS.IF and DR7, even on a consistency check. - */ - if (hw_breakpoint_active()) - set_debugreg(__this_cpu_read(cpu_dr7), 7); - local_irq_enable(); - preempt_enable(); - - /* - * A non-failing VMEntry means we somehow entered guest mode with - * an illegal RIP, and that's just the tip of the iceberg. There - * is no telling what memory has been modified or what state has - * been exposed to unknown code. Hitting this all but guarantees - * a (very critical) hardware issue. - */ - WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & - VMX_EXIT_REASONS_FAILED_VMENTRY)); - - return 0; -} - #ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { @@ -3667,22 +3619,18 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, &vmx->nested.pre_vmenter_ssp_tbl); /* - * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* - * nested early checks are disabled. In the event of a "late" VM-Fail, - * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its - * software model to the pre-VMEntry host state. When EPT is disabled, - * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes - * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. 
Stuffing - * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to - * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested - * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is - * guaranteed to be overwritten with a shadow CR3 prior to re-entering - * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as - * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks - * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail - * path would need to manually save/restore vmcs01.GUEST_CR3. + * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the + * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but + * not KVM, KVM must unwind its software model to the pre-VM-Entry host + * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not + * L1's "real" CR3, which causes nested_vmx_restore_host_state() to + * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the + * unwind naturally setting arch.cr3 to the correct value. Smashing + * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, + * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be + * overwritten with a shadow CR3 prior to re-entering L1. */ - if (!enable_ept && !nested_early_check) + if (!enable_ept) vmcs_writel(GUEST_CR3, vcpu->arch.cr3); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); @@ -3695,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_KVM_INTERNAL_ERROR; } - if (nested_vmx_check_vmentry_hw(vcpu)) { + if (nested_vmx_check_controls_late(vcpu, vmcs12)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_VMFAIL; } @@ -3880,7 +3828,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ - vmx->vcpu.arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); /* * Must happen outside of nested_vmx_enter_non_root_mode() as it will @@ -5164,12 +5112,13 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* * The only expected VM-instruction error is "VM entry with * invalid control field(s)." Anything else indicates a - * problem with L0. And we should never get here with a - * VMFail of any type if early consistency checks are enabled. + * problem with L0. */ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - WARN_ON_ONCE(nested_early_check); + + /* VM-Fail at VM-Entry means KVM missed a consistency check. 
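+		 * The WARN below fires only when warn_on_missed_cc is set; otherwise
+		 * KVM knowingly skips the best-effort late checks (e.g. the vTPR vs.
+		 * TPR threshold check) and relies on hardware to reject the combination.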
*/ + WARN_ON_ONCE(warn_on_missed_cc); } /* diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h index 2f20fb170def..6a87a12135fb 100644 --- a/arch/x86/kvm/vmx/run_flags.h +++ b/arch/x86/kvm/vmx/run_flags.h @@ -2,12 +2,8 @@ #ifndef __KVM_X86_VMX_RUN_FLAGS_H #define __KVM_X86_VMX_RUN_FLAGS_H -#define VMX_RUN_VMRESUME_SHIFT 0 -#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT 1 -#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT 2 - -#define VMX_RUN_VMRESUME BIT(VMX_RUN_VMRESUME_SHIFT) -#define VMX_RUN_SAVE_SPEC_CTRL BIT(VMX_RUN_SAVE_SPEC_CTRL_SHIFT) -#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO_SHIFT) +#define VMX_RUN_VMRESUME BIT(0) +#define VMX_RUN_SAVE_SPEC_CTRL BIT(1) +#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO BIT(2) #endif /* __KVM_X86_VMX_RUN_FLAGS_H */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 0a49c863c811..2d7a4d52ccfb 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -24,20 +24,33 @@ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#define pr_tdx_error(__fn, __err) \ - pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err) +#define __TDX_BUG_ON(__err, __f, __kvm, __fmt, __args...) \ +({ \ + struct kvm *_kvm = (__kvm); \ + bool __ret = !!(__err); \ + \ + if (WARN_ON_ONCE(__ret && (!_kvm || !_kvm->vm_bugged))) { \ + if (_kvm) \ + kvm_vm_bugged(_kvm); \ + pr_err_ratelimited("SEAMCALL " __f " failed: 0x%llx" __fmt "\n",\ + __err, __args); \ + } \ + unlikely(__ret); \ +}) + +#define TDX_BUG_ON(__err, __fn, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, "%s", "") + +#define TDX_BUG_ON_1(__err, __fn, a1, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx", a1) + +#define TDX_BUG_ON_2(__err, __fn, a1, a2, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 " 0x%llx", a1, a2) + +#define TDX_BUG_ON_3(__err, __fn, a1, a2, a3, __kvm) \ + __TDX_BUG_ON(__err, #__fn, __kvm, ", " #a1 " 0x%llx, " #a2 ", 0x%llx, " #a3 " 0x%llx", \ + a1, a2, a3) -#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \ - pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__) - -#define pr_tdx_error_1(__fn, __err, __rcx) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx) - -#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx) - -#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \ - __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8) bool enable_tdx __ro_after_init; module_param_named(tdx, enable_tdx, bool, 0444); @@ -281,25 +294,34 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu) vcpu->cpu = -1; } -static void tdx_no_vcpus_enter_start(struct kvm *kvm) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true); - - kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); -} - -static void tdx_no_vcpus_enter_stop(struct kvm *kvm) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - lockdep_assert_held_write(&kvm->mmu_lock); - - WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false); -} +/* + * Execute a SEAMCALL related to removing/blocking S-EPT entries, with a single + * retry (if necessary) after forcing vCPUs to exit and wait for the operation + * to complete. All flows that remove/block S-EPT entries run with mmu_lock + * held for write, i.e. 
are mutually exclusive with each other, but they aren't + * mutually exclusive with running vCPUs, and so can fail with "operand busy" + * if a vCPU acquires a relevant lock in the TDX-Module, e.g. when doing TDCALL. + * + * Note, the retry is guaranteed to succeed, absent KVM and/or TDX-Module bugs. + */ +#define tdh_do_no_vcpus(tdh_func, kvm, args...) \ +({ \ + struct kvm_tdx *__kvm_tdx = to_kvm_tdx(kvm); \ + u64 __err; \ + \ + lockdep_assert_held_write(&kvm->mmu_lock); \ + \ + __err = tdh_func(args); \ + if (unlikely(tdx_operand_busy(__err))) { \ + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, true); \ + kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); \ + \ + __err = tdh_func(args); \ + \ + WRITE_ONCE(__kvm_tdx->wait_for_sept_zap, false); \ + } \ + __err; \ +}) /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */ static int __tdx_reclaim_page(struct page *page) @@ -313,10 +335,9 @@ static int __tdx_reclaim_page(struct page *page) * before the HKID is released and control pages have also been * released at this point, so there is no possibility of contention. */ - if (WARN_ON_ONCE(err)) { - pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8); + if (TDX_BUG_ON_3(err, TDH_PHYMEM_PAGE_RECLAIM, rcx, rdx, r8, NULL)) return -EIO; - } + return 0; } @@ -404,8 +425,8 @@ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu) return; smp_call_function_single(cpu, tdx_flush_vp, &arg, 1); - if (KVM_BUG_ON(arg.err, vcpu->kvm)) - pr_tdx_error(TDH_VP_FLUSH, arg.err); + + TDX_BUG_ON(arg.err, TDH_VP_FLUSH, vcpu->kvm); } void tdx_disable_virtualization_cpu(void) @@ -464,8 +485,7 @@ static void smp_func_do_phymem_cache_wb(void *unused) } out: - if (WARN_ON_ONCE(err)) - pr_tdx_error(TDH_PHYMEM_CACHE_WB, err); + TDX_BUG_ON(err, TDH_PHYMEM_CACHE_WB, NULL); } void tdx_mmu_release_hkid(struct kvm *kvm) @@ -504,8 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) err = tdh_mng_vpflushdone(&kvm_tdx->td); if (err == TDX_FLUSHVP_NOT_DONE) goto out; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_MNG_VPFLUSHDONE, err); + if (TDX_BUG_ON(err, TDH_MNG_VPFLUSHDONE, kvm)) { pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n", kvm_tdx->hkid); goto out; @@ -528,8 +547,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm) * tdh_mng_key_freeid() will fail. */ err = tdh_mng_key_freeid(&kvm_tdx->td); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_MNG_KEY_FREEID, err); + if (TDX_BUG_ON(err, TDH_MNG_KEY_FREEID, kvm)) { pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n", kvm_tdx->hkid); } else { @@ -580,10 +598,9 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm) * when it is reclaiming TDCS). 
*/ err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) return; - } + tdx_quirk_reset_page(kvm_tdx->td.tdr_page); __free_page(kvm_tdx->td.tdr_page); @@ -606,11 +623,8 @@ static int tdx_do_tdh_mng_key_config(void *param) /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */ err = tdh_mng_key_config(&kvm_tdx->td); - - if (KVM_BUG_ON(err, &kvm_tdx->kvm)) { - pr_tdx_error(TDH_MNG_KEY_CONFIG, err); + if (TDX_BUG_ON(err, TDH_MNG_KEY_CONFIG, &kvm_tdx->kvm)) return -EIO; - } return 0; } @@ -763,25 +777,6 @@ static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) return tdx_vcpu_state_details_intr_pending(vcpu_state_details); } -/* - * Compared to vmx_prepare_switch_to_guest(), there is not much to do - * as SEAMCALL/SEAMRET calls take care of most of save and restore. - */ -void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) -{ - struct vcpu_vt *vt = to_vt(vcpu); - - if (vt->guest_state_loaded) - return; - - if (likely(is_64bit_mm(current->mm))) - vt->msr_host_kernel_gs_base = current->thread.gsbase; - else - vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); - - vt->guest_state_loaded = true; -} - struct tdx_uret_msr { u32 msr; unsigned int slot; @@ -795,19 +790,38 @@ static struct tdx_uret_msr tdx_uret_msrs[] = { {.msr = MSR_TSC_AUX,}, }; -static void tdx_user_return_msr_update_cache(void) +void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { + struct vcpu_vt *vt = to_vt(vcpu); int i; + if (vt->guest_state_loaded) + return; + + if (likely(is_64bit_mm(current->mm))) + vt->msr_host_kernel_gs_base = current->thread.gsbase; + else + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + + vt->guest_state_loaded = true; + + /* + * Explicitly set user-return MSRs that are clobbered by the TDX-Module + * if VP.ENTER succeeds, i.e. on TD-Exit, with the values that would be + * written by the TDX-Module. Don't rely on the TDX-Module to actually + * clobber the MSRs, as the contract is poorly defined and not upheld. + * E.g. the TDX-Module will synthesize an EPT Violation without doing + * VM-Enter if it suspects a zero-step attack, and never "restore" VMM + * state. + */ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) - kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot, - tdx_uret_msrs[i].defval); + kvm_set_user_return_msr(tdx_uret_msrs[i].slot, + tdx_uret_msrs[i].defval, -1ull); } static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) { struct vcpu_vt *vt = to_vt(vcpu); - struct vcpu_tdx *tdx = to_tdx(vcpu); if (!vt->guest_state_loaded) return; @@ -815,11 +829,6 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base); - if (tdx->guest_entered) { - tdx_user_return_msr_update_cache(); - tdx->guest_entered = false; - } - vt->guest_state_loaded = false; } @@ -829,19 +838,52 @@ void tdx_vcpu_put(struct kvm_vcpu *vcpu) tdx_prepare_switch_to_host(vcpu); } +/* + * Life cycles for a TD and a vCPU: + * 1. KVM_CREATE_VM ioctl. + * TD state is TD_STATE_UNINITIALIZED. + * hkid is not assigned at this stage. + * 2. KVM_TDX_INIT_VM ioctl. + * TD transitions to TD_STATE_INITIALIZED. + * hkid is assigned after this stage. + * 3. KVM_CREATE_VCPU ioctl. (only when TD is TD_STATE_INITIALIZED). + * 3.1 tdx_vcpu_create() transitions vCPU state to VCPU_TD_STATE_UNINITIALIZED. + * 3.2 vcpu_load() and vcpu_put() in kvm_arch_vcpu_create(). 
+ * 3.3 (conditional) if any error encountered after kvm_arch_vcpu_create() + * kvm_arch_vcpu_destroy() --> tdx_vcpu_free(). + * 4. KVM_TDX_INIT_VCPU ioctl. + * tdx_vcpu_init() transitions vCPU state to VCPU_TD_STATE_INITIALIZED. + * vCPU control structures are allocated at this stage. + * 5. kvm_destroy_vm(). + * 5.1 tdx_mmu_release_hkid(): (1) tdh_vp_flush(), disassociates all vCPUs. + * (2) puts hkid to !assigned state. + * 5.2 kvm_destroy_vcpus() --> tdx_vcpu_free(): + * transitions vCPU to VCPU_TD_STATE_UNINITIALIZED state. + * 5.3 tdx_vm_destroy() + * transitions TD to TD_STATE_UNINITIALIZED state. + * + * tdx_vcpu_free() can be invoked only at 3.3 or 5.2. + * - If at 3.3, hkid is still assigned, but the vCPU must be in + * VCPU_TD_STATE_UNINITIALIZED state. + * - if at 5.2, hkid must be !assigned and all vCPUs must be in + * VCPU_TD_STATE_INITIALIZED state and have been dissociated. + */ void tdx_vcpu_free(struct kvm_vcpu *vcpu) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); struct vcpu_tdx *tdx = to_tdx(vcpu); int i; + if (vcpu->cpu != -1) { + KVM_BUG_ON(tdx->state == VCPU_TD_STATE_INITIALIZED, vcpu->kvm); + tdx_flush_vp_on_cpu(vcpu); + return; + } + /* * It is not possible to reclaim pages while hkid is assigned. It might - * be assigned if: - * 1. the TD VM is being destroyed but freeing hkid failed, in which - * case the pages are leaked - * 2. TD VCPU creation failed and this on the error path, in which case - * there is nothing to do anyway + * be assigned if the TD VM is being destroyed but freeing hkid failed, + * in which case the pages are leaked. */ if (is_hkid_assigned(kvm_tdx)) return; @@ -856,7 +898,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu) } if (tdx->vp.tdvpr_page) { tdx_reclaim_control_page(tdx->vp.tdvpr_page); - tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_page = NULL; tdx->vp.tdvpr_pa = 0; } @@ -1059,7 +1101,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) update_debugctlmsr(vcpu->arch.host_debugctl); tdx_load_host_xsave_state(vcpu); - tdx->guest_entered = true; vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET; @@ -1069,9 +1110,6 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) return EXIT_FASTPATH_NONE; - if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) - kvm_machine_check(); - trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(tdx_failed_vmentry(vcpu))) @@ -1583,137 +1621,79 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level) td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa); } -static void tdx_unpin(struct kvm *kvm, struct page *page) +static int tdx_mem_page_add(struct kvm *kvm, gfn_t gfn, enum pg_level level, + kvm_pfn_t pfn) { - put_page(page); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err, entry, level_state; + gpa_t gpa = gfn_to_gpa(gfn); + + lockdep_assert_held(&kvm->slots_lock); + + if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm) || + KVM_BUG_ON(!kvm_tdx->page_add_src, kvm)) + return -EIO; + + err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), + kvm_tdx->page_add_src, &entry, &level_state); + if (unlikely(tdx_operand_busy(err))) + return -EBUSY; + + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_ADD, entry, level_state, kvm)) + return -EIO; + + return 0; } static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) + enum pg_level level, kvm_pfn_t pfn) { int tdx_level = pg_level_to_tdx_sept_level(level); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + 
struct page *page = pfn_to_page(pfn); gpa_t gpa = gfn_to_gpa(gfn); u64 entry, level_state; u64 err; err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state); - if (unlikely(tdx_operand_busy(err))) { - tdx_unpin(kvm, page); + if (unlikely(tdx_operand_busy(err))) return -EBUSY; - } - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state); - tdx_unpin(kvm, page); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_AUG, entry, level_state, kvm)) return -EIO; - } - - return 0; -} - -/* - * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the - * callback tdx_gmem_post_populate() then maps pages into private memory. - * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the - * private EPT structures for the page to have been built before, which is - * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that - * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD(). - * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there - * are no half-initialized shared EPT pages. - */ -static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) -{ - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - - if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm)) - return -EINVAL; - /* nr_premapped will be decreased when tdh_mem_page_add() is called. */ - atomic64_inc(&kvm_tdx->nr_premapped); return 0; } static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) + enum pg_level level, u64 mirror_spte) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - struct page *page = pfn_to_page(pfn); + kvm_pfn_t pfn = spte_to_pfn(mirror_spte); /* TODO: handle large pages. */ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EINVAL; + return -EIO; - /* - * Because guest_memfd doesn't support page migration with - * a_ops->migrate_folio (yet), no callback is triggered for KVM on page - * migration. Until guest_memfd supports page migration, prevent page - * migration. - * TODO: Once guest_memfd introduces callback on page migration, - * implement it and remove get_page/put_page(). - */ - get_page(page); + WARN_ON_ONCE(!is_shadow_present_pte(mirror_spte) || + (mirror_spte & VMX_EPT_RWX_MASK) != VMX_EPT_RWX_MASK); /* - * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching - * barrier in tdx_td_finalize(). + * Ensure pre_fault_allowed is read by kvm_arch_vcpu_pre_fault_memory() + * before kvm_tdx->state. Userspace must not be allowed to pre-fault + * arbitrary memory until the initial memory image is finalized. Pairs + * with the smp_wmb() in tdx_td_finalize(). */ smp_rmb(); - if (likely(kvm_tdx->state == TD_STATE_RUNNABLE)) - return tdx_mem_page_aug(kvm, gfn, level, page); - - return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn); -} - -static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - gpa_t gpa = gfn_to_gpa(gfn); - u64 err, entry, level_state; - - /* TODO: handle large pages. */ - if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) - return -EINVAL; - - if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm)) - return -EINVAL; /* - * When zapping private page, write lock is held. So no race condition - * with other vcpu sept operation. - * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. 
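With nr_premapped gone, tdx_sept_set_private_spte() picks between the two mapping SEAMCALLs purely from TD state: before the TD is finalized, a private fault can only come from KVM_TDX_INIT_MEM_REGION and the page is ADDed (copied from the source page), while a runnable TD gets pages AUGed for the guest to accept. A toy dispatcher showing just that decision, with hypothetical stubs standing in for tdh_mem_page_add()/tdh_mem_page_aug():

#include <stdio.h>

enum td_state { TD_STATE_INITIALIZED, TD_STATE_RUNNABLE };

/* Hypothetical stand-ins for the real SEAMCALL wrappers. */
static int page_add(unsigned long gfn)
{
	printf("ADD gfn %#lx (initial image, copied from source page)\n", gfn);
	return 0;
}

static int page_aug(unsigned long gfn)
{
	printf("AUG gfn %#lx (runtime, guest must accept)\n", gfn);
	return 0;
}

static int set_private_spte(enum td_state state, unsigned long gfn)
{
	/* Pre-finalization faults come only from KVM_TDX_INIT_MEM_REGION. */
	if (state != TD_STATE_RUNNABLE)
		return page_add(gfn);

	return page_aug(gfn);
}

int main(void)
{
	set_private_spte(TD_STATE_INITIALIZED, 0x1000);
	set_private_spte(TD_STATE_RUNNABLE, 0x2000);
	return 0;
}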
+ * If the TD isn't finalized/runnable, then userspace is initializing + * the VM image via KVM_TDX_INIT_MEM_REGION; ADD the page to the TD. */ - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* - * The second retry is expected to succeed after kicking off all - * other vCPUs and prevent them from invoking TDH.VP.ENTER. - */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry, - &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state); - return -EIO; - } - - err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) + return tdx_mem_page_add(kvm, gfn, level, pfn); - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err); - return -EIO; - } - tdx_quirk_reset_page(page); - tdx_unpin(kvm, page); - return 0; + return tdx_mem_page_aug(kvm, gfn, level, pfn); } static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, @@ -1729,81 +1709,13 @@ static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn, if (unlikely(tdx_operand_busy(err))) return -EBUSY; - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_SEPT_ADD, entry, level_state, kvm)) return -EIO; - } return 0; } /* - * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is - * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called - * successfully. - * - * Since tdh_mem_sept_add() must have been invoked successfully before a - * non-leaf entry present in the mirrored page table, the SEPT ZAP related - * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead - * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the - * SEPT. - * - * Further check if the returned entry from SEPT walking is with RWX permissions - * to filter out anything unexpected. - * - * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from - * level_state returned from a SEAMCALL error is the same as that passed into - * the SEAMCALL. - */ -static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err, - u64 entry, int level) -{ - if (!err || kvm_tdx->state == TD_STATE_RUNNABLE) - return false; - - if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX)) - return false; - - if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK))) - return false; - - return true; -} - -static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, struct page *page) -{ - int tdx_level = pg_level_to_tdx_sept_level(level); - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level); - u64 err, entry, level_state; - - /* For now large page isn't supported yet. 
*/ - WARN_ON_ONCE(level != PG_LEVEL_4K); - - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state); - tdx_no_vcpus_enter_stop(kvm); - } - if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) && - !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) { - atomic64_dec(&kvm_tdx->nr_premapped); - tdx_unpin(kvm, page); - return 0; - } - - if (KVM_BUG_ON(err, kvm)) { - pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state); - return -EIO; - } - return 1; -} - -/* * Ensure shared and private EPTs to be flushed on all vCPUs. * tdh_mem_track() is the only caller that increases TD epoch. An increase in * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are @@ -1836,18 +1748,15 @@ static void tdx_track(struct kvm *kvm) if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE)) return; + /* + * The full sequence of TDH.MEM.TRACK and forcing vCPUs out of guest + * mode must be serialized, as TDH.MEM.TRACK will fail if the previous + * tracking epoch hasn't completed. + */ lockdep_assert_held_write(&kvm->mmu_lock); - err = tdh_mem_track(&kvm_tdx->td); - if (unlikely(tdx_operand_busy(err))) { - /* After no vCPUs enter, the second retry is expected to succeed */ - tdx_no_vcpus_enter_start(kvm); - err = tdh_mem_track(&kvm_tdx->td); - tdx_no_vcpus_enter_stop(kvm); - } - - if (KVM_BUG_ON(err, kvm)) - pr_tdx_error(TDH_MEM_TRACK, err); + err = tdh_do_no_vcpus(tdh_mem_track, kvm, &kvm_tdx->td); + TDX_BUG_ON(err, TDH_MEM_TRACK, kvm); kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE); } @@ -1866,7 +1775,7 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, * and slot move/deletion. */ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm)) - return -EINVAL; + return -EIO; /* * The HKID assigned to this TD was already freed and cache was @@ -1875,11 +1784,16 @@ static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn, return tdx_reclaim_page(virt_to_page(private_spt)); } -static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, - enum pg_level level, kvm_pfn_t pfn) +static void tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, + enum pg_level level, u64 mirror_spte) { - struct page *page = pfn_to_page(pfn); - int ret; + struct page *page = pfn_to_page(spte_to_pfn(mirror_spte)); + int tdx_level = pg_level_to_tdx_sept_level(level); + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + gpa_t gpa = gfn_to_gpa(gfn); + u64 err, entry, level_state; + + lockdep_assert_held_write(&kvm->mmu_lock); /* * HKID is released after all private pages have been removed, and set @@ -1887,11 +1801,16 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, * there can't be anything populated in the private EPT. */ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm)) - return -EINVAL; + return; - ret = tdx_sept_zap_private_spte(kvm, gfn, level, page); - if (ret <= 0) - return ret; + /* TODO: handle large pages. */ + if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm)) + return; + + err = tdh_do_no_vcpus(tdh_mem_range_block, kvm, &kvm_tdx->td, gpa, + tdx_level, &entry, &level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_RANGE_BLOCK, entry, level_state, kvm)) + return; /* * TDX requires TLB tracking before dropping private page. 
Do @@ -1899,7 +1818,21 @@ static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn, */ tdx_track(kvm); - return tdx_sept_drop_private_spte(kvm, gfn, level, page); + /* + * When zapping private page, write lock is held. So no race condition + * with other vcpu sept operation. + * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs. + */ + err = tdh_do_no_vcpus(tdh_mem_page_remove, kvm, &kvm_tdx->td, gpa, + tdx_level, &entry, &level_state); + if (TDX_BUG_ON_2(err, TDH_MEM_PAGE_REMOVE, entry, level_state, kvm)) + return; + + err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page); + if (TDX_BUG_ON(err, TDH_PHYMEM_PAGE_WBINVD, kvm)) + return; + + tdx_quirk_reset_page(page); } void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, @@ -2145,11 +2078,7 @@ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath) } unhandled_exit: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vp_enter_ret; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, vp_enter_ret); return 0; } @@ -2282,37 +2211,28 @@ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd) if (cmd->flags) return -EINVAL; - caps = kzalloc(sizeof(*caps) + - sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config, - GFP_KERNEL); - if (!caps) - return -ENOMEM; - user_caps = u64_to_user_ptr(cmd->data); - if (get_user(nr_user_entries, &user_caps->cpuid.nent)) { - ret = -EFAULT; - goto out; - } + if (get_user(nr_user_entries, &user_caps->cpuid.nent)) + return -EFAULT; - if (nr_user_entries < td_conf->num_cpuid_config) { - ret = -E2BIG; - goto out; - } + if (nr_user_entries < td_conf->num_cpuid_config) + return -E2BIG; + + caps = kzalloc(struct_size(caps, cpuid.entries, + td_conf->num_cpuid_config), GFP_KERNEL); + if (!caps) + return -ENOMEM; ret = init_kvm_tdx_caps(td_conf, caps); if (ret) goto out; - if (copy_to_user(user_caps, caps, sizeof(*caps))) { + if (copy_to_user(user_caps, caps, struct_size(caps, cpuid.entries, + caps->cpuid.nent))) { ret = -EFAULT; goto out; } - if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries, - caps->cpuid.nent * - sizeof(caps->cpuid.entries[0]))) - ret = -EFAULT; - out: /* kfree() accepts NULL. 
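The consolidated tdx_sept_remove_private_spte() above is essentially a fixed sequence that bails out at the first failure: BLOCK the S-EPT entry, perform TLB tracking (tdx_track() plus kicking vCPUs), REMOVE the page, flush its cache lines with the keyed WBINVD, then reset the page. A compact sketch of that control flow with stand-in stubs, only to make the required ordering explicit:

#include <stdio.h>

/* Hypothetical stand-ins for the SEAMCALL wrappers used in the flow above. */
static int  range_block(void)    { puts("TDH.MEM.RANGE.BLOCK");        return 0; }
static void track_and_kick(void) { puts("TDH.MEM.TRACK + kick vCPUs");           }
static int  page_remove(void)    { puts("TDH.MEM.PAGE.REMOVE");        return 0; }
static int  page_wbinvd(void)    { puts("TDH.PHYMEM.PAGE.WBINVD");     return 0; }
static void reset_page(void)     { puts("clear the reclaimed page");             }

static void remove_private_spte(void)
{
	if (range_block())
		return;

	/* TLB tracking must complete before the page can be removed. */
	track_and_kick();

	if (page_remove())
		return;
	if (page_wbinvd())
		return;

	reset_page();
}

int main(void)
{
	remove_private_spte();
	return 0;
}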
*/ kfree(caps); @@ -2537,8 +2457,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, goto free_packages; } - if (WARN_ON_ONCE(err)) { - pr_tdx_error(TDH_MNG_CREATE, err); + if (TDX_BUG_ON(err, TDH_MNG_CREATE, kvm)) { ret = -EIO; goto free_packages; } @@ -2579,8 +2498,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, ret = -EAGAIN; goto teardown; } - if (WARN_ON_ONCE(err)) { - pr_tdx_error(TDH_MNG_ADDCX, err); + if (TDX_BUG_ON(err, TDH_MNG_ADDCX, kvm)) { ret = -EIO; goto teardown; } @@ -2597,8 +2515,7 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, *seamcall_err = err; ret = -EINVAL; goto teardown; - } else if (WARN_ON_ONCE(err)) { - pr_tdx_error_1(TDH_MNG_INIT, err, rcx); + } else if (TDX_BUG_ON_1(err, TDH_MNG_INIT, rcx, kvm)) { ret = -EIO; goto teardown; } @@ -2642,7 +2559,7 @@ free_tdcs: free_tdr: if (tdr_page) __free_page(tdr_page); - kvm_tdx->td.tdr_page = 0; + kvm_tdx->td.tdr_page = NULL; free_hkid: tdx_hkid_free(kvm_tdx); @@ -2747,11 +2664,53 @@ err_out: return -EIO; } +typedef void *tdx_vm_state_guard_t; + +static tdx_vm_state_guard_t tdx_acquire_vm_state_locks(struct kvm *kvm) +{ + int r; + + mutex_lock(&kvm->lock); + + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) { + r = -EBUSY; + goto out_err; + } + + r = kvm_lock_all_vcpus(kvm); + if (r) + goto out_err; + + /* + * Note the unintuitive ordering! vcpu->mutex must be taken outside + * kvm->slots_lock! + */ + mutex_lock(&kvm->slots_lock); + return kvm; + +out_err: + mutex_unlock(&kvm->lock); + return ERR_PTR(r); +} + +static void tdx_release_vm_state_locks(struct kvm *kvm) +{ + mutex_unlock(&kvm->slots_lock); + kvm_unlock_all_vcpus(kvm); + mutex_unlock(&kvm->lock); +} + +DEFINE_CLASS(tdx_vm_state_guard, tdx_vm_state_guard_t, + if (!IS_ERR(_T)) tdx_release_vm_state_locks(_T), + tdx_acquire_vm_state_locks(kvm), struct kvm *kvm); + static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { + struct kvm_tdx_init_vm __user *user_data = u64_to_user_ptr(cmd->data); struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct kvm_tdx_init_vm *init_vm; struct td_params *td_params = NULL; + u32 nr_user_entries; int ret; BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid)); @@ -2763,28 +2722,16 @@ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd) if (cmd->flags) return -EINVAL; - init_vm = kmalloc(sizeof(*init_vm) + - sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES, - GFP_KERNEL); - if (!init_vm) - return -ENOMEM; - - if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) { - ret = -EFAULT; - goto out; - } + if (get_user(nr_user_entries, &user_data->cpuid.nent)) + return -EFAULT; - if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) { - ret = -E2BIG; - goto out; - } + if (nr_user_entries > KVM_MAX_CPUID_ENTRIES) + return -E2BIG; - if (copy_from_user(init_vm->cpuid.entries, - u64_to_user_ptr(cmd->data) + sizeof(*init_vm), - flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) { - ret = -EFAULT; - goto out; - } + init_vm = memdup_user(user_data, + struct_size(user_data, cpuid.entries, nr_user_entries)); + if (IS_ERR(init_vm)) + return PTR_ERR(init_vm); if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) { ret = -EINVAL; @@ -2868,24 +2815,14 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) { struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); - guard(mutex)(&kvm->slots_lock); - if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) 
return -EINVAL; - /* - * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue - * TDH.MEM.PAGE.ADD(). - */ - if (atomic64_read(&kvm_tdx->nr_premapped)) - return -EINVAL; cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td); if (tdx_operand_busy(cmd->hw_error)) return -EBUSY; - if (KVM_BUG_ON(cmd->hw_error, kvm)) { - pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error); + if (TDX_BUG_ON(cmd->hw_error, TDH_MR_FINALIZE, kvm)) return -EIO; - } kvm_tdx->state = TD_STATE_RUNNABLE; /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */ @@ -2894,27 +2831,38 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd) return 0; } -int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +static int tdx_get_cmd(void __user *argp, struct kvm_tdx_cmd *cmd) { - struct kvm_tdx_cmd tdx_cmd; - int r; - - if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd))) + if (copy_from_user(cmd, argp, sizeof(*cmd))) return -EFAULT; /* - * Userspace should never set hw_error. It is used to fill - * hardware-defined error by the kernel. + * Userspace should never set hw_error. KVM writes hw_error to report + * hardware-defined error back to userspace. */ - if (tdx_cmd.hw_error) + if (cmd->hw_error) return -EINVAL; - mutex_lock(&kvm->lock); + return 0; +} + +int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) +{ + struct kvm_tdx_cmd tdx_cmd; + int r; + + r = tdx_get_cmd(argp, &tdx_cmd); + if (r) + return r; + + if (tdx_cmd.id == KVM_TDX_CAPABILITIES) + return tdx_get_capabilities(&tdx_cmd); + + CLASS(tdx_vm_state_guard, guard)(kvm); + if (IS_ERR(guard)) + return PTR_ERR(guard); switch (tdx_cmd.id) { - case KVM_TDX_CAPABILITIES: - r = tdx_get_capabilities(&tdx_cmd); - break; case KVM_TDX_INIT_VM: r = tdx_td_init(kvm, &tdx_cmd); break; @@ -2922,15 +2870,12 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) r = tdx_td_finalize(kvm, &tdx_cmd); break; default: - r = -EINVAL; - goto out; + return -EINVAL; } if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd))) - r = -EFAULT; + return -EFAULT; -out: - mutex_unlock(&kvm->lock); return r; } @@ -2972,16 +2917,14 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) } err = tdh_vp_create(&kvm_tdx->td, &tdx->vp); - if (KVM_BUG_ON(err, vcpu->kvm)) { + if (TDX_BUG_ON(err, TDH_VP_CREATE, vcpu->kvm)) { ret = -EIO; - pr_tdx_error(TDH_VP_CREATE, err); goto free_tdcx; } for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) { err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]); - if (KVM_BUG_ON(err, vcpu->kvm)) { - pr_tdx_error(TDH_VP_ADDCX, err); + if (TDX_BUG_ON(err, TDH_VP_ADDCX, vcpu->kvm)) { /* * Pages already added are reclaimed by the vcpu_free * method, but the rest are freed here. @@ -2994,10 +2937,19 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx) } } - err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); - if (KVM_BUG_ON(err, vcpu->kvm)) { - pr_tdx_error(TDH_VP_INIT, err); - return -EIO; + /* + * tdh_vp_init() can take an exclusive lock of the TDR resource inside + * the TDX-Module. The TDR resource is also taken as shared in several + * no-fail MMU paths, which could return TDX_OPERAND_BUSY on contention + * (TDX-Module locks are try-lock implementations with no slow path). + * Take mmu_lock for write to reflect the nature of the lock taken by + * the TDX-Module, and to ensure the no-fail MMU paths succeed, e.g. if + * a concurrent PUNCH_HOLE on guest_memfd triggers removal of SPTEs. 
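tdx_td_init() now reads cpuid.nent from userspace first and then pulls the whole variable-sized structure in with a single memdup_user(struct_size(...)) call, instead of a fixed-size copy followed by a second copy of the entries. The interesting part is sizing a flexible-array struct from its element count; a userspace analogue with calloc and hypothetical field names (the kernel's struct_size() additionally saturates on overflow, which this plain multiply does not):

#include <stdio.h>
#include <stdlib.h>

struct cpuid_entry {
	unsigned int function, index, eax, ebx, ecx, edx;
};

struct init_vm {
	unsigned int nent;               /* number of trailing entries */
	unsigned int reserved;
	struct cpuid_entry entries[];    /* flexible array member      */
};

/* Equivalent of struct_size(vm, entries, nent): header plus nent entries. */
static size_t init_vm_size(unsigned int nent)
{
	return sizeof(struct init_vm) + (size_t)nent * sizeof(struct cpuid_entry);
}

int main(void)
{
	unsigned int nent = 4;
	struct init_vm *vm = calloc(1, init_vm_size(nent));

	if (!vm)
		return 1;

	vm->nent = nent;
	printf("allocated %zu bytes for %u entries\n", init_vm_size(nent), vm->nent);
	free(vm);
	return 0;
}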
+ */ + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) { + err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id); + if (TDX_BUG_ON(err, TDH_VP_INIT, vcpu->kvm)) + return -EIO; } vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -3016,7 +2968,7 @@ free_tdcx: free_tdvpr: if (tdx->vp.tdvpr_page) __free_page(tdx->vp.tdvpr_page); - tdx->vp.tdvpr_page = 0; + tdx->vp.tdvpr_page = NULL; tdx->vp.tdvpr_pa = 0; return ret; @@ -3054,7 +3006,8 @@ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_i static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) { - struct kvm_cpuid2 __user *output, *td_cpuid; + struct kvm_cpuid2 __user *output; + struct kvm_cpuid2 *td_cpuid; int r = 0, i = 0, leaf; u32 level; @@ -3167,15 +3120,15 @@ struct tdx_gmem_post_populate_arg { static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, void __user *src, int order, void *_arg) { - u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS; - struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct tdx_gmem_post_populate_arg *arg = _arg; - struct kvm_vcpu *vcpu = arg->vcpu; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); + u64 err, entry, level_state; gpa_t gpa = gfn_to_gpa(gfn); - u8 level = PG_LEVEL_4K; struct page *src_page; int ret, i; - u64 err, entry, level_state; + + if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm)) + return -EIO; /* * Get the source page if it has been faulted in. Return failure if the @@ -3187,49 +3140,29 @@ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, if (ret != 1) return -ENOMEM; - ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level); - if (ret < 0) - goto out; - - /* - * The private mem cannot be zapped after kvm_tdp_map_page() - * because all paths are covered by slots_lock and the - * filemap invalidate lock. Check that they are indeed enough. - */ - if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { - scoped_guard(read_lock, &kvm->mmu_lock) { - if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) { - ret = -EIO; - goto out; - } - } - } + kvm_tdx->page_add_src = src_page; + ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn); + kvm_tdx->page_add_src = NULL; - ret = 0; - err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn), - src_page, &entry, &level_state); - if (err) { - ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO; - goto out; - } + put_page(src_page); - if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) - atomic64_dec(&kvm_tdx->nr_premapped); + if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION)) + return ret; - if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) { - for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { - err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, - &level_state); - if (err) { - ret = -EIO; - break; - } - } + /* + * Note, MR.EXTEND can fail if the S-EPT mapping is somehow removed + * between mapping the pfn and now, but slots_lock prevents memslot + * updates, filemap_invalidate_lock() prevents guest_memfd updates, + * mmu_notifier events can't reach S-EPT entries, and KVM's internal + * zapping flows are mutually exclusive with S-EPT mappings. 
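The tdx_vm_state_guard class and the scoped_guard() above both come from the kernel's cleanup.h scope-based helpers: the resources are released automatically when the guard goes out of scope, on every return path. The same effect can be sketched in plain C with the compiler's cleanup attribute (a GCC/Clang extension); the mutex and function names below are invented for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;

static void unlock_cleanup(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
	puts("guard released");
}

/* Any return below releases the lock, much like CLASS()/scoped_guard(). */
static int do_vm_op(int fail_early)
{
	pthread_mutex_lock(&vm_lock);
	__attribute__((cleanup(unlock_cleanup))) pthread_mutex_t *guard = &vm_lock;

	if (fail_early)
		return -1;          /* cleanup still runs here */

	puts("doing work with the VM state locked");
	return 0;
}

int main(void)
{
	do_vm_op(1);
	do_vm_op(0);
	return 0;
}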
+ */ + for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) { + err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry, &level_state); + if (TDX_BUG_ON_2(err, TDH_MR_EXTEND, entry, level_state, kvm)) + return -EIO; } -out: - put_page(src_page); - return ret; + return 0; } static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd) @@ -3245,8 +3178,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c if (tdx->state != VCPU_TD_STATE_INITIALIZED) return -EINVAL; - guard(mutex)(&kvm->slots_lock); - /* Once TD is finalized, the initial guest memory is fixed. */ if (kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; @@ -3264,7 +3195,6 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1)) return -EINVAL; - kvm_mmu_reload(vcpu); ret = 0; while (region.nr_pages) { if (signal_pending(current)) { @@ -3301,28 +3231,57 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c return ret; } -int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { - struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm *kvm = vcpu->kvm; + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm); struct kvm_tdx_cmd cmd; - int ret; + int r; - if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) - return -EINVAL; + r = tdx_get_cmd(argp, &cmd); + if (r) + return r; - if (copy_from_user(&cmd, argp, sizeof(cmd))) - return -EFAULT; + CLASS(tdx_vm_state_guard, guard)(kvm); + if (IS_ERR(guard)) + return PTR_ERR(guard); - if (cmd.hw_error) + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) return -EINVAL; + vcpu_load(vcpu); + switch (cmd.id) { + case KVM_TDX_INIT_MEM_REGION: + r = tdx_vcpu_init_mem_region(vcpu, &cmd); + break; case KVM_TDX_INIT_VCPU: - ret = tdx_vcpu_init(vcpu, &cmd); + r = tdx_vcpu_init(vcpu, &cmd); break; - case KVM_TDX_INIT_MEM_REGION: - ret = tdx_vcpu_init_mem_region(vcpu, &cmd); + default: + r = -ENOIOCTLCMD; break; + } + + vcpu_put(vcpu); + + return r; +} + +int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) +{ + struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm); + struct kvm_tdx_cmd cmd; + int ret; + + if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE) + return -EINVAL; + + ret = tdx_get_cmd(argp, &cmd); + if (ret) + return ret; + + switch (cmd.id) { case KVM_TDX_GET_CPUID: ret = tdx_vcpu_get_cpuid(vcpu, &cmd); break; @@ -3447,10 +3406,6 @@ static int __init __tdx_bringup(void) /* * Check if MSRs (tdx_uret_msrs) can be saved/restored * before returning to user space. - * - * this_cpu_ptr(user_return_msrs)->registered isn't checked - * because the registration is done at vcpu runtime by - * tdx_user_return_msr_update_cache(). */ tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr); if (tdx_uret_msrs[i].slot == -1) { diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index ca39a9391db1..45b5183ccb36 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -36,8 +36,12 @@ struct kvm_tdx { struct tdx_td td; - /* For KVM_TDX_INIT_MEM_REGION. */ - atomic64_t nr_premapped; + /* + * Scratch pointer used to pass the source page to tdx_mem_page_add(). + * Protected by slots_lock, and non-NULL only when mapping a private + * pfn via tdx_gmem_post_populate(). 
+ */ + struct page *page_add_src; /* * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do @@ -67,7 +71,6 @@ struct vcpu_tdx { u64 vp_enter_ret; enum vcpu_tdx_state state; - bool guest_entered; u64 map_gpa_next; u64 map_gpa_end; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bc255d709d8a..4426d34811fc 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -71,6 +71,7 @@ * @regs: unsigned long * (to guest registers) * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl + * VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO: vCPU can access host MMIO * * Returns: * 0 on VM-Exit, 1 on VM-Fail @@ -92,7 +93,7 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Save @vmx for SPEC_CTRL handling */ push %_ASM_ARG1 - /* Save @flags for SPEC_CTRL handling */ + /* Save @flags (used for VMLAUNCH vs. VMRESUME and mitigations). */ push %_ASM_ARG3 /* @@ -101,9 +102,6 @@ SYM_FUNC_START(__vmx_vcpu_run) */ push %_ASM_ARG2 - /* Copy @flags to EBX, _ASM_ARG3 is volatile. */ - mov %_ASM_ARG3L, %ebx - lea (%_ASM_SP), %_ASM_ARG2 call vmx_update_host_rsp @@ -118,13 +116,23 @@ SYM_FUNC_START(__vmx_vcpu_run) * and vmentry. */ mov 2*WORD_SIZE(%_ASM_SP), %_ASM_DI - movl VMX_spec_ctrl(%_ASM_DI), %edi - movl PER_CPU_VAR(x86_spec_ctrl_current), %esi - cmp %edi, %esi +#ifdef CONFIG_X86_64 + mov VMX_spec_ctrl(%rdi), %rdx + cmp PER_CPU_VAR(x86_spec_ctrl_current), %rdx je .Lspec_ctrl_done + movl %edx, %eax + shr $32, %rdx +#else + mov VMX_spec_ctrl(%edi), %eax + mov PER_CPU_VAR(x86_spec_ctrl_current), %ecx + xor %eax, %ecx + mov VMX_spec_ctrl + 4(%edi), %edx + mov PER_CPU_VAR(x86_spec_ctrl_current + 4), %edi + xor %edx, %edi + or %edi, %ecx + je .Lspec_ctrl_done +#endif mov $MSR_IA32_SPEC_CTRL, %ecx - xor %edx, %edx - mov %edi, %eax wrmsr .Lspec_ctrl_done: @@ -137,9 +145,6 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load @regs to RAX. */ mov (%_ASM_SP), %_ASM_AX - /* Check if vmlaunch or vmresume is needed */ - bt $VMX_RUN_VMRESUME_SHIFT, %ebx - /* Load guest registers. Don't clobber flags. */ mov VCPU_RCX(%_ASM_AX), %_ASM_CX mov VCPU_RDX(%_ASM_AX), %_ASM_DX @@ -160,11 +165,23 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Load guest RAX. This kills the @regs pointer! */ mov VCPU_RAX(%_ASM_AX), %_ASM_AX - /* Clobbers EFLAGS.ZF */ - CLEAR_CPU_BUFFERS - - /* Check EFLAGS.CF from the VMX_RUN_VMRESUME bit test above. */ - jnc .Lvmlaunch + /* + * Note, ALTERNATIVE_2 works in reverse order. If CLEAR_CPU_BUF_VM is + * enabled, do VERW unconditionally. If CPU_BUF_VM_MMIO is enabled, + * check @flags to see if the vCPU has access to host MMIO, and if so, + * do VERW. Else, do nothing (no mitigations needed/enabled). + */ + ALTERNATIVE_2 "", \ + __stringify(testl $VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO, WORD_SIZE(%_ASM_SP); \ + jz .Lskip_mmio_verw; \ + VERW; \ + .Lskip_mmio_verw:), \ + X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO, \ + __stringify(VERW), X86_FEATURE_CLEAR_CPU_BUF_VM + + /* Check @flags to see if VMLAUNCH or VMRESUME is needed. 
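The ALTERNATIVE_2 in __vmx_vcpu_run encodes a three-way policy right before VM-Enter: if X86_FEATURE_CLEAR_CPU_BUF_VM is set, clear CPU buffers (VERW) unconditionally; else if only the MMIO variant is set, clear them only when @flags says this vCPU can reach host MMIO; otherwise do nothing. The patching happens at alternatives time, but the decision table is easier to read as C; the feature and flag names mirror the diff, the bit value and helper are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO  (1u << 2)   /* illustrative value */

struct mitigation_policy {
	bool clear_cpu_buf_vm;        /* X86_FEATURE_CLEAR_CPU_BUF_VM      */
	bool clear_cpu_buf_vm_mmio;   /* X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO */
};

static bool need_verw_before_entry(struct mitigation_policy p, unsigned int run_flags)
{
	if (p.clear_cpu_buf_vm)
		return true;            /* unconditional VERW                */
	if (p.clear_cpu_buf_vm_mmio)
		return run_flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
	return false;                   /* no buffer-clearing needed         */
}

int main(void)
{
	struct mitigation_policy mmio_only = { .clear_cpu_buf_vm_mmio = true };

	printf("%d\n", need_verw_before_entry(mmio_only, 0));
	printf("%d\n", need_verw_before_entry(mmio_only,
					      VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO));
	return 0;
}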
*/ + testl $VMX_RUN_VMRESUME, WORD_SIZE(%_ASM_SP) + jz .Lvmlaunch /* * After a successful VMRESUME/VMLAUNCH, control flow "magically" diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 91b6f2f3edc2..4cbe8c84b636 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -203,6 +203,7 @@ module_param(pt_mode, int, S_IRUGO); struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; +#ifdef CONFIG_CPU_MITIGATIONS static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); static DEFINE_MUTEX(vmx_l1d_flush_mutex); @@ -225,7 +226,7 @@ static const struct { #define L1D_CACHE_ORDER 4 static void *vmx_l1d_flush_pages; -static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) +static int __vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) { struct page *page; unsigned int i; @@ -302,6 +303,26 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } +static int vmx_setup_l1d_flush(void) +{ + /* + * Hand the parameter mitigation value in which was stored in the pre + * module init parser. If no parameter was given, it will contain + * 'auto' which will be turned into the default 'cond' mitigation mode. + */ + return __vmx_setup_l1d_flush(vmentry_l1d_flush_param); +} + +static void vmx_cleanup_l1d_flush(void) +{ + if (vmx_l1d_flush_pages) { + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); + vmx_l1d_flush_pages = NULL; + } + /* Restore state so sysfs ignores VMX */ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} + static int vmentry_l1d_flush_parse(const char *s) { unsigned int i; @@ -339,7 +360,7 @@ static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) } mutex_lock(&vmx_l1d_flush_mutex); - ret = vmx_setup_l1d_flush(l1tf); + ret = __vmx_setup_l1d_flush(l1tf); mutex_unlock(&vmx_l1d_flush_mutex); return ret; } @@ -352,6 +373,101 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +/* + * Software based L1D cache flush which is used when microcode providing + * the cache control MSR is not loaded. + * + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to + * flush it is required to read in 64 KiB because the replacement algorithm + * is not exactly LRU. This could be sized at runtime via topology + * information but as all relevant affected CPUs have 32KiB L1D cache size + * there is no point in doing so. + */ +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + int size = PAGE_SIZE << L1D_CACHE_ORDER; + + if (!static_branch_unlikely(&vmx_l1d_should_flush)) + return; + + /* + * This code is only executed when the flush mode is 'cond' or + * 'always' + */ + if (static_branch_likely(&vmx_l1d_flush_cond)) { + /* + * Clear the per-cpu flush bit, it gets set again if the vCPU + * is reloaded, i.e. if the vCPU is scheduled out or if KVM + * exits to userspace, or if KVM reaches one of the unsafe + * VMEXIT handlers, e.g. if KVM calls into the emulator, + * or from the interrupt handlers. 
+ */ + if (!kvm_get_cpu_l1tf_flush_l1d()) + return; + kvm_clear_cpu_l1tf_flush_l1d(); + } + + vcpu->stat.l1d_flush++; + + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + return; + } + + asm volatile( + /* First ensure the pages are in the TLB */ + "xorl %%eax, %%eax\n" + ".Lpopulate_tlb:\n\t" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $4096, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lpopulate_tlb\n\t" + "xorl %%eax, %%eax\n\t" + "cpuid\n\t" + /* Now fill the cache */ + "xorl %%eax, %%eax\n" + ".Lfill_cache:\n" + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" + "addl $64, %%eax\n\t" + "cmpl %%eax, %[size]\n\t" + "jne .Lfill_cache\n\t" + "lfence\n" + :: [flush_pages] "r" (vmx_l1d_flush_pages), + [size] "r" (size) + : "eax", "ebx", "ecx", "edx"); +} + +#else /* CONFIG_CPU_MITIGATIONS*/ +static int vmx_setup_l1d_flush(void) +{ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NEVER; + return 0; +} +static void vmx_cleanup_l1d_flush(void) +{ + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; +} +static __always_inline void vmx_l1d_flush(struct kvm_vcpu *vcpu) +{ + +} +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) +{ + pr_warn_once("Kernel compiled without mitigations, ignoring vmentry_l1d_flush\n"); + return 0; +} +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) +{ + return sysfs_emit(s, "never\n"); +} +#endif + +static const struct kernel_param_ops vmentry_l1d_flush_ops = { + .set = vmentry_l1d_flush_set, + .get = vmentry_l1d_flush_get, +}; +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); + static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { u64 msr; @@ -404,12 +520,6 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vmx->disable_fb_clear = false; } -static const struct kernel_param_ops vmentry_l1d_flush_ops = { - .set = vmentry_l1d_flush_set, - .get = vmentry_l1d_flush_get, -}; -module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); - static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); @@ -752,7 +862,7 @@ static void __loaded_vmcs_clear(void *arg) loaded_vmcs->launched = 0; } -void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) +static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; @@ -903,7 +1013,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; - if (static_branch_unlikely(&cpu_buf_vm_clear) && + if (cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO) && kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; @@ -3219,6 +3329,40 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->vpid; } +static u64 construct_eptp(hpa_t root_hpa) +{ + u64 eptp = root_hpa | VMX_EPTP_MT_WB; + struct kvm_mmu_page *root; + + if (kvm_mmu_is_dummy_root(root_hpa)) + return eptp | VMX_EPTP_PWL_4; + + /* + * EPT roots should always have an associated MMU page. Return a "bad" + * EPTP to induce VM-Fail instead of continuing on in a unknown state. + */ + root = root_to_sp(root_hpa); + if (WARN_ON_ONCE(!root)) + return INVALID_PAGE; + + eptp |= (root->role.level == 5) ? 
VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + + if (enable_ept_ad_bits && !root->role.ad_disabled) + eptp |= VMX_EPTP_AD_ENABLE_BIT; + + return eptp; +} + +static void vmx_flush_tlb_ept_root(hpa_t root_hpa) +{ + u64 eptp = construct_eptp(root_hpa); + + if (VALID_PAGE(eptp)) + ept_sync_context(eptp); + else + ept_sync_global(); +} + void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3229,8 +3373,7 @@ void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) return; if (enable_ept) - ept_sync_context(construct_eptp(vcpu, root_hpa, - mmu->root_role.level)); + vmx_flush_tlb_ept_root(root_hpa); else vpid_sync_context(vmx_get_current_vpid(vcpu)); } @@ -3396,30 +3539,16 @@ static int vmx_get_max_ept_level(void) return 4; } -u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) -{ - u64 eptp = VMX_EPTP_MT_WB; - - eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; - - if (enable_ept_ad_bits && - (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) - eptp |= VMX_EPTP_AD_ENABLE_BIT; - eptp |= root_hpa; - - return eptp; -} - void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; unsigned long guest_cr3; - u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, root_hpa, root_level); - vmcs_write64(EPT_POINTER, eptp); + KVM_MMU_WARN_ON(root_to_sp(root_hpa) && + root_level != root_to_sp(root_hpa)->role.level); + vmcs_write64(EPT_POINTER, construct_eptp(root_hpa)); hv_track_root_tdp(vcpu, root_hpa); @@ -6631,15 +6760,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return kvm_vmx_exit_handlers[exit_handler_index](vcpu); unexpected_vmexit: - vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", - exit_reason.full); dump_vmcs(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = exit_reason.full; - vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; + kvm_prepare_unexpected_reason_exit(vcpu, exit_reason.full); return 0; } @@ -6661,77 +6783,6 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return ret; } -/* - * Software based L1D cache flush which is used when microcode providing - * the cache control MSR is not loaded. - * - * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to - * flush it is required to read in 64 KiB because the replacement algorithm - * is not exactly LRU. This could be sized at runtime via topology - * information but as all relevant affected CPUs have 32KiB L1D cache size - * there is no point in doing so. - */ -static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) -{ - int size = PAGE_SIZE << L1D_CACHE_ORDER; - - /* - * This code is only executed when the flush mode is 'cond' or - * 'always' - */ - if (static_branch_likely(&vmx_l1d_flush_cond)) { - bool flush_l1d; - - /* - * Clear the per-vcpu flush bit, it gets set again if the vCPU - * is reloaded, i.e. if the vCPU is scheduled out or if KVM - * exits to userspace, or if KVM reaches one of the unsafe - * VMEXIT handlers, e.g. if KVM calls into the emulator. - */ - flush_l1d = vcpu->arch.l1tf_flush_l1d; - vcpu->arch.l1tf_flush_l1d = false; - - /* - * Clear the per-cpu flush bit, it gets set again from - * the interrupt handlers. 
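The reworked construct_eptp() derives everything from the root HPA: write-back memory type, a page-walk length taken from the root kvm_mmu_page (4 or 5 levels), the A/D-enable bit, and INVALID_PAGE when the root has no associated MMU page so the eventual VM-Enter fails instead of running with an unknown walk depth. The bit packing itself can be sketched as plain C; the field positions follow the SDM's EPTP format, the helper is a stand-in:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EPTP_MT_WB       6ull          /* bits 2:0, memory type write-back   */
#define EPTP_PWL_4       (3ull << 3)   /* bits 5:3, page-walk length 4       */
#define EPTP_PWL_5       (4ull << 3)   /* bits 5:3, page-walk length 5       */
#define EPTP_AD_ENABLE   (1ull << 6)   /* enable accessed/dirty flags        */

static uint64_t make_eptp(uint64_t root_hpa, int level, bool ad_bits)
{
	uint64_t eptp = root_hpa | EPTP_MT_WB;

	eptp |= (level == 5) ? EPTP_PWL_5 : EPTP_PWL_4;
	if (ad_bits)
		eptp |= EPTP_AD_ENABLE;
	return eptp;
}

int main(void)
{
	printf("eptp = %#llx\n",
	       (unsigned long long)make_eptp(0x123456000ull, 4, true));
	return 0;
}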
- */ - flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); - kvm_clear_cpu_l1tf_flush_l1d(); - - if (!flush_l1d) - return; - } - - vcpu->stat.l1d_flush++; - - if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - return; - } - - asm volatile( - /* First ensure the pages are in the TLB */ - "xorl %%eax, %%eax\n" - ".Lpopulate_tlb:\n\t" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $4096, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lpopulate_tlb\n\t" - "xorl %%eax, %%eax\n\t" - "cpuid\n\t" - /* Now fill the cache */ - "xorl %%eax, %%eax\n" - ".Lfill_cache:\n" - "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" - "addl $64, %%eax\n\t" - "cmpl %%eax, %[size]\n\t" - "jne .Lfill_cache\n\t" - "lfence\n" - :: [flush_pages] "r" (vmx_l1d_flush_pages), - [size] "r" (size) - : "eax", "ebx", "ecx", "edx"); -} - void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -7050,10 +7101,19 @@ void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) if (to_vt(vcpu)->emulation_required) return; - if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) + switch (vmx_get_exit_reason(vcpu).basic) { + case EXIT_REASON_EXTERNAL_INTERRUPT: handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) + break; + case EXIT_REASON_EXCEPTION_NMI: handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); + break; + case EXIT_REASON_MCE_DURING_VMENTRY: + kvm_machine_check(); + break; + default: + break; + } } /* @@ -7328,21 +7388,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, guest_state_enter_irqoff(); - /* - * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW - * mitigation for MDS is done late in VMentry and is still - * executed in spite of L1D Flush. This is because an extra VERW - * should not matter much after the big hammer L1D Flush. - * - * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA, - * and is affected by MMIO Stale Data. In such cases mitigation in only - * needed against an MMIO capable guest. 
- */ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&cpu_buf_vm_clear) && - (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) - x86_clear_cpu_buffers(); + vmx_l1d_flush(vcpu); vmx_disable_fb_clear(vmx); @@ -7454,8 +7500,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xsave_state(vcpu); - pt_guest_enter(vmx); atomic_switch_perf_msrs(vmx); @@ -7499,8 +7543,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) pt_guest_exit(vmx); - kvm_load_host_xsave_state(vcpu); - if (is_guest_mode(vcpu)) { /* * Track VMLAUNCH/VMRESUME that have made past guest state @@ -7516,9 +7558,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) - kvm_machine_check(); - trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) @@ -8679,16 +8718,6 @@ __init int vmx_hardware_setup(void) return r; } -static void vmx_cleanup_l1d_flush(void) -{ - if (vmx_l1d_flush_pages) { - free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); - vmx_l1d_flush_pages = NULL; - } - /* Restore state so sysfs ignores VMX */ - l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -} - void vmx_exit(void) { allow_smaller_maxphyaddr = false; @@ -8724,14 +8753,8 @@ int __init vmx_init(void) if (r) return r; - /* - * Must be called after common x86 init so enable_ept is properly set - * up. Hand the parameter mitigation value in which was stored in - * the pre module init parser. If no parameter was given, it will - * contain 'auto' which will be turned into the default 'cond' - * mitigation mode. - */ - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + /* Must be called after common x86 init so enable_ept is setup. 
*/ + r = vmx_setup_l1d_flush(); if (r) goto err_l1d_flush; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index ea93121029f9..bc3ed3145d7e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -369,7 +369,6 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); @@ -681,7 +680,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); -void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) { diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index 9697368d65b3..d09abeac2b56 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,7 +73,6 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); -void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); @@ -149,6 +148,7 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp); +int tdx_vcpu_unlocked_ioctl(struct kvm_vcpu *vcpu, void __user *argp); void tdx_flush_tlb_current(struct kvm_vcpu *vcpu); void tdx_flush_tlb_all(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9c2aa6f4705..0c6d899d53dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -159,9 +159,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); -static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, 0444); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); @@ -212,7 +209,7 @@ struct kvm_user_return_msrs { u32 __read_mostly kvm_nr_uret_msrs; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; -static struct kvm_user_return_msrs __percpu *user_return_msrs; +static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ @@ -575,24 +572,26 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } +static void kvm_destroy_user_return_msrs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); + + kvm_nr_uret_msrs = 0; +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; struct kvm_user_return_msrs *msrs = 
container_of(urn, struct kvm_user_return_msrs, urn); struct kvm_user_return_msr_values *values; - unsigned long flags; - /* - * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_disable_virtualization_cpu() - */ - local_irq_save(flags); - if (msrs->registered) { - msrs->registered = false; - user_return_notifier_unregister(urn); - } - local_irq_restore(flags); + msrs->registered = false; + user_return_notifier_unregister(urn); + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { @@ -643,7 +642,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); u64 value; int i; @@ -665,7 +664,7 @@ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); @@ -681,24 +680,15 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); -void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) -{ - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); - - msrs->values[slot].curr = value; - kvm_user_return_register_notifier(msrs); -} -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_user_return_msr_update_cache); - u64 kvm_get_user_return_msr(unsigned int slot) { - return this_cpu_ptr(user_return_msrs)->values[slot].curr; + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); static void drop_user_return_notifiers(void) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); if (msrs->registered) kvm_on_user_return(&msrs->urn); @@ -1045,6 +1035,13 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); +static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + return (vcpu->arch.apf.msr_en_val & mask) == mask; +} + static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -1137,15 +1134,20 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon } if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - /* * Clearing CR0.PG is defined to flush the TLB from the guest's * perspective. */ if (!(cr0 & X86_CR0_PG)) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + /* + * Check for async #PF completion events when enabling paging, + * as the vCPU may have previously encountered async #PFs (it's + * entirely legal for the guest to toggle paging on/off without + * waiting for the async #PF queue to drain). 
+ */ + else if (kvm_pv_async_pf_enabled(vcpu)) + kvm_make_request(KVM_REQ_APF_READY, vcpu); } if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) @@ -1203,20 +1205,27 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); -void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) { if (vcpu->arch.guest_state_protected) return; - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) + return; - if (vcpu->arch.xcr0 != kvm_host.xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + if (vcpu->arch.xcr0 != kvm_host.xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, + load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0); - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss); - } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss); +} + +static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.guest_state_protected) + return; if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && @@ -1224,9 +1233,8 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) wrpkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_guest_xsave_state); -void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) return; @@ -1238,19 +1246,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.pkru != vcpu->arch.host_pkru) wrpkru(vcpu->arch.host_pkru); } - - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - - if (vcpu->arch.xcr0 != kvm_host.xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); - - if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrq(MSR_IA32_XSS, kvm_host.xss); - } - } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) @@ -3505,27 +3501,17 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) /* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from - * the rest of the vcpus to remain static. Otherwise ntp frequency - * correction applies to one vcpu's system_timestamp but not - * the others. + * the rest of the vcpus to remain static. * * So in those cases, request a kvmclock update for all vcpus. - * We need to rate-limit these requests though, as they can - * considerably slow guests that have a large number of vcpus. - * The time for a remote vcpu to update its kvmclock is bound - * by the delay we use to rate-limit the updates. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. 
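The smp_store_mb() added to the "page ready" ack path pairs, per the comment, with an smp_mb__after_atomic() in kvm_arch_async_page_present_queued(): the acking vCPU must clear pageready_pending before it re-scans the completion queue, while the producer must publish the completion before testing pageready_pending, otherwise a completion could end up with neither an interrupt pending nor a re-scan coming. A userspace C11 sketch of that handshake shape; the producer half is a guess at the general pattern under sequentially consistent atomics, not a copy of the KVM function:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool pageready_pending;   /* "an interrupt is already queued"   */
static atomic_int  completions;         /* stand-in for the completion queue  */

/* Producer: queue a completion, then notify unless an interrupt is pending. */
static void queue_completion(void)
{
	atomic_fetch_add(&completions, 1);
	/* The seq_cst RMW orders the add before the load below, playing the
	 * role of smp_mb__after_atomic() in this sketch. */
	if (!atomic_load(&pageready_pending))
		puts("kick vCPU: deliver page-ready interrupt");
}

/* Consumer (guest ack): clear pending with a full barrier, then re-scan. */
static void ack_page_ready(void)
{
	atomic_store(&pageready_pending, false);   /* smp_store_mb() analogue */
	if (atomic_load(&completions))
		puts("deliver remaining completions");
}

int main(void)
{
	atomic_store(&pageready_pending, true);
	queue_completion();
	ack_page_ready();
	return 0;
}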
*/ - -#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - -static void kvmclock_update_fn(struct work_struct *work) +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { unsigned long i; - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_update_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; + struct kvm *kvm = v->kvm; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -3533,29 +3519,6 @@ static void kvmclock_update_fn(struct work_struct *work) } } -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) -{ - struct kvm *kvm = v->kvm; - - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - schedule_delayed_work(&kvm->arch.kvmclock_update_work, - KVMCLOCK_UPDATE_DELAY); -} - -#define KVMCLOCK_SYNC_PERIOD (300 * HZ) - -static void kvmclock_sync_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_sync_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); -} - /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) { @@ -3650,13 +3613,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) -{ - u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; - - return (vcpu->arch.apf.msr_en_val & mask) == mask; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -4182,7 +4138,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { - vcpu->arch.apf.pageready_pending = false; + /* + * Pairs with the smp_mb__after_atomic() in + * kvm_arch_async_page_present_queued(). + */ + smp_store_mb(vcpu->arch.apf.pageready_pending, false); + kvm_check_async_pf_completion(vcpu); } break; @@ -5188,7 +5149,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (vcpu->scheduled_out && pmu->version && pmu->event_count) { pmu->need_cleanup = true; @@ -7239,6 +7200,19 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + + if (ioctl == KVM_MEMORY_ENCRYPT_OP && + kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl) + return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp); + + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; @@ -7998,7 +7972,7 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { /* kvm_write_guest_virt_system can pull in tons of pages. 
*/ - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); @@ -8842,6 +8816,14 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } +static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) +{ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; + return 0; +} + static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) { return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); @@ -8914,6 +8896,7 @@ static const struct x86_emulate_ops emulate_ops = { .is_smm = emulator_is_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, + .get_xcr = emulator_get_xcr, .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, .is_canonical_addr = emulator_is_canonical_addr, @@ -9109,6 +9092,18 @@ void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); +void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason) +{ + vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason); + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { struct kvm *kvm = vcpu->kvm; @@ -9337,6 +9332,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt, + int emulation_type) +{ + u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type); + + switch (ctxt->b) { + case 0xcc: + return vector == BP_VECTOR; + case 0xcd: + return vector == ctxt->src.val; + case 0xce: + return vector == OF_VECTOR; + default: + return false; + } +} + /* * Decode an instruction for emulation. The caller is responsible for handling * code breakpoints. Note, manually detecting code breakpoints is unnecessary @@ -9394,7 +9406,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return handle_emulation_failure(vcpu, emulation_type); } - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); @@ -9447,6 +9459,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * injecting single-step #DBs. 
*/ if (emulation_type & EMULTYPE_SKIP) { + if (emulation_type & EMULTYPE_SKIP_SOFT_INT && + !is_soft_int_instruction(ctxt, emulation_type)) + return 0; + if (ctxt->mode != X86EMUL_MODE_PROT64) ctxt->eip = (u32)ctxt->_eip; else @@ -10031,17 +10047,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -ENOMEM; } - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); - if (!user_return_msrs) { - pr_err("failed to allocate percpu kvm_user_return_msrs\n"); - r = -ENOMEM; - goto out_free_x86_emulator_cache; - } - kvm_nr_uret_msrs = 0; - r = kvm_mmu_vendor_module_init(); if (r) - goto out_free_percpu; + goto out_free_x86_emulator_cache; kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; @@ -10066,6 +10074,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + WARN_ON_ONCE(kvm_nr_uret_msrs); + r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; @@ -10138,9 +10148,8 @@ out_unwind_ops: kvm_x86_ops.enable_virtualization_cpu = NULL; kvm_x86_call(hardware_unsetup)(); out_mmu_exit: + kvm_destroy_user_return_msrs(); kvm_mmu_vendor_module_exit(); -out_free_percpu: - free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; @@ -10168,8 +10177,8 @@ void kvm_x86_vendor_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_call(hardware_unsetup)(); + kvm_destroy_user_return_msrs(); kvm_mmu_vendor_module_exit(); - free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); @@ -11291,6 +11300,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_fpu.xfd_err) wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + kvm_load_xfeatures(vcpu, true); + if (unlikely(vcpu->arch.switch_db_regs && !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { set_debugreg(DR7_FIXED_1, 7); @@ -11319,6 +11330,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) guest_timing_enter_irqoff(); + /* + * Swap PKRU with hardware breakpoints disabled to minimize the number + * of flows where non-KVM code can run with guest state loaded. + */ + kvm_load_guest_pkru(vcpu); + for (;;) { /* * Assert that vCPU vs. VM APICv state is consistent. An APICv @@ -11347,6 +11364,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; } + kvm_load_host_pkru(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit @@ -11377,6 +11396,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_load_xfeatures(vcpu, false); + /* * Sync xfd before calling handle_exit_irqoff() which may * rely on the fact that guest_fpu::xfd is up-to-date (e.g. 
@@ -12734,8 +12755,6 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); @@ -12746,10 +12765,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.msr_kvm_poll_control = 1; mutex_unlock(&vcpu->mutex); - - if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -13088,7 +13103,21 @@ int kvm_arch_enable_virtualization_cpu(void) void kvm_arch_disable_virtualization_cpu(void) { kvm_x86_call(disable_virtualization_cpu)(); - drop_user_return_notifiers(); + + /* + * Leave the user-return notifiers as-is when disabling virtualization + * for reboot, i.e. when disabling via IPI function call, and instead + * pin kvm.ko (if it's a module) to defend against use-after-free (in + * the *very* unlikely scenario module unload is racing with reboot). + * On a forced reboot, tasks aren't frozen before shutdown, and so KVM + * could be actively modifying user-return MSR state when the IPI to + * disable virtualization arrives. Handle the extreme edge case here + * instead of trying to account for it in the normal flows. + */ + if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) + drop_user_return_notifiers(); + else + __module_get(THIS_MODULE); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) @@ -13160,9 +13189,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.hv_root_tdp = INVALID_PAGE; #endif - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); - kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); @@ -13269,9 +13295,6 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) * is unsafe, i.e. will lead to use-after-free. The PIT also needs to * be stopped before IRQ routing is freed. */ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - #ifdef CONFIG_KVM_IOAPIC kvm_free_pit(kvm); #endif @@ -13888,7 +13911,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, if ((work->wakeup_all || work->notpresent_injected) && kvm_pv_async_pf_enabled(vcpu) && !apf_put_user_ready(vcpu, work->arch.token)) { - vcpu->arch.apf.pageready_pending = true; + WRITE_ONCE(vcpu->arch.apf.pageready_pending, true); kvm_apic_set_irq(vcpu, &irq, NULL); } @@ -13899,7 +13922,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_APF_READY, vcpu); - if (!vcpu->arch.apf.pageready_pending) + + /* Pairs with smp_store_mb() in kvm_set_msr_common(). */ + smp_mb__after_atomic(); + + if (!READ_ONCE(vcpu->arch.apf.pageready_pending)) kvm_vcpu_kick(vcpu); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f3dc77f006f9..fdab0ad49098 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -420,6 +420,20 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) return !(kvm->arch.disabled_quirks & quirk); } +static __always_inline void kvm_request_l1tf_flush_l1d(void) +{ +#if IS_ENABLED(CONFIG_CPU_MITIGATIONS) && IS_ENABLED(CONFIG_KVM_INTEL) + /* + * Use a raw write to set the per-CPU flag, as KVM will ensure a flush + * even if preemption is currently enabled.. 
If the current vCPU task + * is migrated to a different CPU (or userspace runs the vCPU on a + * different task) before the next VM-Entry, then kvm_arch_vcpu_load() + * will request a flush on the new CPU. + */ + raw_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); +#endif +} + void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); @@ -622,8 +636,6 @@ static inline void kvm_machine_check(void) #endif } -void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); -void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); int kvm_spec_ctrl_test_value(u64 value); int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 0d13d47c164b..db7c7c50cebc 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2777,6 +2777,43 @@ void sev_platform_shutdown(void) } EXPORT_SYMBOL_GPL(sev_platform_shutdown); +u64 sev_get_snp_policy_bits(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + u64 policy_bits; + + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return 0; + + if (!psp || !psp->sev_data) + return 0; + + sev = psp->sev_data; + + policy_bits = SNP_POLICY_MASK_BASE; + + if (sev->snp_plat_status.feature_info) { + if (sev->snp_feat_info_0.ecx & SNP_RAPL_DISABLE_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_RAPL_DIS; + + if (sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM; + + if (sev->snp_feat_info_0.ecx & SNP_AES_256_XTS_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_MEM_AES_256_XTS; + + if (sev->snp_feat_info_0.ecx & SNP_CXL_ALLOW_POLICY_SUPPORTED) + policy_bits |= SNP_POLICY_MASK_CXL_ALLOW; + + if (sev_version_greater_or_equal(1, 58)) + policy_bits |= SNP_POLICY_MASK_PAGE_SWAP_DISABLE; + } + + return policy_bits; +} +EXPORT_SYMBOL_GPL(sev_get_snp_policy_bits); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..d927ae32e7d0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -491,8 +491,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, - ~__GFP_FS), 0); + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), + 0, NULL); if (!folio) break; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..d4523d5debcd 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -742,7 +742,7 @@ again: } folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), - 0); + 0, NULL); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bc80cfe482f7..b7369fb4fbe9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -562,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) * Allocate a managed folio for cached I/O, or it may be * then filled with a file-backed folio for in-place I/O */ - newfolio = filemap_alloc_folio(gfp, 0); + newfolio = filemap_alloc_folio(gfp, 0, NULL); if (!newfolio) continue; newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..a65e8cd388bc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1947,7 +1947,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, return; } - cfolio = 
filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0); + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL); if (!cfolio) return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,18 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..a17fabbc0269 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,9 +654,11 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy); #else -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } @@ -667,7 +669,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o static inline struct page *__page_cache_alloc(gfp_t gfp) { - return &filemap_alloc_folio(gfp, 0)->page; + return &filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) @@ -753,11 +755,17 @@ static inline fgf_t fgf_set_order(size_t size) } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp); +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +static inline struct folio *__filemap_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp) +{ + return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); +} + /** * write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,39 @@ #include <uapi/linux/psp-sev.h> +/* As defined by SEV API, under "Guest Policy". 
*/ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** @@ -849,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -995,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index bb575f3ab45e..638ca21b7a90 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -103,5 +103,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ +#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/filemap.c b/mm/filemap.c index 2f1e7e283a51..270485b19a5b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1002,11 +1002,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { int n; struct folio *folio; + if (policy) + return folio_alloc_mpol_noprof(gfp, order, policy, + NO_INTERLEAVE_INDEX, numa_node_id()); + if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { @@ -1923,11 +1928,12 @@ out: } /** - * __filemap_get_folio - Find and get a reference to a folio. + * __filemap_get_folio_mpol - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. + * @policy: NUMA memory allocation policy to follow. * * Looks up the page cache entry at @mapping & @index. * @@ -1938,8 +1944,8 @@ out: * * Return: The found folio or an ERR_PTR() otherwise. 
*/ -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp) +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy) { struct folio *folio; @@ -2009,7 +2015,7 @@ no_page: err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; - folio = filemap_alloc_folio(alloc_gfp, order); + folio = filemap_alloc_folio(alloc_gfp, order, policy); if (!folio) continue; @@ -2056,7 +2062,7 @@ no_page: folio_clear_dropbehind(folio); return folio; } -EXPORT_SYMBOL(__filemap_get_folio); +EXPORT_SYMBOL(__filemap_get_folio_mpol); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) @@ -2551,7 +2557,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) @@ -3995,8 +4001,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping, repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { - folio = filemap_alloc_folio(gfp, - mapping_min_folio_order(mapping)); + folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eb83cff7db8c..3d797d47a040 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -354,6 +354,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) return &default_policy; } +EXPORT_SYMBOL_FOR_MODULES(get_task_policy, "kvm"); static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); @@ -487,6 +488,7 @@ void __mpol_put(struct mempolicy *pol) return; kmem_cache_free(policy_cache, pol); } +EXPORT_SYMBOL_FOR_MODULES(__mpol_put, "kvm"); static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { @@ -2885,6 +2887,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, read_unlock(&sp->lock); return pol; } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_lookup, "kvm"); static void sp_free(struct sp_node *n) { @@ -3170,6 +3173,7 @@ put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } +EXPORT_SYMBOL_FOR_MODULES(mpol_shared_policy_init, "kvm"); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *pol) @@ -3188,6 +3192,7 @@ int mpol_set_shared_policy(struct shared_policy *sp, sp_free(new); return err; } +EXPORT_SYMBOL_FOR_MODULES(mpol_set_shared_policy, "kvm"); /* Free a backing policy store on inode delete. 
*/ void mpol_free_shared_policy(struct shared_policy *sp) @@ -3206,6 +3211,7 @@ void mpol_free_shared_policy(struct shared_policy *sp) } write_unlock(&sp->lock); } +EXPORT_SYMBOL_FOR_MODULES(mpol_free_shared_policy, "kvm"); #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; diff --git a/mm/readahead.c b/mm/readahead.c index 3a4b5d58eeb6..b415c9969176 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -186,7 +186,7 @@ static struct folio *ractl_alloc_folio(struct readahead_control *ractl, { struct folio *folio; - folio = filemap_alloc_folio(gfp_mask, order); + folio = filemap_alloc_folio(gfp_mask, order, NULL); if (folio && ractl->dropbehind) __folio_set_dropbehind(folio); diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index d9fffe06d3ea..f2b223072b62 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -6,7 +6,7 @@ ARCH ?= $(SUBARCH) ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64 loongarch)) # Top-level selftests allows ARCH=x86_64 :-( ifeq ($(ARCH),x86_64) - ARCH := x86 + override ARCH := x86 endif include Makefile.kvm else diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..1b5b6668c803 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -88,8 +88,12 @@ TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test +TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test +TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test @@ -111,14 +115,12 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_close_while_nested_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test -TEST_GEN_PROGS_x86 += x86/vmx_tsc_adjust_test -TEST_GEN_PROGS_x86 += x86/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test @@ -210,6 +212,7 @@ TEST_GEN_PROGS_riscv += mmu_stress_test TEST_GEN_PROGS_riscv += rseq_test TEST_GEN_PROGS_riscv += steal_time +TEST_GEN_PROGS_loongarch = arch_timer TEST_GEN_PROGS_loongarch += coalesced_io_test TEST_GEN_PROGS_loongarch += demand_paging_test TEST_GEN_PROGS_loongarch += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 6338f5bbdb70..8d7758f12280 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -636,7 +636,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) - close(fd[f]); + kvm_close(fd[f]); } 
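A minimal sketch (not part of the diff above), assuming a hypothetical caller named example_cache_alloc(): it shows how the widened filemap_alloc_folio() signature from the mm/filemap.c and filesystem hunks is meant to be used. Passing NULL keeps the historical cpuset-spread/node-local placement, while a non-NULL mempolicy (as guest_memfd can now obtain through the mpol_* helpers exported to kvm.ko above) routes the allocation through folio_alloc_mpol_noprof().

static struct folio *example_cache_alloc(struct address_space *mapping,
					 struct mempolicy *policy)
{
	/* Same gfp constraint the btrfs/erofs callers in this series use. */
	gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS);

	/* Order-0 folio; placement follows 'policy' only when it is non-NULL. */
	return filemap_alloc_folio(gfp, 0, policy);
}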
/* handles the valid case: intid=0xffffffff num=1 */ diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index e7d9aeb418d3..618c937f3c90 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -19,6 +19,7 @@ #include <sys/stat.h> #include "kvm_util.h" +#include "numaif.h" #include "test_util.h" #include "ucall_common.h" @@ -75,6 +76,101 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } +static void test_mbind(int fd, size_t total_size) +{ + const unsigned long nodemask_0 = 1; /* nid: 0 */ + unsigned long nodemask = 0; + unsigned long maxnode = 8; + int policy; + char *mem; + int ret; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + /* Test MPOL_INTERLEAVE policy */ + kvm_mbind(mem, page_size * 2, MPOL_INTERLEAVE, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_INTERLEAVE && nodemask == nodemask_0, + "Wanted MPOL_INTERLEAVE (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_INTERLEAVE, nodemask_0, policy, nodemask); + + /* Test basic MPOL_BIND policy */ + kvm_mbind(mem + page_size * 2, page_size * 2, MPOL_BIND, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem + page_size * 2, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_BIND && nodemask == nodemask_0, + "Wanted MPOL_BIND (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_BIND, nodemask_0, policy, nodemask); + + /* Test MPOL_DEFAULT policy */ + kvm_mbind(mem, total_size, MPOL_DEFAULT, NULL, 0, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_DEFAULT && !nodemask, + "Wanted MPOL_DEFAULT (%u) and nodemask 0x0, got %u and 0x%lx", + MPOL_DEFAULT, policy, nodemask); + + /* Test with invalid policy */ + ret = mbind(mem, page_size, 999, &nodemask_0, maxnode, 0); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "mbind with invalid policy should fail with EINVAL"); + + kvm_munmap(mem, total_size); +} + +static void test_numa_allocation(int fd, size_t total_size) +{ + unsigned long node0_mask = 1; /* Node 0 */ + unsigned long node1_mask = 2; /* Node 1 */ + unsigned long maxnode = 8; + void *pages[4]; + int status[4]; + char *mem; + int i; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + for (i = 0; i < 4; i++) + pages[i] = (char *)mem + page_size * i; + + /* Set NUMA policy after allocation */ + memset(mem, 0xaa, page_size); + kvm_mbind(pages[0], page_size, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, page_size); + + /* Set NUMA policy before allocation */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + /* Validate if pages are allocated on specified NUMA nodes */ + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 1, "Expected page 0 on node 1, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 1, "Expected page 1 on node 1, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 0, "Expected page 2 on node 0, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 0, "Expected page 3 on node 0, got it on node %d", status[3]); + + /* Punch hole for all pages */ + 
kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, total_size); + + /* Change NUMA policy nodes and reallocate */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 0, "Expected page 0 on node 0, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 0, "Expected page 1 on node 0, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 1, "Expected page 2 on node 1, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 1, "Expected page 3 on node 1, got it on node %d", status[3]); + + kvm_munmap(mem, total_size); +} + static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) { const char val = 0xaa; @@ -273,11 +369,13 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { gmem_test(mmap_supported, vm, flags); gmem_test(fault_overflow, vm, flags); + gmem_test(numa_allocation, vm, flags); } else { gmem_test(fault_private, vm, flags); } gmem_test(mmap_cow, vm, flags); + gmem_test(mbind, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); } diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h new file mode 100644 index 000000000000..d4e613162bba --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_SYSCALLS_H +#define SELFTEST_KVM_SYSCALLS_H + +#include <sys/syscall.h> + +#define MAP_ARGS0(m,...) +#define MAP_ARGS1(m,t,a,...) m(t,a) +#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) +#define MAP_ARGS3(m,t,a,...) m(t,a), MAP_ARGS2(m,__VA_ARGS__) +#define MAP_ARGS4(m,t,a,...) m(t,a), MAP_ARGS3(m,__VA_ARGS__) +#define MAP_ARGS5(m,t,a,...) m(t,a), MAP_ARGS4(m,__VA_ARGS__) +#define MAP_ARGS6(m,t,a,...) m(t,a), MAP_ARGS5(m,__VA_ARGS__) +#define MAP_ARGS(n,...) MAP_ARGS##n(__VA_ARGS__) + +#define __DECLARE_ARGS(t, a) t a +#define __UNPACK_ARGS(t, a) a + +#define DECLARE_ARGS(nr_args, args...) MAP_ARGS(nr_args, __DECLARE_ARGS, args) +#define UNPACK_ARGS(nr_args, args...) MAP_ARGS(nr_args, __UNPACK_ARGS, args) + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* Define a kvm_<syscall>() API to assert success. */ +#define __KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline void kvm_##name(DECLARE_ARGS(nr_args, args)) \ +{ \ + int r; \ + \ + r = name(UNPACK_ARGS(nr_args, args)); \ + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR(#name, r)); \ +} + +/* + * Macro to define syscall APIs, either because KVM selftests doesn't link to + * the standard library, e.g. libnuma, or because there is no library that yet + * provides the syscall. These + */ +#define KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline long name(DECLARE_ARGS(nr_args, args)) \ +{ \ + return syscall(__NR_##name, UNPACK_ARGS(nr_args, args)); \ +} \ +__KVM_SYSCALL_DEFINE(name, nr_args, args) + +/* + * Special case mmap(), as KVM selftest rarely/never specific an address, + * rarely specify an offset, and because the unique return code requires + * special handling anyways. 
+ */ +static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, + off_t offset) +{ + void *mem; + + mem = mmap(NULL, size, prot, flags, fd, offset); + TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", + (int)(unsigned long)MAP_FAILED)); + return mem; +} + +static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) +{ + return __kvm_mmap(size, prot, flags, fd, 0); +} + +static inline int kvm_dup(int fd) +{ + int new_fd = dup(fd); + + TEST_ASSERT(new_fd >= 0, __KVM_SYSCALL_ERROR("dup()", new_fd)); + return new_fd; +} + +__KVM_SYSCALL_DEFINE(munmap, 2, void *, mem, size_t, size); +__KVM_SYSCALL_DEFINE(close, 1, int, fd); +__KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, len); +__KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length); + +#endif /* SELFTEST_KVM_SYSCALLS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..94c219c3877c 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -23,6 +23,7 @@ #include <pthread.h> +#include "kvm_syscalls.h" #include "kvm_util_arch.h" #include "kvm_util_types.h" #include "sparsebit.h" @@ -177,7 +178,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_16K, VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_PXXVYY_4K, /* For 48-bit or 57-bit VA, depending on host support */ VM_MODE_P47V64_4K, VM_MODE_P44V64_4K, VM_MODE_P36V48_4K, @@ -219,7 +220,7 @@ extern enum vm_guest_mode vm_mode_default; #elif defined(__x86_64__) -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define VM_MODE_DEFAULT VM_MODE_PXXVYY_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 8) @@ -283,34 +284,6 @@ static inline bool kvm_has_cap(long cap) return kvm_check_cap(cap); } -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, - off_t offset) -{ - void *mem; - - mem = mmap(NULL, size, prot, flags, fd, offset); - TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", - (int)(unsigned long)MAP_FAILED)); - - return mem; -} - -static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) -{ - return __kvm_mmap(size, prot, flags, fd, 0); -} - -static inline void kvm_munmap(void *mem, size_t size) -{ - int ret; - - ret = munmap(mem, size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); -} - /* * Use the "inner", double-underscore macro when reporting errors from within * other macros so that the name of ioctl() and not its literal numeric value @@ -700,12 +673,12 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag uint32_t guest_memfd, uint64_t guest_memfd_offset); void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); + enum vm_mem_backing_src_type src_type, + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags); void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd_fd, uint64_t guest_memfd_offset); #ifndef vm_arch_has_protected_memory static inline bool 
vm_arch_has_protected_memory(struct kvm_vm *vm) @@ -1230,6 +1203,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { virt_arch_pg_map(vm, vaddr, paddr); + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } diff --git a/tools/testing/selftests/kvm/include/loongarch/arch_timer.h b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h new file mode 100644 index 000000000000..2ed106b32c81 --- /dev/null +++ b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * LoongArch Constant Timer specific interface + */ +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include "processor.h" + +/* LoongArch timer frequency is constant 100MHZ */ +#define TIMER_FREQ (100UL << 20) +#define msec_to_cycles(msec) (TIMER_FREQ * (unsigned long)(msec) / 1000) +#define usec_to_cycles(usec) (TIMER_FREQ * (unsigned long)(usec) / 1000000) +#define cycles_to_usec(cycles) ((unsigned long)(cycles) * 1000000 / TIMER_FREQ) + +static inline unsigned long timer_get_cycles(void) +{ + unsigned long val = 0; + + __asm__ __volatile__( + "rdtime.d %0, $zero\n\t" + : "=r"(val) + : + ); + + return val; +} + +static inline unsigned long timer_get_cfg(void) +{ + return csr_read(LOONGARCH_CSR_TCFG); +} + +static inline unsigned long timer_get_val(void) +{ + return csr_read(LOONGARCH_CSR_TVAL); +} + +static inline void disable_timer(void) +{ + csr_write(0, LOONGARCH_CSR_TCFG); +} + +static inline void timer_irq_enable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val |= ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_irq_disable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val &= ~ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_set_next_cmp_ms(unsigned int msec, bool period) +{ + unsigned long val; + + val = msec_to_cycles(msec) & CSR_TCFG_VAL; + val |= CSR_TCFG_EN; + if (period) + val |= CSR_TCFG_PERIOD; + csr_write(val, LOONGARCH_CSR_TCFG); +} + +static inline void __delay(uint64_t cycles) +{ + uint64_t start = timer_get_cycles(); + + while ((timer_get_cycles() - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h index 6427a3275e6a..76840ddda57d 100644 --- a/tools/testing/selftests/kvm/include/loongarch/processor.h +++ b/tools/testing/selftests/kvm/include/loongarch/processor.h @@ -83,7 +83,14 @@ #define LOONGARCH_CSR_PRMD 0x1 #define LOONGARCH_CSR_EUEN 0x2 #define LOONGARCH_CSR_ECFG 0x4 +#define ECFGB_TIMER 11 +#define ECFGF_TIMER (BIT_ULL(ECFGB_TIMER)) #define LOONGARCH_CSR_ESTAT 0x5 /* Exception status */ +#define CSR_ESTAT_EXC_SHIFT 16 +#define CSR_ESTAT_EXC_WIDTH 6 +#define CSR_ESTAT_EXC (0x3f << CSR_ESTAT_EXC_SHIFT) +#define EXCCODE_INT 0 /* Interrupt */ +#define INT_TI 11 /* Timer interrupt*/ #define LOONGARCH_CSR_ERA 0x6 /* ERA */ #define LOONGARCH_CSR_BADV 0x7 /* Bad virtual address */ #define LOONGARCH_CSR_EENTRY 0xc @@ -106,6 +113,14 @@ #define LOONGARCH_CSR_KS1 0x31 #define LOONGARCH_CSR_TMID 0x40 #define LOONGARCH_CSR_TCFG 0x41 +#define CSR_TCFG_VAL (BIT_ULL(48) - BIT_ULL(2)) +#define CSR_TCFG_PERIOD_SHIFT 1 +#define CSR_TCFG_PERIOD (0x1UL << 
CSR_TCFG_PERIOD_SHIFT) +#define CSR_TCFG_EN (0x1UL) +#define LOONGARCH_CSR_TVAL 0x42 +#define LOONGARCH_CSR_TINTCLR 0x44 /* Timer interrupt clear */ +#define CSR_TINTCLR_TI_SHIFT 0 +#define CSR_TINTCLR_TI (1 << CSR_TINTCLR_TI_SHIFT) /* TLB refill exception entry */ #define LOONGARCH_CSR_TLBRENTRY 0x88 #define LOONGARCH_CSR_TLBRSAVE 0x8b @@ -113,6 +128,28 @@ #define CSR_TLBREHI_PS_SHIFT 0 #define CSR_TLBREHI_PS (0x3fUL << CSR_TLBREHI_PS_SHIFT) +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__( \ + "csrrd %[val], %[reg]\n\t" \ + : [val] "=r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + +#define csr_write(v, csr) \ +({ \ + register unsigned long __v = v; \ + __asm__ __volatile__ ( \ + "csrwr %[val], %[reg]\n\t" \ + : [val] "+r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + #define EXREGS_GPRS (32) #ifndef __ASSEMBLER__ @@ -124,18 +161,60 @@ struct ex_regs { unsigned long pc; unsigned long estat; unsigned long badv; + unsigned long prmd; }; #define PC_OFFSET_EXREGS offsetof(struct ex_regs, pc) #define ESTAT_OFFSET_EXREGS offsetof(struct ex_regs, estat) #define BADV_OFFSET_EXREGS offsetof(struct ex_regs, badv) +#define PRMD_OFFSET_EXREGS offsetof(struct ex_regs, prmd) #define EXREGS_SIZE sizeof(struct ex_regs) +#define VECTOR_NUM 64 + +typedef void(*handler_fn)(struct ex_regs *); + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM]; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler); + +static inline void cpu_relax(void) +{ + asm volatile("nop" ::: "memory"); +} + +static inline void local_irq_enable(void) +{ + unsigned int flags = CSR_CRMD_IE; + register unsigned int mask asm("$t0") = CSR_CRMD_IE; + + __asm__ __volatile__( + "csrxchg %[val], %[mask], %[reg]\n\t" + : [val] "+r" (flags) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) + : "memory"); +} + +static inline void local_irq_disable(void) +{ + unsigned int flags = 0; + register unsigned int mask asm("$t0") = CSR_CRMD_IE; + + __asm__ __volatile__( + "csrxchg %[val], %[mask], %[reg]\n\t" + : [val] "+r" (flags) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) + : "memory"); +} #else #define PC_OFFSET_EXREGS ((EXREGS_GPRS + 0) * 8) #define ESTAT_OFFSET_EXREGS ((EXREGS_GPRS + 1) * 8) #define BADV_OFFSET_EXREGS ((EXREGS_GPRS + 2) * 8) -#define EXREGS_SIZE ((EXREGS_GPRS + 3) * 8) +#define PRMD_OFFSET_EXREGS ((EXREGS_GPRS + 3) * 8) +#define EXREGS_SIZE ((EXREGS_GPRS + 4) * 8) #endif #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index b020547403fd..29572a6d789c 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -1,55 +1,83 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/numaif.h - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Header file that provides access to NUMA API functions not explicitly - * exported to user space. - */ +/* Copyright (C) 2020, Google LLC. 
*/ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H -#define __NR_get_mempolicy 239 -#define __NR_migrate_pages 256 +#include <dirent.h> -/* System calls */ -long get_mempolicy(int *policy, const unsigned long *nmask, - unsigned long maxnode, void *addr, int flags) +#include <linux/mempolicy.h> + +#include "kvm_syscalls.h" + +KVM_SYSCALL_DEFINE(get_mempolicy, 5, int *, policy, const unsigned long *, nmask, + unsigned long, maxnode, void *, addr, int, flags); + +KVM_SYSCALL_DEFINE(set_mempolicy, 3, int, mode, const unsigned long *, nmask, + unsigned long, maxnode); + +KVM_SYSCALL_DEFINE(set_mempolicy_home_node, 4, unsigned long, start, + unsigned long, len, unsigned long, home_node, + unsigned long, flags); + +KVM_SYSCALL_DEFINE(migrate_pages, 4, int, pid, unsigned long, maxnode, + const unsigned long *, frommask, const unsigned long *, tomask); + +KVM_SYSCALL_DEFINE(move_pages, 6, int, pid, unsigned long, count, void *, pages, + const int *, nodes, int *, status, int, flags); + +KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, + const unsigned long *, nodemask, unsigned long, maxnode, + unsigned int, flags); + +static inline int get_max_numa_node(void) { - return syscall(__NR_get_mempolicy, policy, nmask, - maxnode, addr, flags); + struct dirent *de; + int max_node = 0; + DIR *d; + + /* + * Assume there's a single node if the kernel doesn't support NUMA, + * or if no nodes are found. + */ + d = opendir("/sys/devices/system/node"); + if (!d) + return 0; + + while ((de = readdir(d)) != NULL) { + int node_id; + char *endptr; + + if (strncmp(de->d_name, "node", 4) != 0) + continue; + + node_id = strtol(de->d_name + 4, &endptr, 10); + if (*endptr != '\0') + continue; + + if (node_id > max_node) + max_node = node_id; + } + closedir(d); + + return max_node; } -long migrate_pages(int pid, unsigned long maxnode, - const unsigned long *frommask, - const unsigned long *tomask) +static bool is_numa_available(void) { - return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); + /* + * Probe for NUMA by doing a dummy get_mempolicy(). If the syscall + * fails with ENOSYS, then the kernel was built without NUMA support. + * if the syscall fails with EPERM, then the process/user lacks the + * necessary capabilities (CAP_SYS_NICE). + */ + return !get_mempolicy(NULL, NULL, 0, NULL, 0) || + (errno != ENOSYS && errno != EPERM); } -/* Policies */ -#define MPOL_DEFAULT 0 -#define MPOL_PREFERRED 1 -#define MPOL_BIND 2 -#define MPOL_INTERLEAVE 3 - -#define MPOL_MAX MPOL_INTERLEAVE - -/* Flags for get_mem_policy */ -#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ - /* Warning: MPOL_F_NODE is unsupported and - * subject to change. Don't use. 
- */ -#define MPOL_F_ADDR (1<<1) /* look up vma using address */ -#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ - -/* Flags for mbind */ -#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ +static inline bool is_multi_numa_node_system(void) +{ + return is_numa_available() && get_max_numa_node() >= 1; +} #endif /* SELFTEST_KVM_NUMAIF_H */ diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 51cd84b9ca66..57d62a425109 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1441,7 +1441,7 @@ enum pg_level { PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, - PG_LEVEL_NUM + PG_LEVEL_256T }; #define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index edb3c391b982..96e2b4c630a9 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -568,8 +568,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index f02355c3c4c2..b7dbde9c0843 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -239,14 +239,14 @@ int main(int argc, char *argv[]) * single stats file works and doesn't cause explosions. */ vm_stats_fds = vm_get_stats_fd(vms[i]); - stats_test(dup(vm_stats_fds)); + stats_test(kvm_dup(vm_stats_fds)); /* Verify userspace can instantiate multiple stats files. 
*/ stats_test(vm_get_stats_fd(vms[i])); for (j = 0; j < max_vcpu; ++j) { vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); - stats_test(dup(vcpu_stats_fds[j])); + stats_test(kvm_dup(vcpu_stats_fds[j])); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 54f6d17c78f7..d46e4b13b92c 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -324,7 +324,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* Configure base granule size */ switch (vm->mode) { - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: TEST_FAIL("AArch64 does not support 4K sized pages " "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..b81b2df0512e 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -201,7 +201,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", - [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_PXXVYY_4K] = "PA-bits:ANY, VA-bits:48 or 57, 4K pages", [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", @@ -228,7 +228,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 }, [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, - [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_PXXVYY_4K] = { 0, 0, 0x1000, 12 }, [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, @@ -310,24 +310,26 @@ struct kvm_vm *____vm_create(struct vm_shape shape) case VM_MODE_P36V47_16K: vm->pgtable_levels = 3; break; - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); kvm_init_vm_address_properties(vm); - /* - * Ignore KVM support for 5-level paging (vm->va_bits == 57), - * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this mode (48-bit virtual address space). 
- */ - TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, - "Linear address width (%d bits) not supported", - vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", vm->pa_bits); - vm->pgtable_levels = 4; - vm->va_bits = 48; + pr_debug("Guest virtual address width detected: %d\n", + vm->va_bits); + + if (vm->va_bits == 57) { + vm->pgtable_levels = 5; + } else { + TEST_ASSERT(vm->va_bits == 48, + "Unexpected guest virtual address width: %d", + vm->va_bits); + vm->pgtable_levels = 4; + } #else - TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); + TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: @@ -704,8 +706,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) static void kvm_stats_release(struct kvm_binary_stats *stats) { - int ret; - if (stats->fd < 0) return; @@ -714,8 +714,7 @@ static void kvm_stats_release(struct kvm_binary_stats *stats) stats->desc = NULL; } - ret = close(stats->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(stats->fd); stats->fd = -1; } @@ -738,8 +737,6 @@ __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) */ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { - int ret; - if (vcpu->dirty_gfns) { kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size); vcpu->dirty_gfns = NULL; @@ -747,9 +744,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_munmap(vcpu->run, vcpu_mmap_sz()); - ret = close(vcpu->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - + kvm_close(vcpu->fd); kvm_stats_release(&vcpu->stats); list_del(&vcpu->list); @@ -761,16 +756,12 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) void kvm_vm_release(struct kvm_vm *vmp) { struct kvm_vcpu *vcpu, *tmp; - int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) vm_vcpu_rm(vmp, vcpu); - ret = close(vmp->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - - ret = close(vmp->kvm_fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(vmp->fd); + kvm_close(vmp->kvm_fd); /* Free cached stats metadata and close FD */ kvm_stats_release(&vmp->stats); @@ -828,7 +819,7 @@ void kvm_vm_free(struct kvm_vm *vmp) int kvm_memfd_alloc(size_t size, bool hugepages) { int memfd_flags = MFD_CLOEXEC; - int fd, r; + int fd; if (hugepages) memfd_flags |= MFD_HUGETLB; @@ -836,11 +827,8 @@ int kvm_memfd_alloc(size_t size, bool hugepages) fd = memfd_create("kvm_selftest", memfd_flags); TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); - r = ftruncate(fd, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); - - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_ftruncate(fd, size); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); return fd; } @@ -957,8 +945,8 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags /* FIXME: This thing needs to be ripped apart and rewritten. 
*/ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; @@ -972,30 +960,29 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); - TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " + TEST_ASSERT((gpa % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" - " guest_paddr: 0x%lx vm->page_size: 0x%x", - guest_paddr, vm->page_size); - TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) + " gpa: 0x%lx vm->page_size: 0x%x", + gpa, vm->page_size); + TEST_ASSERT((((gpa >> vm->page_shift) + npages) - 1) <= vm->max_gfn, "Physical range beyond maximum " "supported physical address,\n" - " guest_paddr: 0x%lx npages: 0x%lx\n" + " gpa: 0x%lx npages: 0x%lx\n" " vm->max_gfn: 0x%lx vm->page_size: 0x%x", - guest_paddr, npages, vm->max_gfn, vm->page_size); + gpa, npages, vm->max_gfn, vm->page_size); /* * Confirm a mem region with an overlapping address doesn't * already exist. */ region = (struct userspace_mem_region *) userspace_mem_region_find( - vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); + vm, gpa, (gpa + npages * vm->page_size) - 1); if (region != NULL) TEST_FAIL("overlapping userspace_mem_region already " "exists\n" - " requested guest_paddr: 0x%lx npages: 0x%lx " - "page_size: 0x%x\n" - " existing guest_paddr: 0x%lx size: 0x%lx", - guest_paddr, npages, vm->page_size, + " requested gpa: 0x%lx npages: 0x%lx page_size: 0x%x\n" + " existing gpa: 0x%lx size: 0x%lx", + gpa, npages, vm->page_size, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); @@ -1009,8 +996,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", - slot, guest_paddr, npages, - region->region.slot, + slot, gpa, npages, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); } @@ -1036,7 +1022,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, if (src_type == VM_MEM_SRC_ANONYMOUS_THP) alignment = max(backing_src_pagesz, alignment); - TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); + TEST_ASSERT_EQ(gpa, align_up(gpa, backing_src_pagesz)); /* Add enough memory to align up if necessary */ if (alignment > 1) @@ -1084,8 +1070,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, * needing to track if the fd is owned by the framework * or by the caller. 
*/ - guest_memfd = dup(guest_memfd); - TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + guest_memfd = kvm_dup(guest_memfd); } region->region.guest_memfd = guest_memfd; @@ -1097,20 +1082,18 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, region->unused_phy_pages = sparsebit_alloc(); if (vm_arch_has_protected_memory(vm)) region->protected_phy_pages = sparsebit_alloc(); - sparsebit_set_num(region->unused_phy_pages, - guest_paddr >> vm->page_shift, npages); + sparsebit_set_num(region->unused_phy_pages, gpa >> vm->page_shift, npages); region->region.slot = slot; region->region.flags = flags; - region->region.guest_phys_addr = guest_paddr; + region->region.guest_phys_addr = gpa; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", - ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size, + " guest_phys_addr: 0x%lx size: 0x%llx guest_memfd: %d", + ret, errno, slot, flags, gpa, region->region.memory_size, region->region.guest_memfd); /* Add to quick lookup data structures */ @@ -1132,10 +1115,10 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, - uint64_t npages, uint32_t flags) + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags) { - vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); + vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0); } /* @@ -1456,8 +1439,6 @@ static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, pages--, vaddr += vm->page_size, paddr += vm->page_size) { virt_pg_map(vm, vaddr, paddr); - - sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } return vaddr_start; @@ -1571,7 +1552,6 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, while (npages--) { virt_pg_map(vm, vaddr, paddr); - sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); vaddr += page_size; paddr += page_size; @@ -2305,11 +2285,35 @@ __weak void kvm_selftest_arch_init(void) { } +static void report_unexpected_signal(int signum) +{ +#define KVM_CASE_SIGNUM(sig) \ + case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum) + + switch (signum) { + KVM_CASE_SIGNUM(SIGBUS); + KVM_CASE_SIGNUM(SIGSEGV); + KVM_CASE_SIGNUM(SIGILL); + KVM_CASE_SIGNUM(SIGFPE); + default: + TEST_FAIL("Unexpected signal %d\n", signum); + } +} + void __attribute((constructor)) kvm_selftest_init(void) { + struct sigaction sig_sa = { + .sa_handler = report_unexpected_signal, + }; + /* Tell stdout not to buffer its content. 
*/ setbuf(stdout, NULL); + sigaction(SIGBUS, &sig_sa, NULL); + sigaction(SIGSEGV, &sig_sa, NULL); + sigaction(SIGILL, &sig_sa, NULL); + sigaction(SIGFPE, &sig_sa, NULL); + guest_random_seed = last_guest_seed = random(); pr_info("Random seed: 0x%x\n", guest_random_seed); diff --git a/tools/testing/selftests/kvm/lib/loongarch/exception.S b/tools/testing/selftests/kvm/lib/loongarch/exception.S index 88bfa505c6f5..3f1e4b67c5ae 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/exception.S +++ b/tools/testing/selftests/kvm/lib/loongarch/exception.S @@ -51,9 +51,15 @@ handle_exception: st.d t0, sp, ESTAT_OFFSET_EXREGS csrrd t0, LOONGARCH_CSR_BADV st.d t0, sp, BADV_OFFSET_EXREGS + csrrd t0, LOONGARCH_CSR_PRMD + st.d t0, sp, PRMD_OFFSET_EXREGS or a0, sp, zero bl route_exception + ld.d t0, sp, PC_OFFSET_EXREGS + csrwr t0, LOONGARCH_CSR_ERA + ld.d t0, sp, PRMD_OFFSET_EXREGS + csrwr t0, LOONGARCH_CSR_PRMD restore_gprs sp csrrd sp, LOONGARCH_CSR_KS0 ertn diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 0ac1abcb71cb..07c103369ddb 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -3,6 +3,7 @@ #include <assert.h> #include <linux/compiler.h> +#include <asm/kvm.h> #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" @@ -11,6 +12,7 @@ #define LOONGARCH_GUEST_STACK_VADDR_MIN 0x200000 static vm_paddr_t invalid_pgtable[4]; +static vm_vaddr_t exception_handlers; static uint64_t virt_pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { @@ -183,7 +185,14 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) void route_exception(struct ex_regs *regs) { + int vector; unsigned long pc, estat, badv; + struct handlers *handlers; + + handlers = (struct handlers *)exception_handlers; + vector = (regs->estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + if (handlers && handlers->exception_handlers[vector]) + return handlers->exception_handlers[vector](regs); pc = regs->pc; badv = regs->badv; @@ -192,6 +201,32 @@ void route_exception(struct ex_regs *regs) while (1) ; } +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + void *addr; + + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + LOONGARCH_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA); + + addr = addr_gva2hva(vm, vm->handlers); + memset(addr, 0, vm->page_size); + exception_handlers = vm->handlers; + sync_global_to_guest(vm, exception_handlers); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return csr_read(LOONGARCH_CSR_CPUID); +} + void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { int i; @@ -211,6 +246,11 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
vcpu_regs_set(vcpu, ®s); } +static void loongarch_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + __vcpu_set_reg(vcpu, id, val); +} + static void loongarch_get_csr(struct kvm_vcpu *vcpu, uint64_t id, void *addr) { uint64_t csrid; @@ -242,8 +282,8 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - /* user mode and page enable mode */ - val = PLV_USER | CSR_CRMD_PG; + /* kernel mode and page enable mode */ + val = PLV_KERN | CSR_CRMD_PG; loongarch_set_csr(vcpu, LOONGARCH_CSR_CRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_PRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_EUEN, 1); @@ -251,7 +291,10 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) loongarch_set_csr(vcpu, LOONGARCH_CSR_TCFG, 0); loongarch_set_csr(vcpu, LOONGARCH_CSR_ASID, 1); + /* time count start from 0 */ val = 0; + loongarch_set_reg(vcpu, KVM_REG_LOONGARCH_COUNTER, val); + width = vm->page_shift - 3; switch (vm->pgtable_levels) { diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 7f5d62a65c68..0b1f288ad556 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -63,7 +63,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); /* * Identity map the first 4G and the test region with 1G pages so that diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index b418502c5ecc..36104d27f3d9 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -158,10 +158,10 @@ bool kvm_is_tdp_enabled(void) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create page map l4 table. */ + /* If needed, create the top-level page table. */ if (!vm->pgd_created) { vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; @@ -218,11 +218,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; + uint64_t *pte = &vm->pgd; + int current_level; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, - "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((vaddr % pg_size) == 0, "Virtual address not aligned,\n" @@ -243,20 +243,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. 
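* The walk starts at vm->pgtable_levels and descends one level per
* iteration down to PG_LEVEL_4K, so the same loop covers both 4-level
* (512G -> 1G -> 2M) and 5-level paging without open-coding each level.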
*/ - pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); - if (*pml4e & PTE_LARGE_MASK) - return; - - pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); - if (*pdpe & PTE_LARGE_MASK) - return; - - pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); - if (*pde & PTE_LARGE_MASK) - return; + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + current_level, level); + if (*pte & PTE_LARGE_MASK) + return; + } /* Fill in page table entry. */ - pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -289,6 +286,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, for (i = 0; i < nr_pages; i++) { __virt_pg_map(vm, vaddr, paddr, level); + sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift, + nr_bytes / PAGE_SIZE); vaddr += pg_size; paddr += pg_size; @@ -310,40 +309,38 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - uint64_t *pml4e, *pdpe, *pde; + int va_width = 12 + (vm->pgtable_levels) * 9; + uint64_t *pte = &vm->pgd; + int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); /* - * Based on the mode check above there are 48 bits in the vaddr, so - * shift 16 to sign extend the last bit (bit-47), + * Check that the vaddr is a sign-extended va_width value. */ - TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), - "Canonical check failed. The virtual address is invalid."); - - pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); - if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) - return pml4e; - - pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); - if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) - return pdpe; - - pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); - if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) - return pde; + TEST_ASSERT(vaddr == + (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), + "Canonical check failed. 
The virtual address is invalid."); + + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_get_pte(vm, pte, vaddr, current_level); + if (vm_is_target_pte(pte, level, current_level)) + return pte; + } - return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) @@ -526,7 +523,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; - TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); @@ -540,6 +538,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; + if (vm->pgtable_levels == 5) + sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index d4d1208dd023..29b082a58daa 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -401,11 +401,11 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; uint16_t index; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx requires 5-level paging", + "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", nested_paddr); TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" @@ -534,8 +534,7 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) { TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); diff --git a/tools/testing/selftests/kvm/loongarch/arch_timer.c b/tools/testing/selftests/kvm/loongarch/arch_timer.c new file mode 100644 index 000000000000..355ecac30954 --- /dev/null +++ b/tools/testing/selftests/kvm/loongarch/arch_timer.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The test validates periodic/one-shot constant timer IRQ using + * CSR.TCFG and CSR.TVAL registers. 
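+ * The guest first verifies that the stable counter increases
+ * monotonically, then exercises the periodic timer, the one-shot timer,
+ * and finally one-shot expiry while waiting in idle.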
+ */ +#include "arch_timer.h" +#include "kvm_util.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" + +static void do_idle(void) +{ + unsigned int intid; + unsigned long estat; + + __asm__ __volatile__("idle 0" : : : "memory"); + + estat = csr_read(LOONGARCH_CSR_ESTAT); + intid = !!(estat & BIT(INT_TI)); + + /* Make sure pending timer IRQ arrived */ + GUEST_ASSERT_EQ(intid, 1); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid; + uint32_t cpu = guest_get_vcpuid(); + uint64_t xcnt, val, cfg, xcnt_diff_us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + intid = !!(regs->estat & BIT(INT_TI)); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, 1); + + cfg = timer_get_cfg(); + if (cfg & CSR_TCFG_PERIOD) { + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter - 1); + if (shared_data->nr_iter == 0) + disable_timer(); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + return; + } + + /* + * On real machine, value of LOONGARCH_CSR_TVAL is BIT_ULL(48) - 1 + * On virtual machine, its value counts down from BIT_ULL(48) - 1 + */ + val = timer_get_val(); + xcnt = timer_get_cycles(); + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Basic 'timer condition met' check */ + __GUEST_ASSERT(val > cfg, + "val = 0x%lx, cfg = 0x%lx, xcnt_diff_us = 0x%lx", + val, cfg, xcnt_diff_us); + + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_test_period_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = test_args.nr_iter; + shared_data->xcnt = timer_get_cycles(); + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + timer_set_next_cmp_ms(test_args.timer_period_ms, true); + + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + } + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(irq_iter == 0, + "irq_iter = 0x%x.\n" + " Guest period timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + irq_iter); +} + +static void guest_test_oneshot_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = 0; + shared_data->guest_stage = 0; + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_test_emulate_timer(uint32_t cpu) +{ + uint32_t config_iter; + uint64_t xcnt_diff_us, us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + local_irq_disable(); + shared_data->nr_iter = 0; + us = 
msecs_to_usecs(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + do_idle(); + + xcnt_diff_us = cycles_to_usec(timer_get_cycles() - shared_data->xcnt); + __GUEST_ASSERT(xcnt_diff_us >= us, + "xcnt_diff_us = 0x%lx, us = 0x%lx.\n", + xcnt_diff_us, us); + } + local_irq_enable(); +} + +static void guest_time_count_test(uint32_t cpu) +{ + uint32_t config_iter; + unsigned long start, end, prev, us; + + /* Assuming that test case starts to run in 1 second */ + start = timer_get_cycles(); + us = msec_to_cycles(1000); + __GUEST_ASSERT(start <= us, + "start = 0x%lx, us = 0x%lx.\n", + start, us); + + us = msec_to_cycles(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + start = timer_get_cycles(); + end = start + us; + /* test time count growing up always */ + while (start < end) { + prev = start; + start = timer_get_cycles(); + __GUEST_ASSERT(prev <= start, + "prev = 0x%lx, start = 0x%lx.\n", + prev, start); + } + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + + /* must run at first */ + guest_time_count_test(cpu); + + timer_irq_enable(); + local_irq_enable(); + guest_test_period_timer(cpu); + guest_test_oneshot_timer(cpu); + guest_test_emulate_timer(cpu); + + GUEST_DONE(); +} + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + vm_init_descriptor_tables(vm); + vm_install_exception_handler(vm, EXCCODE_INT, guest_irq_handler); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + return vm; +} + +void test_vm_cleanup(struct kvm_vm *vm) +{ + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 37b7e6524533..51c070556f3e 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -263,8 +263,10 @@ static void calc_default_nr_vcpus(void) TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, strerror(errno)); - nr_vcpus = CPU_COUNT(&possible_mask) * 3/4; + nr_vcpus = CPU_COUNT(&possible_mask); TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?"); + if (nr_vcpus >= 2) + nr_vcpus = nr_vcpus * 3/4; } int main(int argc, char *argv[]) @@ -360,11 +362,9 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ /* Identity map memory in the guest using 1gb pages. 
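* virt_map_level() installs the whole slot with 1G mappings in a single
* call, so no per-gigabyte loop is needed here.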
*/ - for (i = 0; i < slot_size; i += SZ_1G) - __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); + virt_map_level(vm, gpa, gpa, slot_size, PG_LEVEL_1G); #else - for (i = 0; i < slot_size; i += vm->page_size) - virt_pg_map(vm, gpa + i, gpa + i); + virt_map(vm, gpa, gpa, slot_size >> vm->page_shift); #endif } diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index f04768c1d2e4..93e603d91311 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -17,13 +17,13 @@ #define TEST_NPAGES (TEST_SIZE / PAGE_SIZE) #define TEST_SLOT 10 -static void guest_code(uint64_t base_gpa) +static void guest_code(uint64_t base_gva) { volatile uint64_t val __used; int i; for (i = 0; i < TEST_NPAGES; i++) { - uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE); + uint64_t *src = (uint64_t *)(base_gva + i * PAGE_SIZE); val = *src; } @@ -161,6 +161,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset, static void __test_pre_fault_memory(unsigned long vm_type, bool private) { + uint64_t gpa, gva, alignment, guest_page_size; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, .type = vm_type, @@ -170,35 +171,30 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) struct kvm_vm *vm; struct ucall uc; - uint64_t guest_test_phys_mem; - uint64_t guest_test_virt_mem; - uint64_t alignment, guest_page_size; - vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code); alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; - guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size; + gpa = (vm->max_gfn - TEST_NPAGES) * guest_page_size; #ifdef __s390x__ alignment = max(0x100000UL, guest_page_size); #else alignment = SZ_2M; #endif - guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); - guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1); + gpa = align_down(gpa, alignment); + gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, TEST_SLOT, TEST_NPAGES, - private ? KVM_MEM_GUEST_MEMFD : 0); - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT, + TEST_NPAGES, private ? 
KVM_MEM_GUEST_MEMFD : 0); + virt_map(vm, gva, gpa, TEST_NPAGES); if (private) - vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE); + vm_mem_set_private(vm, gpa, TEST_SIZE); - pre_fault_memory(vcpu, guest_test_phys_mem, 0, SZ_2M, 0, private); - pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); - pre_fault_memory(vcpu, guest_test_phys_mem, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private); + pre_fault_memory(vcpu, gpa, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); - vcpu_args_set(vcpu, 1, guest_test_virt_mem); + vcpu_args_set(vcpu, 1, gva); vcpu_run(vcpu); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86/hyperv_features.c b/tools/testing/selftests/kvm/x86/hyperv_features.c index 99d327084172..130b9ce7e5dd 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86/hyperv_features.c @@ -94,7 +94,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { input = pgs_gpa; - output = pgs_gpa + 4096; + output = pgs_gpa + PAGE_SIZE; } else { input = output = 0; } diff --git a/tools/testing/selftests/kvm/x86/hyperv_ipi.c b/tools/testing/selftests/kvm/x86/hyperv_ipi.c index 2b5b4bc6ef7e..ca61836c4e32 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86/hyperv_ipi.c @@ -102,7 +102,7 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ ipi->vector = IPI_VECTOR; ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; - hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -116,13 +116,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 0; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -138,13 +138,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -160,14 +160,14 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to 
both RECEIVER_VCPU_ID_{1,2} */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -183,10 +183,10 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; - hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index 077cd0ec3040..a3b7ce155981 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -621,7 +621,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NTEST_PAGES; i++) { pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); gpa = addr_hva2gpa(vm, pte); - __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); + virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK); data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); } diff --git a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c index dad988351493..f001cb836bfa 100644 --- a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_close_while_nested - * * Copyright (C) 2019, Red Hat, Inc. * * Verify that nothing bad happens if a KVM user exits with open @@ -12,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include <string.h> #include <sys/ioctl.h> @@ -22,6 +21,8 @@ enum { PORT_L0_EXIT = 0x2000, }; +#define L2_GUEST_STACK_SIZE 64 + static void l2_guest_code(void) { /* Exit to L0 */ @@ -29,9 +30,8 @@ static void l2_guest_code(void) : : [port] "d" (PORT_L0_EXIT) : "rax"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_vmx_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); @@ -45,19 +45,43 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(0); } +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Prepare the VMCB for L2 execution. 
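+ * generic_svm_setup() plays the same role as prepare_vmcs() on the VMX
+ * path, so the shared l2_guest_code() runs unmodified on either vendor.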
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(0); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva; struct kvm_vcpu *vcpu; struct kvm_vm *vm; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c new file mode 100644 index 000000000000..a6b6da9cf7fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * This test verifies that L1 fails to enter L2 with an invalid CR3, and + * succeeds otherwise. + */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + vmcall(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = svm->vmcb->save.cr3; + svm->vmcb->save.cr3 = -1ull; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_ERR); + + /* Now restore CR3 and make sure L2 runs successfully */ + svm->vmcb->save.cr3 = save_cr3; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = vmreadz(GUEST_CR3); + vmwrite(GUEST_CR3, -1ull); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == + (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); + + /* Now restore CR3 and make sure L2 runs successfully */ + vmwrite(GUEST_CR3, save_cr3); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t guest_gva = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, 
guest_gva); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c index 2ceb5c78c442..2839f650e5c9 100644 --- a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_tsc_adjust_test - * * Copyright (C) 2018, Google LLC. * * IA32_TSC_ADJUST test @@ -22,6 +20,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include <string.h> #include <sys/ioctl.h> @@ -35,6 +34,8 @@ #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) +#define L2_GUEST_STACK_SIZE 64 + enum { PORT_ABORT = 0x1000, PORT_REPORT, @@ -72,42 +73,47 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_guest_code(void *data) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; - uintptr_t save_cr3; + /* Set TSC from L1 and make sure TSC_ADJUST is updated correctly */ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); - - /* Jump into L2. First, test failure to load guest CR3. */ - save_cr3 = vmreadz(GUEST_CR3); - vmwrite(GUEST_CR3, -1ull); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == - (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); - check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - vmwrite(GUEST_CR3, save_cr3); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + /* + * Run L2 with TSC_OFFSET. L2 will write to TSC, and L1 is not + * intercepting the write so it should update L1's TSC_ADJUST. 
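+ * L1 already moved TSC_ADJUST to -TSC_ADJUST_VALUE above, so a second
+ * un-intercepted guest write to the TSC should drive TSC_ADJUST to
+ * -2 * TSC_ADJUST_VALUE, which the check after this if/else verifies.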
+ */ + if (this_cpu_has(X86_FEATURE_VMX)) { + struct vmx_pages *vmx_pages = data; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + } else { + struct svm_test_data *svm = data; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + svm->vmcb->control.tsc_offset = TSC_OFFSET_VALUE; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + } check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - GUEST_DONE(); } @@ -119,16 +125,19 @@ static void report(int64_t val) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t nested_gva; struct kvm_vcpu *vcpu; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); - vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (;;) { struct ucall uc; diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c index 1759fa5cb3f2..4260c9e4f489 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #include "kselftest.h" /* L2 is scaled up (from L1's perspective) by this factor */ @@ -79,7 +80,30 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC scaling for L2 */ + wrmsr(MSR_AMD64_TSC_RATIO, L2_SCALE_FACTOR << 32); + + /* launch L2 */ + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint32_t control; @@ -116,11 +140,19 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva = 0; uint64_t tsc_start, tsc_end; uint64_t tsc_khz; @@ -129,7 +161,8 @@ int main(int argc, char *argv[]) uint64_t l1_tsc_freq = 0; uint64_t l2_tsc_freq = 0; - 
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); @@ -152,8 +185,13 @@ int main(int argc, char *argv[]) printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c index 82a8d88b5338..1969f4ab9b28 100644 --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -380,7 +380,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; pthread_t threads[KVM_MAX_VCPUS]; struct kvm_vm *vm; - int memfd, i, r; + int memfd, i; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, @@ -428,11 +428,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t * should prevent the VM from being fully destroyed until the last * reference to the guest_memfd is also put. */ - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); close(memfd); } diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index 77256c89bb8d..86ad1c7d068f 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -104,7 +104,7 @@ static void test_sync_vmsa(uint32_t type, uint64_t policy) vm_sev_launch(vm, policy, NULL); /* This page is shared, so make it decrypted. 
*/ - memset(hva, 0, 4096); + memset(hva, 0, PAGE_SIZE); vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 141b7fc0c965..f2c7a1c297e3 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -141,7 +141,7 @@ static void __attribute__((__flatten__)) guest_code(void *arg) if (this_cpu_has(X86_FEATURE_XSAVE)) { uint64_t supported_xcr0 = this_cpu_supported_xcr0(); - uint8_t buffer[4096]; + uint8_t buffer[PAGE_SIZE]; memset(buffer, 0xcc, sizeof(buffer)); diff --git a/tools/testing/selftests/kvm/x86/userspace_io_test.c b/tools/testing/selftests/kvm/x86/userspace_io_test.c index 9481cbcf284f..be7d72f3c029 100644 --- a/tools/testing/selftests/kvm/x86/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86/userspace_io_test.c @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) regs.rcx = 1; if (regs.rcx == 3) regs.rcx = 8192; - memset((void *)run + run->io.data_offset, 0xaa, 4096); + memset((void *)run + run->io.data_offset, 0xaa, PAGE_SIZE); vcpu_regs_set(vcpu, ®s); } diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index fa512d033205..98cb6bdab3e6 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -120,17 +120,17 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. */ if (enable_ept) { - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -153,9 +153,9 @@ static void test_vmx_dirty_log(bool enable_ept) } TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); break; case UCALL_DONE: done = true; diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c new file mode 100644 index 000000000000..cf1d2d1f2a8f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * Test KVM's ability to save and restore nested state when the L1 guest + * is using 5-level paging and the L2 guest is using 4-level paging. + * + * This test would have failed prior to commit 9245fd6b8531 ("KVM: x86: + * model canonical checks more precisely"). 
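+ *
+ * The GS_BASE value used below has bit 56 set and bits 63:57 equal to it:
+ * canonical for 57-bit (LA57) virtual addresses, non-canonical for 48-bit
+ * ones. It therefore only survives save/restore if KVM's canonical checks
+ * account for L1's CR4.LA57 rather than L2's 4-level paging.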
+ */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define LA57_GS_BASE 0xff2bc0311fb00000ull + +static void l2_guest_code(void) +{ + /* + * Sync with L0 to trigger save/restore. After + * resuming, execute VMCALL to exit back to L1. + */ + GUEST_SYNC(1); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 guest_cr4; + vm_paddr_t pml5_pa, pml4_pa; + u64 *pml5; + u64 exit_reason; + + /* Set GS_BASE to a value that is only canonical with LA57. */ + wrmsr(MSR_GS_BASE, LA57_GS_BASE); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == LA57_GS_BASE); + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Set up L2 with a 4-level page table by pointing its CR3 to + * L1's first PML4 table and clearing CR4.LA57. This creates + * the CR4.LA57 mismatch that exercises the bug. + */ + pml5_pa = get_cr3() & PHYSICAL_PAGE_MASK; + pml5 = (u64 *)pml5_pa; + pml4_pa = pml5[0] & PHYSICAL_PAGE_MASK; + vmwrite(GUEST_CR3, pml4_pa); + + guest_cr4 = vmreadz(GUEST_CR4); + guest_cr4 &= ~X86_CR4_LA57; + vmwrite(GUEST_CR4, guest_cr4); + + GUEST_ASSERT(!vmlaunch()); + + exit_reason = vmreadz(VM_EXIT_REASON); + GUEST_ASSERT(exit_reason == EXIT_REASON_VMCALL); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + l1_guest_code(vmx_pages); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_LA57)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* + * L1 needs to read its own PML5 table to set up L2. Identity map + * the PML5 table to facilitate this. + */ + virt_map(vm, vm->pgd, vm->pgd, 1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(uc.args[1] == stage, + "Expected stage %d, got stage %lu", stage, (ulong)uc.args[1]); + if (stage == 1) { + pr_info("L2 is active; performing save/restore.\n"); + state = vcpu_save_state(vcpu); + + kvm_vm_release(vm); + + /* Restore state in a new VM. 
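+ * vm_recreate_with_one_vcpu() rebuilds the VM around the existing memslot
+ * backing and provides a fresh vCPU, so vcpu_load_state() must restore the
+ * nested state, including the in-flight L2 and the LA57/non-LA57 CR4
+ * split, from the saved blob.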
*/ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c index 35cb9de54a82..ae4a4b6c05ca 100644 --- a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -256,7 +256,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, int nodes = 0; time_t start_time, last_update, now; time_t interval_secs = 1; - int i, r; + int i; int from, to; unsigned long bit; uint64_t hlt_count; @@ -267,9 +267,8 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, delay_usecs); /* Get set of first 64 numa nodes available */ - r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + kvm_get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, 0, MPOL_F_MEMS_ALLOWED); - TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " "(each 1-bit indicates node is present): %#lx\n", diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5f0015c5dd95..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS tristate select IRQ_BYPASS_MANAGER -config HAVE_KVM_VCPU_ASYNC_IOCTL - bool - config HAVE_KVM_VCPU_RUN_PID_CHANGE bool diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a7794ffdb976..0e8b5277be3b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -707,7 +707,7 @@ bool kvm_notify_irqfd_resampler(struct kvm *kvm, */ int kvm_irqfd_init(void) { - irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); + irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0); if (!irqfd_cleanup_wq) return -ENOMEM; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ffadc5ee8e04..fdaea3422c30 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -1,18 +1,47 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/anon_inodes.h> #include <linux/backing-dev.h> #include <linux/falloc.h> +#include <linux/fs.h> #include <linux/kvm_host.h> +#include <linux/mempolicy.h> +#include <linux/pseudo_fs.h> #include <linux/pagemap.h> -#include <linux/anon_inodes.h> #include "kvm_mm.h" -struct kvm_gmem { +static struct vfsmount *kvm_gmem_mnt; + +/* + * A guest_memfd instance can be associated multiple VMs, each with its own + * "view" of the underlying physical memory. + * + * The gmem's inode is effectively the raw underlying physical storage, and is + * used to track properties of the physical memory, while each gmem file is + * effectively a single VM's view of that storage, and is used to track assets + * specific to its associated VM, e.g. memslots=>gmem bindings. + */ +struct gmem_file { struct kvm *kvm; struct xarray bindings; struct list_head entry; }; +struct gmem_inode { + struct shared_policy policy; + struct inode vfs_inode; + + u64 flags; +}; + +static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) +{ + return container_of(inode, struct gmem_inode, vfs_inode); +} + +#define kvm_gmem_for_each_file(f, mapping) \ + list_for_each_entry(f, &(mapping)->i_private_list, entry) + /** * folio_file_pfn - like folio_file_page, but return a pfn. * @folio: The folio which contains this index. 
@@ -25,6 +54,11 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, pgoff_t index, struct folio *folio) { @@ -77,9 +111,9 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, * The order will be passed when creating the guest_memfd, and * checked when creating memslots. */ - WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); - index = gfn - slot->base_gfn + slot->gmem.pgoff; - index = ALIGN_DOWN(index, 1 << folio_order(folio)); + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio))); + index = kvm_gmem_get_index(slot, gfn); + index = ALIGN_DOWN(index, folio_nr_pages(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) kvm_gmem_mark_prepared(folio); @@ -99,27 +133,45 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ - return filemap_grab_folio(inode->i_mapping, index); + struct mempolicy *policy; + struct folio *folio; + + /* + * Fast-path: See if folio is already present in mapping to avoid + * policy_lookup. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (!IS_ERR(folio)) + return folio; + + policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); + folio = __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; } static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) { - if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED) + if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED) return KVM_FILTER_SHARED; return KVM_FILTER_PRIVATE; } -static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, pgoff_t end, enum kvm_gfn_range_filter attr_filter) { bool flush = false, found_memslot = false; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; - xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { pgoff_t pgoff = slot->gmem.pgoff; struct kvm_gfn_range gfn_range = { @@ -150,22 +202,21 @@ static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; - struct kvm_gmem *gmem; + struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_begin(gmem, start, end, attr_filter); + kvm_gmem_for_each_file(f, inode->i_mapping) + __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } -static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, pgoff_t end) { - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; - if (xa_find(&gmem->bindings, &start, end - 1, 
XA_PRESENT)) { + if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_end(kvm); KVM_MMU_UNLOCK(kvm); @@ -175,11 +226,10 @@ static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; - struct kvm_gmem *gmem; + struct gmem_file *f; - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_end(gmem, start, end); + kvm_gmem_for_each_file(f, inode->i_mapping) + __kvm_gmem_invalidate_end(f, start, end); } static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -277,9 +327,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, static int kvm_gmem_release(struct inode *inode, struct file *file) { - struct kvm_gmem *gmem = file->private_data; + struct gmem_file *f = file->private_data; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; /* @@ -299,7 +349,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) filemap_invalidate_lock(inode->i_mapping); - xa_for_each(&gmem->bindings, index, slot) + xa_for_each(&f->bindings, index, slot) WRITE_ONCE(slot->gmem.file, NULL); /* @@ -307,18 +357,18 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) * Zap all SPTEs pointed at by this file. Do not free the backing * memory, as its lifetime is associated with the inode, not the file. */ - __kvm_gmem_invalidate_begin(gmem, 0, -1ul, + __kvm_gmem_invalidate_begin(f, 0, -1ul, kvm_gmem_get_invalidate_filter(inode)); - __kvm_gmem_invalidate_end(gmem, 0, -1ul); + __kvm_gmem_invalidate_end(f, 0, -1ul); - list_del(&gmem->entry); + list_del(&f->entry); filemap_invalidate_unlock(inode->i_mapping); mutex_unlock(&kvm->slots_lock); - xa_destroy(&gmem->bindings); - kfree(gmem); + xa_destroy(&f->bindings); + kfree(f); kvm_put_kvm(kvm); @@ -335,16 +385,12 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } -static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) -{ - return gfn - slot->base_gfn + slot->gmem.pgoff; -} +DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T), + kvm_gmem_get_file(slot), struct kvm_memory_slot *slot); static bool kvm_gmem_supports_mmap(struct inode *inode) { - const u64 flags = (u64)inode->i_private; - - return flags & GUEST_MEMFD_FLAG_MMAP; + return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -356,17 +402,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; - if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; folio = kvm_gmem_get_folio(inode, vmf->pgoff); if (IS_ERR(folio)) { - int err = PTR_ERR(folio); - - if (err == -EAGAIN) + if (PTR_ERR(folio) == -EAGAIN) return VM_FAULT_RETRY; - return vmf_error(err); + return vmf_error(PTR_ERR(folio)); } if (WARN_ON_ONCE(folio_test_large(folio))) { @@ -390,8 +434,40 @@ out_folio: return ret; } +#ifdef CONFIG_NUMA +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); +} + 
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *pgoff) +{ + struct inode *inode = file_inode(vma->vm_file); + + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* + * Return the memory policy for this index, or NULL if none is set. + * + * Returning NULL, e.g. instead of the current task's memory policy, is + * important for the .get_policy kernel ABI: it indicates that no + * explicit policy has been set via mbind() for this memory. The caller + * can then replace NULL with the default memory policy instead of the + * current task's memory policy. + */ + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); +} +#endif /* CONFIG_NUMA */ + static const struct vm_operations_struct kvm_gmem_vm_ops = { - .fault = kvm_gmem_fault_user_mapping, + .fault = kvm_gmem_fault_user_mapping, +#ifdef CONFIG_NUMA + .get_policy = kvm_gmem_get_policy, + .set_policy = kvm_gmem_set_policy, +#endif }; static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) @@ -416,11 +492,6 @@ static struct file_operations kvm_gmem_fops = { .fallocate = kvm_gmem_fallocate, }; -void kvm_gmem_init(struct module *module) -{ - kvm_gmem_fops.owner = module; -} - static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -492,8 +563,8 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { - const char *anon_name = "[kvm-gmem]"; - struct kvm_gmem *gmem; + static const char *name = "[kvm-gmem]"; + struct gmem_file *f; struct inode *inode; struct file *file; int fd, err; @@ -502,25 +573,24 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) if (fd < 0) return fd; - gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); - if (!gmem) { + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) { err = -ENOMEM; goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, - O_RDWR, NULL); - if (IS_ERR(file)) { - err = PTR_ERR(file); + /* __fput() will take care of fops_put(). */ + if (!fops_get(&kvm_gmem_fops)) { + err = -ENOENT; goto err_gmem; } - file->f_flags |= O_LARGEFILE; - - inode = file->f_inode; - WARN_ON(file->f_mapping != inode->i_mapping); + inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_fops; + } - inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; inode->i_mode |= S_IFREG; @@ -530,16 +600,31 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. 
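* guest_memfd folios cannot be swapped, so reclaim must never select them;
* the mapping is therefore expected to be marked unevictable at creation.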
*/ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + GMEM_I(inode)->flags = flags; + + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_inode; + } + + file->f_flags |= O_LARGEFILE; + file->private_data = f; + kvm_get_kvm(kvm); - gmem->kvm = kvm; - xa_init(&gmem->bindings); - list_add(&gmem->entry, &inode->i_mapping->i_private_list); + f->kvm = kvm; + xa_init(&f->bindings); + list_add(&f->entry, &inode->i_mapping->i_private_list); fd_install(fd, file); return fd; +err_inode: + iput(inode); +err_fops: + fops_put(&kvm_gmem_fops); err_gmem: - kfree(gmem); + kfree(f); err_fd: put_unused_fd(fd); return err; @@ -564,7 +649,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, { loff_t size = slot->npages << PAGE_SHIFT; unsigned long start, end; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int r = -EINVAL; @@ -578,8 +663,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (file->f_op != &kvm_gmem_fops) goto err; - gmem = file->private_data; - if (gmem->kvm != kvm) + f = file->private_data; + if (f->kvm != kvm) goto err; inode = file_inode(file); @@ -593,8 +678,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, start = offset >> PAGE_SHIFT; end = start + slot->npages; - if (!xa_empty(&gmem->bindings) && - xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + if (!xa_empty(&f->bindings) && + xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { filemap_invalidate_unlock(inode->i_mapping); goto err; } @@ -609,7 +694,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (kvm_gmem_supports_mmap(inode)) slot->flags |= KVM_MEMSLOT_GMEM_ONLY; - xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); /* @@ -623,12 +708,12 @@ err: return r; } -static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem) +static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL); /* * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() @@ -639,8 +724,6 @@ static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gme void kvm_gmem_unbind(struct kvm_memory_slot *slot) { - struct file *file; - /* * Nothing to do if the underlying file was _already_ closed, as * kvm_gmem_release() invalidates and nullifies all bindings. @@ -648,7 +731,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) if (!slot->gmem.file) return; - file = kvm_gmem_get_file(slot); + CLASS(gmem_get_file, file)(slot); /* * However, if the file is _being_ closed, then the bindings need to be @@ -668,8 +751,6 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) filemap_invalidate_lock(file->f_mapping); __kvm_gmem_unbind(slot, file->private_data); filemap_invalidate_unlock(file->f_mapping); - - fput(file); } /* Returns a locked folio on success. 
*/ @@ -678,18 +759,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, pgoff_t index, kvm_pfn_t *pfn, bool *is_prepared, int *max_order) { - struct file *gmem_file = READ_ONCE(slot->gmem.file); - struct kvm_gmem *gmem = file->private_data; + struct file *slot_file = READ_ONCE(slot->gmem.file); + struct gmem_file *f = file->private_data; struct folio *folio; - if (file != gmem_file) { - WARN_ON_ONCE(gmem_file); + if (file != slot_file) { + WARN_ON_ONCE(slot_file); return ERR_PTR(-EFAULT); } - gmem = file->private_data; - if (xa_load(&gmem->bindings, index) != slot) { - WARN_ON_ONCE(xa_load(&gmem->bindings, index)); + if (xa_load(&f->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&f->bindings, index)); return ERR_PTR(-EIO); } @@ -716,19 +796,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); - struct file *file = kvm_gmem_get_file(slot); struct folio *folio; bool is_prepared = false; int r = 0; + CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); - if (IS_ERR(folio)) { - r = PTR_ERR(folio); - goto out; - } + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!is_prepared) r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); @@ -740,8 +818,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, else folio_put(folio); -out: - fput(file); return r; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); @@ -750,7 +826,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { - struct file *file; struct kvm_memory_slot *slot; void __user *p; @@ -766,7 +841,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (!kvm_slot_has_gmem(slot)) return -EINVAL; - file = kvm_gmem_get_file(slot); + CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; @@ -824,8 +899,118 @@ put_folio_and_exit: filemap_invalidate_unlock(file->f_mapping); - fput(file); return ret && !i ? ret : i; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif + +static struct kmem_cache *kvm_gmem_inode_cachep; + +static void kvm_gmem_init_inode_once(void *__gi) +{ + struct gmem_inode *gi = __gi; + + /* + * Note! Don't initialize the inode with anything specific to the + * guest_memfd instance, or that might be specific to how the inode is + * used (from the VFS-layer's perspective). This hook is called only + * during the initial slab allocation, i.e. only fields/state that are + * idempotent across _all_ use of the inode _object_ can be initialized + * at this time! 
+ */ + inode_init_once(&gi->vfs_inode); +} + +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) +{ + struct gmem_inode *gi; + + gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); + if (!gi) + return NULL; + + mpol_shared_policy_init(&gi->policy, NULL); + + gi->flags = 0; + return &gi->vfs_inode; +} + +static void kvm_gmem_destroy_inode(struct inode *inode) +{ + mpol_free_shared_policy(&GMEM_I(inode)->policy); +} + +static void kvm_gmem_free_inode(struct inode *inode) +{ + kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode)); +} + +static const struct super_operations kvm_gmem_super_operations = { + .statfs = simple_statfs, + .alloc_inode = kvm_gmem_alloc_inode, + .destroy_inode = kvm_gmem_destroy_inode, + .free_inode = kvm_gmem_free_inode, +}; + +static int kvm_gmem_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + ctx = fc->fs_private; + ctx->ops = &kvm_gmem_super_operations; + + return 0; +} + +static struct file_system_type kvm_gmem_fs = { + .name = "guest_memfd", + .init_fs_context = kvm_gmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int kvm_gmem_init_mount(void) +{ + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); + + if (IS_ERR(kvm_gmem_mnt)) + return PTR_ERR(kvm_gmem_mnt); + + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; + return 0; +} + +int kvm_gmem_init(struct module *module) +{ + struct kmem_cache_args args = { + .align = 0, + .ctor = kvm_gmem_init_inode_once, + }; + int ret; + + kvm_gmem_fops.owner = module; + kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", + sizeof(struct gmem_inode), + &args, SLAB_ACCOUNT); + if (!kvm_gmem_inode_cachep) + return -ENOMEM; + + ret = kvm_gmem_init_mount(); + if (ret) { + kmem_cache_destroy(kvm_gmem_inode_cachep); + return ret; + } + return 0; +} + +void kvm_gmem_exit(void) +{ + kern_unmount(kvm_gmem_mnt); + kvm_gmem_mnt = NULL; + rcu_barrier(); + kmem_cache_destroy(kvm_gmem_inode_cachep); +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..74e1afd67b44 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4026,7 +4026,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) yielded = kvm_vcpu_yield_to(vcpu); if (yielded > 0) { - WRITE_ONCE(kvm->last_boosted_vcpu, i); + WRITE_ONCE(kvm->last_boosted_vcpu, idx); break; } else if (yielded < 0 && !--try) { break; @@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp, return r; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. 
*/ - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; @@ -6517,7 +6517,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (WARN_ON_ONCE(r)) goto err_vfio; - kvm_gmem_init(module); + r = kvm_gmem_init(module); + if (r) + goto err_gmem; r = kvm_init_virtualization(); if (r) @@ -6538,6 +6540,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) err_register: kvm_uninit_virtualization(); err_virt: + kvm_gmem_exit(); +err_gmem: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); @@ -6569,6 +6573,7 @@ void kvm_exit(void) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_gmem_exit(); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); kvm_irqfd_exit(); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 31defb08ccba..9fcc5d5b7f8d 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -68,17 +68,18 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, #endif /* HAVE_KVM_PFNCACHE */ #ifdef CONFIG_KVM_GUEST_MEMFD -void kvm_gmem_init(struct module *module); +int kvm_gmem_init(struct module *module); +void kvm_gmem_exit(void); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset); void kvm_gmem_unbind(struct kvm_memory_slot *slot); #else -static inline void kvm_gmem_init(struct module *module) +static inline int kvm_gmem_init(struct module *module) { - + return 0; } - +static inline void kvm_gmem_exit(void) {}; static inline int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset) |
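
The kvm_gmem_unbind(), kvm_gmem_get_pfn(), and kvm_gmem_populate() hunks above drop their explicit fput() calls by switching to the scope-based guard declared with DEFINE_CLASS(gmem_get_file, ...) and instantiated via CLASS(gmem_get_file, file)(slot). The following is only a rough user-space analog of that pattern, not the kernel's cleanup.h implementation; acquire_file(), release_file(), and SCOPED_FILE() are made-up names used to show how the compiler's cleanup attribute runs the destructor when the variable leaves scope, including on early returns:

#include <stdio.h>
#include <stdlib.h>

struct file { int refcount; };

static struct file *acquire_file(void)
{
	struct file *f = calloc(1, sizeof(*f));

	if (f)
		f->refcount = 1;
	return f;
}

static void release_file(struct file **fp)
{
	if (*fp) {			/* mirrors the "if (_T) fput(_T)" exit action */
		printf("dropping reference\n");
		free(*fp);
	}
}

/* Rough stand-in for CLASS(gmem_get_file, file)(slot); */
#define SCOPED_FILE(var) \
	struct file *var __attribute__((cleanup(release_file))) = acquire_file()

int main(void)
{
	SCOPED_FILE(file);

	if (!file)
		return 1;		/* nothing acquired, nothing to release */

	printf("using file %p\n", (void *)file);
	return 0;			/* release_file() runs automatically here */
}

The NULL check in the destructor matters for the same reason the kernel guard uses "if (_T)": callers such as kvm_gmem_get_pfn() still test for a NULL file and bail out, and the guard must tolerate that case.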

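The CONFIG_NUMA hunks earlier in this file give guest_memfd mappings shmem-like shared-mempolicy behavior: mbind() on a mapped range is recorded per inode via kvm_gmem_set_policy(), looked up per index via kvm_gmem_get_policy(), and a NULL result means "no explicit policy" so the default policy applies. Below is a minimal user-space sketch of how that could be exercised, assuming a kernel with this series applied and uapi headers that expose GUEST_MEMFD_FLAG_MMAP / GUEST_MEMFD_FLAG_INIT_SHARED; vm_fd, size, and node are supplied by the caller and error handling is minimal:

#include <linux/kvm.h>
#include <numaif.h>
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static void *map_gmem_on_node(int vm_fd, size_t size, int node)
{
	struct kvm_create_guest_memfd args = {
		.size  = size,
		.flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED,
	};
	unsigned long nodemask = 1UL << node;
	void *mem;
	int fd;

	/* KVM_CREATE_GUEST_MEMFD returns a new guest_memfd file descriptor. */
	fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
	if (fd < 0)
		return MAP_FAILED;

	/* Faulting from user space works here only because INIT_SHARED was set. */
	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED)
		return MAP_FAILED;

	/*
	 * mbind() reaches kvm_gmem_set_policy() and records the policy in the
	 * inode's shared-policy tree, so it also steers folios that KVM
	 * allocates for the guest later; ranges never passed to mbind()
	 * report no policy (the NULL case in kvm_gmem_get_policy()) and fall
	 * back to the default policy.
	 */
	if (mbind(mem, size, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0))
		return MAP_FAILED;

	return mem;
}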