diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7a7dc9d56027..a52250c9da2f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -131,4 +131,8 @@ static __always_inline __pure bool fpu_state_size_dynamic(void) } #endif +#ifdef __PKVM_HYP__ +void pkvm_setup_xstate_cache(void); +#endif + #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e20385a2821a..2e42ce1efd97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2317,6 +2317,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); +bool pkvm_host_has_emulated_msr(struct kvm *kvm, u32 msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -2516,6 +2517,7 @@ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +int kvm_vcpu_x86_setup_mce(struct kvm_vcpu *vcpu, u64 mcg_cap); static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h index bcbf9bccf340..a470a3428ea5 100644 --- a/arch/x86/include/asm/kvm_pkvm.h +++ b/arch/x86/include/asm/kvm_pkvm.h @@ -69,6 +69,54 @@ union pkvm_hc_data { struct { struct pkvm_memcache memcache; } vcpu_free; + struct { + u64 data; + } get_msr; + union { + unsigned long rsp; + unsigned long rip; + unsigned long cr0; + unsigned long cr3; + unsigned long cr4; + u64 pdptrs[4]; + } cache_reg; + struct { + unsigned long data; + } get_rflags; + struct { + struct kvm_segment seg_val; + int seg; + } set_segment; + struct { + struct kvm_segment seg_val; + } get_segment; + struct { + u64 data; + } get_segment_base; + struct { + struct desc_ptr desc; + } set_idt; + struct { + struct desc_ptr desc; + } get_idt; + struct { + struct desc_ptr desc; + } set_gdt; + struct { + struct desc_ptr desc; + } get_gdt; + struct { + u32 data; + } get_interrupt_shadow; + struct { + bool data; + } get_nmi_mask; + struct { + struct pkvm_memcache memcache; + } vcpu_after_set_cpuid; + struct { + struct pkvm_memcache memcache; + } vcpu_add_fpstate; struct { u64 data[PKVM_HC_DATA_MAX_NUM]; } raw; @@ -85,11 +133,16 @@ static_assert(sizeof(union pkvm_hc_data) == PKVM_HC_DATA_MAX_NUM * sizeof(u64)); (ALIGN(sizeof(((union pkvm_hc_data *)0)->f), sizeof(u64)) / sizeof(u64)) #define PKVM_HC_OUTPUT_NUM(f) f##_output_num +#define PKVM_HC_INPUT_NUM(f) f##_input_num enum { #define PKVM_HC(f) PKVM_HC_OUTPUT_NUM(f) = 0, #define PKVM_HC_OUT(f) PKVM_HC_OUTPUT_NUM(f) = PKVM_HC_DATA_NUM(f), #include + + #define PKVM_HC(f) PKVM_HC_INPUT_NUM(f) = 0, + #define PKVM_HC_IN(f) PKVM_HC_INPUT_NUM(f) = PKVM_HC_DATA_NUM(f), + #include }; static inline int pkvm_hc_output_num(enum pkvm_hc hc) @@ -102,6 +155,16 @@ static inline int pkvm_hc_output_num(enum pkvm_hc hc) } } +static inline int pkvm_hc_input_num(enum pkvm_hc hc) +{ + switch (hc) { + #define PKVM_HC(f) case TO_PKVM_HC(f): return PKVM_HC_INPUT_NUM(f); + #include + default: + return 0; + } +} + #define PKVM_HC_IN_0() #define PKVM_HC_IN_1(a1) , "b"(a1) #define PKVM_HC_IN_2(a1, a2) PKVM_HC_IN_1(a1), "c"(a2) @@ -150,6 +213,23 @@ static inline int pkvm_hc_output_num(enum pkvm_hc hc) __pkvm_hypercall(f, o, 4, ##__VA_ARGS__), \ PKVM_HC_UNREACHABLE(f)))))) 
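For context, a minimal host-side sketch of how the typed views in union pkvm_hc_data are expected to be consumed for a PKVM_HC_OUT hypercall such as get_msr. This is an editor's illustration, not part of the patch; the pkvm_hypercall() wrapper name is an assumption, since the output-passing macro itself sits outside this excerpt.

/* Illustrative only: assumes an output-capable wrapper analogous to pkvm_hypercall_in(). */
static inline int pkvm_host_read_guest_msr(u32 msr, u64 *data)
{
	union pkvm_hc_data out = {};
	int ret;

	/* get_msr is declared PKVM_HC_OUT, so one u64 word is copied back. */
	ret = pkvm_hypercall(get_msr, &out, msr);
	if (!ret)
		*data = out.get_msr.data;	/* aliases out.raw.data[0] */
	return ret;
}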
+#define pkvm_hypercall_in(f, i) \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 1, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0]), \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 2, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0], \ + (i)->raw.data[1]), \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 3, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0], \ + (i)->raw.data[1], \ + (i)->raw.data[2]), \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 4, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0], \ + (i)->raw.data[1], \ + (i)->raw.data[2], \ + (i)->raw.data[3]), \ + PKVM_HC_UNREACHABLE(f))))) + static inline unsigned long pkvm_hc(struct kvm_vcpu *vcpu) { return vcpu->arch.regs[VCPU_REGS_RAX]; @@ -159,6 +239,11 @@ static inline unsigned long pkvm_hc(struct kvm_vcpu *vcpu) static inline unsigned long pkvm_hc_input##n(struct kvm_vcpu *vcpu) \ { \ return vcpu->arch.regs[VCPU_REGS_##reg]; \ +} \ +static inline void pkvm_hc_get_input##n(struct kvm_vcpu *vcpu, union pkvm_hc_data *p) \ +{ \ + BUILD_BUG_ON(n == 0 || n > PKVM_HC_DATA_MAX_NUM); \ + p->raw.data[n - 1] = vcpu->arch.regs[VCPU_REGS_##reg]; \ } DEFINE_PKVM_HC_INPUT(1, RBX) @@ -166,6 +251,29 @@ DEFINE_PKVM_HC_INPUT(2, RCX) DEFINE_PKVM_HC_INPUT(3, RDX) DEFINE_PKVM_HC_INPUT(4, RSI) +static inline void pkvm_hc_get_input(struct kvm_vcpu *vcpu, enum pkvm_hc hc, + union pkvm_hc_data *in) +{ + switch (pkvm_hc_input_num(hc)) { + case 4: + pkvm_hc_get_input4(vcpu, in); + fallthrough; + case 3: + pkvm_hc_get_input3(vcpu, in); + fallthrough; + case 2: + pkvm_hc_get_input2(vcpu, in); + fallthrough; + case 1: + pkvm_hc_get_input1(vcpu, in); + fallthrough; + case 0: + break; + default: + BUG(); + } +} + static inline void pkvm_hc_set_ret(struct kvm_vcpu *vcpu, int ret) { vcpu->arch.regs[VCPU_REGS_RAX] = ret; @@ -222,8 +330,12 @@ extern u64 pkvm_sym(sme_me_mask); extern struct cpumask pkvm_sym(__cpu_possible_mask); extern unsigned int pkvm_sym(nr_cpu_ids); DECLARE_STATIC_KEY_FALSE(pkvm_sym(switch_vcpu_ibpb)); +extern u64 pkvm_sym(x86_pred_cmd); extern struct fpu_state_config pkvm_sym(fpu_kernel_cfg); extern struct fpu_state_config pkvm_sym(fpu_user_cfg); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(pkvm_sym(__fpu_state_size_dynamic)); +#endif u64 pkvm_total_reserve_pages(void); PKVM_DECLARE(void *, pkvm_early_alloc_page, (void)); diff --git a/arch/x86/include/asm/pkvm_hypercalls.h b/arch/x86/include/asm/pkvm_hypercalls.h index e4864e76136e..ff0129af2ff6 100644 --- a/arch/x86/include/asm/pkvm_hypercalls.h +++ b/arch/x86/include/asm/pkvm_hypercalls.h @@ -7,6 +7,10 @@ BUILD_BUG_ON(1) #define PKVM_HC_OUT PKVM_HC #endif +#ifndef PKVM_HC_IN +#define PKVM_HC_IN PKVM_HC +#endif + /* Hypercalls used only during pKVM initialization */ PKVM_HC(init_finalize) @@ -21,6 +25,55 @@ PKVM_HC(vm_init) PKVM_HC_OUT(vm_destroy) PKVM_HC(vcpu_create) PKVM_HC_OUT(vcpu_free) +PKVM_HC(vcpu_load) +PKVM_HC(vcpu_put) +PKVM_HC(vcpu_reset) +PKVM_HC(update_exception_bitmap) +PKVM_HC(set_efer) +PKVM_HC(set_msr) +PKVM_HC_OUT(get_msr) +PKVM_HC_OUT(cache_reg) +PKVM_HC(set_cr4) +PKVM_HC(set_cr0) +PKVM_HC_OUT(get_rflags) +PKVM_HC(set_rflags) +PKVM_HC(set_dr7) +PKVM_HC_IN(set_segment) +PKVM_HC_OUT(get_segment) +PKVM_HC_OUT(get_segment_base) +PKVM_HC_IN(set_idt) +PKVM_HC_OUT(get_idt) +PKVM_HC_IN(set_gdt) +PKVM_HC_OUT(get_gdt) +PKVM_HC(flush_tlb_all) +PKVM_HC(flush_tlb_current) +PKVM_HC(flush_tlb_gva) +PKVM_HC(flush_tlb_guest) +PKVM_HC(set_interrupt_shadow) +PKVM_HC_OUT(get_interrupt_shadow) +PKVM_HC(enable_nmi_window) +PKVM_HC(enable_irq_window) +PKVM_HC(interrupt_allowed) 
+PKVM_HC(nmi_allowed) +PKVM_HC_OUT(get_nmi_mask) +PKVM_HC(set_nmi_mask) +PKVM_HC(inject_irq) +PKVM_HC(inject_nmi) +PKVM_HC(inject_exception) +PKVM_HC(cancel_injection) +PKVM_HC(update_cr8_intercept) +PKVM_HC(set_virtual_apic_mode) +PKVM_HC(refresh_apicv_exec_ctrl) +PKVM_HC(load_eoi_exitmap) +PKVM_HC(hwapic_isr_update) +PKVM_HC(sync_pir_to_irr) +PKVM_HC_OUT(vcpu_after_set_cpuid) +PKVM_HC_OUT(vcpu_add_fpstate) +PKVM_HC(write_tsc_offset) +PKVM_HC(write_tsc_multiplier) +PKVM_HC(load_mmu_pgd) +PKVM_HC(setup_mce) #undef PKVM_HC #undef PKVM_HC_OUT +#undef PKVM_HC_IN diff --git a/arch/x86/include/asm/pkvm_image_vars.h b/arch/x86/include/asm/pkvm_image_vars.h index 5e351110bb1e..c26e3dfe3cc8 100644 --- a/arch/x86/include/asm/pkvm_image_vars.h +++ b/arch/x86/include/asm/pkvm_image_vars.h @@ -19,6 +19,7 @@ PKVM_ALIAS(__trace_bprintk); PKVM_ALIAS(__dynamic_pr_debug); PKVM_ALIAS(mem_dump_obj); PKVM_ALIAS(vmalloc_base); +PKVM_ALIAS(get_cpu_entry_area); #endif #endif /* _ASM_x86_PKVM_IMAGE_VARS_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3746376172e6..11ada9bcdb52 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -29,7 +29,7 @@ #define CREATE_TRACE_POINTS #include -#if defined(CONFIG_X86_64) && !defined(__PKVM_HYP__) +#ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif @@ -123,6 +123,7 @@ static void update_avx_timestamp(struct fpu *fpu) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } +#endif /* !__PKVM_HYP__ */ /* * Save the FPU register state in fpu->fpstate->regs. The register state is @@ -142,7 +143,9 @@ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); +#ifndef __PKVM_HYP__ update_avx_timestamp(fpu); +#endif return; } @@ -213,12 +216,15 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) } } +#ifndef __PKVM_HYP__ void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } +#endif /* !__PKVM_HYP__ */ #if IS_ENABLED(CONFIG_KVM) +#ifndef __PKVM_HYP__ static void __fpstate_reset(struct fpstate *fpstate); static void fpu_lock_guest_permissions(void) @@ -293,6 +299,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu) vfree(fpstate); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); +#endif /* !__PKVM_HYP__ */ /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable @@ -319,14 +326,19 @@ EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { +#ifndef __PKVM_HYP__ fpregs_lock(); +#endif guest_fpu->fpstate->xfd = xfd; if (guest_fpu->fpstate->in_use) xfd_update_state(guest_fpu->fpstate); +#ifndef __PKVM_HYP__ fpregs_unlock(); +#endif } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); +#ifndef __PKVM_HYP__ /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * @@ -350,6 +362,7 @@ void fpu_sync_guest_vmexit_xfd_state(void) } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* !__PKVM_HYP__ */ #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) @@ -358,10 +371,35 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; +#ifndef __PKVM_HYP__ fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) 
save_fpregs_to_fpstate(fpu); - +#else +#ifdef CONFIG_X86_64 + if (fpu_state_size_dynamic() && enter_guest) { + /* + * Refresh the xfd_state percpu cache before guest vmenter so + * that the xfd can be restored after guest vmexit. + */ + rdmsrl(MSR_IA32_XFD, cur_fps->xfd); + __this_cpu_write(xfd_state, cur_fps->xfd); + } +#endif + /* + * If entering the npVM, the FPU are already loaded with the npVM fpu + * state by the host. If exiting from the npVM, the fpu registers will be + * saved by the host. So no need to save FPU for the npVM. + * + * If entering the pVM, the FPU are loaded with the host fpu state, which + * is already saved by the host itself before switching to the pkvm + * hypervisor. If exiting from the pVM, then the fpu state should be saved + * by the pkvm hypervisor as the host is not allowed to do this for + * isolation purpose. + */ + if (guest_fps->is_confidential && !enter_guest) + save_fpregs_to_fpstate(fpu); +#endif /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; @@ -375,6 +413,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) cur_fps = fpu->fpstate; +#ifndef __PKVM_HYP__ if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); @@ -389,10 +428,29 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) fpregs_mark_activate(); fpregs_unlock(); +#else + /* + * Similarly to the FPU saving case, no need to restore FPU for the npVM + * as this will be handled by the host. + * + * If entering the pVM, restore the FPU with the pVM fpu state. If + * exiting the pVM, wipe the FPU by restoring FPU with an initial fpu + * state. + */ + if (guest_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* Only update XFD as npVM FPU is already loaded by the host */ + xfd_update_state(cur_fps); + } +#endif + return 0; } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); +#ifndef __PKVM_HYP__ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { @@ -441,8 +499,10 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* !__PKVM_HYP__ */ #endif /* CONFIG_KVM */ +#ifndef __PKVM_HYP__ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { if (!irqs_disabled()) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 28e4fd65c9da..6612a5478487 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -38,6 +38,7 @@ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) +#ifndef __PKVM_HYP__ /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms @@ -86,6 +87,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, }; +#endif /* !__PKVM_HYP__ */ static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -121,6 +123,7 @@ static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) +#ifndef __PKVM_HYP__ /* * Return whether the system supports a given xfeature. 
* @@ -158,6 +161,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +#endif /* !__PKVM_HYP__ */ static bool xfeature_is_aligned64(int xfeature_nr) { @@ -197,6 +201,7 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) return offs; } +#ifndef __PKVM_HYP__ /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. @@ -233,6 +238,7 @@ void fpu__init_cpu_xstate(void) xfeatures_mask_independent()); } } +#endif /* !__PKVM_HYP__ */ static bool xfeature_enabled(enum xfeature xfeature) { @@ -292,6 +298,7 @@ static void __init setup_xstate_cache(void) sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } +#ifndef __PKVM_HYP__ /* * Print out all the supported xstate features: */ @@ -585,6 +592,7 @@ static bool __init check_xstate_against_struct(int nr) return true; } +#endif /* !__PKVM_HYP__ */ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { @@ -606,6 +614,7 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) return offset + xstate_sizes[topmost]; } +#ifndef __PKVM_HYP__ /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating @@ -988,6 +997,7 @@ void fpu__resume_cpu(void) if (fpu_state_size_dynamic()) wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } +#endif /* !__PKVM_HYP__ */ /* * Given an xstate feature nr, calculate where in the xsave @@ -1060,6 +1070,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) } EXPORT_SYMBOL_GPL(get_xsave_addr); +#ifndef __PKVM_HYP__ /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. 
user mode @@ -1473,6 +1484,7 @@ void xrstors(struct xregs_state *xstate, u64 mask) XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } +#endif /* !__PKVM_HYP__ */ #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) @@ -1485,6 +1497,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif +#ifndef __PKVM_HYP__ #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU @@ -1813,6 +1826,7 @@ static inline int xstate_request_perm(unsigned long idx, bool guest) return -EPERM; } #endif /* !CONFIG_X86_64 */ +#endif /* __PKVM_HYP__ */ u64 xstate_get_guest_group_perm(void) { @@ -1820,6 +1834,7 @@ u64 xstate_get_guest_group_perm(void) } EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); +#ifndef __PKVM_HYP__ /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() @@ -2009,3 +2024,51 @@ int elf_coredump_extra_notes_size(void) return size; } #endif /* CONFIG_COREDUMP */ +#else /* !__PKVM_HYP__ */ +void pkvm_setup_xstate_cache(void) +{ + if (!boot_cpu_has(X86_FEATURE_FPU) || + !boot_cpu_has(X86_FEATURE_XSAVE)) { + pr_info("pkvm: No FPU or XSAVE detected\n"); + return; + } + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); +} + +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpstate *fps; + unsigned int ksize; + + if (!xfd_event) + return 0; + + if (WARN_ON_ONCE(!guest_fpu)) + return -EINVAL; + + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) + return -EPERM; + + fps = guest_fpu->fpstate; + ksize = xstate_calculate_size(fps->xfeatures | xfd_event, + cpu_feature_enabled(X86_FEATURE_XCOMPACTED)); + if (fps->size < ksize) { + /* State size is insufficient. */ + return -ENOMEM; + } + + guest_fpu->xfeatures |= xfd_event; + fps->xfeatures |= xfd_event; + fps->user_xfeatures |= xfd_event; + fps->xfd &= ~xfd_event; + + xstate_init_xcomp_bv(&fps->regs.xsave, fps->xfeatures); + if (fps->in_use) + xfd_update_state(fps); + + return 0; +} +#endif /* __PKVM_HYP__ */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cc400a96e26e..e141460ff4ba 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,6 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); -#ifndef __PKVM_HYP__ struct cpuid_xstate_sizes { u32 eax; u32 ebx; @@ -151,6 +151,14 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) return -EINVAL; } +#ifdef __PKVM_HYP__ + /* + * Exposing dynamic xfeatures to npVM is handled by the host as npVM's + * fpstate is allocated and managed by the host. + */ + if (!pkvm_is_protected_vcpu(vcpu)) + return 0; +#endif /* * Exposing dynamic xfeatures to the guest requires additional * enabling in the FPU, e.g. to expand the guest XSAVE state size. @@ -243,6 +251,16 @@ static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu) if (!best) return 0; + if (pkvm_is_protected_vcpu(vcpu)) { + /* + * The pKVM hypervisor doesn't support emulate KVM PV features + * for pVM for simplicity. Thus remove KVM PV feature bits from + * the corresponding CPUID. 
+ */ + best->eax = 0; + return 0; + } + if (kvm_hlt_in_guest(vcpu->kvm)) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); @@ -323,7 +341,7 @@ static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) struct kvm_cpuid_entry2 *entry; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); @@ -372,8 +390,10 @@ static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { +#ifndef __PKVM_HYP__ struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; +#endif struct kvm_cpuid_entry2 *entry; bool allow_gbpages; int i; @@ -426,6 +446,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES); guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages); +#ifndef __PKVM_HYP__ best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -435,6 +456,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } +#endif vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu); @@ -445,23 +467,29 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); +#ifndef __PKVM_HYP__ kvm_pmu_refresh(vcpu); +#endif #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) | __cr4_reserved_bits(guest_cpu_cap_has, vcpu); #undef __kvm_cpu_cap_has +#ifndef __PKVM_HYP__ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu)); +#endif /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_call(vcpu_after_set_cpuid)(vcpu); +#ifndef __PKVM_HYP__ /* * Except for the MMU, which needs to do its thing any vendor specific * adjustments to the reserved GPA bits. 
*/ kvm_mmu_after_set_cpuid(vcpu); +#endif kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } @@ -504,8 +532,12 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } +#ifndef __PKVM_HYP__ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) +#else +int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) +#endif { u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; @@ -563,7 +595,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, kvm_vcpu_after_set_cpuid(vcpu); success: +#ifndef __PKVM_HYP__ kvfree(e2); +#endif return 0; err: @@ -573,6 +607,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, return r; } +#ifndef __PKVM_HYP__ /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -1275,7 +1310,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); #undef VENDOR_F #undef RUNTIME_F -#ifndef __PKVM_HYP__ struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; int maxnent; @@ -1747,8 +1781,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } else { phys_as = entry->eax & 0xff; g_phys_as = phys_as; + /* FIXME: Check pKVM guest MMU level */ +#ifndef __PKVM_HYP__ if (kvm_mmu_get_max_tdp_level() < 5) g_phys_as = min(g_phys_as, 48U); +#endif } entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); @@ -1880,6 +1917,7 @@ static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, return r; } +#ifndef __PKVM_HYP__ static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, __u32 num_entries, unsigned int ioctl_type) { @@ -2107,4 +2145,257 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_cpuid); +#else /* !__PKVM_HYP__ */ + +static DEFINE_PER_CPU(struct kvm_cpuid_entry2, cpuid_def[KVM_MAX_CPUID_ENTRIES]); + +static int pkvm_get_cpuid(struct kvm_cpuid_entry2 *entries, int *nent) +{ + static const u32 funcs[] = { + 0, 0x80000000, KVM_CPUID_SIGNATURE, + }; + + struct kvm_cpuid_array array = { + .entries = entries, + .nent = 0, + .maxnent = *nent, + }; + int r, i; + + if (*nent < 1) + return -E2BIG; + if (*nent > KVM_MAX_CPUID_ENTRIES) + *nent = KVM_MAX_CPUID_ENTRIES; + + for (i = 0; i < ARRAY_SIZE(funcs); i++) { + r = get_cpuid_func(&array, funcs[i], KVM_GET_SUPPORTED_CPUID); + if (r) + goto out; + } + + *nent = array.nent; +out: + return r; +} + +static bool pkvm_cpuid_entry_host_owned(struct kvm_cpuid_entry2 *e2) +{ + switch (e2->function) { + case 0xb: /* topology */ + case 0x1f: /* topology */ + case 0x80000002: /* Processor Brand String */ + case 0x80000003: /* Processor Brand String */ + case 0x80000004: /* Processor Brand String */ + return true; + } + + return false; +} + +#define CPUID_4_EAX_VALID_MASK GENMASK(4, 0) +#define CPUID_4_EBX_COH_LINE_SIZE_MASK GENMASK(11, 0) +#define CPUID_7_0_EDX_HYBRID (1 << 15) +static void pkvm_fixup_cpuid_entry(struct kvm_cpuid_entry2 *entry) +{ + switch (entry->function) { + case 4: + /* + * Deterministic cache parameters. + * + * Fix the coherency line size to 64 bytes following TDX. 
+ */ + if (entry->eax & CPUID_4_EAX_VALID_MASK) { + entry->ebx &= ~CPUID_4_EBX_COH_LINE_SIZE_MASK; + entry->ebx |= 0x3F; + } + break; + case 7: /* Extended features */ + if (entry->index) + break; + + /* No support of hybrid */ + entry->edx &= ~CPUID_7_0_EDX_HYBRID; + break; + case 0x1a: + /* + * Native model ID. + * + * Clear the entry due to no support of hybrid. This leaf is + * not controlled by the host and __do_cpuid_func() already + * clears it. But in case __do_cpuid_func() may change its + * policy later, force clearing it here explicitly. + */ + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + default: + break; + } +} + +#define CPUID_1_EBX_ID_MASK GENMASK(31, 16) +#define CPUID_1_ECX_TSC_DLTIMER (1 << 24) +#define CPUID_1_ECX_HYP (1 << 31) +#define CPUID_1_EDX_HTT (1 << 28) +static void pkvm_enforce_cpuid_entry(struct kvm_cpuid_entry2 *entry, + struct kvm_cpuid_entry2 *def) +{ + struct kvm_cpuid_entry2 tmp = *def; + +#define COPY_BITS(reg1, reg2, mask) { \ + (reg1) &= ~(mask); \ + (reg1) |= (reg2) & (mask); \ +} + switch (entry->function) { + case 1: + COPY_BITS(tmp.ecx, entry->ecx, + CPUID_1_ECX_TSC_DLTIMER | CPUID_1_ECX_HYP); + COPY_BITS(tmp.ebx, entry->ebx, CPUID_1_EBX_ID_MASK); + COPY_BITS(tmp.edx, entry->edx, CPUID_1_EDX_HTT); + break; + default: + break; + } + + *entry = tmp; +} + +static bool cpuid_entry_is_empty(struct kvm_cpuid_entry2 *e2) +{ + return !e2->function && !e2->eax; +} + +static struct kvm_cpuid_entry2 *find_cpuid_entry(struct kvm_cpuid_entry2 *buf, + int nent, struct kvm_cpuid_entry2 *e2) +{ + int i; + + for (i = 0; i < nent; i++) { + if (cpuid_entry_is_empty(&buf[i])) + continue; + + if ((buf[i].function == e2->function) && + (buf[i].index == e2->index) && + (buf[i].flags == e2->flags)) + return &buf[i]; + } + + return NULL; +} + +/* + * pKVM enforces a simple CPUID policy (similar to QEMU '--cpu host') for + * pVM, by using the pKVM supported bits as the base plus a small set + * allowing the host to manage. This saves a lot of effort of defining/ + * maintaining a bit-wise complex policy as TDX does. + * + * As crosvm is the main VMM targeted in the pKVM project, the allowed set + * is currently scrutinized/defined based on the bits mangled by crosvm. + * It is not flexible but good for security/simplicity. The allowed set + * could be extended case-by-case when seeing new demand for the host + * to set. + * + * The enforcement includes: + * - if an entry is fully host-controlled, leave it intact. + * + * - if an entry is func#4 (cache parameters), it's configured by the host + * but certain fields will be overridden with fixed values. If none of + * func4 entries exist, pKVM will insert the default cache parameters + * as failsafe. + * + * - for remaining entries, there must be a matching one in the default + * set, otherwise the original entry is cleared. If matched, the entry + * is fully/partially overridden based on the default value. 
+ *
+ * - Append a default entry to the buffer if it is not included by
+ *   the host, to prevent the host from attacking by hiding cpuid
+ *   leaves that may affect pVM security
+ *
+ * - Fixed values are enforced in the last step
+ */
+int pkvm_enforce_cpuid(struct kvm_cpuid_entry2 *e2, int *nent, int max_nent)
+{
+	struct kvm_cpuid_entry2 *de2 = this_cpu_ptr(cpuid_def);
+	int def_nent, r, i, n;
+	int orig_nent = *nent;
+	bool has_func4 = false;
+
+	memset(de2, 0, KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));
+	def_nent = KVM_MAX_CPUID_ENTRIES;
+
+	/*
+	 * It is possible for the pKVM hypervisor to implement a different
+	 * permitted XCR0 for each guest (although currently the pKVM hypervisor
+	 * implements the same permitted XCR0 for all guests). In this case, the
+	 * default CPUID leaf 0xD will be different. Thus get the default CPUID
+	 * entries for each guest, rather than initializing cpuid_def once during
+	 * pKVM initialization.
+	 */
+	r = pkvm_get_cpuid(de2, &def_nent);
+	if (r)
+		return r;
+
+	/* Enforce cpuid leaves according to the default set */
+	for (i = 0; i < orig_nent; i++) {
+		struct kvm_cpuid_entry2 *tmp;
+
+		if (cpuid_entry_is_empty(&e2[i]) ||
+		    pkvm_cpuid_entry_host_owned(&e2[i]))
+			continue;
+
+		if (e2[i].function == 4) {
+			has_func4 = true;
+			continue;
+		}
+
+		tmp = find_cpuid_entry(de2, def_nent, &e2[i]);
+		if (tmp)
+			pkvm_enforce_cpuid_entry(&e2[i], tmp);
+		else
+			memset(&e2[i], 0, sizeof(struct kvm_cpuid_entry2));
+	}
+
+	/* Insert default cpuid leaves if missing in the host buffer */
+	n = 0;
+	for (i = 0; i < def_nent; i++) {
+		if (pkvm_cpuid_entry_host_owned(&de2[i]))
+			continue;
+
+		/*
+		 * If the host already provides cache parameters,
+		 * skip all func4 entries in the default set. Simply
+		 * comparing func/index doesn't work, as the default set
+		 * may contain more entries than the host provides (due to
+		 * different numbers of cache levels on different
+		 * physical CPUs on a hybrid system).
+ */ + if ((de2[i].function == 4) && has_func4) + continue; + + if (find_cpuid_entry(e2, orig_nent, &de2[i])) + continue; + + /* find an empty slot */ + while (n < max_nent && !cpuid_entry_is_empty(&e2[n])) + n++; + + if (n == max_nent) + return -ENOSPC; + + e2[n++] = de2[i]; + } + + if (n > orig_nent) + *nent = n; + + /* Apply fixed values to the final set of entries */ + for (i = 0; i < *nent; i++) { + if (cpuid_entry_is_empty(&e2[i]) || + pkvm_cpuid_entry_host_owned(&e2[i])) + continue; + + pkvm_fixup_cpuid_entry(&e2[i]); + } + + return 0; +} #endif /* !__PKVM_HYP__ */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d3f5ae15a7ca..506722a39bc9 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -290,4 +290,9 @@ static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)); } +#ifdef __PKVM_HYP__ +int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent); +int pkvm_enforce_cpuid(struct kvm_cpuid_entry2 *e2, int *nent, int max_nent); +#endif + #endif diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 8ddb01191d6f..874de1493d3d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -44,6 +44,7 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif +#ifndef __PKVM_HYP__ /* * Using the register cache from interrupt context is generally not allowed, as * caching a register and marking it available/dirty can't be done atomically, @@ -55,6 +56,9 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15) */ #define kvm_assert_register_caching_allowed(vcpu) \ lockdep_assert_once(in_task() || kvm_arch_pmi_in_guest(vcpu)) +#else +#define kvm_assert_register_caching_allowed(vcpu) +#endif /* * avail dirty diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index eefab3dc8498..6c5be685696b 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -6,7 +6,7 @@ #ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__ #define __ARCH_X86_KVM_KVM_ONHYPERV_H__ -#if IS_ENABLED(CONFIG_HYPERV) +#if IS_ENABLED(CONFIG_HYPERV) && !defined(__PKVM_HYP__) int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); int hv_flush_remote_tlbs(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9a407266db94..0f5073bc1f1d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -47,6 +47,8 @@ #include "hyperv.h" #include "smm.h" +#ifndef __PKVM_HYP__ + #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) #else @@ -533,12 +535,14 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) { return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +#endif /* !__PKVM_HYP__ */ static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) { return apic->nr_lvt_entries > lvt_index; } +#ifndef __PKVM_HYP__ void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -561,26 +565,34 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } +#endif /* !__PKVM_HYP__ */ void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) { int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; +#ifndef __PKVM_HYP__ int i; +#endif if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) return; +#ifndef __PKVM_HYP__ /* Initialize/mask any "new" LVT entries. 
*/ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); +#endif apic->nr_lvt_entries = nr_lvt_entries; +#ifndef __PKVM_HYP__ /* The number of LVT entries is reflected in the version register. */ kvm_apic_set_version(vcpu); +#endif } +#ifndef __PKVM_HYP__ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, @@ -1671,6 +1683,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) { return container_of(dev, struct kvm_lapic, dev); } +#endif /* !__PKVM_HYP__ */ #define APIC_REG_MASK(reg) (1ull << ((reg) >> 4)) #define APIC_REGS_MASK(first, count) \ @@ -1714,6 +1727,7 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); +#ifndef __PKVM_HYP__ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { @@ -3507,3 +3521,4 @@ void kvm_lapic_exit(void) static_key_deferred_flush(&apic_sw_disabled); WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } +#endif /* !__PKVM_HYP__ */ diff --git a/arch/x86/kvm/pkvm/Makefile b/arch/x86/kvm/pkvm/Makefile index 408e4206c579..0b887aa1ece1 100644 --- a/arch/x86/kvm/pkvm/Makefile +++ b/arch/x86/kvm/pkvm/Makefile @@ -37,14 +37,18 @@ pkvm-hyp-y += $(kernel-lib)/sort.o $(kernel-lib)/bsearch.o \ $(kernel-lib)/ctype.o kvm := .. -pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o +pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o $(kvm)/mtrr.o \ + $(kvm)/lapic.o pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o \ $(kvm)/vmx/vmx.o vmx/idt.o vmx/ept.o \ - $(kvm)/vmx/main.o + $(kvm)/vmx/main.o $(kvm)/vmx/vmenter.o + +AFLAGS_$(kvm)/vmx/vmenter.pkvm.o += -iquote $(obj) +$(obj)/$(kvm)/vmx/vmenter.pkvm.o: $(obj)/kvm-asm-offsets.h fpu := ../../kernel/fpu -pkvm-hyp-y += $(fpu)/core.o +pkvm-hyp-y += $(fpu)/core.o $(fpu)/xstate.o pkvm-obj := $(patsubst %.o,%.pkvm.o,$(pkvm-hyp-y)) obj-$(CONFIG_PKVM_X86) += pkvm.o diff --git a/arch/x86/kvm/pkvm/cpu.c b/arch/x86/kvm/pkvm/cpu.c index 11897b635e9e..72a594ebec7e 100644 --- a/arch/x86/kvm/pkvm/cpu.c +++ b/arch/x86/kvm/pkvm/cpu.c @@ -21,6 +21,7 @@ struct cpumask __cpu_possible_mask __ro_after_init; unsigned int nr_cpu_ids; DEFINE_PER_CPU(u64, x86_spec_ctrl_current); DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); +u64 x86_pred_cmd = PRED_CMD_IBPB; /* * Used to switch the FPU state between the host VM and pVMs. The fpu struct is diff --git a/arch/x86/kvm/pkvm/entry.S b/arch/x86/kvm/pkvm/entry.S index 7323e8867c37..6841745f4d31 100644 --- a/arch/x86/kvm/pkvm/entry.S +++ b/arch/x86/kvm/pkvm/entry.S @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -83,3 +84,92 @@ SYM_CODE_START_NOALIGN(x86_verw_sel) SYM_CODE_END(x86_verw_sel); .popsection + +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) + ANNOTATE_NOENDBR + movl $MSR_IA32_PRED_CMD, %ecx + movl _ASM_RIP(x86_pred_cmd), %eax + xorl %edx, %edx + wrmsr + + /* Make sure IBPB clears return stack preductions too. */ + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET + RET +SYM_FUNC_END(write_ibpb) + +#ifdef CONFIG_X86_64 +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. 
Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. + */ +SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f +.Lret1: RET + .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b +.Lret2: RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kvm/pkvm/idt.c b/arch/x86/kvm/pkvm/idt.c index 4238be54b7b3..40c456fb9a58 100644 --- a/arch/x86/kvm/pkvm/idt.c +++ b/arch/x86/kvm/pkvm/idt.c @@ -114,6 +114,8 @@ static bool pkvm_fixup_exception(struct pt_regs *regs) reg = FIELD_GET(EX_DATA_REG_MASK, e->data); switch (type) { + case EX_TYPE_DEFAULT: + return ex_handler_default(e, regs); case EX_TYPE_WRMSR: return ex_handler_msr(e, regs, true, false, reg); case EX_TYPE_RDMSR: diff --git a/arch/x86/kvm/pkvm/init_finalize.c b/arch/x86/kvm/pkvm/init_finalize.c index d5489b6f6306..8d89dbe4b01a 100644 --- a/arch/x86/kvm/pkvm/init_finalize.c +++ b/arch/x86/kvm/pkvm/init_finalize.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include +#include "../cpuid.h" #include "early_alloc.h" #include "fpu.h" #include "init_finalize.h" @@ -238,6 +240,14 @@ static int finalize_global(struct pkvm_mem_info infos[], int nr_infos, if (ret) return ret; + pkvm_setup_xstate_cache(); + + /* + * Initialize KVM cpuid_xstate_sizes to support CPUID emulation for the + * guest VMs. + */ + kvm_init_xstate_sizes(); + return hyp_g_finalize ? hyp_g_finalize() : 0; } diff --git a/arch/x86/kvm/pkvm/pkvm.c b/arch/x86/kvm/pkvm/pkvm.c index 5685c174796d..8539662a7012 100644 --- a/arch/x86/kvm/pkvm/pkvm.c +++ b/arch/x86/kvm/pkvm/pkvm.c @@ -31,6 +31,11 @@ bool tdp_enabled = true; struct pkvm_hyp *pkvm_hyp; DEFINE_PER_CPU(struct pkvm_pcpu *, phys_cpu); DEFINE_PER_CPU(struct kvm_vcpu *, host_vcpu); +/* + * similarly pmu.c is not compiled. define kvm_mmu_cap here for the use + * in cpuid.c + */ +struct x86_pmu_capability __read_mostly kvm_pmu_cap = {0}; /* The maximum number of VMs under pkvm. 
*/ #define MAX_PKVM_VMS 64 @@ -52,6 +57,9 @@ static struct pkvm_vm_ref { */ size_t kvm_vcpu_sz = sizeof(struct kvm_vcpu); +/* The current loaded guest vCPU. */ +static DEFINE_PER_CPU(struct kvm_vcpu*, cur_guest_vcpu); + static int __pkvm_vcpu_free(struct pkvm_vm *pkvm_vm, int vcpu_handle, struct pkvm_memcache *mc); @@ -318,10 +326,41 @@ static void unsetup_vcpu_lapic(struct kvm_vcpu *vcpu) pkvm_host_unshare_hyp(__pkvm_pa(apic->regs), PAGE_SIZE); } +static int share_vcpu_mce_banks(struct kvm_vcpu *vcpu) +{ + int ret; + + if (pkvm_is_protected_vcpu(vcpu)) + return -EINVAL; + + ret = pkvm_host_share_hyp(__pkvm_pa(vcpu->arch.mce_banks), KVM_MCE_SIZE); + if (ret) + return ret; + + ret = pkvm_host_share_hyp(__pkvm_pa(vcpu->arch.mci_ctl2_banks), KVM_MCI_CTL2_SIZE); + if (ret) + pkvm_host_unshare_hyp(__pkvm_pa(vcpu->arch.mce_banks), KVM_MCE_SIZE); + + return ret; +} + +static void unshare_vcpu_mce_banks(struct kvm_vcpu *vcpu) +{ + if (pkvm_is_protected_vcpu(vcpu)) + return; + + pkvm_host_unshare_hyp(__pkvm_pa(vcpu->arch.mce_banks), KVM_MCE_SIZE); + pkvm_host_unshare_hyp(__pkvm_pa(vcpu->arch.mci_ctl2_banks), KVM_MCI_CTL2_SIZE); +} + static int __vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, struct fpstate *fps) { struct pkvm_vcpu *pkvm_vcpu = to_pkvm_vcpu(vcpu); int ret = kvm_x86_call(vcpu_precreate)(kvm); + void *unused = (void *)pkvm_vcpu + + PKVM_VCPU_BASE_SIZE + + kvm_vcpu_sz; + int cpu = raw_smp_processor_id(); if (ret) return ret; @@ -335,26 +374,69 @@ static int __vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, struct fpstate vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - vcpu->arch.mce_banks = (void *)pkvm_vcpu + PKVM_VCPU_BASE_SIZE + kvm_vcpu_sz; - vcpu->arch.mci_ctl2_banks = (void *)vcpu->arch.mce_banks + KVM_MCE_SIZE; + + if (!pkvm_is_protected_vcpu(vcpu)) { + vcpu->arch.mce_banks = kern_pkvm_va(pkvm_vcpu->shared_vcpu->arch.mce_banks); + vcpu->arch.mci_ctl2_banks = + kern_pkvm_va(pkvm_vcpu->shared_vcpu->arch.mci_ctl2_banks); + ret = share_vcpu_mce_banks(vcpu); + if (ret) + return ret; + } else { + vcpu->arch.mce_banks = unused; + unused += KVM_MCE_SIZE; + vcpu->arch.mci_ctl2_banks = unused; + unused += KVM_MCI_CTL2_SIZE; + } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + vcpu->arch.apic_base = pkvm_vcpu->shared_vcpu->arch.apic_base; if (lapic_in_kernel(pkvm_vcpu->shared_vcpu)) - vcpu->arch.apic = (void *)vcpu->arch.mci_ctl2_banks + KVM_MCI_CTL2_SIZE; + vcpu->arch.apic = unused; ret = setup_vcpu_lapic(vcpu); if (ret) - return ret; + goto unshare_mce; vcpu->arch.guest_fpu.fpstate = fps; pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); if (pkvm_is_protected_vcpu(vcpu)) fpstate_set_confidential(&vcpu->arch.guest_fpu); + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + } + + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; + ret = kvm_x86_call(vcpu_create)(vcpu); if (ret) - unsetup_vcpu_lapic(vcpu); + goto unsetup_lapic; + /* Load guest vCPU to reset it. */ + kvm_x86_call(vcpu_load)(vcpu, cpu); + + kvm_vcpu_reset(vcpu, false); + + /* + * The guest vCPU should be put before switching back to the host vCPU + * to make sure the vcpu state is not cached on this CPU as this guest + * vCPU may be loaded on another CPU later by the host via the PV + * interface. 
+ */ + kvm_x86_call(vcpu_put)(vcpu); + + kvm_x86_call(vcpu_load)(this_cpu_read(host_vcpu), cpu); + + return 0; + +unsetup_lapic: + unsetup_vcpu_lapic(vcpu); +unshare_mce: + unshare_vcpu_mce_banks(vcpu); return ret; } @@ -363,6 +445,7 @@ static void __vcpu_free(struct kvm_vcpu *vcpu) kvm_x86_call(vcpu_free)(vcpu); unsetup_vcpu_lapic(vcpu); + unshare_vcpu_mce_banks(vcpu); } static int pkvm_vcpu_create(int vm_handle, phys_addr_t host_vcpu_pa, @@ -384,7 +467,9 @@ static int pkvm_vcpu_create(int vm_handle, phys_addr_t host_vcpu_pa, goto put_vm; shared_vcpu = __pkvm_va(host_vcpu_pa); - vcpu_size = PKVM_VCPU_BASE_SIZE + kvm_vcpu_sz + KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; + vcpu_size = PKVM_VCPU_BASE_SIZE + kvm_vcpu_sz; + if (pkvm_is_protected_vm(&pkvm_vm->kvm)) + vcpu_size += KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; if (lapic_in_kernel(shared_vcpu)) vcpu_size += sizeof(struct kvm_lapic); vcpu_size = PAGE_ALIGN(vcpu_size); @@ -446,6 +531,10 @@ static int __pkvm_vcpu_free(struct pkvm_vm *pkvm_vm, int vcpu_handle, fps = pkvm_vcpu->vcpu.arch.guest_fpu.fpstate; teardown_donated_memory(mc, fps, fps->size); + if (pkvm_vcpu->vcpu.arch.cpuid_entries) + teardown_donated_memory(mc, pkvm_vcpu->vcpu.arch.cpuid_entries, + PAGE_ALIGN(sizeof(struct kvm_cpuid_entry2) * + pkvm_vcpu->vcpu.arch.cpuid_nent)); teardown_donated_memory(mc, pkvm_vcpu, pkvm_vcpu->size); pkvm_host_unshare_hyp(shared_vcpu_pa, kvm_vcpu_sz); @@ -473,12 +562,775 @@ static int pkvm_vcpu_free(int vm_handle, int vcpu_handle, struct pkvm_memcache * return ret; } +static int pkvm_vcpu_load(int vm_handle, int vcpu_handle) +{ + struct pkvm_vcpu *pkvm_vcpu = pkvm_get_vcpu(vm_handle, vcpu_handle); + int cpu = raw_smp_processor_id(); + struct kvm_vcpu *vcpu; + int loaded_cpu; + int ret = 0; + + if (!pkvm_vcpu) + return -EINVAL; + + vcpu = &pkvm_vcpu->vcpu; + loaded_cpu = cmpxchg(&vcpu->cpu, -1, cpu); + if (loaded_cpu == -1) { + /* + * Get the pkvm_vcpu to prevent it from being freed via the + * vcpu_free PV interface while it is still loaded. If the + * obtained pkvm_vcpu is not the same as the original one, it + * must be a pkvm bug. + */ + BUG_ON(pkvm_vcpu != pkvm_get_vcpu(vm_handle, vcpu_handle)); + + this_cpu_write(cur_guest_vcpu, vcpu); + } else if (loaded_cpu == cpu) { + /* The guest vCPU is already loaded on this CPU. */ + this_cpu_write(cur_guest_vcpu, vcpu); + } else { + /* The guest vCPU is already loaded on another CPU. */ + ret = -EBUSY; + } + + pkvm_put_vcpu(pkvm_vcpu); + + return ret; +} + +static int pkvm_vcpu_put(int vm_handle, int vcpu_handle) +{ + struct pkvm_vcpu *pkvm_vcpu = pkvm_get_vcpu(vm_handle, vcpu_handle); + int cpu = raw_smp_processor_id(), loaded_cpu, ret = 0; + struct kvm_vcpu *vcpu; + + if (!pkvm_vcpu) + return -EINVAL; + + vcpu = &pkvm_vcpu->vcpu; + loaded_cpu = vcpu->cpu; + if (loaded_cpu == cpu) { + /* + * The current active vCPU is the host vCPU. Switch to the guest + * vCPU in case vcpu_put operation requires. + */ + kvm_x86_call(vcpu_load)(vcpu, cpu); + + /* + * Another guest vCPU may have already been loaded on this CPU + * thus the cur_guest_vcpu may be overridden. So only set the + * cur_guest_vcpu as NULL if it points to the guest vCPU being + * put. + */ + if (vcpu == this_cpu_read(cur_guest_vcpu)) + this_cpu_write(cur_guest_vcpu, NULL); + + kvm_x86_call(vcpu_put)(vcpu); + + /* + * Put this pkvm_vcpu to allow it to be freed via the vcpu_free PV + * interface. + */ + pkvm_put_vcpu(pkvm_vcpu); + + /* Switch to the host vCPU as a guest vCPU was just loaded. 
*/ + kvm_x86_call(vcpu_load)(this_cpu_read(host_vcpu), cpu); + + /* + * Paired with cmpxchg in pkvm_vcpu_load() to make sure the + * vcpu->cpu is set only after the put is completed. + */ + smp_store_release(&vcpu->cpu, -1); + } else { + /* + * The guest vCPU is not loaded on any CPU or is loaded on a + * different CPU. + */ + ret = -EINVAL; + } + + pkvm_put_vcpu(pkvm_vcpu); + + return ret; +} + +static bool is_guest_vcpu_accessible(struct kvm_vcpu *vcpu, enum pkvm_hc hc) +{ + /* + * There is no isolation between non-protected VMs and the host, thus + * all the PV interfaces are allowed for an npVM. + */ + if (!pkvm_is_protected_vcpu(vcpu)) + return true; + + switch (hc) { + case __pkvm__enable_nmi_window: + case __pkvm__enable_irq_window: + case __pkvm__interrupt_allowed: + case __pkvm__nmi_allowed: + case __pkvm__get_nmi_mask: + case __pkvm__inject_irq: + case __pkvm__inject_nmi: + case __pkvm__cancel_injection: + case __pkvm__update_cr8_intercept: + case __pkvm__set_virtual_apic_mode: + case __pkvm__refresh_apicv_exec_ctrl: + case __pkvm__load_eoi_exitmap: + case __pkvm__hwapic_isr_update: + case __pkvm__sync_pir_to_irr: + case __pkvm__write_tsc_offset: + case __pkvm__write_tsc_multiplier: + case __pkvm__load_mmu_pgd: + case __pkvm__setup_mce: + /* + * The host is responsible for running vCPU, injecting + * interrupts, emulating lapic etc. Always allow the related PV + * interfaces. + * + * TODO: As the pVM can use another secure time source, the + * guest TSC is allowed for the host to emulate and access. To + * support the pVM with secure TSC, add protection for TSC + * related PV interfaces. + * __pkvm__write_tsc_offset + * __pkvm__write_tsc_multiplier + */ + return true; + case __pkvm__set_efer: + case __pkvm__set_msr: + case __pkvm__get_msr: + case __pkvm__set_cr4: + case __pkvm__set_cr0: + case __pkvm__set_rflags: + case __pkvm__get_rflags: + case __pkvm__vcpu_reset: + case __pkvm__set_segment: + case __pkvm__get_segment: + case __pkvm__get_segment_base: + case __pkvm__set_idt: + case __pkvm__get_idt: + case __pkvm__set_gdt: + case __pkvm__get_gdt: + case __pkvm__flush_tlb_all: + case __pkvm__flush_tlb_current: + case __pkvm__flush_tlb_gva: + case __pkvm__flush_tlb_guest: + case __pkvm__vcpu_after_set_cpuid: + case __pkvm__vcpu_add_fpstate: + /* + * As the host needs to pre-configure the pVM's vCPU state for + * booting, the protection for pVM is only enforced by the pKVM + * hypervisor once the vCPU has started running. + */ + return !kvm_vcpu_has_run(vcpu); + default: + /* + * The other PV interfaces are not necessary for the host to + * access the pVM's vCPU state. Deny these PV interfaces by + * default. + */ + return false; + } +} + +static void pkvm_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + /* + * The guest_debug will impact what exceptions should be intercepted + * for the debugging purpose. Debugging npVMs from the host side is + * allowed thus updating its guest_debug flags accordingly, but + * debugging pVMs from the host side is not allowed. + * + * As the __pkvm__update_exception_bitmap is always denied for the pVM, + * it must be a code bug if the vcpu is protected. 
+	 */
+	BUG_ON(pkvm_is_protected_vcpu(vcpu));
+	vcpu->guest_debug = to_pkvm_vcpu(vcpu)->shared_vcpu->guest_debug;
+
+	kvm_x86_call(update_exception_bitmap)(vcpu);
+}
+
+static int pkvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	if (pkvm_is_protected_vcpu(vcpu)) {
+		if (WARN_ON(kvm_vcpu_has_run(vcpu)))
+			return -EPERM;
+
+		/*
+		 * For simplicity and security, allow the host to change
+		 * initial values of those MSRs (or individual bits in MSRs)
+		 * that are currently tweaked by crosvm, and only those.
+		 * The allowed set can be extended as needed.
+		 */
+		switch (index) {
+		case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+		case MSR_MTRRdefType:
+			break;
+		case MSR_IA32_MISC_ENABLE:
+			if (data & ~(MSR_IA32_MISC_ENABLE_FAST_STRING |
+				     MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+				     MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
+				return -EPERM;
+
+			/*
+			 * vPMU is not supported by pKVM yet. Don't trick the
+			 * pVM into thinking it is.
+			 */
+			data |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+				MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
+			break;
+		case MSR_STAR:
+		case MSR_LSTAR:
+		case MSR_CSTAR:
+		case MSR_SYSCALL_MASK:
+		case MSR_KERNEL_GS_BASE:
+		case MSR_IA32_SYSENTER_CS:
+		case MSR_IA32_SYSENTER_ESP:
+		case MSR_IA32_SYSENTER_EIP:
+			/*
+			 * TODO: The user space VMM on the host side (e.g.,
+			 * crosvm) may still try to set these MSRs, which are
+			 * protected by the pKVM hypervisor for a pVM. Ignore
+			 * writes to these MSRs and return 0 to keep such a
+			 * user space VMM happy, without actually modifying
+			 * them. This will eventually be fixed in the user
+			 * space VMM so that it avoids doing so for a pVM;
+			 * once that is implemented, these cases can be
+			 * removed.
+			 */
+			return 0;
+		default:
+			return -EPERM;
+		}
+	}
+
+	return kvm_msr_write(vcpu, index, data);
+}
+
+static int pkvm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg,
+			  union pkvm_hc_data *out)
+{
+	kvm_x86_call(cache_reg)(vcpu, reg);
+
+	switch (reg) {
+	case VCPU_REGS_RSP:
+		out->cache_reg.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+		break;
+	case VCPU_REGS_RIP:
+		out->cache_reg.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+		break;
+	case VCPU_EXREG_PDPTR: {
+		struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+		out->cache_reg.pdptrs[0] = mmu->pdptrs[0];
+		out->cache_reg.pdptrs[1] = mmu->pdptrs[1];
+		out->cache_reg.pdptrs[2] = mmu->pdptrs[2];
+		out->cache_reg.pdptrs[3] = mmu->pdptrs[3];
+		break;
+	}
+	case VCPU_EXREG_CR0:
+		out->cache_reg.cr0 = vcpu->arch.cr0;
+		break;
+	case VCPU_EXREG_CR3:
+		out->cache_reg.cr3 = vcpu->arch.cr3;
+		break;
+	case VCPU_EXREG_CR4:
+		out->cache_reg.cr4 = vcpu->arch.cr4;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void pkvm_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	unsigned long dr7 = val;
+
+	kvm_x86_call(set_dr7)(vcpu, dr7);
+	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+	if (dr7 & DR7_BP_EN_MASK)
+		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+
+static inline bool pkvm_event_injection_allowed(struct kvm_vcpu *vcpu)
+{
+	return !kvm_event_needs_reinjection(vcpu) && !vcpu->arch.exception.pending;
+}
+
+static int pkvm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (for_injection && !pkvm_event_injection_allowed(vcpu))
+		return -EBUSY;
+
+	return kvm_x86_call(interrupt_allowed)(vcpu, for_injection);
+}
+
+static int pkvm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (for_injection && !pkvm_event_injection_allowed(vcpu))
+		return -EBUSY;
+
+	return kvm_x86_call(nmi_allowed)(vcpu, for_injection);
+}
+
+static void pkvm_inject_irq(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu *shared_vcpu = to_pkvm_vcpu(vcpu)->shared_vcpu; + + if (WARN_ON_ONCE(pkvm_interrupt_allowed(vcpu, true) <= 0)) + return; + + vcpu->arch.interrupt.soft = shared_vcpu->arch.interrupt.soft; + vcpu->arch.interrupt.nr = shared_vcpu->arch.interrupt.nr; + kvm_x86_call(inject_irq)(vcpu, false); +} + +static void pkvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + if (WARN_ON_ONCE(pkvm_nmi_allowed(vcpu, true) <= 0)) + return; + + kvm_x86_call(inject_nmi)(vcpu); +} + +static void pkvm_inject_exception(struct kvm_vcpu *vcpu) +{ + /* + * As the __pkvm__inject_exception is always denied for the pVM, + * it must be a code bug if the vcpu is protected. + */ + BUG_ON(pkvm_is_protected_vcpu(vcpu)); + vcpu->arch.exception = to_pkvm_vcpu(vcpu)->shared_vcpu->arch.exception; + + kvm_x86_call(inject_exception)(vcpu); +} + +static void pkvm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct pkvm_vcpu *pkvm_vcpu = to_pkvm_vcpu(vcpu); + struct kvm_vcpu *shared_vcpu; + + kvm_x86_call(cancel_injection)(vcpu); + + shared_vcpu = pkvm_vcpu->shared_vcpu; + if (vcpu->arch.nmi_injected) { + shared_vcpu->arch.nmi_injected = true; + vcpu->arch.nmi_injected = false; + } else if (vcpu->arch.interrupt.injected) { + kvm_queue_interrupt(shared_vcpu, vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft); + kvm_clear_interrupt_queue(vcpu); + } else if (!pkvm_is_protected_vcpu(vcpu) && vcpu->arch.exception.injected) { + /* + * For the pVM, the exception can only be injected and canceled + * by the pkvm hypervisor. + * For the npVM, the exception can be injected and canceled by + * both sides. + */ + shared_vcpu->arch.exception = vcpu->arch.exception; + kvm_clear_exception_queue(vcpu); + } +} + +static void pkvm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + u64 apic_base = to_pkvm_vcpu(vcpu)->shared_vcpu->arch.apic_base; + + if ((vcpu->arch.apic_base ^ apic_base) & MSR_IA32_APICBASE_ENABLE) + vcpu->arch.cpuid_dynamic_bits_dirty = true; + + vcpu->arch.apic_base = apic_base; + kvm_x86_call(set_virtual_apic_mode)(vcpu); +} + +static void pkvm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu, bool apicv_active) +{ + if (!lapic_in_kernel(vcpu)) + return; + + vcpu->arch.apic->apicv_active = apicv_active; + kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu); +} + +static void pkvm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 eoi_exit_bitmap0, + u64 eoi_exit_bitmap1, u64 eoi_exit_bitmap2, + u64 eoi_exit_bitmap3) +{ + u64 eoi_exit_bitmap[] = { + eoi_exit_bitmap0, + eoi_exit_bitmap1, + eoi_exit_bitmap2, + eoi_exit_bitmap3, + }; + + kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap); +} + +static void pkvm_sync_pir_to_irr(struct kvm_vcpu *vcpu, int pir) +{ + to_pkvm_vcpu(vcpu)->max_irr = pir; + kvm_x86_call(sync_pir_to_irr)(vcpu); +} + +static int pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu, + phys_addr_t cpuid_pa, + struct pkvm_memcache *mc) +{ + struct kvm_cpuid_entry2 *new, *old; + int new_nent, old_nent, ret; + u64 size, aligned_size; + + new_nent = to_pkvm_vcpu(vcpu)->shared_vcpu->arch.cpuid_nent; + size = sizeof(struct kvm_cpuid_entry2) * new_nent; + aligned_size = PAGE_ALIGN(size); + ret = pkvm_host_donate_hyp(cpuid_pa, aligned_size, false); + if (ret) + return ret; + + new = __pkvm_va(cpuid_pa); + if (pkvm_is_protected_vcpu(vcpu)) { + /* + * Donation is page-granule, so the host must ensure that + * the cpuid buffer size is page aligned though the actual + * nent only records valid entries. + * + * Clear the trailing space after nent so it can be used + * to hold missing cpuid entries enforced by pkvm. 
+ */ + memset((void *)new + size, 0, aligned_size - size); + + ret = pkvm_enforce_cpuid(new, &new_nent, + aligned_size / sizeof(struct kvm_cpuid_entry2)); + if (ret) + goto undonate; + } + + old = vcpu->arch.cpuid_entries; + old_nent = vcpu->arch.cpuid_nent; + + ret = kvm_set_cpuid(vcpu, new, new_nent); + if (ret) + goto undonate; + + memset(mc, 0, sizeof(*mc)); + /* + * New cpuid entries memory is consumed. Tear down the old cpuid + * entries memory if there is. + */ + if (old) + teardown_donated_memory(mc, (void *)old, + PAGE_ALIGN(sizeof(struct kvm_cpuid_entry2) * + old_nent)); + + return 0; + +undonate: + pkvm_hyp_donate_host(__pkvm_pa(new), aligned_size, false); + return ret; +} + +static int pkvm_vcpu_add_fpstate(struct kvm_vcpu *vcpu, + phys_addr_t fpstate_pa, size_t size, + struct pkvm_memcache *mc) +{ + struct fpstate *new, *old; + int ret; + + /* Expect the host to use this PV interface for pVM only. */ + if (!pkvm_is_protected_vcpu(vcpu)) + return -EINVAL; + + memset(mc, 0, sizeof(*mc)); + + old = vcpu->arch.guest_fpu.fpstate; + new = __pkvm_va(fpstate_pa); + /* + * Reuse the existing fpstate memory if it's sufficiently large. At this + * stage, we can't determine whether the new fpstate size matches the + * vCPUID or not, because that check only occurs when the host calls + * __pkvm__vcpu_after_set_cpuid to update the vCPUID. If the new fpstate + * size is smaller than what the new vCPUID requires, the vCPUID won't + * be updated. Therefore, ensuring the new fpstate size is at least as + * large as the previous one allows continued support for this scenario. + */ + if (old && old->size >= size) { + teardown_donated_memory(mc, new, size); + return 0; + } + + ret = pkvm_host_donate_hyp(fpstate_pa, size, true); + if (ret) + return ret; + + new->size = size; + vcpu->arch.guest_fpu.fpstate = new; + + pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); + fpstate_set_confidential(&vcpu->arch.guest_fpu); + + /* + * New physical fpstate memory is consumed. Tear down the old fpstate + * memory if there is. + */ + if (old) + teardown_donated_memory(mc, old, old->size); + + return 0; +} + +static void pkvm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 tsc_offset) +{ + vcpu->arch.l1_tsc_offset = tsc_offset; + vcpu->arch.tsc_offset = tsc_offset; + kvm_x86_call(write_tsc_offset)(vcpu); +} + +static void pkvm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 ratio) +{ + if (!kvm_caps.has_tsc_control) + return; + + vcpu->arch.l1_tsc_scaling_ratio = ratio; + vcpu->arch.tsc_scaling_ratio = ratio; + kvm_x86_call(write_tsc_multiplier)(vcpu); +} + +static int pkvm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +{ + struct kvm_vcpu *shared_vcpu = to_pkvm_vcpu(vcpu)->shared_vcpu; + + /* + * The guest CR3/PDPTR may be updated by the load_mmu_pgd. Sync the + * guest CR3/PDPTR from the host for both npVMs or pVMs (if pVMs are not + * starting to run yet). 
+ */ + if (!pkvm_is_protected_vcpu(vcpu) || !kvm_vcpu_has_run(vcpu)) { + if (kvm_register_is_dirty(shared_vcpu, VCPU_EXREG_CR3)) { + vcpu->arch.cr3 = shared_vcpu->arch.cr3; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + } + + if (kvm_register_is_dirty(shared_vcpu, VCPU_EXREG_PDPTR)) { + struct kvm_mmu *shared_walk_mmu = kern_pkvm_va(shared_vcpu->arch.walk_mmu); + struct kvm_mmu *walk_mmu = vcpu->arch.walk_mmu; + int ret; + + ret = pkvm_host_share_hyp(__pkvm_pa(shared_walk_mmu), + sizeof(struct kvm_mmu)); + if (ret) + return ret; + + walk_mmu->pdptrs[0] = shared_walk_mmu->pdptrs[0]; + walk_mmu->pdptrs[1] = shared_walk_mmu->pdptrs[1]; + walk_mmu->pdptrs[2] = shared_walk_mmu->pdptrs[2]; + walk_mmu->pdptrs[3] = shared_walk_mmu->pdptrs[3]; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + + pkvm_host_unshare_hyp(__pkvm_pa(shared_walk_mmu), + sizeof(struct kvm_mmu)); + } + } + + /* + * TODO: Implement guest memory protection rather than directly using + * the EPT controlled by the host. + */ + vcpu->arch.mmu->root.hpa = root_hpa; + vcpu->arch.mmu->root_role.level = root_level; + + kvm_x86_call(load_mmu_pgd)(vcpu, vcpu->arch.mmu->root.hpa, + vcpu->arch.mmu->root_role.level); + + return 0; +} + +static int pkvm_vcpu_handle_host_hypercall(struct kvm_vcpu *hvcpu, enum pkvm_hc hc, + union pkvm_hc_data *in, union pkvm_hc_data *out) +{ + struct kvm_vcpu *vcpu = this_cpu_read(cur_guest_vcpu); + int cpu = raw_smp_processor_id(), ret = 0; + + BUG_ON(hvcpu != this_cpu_read(host_vcpu)); + + if (!vcpu) + return -EINVAL; + + if (!is_guest_vcpu_accessible(vcpu, hc)) + return -EPERM; + + kvm_x86_call(vcpu_load)(vcpu, cpu); + + switch (hc) { + case __pkvm__update_exception_bitmap: + pkvm_update_exception_bitmap(vcpu); + break; + case __pkvm__set_efer: + ret = kvm_x86_call(set_efer)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_msr: + ret = pkvm_set_msr(vcpu, pkvm_hc_input1(hvcpu), + pkvm_hc_input2(hvcpu)); + break; + case __pkvm__get_msr: + ret = kvm_msr_read(vcpu, pkvm_hc_input1(hvcpu), &out->get_msr.data); + break; + case __pkvm__cache_reg: + ret = pkvm_cache_reg(vcpu, pkvm_hc_input1(hvcpu), out); + break; + case __pkvm__set_cr4: + kvm_x86_call(set_cr4)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_cr0: + kvm_x86_call(set_cr0)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_rflags: + kvm_x86_call(set_rflags)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_rflags: + out->get_rflags.data = kvm_x86_call(get_rflags)(vcpu); + break; + case __pkvm__set_dr7: + pkvm_set_dr7(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__vcpu_reset: + /* + * Only needs to support reset vCPU for INIT as the non-INIT reset + * is done by the pKVM hypervisor when creating this vCPU. + * + * TODO: The INIT for pVMs will be handled inside the pKVM hypervisor. + * Once this is implemented, make the __pkvm__vcpu_reset only for npVM. 
+ */ + kvm_vcpu_reset(vcpu, true); + break; + case __pkvm__set_segment: + kvm_x86_call(set_segment)(vcpu, &in->set_segment.seg_val, + in->set_segment.seg); + break; + case __pkvm__get_segment: + kvm_x86_call(get_segment)(vcpu, &out->get_segment.seg_val, + pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_segment_base: + out->get_segment_base.data = + kvm_x86_call(get_segment_base)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_idt: + kvm_x86_call(set_idt)(vcpu, &in->set_idt.desc); + break; + case __pkvm__get_idt: + kvm_x86_call(get_idt)(vcpu, &out->get_idt.desc); + break; + case __pkvm__set_gdt: + kvm_x86_call(set_gdt)(vcpu, &in->set_gdt.desc); + break; + case __pkvm__get_gdt: + kvm_x86_call(get_gdt)(vcpu, &out->get_gdt.desc); + break; + case __pkvm__flush_tlb_all: + kvm_x86_call(flush_tlb_all)(vcpu); + break; + case __pkvm__flush_tlb_current: + kvm_x86_call(flush_tlb_current)(vcpu); + break; + case __pkvm__flush_tlb_gva: + kvm_x86_call(flush_tlb_gva)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__flush_tlb_guest: + kvm_x86_call(flush_tlb_guest)(vcpu); + break; + case __pkvm__set_interrupt_shadow: + kvm_x86_call(set_interrupt_shadow)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_interrupt_shadow: + out->get_interrupt_shadow.data = kvm_x86_call(get_interrupt_shadow)(vcpu); + break; + case __pkvm__enable_nmi_window: + kvm_x86_call(enable_nmi_window)(vcpu); + break; + case __pkvm__enable_irq_window: + kvm_x86_call(enable_irq_window)(vcpu); + break; + case __pkvm__interrupt_allowed: + ret = pkvm_interrupt_allowed(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__nmi_allowed: + ret = pkvm_nmi_allowed(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_nmi_mask: + out->get_nmi_mask.data = kvm_x86_call(get_nmi_mask)(vcpu); + break; + case __pkvm__set_nmi_mask: + kvm_x86_call(set_nmi_mask)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__inject_irq: + pkvm_inject_irq(vcpu); + break; + case __pkvm__inject_nmi: + pkvm_inject_nmi(vcpu); + break; + case __pkvm__inject_exception: + pkvm_inject_exception(vcpu); + break; + case __pkvm__cancel_injection: + pkvm_cancel_injection(vcpu); + break; + case __pkvm__update_cr8_intercept: + kvm_x86_call(update_cr8_intercept)(vcpu, pkvm_hc_input1(hvcpu), + pkvm_hc_input2(hvcpu)); + break; + case __pkvm__set_virtual_apic_mode: + pkvm_set_virtual_apic_mode(vcpu); + break; + case __pkvm__refresh_apicv_exec_ctrl: + pkvm_refresh_apicv_exec_ctrl(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__load_eoi_exitmap: + pkvm_load_eoi_exitmap(vcpu, pkvm_hc_input1(hvcpu), pkvm_hc_input2(hvcpu), + pkvm_hc_input3(hvcpu), pkvm_hc_input4(hvcpu)); + break; + case __pkvm__hwapic_isr_update: + kvm_x86_call(hwapic_isr_update)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__sync_pir_to_irr: + pkvm_sync_pir_to_irr(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__vcpu_after_set_cpuid: + ret = pkvm_vcpu_after_set_cpuid(vcpu, pkvm_host_gpa_to_phys(pkvm_hc_input1(hvcpu)), + &out->vcpu_after_set_cpuid.memcache); + break; + case __pkvm__vcpu_add_fpstate: + ret = pkvm_vcpu_add_fpstate(vcpu, pkvm_host_gpa_to_phys(pkvm_hc_input1(hvcpu)), + pkvm_hc_input2(hvcpu), &out->vcpu_add_fpstate.memcache); + break; + case __pkvm__write_tsc_offset: + pkvm_write_tsc_offset(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__write_tsc_multiplier: + pkvm_write_tsc_multiplier(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__load_mmu_pgd: + ret = pkvm_load_mmu_pgd(vcpu, pkvm_hc_input1(hvcpu), pkvm_hc_input2(hvcpu)); + break; + case 
__pkvm__setup_mce: + ret = kvm_vcpu_x86_setup_mce(vcpu, pkvm_hc_input1(hvcpu)); + break; + default: + ret = -EINVAL; + break; + } + + kvm_x86_call(vcpu_load)(hvcpu, cpu); + return ret; +} + void pkvm_handle_host_hypercall(struct kvm_vcpu *vcpu) { enum pkvm_hc hc = pkvm_hc(vcpu); - union pkvm_hc_data out; + union pkvm_hc_data in, out; int ret = 0; + pkvm_hc_get_input(vcpu, hc, &in); + switch (hc) { case __pkvm__init_finalize: ret = pkvm_init_finalize((struct pkvm_mem_info *)pkvm_hc_input1(vcpu), @@ -515,8 +1367,16 @@ void pkvm_handle_host_hypercall(struct kvm_vcpu *vcpu) ret = pkvm_vcpu_free(pkvm_hc_input1(vcpu), pkvm_hc_input2(vcpu), &out.vcpu_free.memcache); break; + case __pkvm__vcpu_load: + ret = pkvm_vcpu_load(pkvm_hc_input1(vcpu), + pkvm_hc_input2(vcpu)); + break; + case __pkvm__vcpu_put: + ret = pkvm_vcpu_put(pkvm_hc_input1(vcpu), + pkvm_hc_input2(vcpu)); + break; default: - ret = -EINVAL; + ret = pkvm_vcpu_handle_host_hypercall(vcpu, hc, &in, &out); break; } @@ -612,3 +1472,42 @@ void pkvm_put_vm(struct pkvm_vm *pkvm_vm) WARN_ON(atomic_dec_if_positive(&pkvm_vm_ref->refcount) <= 0); } + +struct pkvm_vcpu *pkvm_get_vcpu(int vm_handle, int vcpu_handle) +{ + struct pkvm_vm *pkvm_vm; + + if (vcpu_handle < 0 || vcpu_handle >= KVM_MAX_VCPUS) + return NULL; + + pkvm_vm = pkvm_get_vm(vm_handle); + if (!pkvm_vm) + return NULL; + + vcpu_handle = array_index_nospec(vcpu_handle, KVM_MAX_VCPUS); + if (atomic_inc_not_zero(&pkvm_vm->vcpu_refs[vcpu_handle])) + return pkvm_vm->vcpus[vcpu_handle]; + + pkvm_put_vm(pkvm_vm); + return NULL; +} + +void pkvm_put_vcpu(struct pkvm_vcpu *pkvm_vcpu) +{ + int vcpu_handle = pkvm_vcpu->vcpu.arch.pkvm.handle; + + WARN_ON(atomic_dec_if_positive(&pkvm_vcpu->pkvm_vm->vcpu_refs[vcpu_handle]) <= 0); + + pkvm_put_vm(pkvm_vcpu->pkvm_vm); +} + +unsigned long pkvm_pcpu_tss(int cpu) +{ +#ifdef CONFIG_PKVM_X86_DEBUG + return (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss; +#else + struct pkvm_pcpu *pcpu = per_cpu(phys_cpu, cpu); + + return (unsigned long)&pcpu->tss; +#endif +} diff --git a/arch/x86/kvm/pkvm/pkvm.h b/arch/x86/kvm/pkvm/pkvm.h index e6b6af7eb6f7..4570a912539f 100644 --- a/arch/x86/kvm/pkvm/pkvm.h +++ b/arch/x86/kvm/pkvm/pkvm.h @@ -21,6 +21,8 @@ struct pkvm_vcpu { * structure wrapping the kvm_vcpu structure (see below). */ size_t size; + /* Maximum IRR value recorded for posted interrupts. */ + int max_irr; /* * The struct kvm_vcpu should be the last element. 
In cases where struct * kvm_vcpu is wrapped by a vendor specific structure, putting it as the @@ -96,5 +98,8 @@ void pkvm_kick_vcpu(struct kvm_vcpu *vcpu); int pkvm_x86_vendor_init(struct kvm_x86_init_ops *ops); struct pkvm_vm *pkvm_get_vm(int vm_handle); void pkvm_put_vm(struct pkvm_vm *pkvm_vm); +struct pkvm_vcpu *pkvm_get_vcpu(int vm_handle, int vcpu_handle); +void pkvm_put_vcpu(struct pkvm_vcpu *pkvm_vcpu); +unsigned long pkvm_pcpu_tss(int cpu); #endif /* __PKVM_X86_PKVM_H */ diff --git a/arch/x86/kvm/pkvm/undef.h b/arch/x86/kvm/pkvm/undef.h index f0a06ea8e0eb..3bc1a1b84381 100644 --- a/arch/x86/kvm/pkvm/undef.h +++ b/arch/x86/kvm/pkvm/undef.h @@ -23,6 +23,8 @@ #undef CONFIG_USE_X86_SEG_SUPPORT #undef CONFIG_MATH_EMULATION #undef CONFIG_X86_DEBUG_FPU +#undef CONFIG_PROVE_LOCKING +#undef CONFIG_DEBUG_IRQFLAGS #define NOTRACE @@ -54,6 +56,9 @@ #undef CONFIG_GENERIC_BUG #undef CONFIG_TRACEPOINTS #undef CONFIG_DEBUG_PREEMPT +#undef CONFIG_DYNAMIC_DEBUG +#undef CONFIG_DYNAMIC_DEBUG_CORE +#undef CONFIG_TRACE_IRQFLAGS */ #endif /* __PKVM_X86_UNDEF_H */ diff --git a/arch/x86/kvm/pkvm/vmx/idt.c b/arch/x86/kvm/pkvm/vmx/idt.c index d11ee08be21c..f5eb9245979f 100644 --- a/arch/x86/kvm/pkvm/vmx/idt.c +++ b/arch/x86/kvm/pkvm/vmx/idt.c @@ -5,11 +5,23 @@ #include #include "host_vmx.h" #include "idt.h" +#include "memory.h" #include "pkvm.h" static void handle_nmi(struct pt_regs *regs, int vector, bool has_error_code) { struct kvm_vcpu *vcpu = this_cpu_read(host_vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 cur_vmcs_pa = vmcs_store(); + bool is_host_vmcs; + + /* There should always be a loaded VMCS, otherwise it is a code bug. */ + BUG_ON(!VALID_PAGE(cur_vmcs_pa)); + + /* Switch to the host VMCS if the current one is not for host. */ + is_host_vmcs = (cur_vmcs_pa == __pkvm_pa(vmx->loaded_vmcs->vmcs)); + if (!is_host_vmcs) + vmcs_load(vmx->loaded_vmcs->vmcs); /* * The NMI happens while the pKVM hypervisor is running, but it should @@ -23,7 +35,11 @@ static void handle_nmi(struct pt_regs *regs, int vector, bool has_error_code) * Request host immediate exit in case the pending NMI has already been * handled in this host vmexit handling cycle. */ - request_host_immediate_exit(to_vmx(vcpu)); + request_host_immediate_exit(vmx); + + /* Restore to the previous VMCS if it is not for host. 
*/ + if (!is_host_vmcs) + vmcs_load(__pkvm_va(cur_vmcs_pa)); } void pkvm_vmx_register_excp_handlers(void) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2cfd42a3b450..5103d45456b0 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -884,17 +884,22 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_precreate = vt_op(vcpu_precreate), .vcpu_create = vt_op(vcpu_create), .vcpu_free = vt_op(vcpu_free), -#ifndef __PKVM_HYP__ .vcpu_reset = vt_op(vcpu_reset), +#ifndef __PKVM_HYP__ .prepare_switch_to_guest = vt_op(prepare_switch_to_guest), +#endif .vcpu_load = vt_op(vcpu_load), .vcpu_put = vt_op(vcpu_put), +#ifndef __PKVM_HYP__ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS, +#endif .update_exception_bitmap = vt_op(update_exception_bitmap), +#ifndef __PKVM_HYP__ .get_feature_msr = vmx_get_feature_msr, +#endif .get_msr = vt_op(get_msr), .set_msr = vt_op(set_msr), @@ -925,14 +930,18 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .flush_tlb_gva = vt_op(flush_tlb_gva), .flush_tlb_guest = vt_op(flush_tlb_guest), +#ifndef __PKVM_HYP__ .vcpu_pre_run = vt_op(vcpu_pre_run), .vcpu_run = vt_op(vcpu_run), .handle_exit = vt_op(handle_exit), .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, +#endif .set_interrupt_shadow = vt_op(set_interrupt_shadow), .get_interrupt_shadow = vt_op(get_interrupt_shadow), +#ifndef __PKVM_HYP__ .patch_hypercall = vt_op(patch_hypercall), +#endif .inject_irq = vt_op(inject_irq), .inject_nmi = vt_op(inject_nmi), .inject_exception = vt_op(inject_exception), @@ -945,15 +954,22 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_irq_window = vt_op(enable_irq_window), .update_cr8_intercept = vt_op(update_cr8_intercept), +#ifndef __PKVM_HYP__ .x2apic_icr_is_split = false, +#endif .set_virtual_apic_mode = vt_op(set_virtual_apic_mode), +#ifndef __PKVM_HYP__ .set_apic_access_page_addr = vt_op(set_apic_access_page_addr), +#endif .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl), .load_eoi_exitmap = vt_op(load_eoi_exitmap), +#ifndef __PKVM_HYP__ .apicv_pre_state_restore = pi_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, +#endif .hwapic_isr_update = vt_op(hwapic_isr_update), .sync_pir_to_irr = vt_op(sync_pir_to_irr), +#ifndef __PKVM_HYP__ .deliver_interrupt = vt_op(deliver_interrupt), .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, @@ -963,18 +979,22 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .get_exit_info = vt_op(get_exit_info), .get_entry_info = vt_op(get_entry_info), +#endif .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid), +#ifndef __PKVM_HYP__ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .get_l2_tsc_offset = vt_op(get_l2_tsc_offset), .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier), +#endif .write_tsc_offset = vt_op(write_tsc_offset), .write_tsc_multiplier = vt_op(write_tsc_multiplier), .load_mmu_pgd = vt_op(load_mmu_pgd), +#ifndef __PKVM_HYP__ .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, @@ -990,7 +1010,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .cancel_hv_timer = vt_op(cancel_hv_timer), #endif +#endif .setup_mce = vt_op(setup_mce), +#ifndef __PKVM_HYP__ #ifdef CONFIG_KVM_SMM .smi_allowed = vt_op(smi_allowed), diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 9eb402c522ad..a7cc36422c55 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -8,8 +8,15 @@ #include "vmx.h" #ifdef __PKVM_HYP__ +static inline void 
vmx_leave_nested(struct kvm_vcpu *vcpu) {} static inline void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) {} static inline void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) {} +static inline void nested_vmx_set_vmcs_shadowing_bitmap(void) {} +static inline int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { return 1; } +static inline int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) +{ + return 1; +} static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/pkvm_host.c b/arch/x86/kvm/vmx/pkvm_host.c index c16a8f5464b3..a9462442e7d5 100644 --- a/arch/x86/kvm/vmx/pkvm_host.c +++ b/arch/x86/kvm/vmx/pkvm_host.c @@ -4,6 +4,8 @@ #include #include #include "pkvm_constants.h" +#include "posted_intr.h" +#include "trace.h" #include "x86_ops.h" #include "vmx.h" @@ -41,6 +43,85 @@ static int pkvm_alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) return -ENOMEM; } +static void __pkvm_vcpu_unload(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct vcpu_vmx *vmx; + + if (pkvm_hypercall(vcpu_put, vcpu->kvm->arch.pkvm.handle, + vcpu->arch.pkvm.handle)) + return; + + vmx = to_vmx(vcpu); + vmx->loaded_vmcs->cpu = -1; +} + +static void pkvm_vcpu_unload(struct kvm_vcpu *vcpu) +{ + int cpu = to_vmx(vcpu)->loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, __pkvm_vcpu_unload, vcpu, 1); +} + +static bool pkvm_segment_cache_test(struct vcpu_vmx *vmx, int seg, int field) +{ + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + + return vmx->segment_cache.bitmask & mask; +} + +static void pkvm_segment_cache_set(struct vcpu_vmx *vmx, int seg, int field) +{ + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + + /* + * Make sure the cached segment field value is updated before setting + * the bitmask. This code may get preempted by pkvm_get_cpl_no_cache() + * (on the same CPU), and we don't want pkvm_get_cpl_no_cache() to see + * the field marked in the bitmask as available while its cached value + * is still out of date. + */ + barrier(); + + vmx->segment_cache.bitmask |= mask; +} + +static void pkvm_cache_segment(struct vcpu_vmx *vmx, struct kvm_segment *var, int seg) +{ + struct kvm_save_segment *save = &vmx->segment_cache.seg[seg]; + + save->selector = var->selector; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_SEL); + + save->base = var->base; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_BASE); + + save->limit = var->limit; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_LIMIT); + + save->ar = (var->unusable << 16) | + (var->g << 15) | + (var->db << 14) | + (var->l << 13) | + (var->avl << 12) | + (var->present << 7) | + (var->dpl << 5) | + (var->s << 4) | + var->type; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_AR); +} + static int pkvm_check_processor_compat(void) { return pkvm_hypercall(check_processor_compatibility); @@ -61,6 +142,18 @@ static void pkvm_disable_virtualization_cpu(void) */ } +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. 
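+ * A NULL kvm falls back to the generic VMX emulated-MSR check below.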
+ */ +static bool pkvm_has_emulated_msr(struct kvm *kvm, u32 index) +{ + if (!kvm) + return vmx_has_emulated_msr(NULL, index); + + return pkvm_host_has_emulated_msr(kvm, index); +} + static int pkvm_vm_init(struct kvm *kvm) { void *pkvm_vm; @@ -130,7 +223,9 @@ static int pkvm_vcpu_create(struct kvm_vcpu *vcpu) vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->cpu = -1; - vcpu_size = PKVM_VMX_VCPU_SIZE + KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; + vcpu_size = PKVM_VMX_VCPU_SIZE; + if (pkvm_is_protected_vcpu(vcpu)) + vcpu_size += KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; if (lapic_in_kernel(vcpu)) vcpu_size += sizeof(struct kvm_lapic); @@ -170,6 +265,8 @@ static void pkvm_vcpu_free(struct kvm_vcpu *vcpu) union pkvm_hc_data out; int ret; + pkvm_vcpu_unload(vcpu); + ret = pkvm_hypercall_out(vcpu_free, &out, vm_handle, vcpu_handle); if (ret) { pr_err("failed to free VM%d vcpu%d: %d\n", vm_handle, vcpu_handle, ret); @@ -181,6 +278,753 @@ static void pkvm_vcpu_free(struct kvm_vcpu *vcpu) pkvm_free_loaded_vmcs(vmx->loaded_vmcs); } +static void pkvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * TODO: The vcpu_reset PV interface will be disallowed for the pVM + * once its INIT event is handled inside the pKVM hypervisor. So should + * check `pkvm_is_protected_vcpu(vcpu)` rather than + * `vcpu->arch.guest_state_protected` once it is ready. See comments for + * `__pkvm__vcpu_reset` in pkvm_vcpu_handle_host_hypercall. + */ + if (!vcpu->arch.guest_state_protected && init_event) + KVM_BUG_ON(pkvm_hypercall(vcpu_reset), vcpu->kvm); + + /* + * The host is responsible for injecting interrupts to the guest. The + * pi_desc is the key structure for the host to inject interrupts via + * the posted interrupt mechanism. Its physical address is used for the + * POSTED_INTR_DESC_ADDR in the VMCS by the pKVM hypervisor. Initialize + * the pi_desc when reset vcpu. + */ + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); + + /* + * The guest CR0/CR4 are managed by the pKVM hypervisor. When the host + * reads the guest CR0/CR4, it should get the up-to-date value from the + * pKVM. So make all bits in the CR0/CR4 as owned by the guest to + * indicate no bit is owned by the host. + */ + vcpu->arch.cr0_guest_owned_bits = ~0; + vcpu->arch.cr4_guest_owned_bits = ~0; + + kvm_set_cr8(vcpu, 0); + + if (pkvm_is_protected_vcpu(vcpu)) { + /* + * Emulating xapic mode will require the host to decode MMIO + * instruction which is not supported if the guest is a pVM as + * the pVM's CPU and memory state will be isolated. To avoid + * using xapic mode for a pVM, enable x2apic mode by default so + * that pVM will use MSR instructions to access lapic, which + * doesn't require decoding. + */ + u64 data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | + (kvm_vcpu_is_reset_bsp(vcpu) ? 
MSR_IA32_APICBASE_BSP : 0); + + guest_cpu_cap_set(vcpu, X86_FEATURE_X2APIC); + kvm_apic_set_base(vcpu, data, true); + } +} + +static void pkvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool already_loaded; + + already_loaded = vmx->loaded_vmcs->cpu == cpu; + if (!already_loaded) + pkvm_vcpu_unload(vcpu); + + if (KVM_BUG_ON(pkvm_hypercall(vcpu_load, vcpu->kvm->arch.pkvm.handle, + vcpu->arch.pkvm.handle), vcpu->kvm)) + return; + + if (!already_loaded) + vmx->loaded_vmcs->cpu = cpu; + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void pkvm_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); +} + +static void pkvm_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(update_exception_bitmap), vcpu->kvm); +} + +static int pkvm_get_feature_msr(u32 msr, u64 *data) +{ + switch (msr) { + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: + return 1; + default: + return KVM_MSR_RET_UNSUPPORTED; + } +} + +static int pkvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (pkvm_host_has_emulated_msr(vcpu->kvm, msr_info->index)) + return kvm_get_msr_common(vcpu, msr_info); + + if (!vcpu->arch.guest_state_protected) { + union pkvm_hc_data out; + int ret; + + ret = pkvm_hypercall_out(get_msr, &out, msr_info->index); + if (!ret) + msr_info->data = out.get_msr.data; + + return ret; + } + + return -EPERM; +} + +static int pkvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (pkvm_host_has_emulated_msr(vcpu->kvm, msr_info->index)) + return kvm_set_msr_common(vcpu, msr_info); + + if (!vcpu->arch.guest_state_protected) + return pkvm_hypercall(set_msr, msr_info->index, msr_info->data); + + return -EPERM; +} + +static u64 pkvm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + union pkvm_hc_data out; + ulong *p; + + if (vcpu->arch.guest_state_protected) + return 0; + + p = &vmx->segment_cache.seg[seg].base; + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_BASE)) { + if (KVM_BUG_ON(pkvm_hypercall_out(get_segment_base, &out, seg), vcpu->kvm)) + return 0; + + *p = out.get_segment_base.data; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_BASE); + } + + return *p; +} + +static void pkvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_save_segment *segment; + u32 ar; + + if (vcpu->arch.guest_state_protected) { + if (var) + memset(var, 0, sizeof(*var)); + return; + } + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_SEL) || + !pkvm_segment_cache_test(vmx, seg, SEG_FIELD_BASE) || + !pkvm_segment_cache_test(vmx, seg, SEG_FIELD_LIMIT) || + !pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR)) { + union pkvm_hc_data out; + + if (KVM_BUG_ON(pkvm_hypercall_out(get_segment, &out, seg), vcpu->kvm)) + return; + + pkvm_cache_segment(vmx, &out.get_segment.seg_val, seg); + } + + if (!var) + return; + + segment = &vmx->segment_cache.seg[seg]; + var->selector = segment->selector; + var->base = segment->base; + var->limit = segment->limit; + ar = segment->ar; + var->unusable = (ar >> 16) & 1; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + /* + * Some userspaces do not preserve unusable property. Since usable + * segment has to be present according to VMX spec we can use present + * property to amend userspace bug by making unusable segment always + * nonpresent. 
vmx_segment_access_rights() already marks nonpresent + * segment as unusable. + */ + var->present = !var->unusable; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; +} + +static void pkvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + union pkvm_hc_data in = { + .set_segment = { + .seg_val = *var, + .seg = seg, + }, + }; + + if (vcpu->arch.guest_state_protected) + return; + + vmx_segment_cache_clear(to_vmx(vcpu)); + + KVM_BUG_ON(pkvm_hypercall_in(set_segment, &in), vcpu->kvm); +} + +static int pkvm_get_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int seg = VCPU_SREG_SS; + u32 ar; + + if (vcpu->arch.guest_state_protected) + return 0; + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR)) + pkvm_get_segment(vcpu, NULL, seg); + + ar = vmx->segment_cache.seg[seg].ar; + return VMX_AR_DPL(ar); +} + +static int pkvm_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int seg = VCPU_SREG_SS; + union pkvm_hc_data out; + + if (vcpu->arch.guest_state_protected) + return 0; + + /* + * Even though this is a no_cache version of get_cpl, still use the + * cached value if it is available, to avoid unnecessary calls to pKVM. + * It may be cached either by the pKVM hypervisor itself (when + * returning to the host after vcpu_run) or by the host after another + * get_segment call to pKVM (in such case, the barrier() in + * pkvm_segment_cache_set() makes sure that we are seeing the up-to-date + * value). + */ + if (likely(pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR))) + return VMX_AR_DPL(vmx->segment_cache.seg[seg].ar); + + if (KVM_BUG_ON(pkvm_hypercall_out(get_segment, &out, seg), vcpu->kvm)) + return 0; + + return out.get_segment.seg_val.dpl; +} + +static void pkvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int seg = VCPU_SREG_CS; + u32 ar; + + if (vcpu->arch.guest_state_protected) { + *db = *l = 0; + return; + } + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR)) + pkvm_get_segment(vcpu, NULL, seg); + + ar = vmx->segment_cache.seg[seg].ar; + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static bool pkvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + return true; +} + +static void pkvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(set_cr0, cr0), vcpu->kvm); + + vcpu->arch.cr0 = cr0; +} + +static bool pkvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + /* The pKVM doesn't support VMX feature. 
*/ + return !(cr4 & X86_CR4_VMXE); +} + +static void pkvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(set_cr4, cr4), vcpu->kvm); + + vcpu->arch.cr4 = cr4; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + vcpu->arch.cpuid_dynamic_bits_dirty = true; +} + +static int pkvm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + int ret = -EINVAL; + + if (!vcpu->arch.guest_state_protected) + ret = pkvm_hypercall(set_efer, efer); + + vcpu->arch.efer = efer; + return ret; +} + +static void pkvm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data; + + if (vcpu->arch.guest_state_protected || + KVM_BUG_ON(pkvm_hypercall_out(get_idt, &data), vcpu->kvm)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + *dt = data.get_idt.desc; +} + +static void pkvm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data = { + .set_gdt.desc = *dt, + }; + + if (vcpu->arch.guest_state_protected) + return; + + KVM_BUG_ON(pkvm_hypercall_in(set_idt, &data), vcpu->kvm); +} + +static void pkvm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data; + + if (vcpu->arch.guest_state_protected || + KVM_BUG_ON(pkvm_hypercall_out(get_gdt, &data), vcpu->kvm)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + *dt = data.get_gdt.desc; +} + +static void pkvm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data = { + .set_gdt.desc = *dt, + }; + + if (vcpu->arch.guest_state_protected) + return; + + KVM_BUG_ON(pkvm_hypercall_in(set_gdt, &data), vcpu->kvm); +} + +static void pkvm_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_dr7, val), vcpu->kvm); +} + +static void pkvm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + union pkvm_hc_data out; + + if (pkvm_is_protected_vcpu(vcpu)) + return; + + if (KVM_BUG_ON(pkvm_hypercall_out(cache_reg, &out, reg), vcpu->kvm)) + return; + + kvm_register_mark_available(vcpu, reg); + + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = out.cache_reg.rsp; + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = out.cache_reg.rip; + break; + case VCPU_EXREG_PDPTR: { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + mmu->pdptrs[0] = out.cache_reg.pdptrs[0]; + mmu->pdptrs[1] = out.cache_reg.pdptrs[1]; + mmu->pdptrs[2] = out.cache_reg.pdptrs[2]; + mmu->pdptrs[3] = out.cache_reg.pdptrs[3]; + break; + } + case VCPU_EXREG_CR0: + vcpu->arch.cr0 = out.cache_reg.cr0; + break; + case VCPU_EXREG_CR3: + vcpu->arch.cr3 = out.cache_reg.cr3; + break; + case VCPU_EXREG_CR4: + vcpu->arch.cr4 = out.cache_reg.cr4; + break; + default: + KVM_BUG_ON(1, vcpu->kvm); + break; + } +} + +static unsigned long pkvm_get_rflags(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { + if (vcpu->arch.guest_state_protected) { + vmx->rflags = 0; + } else { + union pkvm_hc_data out; + + if (KVM_BUG_ON(pkvm_hypercall_out(get_rflags, &out), vcpu->kvm)) + return 0; + + vmx->rflags = out.get_rflags.data; + } + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + } + + return vmx->rflags; +} + +static void pkvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + to_vmx(vcpu)->rflags = rflags; + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(set_rflags, 
rflags), vcpu->kvm); + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); +} + +static bool pkvm_get_if_flag(struct kvm_vcpu *vcpu) +{ + return pkvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + +static void pkvm_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_all), vcpu->kvm); +} + +static void pkvm_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_current), vcpu->kvm); +} + +static void pkvm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_gva, addr), vcpu->kvm); +} + +static void pkvm_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_guest), vcpu->kvm); +} + +static void pkvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_interrupt_shadow, mask), vcpu->kvm); +} + +static u32 pkvm_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + union pkvm_hc_data out; + + if (pkvm_is_protected_vcpu(vcpu)) + return 0; + + KVM_BUG_ON(pkvm_hypercall_out(get_interrupt_shadow, &out), vcpu->kvm); + + return out.get_interrupt_shadow.data; +} + +static void pkvm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); + + ++vcpu->stat.irq_injections; + + KVM_BUG_ON(pkvm_hypercall(inject_irq), vcpu->kvm); +} + +static void pkvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.nmi_injections; + + KVM_BUG_ON(pkvm_hypercall(inject_nmi), vcpu->kvm); +} + +static void pkvm_inject_exception(struct kvm_vcpu *vcpu) +{ + if (pkvm_is_protected_vcpu(vcpu)) + return; + + KVM_BUG_ON(pkvm_hypercall(inject_exception), vcpu->kvm); +} + +static void pkvm_cancel_injection(struct kvm_vcpu *vcpu) +{ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + if (KVM_BUG_ON(pkvm_hypercall(cancel_injection), vcpu->kvm)) + return; + + if (vcpu->arch.nmi_injected || + vcpu->arch.interrupt.injected || + vcpu->arch.exception.injected) + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +static int pkvm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + return pkvm_hypercall(interrupt_allowed, for_injection); +} + +static int pkvm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + return pkvm_hypercall(nmi_allowed, for_injection); +} + +static bool pkvm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + union pkvm_hc_data out; + + if (KVM_BUG_ON(pkvm_hypercall_out(get_nmi_mask, &out), vcpu->kvm)) + return false; + + return out.get_nmi_mask.data; +} + +static void pkvm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_nmi_mask, masked), vcpu->kvm); +} + +static void pkvm_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(enable_nmi_window), vcpu->kvm); +} + +static void pkvm_enable_irq_window(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(enable_irq_window), vcpu->kvm); +} + +static void pkvm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + KVM_BUG_ON(pkvm_hypercall(update_cr8_intercept, tpr, irr), vcpu->kvm); +} + +static void pkvm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (lapic_in_kernel(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_virtual_apic_mode), vcpu->kvm); +} + +static void 
pkvm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (lapic_in_kernel(vcpu)) + KVM_BUG_ON(pkvm_hypercall(refresh_apicv_exec_ctrl, vcpu->arch.apic->apicv_active), + vcpu->kvm); +} + +static void pkvm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (kvm_vcpu_apicv_active(vcpu)) + KVM_BUG_ON(pkvm_hypercall(load_eoi_exitmap, eoi_exit_bitmap[0], + eoi_exit_bitmap[1], eoi_exit_bitmap[2], + eoi_exit_bitmap[3]), + vcpu->kvm); +} + +#define VMX_REQUIRED_APICV_INHIBITS \ + (BIT(APICV_INHIBIT_REASON_DISABLED) | \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)) + +static void pkvm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + KVM_BUG_ON(pkvm_hypercall(hwapic_isr_update, max_isr), vcpu->kvm); +} + +static int pkvm_vcpu_realloc_fpstate(struct kvm_vcpu *vcpu) +{ + union pkvm_hc_data out; + size_t fpsize; + void *fps; + int ret; + + fpsize = PAGE_ALIGN(vcpu->arch.guest_fpu.fpstate->size + + ALIGN(offsetof(struct fpstate, regs), 64)); + fps = alloc_pages_exact(fpsize, GFP_KERNEL_ACCOUNT); + if (!fps) + return -ENOMEM; + + ret = pkvm_hypercall_out(vcpu_add_fpstate, &out, __pa(fps), fpsize); + if (KVM_BUG_ON(ret, vcpu->kvm)) + free_pages_exact(fps, fpsize); + else + kvm_free_pkvm_memcache(&out.vcpu_add_fpstate.memcache); + + return ret; +} + +static void pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *e2 = vcpu->arch.cpuid_entries; + int nent = vcpu->arch.cpuid_nent; + union pkvm_hc_data out; + void *entries; + size_t size; + + if (vcpu->arch.guest_state_protected || !e2 || !nent) + return; + + /* + * With exposing the FPU dynamic feature via the cpuid, the fpstate + * allocated when creating the vcpu may not be sufficient for the + * guest. As the pVM's FPU state is managed by the pKVM hypervisor + * while the npVM's FPU state is managed by the host, re-allocating the + * fpstate is only necessary for the pVM, and should be done before + * adding the new cpuid entries to the pKVM hypervisor. 
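+	 * If the fpstate reallocation fails, bail out without pushing the new
+	 * CPUID entries to the hypervisor.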
+ */ + if ((vcpu->arch.guest_fpu.xfeatures & XFEATURE_MASK_USER_DYNAMIC) && + pkvm_is_protected_vcpu(vcpu) && + pkvm_vcpu_realloc_fpstate(vcpu)) + return; + + size = sizeof(struct kvm_cpuid_entry2) * nent; + entries = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT); + if (!entries) { + kvm_err("Failed to allocate cpuid pages for pKVM vcpu\n"); + return; + } + + memcpy(entries, (void *)e2, size); + + if (KVM_BUG_ON(pkvm_hypercall_out(vcpu_after_set_cpuid, &out, __pa(entries)), vcpu->kvm)) + free_pages_exact(entries, size); + else + kvm_free_pkvm_memcache(&out.vcpu_after_set_cpuid.memcache); +} + +static u64 pkvm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static u64 pkvm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_caps.default_tsc_scaling_ratio; +} + +static void pkvm_write_tsc_offset(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(write_tsc_offset, vcpu->arch.tsc_offset), vcpu->kvm); +} + +static void pkvm_write_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(write_tsc_multiplier, vcpu->arch.tsc_scaling_ratio), vcpu->kvm); +} + +static void pkvm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +{ + KVM_BUG_ON(pkvm_hypercall(load_mmu_pgd, root_hpa, root_level), vcpu->kvm); +} + +static void pkvm_leave_nested(struct kvm_vcpu *vcpu) {} +static bool pkvm_nested_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) +{ + return false; +} +static int pkvm_check_nested_events(struct kvm_vcpu *vcpu) { return 0; } +static void pkvm_nested_triple_fault(struct kvm_vcpu *vcpu) {} +static bool pkvm_get_nested_state_pages(struct kvm_vcpu *vcpu) { return true; } +static int pkvm_nested_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { return 0; } + +static struct kvm_x86_nested_ops pkvm_nested_ops = { + .leave_nested = pkvm_leave_nested, + .is_exception_vmexit = pkvm_nested_is_exception_vmexit, + .check_events = pkvm_check_nested_events, + .triple_fault = pkvm_nested_triple_fault, + .get_nested_state_pages = pkvm_get_nested_state_pages, + .write_log_dirty = pkvm_nested_write_pml_buffer, +}; + +static void pkvm_setup_mce(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(setup_mce, vcpu->arch.mcg_cap), vcpu->kvm); +} + +#ifdef CONFIG_KVM_SMM +static int pkvm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + return false; +} + +static int pkvm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +{ + return -EINVAL; +} + +static int pkvm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +{ + return -EINVAL; +} + +static void pkvm_enable_smi_window(struct kvm_vcpu *vcpu) {} +#endif + +static bool pkvm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + /* + * The init signal will be blocked if the guest VM is emulating nested + * and in virtual VMX root mode. But as this is not a supported case by + * the pKVM hypervisor, the init signal should never be blocked for the + * guest VM. 
+ */ + return false; +} + struct kvm_x86_ops pkvm_host_vt_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -190,6 +1034,8 @@ struct kvm_x86_ops pkvm_host_vt_x86_ops __initdata = { .disable_virtualization_cpu = pkvm_disable_virtualization_cpu, .emergency_disable_virtualization_cpu = pkvm_disable_virtualization_cpu, + .has_emulated_msr = pkvm_has_emulated_msr, + .vm_size = sizeof(struct kvm_vmx), .vm_init = pkvm_vm_init, .vm_destroy = pkvm_vm_destroy, @@ -197,4 +1043,95 @@ struct kvm_x86_ops pkvm_host_vt_x86_ops __initdata = { .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = pkvm_vcpu_create, .vcpu_free = pkvm_vcpu_free, + .vcpu_reset = pkvm_vcpu_reset, + + .vcpu_load = pkvm_vcpu_load, + .vcpu_put = pkvm_vcpu_put, + + .update_exception_bitmap = pkvm_update_exception_bitmap, + .get_feature_msr = pkvm_get_feature_msr, + .get_msr = pkvm_get_msr, + .set_msr = pkvm_set_msr, + .get_segment_base = pkvm_get_segment_base, + .get_segment = pkvm_get_segment, + .set_segment = pkvm_set_segment, + .get_cpl = pkvm_get_cpl, + .get_cpl_no_cache = pkvm_get_cpl_no_cache, + .get_cs_db_l_bits = pkvm_get_cs_db_l_bits, + .is_valid_cr0 = pkvm_is_valid_cr0, + .set_cr0 = pkvm_set_cr0, + .is_valid_cr4 = pkvm_is_valid_cr4, + .set_cr4 = pkvm_set_cr4, + .set_efer = pkvm_set_efer, + .get_idt = pkvm_get_idt, + .set_idt = pkvm_set_idt, + .get_gdt = pkvm_get_gdt, + .set_gdt = pkvm_set_gdt, + .set_dr7 = pkvm_set_dr7, + .cache_reg = pkvm_cache_reg, + .get_rflags = pkvm_get_rflags, + .set_rflags = pkvm_set_rflags, + .get_if_flag = pkvm_get_if_flag, + + .flush_tlb_all = pkvm_flush_tlb_all, + .flush_tlb_current = pkvm_flush_tlb_current, + .flush_tlb_gva = pkvm_flush_tlb_gva, + .flush_tlb_guest = pkvm_flush_tlb_guest, + + .set_interrupt_shadow = pkvm_set_interrupt_shadow, + .get_interrupt_shadow = pkvm_get_interrupt_shadow, + .inject_irq = pkvm_inject_irq, + .inject_nmi = pkvm_inject_nmi, + .inject_exception = pkvm_inject_exception, + .cancel_injection = pkvm_cancel_injection, + .interrupt_allowed = pkvm_interrupt_allowed, + .nmi_allowed = pkvm_nmi_allowed, + .get_nmi_mask = pkvm_get_nmi_mask, + .set_nmi_mask = pkvm_set_nmi_mask, + .enable_nmi_window = pkvm_enable_nmi_window, + .enable_irq_window = pkvm_enable_irq_window, + .update_cr8_intercept = pkvm_update_cr8_intercept, + + .x2apic_icr_is_split = false, + .set_virtual_apic_mode = pkvm_set_virtual_apic_mode, + .refresh_apicv_exec_ctrl = pkvm_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = pkvm_load_eoi_exitmap, + .apicv_pre_state_restore = pi_apicv_pre_state_restore, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, + .hwapic_isr_update = pkvm_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_interrupt = vmx_deliver_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, + + .vcpu_after_set_cpuid = pkvm_vcpu_after_set_cpuid, + + .get_l2_tsc_offset = pkvm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = pkvm_get_l2_tsc_multiplier, + .write_tsc_offset = pkvm_write_tsc_offset, + .write_tsc_multiplier = pkvm_write_tsc_multiplier, + + .load_mmu_pgd = pkvm_load_mmu_pgd, + + .nested_ops = &pkvm_nested_ops, + + .pi_update_irte = vmx_pi_update_irte, + .pi_start_bypass = vmx_pi_start_bypass, + + .setup_mce = pkvm_setup_mce, + +#ifdef CONFIG_KVM_SMM + .smi_allowed = pkvm_smi_allowed, + .enter_smm = pkvm_enter_smm, + .leave_smm = pkvm_leave_smm, + .enable_smi_window = pkvm_enable_smi_window, +#endif + + .apic_init_signal_blocked = pkvm_apic_init_signal_blocked, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; + +bool 
pkvm_interrupt_blocked(struct kvm_vcpu *vcpu) +{ + return (pkvm_interrupt_allowed(vcpu, false) <= 0); +} diff --git a/arch/x86/kvm/vmx/pkvm_init.c b/arch/x86/kvm/vmx/pkvm_init.c index d2f9e0cb08ca..9262da87a57a 100644 --- a/arch/x86/kvm/vmx/pkvm_init.c +++ b/arch/x86/kvm/vmx/pkvm_init.c @@ -8,6 +8,8 @@ #include "pkvm_constants.h" #include "vmx.h" +extern u64 x86_pred_cmd; + static int __init early_pkvm_parse_cmdline(char *buf) { return kstrtobool(buf, &enable_pkvm); @@ -84,6 +86,11 @@ static __init void pkvm_setup_syms(void) pkvm_sym(nr_cpu_ids) = nr_cpu_ids; pkvm_sym(fpu_kernel_cfg) = fpu_kernel_cfg; pkvm_sym(fpu_user_cfg) = fpu_user_cfg; +#ifdef CONFIG_X86_64 + if (static_branch_unlikely(&__fpu_state_size_dynamic)) + static_branch_enable(&pkvm_sym(__fpu_state_size_dynamic)); +#endif + pkvm_sym(x86_pred_cmd) = x86_pred_cmd; } static __init int pkvm_setup_host_vmcs_config(void) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 4a6d9a17da23..6f89de0ba8fc 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -241,7 +241,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) */ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || - (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) + (!enable_pkvm && !is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)) || + (enable_pkvm && !pkvm_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); else pi_set_sn(pi_desc); diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bc255d709d8a..5ab72724187d 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -303,9 +303,11 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) +#ifndef __PKVM_HYP__ SYM_FUNC_START(vmx_do_nmi_irqoff) VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx SYM_FUNC_END(vmx_do_nmi_irqoff) +#endif #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a8fd3bc1e8d5..1c7d47dad3fb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -99,10 +99,8 @@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -#ifndef __PKVM_HYP__ static bool __read_mostly enable_vnmi = 1; module_param_named(vnmi, enable_vnmi, bool, 0444); -#endif bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, 0444); @@ -117,10 +115,10 @@ module_param_named(unrestricted_guest, bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, 0444); -#ifndef __PKVM_HYP__ static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, 0444); +#ifndef __PKVM_HYP__ static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); @@ -147,26 +145,25 @@ module_param(error_on_inconsistent_vmcs_config, bool, 0444); #ifndef __PKVM_HYP__ static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); +#endif /* !__PKVM_HYP__ */ #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 -#endif /* !__PKVM_HYP__ */ #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL #ifndef __PKVM_HYP__ /* Guest_tsc -> host_tsc conversion requires 64-bit division. 
*/ static int __read_mostly cpu_preemption_timer_multi; +#endif /* !__PKVM_HYP__ */ static bool __read_mostly enable_preemption_timer = 1; #ifdef CONFIG_X86_64 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif -#endif /* !__PKVM_HYP__ */ extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); -#ifndef __PKVM_HYP__ #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ @@ -178,6 +175,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#ifndef __PKVM_HYP__ #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ @@ -371,6 +369,7 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +#endif /* !__PKVM_HYP__ */ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { @@ -424,16 +423,17 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vmx->disable_fb_clear = false; } +#ifndef __PKVM_HYP__ static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); +#endif /* !__PKVM_HYP__ */ static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); -#endif /* !__PKVM_HYP__ */ #define vmx_insn_failed(fmt...) \ do { \ @@ -490,7 +490,9 @@ noinline void invept_error(unsigned long ext, u64 eptp) #ifndef __PKVM_HYP__ static DEFINE_PER_CPU(struct vmcs *, vmxarea); +#endif DEFINE_PER_CPU(struct vmcs *, current_vmcs); +#ifndef __PKVM_HYP__ /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. @@ -508,7 +510,6 @@ static DEFINE_PKVM_SPINLOCK(vmx_vpid_lock); struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability vmx_capability __ro_after_init; -#ifndef __PKVM_HYP__ #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ @@ -533,8 +534,6 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -#endif /* !__PKVM_HYP__ */ - static unsigned long host_idt_base; #ifndef __PKVM_HYP__ @@ -674,12 +673,10 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) return false; } -#ifndef __PKVM_HYP__ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); } -#endif /* !__PKVM_HYP__ */ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { @@ -691,14 +688,21 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) return NULL; } -#ifndef __PKVM_HYP__ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) { unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; +#ifndef __PKVM_HYP__ if (msr->load_into_hardware) { +#else + /* + * The host may use set_msr PV interface to access uret MSRs and in this + * case, the uret MSRs are not loaded to the hardware. 
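+	 * Only touch the hardware MSR when the guest's uret MSR values are
+	 * currently loaded.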
+ */ + if (msr->load_into_hardware && vmx->guest_uret_msrs_loaded) { +#endif preempt_disable(); ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); @@ -708,6 +712,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } +#ifndef __PKVM_HYP__ /* * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) * @@ -756,6 +761,7 @@ void vmx_emergency_disable_virtualization_cpu(void) kvm_cpu_vmxoff(); } +#endif /* !__PKVM_HYP__ */ static void __loaded_vmcs_clear(void *arg) { @@ -771,6 +777,7 @@ static void __loaded_vmcs_clear(void *arg) if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) vmcs_clear(loaded_vmcs->shadow_vmcs); +#ifndef __PKVM_HYP__ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* @@ -781,11 +788,13 @@ static void __loaded_vmcs_clear(void *arg) * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); +#endif loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; } +#ifndef __PKVM_HYP__ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; @@ -794,6 +803,7 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) smp_call_function_single(cpu, __loaded_vmcs_clear, loaded_vmcs, 1); } +#endif /* !__PKVM_HYP__ */ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, unsigned field) @@ -911,6 +921,7 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +#ifndef __PKVM_HYP__ /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ @@ -943,6 +954,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) return flags; } +#endif /* !__PKVM_HYP__ */ static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) @@ -1133,6 +1145,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return true; } +#ifndef __PKVM_HYP__ #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the @@ -1375,6 +1388,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } +#endif /* !__PKVM_HYP__ */ #ifdef CONFIG_X86_64 static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache) @@ -1409,6 +1423,7 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif +#ifndef __PKVM_HYP__ static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1424,6 +1439,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) vmx->ple_window, old); } } +#endif /* !__PKVM_HYP__ */ static void shrink_ple_window(struct kvm_vcpu *vcpu) { @@ -1448,6 +1464,15 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) struct vmcs *prev; if (!already_loaded) { +#ifdef __PKVM_HYP__ + /* + * pkvm doesn't support smp call thus doesn't support clear vmcs + * on a remote CPU. Suppose this vmcs is already cleared by + * vmx_vcpu_put, otherwise it cannot be loaded on this CPU. 
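+		 * (vmx_vcpu_put() clears the VMCS synchronously via
+		 * __loaded_vmcs_clear() on the CPU it was loaded on.)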
+ */ + if (WARN_ON_ONCE(vmx->loaded_vmcs->cpu != -1)) + return; +#else loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); @@ -1462,6 +1487,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); +#endif } prev = per_cpu(current_vmcs, cpu); @@ -1471,6 +1497,25 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { +#ifdef __PKVM_HYP__ + struct desc_ptr gdt; + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + vmcs_writel(HOST_TR_BASE, pkvm_pcpu_tss(cpu)); + + native_store_gdt(&gdt); + vmcs_writel(HOST_GDTR_BASE, gdt.address); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + unsigned long msr = __rdmsr(MSR_IA32_SYSENTER_ESP); + + vmcs_writel(HOST_IA32_SYSENTER_ESP, msr); + } +#else void *gdt = get_current_gdt_ro(); /* @@ -1492,6 +1537,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); } +#endif vmx->loaded_vmcs->cpu = cpu; } @@ -1508,14 +1554,31 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_load_vmcs(vcpu, cpu); +#ifndef __PKVM_HYP__ vmx_vcpu_pi_load(vcpu, cpu); +#endif } void vmx_vcpu_put(struct kvm_vcpu *vcpu) { +#ifndef __PKVM_HYP__ vmx_vcpu_pi_put(vcpu); vmx_prepare_switch_to_host(to_vmx(vcpu)); +#else + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * The pKVM hypervisor lacks an smp call mechanism to notify remote CPUs + * to clear VMCS when a vCPU is migrated. Therefore, VMCS clearing is + * performed immediately during the vcpu_put operation. + */ + if (vmx->loaded_vmcs->cpu == -1 || + WARN_ON_ONCE(vmx->loaded_vmcs->cpu != raw_smp_processor_id())) + return; + + __loaded_vmcs_clear(vmx->loaded_vmcs); +#endif } bool vmx_emulation_required(struct kvm_vcpu *vcpu) @@ -1604,6 +1667,7 @@ void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +#ifndef __PKVM_HYP__ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1866,7 +1930,6 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -#ifndef __PKVM_HYP__ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, bool load_into_hardware) { @@ -1922,6 +1985,7 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) vmx->guest_uret_msrs_loaded = false; } +#ifndef __PKVM_HYP__ u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -1942,6 +2006,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } +#endif /* !__PKVM_HYP__ */ void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { @@ -1991,6 +2056,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } +#ifndef __PKVM_HYP__ int vmx_get_feature_msr(u32 msr, u64 *data) { switch (msr) { @@ -2002,6 +2068,7 @@ int vmx_get_feature_msr(u32 msr, u64 *data) return KVM_MSR_RET_UNSUPPORTED; } } +#endif /* !__PKVM_HYP__ */ /* * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 
@@ -2012,7 +2079,9 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; +#ifndef __PKVM_HYP__ u32 index; +#endif switch (msr_info->index) { #ifdef CONFIG_X86_64 @@ -2085,7 +2154,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) return 1; -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V @@ -2098,6 +2167,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data); #endif break; +#ifndef __PKVM_HYP__ case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) return 1; @@ -2143,6 +2213,17 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; +#else + /* The pKVM doesn't support PT guest mode. */ + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + WARN_ON_ONCE(vmx_pt_mode_is_host_guest()); + return 1; +#endif case MSR_IA32_S_CET: msr_info->data = vmcs_readl(GUEST_S_CET); break; @@ -2186,9 +2267,11 @@ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; +#ifndef __PKVM_HYP__ if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; +#endif if (boot_cpu_has(X86_FEATURE_RTM) && (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) @@ -2221,7 +2304,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; +#ifndef __PKVM_HYP__ u32 index; +#endif switch (msr_index) { case MSR_EFER: @@ -2289,9 +2374,15 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_guest_debugctl_write(vcpu, data); + /* + * The pKVM doesn't support guest PMU emulation. Disabling this + * code to avoid importing unnecessary symbols. + */ +#ifndef __PKVM_HYP__ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); +#endif return 0; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || @@ -2411,6 +2502,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); + + /* The pKVM hypervisor doesn't support PT guest mode. */ +#ifndef __PKVM_HYP__ case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || @@ -2470,6 +2564,17 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; +#else + /* The pKVM hypervisor doesn't support PT guest mode. */ + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: + WARN_ON_ONCE(vmx_pt_mode_is_host_guest()); + return 1; +#endif case MSR_IA32_S_CET: vmcs_writel(GUEST_S_CET, data); break; @@ -2480,6 +2585,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_INTR_SSP_TABLE, data); break; case MSR_IA32_PERF_CAPABILITIES: +#ifndef __PKVM_HYP__ if (data & PERF_CAP_LBR_FMT) { if ((data & PERF_CAP_LBR_FMT) != (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) @@ -2500,7 +2606,13 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - +#else + /* + * The pKVM hypervisor doesn't support emulating PMU for guest + * thus also the IA32_PER_CAPABILITIES. + */ + return KVM_MSR_RET_UNSUPPORTED; +#endif default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_index); @@ -2559,7 +2671,6 @@ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) break; } } -#endif /* !__PKVM_HYP__ */ /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID @@ -3129,6 +3240,7 @@ static __init int alloc_kvm_area(void) } return 0; } +#endif /* !__PKVM_HYP__ */ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) @@ -3516,12 +3628,14 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->vt.emulation_required = vmx_emulation_required(vcpu); } +#ifndef __PKVM_HYP__ static int vmx_get_max_ept_level(void) { if (cpu_has_vmx_ept_5levels()) return 5; return 4; } +#endif /* !__PKVM_HYP__ */ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { @@ -3593,7 +3707,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ +#ifndef __PKVM_HYP__ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); +#else + hw_cr4 = (native_read_cr4() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); +#endif if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) @@ -4004,6 +4122,7 @@ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) return true; } +#ifndef __PKVM_HYP__ static int init_rmode_tss(struct kvm *kvm, void __user *ua) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); @@ -4066,6 +4185,7 @@ static int init_rmode_identity_map(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); return r; } +#endif /* !__PKVM_HYP__ */ static void seg_setup(int seg) { @@ -4081,7 +4201,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, ar); } -#endif /* !__PKVM_HYP__ */ int allocate_vpid(void) { @@ -4167,7 +4286,6 @@ void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool se } } -#ifndef __PKVM_HYP__ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { /* @@ -4228,6 +4346,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) } } +#ifndef __PKVM_HYP__ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4398,6 +4517,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, trig_mode, vector); } } +#endif /* !__PKVM_HYP__ */ /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that @@ -4424,7 +4544,11 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. 
*/ +#ifdef __PKVM_HYP__ + cr4 = native_read_cr4(); +#else cr4 = cr4_read_shadow(); +#endif vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->loaded_vmcs->host_state.cr4 = cr4; @@ -4786,7 +4910,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) return exec_control; } -#endif /* !__PKVM_HYP__ */ static inline int vmx_get_pid_table_order(struct kvm *kvm) { @@ -4865,7 +4988,6 @@ int vmx_vcpu_precreate(struct kvm *kvm) return vmx_alloc_ipiv_pid_table(kvm); } -#ifndef __PKVM_HYP__ #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) @@ -4897,6 +5019,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { +#ifdef __PKVM_HYP__ + struct vcpu_vmx *shared_vmx = to_vmx(to_pkvm_vcpu(&vmx->vcpu)->shared_vcpu); +#endif vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4905,7 +5030,17 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); +#ifndef __PKVM_HYP__ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); +#else + /* + * The pKVM hypervisor needs to use the pi_desc from the shared + * vmx to set the POSTED_INTR_DESC_ADDR as the host will post + * the virtual interrupt to the guest via its pi_desc. + */ + vmcs_write64(POSTED_INTR_DESC_ADDR, + __pa(kern_pkvm_va(&shared_vmx->vt.pi_desc))); +#endif } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -5038,7 +5173,10 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->msr_ia32_umwait_control = 0; vmx->hv_deadline_tsc = -1; + /* The host VMM handles the virtual APIC for the guest. */ +#ifndef __PKVM_HYP__ kvm_set_cr8(vcpu, 0); +#endif seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); @@ -5085,20 +5223,21 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_cpu_cap_has(X86_FEATURE_SHSTK)) vmcs_writel(GUEST_S_CET, 0); + /* The host VMM handles the virtual APIC for the guest. 
*/ +#ifndef __PKVM_HYP__ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); +#endif vpid_sync_context(vmx->vpid); vmx_update_fb_clear_dis(vcpu, vmx); } -#endif /* !__PKVM_HYP__ */ void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -#ifndef __PKVM_HYP__ void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || @@ -5119,6 +5258,7 @@ void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; +#ifndef __PKVM_HYP__ if (vmx->rmode.vm86_active) { int inc_eip = 0; if (vcpu->arch.interrupt.soft) @@ -5126,6 +5266,7 @@ void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } +#endif intr = irq | INTR_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { intr |= INTR_TYPE_SOFT_INTR; @@ -5158,10 +5299,12 @@ void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; vmx->loaded_vmcs->nmi_known_unmasked = false; +#ifndef __PKVM_HYP__ if (vmx->rmode.vm86_active) { kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } +#endif vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); @@ -5258,6 +5401,7 @@ int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_interrupt_blocked(vcpu); } +#ifndef __PKVM_HYP__ int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; @@ -5786,6 +5930,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) out: return kvm_complete_insn_gp(vcpu, err); } +#endif /* !__PKVM_HYP__ */ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { @@ -5811,6 +5956,7 @@ void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) vmcs_writel(GUEST_DR7, val); } +#ifndef __PKVM_HYP__ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); @@ -6929,6 +7075,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) [size] "r" (size) : "eax", "ebx", "ecx", "edx"); } +#endif /* !__PKVM_HYP__ */ void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { @@ -6964,6 +7111,17 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } +#ifdef __PKVM_HYP__ + /* + * Emulating xapic mode requires instruction decoding. As pVM's CPU and + * memory state are isolated from the host, the host cannot decode pVM's + * instruction. Not to use xapic mode for a pVM. + */ + if (pkvm_is_protected_vcpu(vcpu) && + (kvm_get_apic_mode(vcpu) == LAPIC_MODE_XAPIC)) + return; +#endif + sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -7000,6 +7158,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap_x2apic(vcpu); } +#ifndef __PKVM_HYP__ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; @@ -7070,6 +7229,7 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) */ read_unlock(&vcpu->kvm->mmu_lock); } +#endif /* !__PKVM_HYP__ */ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { @@ -7128,6 +7288,7 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { +#ifndef __PKVM_HYP__ struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; @@ -7164,11 +7325,19 @@ int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * attempt to post interrupts. 
The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr. */ - if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) - vmx_set_rvi(max_irr); - else if (got_posted_interrupt) + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) { + if (!enable_pkvm) + vmx_set_rvi(max_irr); + else if (max_irr != -1) + KVM_BUG_ON(pkvm_hypercall(sync_pir_to_irr, max_irr), vcpu->kvm); + } else if (got_posted_interrupt) { kvm_make_request(KVM_REQ_EVENT, vcpu); + } +#else + int max_irr = to_pkvm_vcpu(vcpu)->max_irr; + vmx_set_rvi(max_irr); +#endif return max_irr; } @@ -7183,6 +7352,7 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +#ifndef __PKVM_HYP__ void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -7319,6 +7489,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->loaded_vmcs->entry_time)); } +#endif /* !__PKVM_HYP__ */ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, u32 idt_vectoring_info, @@ -7378,12 +7549,14 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, } } +#ifndef __PKVM_HYP__ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); } +#endif /* !__PKVM_HYP__ */ void vmx_cancel_injection(struct kvm_vcpu *vcpu) { @@ -7395,6 +7568,7 @@ void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } +#ifndef __PKVM_HYP__ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; @@ -7443,6 +7617,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit vmx->loaded_vmcs->hv_timer_soft_disabled = true; } } +#endif /* !__PKVM_HYP__ */ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { @@ -7451,7 +7626,6 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) vmcs_writel(HOST_RSP, host_rsp); } } -#endif /* !__PKVM_HYP__ */ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags) @@ -7954,6 +8128,7 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); } +#endif /* !__PKVM_HYP__ */ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { @@ -7974,6 +8149,7 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); } +#ifndef __PKVM_HYP__ /* * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits * (indicating "allowed-1") if they are supported in the guest's CPUID. 
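In the vmx_sync_pir_to_irr() hunk above, the work is split: the host side computes max_irr from the posted-interrupt state and hands it to the pKVM hypervisor through the sync_pir_to_irr hypercall, while the __PKVM_HYP__ side only programs RVI from to_pkvm_vcpu(vcpu)->max_irr. As a standalone illustration of what "highest pending vector" means for a 256-bit posted-interrupt request bitmap, here is a minimal sketch; toy_pir_max_irr() and the PIR_WORDS layout are assumptions for the example, not the kernel's implementation.

#include <stdint.h>
#include <stdio.h>

#define PIR_WORDS 4	/* 4 x 64 bits covers interrupt vectors 0-255 */

/* Return the highest pending vector in the bitmap, or -1 if none. */
static int toy_pir_max_irr(const uint64_t pir[PIR_WORDS])
{
	for (int w = PIR_WORDS - 1; w >= 0; w--) {
		if (!pir[w])
			continue;
		/* __builtin_clzll counts leading zeros (GCC/Clang builtin) */
		return w * 64 + (63 - __builtin_clzll(pir[w]));
	}
	return -1;
}

int main(void)
{
	uint64_t pir[PIR_WORDS] = { 0 };

	pir[0] |= 1ull << 0x20;	/* vector 0x20 pending */
	pir[1] |= 1ull << 7;	/* vector 0x47 pending */

	printf("max_irr = 0x%x\n", toy_pir_max_irr(pir));	/* prints 0x47 */
	return 0;
}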
@@ -8091,6 +8267,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } +#endif /* !__PKVM_HYP__ */ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { @@ -8119,12 +8296,15 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); + /* The pKVM hypervisor has disabled nested and PT */ +#ifndef __PKVM_HYP__ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); +#endif if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; @@ -8154,6 +8334,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } +#ifndef __PKVM_HYP__ static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PERF_CAP_FW_WRITES; @@ -8502,6 +8683,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } +#endif /* !__PKVM_HYP__ */ void vmx_setup_mce(struct kvm_vcpu *vcpu) { @@ -8513,6 +8695,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } +#ifndef __PKVM_HYP__ #ifdef CONFIG_KVM_SMM int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 69d7de8353ac..047e092f47ad 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -814,6 +814,12 @@ extern void *pkvm_sym(pkvm_vmx_init_ops); int pkvm_vmx_init(void); #endif +bool pkvm_interrupt_blocked(struct kvm_vcpu *vcpu); + +#else + +static inline bool pkvm_interrupt_blocked(struct kvm_vcpu *vcpu) { return false; } + #endif /* CONFIG_PKVM_INTEL */ #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 96677576c836..903822bce814 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -303,6 +303,24 @@ static inline void vmcs_load(struct vmcs *vmcs) vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); } +static inline u64 vmcs_store(void) +{ + u64 phys_addr = INVALID_PAGE; + + asm goto("1: vmptrst (%0)\n\t" + _ASM_EXTABLE(1b, %l[do_exception]) + : + : "r" (&phys_addr) + : "cc", "memory" + : do_exception); + + return phys_addr; + +do_exception: + kvm_spurious_fault(); + return INVALID_PAGE; +} + static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) { struct { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 13398d68a0aa..a7d223ffb76f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #ifdef __PKVM_HYP__ @@ -146,9 +147,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static DEFINE_MUTEX(vendor_module_lock); +#endif /* !__PKVM_HYP__ */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); -#endif /* !__PKVM_HYP__ */ struct kvm_x86_ops kvm_x86_ops __read_mostly; #ifndef __PKVM_HYP__ @@ -163,11 +164,13 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, 0644); +#endif /* !__PKVM_HYP__ */ bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, 0644); EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); +#ifndef 
__PKVM_HYP__ unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); @@ -177,11 +180,13 @@ module_param(kvmclock_periodic_sync, bool, 0444); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); +#endif /* !__PKVM_HYP__ */ bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor); +#ifndef __PKVM_HYP__ /* * Flags to manipulate forced emulation behavior (any non-zero value will * enable forced emulation). @@ -662,14 +667,15 @@ void kvm_user_return_msr_cpu_online(void) } } -#ifndef __PKVM_HYP__ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) { +#ifndef __PKVM_HYP__ if (!msrs->registered) { msrs->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&msrs->urn); msrs->registered = true; } +#endif } int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) @@ -690,6 +696,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); +#ifndef __PKVM_HYP__ void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -938,6 +945,7 @@ static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, { kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +#endif /* !__PKVM_HYP__ */ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, bool has_error_code, u32 error_code) @@ -968,6 +976,7 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception); +#ifndef __PKVM_HYP__ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -1271,6 +1280,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); +#endif /* !__PKVM_HYP__ */ #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) @@ -1325,6 +1335,7 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr); +#ifndef __PKVM_HYP__ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. 
*/ @@ -1532,6 +1543,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) return vcpu->arch.cr8; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); +#endif /* !__PKVM_HYP__ */ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { @@ -1558,6 +1570,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); +#ifndef __PKVM_HYP__ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; @@ -1631,6 +1644,7 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); +#endif /* !__PKVM_HYP__ */ /* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM @@ -1651,7 +1665,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) -static u64 kvm_get_arch_capabilities(void) +u64 kvm_get_arch_capabilities(void) { u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; @@ -1663,6 +1677,7 @@ static u64 kvm_get_arch_capabilities(void) */ data |= ARCH_CAP_PSCHANGE_MC_NO; +#ifndef __PKVM_HYP__ /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. @@ -1674,6 +1689,18 @@ static u64 kvm_get_arch_capabilities(void) */ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; +#else + /* + * CPUs that can run the pKVM hypervisor don't have the L1TF CPU bug. + * This is guaranteed by pkvm_mitigate_cpu_bugs(), which currently + * doesn't mitigate L1TF and thus would fail pKVM initialization if L1TF + * were present, so we can set ARCH_CAP_SKIP_VMENTRY_L1DFLUSH for the + * guest. As the pKVM hypervisor doesn't support nested virtualization, + * passing this cap to the guest is not strictly necessary, but pass it + * anyway in case nested support is added in the future. + */ + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; +#endif if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) data |= ARCH_CAP_RDCL_NO; @@ -1705,12 +1732,23 @@ static u64 kvm_get_arch_capabilities(void) */ } +#ifndef __PKVM_HYP__ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) data |= ARCH_CAP_GDS_NO; +#else + /* + * CPUs that can run the pKVM hypervisor don't have the GDS bug. This + * is guaranteed by pkvm_mitigate_cpu_bugs(), which currently doesn't + * mitigate GDS and thus would fail pKVM initialization if GDS were + * present, so we can set ARCH_CAP_GDS_NO. + */ + data |= ARCH_CAP_GDS_NO; +#endif return data; } +#ifndef __PKVM_HYP__ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1740,6 +1778,7 @@ static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, kvm_get_feature_msr); } +#endif /* !__PKVM_HYP__ */ static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { @@ -1773,7 +1812,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { +#ifndef __PKVM_HYP__ u64 old_efer = vcpu->arch.efer; +#endif u64 efer = msr_info->data; int r; @@ -1798,16 +1839,18 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return r; } + /* TODO: Notify the host VMM to reset the KVM MMU.
*/ +#ifndef __PKVM_HYP__ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); if (!static_cpu_has(X86_FEATURE_XSAVES) && (efer & EFER_SVME)) kvm_hv_xsaves_xsavec_maybe_warn(vcpu); +#endif return 0; } -#endif /* !__PKVM_HYP__ */ void kvm_enable_efer_bits(u64 mask) { @@ -1867,7 +1910,6 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); -#ifndef __PKVM_HYP__ /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1974,6 +2016,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_call(set_msr)(vcpu, &msr); } +#ifndef __PKVM_HYP__ static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1986,6 +2029,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, _kvm_set_msr); } +#endif /* !__PKVM_HYP__ */ /* * Read the MSR specified by @index into @data. Select MSR specific fault @@ -2044,6 +2088,7 @@ int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) return __kvm_get_msr(vcpu, index, data, true); } +#ifndef __PKVM_HYP__ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -3586,6 +3631,7 @@ static void kvmclock_sync_fn(struct work_struct *work) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } +#endif /* !__PKVM_HYP__ */ /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) @@ -3681,6 +3727,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } +#ifndef __PKVM_HYP__ static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; @@ -3962,12 +4009,14 @@ static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); } +#endif /* !__PKVM_HYP__ */ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; u64 data = msr_info->data; +#ifndef __PKVM_HYP__ /* * Do not allow host-initiated writes to trigger the Xen hypercall * page setup; it could incur locking paths which are not expected @@ -3976,6 +4025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); +#endif switch (msr) { case MSR_AMD64_NB_CFG: @@ -4015,7 +4065,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; vcpu->arch.perf_capabilities = data; + /* + * The pkvm hypervisor doesn't provide X86_FEATURE_PDCM to the + * guest thus no need to do PMU refresh. + */ +#ifndef __PKVM_HYP__ kvm_pmu_refresh(vcpu); +#endif break; case MSR_IA32_PRED_CMD: { u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); @@ -4092,6 +4148,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); +#ifndef __PKVM_HYP__ case MSR_IA32_APICBASE: return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: @@ -4112,6 +4169,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; +#endif case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; @@ -4144,6 +4202,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: vcpu->arch.msr_ia32_power_ctl = data; break; +#ifndef __PKVM_HYP__ case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); @@ -4153,6 +4212,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr += adj; } break; +#endif case MSR_IA32_XSS: if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return KVM_MSR_RET_UNSUPPORTED; @@ -4169,6 +4229,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smi_count = data; break; +#ifndef __PKVM_HYP__ case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; @@ -4253,6 +4314,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_kvm_poll_control = data; break; +#endif case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -4264,8 +4326,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); +#endif if (data) kvm_pr_unimpl_wrmsr(vcpu, msr, data); @@ -4280,7 +4344,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4344,13 +4408,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + /* + * These CET related MSRs are passed-through to the guest, and this code + * is not for the KVM to emulate rdmsr/wrmsr instruction, but for the + * KVM (or the userspace VMM) to access the guest CET MSRs for managing + * the guest FPU state, or emulating some other instructions (e.g., task + * switch). For a pVM these MSRs are inaccessible to the host anyway and + * the pKVM hypervisor itself doesn't need to access them either. + */ +#ifndef __PKVM_HYP__ case MSR_IA32_U_CET: case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: kvm_set_xstate_msr(vcpu, msr_info); break; +#endif default: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); +#endif return KVM_MSR_RET_UNSUPPORTED; } @@ -4446,8 +4522,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... 
MSR_P6_EVNTSEL1: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); +#endif msr_info->data = 0; break; case MSR_IA32_UCODE_REV: @@ -4466,6 +4544,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; +#ifndef __PKVM_HYP__ case MSR_IA32_TSC: { /* * Intel SDM states that MSR_IA32_TSC read adds the TSC offset @@ -4489,6 +4568,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } +#endif case MSR_IA32_CR_PAT: msr_info->data = vcpu->arch.pat; break; @@ -4513,6 +4593,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EBC_FREQUENCY_ID: msr_info->data = 1 << 24; break; +#ifndef __PKVM_HYP__ case MSR_IA32_APICBASE: msr_info->data = vcpu->arch.apic_base; break; @@ -4524,6 +4605,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; +#endif case MSR_IA32_MISC_ENABLE: msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; @@ -4544,6 +4626,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: msr_info->data = vcpu->arch.efer; break; +#ifndef __PKVM_HYP__ case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; @@ -4604,6 +4687,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.msr_kvm_poll_control; break; +#endif case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -4631,7 +4715,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4697,13 +4781,25 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; #endif + /* + * These CET related MSRs are passed-through to the guest, and this code + * is not for the KVM to emulate rdmsr/wrmsr instruction, but for the + * KVM (or the userspace VMM) to access the guest CET MSRs for managing + * the guest FPU state, or emulating some other instructions (e.g., task + * switch). For a pVM these MSRs are inaccessible to the host anyway and + * the pKVM hypervisor itself doesn't need to access them either. + */ +#ifndef __PKVM_HYP__ case MSR_IA32_U_CET: case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: kvm_get_xstate_msr(vcpu, msr_info); break; +#endif default: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); +#endif return KVM_MSR_RET_UNSUPPORTED; } @@ -4711,6 +4807,57 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); +#ifdef CONFIG_PKVM_X86 +bool pkvm_host_has_emulated_msr(struct kvm *kvm, u32 msr) +{ + switch (msr) { + case MSR_KVM_WALL_CLOCK: + case MSR_KVM_WALL_CLOCK_NEW: + case MSR_KVM_SYSTEM_TIME: + case MSR_KVM_SYSTEM_TIME_NEW: + case MSR_KVM_ASYNC_PF_EN: + case MSR_KVM_ASYNC_PF_INT: + case MSR_KVM_ASYNC_PF_ACK: + case MSR_KVM_STEAL_TIME: + case MSR_KVM_PV_EOI_EN: + case MSR_KVM_POLL_CONTROL: +#if defined(CONFIG_KVM_HYPERV) + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: +#endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (pkvm_is_protected_vm(kvm)) + return false; + fallthrough; + case MSR_IA32_TSC_ADJUST: + case MSR_IA32_TSC: + case MSR_IA32_APICBASE: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + case MSR_IA32_TSC_DEADLINE: + return true; + default: + /* + * All other emulated MSRs are directly emulated by the pKVM + * hypervisor. + */ + break; + } + + return false; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(pkvm_host_has_emulated_msr); +#endif + +#ifndef __PKVM_HYP__ /* * Read or write a bunch of msrs. All parameters are kernel addresses. * @@ -5461,9 +5608,9 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, vcpu->arch.tpr_access_reporting = !!tac->enabled; return 0; } +#endif /* !__PKVM_HYP__ */ -static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, - u64 mcg_cap) +int kvm_vcpu_x86_setup_mce(struct kvm_vcpu *vcpu, u64 mcg_cap) { int r; unsigned bank_num = mcg_cap & 0xff, bank; @@ -5492,6 +5639,13 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, return r; } +#ifndef __PKVM_HYP__ +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + return kvm_vcpu_x86_setup_mce(vcpu, mcg_cap); +} + /* * Validate this is an UCNA (uncorrectable no action) error by checking the * MCG_STATUS and MCi_STATUS registers: @@ -8509,12 +8663,14 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, { return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); } +#endif /* !__PKVM_HYP__ */ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_call(get_segment_base)(vcpu, seg); } +#ifndef __PKVM_HYP__ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); @@ -11828,6 +11984,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; } +#endif /* !__PKVM_HYP__ */ /* Swap (qemu) user FPU context for the guest FPU context. 
*/ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) @@ -11851,6 +12008,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } +#ifndef __PKVM_HYP__ static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) { /* @@ -12806,6 +12964,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); } +#endif /* !__PKVM_HYP__ */ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) { @@ -12821,6 +12980,15 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event || !fpstate) return; +#ifdef __PKVM_HYP__ + /* + * An npVM's FPU state is managed by the host, so the pKVM hypervisor + * doesn't need to reset it here. + */ + if (!pkvm_is_protected_vcpu(vcpu)) + return; +#endif + /* * On INIT, only select XSTATE components are zeroed, most components * are unchanged. Currently, the only components that are zeroed and @@ -12879,7 +13047,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (is_guest_mode(vcpu)) kvm_leave_nested(vcpu); + /* The virtual APIC is emulated by the host rather than by the pKVM hypervisor. */ +#ifndef __PKVM_HYP__ kvm_lapic_reset(vcpu, init_event); +#endif WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); vcpu->arch.hflags = 0; @@ -12905,11 +13076,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; + /* + * In the pKVM hypervisor build, kvmclock and async_pf are emulated by + * the host. + */ +#ifndef __PKVM_HYP__ kvmclock_reset(vcpu); kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; +#endif kvm_xstate_reset(vcpu, init_event); @@ -12974,7 +13151,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) { kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + /* The host will reset the KVM MMU context.
*/ +#ifndef __PKVM_HYP__ kvm_mmu_reset_context(vcpu); +#endif } /* @@ -12991,6 +13171,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset); +#ifndef __PKVM_HYP__ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { struct kvm_segment cs; @@ -13679,6 +13860,7 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { return kvm_x86_call(interrupt_allowed)(vcpu, false); } +#endif /* !__PKVM_HYP__ */ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { @@ -13725,6 +13907,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); +#ifndef __PKVM_HYP__ static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); @@ -13998,6 +14181,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) } #endif #endif +#endif /* !__PKVM_HYP__ */ int kvm_spec_ctrl_test_value(u64 value) { @@ -14007,10 +14191,14 @@ int kvm_spec_ctrl_test_value(u64 value) */ u64 saved_value; +#ifndef __PKVM_HYP__ unsigned long flags; +#endif int ret = 0; +#ifndef __PKVM_HYP__ local_irq_save(flags); +#endif if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; @@ -14019,12 +14207,15 @@ int kvm_spec_ctrl_test_value(u64 value) else wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); +#ifndef __PKVM_HYP__ local_irq_restore(flags); +#endif return ret; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); +#ifndef __PKVM_HYP__ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 416f8570bb14..8a6dad689b03 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -737,6 +737,7 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); void kvm_user_return_msr_cpu_online(void); +u64 kvm_get_arch_capabilities(void); #define CET_US_RESERVED_BITS GENMASK(9, 6) #define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index ff44ec346162..5f3302bf2cf4 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -134,8 +134,9 @@ struct ddebug_class_param { * pr_debug() and friends are globally enabled or modules have selectively * enabled them. 
*/ -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#if (defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))) && \ + (!defined(__PKVM_HYP__) || defined(CONFIG_PKVM_X86_DEBUG)) extern __printf(2, 3) void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 57b074e0cfbb..440f2dbe8335 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -39,7 +39,7 @@ struct task_struct; struct task_struct *idle) {} #endif -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(__PKVM_HYP__) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); @@ -127,7 +127,7 @@ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif -#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) && !defined(__PKVM_HYP__) # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..435701d5e1e5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -878,6 +878,18 @@ struct kvm { char stats_id[KVM_STATS_NAME_SIZE]; }; +#ifdef __PKVM_HYP__ +#define kvm_err(fmt, ...) \ + pr_err("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_info(fmt, ...) \ + pr_info("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_debug(fmt, ...) \ + pr_debug("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_debug_ratelimited(fmt, ...) \ + pr_debug_ratelimited("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_pr_unimpl(fmt, ...) \ + pr_err_ratelimited("pkvm: " fmt, ## __VA_ARGS__) +#else #define kvm_err(fmt, ...) \ pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_info(fmt, ...) \ @@ -890,6 +902,7 @@ struct kvm { #define kvm_pr_unimpl(fmt, ...) \ pr_err_ratelimited("kvm [%i]: " fmt, \ task_tgid_nr(current), ## __VA_ARGS__) +#endif /* The guest did something we don't support. */ #define vcpu_unimpl(vcpu, fmt, ...) \ @@ -907,7 +920,11 @@ struct kvm { static inline void kvm_vm_dead(struct kvm *kvm) { kvm->vm_dead = true; +#ifndef __PKVM_HYP__ kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); +#else + /* TODO: Handle VM dead in the pKVM. */ +#endif } static inline void kvm_vm_bugged(struct kvm *kvm) diff --git a/include/linux/printk.h b/include/linux/printk.h index 45c663124c9b..c15f1faf962c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -735,8 +735,9 @@ struct pi_entry { #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#if (defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))) && \ + (!defined(__PKVM_HYP__) || defined(CONFIG_PKVM_X86_DEBUG)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \