diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h index bcbf9bccf340..b9316fcf94b7 100644 --- a/arch/x86/include/asm/kvm_pkvm.h +++ b/arch/x86/include/asm/kvm_pkvm.h @@ -224,6 +224,7 @@ extern unsigned int pkvm_sym(nr_cpu_ids); DECLARE_STATIC_KEY_FALSE(pkvm_sym(switch_vcpu_ibpb)); extern struct fpu_state_config pkvm_sym(fpu_kernel_cfg); extern struct fpu_state_config pkvm_sym(fpu_user_cfg); +extern struct pkvm_init_ops *pkvm_sym(init_ops); u64 pkvm_total_reserve_pages(void); PKVM_DECLARE(void *, pkvm_early_alloc_page, (void)); diff --git a/arch/x86/include/asm/pkvm_hypercalls.h b/arch/x86/include/asm/pkvm_hypercalls.h index e4864e76136e..56caee1335e1 100644 --- a/arch/x86/include/asm/pkvm_hypercalls.h +++ b/arch/x86/include/asm/pkvm_hypercalls.h @@ -8,7 +8,9 @@ BUILD_BUG_ON(1) #endif /* Hypercalls used only during pKVM initialization */ +PKVM_HC(init) PKVM_HC(init_finalize) +PKVM_HC(reprivilege_cpu) /* pKVM vmexit tracing/profiling */ PKVM_HC(enable_vmexit_trace) diff --git a/arch/x86/kvm/pkvm/Makefile b/arch/x86/kvm/pkvm/Makefile index 408e4206c579..73bf41d626ea 100644 --- a/arch/x86/kvm/pkvm/Makefile +++ b/arch/x86/kvm/pkvm/Makefile @@ -13,7 +13,7 @@ asflags-y += -include $(srctree)/arch/x86/kvm/pkvm/undef.h cmd_deps := $(src)/compiling_cmds.sh pkvm-hyp-y := early_alloc.o pkvm.o memory.o cpu.o \ - idt.o entry.o init_finalize.o pgtable.o \ + idt.o entry.o init.o pgtable.o \ mmu.o page_alloc.o lapic.o trace.o \ fpu.o @@ -39,7 +39,7 @@ pkvm-hyp-y += $(kernel-lib)/sort.o $(kernel-lib)/bsearch.o \ kvm := .. pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o -pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o \ +pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o vmx/host_repriv.o \ $(kvm)/vmx/vmx.o vmx/idt.o vmx/ept.o \ $(kvm)/vmx/main.o diff --git a/arch/x86/kvm/pkvm/init_finalize.c b/arch/x86/kvm/pkvm/init.c similarity index 80% rename from arch/x86/kvm/pkvm/init_finalize.c rename to arch/x86/kvm/pkvm/init.c index d5489b6f6306..498686dd5535 100644 --- a/arch/x86/kvm/pkvm/init_finalize.c +++ b/arch/x86/kvm/pkvm/init.c @@ -3,7 +3,7 @@ #include #include "early_alloc.h" #include "fpu.h" -#include "init_finalize.h" +#include "init.h" #include "lapic.h" #include "memory.h" #include "mmu.h" @@ -13,7 +13,7 @@ static void *hyp_pgt_base; static void *host_pgt_base; static void *pkvm_vmemmap_base; -static DEFINE_PER_CPU(bool, cpu_finalized); +static DEFINE_PER_CPU(bool, cpu_initialized); static int divide_memory_pool(phys_addr_t phys, unsigned long size) { @@ -192,12 +192,14 @@ static int create_host_mmu(const struct pkvm_mem_info infos[], int nr_infos, return 0; } +/* Set by the host before deprivilege and used through the initialization process. */ +struct pkvm_init_ops *init_ops; + #define TMP_NR_INFOS 16 -static int finalize_global(struct pkvm_mem_info infos[], int nr_infos, - struct pkvm_init_ops *init_ops) +static int initialize_global(struct pkvm_mem_info infos[], int nr_infos) { host_mmu_init_fn_t host_mmu_init_fn = init_ops ? init_ops->host_mmu_init : NULL; - hyp_g_finalize_fn_t hyp_g_finalize = init_ops ? init_ops->hyp_g_finalize : NULL; + hyp_global_init_fn_t hyp_global_init = init_ops ? init_ops->hyp_global_init : NULL; struct pkvm_mem_info tmp_infos[TMP_NR_INFOS]; phys_addr_t mem_base = INVALID_PAGE; unsigned long mem_size = 0; @@ -238,33 +240,32 @@ static int finalize_global(struct pkvm_mem_info infos[], int nr_infos, if (ret) return ret; - return hyp_g_finalize ? hyp_g_finalize() : 0; + return hyp_global_init ? 
hyp_global_init() : 0;
 }
 
-int pkvm_init_finalize(struct pkvm_mem_info infos[], int nr_infos,
-			struct pkvm_init_ops *init_ops)
+int pkvm_init(struct pkvm_mem_info infos[], int nr_infos)
 {
 	hyp_mmu_finalize_fn_t hyp_mmu_finalize_fn = init_ops ? init_ops->hyp_mmu_finalize : NULL;
 	host_mmu_finalize_fn_t host_mmu_finalize_fn = init_ops ? init_ops->host_mmu_finalize : NULL;
-	static bool global_finalized;
+	static bool global_initialized;
 	int ret;
 
-	if (this_cpu_read(cpu_finalized))
+	if (this_cpu_read(cpu_initialized))
 		return -EBUSY;
 
-	if (!global_finalized) {
-		ret = finalize_global(infos, nr_infos, init_ops);
+	if (!global_initialized) {
+		ret = initialize_global(infos, nr_infos);
 		if (ret)
 			return ret;
-		global_finalized = true;
+		global_initialized = true;
 	} else {
 		/*
 		 * The pKVM hypervisor's MMU was already loaded on the first
-		 * finalized CPU during the global finalize. Need to load it
-		 * on all the other CPUs as well.
+		 * initialized CPU during the global initialization. It needs
+		 * to be loaded on all the other CPUs as well.
 		 */
 		pkvm_hyp_mmu_load();
 	}
@@ -285,6 +286,49 @@ int pkvm_init_finalize(struct pkvm_mem_info infos[], int nr_infos,
 	pkvm_vcpu_perf_init(this_cpu_read(host_vcpu));
 
-	this_cpu_write(cpu_finalized, true);
+	this_cpu_write(cpu_initialized, true);
+	return 0;
+}
+
+/*
+ * Flag indicating whether pKVM has been initialized successfully.
+ * Used to make the internal hypercalls unavailable for general
+ * use once pKVM is initialized.
+ */
+static bool pkvm_initialized __ro_after_init;
+
+int pkvm_reprivilege_vcpu(struct kvm_vcpu *vcpu)
+{
+	if (READ_ONCE(pkvm_initialized))
+		return -EPERM;
+
+	if (!init_ops || !init_ops->reprivilege_cpu)
+		return -EOPNOTSUPP;
+
+	init_ops->reprivilege_cpu(vcpu->arch.regs);
+
+	/* Reached only if the reprivilege operation fails. */
+	return -EFAULT;
+}
+
+int pkvm_init_finalize(void)
+{
+	int cpu;
+
+	if (READ_ONCE(pkvm_initialized))
+		return -EPERM;
+
+	for_each_possible_cpu(cpu) {
+		if (!per_cpu(cpu_initialized, cpu))
+			return -EAGAIN;
+	}
+	WRITE_ONCE(pkvm_initialized, true);
+	if (init_ops)
+		init_ops->reprivilege_cpu = NULL;
+	/*
+	 * TODO: Move the reprivilege logic to a separate
+	 * section and zero it out here.
+	 */
+	return 0;
 }
diff --git a/arch/x86/kvm/pkvm/init.h b/arch/x86/kvm/pkvm/init.h
new file mode 100644
index 000000000000..fa7275e1865d
--- /dev/null
+++ b/arch/x86/kvm/pkvm/init.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PKVM_X86_INIT_H
+#define __PKVM_X86_INIT_H
+
+#include
+#include "pgtable.h"
+
+typedef int (*hyp_mmu_finalize_fn_t)(struct pkvm_pgtable *pgt);
+typedef int (*host_mmu_init_fn_t)(struct pkvm_pgtable *pgt, void *pool_base,
+				  unsigned long pool_pages);
+typedef int (*host_mmu_finalize_fn_t)(struct pkvm_pgtable *pgt);
+typedef int (*hyp_global_init_fn_t)(void);
+typedef void (*reprivilege_cpu_fn_t)(unsigned long *vcpu_regs);
+
+/**
+ * pkvm_init_ops - The platform/vendor-specific pKVM init operations used by
+ *                 pkvm_init(). Any operation may be NULL if it is not needed.
+ *
+ * @hyp_mmu_finalize: Finalize the hypervisor mmu.
+ * @host_mmu_init: Initialize the host mmu.
+ * @host_mmu_finalize: Finalize the host mmu.
+ * @hyp_global_init: Initialize the hypervisor globally.
+ * @reprivilege_cpu: Switch the CPU back to root mode. Called if deprivileging
+ *                   or pKVM initialization fails.
+ */ +struct pkvm_init_ops { + hyp_mmu_finalize_fn_t hyp_mmu_finalize; + host_mmu_init_fn_t host_mmu_init; + host_mmu_finalize_fn_t host_mmu_finalize; + hyp_global_init_fn_t hyp_global_init; + reprivilege_cpu_fn_t reprivilege_cpu; +}; + +int pkvm_init(struct pkvm_mem_info infos[], int nr_info); +int pkvm_init_finalize(void); +int pkvm_reprivilege_vcpu(struct kvm_vcpu *vcpu); + +#endif /* __PKVM_X86_INIT_H */ diff --git a/arch/x86/kvm/pkvm/init_finalize.h b/arch/x86/kvm/pkvm/init_finalize.h deleted file mode 100644 index d41ba51eb7bd..000000000000 --- a/arch/x86/kvm/pkvm/init_finalize.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PKVM_X86_INIT_FINALIZE_H -#define __PKVM_X86_INIT_FINALIZE_H - -#include -#include "pgtable.h" - -typedef int (*hyp_mmu_finalize_fn_t)(struct pkvm_pgtable *pgt); -typedef int (*host_mmu_init_fn_t)(struct pkvm_pgtable *pgt, void *pool_base, - unsigned long pool_pages); -typedef int (*host_mmu_finalize_fn_t)(struct pkvm_pgtable *pgt); -typedef int (*hyp_g_finalize_fn_t)(void); - -/** - * pkvm_init_ops - The platform vendor specific pKVM finalize operations used by - * the pkvm_init_finalize. Some operation could be NULL if it is - * not necessary. - * - * @hyp_mmu_finalize: Finalize the hypervisor mmu. - * @host_mmu_init: Initialize the host mmu. - * @host_mmu_finalize: Finalize the host mmu. - * @hyp_g_finalize: Finalize the hypervisor globally. - */ -struct pkvm_init_ops { - hyp_mmu_finalize_fn_t hyp_mmu_finalize; - host_mmu_init_fn_t host_mmu_init; - host_mmu_finalize_fn_t host_mmu_finalize; - hyp_g_finalize_fn_t hyp_g_finalize; -}; - -int pkvm_init_finalize(struct pkvm_mem_info infos[], int nr_info, - struct pkvm_init_ops *init_ops); - -#endif /* __PKVM_X86_INIT_FINALIZE_H */ diff --git a/arch/x86/kvm/pkvm/lapic.c b/arch/x86/kvm/pkvm/lapic.c index c779699804d0..71c69db822b6 100644 --- a/arch/x86/kvm/pkvm/lapic.c +++ b/arch/x86/kvm/pkvm/lapic.c @@ -61,9 +61,9 @@ void pkvm_lapic_send_init(int cpu) /* * Pairs with the smp_store_release() in the setup_lapic(). * If remote lapic is not ready, it means the remote CPU is not - * finalized yet. In this case, it is not necessary to send INIT to kick - * as this remote CPU will handle all the pending requests before being - * finalized. + * initialized yet(by pkvm_init()). In this case, it is not + * necessary to send INIT to kick as this remote CPU will handle + * all the pending requests before being initialized. 
*/ if (unlikely(!smp_load_acquire(&remote->ready))) return; diff --git a/arch/x86/kvm/pkvm/mmu.h b/arch/x86/kvm/pkvm/mmu.h index a3d8d1e7715b..ddedc5c92efe 100644 --- a/arch/x86/kvm/pkvm/mmu.h +++ b/arch/x86/kvm/pkvm/mmu.h @@ -3,7 +3,7 @@ #define __PKVM_X86_MMU_H #include -#include "init_finalize.h" +#include "init.h" extern pkvm_spinlock_t host_mmu_lock; diff --git a/arch/x86/kvm/pkvm/pkvm.c b/arch/x86/kvm/pkvm/pkvm.c index 5685c174796d..03836322d24a 100644 --- a/arch/x86/kvm/pkvm/pkvm.c +++ b/arch/x86/kvm/pkvm/pkvm.c @@ -5,7 +5,7 @@ #include #include "debug.h" #include "fpu.h" -#include "init_finalize.h" +#include "init.h" #include "lapic.h" #include "mem_protect.h" #include "memory.h" @@ -480,10 +480,14 @@ void pkvm_handle_host_hypercall(struct kvm_vcpu *vcpu) int ret = 0; switch (hc) { + case __pkvm__init: + ret = pkvm_init((struct pkvm_mem_info *)pkvm_hc_input1(vcpu), pkvm_hc_input2(vcpu)); + break; case __pkvm__init_finalize: - ret = pkvm_init_finalize((struct pkvm_mem_info *)pkvm_hc_input1(vcpu), - pkvm_hc_input2(vcpu), - (struct pkvm_init_ops *)pkvm_hc_input3(vcpu)); + ret = pkvm_init_finalize(); + break; + case __pkvm__reprivilege_cpu: + ret = pkvm_reprivilege_vcpu(vcpu); break; case __pkvm__enable_vmexit_trace: pkvm_enable_vmexit_trace(pkvm_hc_input1(vcpu)); diff --git a/arch/x86/kvm/pkvm/vmx/ept.c b/arch/x86/kvm/pkvm/vmx/ept.c index 92d3bb367140..f0d5f96058b0 100644 --- a/arch/x86/kvm/pkvm/vmx/ept.c +++ b/arch/x86/kvm/pkvm/vmx/ept.c @@ -281,7 +281,7 @@ int pkvm_host_ept_finalize(struct pkvm_pgtable *pgt) ept_sync_global(); /* * Clear the pending TLB flush request left after updating host EPT - * mappings in finalize_global(), as EPT has just been flushed with + * mappings in initialize_global(), as EPT has just been flushed with * global context anyway. 
*/ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, hvcpu); diff --git a/arch/x86/kvm/pkvm/vmx/host_repriv.c b/arch/x86/kvm/pkvm/vmx/host_repriv.c new file mode 100644 index 000000000000..e984512082e7 --- /dev/null +++ b/arch/x86/kvm/pkvm/vmx/host_repriv.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Google + */ + +#include +#include "host_vmx.h" +#include "vcpu_regs.h" + +struct host_cpu_state { + unsigned long cr0, cr3, cr4; + unsigned long rip, rsp; + unsigned long rflags; + unsigned long fsbase, gsbase; + unsigned long long debugctl, perf_global_ctrl; + unsigned long long sysenter_cs, sysenter_esp, sysenter_eip; + unsigned long long efer, cr_pat; + unsigned short cs, ds, es, fs, gs, ss; + + struct desc_ptr gdt, idt; +}; + +static inline void read_host_cpu_state(struct host_cpu_state *hcs) +{ + hcs->rsp = vmcs_readl(GUEST_RSP); + hcs->rip = vmcs_readl(GUEST_RIP) + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + hcs->rflags = vmcs_readl(GUEST_RFLAGS); + + hcs->ds = vmcs_read16(GUEST_DS_SELECTOR); + hcs->es = vmcs_read16(GUEST_ES_SELECTOR); + hcs->fs = vmcs_read16(GUEST_FS_SELECTOR); + hcs->gs = vmcs_read16(GUEST_GS_SELECTOR); + hcs->ss = vmcs_read16(GUEST_SS_SELECTOR); + hcs->cs = vmcs_read16(GUEST_CS_SELECTOR); + + hcs->fsbase = vmcs_readl(GUEST_FS_BASE); + hcs->gsbase = vmcs_readl(GUEST_GS_BASE); + + hcs->gdt.address = vmcs_readl(GUEST_GDTR_BASE); + hcs->gdt.size = vmcs_read32(GUEST_GDTR_LIMIT); + hcs->idt.address = vmcs_readl(GUEST_IDTR_BASE); + hcs->idt.size = vmcs_read32(GUEST_IDTR_LIMIT); + + hcs->debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + hcs->perf_global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); + hcs->sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); + hcs->sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); + hcs->sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); + hcs->efer = vmcs_read64(GUEST_IA32_EFER); + hcs->cr_pat = vmcs_read64(GUEST_IA32_PAT); + + hcs->cr0 = vmcs_readl(GUEST_CR0); + hcs->cr3 = vmcs_readl(GUEST_CR3); + hcs->cr4 = vmcs_readl(GUEST_CR4); +} + +#define PKVM_WRITE_CR(crnum, val) \ +static inline void __pkvm_write_cr##crnum(unsigned long val) \ +{ \ + asm volatile("mov %0,%%cr" #crnum : "+r" (val) : : "memory"); \ +} + +PKVM_WRITE_CR(0, val) +PKVM_WRITE_CR(3, val) +PKVM_WRITE_CR(4, val) + +/* + * Restores register state from memory pointed by rdi + * offset: offset of register backup in memory + * dest_reg: register to be restored. 
+ */ +#define STRINGIFY(x) #x +#define RESTORE_VCPU_REG(offset, dest_reg) \ + "mov " STRINGIFY(offset) "(%%rdi), %%" #dest_reg "\n" + + +static inline void restore_host_special_regs(struct host_cpu_state *hcs) +{ + struct desc_struct *gdt_desc; + tss_desc *tss; + + /* Reset the busy bit to reload TR */ + gdt_desc = (struct desc_struct *)(hcs->gdt.address); + tss = (tss_desc *)&gdt_desc[GDT_ENTRY_TSS]; + tss->type = DESC_TSS; + + __pkvm_write_cr4(hcs->cr4); + __pkvm_write_cr0(hcs->cr0); + __pkvm_write_cr3(hcs->cr3); + + wrmsrq_safe(MSR_CORE_PERF_GLOBAL_CTRL, hcs->perf_global_ctrl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, hcs->debugctl); + wrmsrq(MSR_IA32_SYSENTER_CS, hcs->sysenter_cs); + wrmsrq(MSR_IA32_SYSENTER_ESP, hcs->sysenter_esp); + wrmsrq(MSR_IA32_SYSENTER_EIP, hcs->sysenter_eip); + wrmsrq(MSR_IA32_CR_PAT, hcs->cr_pat); + wrmsrq(MSR_EFER, hcs->efer); + + asm volatile ( + "lgdt %0\n" + "lidt %1\n" + "ltr %w2\n" + "mov %3, %%ds\n" + "mov %4, %%es\n" + "mov %5, %%fs\n" + "mov %6, %%gs\n" + + : + : "m"(hcs->gdt), "m"(hcs->idt), "q"(GDT_ENTRY_TSS*8), + "m"(hcs->ds), "m"(hcs->es), "m"(hcs->fs), "m"(hcs->gs) + : "memory" + ); + + wrmsrl(MSR_FS_BASE, hcs->fsbase); + wrmsrl(MSR_GS_BASE, hcs->gsbase); +} + +/* Restores host cpu state and returns to host in VMX root mode. */ +void pkvm_vmx_reprivilege_cpu(unsigned long *vcpu_regs) +{ + static struct host_cpu_state hcs; + + read_host_cpu_state(&hcs); + restore_host_special_regs(&hcs); + + asm volatile( + /* Update stack as expected by iretq */ + "pushq %0\n" + "pushq %1\n" + "pushq %2\n" + "pushq %3\n" + "pushq %4\n" + + /* Restore general purpose registers */ + RESTORE_VCPU_REG(VCPU_RCX, rcx) + RESTORE_VCPU_REG(VCPU_RDX, rdx) + RESTORE_VCPU_REG(VCPU_RBX, rbx) + RESTORE_VCPU_REG(VCPU_RBP, rbp) + RESTORE_VCPU_REG(VCPU_RSI, rsi) + RESTORE_VCPU_REG(VCPU_R8, r8) + RESTORE_VCPU_REG(VCPU_R9, r9) + RESTORE_VCPU_REG(VCPU_R10, r10) + RESTORE_VCPU_REG(VCPU_R11, r11) + RESTORE_VCPU_REG(VCPU_R12, r12) + RESTORE_VCPU_REG(VCPU_R13, r13) + RESTORE_VCPU_REG(VCPU_R14, r14) + RESTORE_VCPU_REG(VCPU_R15, r15) + + /* Restore RDI (last!) */ + RESTORE_VCPU_REG(VCPU_RDI, rdi) + + /* + * We are not technically returning from the hypercall, but set + * RAX to zero to indicate to host that reprivilege succeeded. 
+ */ + "xor %%rax, %%rax\n" + + "iretq\n" + + : + : "m"(hcs.ss), "m"(hcs.rsp), "m"(hcs.rflags), + "m"(hcs.cs), "m"(hcs.rip), "D"(vcpu_regs) + : "memory", "cc" + ); +} +STACK_FRAME_NON_STANDARD(pkvm_vmx_reprivilege_cpu); diff --git a/arch/x86/kvm/pkvm/vmx/host_vmentry.S b/arch/x86/kvm/pkvm/vmx/host_vmentry.S index 3901e3c9a733..dfd57a27d432 100644 --- a/arch/x86/kvm/pkvm/vmx/host_vmentry.S +++ b/arch/x86/kvm/pkvm/vmx/host_vmentry.S @@ -3,30 +3,11 @@ #include #include #include -#include #include #include #include "kvm-asm-offsets.h" #include "run_flags.h" - -#define WORD_SIZE (BITS_PER_LONG / 8) - -#define VCPU_RAX (__VCPU_REGS_RAX * WORD_SIZE) -#define VCPU_RCX (__VCPU_REGS_RCX * WORD_SIZE) -#define VCPU_RDX (__VCPU_REGS_RDX * WORD_SIZE) -#define VCPU_RBX (__VCPU_REGS_RBX * WORD_SIZE) -#define VCPU_RBP (__VCPU_REGS_RBP * WORD_SIZE) -#define VCPU_RSI (__VCPU_REGS_RSI * WORD_SIZE) -#define VCPU_RDI (__VCPU_REGS_RDI * WORD_SIZE) - -#define VCPU_R8 (__VCPU_REGS_R8 * WORD_SIZE) -#define VCPU_R9 (__VCPU_REGS_R9 * WORD_SIZE) -#define VCPU_R10 (__VCPU_REGS_R10 * WORD_SIZE) -#define VCPU_R11 (__VCPU_REGS_R11 * WORD_SIZE) -#define VCPU_R12 (__VCPU_REGS_R12 * WORD_SIZE) -#define VCPU_R13 (__VCPU_REGS_R13 * WORD_SIZE) -#define VCPU_R14 (__VCPU_REGS_R14 * WORD_SIZE) -#define VCPU_R15 (__VCPU_REGS_R15 * WORD_SIZE) +#include "vcpu_regs.h" /* * pkvm_host_vmexit_entry - Handle a VM-Exit event from the deprivileged diff --git a/arch/x86/kvm/pkvm/vmx/host_vmx.c b/arch/x86/kvm/pkvm/vmx/host_vmx.c index 72ba8ed266fd..f96bfc1cc63b 100644 --- a/arch/x86/kvm/pkvm/vmx/host_vmx.c +++ b/arch/x86/kvm/pkvm/vmx/host_vmx.c @@ -6,7 +6,7 @@ #include "debug.h" #include "ept.h" #include "host_vmx.h" -#include "init_finalize.h" +#include "pkvm/init.h" #include "pkvm/lapic.h" #include "pkvm/trace.h" #include "pkvm.h" @@ -33,7 +33,8 @@ static struct pkvm_init_ops vmx_init_ops = { .hyp_mmu_finalize = vmx_hyp_mmu_finalize, .host_mmu_init = pkvm_host_ept_init, .host_mmu_finalize = pkvm_host_ept_finalize, - .hyp_g_finalize = pkvm_vmx_init, + .hyp_global_init = pkvm_vmx_init, + .reprivilege_cpu = pkvm_vmx_reprivilege_cpu, }; static void skip_emulated_instruction(void) diff --git a/arch/x86/kvm/pkvm/vmx/host_vmx.h b/arch/x86/kvm/pkvm/vmx/host_vmx.h index b6ff7c0908e2..ae783917313a 100644 --- a/arch/x86/kvm/pkvm/vmx/host_vmx.h +++ b/arch/x86/kvm/pkvm/vmx/host_vmx.h @@ -12,4 +12,6 @@ static inline void request_host_immediate_exit(struct vcpu_vmx *vmx) void pkvm_host_vmexit_main(struct vcpu_vmx *vmx); +void pkvm_vmx_reprivilege_cpu(unsigned long *vcpu_regs); + #endif /* __PKVM_VMX_HOST_VMX_H */ diff --git a/arch/x86/kvm/pkvm/vmx/vcpu_regs.h b/arch/x86/kvm/pkvm/vmx/vcpu_regs.h new file mode 100644 index 000000000000..45b66c2081ed --- /dev/null +++ b/arch/x86/kvm/pkvm/vmx/vcpu_regs.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PKVM_VMX_VCPU_REGS_H +#define __PKVM_VMX_VCPU_REGS_H + +#include + +#define WORD_SIZE (BITS_PER_LONG / 8) + +#define VCPU_RAX (__VCPU_REGS_RAX * WORD_SIZE) +#define VCPU_RCX (__VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX (__VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX (__VCPU_REGS_RBX * WORD_SIZE) +#define VCPU_RBP (__VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI (__VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI (__VCPU_REGS_RDI * WORD_SIZE) + +#define VCPU_R8 (__VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 (__VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 (__VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 (__VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 (__VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 
(__VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14 (__VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15 (__VCPU_REGS_R15 * WORD_SIZE)
+
+#endif /* __PKVM_VMX_VCPU_REGS_H */
+
diff --git a/arch/x86/kvm/vmx/pkvm_init.c b/arch/x86/kvm/vmx/pkvm_init.c
index d2f9e0cb08ca..2ad2c1151053 100644
--- a/arch/x86/kvm/vmx/pkvm_init.c
+++ b/arch/x86/kvm/vmx/pkvm_init.c
@@ -976,6 +976,66 @@ static noinline int local_deprivilege_cpu(void)
 	return ret;
 }
 
+static DEFINE_PER_CPU(bool, deprivileged);
+static __init void pkvm_host_reprivilege_cpu(void *data)
+{
+	unsigned long flags;
+	int cpu = get_cpu();
+	int ret;
+
+	if (!this_cpu_read(deprivileged)) {
+		put_cpu();
+		return;
+	}
+
+	local_irq_save(flags);
+
+	/*
+	 * Load the RW GDT page so that the reprivilege code
+	 * can reload TR.
+	 */
+	load_direct_gdt(cpu);
+
+	/*
+	 * Intel CET requires an indirect jmp/call to return to an
+	 * endbr64 instruction, so we can't use kvm_hypercall here.
+	 */
+	asm volatile(
+		"vmcall\n"
+		"endbr64\n"
+		: "=a"(ret)
+		: "a"(__pkvm__reprivilege_cpu)
+		: "memory");
+
+	/* Switch back to the RO GDT page */
+	load_fixmap_gdt(cpu);
+
+	if (!ret) {
+		this_cpu_write(deprivileged, false);
+		kvm_cpu_vmxoff();
+		pr_info("%s: CPU%d back in host mode\n", __func__, cpu);
+	} else {
+		pr_warn("%s: CPU%d failed to reprivilege (err=%d)\n", __func__, cpu, ret);
+	}
+
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+static __init void pkvm_host_reprivilege_cpus(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (!per_cpu(deprivileged, cpu))
+			continue;
+
+		smp_call_function_single(cpu, pkvm_host_reprivilege_cpu,
+					 NULL, true);
+	}
+}
+
 static __init void pkvm_host_deprivilege_cpu(void *data)
 {
 	struct pkvm_deprivilege_param *p = data;
@@ -1006,6 +1066,7 @@ static __init void pkvm_host_deprivilege_cpu(void *data)
 	}
 
 	vcpu->mode = IN_GUEST_MODE;
+	this_cpu_write(deprivileged, true);
 	pr_info("CPU%d in guest mode\n", cpu);
 	return;
 vmxoff:
@@ -1049,7 +1110,7 @@ static __init int pkvm_host_deprivilege_cpus(struct pkvm_hyp *pkvm)
 	return ret ? ret : p.ret;
 }
 
-static void do_pkvm_finalize(void *data)
+static void do_pkvm_hyp_init(void *data)
 {
 	unsigned long data_size = data_pages << PAGE_SHIFT;
 	struct pkvm_mem_info infos[] = {
@@ -1096,28 +1157,39 @@ static void do_pkvm_finalize(void *data)
 			.prot = pgprot_val(PAGE_KERNEL),
 		},
 	};
-	int ret = pkvm_hypercall(init_finalize, (unsigned long)infos, ARRAY_SIZE(infos),
-				 (unsigned long)pkvm_sym(pkvm_vmx_init_ops));
+	int ret = pkvm_hypercall(init, (unsigned long)infos, ARRAY_SIZE(infos));
 
 	if (data)
 		*(int *)data = ret;
 }
 
-static __init int pkvm_init_finalize(void)
+static __init int pkvm_hyp_init(void)
 {
-	int ret, cpu, finalize_ret;
+	int ret, cpu, init_ret;
 
 	for_each_possible_cpu(cpu) {
-		ret = smp_call_function_single(cpu, do_pkvm_finalize,
-					       &finalize_ret, 1);
-		if (ret || finalize_ret) {
-			pr_err("Failed to finalize CPU%d: smp_call %d, finalize: %d\n",
-			       cpu, ret, finalize_ret);
+		ret = smp_call_function_single(cpu, do_pkvm_hyp_init,
+					       &init_ret, 1);
+		if (ret || init_ret) {
+			pr_err("Failed to initialize CPU%d: smp_call %d, initialize: %d\n",
+			       cpu, ret, init_ret);
 			break;
 		}
 	}
 
-	return ret ? ret : finalize_ret;
+	/*
+	 * XXX: Revert
+	 * Temporarily fail pkvm initialization until pVMCS is fully merged.
+	 * pKVM doesn't serve any real purpose until we have pVMCS ready, and
+	 * this failure helps us test the reprivilege logic. It also lets the
+	 * host boot normally with KVM enabled, so no virtualization
+	 * functionality is broken.
+	 */
+	if (!ret && !init_ret) {
+		pr_err("Explicitly triggering pkvm initialization failure!\n");
+		ret = -EFAULT;
+	}
+	return ret ? ret : init_ret;
 }
 
 int __init vmx_pkvm_init(void)
@@ -1186,22 +1258,25 @@ int __init vmx_pkvm_init(void)
 		pr_cont("reboot with kvm-intel.pkvm_relax_cpu_bugs=false\n");
 	}
 
+	pkvm_sym(init_ops) = pkvm_sym(pkvm_vmx_init_ops);
+
 	ret = pkvm_host_deprivilege_cpus(pkvm);
-	if (ret) {
-		/* TODO: Re-privilege the deprivileged CPUs */
-		goto out;
-	}
+	if (ret)
+		goto repriv_cpus;
 
-	ret = pkvm_init_finalize();
-	if (ret) {
-		/* TODO: Re-privilege the deprivileged CPUs */
-		goto out;
-	}
+	ret = pkvm_hyp_init();
+	if (ret)
+		goto repriv_cpus;
+
+	pkvm_hypercall(init_finalize);
 
 	pkvm_init_debugfs();
 	pr_info("Hypervisor is up and running!\n");
 	return 0;
+
+repriv_cpus:
+	pkvm_host_reprivilege_cpus();
 out:
 	/*
 	 * As the reserved memory at the pkvm_mem_base will not be