diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7a7dc9d56027..a52250c9da2f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -131,4 +131,8 @@ static __always_inline __pure bool fpu_state_size_dynamic(void) } #endif +#ifdef __PKVM_HYP__ +void pkvm_setup_xstate_cache(void); +#endif + #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e20385a2821a..2e42ce1efd97 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2317,6 +2317,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); +bool pkvm_host_has_emulated_msr(struct kvm *kvm, u32 msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -2516,6 +2517,7 @@ void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +int kvm_vcpu_x86_setup_mce(struct kvm_vcpu *vcpu, u64 mcg_cap); static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { diff --git a/arch/x86/include/asm/kvm_pkvm.h b/arch/x86/include/asm/kvm_pkvm.h index bcbf9bccf340..a470a3428ea5 100644 --- a/arch/x86/include/asm/kvm_pkvm.h +++ b/arch/x86/include/asm/kvm_pkvm.h @@ -69,6 +69,54 @@ union pkvm_hc_data { struct { struct pkvm_memcache memcache; } vcpu_free; + struct { + u64 data; + } get_msr; + union { + unsigned long rsp; + unsigned long rip; + unsigned long cr0; + unsigned long cr3; + unsigned long cr4; + u64 pdptrs[4]; + } cache_reg; + struct { + unsigned long data; + } get_rflags; + struct { + struct kvm_segment seg_val; + int seg; + } set_segment; + struct { + struct kvm_segment seg_val; + } get_segment; + struct { + u64 data; + } get_segment_base; + struct { + struct desc_ptr desc; + } set_idt; + struct { + struct desc_ptr desc; + } get_idt; + struct { + struct desc_ptr desc; + } set_gdt; + struct { + struct desc_ptr desc; + } get_gdt; + struct { + u32 data; + } get_interrupt_shadow; + struct { + bool data; + } get_nmi_mask; + struct { + struct pkvm_memcache memcache; + } vcpu_after_set_cpuid; + struct { + struct pkvm_memcache memcache; + } vcpu_add_fpstate; struct { u64 data[PKVM_HC_DATA_MAX_NUM]; } raw; @@ -85,11 +133,16 @@ static_assert(sizeof(union pkvm_hc_data) == PKVM_HC_DATA_MAX_NUM * sizeof(u64)); (ALIGN(sizeof(((union pkvm_hc_data *)0)->f), sizeof(u64)) / sizeof(u64)) #define PKVM_HC_OUTPUT_NUM(f) f##_output_num +#define PKVM_HC_INPUT_NUM(f) f##_input_num enum { #define PKVM_HC(f) PKVM_HC_OUTPUT_NUM(f) = 0, #define PKVM_HC_OUT(f) PKVM_HC_OUTPUT_NUM(f) = PKVM_HC_DATA_NUM(f), #include + + #define PKVM_HC(f) PKVM_HC_INPUT_NUM(f) = 0, + #define PKVM_HC_IN(f) PKVM_HC_INPUT_NUM(f) = PKVM_HC_DATA_NUM(f), + #include }; static inline int pkvm_hc_output_num(enum pkvm_hc hc) @@ -102,6 +155,16 @@ static inline int pkvm_hc_output_num(enum pkvm_hc hc) } } +static inline int pkvm_hc_input_num(enum pkvm_hc hc) +{ + switch (hc) { + #define PKVM_HC(f) case TO_PKVM_HC(f): return PKVM_HC_INPUT_NUM(f); + #include + default: + return 0; + } +} + #define PKVM_HC_IN_0() #define PKVM_HC_IN_1(a1) , "b"(a1) #define PKVM_HC_IN_2(a1, a2) PKVM_HC_IN_1(a1), "c"(a2) @@ -150,6 +213,23 @@ static inline int pkvm_hc_output_num(enum pkvm_hc hc) __pkvm_hypercall(f, o, 4, ##__VA_ARGS__), \ PKVM_HC_UNREACHABLE(f)))))) 
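For context, a minimal host-side sketch of how the typed views in union pkvm_hc_data are expected to be consumed for a PKVM_HC_OUT hypercall such as get_msr. This is an editor's illustration, not part of the patch; the pkvm_hypercall() wrapper name is an assumption, since the output-passing macro itself sits outside this excerpt.

/* Illustrative only: assumes an output-capable wrapper analogous to pkvm_hypercall_in(). */
static inline int pkvm_host_read_guest_msr(u32 msr, u64 *data)
{
	union pkvm_hc_data out = {};
	int ret;

	/* get_msr is declared PKVM_HC_OUT, so one u64 word is copied back. */
	ret = pkvm_hypercall(get_msr, &out, msr);
	if (!ret)
		*data = out.get_msr.data;	/* aliases out.raw.data[0] */
	return ret;
}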
+#define pkvm_hypercall_in(f, i) \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 1, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0]), \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 2, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0], \ + (i)->raw.data[1]), \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 3, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0], \ + (i)->raw.data[1], \ + (i)->raw.data[2]), \ + __builtin_choose_expr(PKVM_HC_INPUT_NUM(f) == 4, \ + __pkvm_hypercall(f, NULL, 0, (i)->raw.data[0], \ + (i)->raw.data[1], \ + (i)->raw.data[2], \ + (i)->raw.data[3]), \ + PKVM_HC_UNREACHABLE(f))))) + static inline unsigned long pkvm_hc(struct kvm_vcpu *vcpu) { return vcpu->arch.regs[VCPU_REGS_RAX]; @@ -159,6 +239,11 @@ static inline unsigned long pkvm_hc(struct kvm_vcpu *vcpu) static inline unsigned long pkvm_hc_input##n(struct kvm_vcpu *vcpu) \ { \ return vcpu->arch.regs[VCPU_REGS_##reg]; \ +} \ +static inline void pkvm_hc_get_input##n(struct kvm_vcpu *vcpu, union pkvm_hc_data *p) \ +{ \ + BUILD_BUG_ON(n == 0 || n > PKVM_HC_DATA_MAX_NUM); \ + p->raw.data[n - 1] = vcpu->arch.regs[VCPU_REGS_##reg]; \ } DEFINE_PKVM_HC_INPUT(1, RBX) @@ -166,6 +251,29 @@ DEFINE_PKVM_HC_INPUT(2, RCX) DEFINE_PKVM_HC_INPUT(3, RDX) DEFINE_PKVM_HC_INPUT(4, RSI) +static inline void pkvm_hc_get_input(struct kvm_vcpu *vcpu, enum pkvm_hc hc, + union pkvm_hc_data *in) +{ + switch (pkvm_hc_input_num(hc)) { + case 4: + pkvm_hc_get_input4(vcpu, in); + fallthrough; + case 3: + pkvm_hc_get_input3(vcpu, in); + fallthrough; + case 2: + pkvm_hc_get_input2(vcpu, in); + fallthrough; + case 1: + pkvm_hc_get_input1(vcpu, in); + fallthrough; + case 0: + break; + default: + BUG(); + } +} + static inline void pkvm_hc_set_ret(struct kvm_vcpu *vcpu, int ret) { vcpu->arch.regs[VCPU_REGS_RAX] = ret; @@ -222,8 +330,12 @@ extern u64 pkvm_sym(sme_me_mask); extern struct cpumask pkvm_sym(__cpu_possible_mask); extern unsigned int pkvm_sym(nr_cpu_ids); DECLARE_STATIC_KEY_FALSE(pkvm_sym(switch_vcpu_ibpb)); +extern u64 pkvm_sym(x86_pred_cmd); extern struct fpu_state_config pkvm_sym(fpu_kernel_cfg); extern struct fpu_state_config pkvm_sym(fpu_user_cfg); +#ifdef CONFIG_X86_64 +DECLARE_STATIC_KEY_FALSE(pkvm_sym(__fpu_state_size_dynamic)); +#endif u64 pkvm_total_reserve_pages(void); PKVM_DECLARE(void *, pkvm_early_alloc_page, (void)); diff --git a/arch/x86/include/asm/pkvm_hypercalls.h b/arch/x86/include/asm/pkvm_hypercalls.h index e4864e76136e..ff0129af2ff6 100644 --- a/arch/x86/include/asm/pkvm_hypercalls.h +++ b/arch/x86/include/asm/pkvm_hypercalls.h @@ -7,6 +7,10 @@ BUILD_BUG_ON(1) #define PKVM_HC_OUT PKVM_HC #endif +#ifndef PKVM_HC_IN +#define PKVM_HC_IN PKVM_HC +#endif + /* Hypercalls used only during pKVM initialization */ PKVM_HC(init_finalize) @@ -21,6 +25,55 @@ PKVM_HC(vm_init) PKVM_HC_OUT(vm_destroy) PKVM_HC(vcpu_create) PKVM_HC_OUT(vcpu_free) +PKVM_HC(vcpu_load) +PKVM_HC(vcpu_put) +PKVM_HC(vcpu_reset) +PKVM_HC(update_exception_bitmap) +PKVM_HC(set_efer) +PKVM_HC(set_msr) +PKVM_HC_OUT(get_msr) +PKVM_HC_OUT(cache_reg) +PKVM_HC(set_cr4) +PKVM_HC(set_cr0) +PKVM_HC_OUT(get_rflags) +PKVM_HC(set_rflags) +PKVM_HC(set_dr7) +PKVM_HC_IN(set_segment) +PKVM_HC_OUT(get_segment) +PKVM_HC_OUT(get_segment_base) +PKVM_HC_IN(set_idt) +PKVM_HC_OUT(get_idt) +PKVM_HC_IN(set_gdt) +PKVM_HC_OUT(get_gdt) +PKVM_HC(flush_tlb_all) +PKVM_HC(flush_tlb_current) +PKVM_HC(flush_tlb_gva) +PKVM_HC(flush_tlb_guest) +PKVM_HC(set_interrupt_shadow) +PKVM_HC_OUT(get_interrupt_shadow) +PKVM_HC(enable_nmi_window) +PKVM_HC(enable_irq_window) +PKVM_HC(interrupt_allowed) 
+PKVM_HC(nmi_allowed) +PKVM_HC_OUT(get_nmi_mask) +PKVM_HC(set_nmi_mask) +PKVM_HC(inject_irq) +PKVM_HC(inject_nmi) +PKVM_HC(inject_exception) +PKVM_HC(cancel_injection) +PKVM_HC(update_cr8_intercept) +PKVM_HC(set_virtual_apic_mode) +PKVM_HC(refresh_apicv_exec_ctrl) +PKVM_HC(load_eoi_exitmap) +PKVM_HC(hwapic_isr_update) +PKVM_HC(sync_pir_to_irr) +PKVM_HC_OUT(vcpu_after_set_cpuid) +PKVM_HC_OUT(vcpu_add_fpstate) +PKVM_HC(write_tsc_offset) +PKVM_HC(write_tsc_multiplier) +PKVM_HC(load_mmu_pgd) +PKVM_HC(setup_mce) #undef PKVM_HC #undef PKVM_HC_OUT +#undef PKVM_HC_IN diff --git a/arch/x86/include/asm/pkvm_image_vars.h b/arch/x86/include/asm/pkvm_image_vars.h index 5e351110bb1e..c26e3dfe3cc8 100644 --- a/arch/x86/include/asm/pkvm_image_vars.h +++ b/arch/x86/include/asm/pkvm_image_vars.h @@ -19,6 +19,7 @@ PKVM_ALIAS(__trace_bprintk); PKVM_ALIAS(__dynamic_pr_debug); PKVM_ALIAS(mem_dump_obj); PKVM_ALIAS(vmalloc_base); +PKVM_ALIAS(get_cpu_entry_area); #endif #endif /* _ASM_x86_PKVM_IMAGE_VARS_H */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3746376172e6..11ada9bcdb52 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -29,7 +29,7 @@ #define CREATE_TRACE_POINTS #include -#if defined(CONFIG_X86_64) && !defined(__PKVM_HYP__) +#ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif @@ -123,6 +123,7 @@ static void update_avx_timestamp(struct fpu *fpu) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } +#endif /* !__PKVM_HYP__ */ /* * Save the FPU register state in fpu->fpstate->regs. The register state is @@ -142,7 +143,9 @@ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); +#ifndef __PKVM_HYP__ update_avx_timestamp(fpu); +#endif return; } @@ -213,12 +216,15 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) } } +#ifndef __PKVM_HYP__ void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } +#endif /* !__PKVM_HYP__ */ #if IS_ENABLED(CONFIG_KVM) +#ifndef __PKVM_HYP__ static void __fpstate_reset(struct fpstate *fpstate); static void fpu_lock_guest_permissions(void) @@ -293,6 +299,7 @@ void fpu_free_guest_fpstate(struct fpu_guest *gfpu) vfree(fpstate); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); +#endif /* !__PKVM_HYP__ */ /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable @@ -319,14 +326,19 @@ EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { +#ifndef __PKVM_HYP__ fpregs_lock(); +#endif guest_fpu->fpstate->xfd = xfd; if (guest_fpu->fpstate->in_use) xfd_update_state(guest_fpu->fpstate); +#ifndef __PKVM_HYP__ fpregs_unlock(); +#endif } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); +#ifndef __PKVM_HYP__ /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * @@ -350,6 +362,7 @@ void fpu_sync_guest_vmexit_xfd_state(void) } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); +#endif /* !__PKVM_HYP__ */ #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) @@ -358,10 +371,35 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; +#ifndef __PKVM_HYP__ fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) 
save_fpregs_to_fpstate(fpu); - +#else +#ifdef CONFIG_X86_64 + if (fpu_state_size_dynamic() && enter_guest) { + /* + * Refresh the xfd_state percpu cache before guest vmenter so + * that the xfd can be restored after guest vmexit. + */ + rdmsrl(MSR_IA32_XFD, cur_fps->xfd); + __this_cpu_write(xfd_state, cur_fps->xfd); + } +#endif + /* + * If entering the npVM, the FPU are already loaded with the npVM fpu + * state by the host. If exiting from the npVM, the fpu registers will be + * saved by the host. So no need to save FPU for the npVM. + * + * If entering the pVM, the FPU are loaded with the host fpu state, which + * is already saved by the host itself before switching to the pkvm + * hypervisor. If exiting from the pVM, then the fpu state should be saved + * by the pkvm hypervisor as the host is not allowed to do this for + * isolation purpose. + */ + if (guest_fps->is_confidential && !enter_guest) + save_fpregs_to_fpstate(fpu); +#endif /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; @@ -375,6 +413,7 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) cur_fps = fpu->fpstate; +#ifndef __PKVM_HYP__ if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); @@ -389,10 +428,29 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) fpregs_mark_activate(); fpregs_unlock(); +#else + /* + * Similarly to the FPU saving case, no need to restore FPU for the npVM + * as this will be handled by the host. + * + * If entering the pVM, restore the FPU with the pVM fpu state. If + * exiting the pVM, wipe the FPU by restoring FPU with an initial fpu + * state. + */ + if (guest_fps->is_confidential) { + /* Includes XFD update */ + restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); + } else { + /* Only update XFD as npVM FPU is already loaded by the host */ + xfd_update_state(cur_fps); + } +#endif + return 0; } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); +#ifndef __PKVM_HYP__ void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { @@ -441,8 +499,10 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); +#endif /* !__PKVM_HYP__ */ #endif /* CONFIG_KVM */ +#ifndef __PKVM_HYP__ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { if (!irqs_disabled()) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 28e4fd65c9da..6612a5478487 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -38,6 +38,7 @@ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) +#ifndef __PKVM_HYP__ /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms @@ -86,6 +87,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, }; +#endif /* !__PKVM_HYP__ */ static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -121,6 +123,7 @@ static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) +#ifndef __PKVM_HYP__ /* * Return whether the system supports a given xfeature. 
* @@ -158,6 +161,7 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +#endif /* !__PKVM_HYP__ */ static bool xfeature_is_aligned64(int xfeature_nr) { @@ -197,6 +201,7 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) return offs; } +#ifndef __PKVM_HYP__ /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. @@ -233,6 +238,7 @@ void fpu__init_cpu_xstate(void) xfeatures_mask_independent()); } } +#endif /* !__PKVM_HYP__ */ static bool xfeature_enabled(enum xfeature xfeature) { @@ -292,6 +298,7 @@ static void __init setup_xstate_cache(void) sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } +#ifndef __PKVM_HYP__ /* * Print out all the supported xstate features: */ @@ -585,6 +592,7 @@ static bool __init check_xstate_against_struct(int nr) return true; } +#endif /* !__PKVM_HYP__ */ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { @@ -606,6 +614,7 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) return offset + xstate_sizes[topmost]; } +#ifndef __PKVM_HYP__ /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating @@ -988,6 +997,7 @@ void fpu__resume_cpu(void) if (fpu_state_size_dynamic()) wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } +#endif /* !__PKVM_HYP__ */ /* * Given an xstate feature nr, calculate where in the xsave @@ -1060,6 +1070,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) } EXPORT_SYMBOL_GPL(get_xsave_addr); +#ifndef __PKVM_HYP__ /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. 
user mode @@ -1473,6 +1484,7 @@ void xrstors(struct xregs_state *xstate, u64 mask) XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } +#endif /* !__PKVM_HYP__ */ #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) @@ -1485,6 +1497,7 @@ void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeatu EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif +#ifndef __PKVM_HYP__ #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU @@ -1813,6 +1826,7 @@ static inline int xstate_request_perm(unsigned long idx, bool guest) return -EPERM; } #endif /* !CONFIG_X86_64 */ +#endif /* __PKVM_HYP__ */ u64 xstate_get_guest_group_perm(void) { @@ -1820,6 +1834,7 @@ u64 xstate_get_guest_group_perm(void) } EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); +#ifndef __PKVM_HYP__ /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() @@ -2009,3 +2024,51 @@ int elf_coredump_extra_notes_size(void) return size; } #endif /* CONFIG_COREDUMP */ +#else /* !__PKVM_HYP__ */ +void pkvm_setup_xstate_cache(void) +{ + if (!boot_cpu_has(X86_FEATURE_FPU) || + !boot_cpu_has(X86_FEATURE_XSAVE)) { + pr_info("pkvm: No FPU or XSAVE detected\n"); + return; + } + + /* Cache size, offset and flags for initialization */ + setup_xstate_cache(); +} + +int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) +{ + u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; + struct fpstate *fps; + unsigned int ksize; + + if (!xfd_event) + return 0; + + if (WARN_ON_ONCE(!guest_fpu)) + return -EINVAL; + + if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) + return -EPERM; + + fps = guest_fpu->fpstate; + ksize = xstate_calculate_size(fps->xfeatures | xfd_event, + cpu_feature_enabled(X86_FEATURE_XCOMPACTED)); + if (fps->size < ksize) { + /* State size is insufficient. */ + return -ENOMEM; + } + + guest_fpu->xfeatures |= xfd_event; + fps->xfeatures |= xfd_event; + fps->user_xfeatures |= xfd_event; + fps->xfd &= ~xfd_event; + + xstate_init_xcomp_bv(&fps->regs.xsave, fps->xfeatures); + if (fps->in_use) + xfd_update_state(fps); + + return 0; +} +#endif /* __PKVM_HYP__ */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cc400a96e26e..e141460ff4ba 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,6 @@ u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_cpu_caps); -#ifndef __PKVM_HYP__ struct cpuid_xstate_sizes { u32 eax; u32 ebx; @@ -151,6 +151,14 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu) return -EINVAL; } +#ifdef __PKVM_HYP__ + /* + * Exposing dynamic xfeatures to npVM is handled by the host as npVM's + * fpstate is allocated and managed by the host. + */ + if (!pkvm_is_protected_vcpu(vcpu)) + return 0; +#endif /* * Exposing dynamic xfeatures to the guest requires additional * enabling in the FPU, e.g. to expand the guest XSAVE state size. @@ -243,6 +251,16 @@ static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu) if (!best) return 0; + if (pkvm_is_protected_vcpu(vcpu)) { + /* + * The pKVM hypervisor doesn't support emulate KVM PV features + * for pVM for simplicity. Thus remove KVM PV feature bits from + * the corresponding CPUID. 
+ */ + best->eax = 0; + return 0; + } + if (kvm_hlt_in_guest(vcpu->kvm)) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); @@ -323,7 +341,7 @@ static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) { -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) struct kvm_cpuid_entry2 *entry; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE); @@ -372,8 +390,10 @@ static int cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, u32 func, void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { +#ifndef __PKVM_HYP__ struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_cpuid_entry2 *best; +#endif struct kvm_cpuid_entry2 *entry; bool allow_gbpages; int i; @@ -426,6 +446,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES); guest_cpu_cap_change(vcpu, X86_FEATURE_GBPAGES, allow_gbpages); +#ifndef __PKVM_HYP__ best = kvm_find_cpuid_entry(vcpu, 1); if (best && apic) { if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) @@ -435,6 +456,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_apic_set_version(vcpu); } +#endif vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu); @@ -445,23 +467,29 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); +#ifndef __PKVM_HYP__ kvm_pmu_refresh(vcpu); +#endif #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) vcpu->arch.cr4_guest_rsvd_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_) | __cr4_reserved_bits(guest_cpu_cap_has, vcpu); #undef __kvm_cpu_cap_has +#ifndef __PKVM_HYP__ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu)); +#endif /* Invoke the vendor callback only after the above state is updated. */ kvm_x86_call(vcpu_after_set_cpuid)(vcpu); +#ifndef __PKVM_HYP__ /* * Except for the MMU, which needs to do its thing any vendor specific * adjustments to the reserved GPA bits. 
*/ kvm_mmu_after_set_cpuid(vcpu); +#endif kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } @@ -504,8 +532,12 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } +#ifndef __PKVM_HYP__ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) +#else +int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent) +#endif { u32 vcpu_caps[NR_KVM_CPU_CAPS]; int r; @@ -563,7 +595,9 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, kvm_vcpu_after_set_cpuid(vcpu); success: +#ifndef __PKVM_HYP__ kvfree(e2); +#endif return 0; err: @@ -573,6 +607,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, return r; } +#ifndef __PKVM_HYP__ /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -1275,7 +1310,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cpu_caps); #undef VENDOR_F #undef RUNTIME_F -#ifndef __PKVM_HYP__ struct kvm_cpuid_array { struct kvm_cpuid_entry2 *entries; int maxnent; @@ -1747,8 +1781,11 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } else { phys_as = entry->eax & 0xff; g_phys_as = phys_as; + /* FIXME: Check pKVM guest MMU level */ +#ifndef __PKVM_HYP__ if (kvm_mmu_get_max_tdp_level() < 5) g_phys_as = min(g_phys_as, 48U); +#endif } entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); @@ -1880,6 +1917,7 @@ static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, return r; } +#ifndef __PKVM_HYP__ static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, __u32 num_entries, unsigned int ioctl_type) { @@ -2107,4 +2145,257 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_cpuid); +#else /* !__PKVM_HYP__ */ + +static DEFINE_PER_CPU(struct kvm_cpuid_entry2, cpuid_def[KVM_MAX_CPUID_ENTRIES]); + +static int pkvm_get_cpuid(struct kvm_cpuid_entry2 *entries, int *nent) +{ + static const u32 funcs[] = { + 0, 0x80000000, KVM_CPUID_SIGNATURE, + }; + + struct kvm_cpuid_array array = { + .entries = entries, + .nent = 0, + .maxnent = *nent, + }; + int r, i; + + if (*nent < 1) + return -E2BIG; + if (*nent > KVM_MAX_CPUID_ENTRIES) + *nent = KVM_MAX_CPUID_ENTRIES; + + for (i = 0; i < ARRAY_SIZE(funcs); i++) { + r = get_cpuid_func(&array, funcs[i], KVM_GET_SUPPORTED_CPUID); + if (r) + goto out; + } + + *nent = array.nent; +out: + return r; +} + +static bool pkvm_cpuid_entry_host_owned(struct kvm_cpuid_entry2 *e2) +{ + switch (e2->function) { + case 0xb: /* topology */ + case 0x1f: /* topology */ + case 0x80000002: /* Processor Brand String */ + case 0x80000003: /* Processor Brand String */ + case 0x80000004: /* Processor Brand String */ + return true; + } + + return false; +} + +#define CPUID_4_EAX_VALID_MASK GENMASK(4, 0) +#define CPUID_4_EBX_COH_LINE_SIZE_MASK GENMASK(11, 0) +#define CPUID_7_0_EDX_HYBRID (1 << 15) +static void pkvm_fixup_cpuid_entry(struct kvm_cpuid_entry2 *entry) +{ + switch (entry->function) { + case 4: + /* + * Deterministic cache parameters. + * + * Fix the coherency line size to 64 bytes following TDX. 
+ */ + if (entry->eax & CPUID_4_EAX_VALID_MASK) { + entry->ebx &= ~CPUID_4_EBX_COH_LINE_SIZE_MASK; + entry->ebx |= 0x3F; + } + break; + case 7: /* Extended features */ + if (entry->index) + break; + + /* No support of hybrid */ + entry->edx &= ~CPUID_7_0_EDX_HYBRID; + break; + case 0x1a: + /* + * Native model ID. + * + * Clear the entry due to no support of hybrid. This leaf is + * not controlled by the host and __do_cpuid_func() already + * clears it. But in case __do_cpuid_func() may change its + * policy later, force clearing it here explicitly. + */ + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + default: + break; + } +} + +#define CPUID_1_EBX_ID_MASK GENMASK(31, 16) +#define CPUID_1_ECX_TSC_DLTIMER (1 << 24) +#define CPUID_1_ECX_HYP (1 << 31) +#define CPUID_1_EDX_HTT (1 << 28) +static void pkvm_enforce_cpuid_entry(struct kvm_cpuid_entry2 *entry, + struct kvm_cpuid_entry2 *def) +{ + struct kvm_cpuid_entry2 tmp = *def; + +#define COPY_BITS(reg1, reg2, mask) { \ + (reg1) &= ~(mask); \ + (reg1) |= (reg2) & (mask); \ +} + switch (entry->function) { + case 1: + COPY_BITS(tmp.ecx, entry->ecx, + CPUID_1_ECX_TSC_DLTIMER | CPUID_1_ECX_HYP); + COPY_BITS(tmp.ebx, entry->ebx, CPUID_1_EBX_ID_MASK); + COPY_BITS(tmp.edx, entry->edx, CPUID_1_EDX_HTT); + break; + default: + break; + } + + *entry = tmp; +} + +static bool cpuid_entry_is_empty(struct kvm_cpuid_entry2 *e2) +{ + return !e2->function && !e2->eax; +} + +static struct kvm_cpuid_entry2 *find_cpuid_entry(struct kvm_cpuid_entry2 *buf, + int nent, struct kvm_cpuid_entry2 *e2) +{ + int i; + + for (i = 0; i < nent; i++) { + if (cpuid_entry_is_empty(&buf[i])) + continue; + + if ((buf[i].function == e2->function) && + (buf[i].index == e2->index) && + (buf[i].flags == e2->flags)) + return &buf[i]; + } + + return NULL; +} + +/* + * pKVM enforces a simple CPUID policy (similar to QEMU '--cpu host') for + * pVM, by using the pKVM supported bits as the base plus a small set + * allowing the host to manage. This saves a lot of effort of defining/ + * maintaining a bit-wise complex policy as TDX does. + * + * As crosvm is the main VMM targeted in the pKVM project, the allowed set + * is currently scrutinized/defined based on the bits mangled by crosvm. + * It is not flexible but good for security/simplicity. The allowed set + * could be extended case-by-case when seeing new demand for the host + * to set. + * + * The enforcement includes: + * - if an entry is fully host-controlled, leave it intact. + * + * - if an entry is func#4 (cache parameters), it's configured by the host + * but certain fields will be overridden with fixed values. If none of + * func4 entries exist, pKVM will insert the default cache parameters + * as failsafe. + * + * - for remaining entries, there must be a matching one in the default + * set, otherwise the original entry is cleared. If matched, the entry + * is fully/partially overridden based on the default value. 
+ *
+ * - Append a default entry to the buffer if it is not included by
+ *   the host, to prevent the host from attacking by hiding cpuid
+ *   leaves that may affect pVM security
+ *
+ * - Fixed values are enforced in the last step
+ */
+int pkvm_enforce_cpuid(struct kvm_cpuid_entry2 *e2, int *nent, int max_nent)
+{
+	struct kvm_cpuid_entry2 *de2 = this_cpu_ptr(cpuid_def);
+	int def_nent, r, i, n;
+	int orig_nent = *nent;
+	bool has_func4 = false;
+
+	memset(de2, 0, KVM_MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));
+	def_nent = KVM_MAX_CPUID_ENTRIES;
+
+	/*
+	 * It is possible for the pKVM hypervisor to implement a different
+	 * permitted XCR0 for each guest (although currently the pKVM hypervisor
+	 * implements the same permitted XCR0 for all guests). In this case, the
+	 * default CPUID leaf 0xD will be different. Thus get the default CPUID
+	 * entries for each guest, rather than initializing cpuid_def once during
+	 * pKVM initialization.
+	 */
+	r = pkvm_get_cpuid(de2, &def_nent);
+	if (r)
+		return r;
+
+	/* Enforce cpuid leaves according to the default set */
+	for (i = 0; i < orig_nent; i++) {
+		struct kvm_cpuid_entry2 *tmp;
+
+		if (cpuid_entry_is_empty(&e2[i]) ||
+		    pkvm_cpuid_entry_host_owned(&e2[i]))
+			continue;
+
+		if (e2[i].function == 4) {
+			has_func4 = true;
+			continue;
+		}
+
+		tmp = find_cpuid_entry(de2, def_nent, &e2[i]);
+		if (tmp)
+			pkvm_enforce_cpuid_entry(&e2[i], tmp);
+		else
+			memset(&e2[i], 0, sizeof(struct kvm_cpuid_entry2));
+	}
+
+	/* Insert default cpuid leaves if missing in the host buffer */
+	n = 0;
+	for (i = 0; i < def_nent; i++) {
+		if (pkvm_cpuid_entry_host_owned(&de2[i]))
+			continue;
+
+		/*
+		 * If the host already provides cache parameters,
+		 * skip all func4 entries in the default set. Simply
+		 * comparing func/index doesn't work, as the default set
+		 * may contain more entries than the host provides (due to
+		 * different numbers of cache levels on different
+		 * physical CPUs on a hybrid system).
+ */ + if ((de2[i].function == 4) && has_func4) + continue; + + if (find_cpuid_entry(e2, orig_nent, &de2[i])) + continue; + + /* find an empty slot */ + while (n < max_nent && !cpuid_entry_is_empty(&e2[n])) + n++; + + if (n == max_nent) + return -ENOSPC; + + e2[n++] = de2[i]; + } + + if (n > orig_nent) + *nent = n; + + /* Apply fixed values to the final set of entries */ + for (i = 0; i < *nent; i++) { + if (cpuid_entry_is_empty(&e2[i]) || + pkvm_cpuid_entry_host_owned(&e2[i])) + continue; + + pkvm_fixup_cpuid_entry(&e2[i]); + } + + return 0; +} #endif /* !__PKVM_HYP__ */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d3f5ae15a7ca..506722a39bc9 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -290,4 +290,9 @@ static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)); } +#ifdef __PKVM_HYP__ +int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, int nent); +int pkvm_enforce_cpuid(struct kvm_cpuid_entry2 *e2, int *nent, int max_nent); +#endif + #endif diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 8ddb01191d6f..874de1493d3d 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -44,6 +44,7 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif +#ifndef __PKVM_HYP__ /* * Using the register cache from interrupt context is generally not allowed, as * caching a register and marking it available/dirty can't be done atomically, @@ -55,6 +56,9 @@ BUILD_KVM_GPR_ACCESSORS(r15, R15) */ #define kvm_assert_register_caching_allowed(vcpu) \ lockdep_assert_once(in_task() || kvm_arch_pmi_in_guest(vcpu)) +#else +#define kvm_assert_register_caching_allowed(vcpu) +#endif /* * avail dirty diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h index eefab3dc8498..6c5be685696b 100644 --- a/arch/x86/kvm/kvm_onhyperv.h +++ b/arch/x86/kvm/kvm_onhyperv.h @@ -6,7 +6,7 @@ #ifndef __ARCH_X86_KVM_KVM_ONHYPERV_H__ #define __ARCH_X86_KVM_KVM_ONHYPERV_H__ -#if IS_ENABLED(CONFIG_HYPERV) +#if IS_ENABLED(CONFIG_HYPERV) && !defined(__PKVM_HYP__) int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); int hv_flush_remote_tlbs(struct kvm *kvm); void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9a407266db94..0f5073bc1f1d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -47,6 +47,8 @@ #include "hyperv.h" #include "smm.h" +#ifndef __PKVM_HYP__ + #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) #else @@ -533,12 +535,14 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) { return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +#endif /* !__PKVM_HYP__ */ static inline bool kvm_lapic_lvt_supported(struct kvm_lapic *apic, int lvt_index) { return apic->nr_lvt_entries > lvt_index; } +#ifndef __PKVM_HYP__ void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -561,26 +565,34 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } +#endif /* !__PKVM_HYP__ */ void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu) { int nr_lvt_entries = kvm_apic_calc_nr_lvt_entries(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; +#ifndef __PKVM_HYP__ int i; +#endif if (!lapic_in_kernel(vcpu) || nr_lvt_entries == apic->nr_lvt_entries) return; +#ifndef __PKVM_HYP__ /* Initialize/mask any "new" LVT entries. 
*/ for (i = apic->nr_lvt_entries; i < nr_lvt_entries; i++) kvm_lapic_set_reg(apic, APIC_LVTx(i), APIC_LVT_MASKED); +#endif apic->nr_lvt_entries = nr_lvt_entries; +#ifndef __PKVM_HYP__ /* The number of LVT entries is reflected in the version register. */ kvm_apic_set_version(vcpu); +#endif } +#ifndef __PKVM_HYP__ static const unsigned int apic_lvt_mask[KVM_APIC_MAX_NR_LVT_ENTRIES] = { [LVT_TIMER] = LVT_MASK, /* timer mode mask added at runtime */ [LVT_THERMAL_MONITOR] = LVT_MASK | APIC_MODE_MASK, @@ -1671,6 +1683,7 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev) { return container_of(dev, struct kvm_lapic, dev); } +#endif /* !__PKVM_HYP__ */ #define APIC_REG_MASK(reg) (1ull << ((reg) >> 4)) #define APIC_REGS_MASK(first, count) \ @@ -1714,6 +1727,7 @@ u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lapic_readable_reg_mask); +#ifndef __PKVM_HYP__ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, void *data) { @@ -3507,3 +3521,4 @@ void kvm_lapic_exit(void) static_key_deferred_flush(&apic_sw_disabled); WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } +#endif /* !__PKVM_HYP__ */ diff --git a/arch/x86/kvm/pkvm/Makefile b/arch/x86/kvm/pkvm/Makefile index 408e4206c579..0b887aa1ece1 100644 --- a/arch/x86/kvm/pkvm/Makefile +++ b/arch/x86/kvm/pkvm/Makefile @@ -37,14 +37,18 @@ pkvm-hyp-y += $(kernel-lib)/sort.o $(kernel-lib)/bsearch.o \ $(kernel-lib)/ctype.o kvm := .. -pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o +pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o $(kvm)/mtrr.o \ + $(kvm)/lapic.o pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o \ $(kvm)/vmx/vmx.o vmx/idt.o vmx/ept.o \ - $(kvm)/vmx/main.o + $(kvm)/vmx/main.o $(kvm)/vmx/vmenter.o + +AFLAGS_$(kvm)/vmx/vmenter.pkvm.o += -iquote $(obj) +$(obj)/$(kvm)/vmx/vmenter.pkvm.o: $(obj)/kvm-asm-offsets.h fpu := ../../kernel/fpu -pkvm-hyp-y += $(fpu)/core.o +pkvm-hyp-y += $(fpu)/core.o $(fpu)/xstate.o pkvm-obj := $(patsubst %.o,%.pkvm.o,$(pkvm-hyp-y)) obj-$(CONFIG_PKVM_X86) += pkvm.o diff --git a/arch/x86/kvm/pkvm/cpu.c b/arch/x86/kvm/pkvm/cpu.c index 11897b635e9e..72a594ebec7e 100644 --- a/arch/x86/kvm/pkvm/cpu.c +++ b/arch/x86/kvm/pkvm/cpu.c @@ -21,6 +21,7 @@ struct cpumask __cpu_possible_mask __ro_after_init; unsigned int nr_cpu_ids; DEFINE_PER_CPU(u64, x86_spec_ctrl_current); DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); +u64 x86_pred_cmd = PRED_CMD_IBPB; /* * Used to switch the FPU state between the host VM and pVMs. The fpu struct is diff --git a/arch/x86/kvm/pkvm/entry.S b/arch/x86/kvm/pkvm/entry.S index 7323e8867c37..6841745f4d31 100644 --- a/arch/x86/kvm/pkvm/entry.S +++ b/arch/x86/kvm/pkvm/entry.S @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -83,3 +84,92 @@ SYM_CODE_START_NOALIGN(x86_verw_sel) SYM_CODE_END(x86_verw_sel); .popsection + +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) + ANNOTATE_NOENDBR + movl $MSR_IA32_PRED_CMD, %ecx + movl _ASM_RIP(x86_pred_cmd), %eax + xorl %edx, %edx + wrmsr + + /* Make sure IBPB clears return stack preductions too. */ + FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET + RET +SYM_FUNC_END(write_ibpb) + +#ifdef CONFIG_X86_64 +/* + * This sequence executes branches in order to remove user branch information + * from the branch history tracker in the Branch Predictor, therefore removing + * user influence on subsequent BTB lookups. + * + * It should be used on parts prior to Alder Lake. 
Newer parts should use the + * BHI_DIS_S hardware control instead. If a pre-Alder Lake part is being + * virtualized on newer hardware the VMM should protect against BHI attacks by + * setting BHI_DIS_S for the guests. + * + * CALLs/RETs are necessary to prevent Loop Stream Detector(LSD) from engaging + * and not clearing the branch history. The call tree looks like: + * + * call 1 + * call 2 + * call 2 + * call 2 + * call 2 + * call 2 + * ret + * ret + * ret + * ret + * ret + * ret + * + * This means that the stack is non-constant and ORC can't unwind it with %rsp + * alone. Therefore we unconditionally set up the frame pointer, which allows + * ORC to unwind properly. + * + * The alignment is for performance and not for safety, and may be safely + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. + */ +SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp + movl $5, %ecx + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f + .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc + ANNOTATE_INTRA_FUNCTION_CALL +1: call 2f +.Lret1: RET + .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc +2: movl $5, %eax +3: jmp 4f + nop +4: sub $1, %eax + jnz 3b + sub $1, %ecx + jnz 1b +.Lret2: RET +5: lfence + pop %rbp + RET +SYM_FUNC_END(clear_bhb_loop) +STACK_FRAME_NON_STANDARD(clear_bhb_loop) +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kvm/pkvm/idt.c b/arch/x86/kvm/pkvm/idt.c index 4238be54b7b3..40c456fb9a58 100644 --- a/arch/x86/kvm/pkvm/idt.c +++ b/arch/x86/kvm/pkvm/idt.c @@ -114,6 +114,8 @@ static bool pkvm_fixup_exception(struct pt_regs *regs) reg = FIELD_GET(EX_DATA_REG_MASK, e->data); switch (type) { + case EX_TYPE_DEFAULT: + return ex_handler_default(e, regs); case EX_TYPE_WRMSR: return ex_handler_msr(e, regs, true, false, reg); case EX_TYPE_RDMSR: diff --git a/arch/x86/kvm/pkvm/init_finalize.c b/arch/x86/kvm/pkvm/init_finalize.c index d5489b6f6306..8d89dbe4b01a 100644 --- a/arch/x86/kvm/pkvm/init_finalize.c +++ b/arch/x86/kvm/pkvm/init_finalize.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include +#include "../cpuid.h" #include "early_alloc.h" #include "fpu.h" #include "init_finalize.h" @@ -238,6 +240,14 @@ static int finalize_global(struct pkvm_mem_info infos[], int nr_infos, if (ret) return ret; + pkvm_setup_xstate_cache(); + + /* + * Initialize KVM cpuid_xstate_sizes to support CPUID emulation for the + * guest VMs. + */ + kvm_init_xstate_sizes(); + return hyp_g_finalize ? hyp_g_finalize() : 0; } diff --git a/arch/x86/kvm/pkvm/pkvm.c b/arch/x86/kvm/pkvm/pkvm.c index 5685c174796d..8539662a7012 100644 --- a/arch/x86/kvm/pkvm/pkvm.c +++ b/arch/x86/kvm/pkvm/pkvm.c @@ -31,6 +31,11 @@ bool tdp_enabled = true; struct pkvm_hyp *pkvm_hyp; DEFINE_PER_CPU(struct pkvm_pcpu *, phys_cpu); DEFINE_PER_CPU(struct kvm_vcpu *, host_vcpu); +/* + * similarly pmu.c is not compiled. define kvm_mmu_cap here for the use + * in cpuid.c + */ +struct x86_pmu_capability __read_mostly kvm_pmu_cap = {0}; /* The maximum number of VMs under pkvm. 
*/ #define MAX_PKVM_VMS 64 @@ -52,6 +57,9 @@ static struct pkvm_vm_ref { */ size_t kvm_vcpu_sz = sizeof(struct kvm_vcpu); +/* The current loaded guest vCPU. */ +static DEFINE_PER_CPU(struct kvm_vcpu*, cur_guest_vcpu); + static int __pkvm_vcpu_free(struct pkvm_vm *pkvm_vm, int vcpu_handle, struct pkvm_memcache *mc); @@ -318,10 +326,41 @@ static void unsetup_vcpu_lapic(struct kvm_vcpu *vcpu) pkvm_host_unshare_hyp(__pkvm_pa(apic->regs), PAGE_SIZE); } +static int share_vcpu_mce_banks(struct kvm_vcpu *vcpu) +{ + int ret; + + if (pkvm_is_protected_vcpu(vcpu)) + return -EINVAL; + + ret = pkvm_host_share_hyp(__pkvm_pa(vcpu->arch.mce_banks), KVM_MCE_SIZE); + if (ret) + return ret; + + ret = pkvm_host_share_hyp(__pkvm_pa(vcpu->arch.mci_ctl2_banks), KVM_MCI_CTL2_SIZE); + if (ret) + pkvm_host_unshare_hyp(__pkvm_pa(vcpu->arch.mce_banks), KVM_MCE_SIZE); + + return ret; +} + +static void unshare_vcpu_mce_banks(struct kvm_vcpu *vcpu) +{ + if (pkvm_is_protected_vcpu(vcpu)) + return; + + pkvm_host_unshare_hyp(__pkvm_pa(vcpu->arch.mce_banks), KVM_MCE_SIZE); + pkvm_host_unshare_hyp(__pkvm_pa(vcpu->arch.mci_ctl2_banks), KVM_MCI_CTL2_SIZE); +} + static int __vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, struct fpstate *fps) { struct pkvm_vcpu *pkvm_vcpu = to_pkvm_vcpu(vcpu); int ret = kvm_x86_call(vcpu_precreate)(kvm); + void *unused = (void *)pkvm_vcpu + + PKVM_VCPU_BASE_SIZE + + kvm_vcpu_sz; + int cpu = raw_smp_processor_id(); if (ret) return ret; @@ -335,26 +374,69 @@ static int __vcpu_create(struct kvm *kvm, struct kvm_vcpu *vcpu, struct fpstate vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - vcpu->arch.mce_banks = (void *)pkvm_vcpu + PKVM_VCPU_BASE_SIZE + kvm_vcpu_sz; - vcpu->arch.mci_ctl2_banks = (void *)vcpu->arch.mce_banks + KVM_MCE_SIZE; + + if (!pkvm_is_protected_vcpu(vcpu)) { + vcpu->arch.mce_banks = kern_pkvm_va(pkvm_vcpu->shared_vcpu->arch.mce_banks); + vcpu->arch.mci_ctl2_banks = + kern_pkvm_va(pkvm_vcpu->shared_vcpu->arch.mci_ctl2_banks); + ret = share_vcpu_mce_banks(vcpu); + if (ret) + return ret; + } else { + vcpu->arch.mce_banks = unused; + unused += KVM_MCE_SIZE; + vcpu->arch.mci_ctl2_banks = unused; + unused += KVM_MCI_CTL2_SIZE; + } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + vcpu->arch.apic_base = pkvm_vcpu->shared_vcpu->arch.apic_base; if (lapic_in_kernel(pkvm_vcpu->shared_vcpu)) - vcpu->arch.apic = (void *)vcpu->arch.mci_ctl2_banks + KVM_MCI_CTL2_SIZE; + vcpu->arch.apic = unused; ret = setup_vcpu_lapic(vcpu); if (ret) - return ret; + goto unshare_mce; vcpu->arch.guest_fpu.fpstate = fps; pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); if (pkvm_is_protected_vcpu(vcpu)) fpstate_set_confidential(&vcpu->arch.guest_fpu); + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + } + + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; + ret = kvm_x86_call(vcpu_create)(vcpu); if (ret) - unsetup_vcpu_lapic(vcpu); + goto unsetup_lapic; + /* Load guest vCPU to reset it. */ + kvm_x86_call(vcpu_load)(vcpu, cpu); + + kvm_vcpu_reset(vcpu, false); + + /* + * The guest vCPU should be put before switching back to the host vCPU + * to make sure the vcpu state is not cached on this CPU as this guest + * vCPU may be loaded on another CPU later by the host via the PV + * interface. 
+ */ + kvm_x86_call(vcpu_put)(vcpu); + + kvm_x86_call(vcpu_load)(this_cpu_read(host_vcpu), cpu); + + return 0; + +unsetup_lapic: + unsetup_vcpu_lapic(vcpu); +unshare_mce: + unshare_vcpu_mce_banks(vcpu); return ret; } @@ -363,6 +445,7 @@ static void __vcpu_free(struct kvm_vcpu *vcpu) kvm_x86_call(vcpu_free)(vcpu); unsetup_vcpu_lapic(vcpu); + unshare_vcpu_mce_banks(vcpu); } static int pkvm_vcpu_create(int vm_handle, phys_addr_t host_vcpu_pa, @@ -384,7 +467,9 @@ static int pkvm_vcpu_create(int vm_handle, phys_addr_t host_vcpu_pa, goto put_vm; shared_vcpu = __pkvm_va(host_vcpu_pa); - vcpu_size = PKVM_VCPU_BASE_SIZE + kvm_vcpu_sz + KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; + vcpu_size = PKVM_VCPU_BASE_SIZE + kvm_vcpu_sz; + if (pkvm_is_protected_vm(&pkvm_vm->kvm)) + vcpu_size += KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; if (lapic_in_kernel(shared_vcpu)) vcpu_size += sizeof(struct kvm_lapic); vcpu_size = PAGE_ALIGN(vcpu_size); @@ -446,6 +531,10 @@ static int __pkvm_vcpu_free(struct pkvm_vm *pkvm_vm, int vcpu_handle, fps = pkvm_vcpu->vcpu.arch.guest_fpu.fpstate; teardown_donated_memory(mc, fps, fps->size); + if (pkvm_vcpu->vcpu.arch.cpuid_entries) + teardown_donated_memory(mc, pkvm_vcpu->vcpu.arch.cpuid_entries, + PAGE_ALIGN(sizeof(struct kvm_cpuid_entry2) * + pkvm_vcpu->vcpu.arch.cpuid_nent)); teardown_donated_memory(mc, pkvm_vcpu, pkvm_vcpu->size); pkvm_host_unshare_hyp(shared_vcpu_pa, kvm_vcpu_sz); @@ -473,12 +562,775 @@ static int pkvm_vcpu_free(int vm_handle, int vcpu_handle, struct pkvm_memcache * return ret; } +static int pkvm_vcpu_load(int vm_handle, int vcpu_handle) +{ + struct pkvm_vcpu *pkvm_vcpu = pkvm_get_vcpu(vm_handle, vcpu_handle); + int cpu = raw_smp_processor_id(); + struct kvm_vcpu *vcpu; + int loaded_cpu; + int ret = 0; + + if (!pkvm_vcpu) + return -EINVAL; + + vcpu = &pkvm_vcpu->vcpu; + loaded_cpu = cmpxchg(&vcpu->cpu, -1, cpu); + if (loaded_cpu == -1) { + /* + * Get the pkvm_vcpu to prevent it from being freed via the + * vcpu_free PV interface while it is still loaded. If the + * obtained pkvm_vcpu is not the same as the original one, it + * must be a pkvm bug. + */ + BUG_ON(pkvm_vcpu != pkvm_get_vcpu(vm_handle, vcpu_handle)); + + this_cpu_write(cur_guest_vcpu, vcpu); + } else if (loaded_cpu == cpu) { + /* The guest vCPU is already loaded on this CPU. */ + this_cpu_write(cur_guest_vcpu, vcpu); + } else { + /* The guest vCPU is already loaded on another CPU. */ + ret = -EBUSY; + } + + pkvm_put_vcpu(pkvm_vcpu); + + return ret; +} + +static int pkvm_vcpu_put(int vm_handle, int vcpu_handle) +{ + struct pkvm_vcpu *pkvm_vcpu = pkvm_get_vcpu(vm_handle, vcpu_handle); + int cpu = raw_smp_processor_id(), loaded_cpu, ret = 0; + struct kvm_vcpu *vcpu; + + if (!pkvm_vcpu) + return -EINVAL; + + vcpu = &pkvm_vcpu->vcpu; + loaded_cpu = vcpu->cpu; + if (loaded_cpu == cpu) { + /* + * The current active vCPU is the host vCPU. Switch to the guest + * vCPU in case vcpu_put operation requires. + */ + kvm_x86_call(vcpu_load)(vcpu, cpu); + + /* + * Another guest vCPU may have already been loaded on this CPU + * thus the cur_guest_vcpu may be overridden. So only set the + * cur_guest_vcpu as NULL if it points to the guest vCPU being + * put. + */ + if (vcpu == this_cpu_read(cur_guest_vcpu)) + this_cpu_write(cur_guest_vcpu, NULL); + + kvm_x86_call(vcpu_put)(vcpu); + + /* + * Put this pkvm_vcpu to allow it to be freed via the vcpu_free PV + * interface. + */ + pkvm_put_vcpu(pkvm_vcpu); + + /* Switch to the host vCPU as a guest vCPU was just loaded. 
*/ + kvm_x86_call(vcpu_load)(this_cpu_read(host_vcpu), cpu); + + /* + * Paired with cmpxchg in pkvm_vcpu_load() to make sure the + * vcpu->cpu is set only after the put is completed. + */ + smp_store_release(&vcpu->cpu, -1); + } else { + /* + * The guest vCPU is not loaded on any CPU or is loaded on a + * different CPU. + */ + ret = -EINVAL; + } + + pkvm_put_vcpu(pkvm_vcpu); + + return ret; +} + +static bool is_guest_vcpu_accessible(struct kvm_vcpu *vcpu, enum pkvm_hc hc) +{ + /* + * There is no isolation between non-protected VMs and the host, thus + * all the PV interfaces are allowed for an npVM. + */ + if (!pkvm_is_protected_vcpu(vcpu)) + return true; + + switch (hc) { + case __pkvm__enable_nmi_window: + case __pkvm__enable_irq_window: + case __pkvm__interrupt_allowed: + case __pkvm__nmi_allowed: + case __pkvm__get_nmi_mask: + case __pkvm__inject_irq: + case __pkvm__inject_nmi: + case __pkvm__cancel_injection: + case __pkvm__update_cr8_intercept: + case __pkvm__set_virtual_apic_mode: + case __pkvm__refresh_apicv_exec_ctrl: + case __pkvm__load_eoi_exitmap: + case __pkvm__hwapic_isr_update: + case __pkvm__sync_pir_to_irr: + case __pkvm__write_tsc_offset: + case __pkvm__write_tsc_multiplier: + case __pkvm__load_mmu_pgd: + case __pkvm__setup_mce: + /* + * The host is responsible for running vCPU, injecting + * interrupts, emulating lapic etc. Always allow the related PV + * interfaces. + * + * TODO: As the pVM can use another secure time source, the + * guest TSC is allowed for the host to emulate and access. To + * support the pVM with secure TSC, add protection for TSC + * related PV interfaces. + * __pkvm__write_tsc_offset + * __pkvm__write_tsc_multiplier + */ + return true; + case __pkvm__set_efer: + case __pkvm__set_msr: + case __pkvm__get_msr: + case __pkvm__set_cr4: + case __pkvm__set_cr0: + case __pkvm__set_rflags: + case __pkvm__get_rflags: + case __pkvm__vcpu_reset: + case __pkvm__set_segment: + case __pkvm__get_segment: + case __pkvm__get_segment_base: + case __pkvm__set_idt: + case __pkvm__get_idt: + case __pkvm__set_gdt: + case __pkvm__get_gdt: + case __pkvm__flush_tlb_all: + case __pkvm__flush_tlb_current: + case __pkvm__flush_tlb_gva: + case __pkvm__flush_tlb_guest: + case __pkvm__vcpu_after_set_cpuid: + case __pkvm__vcpu_add_fpstate: + /* + * As the host needs to pre-configure the pVM's vCPU state for + * booting, the protection for pVM is only enforced by the pKVM + * hypervisor once the vCPU has started running. + */ + return !kvm_vcpu_has_run(vcpu); + default: + /* + * The other PV interfaces are not necessary for the host to + * access the pVM's vCPU state. Deny these PV interfaces by + * default. + */ + return false; + } +} + +static void pkvm_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + /* + * The guest_debug will impact what exceptions should be intercepted + * for the debugging purpose. Debugging npVMs from the host side is + * allowed thus updating its guest_debug flags accordingly, but + * debugging pVMs from the host side is not allowed. + * + * As the __pkvm__update_exception_bitmap is always denied for the pVM, + * it must be a code bug if the vcpu is protected. 
+	 */
+	BUG_ON(pkvm_is_protected_vcpu(vcpu));
+	vcpu->guest_debug = to_pkvm_vcpu(vcpu)->shared_vcpu->guest_debug;
+
+	kvm_x86_call(update_exception_bitmap)(vcpu);
+}
+
+static int pkvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	if (pkvm_is_protected_vcpu(vcpu)) {
+		if (WARN_ON(kvm_vcpu_has_run(vcpu)))
+			return -EPERM;
+
+		/*
+		 * For simplicity and security, allow the host to change
+		 * initial values of those MSRs (or individual bits in MSRs)
+		 * that are currently tweaked by crosvm, and only those.
+		 * The allowed set can be extended as needed.
+		 */
+		switch (index) {
+		case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+		case MSR_MTRRdefType:
+			break;
+		case MSR_IA32_MISC_ENABLE:
+			if (data & ~(MSR_IA32_MISC_ENABLE_FAST_STRING |
+				     MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+				     MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
+				return -EPERM;
+
+			/*
+			 * vPMU is not supported by pKVM yet. Don't trick the
+			 * pVM into thinking it is.
+			 */
+			data |= MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+				MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
+			break;
+		case MSR_STAR:
+		case MSR_LSTAR:
+		case MSR_CSTAR:
+		case MSR_SYSCALL_MASK:
+		case MSR_KERNEL_GS_BASE:
+		case MSR_IA32_SYSENTER_CS:
+		case MSR_IA32_SYSENTER_ESP:
+		case MSR_IA32_SYSENTER_EIP:
+			/*
+			 * TODO: The user space VMM on the host side (e.g.,
+			 * crosvm) may still try to set these MSRs, which are
+			 * protected by the pKVM hypervisor for a pVM. Ignore
+			 * writes to these MSRs and return 0 to keep such a
+			 * user space VMM happy, without actually modifying
+			 * them. This will eventually be fixed in the user
+			 * space VMM so that it avoids doing so for a pVM;
+			 * once that is implemented, these cases can be
+			 * removed.
+			 */
+			return 0;
+		default:
+			return -EPERM;
+		}
+	}
+
+	return kvm_msr_write(vcpu, index, data);
+}
+
+static int pkvm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg,
+			  union pkvm_hc_data *out)
+{
+	kvm_x86_call(cache_reg)(vcpu, reg);
+
+	switch (reg) {
+	case VCPU_REGS_RSP:
+		out->cache_reg.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+		break;
+	case VCPU_REGS_RIP:
+		out->cache_reg.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+		break;
+	case VCPU_EXREG_PDPTR: {
+		struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+
+		out->cache_reg.pdptrs[0] = mmu->pdptrs[0];
+		out->cache_reg.pdptrs[1] = mmu->pdptrs[1];
+		out->cache_reg.pdptrs[2] = mmu->pdptrs[2];
+		out->cache_reg.pdptrs[3] = mmu->pdptrs[3];
+		break;
+	}
+	case VCPU_EXREG_CR0:
+		out->cache_reg.cr0 = vcpu->arch.cr0;
+		break;
+	case VCPU_EXREG_CR3:
+		out->cache_reg.cr3 = vcpu->arch.cr3;
+		break;
+	case VCPU_EXREG_CR4:
+		out->cache_reg.cr4 = vcpu->arch.cr4;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static void pkvm_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	unsigned long dr7 = val;
+
+	kvm_x86_call(set_dr7)(vcpu, dr7);
+	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
+	if (dr7 & DR7_BP_EN_MASK)
+		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
+}
+
+static inline bool pkvm_event_injection_allowed(struct kvm_vcpu *vcpu)
+{
+	return !kvm_event_needs_reinjection(vcpu) && !vcpu->arch.exception.pending;
+}
+
+static int pkvm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (for_injection && !pkvm_event_injection_allowed(vcpu))
+		return -EBUSY;
+
+	return kvm_x86_call(interrupt_allowed)(vcpu, for_injection);
+}
+
+static int pkvm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+	if (for_injection && !pkvm_event_injection_allowed(vcpu))
+		return -EBUSY;
+
+	return kvm_x86_call(nmi_allowed)(vcpu, for_injection);
+}
+
+static void pkvm_inject_irq(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu *shared_vcpu = to_pkvm_vcpu(vcpu)->shared_vcpu; + + if (WARN_ON_ONCE(pkvm_interrupt_allowed(vcpu, true) <= 0)) + return; + + vcpu->arch.interrupt.soft = shared_vcpu->arch.interrupt.soft; + vcpu->arch.interrupt.nr = shared_vcpu->arch.interrupt.nr; + kvm_x86_call(inject_irq)(vcpu, false); +} + +static void pkvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + if (WARN_ON_ONCE(pkvm_nmi_allowed(vcpu, true) <= 0)) + return; + + kvm_x86_call(inject_nmi)(vcpu); +} + +static void pkvm_inject_exception(struct kvm_vcpu *vcpu) +{ + /* + * As the __pkvm__inject_exception is always denied for the pVM, + * it must be a code bug if the vcpu is protected. + */ + BUG_ON(pkvm_is_protected_vcpu(vcpu)); + vcpu->arch.exception = to_pkvm_vcpu(vcpu)->shared_vcpu->arch.exception; + + kvm_x86_call(inject_exception)(vcpu); +} + +static void pkvm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct pkvm_vcpu *pkvm_vcpu = to_pkvm_vcpu(vcpu); + struct kvm_vcpu *shared_vcpu; + + kvm_x86_call(cancel_injection)(vcpu); + + shared_vcpu = pkvm_vcpu->shared_vcpu; + if (vcpu->arch.nmi_injected) { + shared_vcpu->arch.nmi_injected = true; + vcpu->arch.nmi_injected = false; + } else if (vcpu->arch.interrupt.injected) { + kvm_queue_interrupt(shared_vcpu, vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft); + kvm_clear_interrupt_queue(vcpu); + } else if (!pkvm_is_protected_vcpu(vcpu) && vcpu->arch.exception.injected) { + /* + * For the pVM, the exception can only be injected and canceled + * by the pkvm hypervisor. + * For the npVM, the exception can be injected and canceled by + * both sides. + */ + shared_vcpu->arch.exception = vcpu->arch.exception; + kvm_clear_exception_queue(vcpu); + } +} + +static void pkvm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + u64 apic_base = to_pkvm_vcpu(vcpu)->shared_vcpu->arch.apic_base; + + if ((vcpu->arch.apic_base ^ apic_base) & MSR_IA32_APICBASE_ENABLE) + vcpu->arch.cpuid_dynamic_bits_dirty = true; + + vcpu->arch.apic_base = apic_base; + kvm_x86_call(set_virtual_apic_mode)(vcpu); +} + +static void pkvm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu, bool apicv_active) +{ + if (!lapic_in_kernel(vcpu)) + return; + + vcpu->arch.apic->apicv_active = apicv_active; + kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu); +} + +static void pkvm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 eoi_exit_bitmap0, + u64 eoi_exit_bitmap1, u64 eoi_exit_bitmap2, + u64 eoi_exit_bitmap3) +{ + u64 eoi_exit_bitmap[] = { + eoi_exit_bitmap0, + eoi_exit_bitmap1, + eoi_exit_bitmap2, + eoi_exit_bitmap3, + }; + + kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap); +} + +static void pkvm_sync_pir_to_irr(struct kvm_vcpu *vcpu, int pir) +{ + to_pkvm_vcpu(vcpu)->max_irr = pir; + kvm_x86_call(sync_pir_to_irr)(vcpu); +} + +static int pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu, + phys_addr_t cpuid_pa, + struct pkvm_memcache *mc) +{ + struct kvm_cpuid_entry2 *new, *old; + int new_nent, old_nent, ret; + u64 size, aligned_size; + + new_nent = to_pkvm_vcpu(vcpu)->shared_vcpu->arch.cpuid_nent; + size = sizeof(struct kvm_cpuid_entry2) * new_nent; + aligned_size = PAGE_ALIGN(size); + ret = pkvm_host_donate_hyp(cpuid_pa, aligned_size, false); + if (ret) + return ret; + + new = __pkvm_va(cpuid_pa); + if (pkvm_is_protected_vcpu(vcpu)) { + /* + * Donation is page-granule, so the host must ensure that + * the cpuid buffer size is page aligned though the actual + * nent only records valid entries. + * + * Clear the trailing space after nent so it can be used + * to hold missing cpuid entries enforced by pkvm. 
+ */ + memset((void *)new + size, 0, aligned_size - size); + + ret = pkvm_enforce_cpuid(new, &new_nent, + aligned_size / sizeof(struct kvm_cpuid_entry2)); + if (ret) + goto undonate; + } + + old = vcpu->arch.cpuid_entries; + old_nent = vcpu->arch.cpuid_nent; + + ret = kvm_set_cpuid(vcpu, new, new_nent); + if (ret) + goto undonate; + + memset(mc, 0, sizeof(*mc)); + /* + * New cpuid entries memory is consumed. Tear down the old cpuid + * entries memory if there is. + */ + if (old) + teardown_donated_memory(mc, (void *)old, + PAGE_ALIGN(sizeof(struct kvm_cpuid_entry2) * + old_nent)); + + return 0; + +undonate: + pkvm_hyp_donate_host(__pkvm_pa(new), aligned_size, false); + return ret; +} + +static int pkvm_vcpu_add_fpstate(struct kvm_vcpu *vcpu, + phys_addr_t fpstate_pa, size_t size, + struct pkvm_memcache *mc) +{ + struct fpstate *new, *old; + int ret; + + /* Expect the host to use this PV interface for pVM only. */ + if (!pkvm_is_protected_vcpu(vcpu)) + return -EINVAL; + + memset(mc, 0, sizeof(*mc)); + + old = vcpu->arch.guest_fpu.fpstate; + new = __pkvm_va(fpstate_pa); + /* + * Reuse the existing fpstate memory if it's sufficiently large. At this + * stage, we can't determine whether the new fpstate size matches the + * vCPUID or not, because that check only occurs when the host calls + * __pkvm__vcpu_after_set_cpuid to update the vCPUID. If the new fpstate + * size is smaller than what the new vCPUID requires, the vCPUID won't + * be updated. Therefore, ensuring the new fpstate size is at least as + * large as the previous one allows continued support for this scenario. + */ + if (old && old->size >= size) { + teardown_donated_memory(mc, new, size); + return 0; + } + + ret = pkvm_host_donate_hyp(fpstate_pa, size, true); + if (ret) + return ret; + + new->size = size; + vcpu->arch.guest_fpu.fpstate = new; + + pkvm_init_guest_fpu(&vcpu->arch.guest_fpu); + fpstate_set_confidential(&vcpu->arch.guest_fpu); + + /* + * New physical fpstate memory is consumed. Tear down the old fpstate + * memory if there is. + */ + if (old) + teardown_donated_memory(mc, old, old->size); + + return 0; +} + +static void pkvm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 tsc_offset) +{ + vcpu->arch.l1_tsc_offset = tsc_offset; + vcpu->arch.tsc_offset = tsc_offset; + kvm_x86_call(write_tsc_offset)(vcpu); +} + +static void pkvm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 ratio) +{ + if (!kvm_caps.has_tsc_control) + return; + + vcpu->arch.l1_tsc_scaling_ratio = ratio; + vcpu->arch.tsc_scaling_ratio = ratio; + kvm_x86_call(write_tsc_multiplier)(vcpu); +} + +static int pkvm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +{ + struct kvm_vcpu *shared_vcpu = to_pkvm_vcpu(vcpu)->shared_vcpu; + + /* + * The guest CR3/PDPTR may be updated by the load_mmu_pgd. Sync the + * guest CR3/PDPTR from the host for both npVMs or pVMs (if pVMs are not + * starting to run yet). 
+ */ + if (!pkvm_is_protected_vcpu(vcpu) || !kvm_vcpu_has_run(vcpu)) { + if (kvm_register_is_dirty(shared_vcpu, VCPU_EXREG_CR3)) { + vcpu->arch.cr3 = shared_vcpu->arch.cr3; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + } + + if (kvm_register_is_dirty(shared_vcpu, VCPU_EXREG_PDPTR)) { + struct kvm_mmu *shared_walk_mmu = kern_pkvm_va(shared_vcpu->arch.walk_mmu); + struct kvm_mmu *walk_mmu = vcpu->arch.walk_mmu; + int ret; + + ret = pkvm_host_share_hyp(__pkvm_pa(shared_walk_mmu), + sizeof(struct kvm_mmu)); + if (ret) + return ret; + + walk_mmu->pdptrs[0] = shared_walk_mmu->pdptrs[0]; + walk_mmu->pdptrs[1] = shared_walk_mmu->pdptrs[1]; + walk_mmu->pdptrs[2] = shared_walk_mmu->pdptrs[2]; + walk_mmu->pdptrs[3] = shared_walk_mmu->pdptrs[3]; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + + pkvm_host_unshare_hyp(__pkvm_pa(shared_walk_mmu), + sizeof(struct kvm_mmu)); + } + } + + /* + * TODO: Implement guest memory protection rather than directly using + * the EPT controlled by the host. + */ + vcpu->arch.mmu->root.hpa = root_hpa; + vcpu->arch.mmu->root_role.level = root_level; + + kvm_x86_call(load_mmu_pgd)(vcpu, vcpu->arch.mmu->root.hpa, + vcpu->arch.mmu->root_role.level); + + return 0; +} + +static int pkvm_vcpu_handle_host_hypercall(struct kvm_vcpu *hvcpu, enum pkvm_hc hc, + union pkvm_hc_data *in, union pkvm_hc_data *out) +{ + struct kvm_vcpu *vcpu = this_cpu_read(cur_guest_vcpu); + int cpu = raw_smp_processor_id(), ret = 0; + + BUG_ON(hvcpu != this_cpu_read(host_vcpu)); + + if (!vcpu) + return -EINVAL; + + if (!is_guest_vcpu_accessible(vcpu, hc)) + return -EPERM; + + kvm_x86_call(vcpu_load)(vcpu, cpu); + + switch (hc) { + case __pkvm__update_exception_bitmap: + pkvm_update_exception_bitmap(vcpu); + break; + case __pkvm__set_efer: + ret = kvm_x86_call(set_efer)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_msr: + ret = pkvm_set_msr(vcpu, pkvm_hc_input1(hvcpu), + pkvm_hc_input2(hvcpu)); + break; + case __pkvm__get_msr: + ret = kvm_msr_read(vcpu, pkvm_hc_input1(hvcpu), &out->get_msr.data); + break; + case __pkvm__cache_reg: + ret = pkvm_cache_reg(vcpu, pkvm_hc_input1(hvcpu), out); + break; + case __pkvm__set_cr4: + kvm_x86_call(set_cr4)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_cr0: + kvm_x86_call(set_cr0)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_rflags: + kvm_x86_call(set_rflags)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_rflags: + out->get_rflags.data = kvm_x86_call(get_rflags)(vcpu); + break; + case __pkvm__set_dr7: + pkvm_set_dr7(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__vcpu_reset: + /* + * Only needs to support reset vCPU for INIT as the non-INIT reset + * is done by the pKVM hypervisor when creating this vCPU. + * + * TODO: The INIT for pVMs will be handled inside the pKVM hypervisor. + * Once this is implemented, make the __pkvm__vcpu_reset only for npVM. 
+ */ + kvm_vcpu_reset(vcpu, true); + break; + case __pkvm__set_segment: + kvm_x86_call(set_segment)(vcpu, &in->set_segment.seg_val, + in->set_segment.seg); + break; + case __pkvm__get_segment: + kvm_x86_call(get_segment)(vcpu, &out->get_segment.seg_val, + pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_segment_base: + out->get_segment_base.data = + kvm_x86_call(get_segment_base)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__set_idt: + kvm_x86_call(set_idt)(vcpu, &in->set_idt.desc); + break; + case __pkvm__get_idt: + kvm_x86_call(get_idt)(vcpu, &out->get_idt.desc); + break; + case __pkvm__set_gdt: + kvm_x86_call(set_gdt)(vcpu, &in->set_gdt.desc); + break; + case __pkvm__get_gdt: + kvm_x86_call(get_gdt)(vcpu, &out->get_gdt.desc); + break; + case __pkvm__flush_tlb_all: + kvm_x86_call(flush_tlb_all)(vcpu); + break; + case __pkvm__flush_tlb_current: + kvm_x86_call(flush_tlb_current)(vcpu); + break; + case __pkvm__flush_tlb_gva: + kvm_x86_call(flush_tlb_gva)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__flush_tlb_guest: + kvm_x86_call(flush_tlb_guest)(vcpu); + break; + case __pkvm__set_interrupt_shadow: + kvm_x86_call(set_interrupt_shadow)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_interrupt_shadow: + out->get_interrupt_shadow.data = kvm_x86_call(get_interrupt_shadow)(vcpu); + break; + case __pkvm__enable_nmi_window: + kvm_x86_call(enable_nmi_window)(vcpu); + break; + case __pkvm__enable_irq_window: + kvm_x86_call(enable_irq_window)(vcpu); + break; + case __pkvm__interrupt_allowed: + ret = pkvm_interrupt_allowed(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__nmi_allowed: + ret = pkvm_nmi_allowed(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__get_nmi_mask: + out->get_nmi_mask.data = kvm_x86_call(get_nmi_mask)(vcpu); + break; + case __pkvm__set_nmi_mask: + kvm_x86_call(set_nmi_mask)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__inject_irq: + pkvm_inject_irq(vcpu); + break; + case __pkvm__inject_nmi: + pkvm_inject_nmi(vcpu); + break; + case __pkvm__inject_exception: + pkvm_inject_exception(vcpu); + break; + case __pkvm__cancel_injection: + pkvm_cancel_injection(vcpu); + break; + case __pkvm__update_cr8_intercept: + kvm_x86_call(update_cr8_intercept)(vcpu, pkvm_hc_input1(hvcpu), + pkvm_hc_input2(hvcpu)); + break; + case __pkvm__set_virtual_apic_mode: + pkvm_set_virtual_apic_mode(vcpu); + break; + case __pkvm__refresh_apicv_exec_ctrl: + pkvm_refresh_apicv_exec_ctrl(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__load_eoi_exitmap: + pkvm_load_eoi_exitmap(vcpu, pkvm_hc_input1(hvcpu), pkvm_hc_input2(hvcpu), + pkvm_hc_input3(hvcpu), pkvm_hc_input4(hvcpu)); + break; + case __pkvm__hwapic_isr_update: + kvm_x86_call(hwapic_isr_update)(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__sync_pir_to_irr: + pkvm_sync_pir_to_irr(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__vcpu_after_set_cpuid: + ret = pkvm_vcpu_after_set_cpuid(vcpu, pkvm_host_gpa_to_phys(pkvm_hc_input1(hvcpu)), + &out->vcpu_after_set_cpuid.memcache); + break; + case __pkvm__vcpu_add_fpstate: + ret = pkvm_vcpu_add_fpstate(vcpu, pkvm_host_gpa_to_phys(pkvm_hc_input1(hvcpu)), + pkvm_hc_input2(hvcpu), &out->vcpu_add_fpstate.memcache); + break; + case __pkvm__write_tsc_offset: + pkvm_write_tsc_offset(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__write_tsc_multiplier: + pkvm_write_tsc_multiplier(vcpu, pkvm_hc_input1(hvcpu)); + break; + case __pkvm__load_mmu_pgd: + ret = pkvm_load_mmu_pgd(vcpu, pkvm_hc_input1(hvcpu), pkvm_hc_input2(hvcpu)); + break; + case 
__pkvm__setup_mce: + ret = kvm_vcpu_x86_setup_mce(vcpu, pkvm_hc_input1(hvcpu)); + break; + default: + ret = -EINVAL; + break; + } + + kvm_x86_call(vcpu_load)(hvcpu, cpu); + return ret; +} + void pkvm_handle_host_hypercall(struct kvm_vcpu *vcpu) { enum pkvm_hc hc = pkvm_hc(vcpu); - union pkvm_hc_data out; + union pkvm_hc_data in, out; int ret = 0; + pkvm_hc_get_input(vcpu, hc, &in); + switch (hc) { case __pkvm__init_finalize: ret = pkvm_init_finalize((struct pkvm_mem_info *)pkvm_hc_input1(vcpu), @@ -515,8 +1367,16 @@ void pkvm_handle_host_hypercall(struct kvm_vcpu *vcpu) ret = pkvm_vcpu_free(pkvm_hc_input1(vcpu), pkvm_hc_input2(vcpu), &out.vcpu_free.memcache); break; + case __pkvm__vcpu_load: + ret = pkvm_vcpu_load(pkvm_hc_input1(vcpu), + pkvm_hc_input2(vcpu)); + break; + case __pkvm__vcpu_put: + ret = pkvm_vcpu_put(pkvm_hc_input1(vcpu), + pkvm_hc_input2(vcpu)); + break; default: - ret = -EINVAL; + ret = pkvm_vcpu_handle_host_hypercall(vcpu, hc, &in, &out); break; } @@ -612,3 +1472,42 @@ void pkvm_put_vm(struct pkvm_vm *pkvm_vm) WARN_ON(atomic_dec_if_positive(&pkvm_vm_ref->refcount) <= 0); } + +struct pkvm_vcpu *pkvm_get_vcpu(int vm_handle, int vcpu_handle) +{ + struct pkvm_vm *pkvm_vm; + + if (vcpu_handle < 0 || vcpu_handle >= KVM_MAX_VCPUS) + return NULL; + + pkvm_vm = pkvm_get_vm(vm_handle); + if (!pkvm_vm) + return NULL; + + vcpu_handle = array_index_nospec(vcpu_handle, KVM_MAX_VCPUS); + if (atomic_inc_not_zero(&pkvm_vm->vcpu_refs[vcpu_handle])) + return pkvm_vm->vcpus[vcpu_handle]; + + pkvm_put_vm(pkvm_vm); + return NULL; +} + +void pkvm_put_vcpu(struct pkvm_vcpu *pkvm_vcpu) +{ + int vcpu_handle = pkvm_vcpu->vcpu.arch.pkvm.handle; + + WARN_ON(atomic_dec_if_positive(&pkvm_vcpu->pkvm_vm->vcpu_refs[vcpu_handle]) <= 0); + + pkvm_put_vm(pkvm_vcpu->pkvm_vm); +} + +unsigned long pkvm_pcpu_tss(int cpu) +{ +#ifdef CONFIG_PKVM_X86_DEBUG + return (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss; +#else + struct pkvm_pcpu *pcpu = per_cpu(phys_cpu, cpu); + + return (unsigned long)&pcpu->tss; +#endif +} diff --git a/arch/x86/kvm/pkvm/pkvm.h b/arch/x86/kvm/pkvm/pkvm.h index e6b6af7eb6f7..4570a912539f 100644 --- a/arch/x86/kvm/pkvm/pkvm.h +++ b/arch/x86/kvm/pkvm/pkvm.h @@ -21,6 +21,8 @@ struct pkvm_vcpu { * structure wrapping the kvm_vcpu structure (see below). */ size_t size; + /* Maximum IRR value recorded for posted interrupts. */ + int max_irr; /* * The struct kvm_vcpu should be the last element. 
In cases where struct * kvm_vcpu is wrapped by a vendor specific structure, putting it as the @@ -96,5 +98,8 @@ void pkvm_kick_vcpu(struct kvm_vcpu *vcpu); int pkvm_x86_vendor_init(struct kvm_x86_init_ops *ops); struct pkvm_vm *pkvm_get_vm(int vm_handle); void pkvm_put_vm(struct pkvm_vm *pkvm_vm); +struct pkvm_vcpu *pkvm_get_vcpu(int vm_handle, int vcpu_handle); +void pkvm_put_vcpu(struct pkvm_vcpu *pkvm_vcpu); +unsigned long pkvm_pcpu_tss(int cpu); #endif /* __PKVM_X86_PKVM_H */ diff --git a/arch/x86/kvm/pkvm/undef.h b/arch/x86/kvm/pkvm/undef.h index f0a06ea8e0eb..3bc1a1b84381 100644 --- a/arch/x86/kvm/pkvm/undef.h +++ b/arch/x86/kvm/pkvm/undef.h @@ -23,6 +23,8 @@ #undef CONFIG_USE_X86_SEG_SUPPORT #undef CONFIG_MATH_EMULATION #undef CONFIG_X86_DEBUG_FPU +#undef CONFIG_PROVE_LOCKING +#undef CONFIG_DEBUG_IRQFLAGS #define NOTRACE @@ -54,6 +56,9 @@ #undef CONFIG_GENERIC_BUG #undef CONFIG_TRACEPOINTS #undef CONFIG_DEBUG_PREEMPT +#undef CONFIG_DYNAMIC_DEBUG +#undef CONFIG_DYNAMIC_DEBUG_CORE +#undef CONFIG_TRACE_IRQFLAGS */ #endif /* __PKVM_X86_UNDEF_H */ diff --git a/arch/x86/kvm/pkvm/vmx/idt.c b/arch/x86/kvm/pkvm/vmx/idt.c index d11ee08be21c..f5eb9245979f 100644 --- a/arch/x86/kvm/pkvm/vmx/idt.c +++ b/arch/x86/kvm/pkvm/vmx/idt.c @@ -5,11 +5,23 @@ #include #include "host_vmx.h" #include "idt.h" +#include "memory.h" #include "pkvm.h" static void handle_nmi(struct pt_regs *regs, int vector, bool has_error_code) { struct kvm_vcpu *vcpu = this_cpu_read(host_vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 cur_vmcs_pa = vmcs_store(); + bool is_host_vmcs; + + /* There should always be a loaded VMCS, otherwise it is a code bug. */ + BUG_ON(!VALID_PAGE(cur_vmcs_pa)); + + /* Switch to the host VMCS if the current one is not for host. */ + is_host_vmcs = (cur_vmcs_pa == __pkvm_pa(vmx->loaded_vmcs->vmcs)); + if (!is_host_vmcs) + vmcs_load(vmx->loaded_vmcs->vmcs); /* * The NMI happens while the pKVM hypervisor is running, but it should @@ -23,7 +35,11 @@ static void handle_nmi(struct pt_regs *regs, int vector, bool has_error_code) * Request host immediate exit in case the pending NMI has already been * handled in this host vmexit handling cycle. */ - request_host_immediate_exit(to_vmx(vcpu)); + request_host_immediate_exit(vmx); + + /* Restore to the previous VMCS if it is not for host. 
*/ + if (!is_host_vmcs) + vmcs_load(__pkvm_va(cur_vmcs_pa)); } void pkvm_vmx_register_excp_handlers(void) diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2cfd42a3b450..5103d45456b0 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -884,17 +884,22 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .vcpu_precreate = vt_op(vcpu_precreate), .vcpu_create = vt_op(vcpu_create), .vcpu_free = vt_op(vcpu_free), -#ifndef __PKVM_HYP__ .vcpu_reset = vt_op(vcpu_reset), +#ifndef __PKVM_HYP__ .prepare_switch_to_guest = vt_op(prepare_switch_to_guest), +#endif .vcpu_load = vt_op(vcpu_load), .vcpu_put = vt_op(vcpu_put), +#ifndef __PKVM_HYP__ .HOST_OWNED_DEBUGCTL = VMX_HOST_OWNED_DEBUGCTL_BITS, +#endif .update_exception_bitmap = vt_op(update_exception_bitmap), +#ifndef __PKVM_HYP__ .get_feature_msr = vmx_get_feature_msr, +#endif .get_msr = vt_op(get_msr), .set_msr = vt_op(set_msr), @@ -925,14 +930,18 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .flush_tlb_gva = vt_op(flush_tlb_gva), .flush_tlb_guest = vt_op(flush_tlb_guest), +#ifndef __PKVM_HYP__ .vcpu_pre_run = vt_op(vcpu_pre_run), .vcpu_run = vt_op(vcpu_run), .handle_exit = vt_op(handle_exit), .skip_emulated_instruction = vmx_skip_emulated_instruction, .update_emulated_instruction = vmx_update_emulated_instruction, +#endif .set_interrupt_shadow = vt_op(set_interrupt_shadow), .get_interrupt_shadow = vt_op(get_interrupt_shadow), +#ifndef __PKVM_HYP__ .patch_hypercall = vt_op(patch_hypercall), +#endif .inject_irq = vt_op(inject_irq), .inject_nmi = vt_op(inject_nmi), .inject_exception = vt_op(inject_exception), @@ -945,15 +954,22 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .enable_irq_window = vt_op(enable_irq_window), .update_cr8_intercept = vt_op(update_cr8_intercept), +#ifndef __PKVM_HYP__ .x2apic_icr_is_split = false, +#endif .set_virtual_apic_mode = vt_op(set_virtual_apic_mode), +#ifndef __PKVM_HYP__ .set_apic_access_page_addr = vt_op(set_apic_access_page_addr), +#endif .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl), .load_eoi_exitmap = vt_op(load_eoi_exitmap), +#ifndef __PKVM_HYP__ .apicv_pre_state_restore = pi_apicv_pre_state_restore, .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, +#endif .hwapic_isr_update = vt_op(hwapic_isr_update), .sync_pir_to_irr = vt_op(sync_pir_to_irr), +#ifndef __PKVM_HYP__ .deliver_interrupt = vt_op(deliver_interrupt), .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, @@ -963,18 +979,22 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .get_exit_info = vt_op(get_exit_info), .get_entry_info = vt_op(get_entry_info), +#endif .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid), +#ifndef __PKVM_HYP__ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .get_l2_tsc_offset = vt_op(get_l2_tsc_offset), .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier), +#endif .write_tsc_offset = vt_op(write_tsc_offset), .write_tsc_multiplier = vt_op(write_tsc_multiplier), .load_mmu_pgd = vt_op(load_mmu_pgd), +#ifndef __PKVM_HYP__ .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, @@ -990,7 +1010,9 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .cancel_hv_timer = vt_op(cancel_hv_timer), #endif +#endif .setup_mce = vt_op(setup_mce), +#ifndef __PKVM_HYP__ #ifdef CONFIG_KVM_SMM .smi_allowed = vt_op(smi_allowed), diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 9eb402c522ad..a7cc36422c55 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -8,8 +8,15 @@ #include "vmx.h" #ifdef __PKVM_HYP__ +static inline void 
vmx_leave_nested(struct kvm_vcpu *vcpu) {} static inline void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) {} static inline void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) {} +static inline void nested_vmx_set_vmcs_shadowing_bitmap(void) {} +static inline int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { return 1; } +static inline int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) +{ + return 1; +} static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/vmx/pkvm_host.c b/arch/x86/kvm/vmx/pkvm_host.c index c16a8f5464b3..a9462442e7d5 100644 --- a/arch/x86/kvm/vmx/pkvm_host.c +++ b/arch/x86/kvm/vmx/pkvm_host.c @@ -4,6 +4,8 @@ #include #include #include "pkvm_constants.h" +#include "posted_intr.h" +#include "trace.h" #include "x86_ops.h" #include "vmx.h" @@ -41,6 +43,85 @@ static int pkvm_alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) return -ENOMEM; } +static void __pkvm_vcpu_unload(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct vcpu_vmx *vmx; + + if (pkvm_hypercall(vcpu_put, vcpu->kvm->arch.pkvm.handle, + vcpu->arch.pkvm.handle)) + return; + + vmx = to_vmx(vcpu); + vmx->loaded_vmcs->cpu = -1; +} + +static void pkvm_vcpu_unload(struct kvm_vcpu *vcpu) +{ + int cpu = to_vmx(vcpu)->loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, __pkvm_vcpu_unload, vcpu, 1); +} + +static bool pkvm_segment_cache_test(struct vcpu_vmx *vmx, int seg, int field) +{ + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + + return vmx->segment_cache.bitmask & mask; +} + +static void pkvm_segment_cache_set(struct vcpu_vmx *vmx, int seg, int field) +{ + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + + /* + * Make sure the cached segment field value is updated before setting + * the bitmask. This code may get preempted by pkvm_get_cpl_no_cache() + * (on the same CPU), and we don't want pkvm_get_cpl_no_cache() to see + * the field marked in the bitmask as available while its cached value + * is still out of date. + */ + barrier(); + + vmx->segment_cache.bitmask |= mask; +} + +static void pkvm_cache_segment(struct vcpu_vmx *vmx, struct kvm_segment *var, int seg) +{ + struct kvm_save_segment *save = &vmx->segment_cache.seg[seg]; + + save->selector = var->selector; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_SEL); + + save->base = var->base; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_BASE); + + save->limit = var->limit; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_LIMIT); + + save->ar = (var->unusable << 16) | + (var->g << 15) | + (var->db << 14) | + (var->l << 13) | + (var->avl << 12) | + (var->present << 7) | + (var->dpl << 5) | + (var->s << 4) | + var->type; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_AR); +} + static int pkvm_check_processor_compat(void) { return pkvm_hypercall(check_processor_compatibility); @@ -61,6 +142,18 @@ static void pkvm_disable_virtualization_cpu(void) */ } +/* + * The kvm parameter can be NULL (module initialization, or invocation before + * VM creation). Be sure to check the kvm parameter before using it. 
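+ * A NULL kvm falls back to the generic VMX emulated-MSR check below.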
+ */ +static bool pkvm_has_emulated_msr(struct kvm *kvm, u32 index) +{ + if (!kvm) + return vmx_has_emulated_msr(NULL, index); + + return pkvm_host_has_emulated_msr(kvm, index); +} + static int pkvm_vm_init(struct kvm *kvm) { void *pkvm_vm; @@ -130,7 +223,9 @@ static int pkvm_vcpu_create(struct kvm_vcpu *vcpu) vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->cpu = -1; - vcpu_size = PKVM_VMX_VCPU_SIZE + KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; + vcpu_size = PKVM_VMX_VCPU_SIZE; + if (pkvm_is_protected_vcpu(vcpu)) + vcpu_size += KVM_MCE_SIZE + KVM_MCI_CTL2_SIZE; if (lapic_in_kernel(vcpu)) vcpu_size += sizeof(struct kvm_lapic); @@ -170,6 +265,8 @@ static void pkvm_vcpu_free(struct kvm_vcpu *vcpu) union pkvm_hc_data out; int ret; + pkvm_vcpu_unload(vcpu); + ret = pkvm_hypercall_out(vcpu_free, &out, vm_handle, vcpu_handle); if (ret) { pr_err("failed to free VM%d vcpu%d: %d\n", vm_handle, vcpu_handle, ret); @@ -181,6 +278,753 @@ static void pkvm_vcpu_free(struct kvm_vcpu *vcpu) pkvm_free_loaded_vmcs(vmx->loaded_vmcs); } +static void pkvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * TODO: The vcpu_reset PV interface will be disallowed for the pVM + * once its INIT event is handled inside the pKVM hypervisor. So should + * check `pkvm_is_protected_vcpu(vcpu)` rather than + * `vcpu->arch.guest_state_protected` once it is ready. See comments for + * `__pkvm__vcpu_reset` in pkvm_vcpu_handle_host_hypercall. + */ + if (!vcpu->arch.guest_state_protected && init_event) + KVM_BUG_ON(pkvm_hypercall(vcpu_reset), vcpu->kvm); + + /* + * The host is responsible for injecting interrupts to the guest. The + * pi_desc is the key structure for the host to inject interrupts via + * the posted interrupt mechanism. Its physical address is used for the + * POSTED_INTR_DESC_ADDR in the VMCS by the pKVM hypervisor. Initialize + * the pi_desc when reset vcpu. + */ + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); + + /* + * The guest CR0/CR4 are managed by the pKVM hypervisor. When the host + * reads the guest CR0/CR4, it should get the up-to-date value from the + * pKVM. So make all bits in the CR0/CR4 as owned by the guest to + * indicate no bit is owned by the host. + */ + vcpu->arch.cr0_guest_owned_bits = ~0; + vcpu->arch.cr4_guest_owned_bits = ~0; + + kvm_set_cr8(vcpu, 0); + + if (pkvm_is_protected_vcpu(vcpu)) { + /* + * Emulating xapic mode will require the host to decode MMIO + * instruction which is not supported if the guest is a pVM as + * the pVM's CPU and memory state will be isolated. To avoid + * using xapic mode for a pVM, enable x2apic mode by default so + * that pVM will use MSR instructions to access lapic, which + * doesn't require decoding. + */ + u64 data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC | + (kvm_vcpu_is_reset_bsp(vcpu) ? 
MSR_IA32_APICBASE_BSP : 0); + + guest_cpu_cap_set(vcpu, X86_FEATURE_X2APIC); + kvm_apic_set_base(vcpu, data, true); + } +} + +static void pkvm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool already_loaded; + + already_loaded = vmx->loaded_vmcs->cpu == cpu; + if (!already_loaded) + pkvm_vcpu_unload(vcpu); + + if (KVM_BUG_ON(pkvm_hypercall(vcpu_load, vcpu->kvm->arch.pkvm.handle, + vcpu->arch.pkvm.handle), vcpu->kvm)) + return; + + if (!already_loaded) + vmx->loaded_vmcs->cpu = cpu; + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void pkvm_vcpu_put(struct kvm_vcpu *vcpu) +{ + vmx_vcpu_pi_put(vcpu); +} + +static void pkvm_update_exception_bitmap(struct kvm_vcpu *vcpu) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(update_exception_bitmap), vcpu->kvm); +} + +static int pkvm_get_feature_msr(u32 msr, u64 *data) +{ + switch (msr) { + case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: + return 1; + default: + return KVM_MSR_RET_UNSUPPORTED; + } +} + +static int pkvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (pkvm_host_has_emulated_msr(vcpu->kvm, msr_info->index)) + return kvm_get_msr_common(vcpu, msr_info); + + if (!vcpu->arch.guest_state_protected) { + union pkvm_hc_data out; + int ret; + + ret = pkvm_hypercall_out(get_msr, &out, msr_info->index); + if (!ret) + msr_info->data = out.get_msr.data; + + return ret; + } + + return -EPERM; +} + +static int pkvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + if (pkvm_host_has_emulated_msr(vcpu->kvm, msr_info->index)) + return kvm_set_msr_common(vcpu, msr_info); + + if (!vcpu->arch.guest_state_protected) + return pkvm_hypercall(set_msr, msr_info->index, msr_info->data); + + return -EPERM; +} + +static u64 pkvm_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + union pkvm_hc_data out; + ulong *p; + + if (vcpu->arch.guest_state_protected) + return 0; + + p = &vmx->segment_cache.seg[seg].base; + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_BASE)) { + if (KVM_BUG_ON(pkvm_hypercall_out(get_segment_base, &out, seg), vcpu->kvm)) + return 0; + + *p = out.get_segment_base.data; + pkvm_segment_cache_set(vmx, seg, SEG_FIELD_BASE); + } + + return *p; +} + +static void pkvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_save_segment *segment; + u32 ar; + + if (vcpu->arch.guest_state_protected) { + if (var) + memset(var, 0, sizeof(*var)); + return; + } + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_SEL) || + !pkvm_segment_cache_test(vmx, seg, SEG_FIELD_BASE) || + !pkvm_segment_cache_test(vmx, seg, SEG_FIELD_LIMIT) || + !pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR)) { + union pkvm_hc_data out; + + if (KVM_BUG_ON(pkvm_hypercall_out(get_segment, &out, seg), vcpu->kvm)) + return; + + pkvm_cache_segment(vmx, &out.get_segment.seg_val, seg); + } + + if (!var) + return; + + segment = &vmx->segment_cache.seg[seg]; + var->selector = segment->selector; + var->base = segment->base; + var->limit = segment->limit; + ar = segment->ar; + var->unusable = (ar >> 16) & 1; + var->type = ar & 15; + var->s = (ar >> 4) & 1; + var->dpl = (ar >> 5) & 3; + /* + * Some userspaces do not preserve unusable property. Since usable + * segment has to be present according to VMX spec we can use present + * property to amend userspace bug by making unusable segment always + * nonpresent. 
vmx_segment_access_rights() already marks nonpresent + * segment as unusable. + */ + var->present = !var->unusable; + var->avl = (ar >> 12) & 1; + var->l = (ar >> 13) & 1; + var->db = (ar >> 14) & 1; + var->g = (ar >> 15) & 1; +} + +static void pkvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + union pkvm_hc_data in = { + .set_segment = { + .seg_val = *var, + .seg = seg, + }, + }; + + if (vcpu->arch.guest_state_protected) + return; + + vmx_segment_cache_clear(to_vmx(vcpu)); + + KVM_BUG_ON(pkvm_hypercall_in(set_segment, &in), vcpu->kvm); +} + +static int pkvm_get_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int seg = VCPU_SREG_SS; + u32 ar; + + if (vcpu->arch.guest_state_protected) + return 0; + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR)) + pkvm_get_segment(vcpu, NULL, seg); + + ar = vmx->segment_cache.seg[seg].ar; + return VMX_AR_DPL(ar); +} + +static int pkvm_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int seg = VCPU_SREG_SS; + union pkvm_hc_data out; + + if (vcpu->arch.guest_state_protected) + return 0; + + /* + * Even though this is a no_cache version of get_cpl, still use the + * cached value if it is available, to avoid unnecessary calls to pKVM. + * It may be cached either by the pKVM hypervisor itself (when + * returning to the host after vcpu_run) or by the host after another + * get_segment call to pKVM (in such case, the barrier() in + * pkvm_segment_cache_set() makes sure that we are seeing the up-to-date + * value). + */ + if (likely(pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR))) + return VMX_AR_DPL(vmx->segment_cache.seg[seg].ar); + + if (KVM_BUG_ON(pkvm_hypercall_out(get_segment, &out, seg), vcpu->kvm)) + return 0; + + return out.get_segment.seg_val.dpl; +} + +static void pkvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int seg = VCPU_SREG_CS; + u32 ar; + + if (vcpu->arch.guest_state_protected) { + *db = *l = 0; + return; + } + + if (!pkvm_segment_cache_test(vmx, seg, SEG_FIELD_AR)) + pkvm_get_segment(vcpu, NULL, seg); + + ar = vmx->segment_cache.seg[seg].ar; + *db = (ar >> 14) & 1; + *l = (ar >> 13) & 1; +} + +static bool pkvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + return true; +} + +static void pkvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(set_cr0, cr0), vcpu->kvm); + + vcpu->arch.cr0 = cr0; +} + +static bool pkvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + /* The pKVM doesn't support VMX feature. 
*/ + return !(cr4 & X86_CR4_VMXE); +} + +static void pkvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(set_cr4, cr4), vcpu->kvm); + + vcpu->arch.cr4 = cr4; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + vcpu->arch.cpuid_dynamic_bits_dirty = true; +} + +static int pkvm_set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + int ret = -EINVAL; + + if (!vcpu->arch.guest_state_protected) + ret = pkvm_hypercall(set_efer, efer); + + vcpu->arch.efer = efer; + return ret; +} + +static void pkvm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data; + + if (vcpu->arch.guest_state_protected || + KVM_BUG_ON(pkvm_hypercall_out(get_idt, &data), vcpu->kvm)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + *dt = data.get_idt.desc; +} + +static void pkvm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data = { + .set_gdt.desc = *dt, + }; + + if (vcpu->arch.guest_state_protected) + return; + + KVM_BUG_ON(pkvm_hypercall_in(set_idt, &data), vcpu->kvm); +} + +static void pkvm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data; + + if (vcpu->arch.guest_state_protected || + KVM_BUG_ON(pkvm_hypercall_out(get_gdt, &data), vcpu->kvm)) { + memset(dt, 0, sizeof(*dt)); + return; + } + + *dt = data.get_gdt.desc; +} + +static void pkvm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +{ + union pkvm_hc_data data = { + .set_gdt.desc = *dt, + }; + + if (vcpu->arch.guest_state_protected) + return; + + KVM_BUG_ON(pkvm_hypercall_in(set_gdt, &data), vcpu->kvm); +} + +static void pkvm_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_dr7, val), vcpu->kvm); +} + +static void pkvm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + union pkvm_hc_data out; + + if (pkvm_is_protected_vcpu(vcpu)) + return; + + if (KVM_BUG_ON(pkvm_hypercall_out(cache_reg, &out, reg), vcpu->kvm)) + return; + + kvm_register_mark_available(vcpu, reg); + + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = out.cache_reg.rsp; + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = out.cache_reg.rip; + break; + case VCPU_EXREG_PDPTR: { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + mmu->pdptrs[0] = out.cache_reg.pdptrs[0]; + mmu->pdptrs[1] = out.cache_reg.pdptrs[1]; + mmu->pdptrs[2] = out.cache_reg.pdptrs[2]; + mmu->pdptrs[3] = out.cache_reg.pdptrs[3]; + break; + } + case VCPU_EXREG_CR0: + vcpu->arch.cr0 = out.cache_reg.cr0; + break; + case VCPU_EXREG_CR3: + vcpu->arch.cr3 = out.cache_reg.cr3; + break; + case VCPU_EXREG_CR4: + vcpu->arch.cr4 = out.cache_reg.cr4; + break; + default: + KVM_BUG_ON(1, vcpu->kvm); + break; + } +} + +static unsigned long pkvm_get_rflags(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { + if (vcpu->arch.guest_state_protected) { + vmx->rflags = 0; + } else { + union pkvm_hc_data out; + + if (KVM_BUG_ON(pkvm_hypercall_out(get_rflags, &out), vcpu->kvm)) + return 0; + + vmx->rflags = out.get_rflags.data; + } + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + } + + return vmx->rflags; +} + +static void pkvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + to_vmx(vcpu)->rflags = rflags; + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(set_rflags, 
rflags), vcpu->kvm); + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); +} + +static bool pkvm_get_if_flag(struct kvm_vcpu *vcpu) +{ + return pkvm_get_rflags(vcpu) & X86_EFLAGS_IF; +} + +static void pkvm_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_all), vcpu->kvm); +} + +static void pkvm_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_current), vcpu->kvm); +} + +static void pkvm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_gva, addr), vcpu->kvm); +} + +static void pkvm_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.guest_state_protected) + KVM_BUG_ON(pkvm_hypercall(flush_tlb_guest), vcpu->kvm); +} + +static void pkvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_interrupt_shadow, mask), vcpu->kvm); +} + +static u32 pkvm_get_interrupt_shadow(struct kvm_vcpu *vcpu) +{ + union pkvm_hc_data out; + + if (pkvm_is_protected_vcpu(vcpu)) + return 0; + + KVM_BUG_ON(pkvm_hypercall_out(get_interrupt_shadow, &out), vcpu->kvm); + + return out.get_interrupt_shadow.data; +} + +static void pkvm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +{ + trace_kvm_inj_virq(vcpu->arch.interrupt.nr, + vcpu->arch.interrupt.soft, reinjected); + + ++vcpu->stat.irq_injections; + + KVM_BUG_ON(pkvm_hypercall(inject_irq), vcpu->kvm); +} + +static void pkvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.nmi_injections; + + KVM_BUG_ON(pkvm_hypercall(inject_nmi), vcpu->kvm); +} + +static void pkvm_inject_exception(struct kvm_vcpu *vcpu) +{ + if (pkvm_is_protected_vcpu(vcpu)) + return; + + KVM_BUG_ON(pkvm_hypercall(inject_exception), vcpu->kvm); +} + +static void pkvm_cancel_injection(struct kvm_vcpu *vcpu) +{ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + + if (KVM_BUG_ON(pkvm_hypercall(cancel_injection), vcpu->kvm)) + return; + + if (vcpu->arch.nmi_injected || + vcpu->arch.interrupt.injected || + vcpu->arch.exception.injected) + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + +static int pkvm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + return pkvm_hypercall(interrupt_allowed, for_injection); +} + +static int pkvm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + return pkvm_hypercall(nmi_allowed, for_injection); +} + +static bool pkvm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + union pkvm_hc_data out; + + if (KVM_BUG_ON(pkvm_hypercall_out(get_nmi_mask, &out), vcpu->kvm)) + return false; + + return out.get_nmi_mask.data; +} + +static void pkvm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + if (!pkvm_is_protected_vcpu(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_nmi_mask, masked), vcpu->kvm); +} + +static void pkvm_enable_nmi_window(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(enable_nmi_window), vcpu->kvm); +} + +static void pkvm_enable_irq_window(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(enable_irq_window), vcpu->kvm); +} + +static void pkvm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +{ + KVM_BUG_ON(pkvm_hypercall(update_cr8_intercept, tpr, irr), vcpu->kvm); +} + +static void pkvm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) +{ + if (lapic_in_kernel(vcpu)) + KVM_BUG_ON(pkvm_hypercall(set_virtual_apic_mode), vcpu->kvm); +} + +static void 
pkvm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +{ + if (lapic_in_kernel(vcpu)) + KVM_BUG_ON(pkvm_hypercall(refresh_apicv_exec_ctrl, vcpu->arch.apic->apicv_active), + vcpu->kvm); +} + +static void pkvm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + if (kvm_vcpu_apicv_active(vcpu)) + KVM_BUG_ON(pkvm_hypercall(load_eoi_exitmap, eoi_exit_bitmap[0], + eoi_exit_bitmap[1], eoi_exit_bitmap[2], + eoi_exit_bitmap[3]), + vcpu->kvm); +} + +#define VMX_REQUIRED_APICV_INHIBITS \ + (BIT(APICV_INHIBIT_REASON_DISABLED) | \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)) + +static void pkvm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) +{ + KVM_BUG_ON(pkvm_hypercall(hwapic_isr_update, max_isr), vcpu->kvm); +} + +static int pkvm_vcpu_realloc_fpstate(struct kvm_vcpu *vcpu) +{ + union pkvm_hc_data out; + size_t fpsize; + void *fps; + int ret; + + fpsize = PAGE_ALIGN(vcpu->arch.guest_fpu.fpstate->size + + ALIGN(offsetof(struct fpstate, regs), 64)); + fps = alloc_pages_exact(fpsize, GFP_KERNEL_ACCOUNT); + if (!fps) + return -ENOMEM; + + ret = pkvm_hypercall_out(vcpu_add_fpstate, &out, __pa(fps), fpsize); + if (KVM_BUG_ON(ret, vcpu->kvm)) + free_pages_exact(fps, fpsize); + else + kvm_free_pkvm_memcache(&out.vcpu_add_fpstate.memcache); + + return ret; +} + +static void pkvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *e2 = vcpu->arch.cpuid_entries; + int nent = vcpu->arch.cpuid_nent; + union pkvm_hc_data out; + void *entries; + size_t size; + + if (vcpu->arch.guest_state_protected || !e2 || !nent) + return; + + /* + * With exposing the FPU dynamic feature via the cpuid, the fpstate + * allocated when creating the vcpu may not be sufficient for the + * guest. As the pVM's FPU state is managed by the pKVM hypervisor + * while the npVM's FPU state is managed by the host, re-allocating the + * fpstate is only necessary for the pVM, and should be done before + * adding the new cpuid entries to the pKVM hypervisor. 
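+	 * If the fpstate reallocation fails, bail out without pushing the new
+	 * CPUID entries to the hypervisor.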
+ */ + if ((vcpu->arch.guest_fpu.xfeatures & XFEATURE_MASK_USER_DYNAMIC) && + pkvm_is_protected_vcpu(vcpu) && + pkvm_vcpu_realloc_fpstate(vcpu)) + return; + + size = sizeof(struct kvm_cpuid_entry2) * nent; + entries = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT); + if (!entries) { + kvm_err("Failed to allocate cpuid pages for pKVM vcpu\n"); + return; + } + + memcpy(entries, (void *)e2, size); + + if (KVM_BUG_ON(pkvm_hypercall_out(vcpu_after_set_cpuid, &out, __pa(entries)), vcpu->kvm)) + free_pages_exact(entries, size); + else + kvm_free_pkvm_memcache(&out.vcpu_after_set_cpuid.memcache); +} + +static u64 pkvm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static u64 pkvm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + return kvm_caps.default_tsc_scaling_ratio; +} + +static void pkvm_write_tsc_offset(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(write_tsc_offset, vcpu->arch.tsc_offset), vcpu->kvm); +} + +static void pkvm_write_tsc_multiplier(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(write_tsc_multiplier, vcpu->arch.tsc_scaling_ratio), vcpu->kvm); +} + +static void pkvm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) +{ + KVM_BUG_ON(pkvm_hypercall(load_mmu_pgd, root_hpa, root_level), vcpu->kvm); +} + +static void pkvm_leave_nested(struct kvm_vcpu *vcpu) {} +static bool pkvm_nested_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) +{ + return false; +} +static int pkvm_check_nested_events(struct kvm_vcpu *vcpu) { return 0; } +static void pkvm_nested_triple_fault(struct kvm_vcpu *vcpu) {} +static bool pkvm_get_nested_state_pages(struct kvm_vcpu *vcpu) { return true; } +static int pkvm_nested_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) { return 0; } + +static struct kvm_x86_nested_ops pkvm_nested_ops = { + .leave_nested = pkvm_leave_nested, + .is_exception_vmexit = pkvm_nested_is_exception_vmexit, + .check_events = pkvm_check_nested_events, + .triple_fault = pkvm_nested_triple_fault, + .get_nested_state_pages = pkvm_get_nested_state_pages, + .write_log_dirty = pkvm_nested_write_pml_buffer, +}; + +static void pkvm_setup_mce(struct kvm_vcpu *vcpu) +{ + KVM_BUG_ON(pkvm_hypercall(setup_mce, vcpu->arch.mcg_cap), vcpu->kvm); +} + +#ifdef CONFIG_KVM_SMM +static int pkvm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +{ + return false; +} + +static int pkvm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +{ + return -EINVAL; +} + +static int pkvm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +{ + return -EINVAL; +} + +static void pkvm_enable_smi_window(struct kvm_vcpu *vcpu) {} +#endif + +static bool pkvm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +{ + /* + * The init signal will be blocked if the guest VM is emulating nested + * and in virtual VMX root mode. But as this is not a supported case by + * the pKVM hypervisor, the init signal should never be blocked for the + * guest VM. 
+ */ + return false; +} + struct kvm_x86_ops pkvm_host_vt_x86_ops __initdata = { .name = KBUILD_MODNAME, @@ -190,6 +1034,8 @@ struct kvm_x86_ops pkvm_host_vt_x86_ops __initdata = { .disable_virtualization_cpu = pkvm_disable_virtualization_cpu, .emergency_disable_virtualization_cpu = pkvm_disable_virtualization_cpu, + .has_emulated_msr = pkvm_has_emulated_msr, + .vm_size = sizeof(struct kvm_vmx), .vm_init = pkvm_vm_init, .vm_destroy = pkvm_vm_destroy, @@ -197,4 +1043,95 @@ struct kvm_x86_ops pkvm_host_vt_x86_ops __initdata = { .vcpu_precreate = vmx_vcpu_precreate, .vcpu_create = pkvm_vcpu_create, .vcpu_free = pkvm_vcpu_free, + .vcpu_reset = pkvm_vcpu_reset, + + .vcpu_load = pkvm_vcpu_load, + .vcpu_put = pkvm_vcpu_put, + + .update_exception_bitmap = pkvm_update_exception_bitmap, + .get_feature_msr = pkvm_get_feature_msr, + .get_msr = pkvm_get_msr, + .set_msr = pkvm_set_msr, + .get_segment_base = pkvm_get_segment_base, + .get_segment = pkvm_get_segment, + .set_segment = pkvm_set_segment, + .get_cpl = pkvm_get_cpl, + .get_cpl_no_cache = pkvm_get_cpl_no_cache, + .get_cs_db_l_bits = pkvm_get_cs_db_l_bits, + .is_valid_cr0 = pkvm_is_valid_cr0, + .set_cr0 = pkvm_set_cr0, + .is_valid_cr4 = pkvm_is_valid_cr4, + .set_cr4 = pkvm_set_cr4, + .set_efer = pkvm_set_efer, + .get_idt = pkvm_get_idt, + .set_idt = pkvm_set_idt, + .get_gdt = pkvm_get_gdt, + .set_gdt = pkvm_set_gdt, + .set_dr7 = pkvm_set_dr7, + .cache_reg = pkvm_cache_reg, + .get_rflags = pkvm_get_rflags, + .set_rflags = pkvm_set_rflags, + .get_if_flag = pkvm_get_if_flag, + + .flush_tlb_all = pkvm_flush_tlb_all, + .flush_tlb_current = pkvm_flush_tlb_current, + .flush_tlb_gva = pkvm_flush_tlb_gva, + .flush_tlb_guest = pkvm_flush_tlb_guest, + + .set_interrupt_shadow = pkvm_set_interrupt_shadow, + .get_interrupt_shadow = pkvm_get_interrupt_shadow, + .inject_irq = pkvm_inject_irq, + .inject_nmi = pkvm_inject_nmi, + .inject_exception = pkvm_inject_exception, + .cancel_injection = pkvm_cancel_injection, + .interrupt_allowed = pkvm_interrupt_allowed, + .nmi_allowed = pkvm_nmi_allowed, + .get_nmi_mask = pkvm_get_nmi_mask, + .set_nmi_mask = pkvm_set_nmi_mask, + .enable_nmi_window = pkvm_enable_nmi_window, + .enable_irq_window = pkvm_enable_irq_window, + .update_cr8_intercept = pkvm_update_cr8_intercept, + + .x2apic_icr_is_split = false, + .set_virtual_apic_mode = pkvm_set_virtual_apic_mode, + .refresh_apicv_exec_ctrl = pkvm_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = pkvm_load_eoi_exitmap, + .apicv_pre_state_restore = pi_apicv_pre_state_restore, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, + .hwapic_isr_update = pkvm_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_interrupt = vmx_deliver_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, + + .vcpu_after_set_cpuid = pkvm_vcpu_after_set_cpuid, + + .get_l2_tsc_offset = pkvm_get_l2_tsc_offset, + .get_l2_tsc_multiplier = pkvm_get_l2_tsc_multiplier, + .write_tsc_offset = pkvm_write_tsc_offset, + .write_tsc_multiplier = pkvm_write_tsc_multiplier, + + .load_mmu_pgd = pkvm_load_mmu_pgd, + + .nested_ops = &pkvm_nested_ops, + + .pi_update_irte = vmx_pi_update_irte, + .pi_start_bypass = vmx_pi_start_bypass, + + .setup_mce = pkvm_setup_mce, + +#ifdef CONFIG_KVM_SMM + .smi_allowed = pkvm_smi_allowed, + .enter_smm = pkvm_enter_smm, + .leave_smm = pkvm_leave_smm, + .enable_smi_window = pkvm_enable_smi_window, +#endif + + .apic_init_signal_blocked = pkvm_apic_init_signal_blocked, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, }; + +bool 
pkvm_interrupt_blocked(struct kvm_vcpu *vcpu) +{ + return (pkvm_interrupt_allowed(vcpu, false) <= 0); +} diff --git a/arch/x86/kvm/vmx/pkvm_init.c b/arch/x86/kvm/vmx/pkvm_init.c index d2f9e0cb08ca..9262da87a57a 100644 --- a/arch/x86/kvm/vmx/pkvm_init.c +++ b/arch/x86/kvm/vmx/pkvm_init.c @@ -8,6 +8,8 @@ #include "pkvm_constants.h" #include "vmx.h" +extern u64 x86_pred_cmd; + static int __init early_pkvm_parse_cmdline(char *buf) { return kstrtobool(buf, &enable_pkvm); @@ -84,6 +86,11 @@ static __init void pkvm_setup_syms(void) pkvm_sym(nr_cpu_ids) = nr_cpu_ids; pkvm_sym(fpu_kernel_cfg) = fpu_kernel_cfg; pkvm_sym(fpu_user_cfg) = fpu_user_cfg; +#ifdef CONFIG_X86_64 + if (static_branch_unlikely(&__fpu_state_size_dynamic)) + static_branch_enable(&pkvm_sym(__fpu_state_size_dynamic)); +#endif + pkvm_sym(x86_pred_cmd) = x86_pred_cmd; } static __init int pkvm_setup_host_vmcs_config(void) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 4a6d9a17da23..6f89de0ba8fc 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -241,7 +241,8 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) */ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || - (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) + (!enable_pkvm && !is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)) || + (enable_pkvm && !pkvm_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); else pi_set_sn(pi_desc); diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index bc255d709d8a..5ab72724187d 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -303,9 +303,11 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) SYM_FUNC_END(__vmx_vcpu_run) +#ifndef __PKVM_HYP__ SYM_FUNC_START(vmx_do_nmi_irqoff) VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx SYM_FUNC_END(vmx_do_nmi_irqoff) +#endif #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a8fd3bc1e8d5..1c7d47dad3fb 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -99,10 +99,8 @@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -#ifndef __PKVM_HYP__ static bool __read_mostly enable_vnmi = 1; module_param_named(vnmi, enable_vnmi, bool, 0444); -#endif bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, 0444); @@ -117,10 +115,10 @@ module_param_named(unrestricted_guest, bool __read_mostly enable_ept_ad_bits = 1; module_param_named(eptad, enable_ept_ad_bits, bool, 0444); -#ifndef __PKVM_HYP__ static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, 0444); +#ifndef __PKVM_HYP__ static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); @@ -147,26 +145,25 @@ module_param(error_on_inconsistent_vmcs_config, bool, 0444); #ifndef __PKVM_HYP__ static bool __read_mostly dump_invalid_vmcs = 0; module_param(dump_invalid_vmcs, bool, 0644); +#endif /* !__PKVM_HYP__ */ #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 -#endif /* !__PKVM_HYP__ */ #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL #ifndef __PKVM_HYP__ /* Guest_tsc -> host_tsc conversion requires 64-bit division. 
*/ static int __read_mostly cpu_preemption_timer_multi; +#endif /* !__PKVM_HYP__ */ static bool __read_mostly enable_preemption_timer = 1; #ifdef CONFIG_X86_64 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif -#endif /* !__PKVM_HYP__ */ extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); -#ifndef __PKVM_HYP__ #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ @@ -178,6 +175,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) +#ifndef __PKVM_HYP__ #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ @@ -371,6 +369,7 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); } +#endif /* !__PKVM_HYP__ */ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) { @@ -424,16 +423,17 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) vmx->disable_fb_clear = false; } +#ifndef __PKVM_HYP__ static const struct kernel_param_ops vmentry_l1d_flush_ops = { .set = vmentry_l1d_flush_set, .get = vmentry_l1d_flush_get, }; module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); +#endif /* !__PKVM_HYP__ */ static u32 vmx_segment_access_rights(struct kvm_segment *var); void vmx_vmexit(void); -#endif /* !__PKVM_HYP__ */ #define vmx_insn_failed(fmt...) \ do { \ @@ -490,7 +490,9 @@ noinline void invept_error(unsigned long ext, u64 eptp) #ifndef __PKVM_HYP__ static DEFINE_PER_CPU(struct vmcs *, vmxarea); +#endif DEFINE_PER_CPU(struct vmcs *, current_vmcs); +#ifndef __PKVM_HYP__ /* * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. @@ -508,7 +510,6 @@ static DEFINE_PKVM_SPINLOCK(vmx_vpid_lock); struct vmcs_config vmcs_config __ro_after_init; struct vmx_capability vmx_capability __ro_after_init; -#ifndef __PKVM_HYP__ #define VMX_SEGMENT_FIELD(seg) \ [VCPU_SREG_##seg] = { \ .selector = GUEST_##seg##_SELECTOR, \ @@ -533,8 +534,6 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -#endif /* !__PKVM_HYP__ */ - static unsigned long host_idt_base; #ifndef __PKVM_HYP__ @@ -674,12 +673,10 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) return false; } -#ifndef __PKVM_HYP__ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); } -#endif /* !__PKVM_HYP__ */ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { @@ -691,14 +688,21 @@ struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) return NULL; } -#ifndef __PKVM_HYP__ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, struct vmx_uret_msr *msr, u64 data) { unsigned int slot = msr - vmx->guest_uret_msrs; int ret = 0; +#ifndef __PKVM_HYP__ if (msr->load_into_hardware) { +#else + /* + * The host may use set_msr PV interface to access uret MSRs and in this + * case, the uret MSRs are not loaded to the hardware. 
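+	 * Only touch the hardware MSR when the guest's uret MSR values are
+	 * currently loaded.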
+ */ + if (msr->load_into_hardware && vmx->guest_uret_msrs_loaded) { +#endif preempt_disable(); ret = kvm_set_user_return_msr(slot, data, msr->mask); preempt_enable(); @@ -708,6 +712,7 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, return ret; } +#ifndef __PKVM_HYP__ /* * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) * @@ -756,6 +761,7 @@ void vmx_emergency_disable_virtualization_cpu(void) kvm_cpu_vmxoff(); } +#endif /* !__PKVM_HYP__ */ static void __loaded_vmcs_clear(void *arg) { @@ -771,6 +777,7 @@ static void __loaded_vmcs_clear(void *arg) if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) vmcs_clear(loaded_vmcs->shadow_vmcs); +#ifndef __PKVM_HYP__ list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* @@ -781,11 +788,13 @@ static void __loaded_vmcs_clear(void *arg) * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); +#endif loaded_vmcs->cpu = -1; loaded_vmcs->launched = 0; } +#ifndef __PKVM_HYP__ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { int cpu = loaded_vmcs->cpu; @@ -794,6 +803,7 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) smp_call_function_single(cpu, __loaded_vmcs_clear, loaded_vmcs, 1); } +#endif /* !__PKVM_HYP__ */ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, unsigned field) @@ -911,6 +921,7 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +#ifndef __PKVM_HYP__ /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ @@ -943,6 +954,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) return flags; } +#endif /* !__PKVM_HYP__ */ static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) @@ -1133,6 +1145,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return true; } +#ifndef __PKVM_HYP__ #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the @@ -1375,6 +1388,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } +#endif /* !__PKVM_HYP__ */ #ifdef CONFIG_X86_64 static u64 vmx_read_guest_host_msr(struct vcpu_vmx *vmx, u32 msr, u64 *cache) @@ -1409,6 +1423,7 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif +#ifndef __PKVM_HYP__ static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1424,6 +1439,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) vmx->ple_window, old); } } +#endif /* !__PKVM_HYP__ */ static void shrink_ple_window(struct kvm_vcpu *vcpu) { @@ -1448,6 +1464,15 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) struct vmcs *prev; if (!already_loaded) { +#ifdef __PKVM_HYP__ + /* + * pkvm doesn't support smp call thus doesn't support clear vmcs + * on a remote CPU. Suppose this vmcs is already cleared by + * vmx_vcpu_put, otherwise it cannot be loaded on this CPU. 
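+		 * (vmx_vcpu_put() clears the VMCS synchronously via
+		 * __loaded_vmcs_clear() on the CPU it was loaded on.)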
+ */ + if (WARN_ON_ONCE(vmx->loaded_vmcs->cpu != -1)) + return; +#else loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); @@ -1462,6 +1487,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); local_irq_enable(); +#endif } prev = per_cpu(current_vmcs, cpu); @@ -1471,6 +1497,25 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { +#ifdef __PKVM_HYP__ + struct desc_ptr gdt; + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + vmcs_writel(HOST_TR_BASE, pkvm_pcpu_tss(cpu)); + + native_store_gdt(&gdt); + vmcs_writel(HOST_GDTR_BASE, gdt.address); + + if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { + unsigned long msr = __rdmsr(MSR_IA32_SYSENTER_ESP); + + vmcs_writel(HOST_IA32_SYSENTER_ESP, msr); + } +#else void *gdt = get_current_gdt_ro(); /* @@ -1492,6 +1537,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); } +#endif vmx->loaded_vmcs->cpu = cpu; } @@ -1508,14 +1554,31 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_load_vmcs(vcpu, cpu); +#ifndef __PKVM_HYP__ vmx_vcpu_pi_load(vcpu, cpu); +#endif } void vmx_vcpu_put(struct kvm_vcpu *vcpu) { +#ifndef __PKVM_HYP__ vmx_vcpu_pi_put(vcpu); vmx_prepare_switch_to_host(to_vmx(vcpu)); +#else + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * The pKVM hypervisor lacks an smp call mechanism to notify remote CPUs + * to clear VMCS when a vCPU is migrated. Therefore, VMCS clearing is + * performed immediately during the vcpu_put operation. + */ + if (vmx->loaded_vmcs->cpu == -1 || + WARN_ON_ONCE(vmx->loaded_vmcs->cpu != raw_smp_processor_id())) + return; + + __loaded_vmcs_clear(vmx->loaded_vmcs); +#endif } bool vmx_emulation_required(struct kvm_vcpu *vcpu) @@ -1604,6 +1667,7 @@ void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); } +#ifndef __PKVM_HYP__ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1866,7 +1930,6 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -#ifndef __PKVM_HYP__ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, bool load_into_hardware) { @@ -1922,6 +1985,7 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) vmx->guest_uret_msrs_loaded = false; } +#ifndef __PKVM_HYP__ u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -1942,6 +2006,7 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } +#endif /* !__PKVM_HYP__ */ void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { @@ -1991,6 +2056,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } +#ifndef __PKVM_HYP__ int vmx_get_feature_msr(u32 msr, u64 *data) { switch (msr) { @@ -2002,6 +2068,7 @@ int vmx_get_feature_msr(u32 msr, u64 *data) return KVM_MSR_RET_UNSUPPORTED; } } +#endif /* !__PKVM_HYP__ */ /* * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 
@@ -2012,7 +2079,9 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; +#ifndef __PKVM_HYP__ u32 index; +#endif switch (msr_info->index) { #ifdef CONFIG_X86_64 @@ -2085,7 +2154,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) return 1; -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) /* * Enlightened VMCS v1 doesn't have certain VMCS fields but * instead of just ignoring the features, different Hyper-V @@ -2098,6 +2167,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data); #endif break; +#ifndef __PKVM_HYP__ case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest()) return 1; @@ -2143,6 +2213,17 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; +#else + /* The pKVM doesn't support PT guest mode. */ + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + WARN_ON_ONCE(vmx_pt_mode_is_host_guest()); + return 1; +#endif case MSR_IA32_S_CET: msr_info->data = vmcs_readl(GUEST_S_CET); break; @@ -2186,9 +2267,11 @@ u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; +#ifndef __PKVM_HYP__ if ((kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT) && (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; +#endif if (boot_cpu_has(X86_FEATURE_RTM) && (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) @@ -2221,7 +2304,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) int ret = 0; u32 msr_index = msr_info->index; u64 data = msr_info->data; +#ifndef __PKVM_HYP__ u32 index; +#endif switch (msr_index) { case MSR_EFER: @@ -2289,9 +2374,15 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_guest_debugctl_write(vcpu, data); + /* + * The pKVM doesn't support guest PMU emulation. Disabling this + * code to avoid importing unnecessary symbols. + */ +#ifndef __PKVM_HYP__ if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); +#endif return 0; case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || @@ -2411,6 +2502,9 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); + + /* The pKVM hypervisor doesn't support PT guest mode. */ +#ifndef __PKVM_HYP__ case MSR_IA32_RTIT_CTL: if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || @@ -2470,6 +2564,17 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; +#else + /* The pKVM hypervisor doesn't support PT guest mode. */ + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + case MSR_IA32_RTIT_CR3_MATCH: + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: + WARN_ON_ONCE(vmx_pt_mode_is_host_guest()); + return 1; +#endif case MSR_IA32_S_CET: vmcs_writel(GUEST_S_CET, data); break; @@ -2480,6 +2585,7 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_INTR_SSP_TABLE, data); break; case MSR_IA32_PERF_CAPABILITIES: +#ifndef __PKVM_HYP__ if (data & PERF_CAP_LBR_FMT) { if ((data & PERF_CAP_LBR_FMT) != (kvm_caps.supported_perf_cap & PERF_CAP_LBR_FMT)) @@ -2500,7 +2606,13 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } ret = kvm_set_msr_common(vcpu, msr_info); break; - +#else + /* + * The pKVM hypervisor doesn't support emulating PMU for guest + * thus also the IA32_PER_CAPABILITIES. + */ + return KVM_MSR_RET_UNSUPPORTED; +#endif default: find_uret_msr: msr = vmx_find_uret_msr(vmx, msr_index); @@ -2559,7 +2671,6 @@ void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) break; } } -#endif /* !__PKVM_HYP__ */ /* * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID @@ -3129,6 +3240,7 @@ static __init int alloc_kvm_area(void) } return 0; } +#endif /* !__PKVM_HYP__ */ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) @@ -3516,12 +3628,14 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->vt.emulation_required = vmx_emulation_required(vcpu); } +#ifndef __PKVM_HYP__ static int vmx_get_max_ept_level(void) { if (cpu_has_vmx_ept_5levels()) return 5; return 4; } +#endif /* !__PKVM_HYP__ */ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { @@ -3593,7 +3707,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ +#ifndef __PKVM_HYP__ hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); +#else + hw_cr4 = (native_read_cr4() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); +#endif if (enable_unrestricted_guest) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; else if (vmx->rmode.vm86_active) @@ -4004,6 +4122,7 @@ bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) return true; } +#ifndef __PKVM_HYP__ static int init_rmode_tss(struct kvm *kvm, void __user *ua) { const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); @@ -4066,6 +4185,7 @@ static int init_rmode_identity_map(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); return r; } +#endif /* !__PKVM_HYP__ */ static void seg_setup(int seg) { @@ -4081,7 +4201,6 @@ static void seg_setup(int seg) vmcs_write32(sf->ar_bytes, ar); } -#endif /* !__PKVM_HYP__ */ int allocate_vpid(void) { @@ -4167,7 +4286,6 @@ void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool se } } -#ifndef __PKVM_HYP__ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { /* @@ -4228,6 +4346,7 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) } } +#ifndef __PKVM_HYP__ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4398,6 +4517,7 @@ void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, trig_mode, vector); } } +#endif /* !__PKVM_HYP__ */ /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that @@ -4424,7 +4544,11 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. 
*/ +#ifdef __PKVM_HYP__ + cr4 = native_read_cr4(); +#else cr4 = cr4_read_shadow(); +#endif vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ vmx->loaded_vmcs->host_state.cr4 = cr4; @@ -4786,7 +4910,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) return exec_control; } -#endif /* !__PKVM_HYP__ */ static inline int vmx_get_pid_table_order(struct kvm *kvm) { @@ -4865,7 +4988,6 @@ int vmx_vcpu_precreate(struct kvm *kvm) return vmx_alloc_ipiv_pid_table(kvm); } -#ifndef __PKVM_HYP__ #define VMX_XSS_EXIT_BITMAP 0 static void init_vmcs(struct vcpu_vmx *vmx) @@ -4897,6 +5019,9 @@ static void init_vmcs(struct vcpu_vmx *vmx) tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { +#ifdef __PKVM_HYP__ + struct vcpu_vmx *shared_vmx = to_vmx(to_pkvm_vcpu(&vmx->vcpu)->shared_vcpu); +#endif vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4905,7 +5030,17 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); +#ifndef __PKVM_HYP__ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); +#else + /* + * The pKVM hypervisor needs to use the pi_desc from the shared + * vmx to set the POSTED_INTR_DESC_ADDR as the host will post + * the virtual interrupt to the guest via its pi_desc. + */ + vmcs_write64(POSTED_INTR_DESC_ADDR, + __pa(kern_pkvm_va(&shared_vmx->vt.pi_desc))); +#endif } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -5038,7 +5173,10 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->msr_ia32_umwait_control = 0; vmx->hv_deadline_tsc = -1; + /* The host VMM handles the virtual APIC for the guest. */ +#ifndef __PKVM_HYP__ kvm_set_cr8(vcpu, 0); +#endif seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); @@ -5085,20 +5223,21 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_cpu_cap_has(X86_FEATURE_SHSTK)) vmcs_writel(GUEST_S_CET, 0); + /* The host VMM handles the virtual APIC for the guest. 
*/ +#ifndef __PKVM_HYP__ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); +#endif vpid_sync_context(vmx->vpid); vmx_update_fb_clear_dis(vcpu, vmx); } -#endif /* !__PKVM_HYP__ */ void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -#ifndef __PKVM_HYP__ void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || @@ -5119,6 +5258,7 @@ void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); ++vcpu->stat.irq_injections; +#ifndef __PKVM_HYP__ if (vmx->rmode.vm86_active) { int inc_eip = 0; if (vcpu->arch.interrupt.soft) @@ -5126,6 +5266,7 @@ void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); return; } +#endif intr = irq | INTR_INFO_VALID_MASK; if (vcpu->arch.interrupt.soft) { intr |= INTR_TYPE_SOFT_INTR; @@ -5158,10 +5299,12 @@ void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; vmx->loaded_vmcs->nmi_known_unmasked = false; +#ifndef __PKVM_HYP__ if (vmx->rmode.vm86_active) { kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); return; } +#endif vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); @@ -5258,6 +5401,7 @@ int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_interrupt_blocked(vcpu); } +#ifndef __PKVM_HYP__ int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; @@ -5786,6 +5930,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) out: return kvm_complete_insn_gp(vcpu, err); } +#endif /* !__PKVM_HYP__ */ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { @@ -5811,6 +5956,7 @@ void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) vmcs_writel(GUEST_DR7, val); } +#ifndef __PKVM_HYP__ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); @@ -6929,6 +7075,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) [size] "r" (size) : "eax", "ebx", "ecx", "edx"); } +#endif /* !__PKVM_HYP__ */ void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { @@ -6964,6 +7111,17 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } +#ifdef __PKVM_HYP__ + /* + * Emulating xapic mode requires instruction decoding. As pVM's CPU and + * memory state are isolated from the host, the host cannot decode pVM's + * instruction. Not to use xapic mode for a pVM. + */ + if (pkvm_is_protected_vcpu(vcpu) && + (kvm_get_apic_mode(vcpu) == LAPIC_MODE_XAPIC)) + return; +#endif + sec_exec_control = secondary_exec_controls_get(vmx); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -7000,6 +7158,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap_x2apic(vcpu); } +#ifndef __PKVM_HYP__ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; @@ -7070,6 +7229,7 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) */ read_unlock(&vcpu->kvm->mmu_lock); } +#endif /* !__PKVM_HYP__ */ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { @@ -7128,6 +7288,7 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { +#ifndef __PKVM_HYP__ struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; @@ -7164,11 +7325,19 @@ int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * attempt to post interrupts. 
The posted interrupt vector will cause * a VM-Exit and the subsequent entry will call sync_pir_to_irr. */ - if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) - vmx_set_rvi(max_irr); - else if (got_posted_interrupt) + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) { + if (!enable_pkvm) + vmx_set_rvi(max_irr); + else if (max_irr != -1) + KVM_BUG_ON(pkvm_hypercall(sync_pir_to_irr, max_irr), vcpu->kvm); + } else if (got_posted_interrupt) { kvm_make_request(KVM_REQ_EVENT, vcpu); + } +#else + int max_irr = to_pkvm_vcpu(vcpu)->max_irr; + vmx_set_rvi(max_irr); +#endif return max_irr; } @@ -7183,6 +7352,7 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } +#ifndef __PKVM_HYP__ void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -7319,6 +7489,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->loaded_vmcs->entry_time)); } +#endif /* !__PKVM_HYP__ */ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, u32 idt_vectoring_info, @@ -7378,12 +7549,14 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, } } +#ifndef __PKVM_HYP__ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_ERROR_CODE); } +#endif /* !__PKVM_HYP__ */ void vmx_cancel_injection(struct kvm_vcpu *vcpu) { @@ -7395,6 +7568,7 @@ void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } +#ifndef __PKVM_HYP__ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) { int i, nr_msrs; @@ -7443,6 +7617,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit vmx->loaded_vmcs->hv_timer_soft_disabled = true; } } +#endif /* !__PKVM_HYP__ */ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { @@ -7451,7 +7626,6 @@ void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) vmcs_writel(HOST_RSP, host_rsp); } } -#endif /* !__PKVM_HYP__ */ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags) @@ -7954,6 +8128,7 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); } +#endif /* !__PKVM_HYP__ */ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { @@ -7974,6 +8149,7 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); } +#ifndef __PKVM_HYP__ /* * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits * (indicating "allowed-1") if they are supported in the guest's CPUID. 
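In the vmx_sync_pir_to_irr() hunk above, the work is split: the host side computes max_irr from the posted-interrupt state and hands it to the pKVM hypervisor through the sync_pir_to_irr hypercall, while the __PKVM_HYP__ side only programs RVI from to_pkvm_vcpu(vcpu)->max_irr. As a standalone illustration of what "highest pending vector" means for a 256-bit posted-interrupt request bitmap, here is a minimal sketch; toy_pir_max_irr() and the PIR_WORDS layout are assumptions for the example, not the kernel's implementation.

#include <stdint.h>
#include <stdio.h>

#define PIR_WORDS 4	/* 4 x 64 bits covers interrupt vectors 0-255 */

/* Return the highest pending vector in the bitmap, or -1 if none. */
static int toy_pir_max_irr(const uint64_t pir[PIR_WORDS])
{
	for (int w = PIR_WORDS - 1; w >= 0; w--) {
		if (!pir[w])
			continue;
		/* __builtin_clzll counts leading zeros (GCC/Clang builtin) */
		return w * 64 + (63 - __builtin_clzll(pir[w]));
	}
	return -1;
}

int main(void)
{
	uint64_t pir[PIR_WORDS] = { 0 };

	pir[0] |= 1ull << 0x20;	/* vector 0x20 pending */
	pir[1] |= 1ull << 7;	/* vector 0x47 pending */

	printf("max_irr = 0x%x\n", toy_pir_max_irr(pir));	/* prints 0x47 */
	return 0;
}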
@@ -8091,6 +8267,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } +#endif /* !__PKVM_HYP__ */ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { @@ -8119,12 +8296,15 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); + /* The pKVM hypervisor has disabled nested and PT */ +#ifndef __PKVM_HYP__ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); +#endif if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; @@ -8154,6 +8334,7 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } +#ifndef __PKVM_HYP__ static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PERF_CAP_FW_WRITES; @@ -8502,6 +8683,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) else secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } +#endif /* !__PKVM_HYP__ */ void vmx_setup_mce(struct kvm_vcpu *vcpu) { @@ -8513,6 +8695,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } +#ifndef __PKVM_HYP__ #ifdef CONFIG_KVM_SMM int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 69d7de8353ac..047e092f47ad 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -814,6 +814,12 @@ extern void *pkvm_sym(pkvm_vmx_init_ops); int pkvm_vmx_init(void); #endif +bool pkvm_interrupt_blocked(struct kvm_vcpu *vcpu); + +#else + +static inline bool pkvm_interrupt_blocked(struct kvm_vcpu *vcpu) { return false; } + #endif /* CONFIG_PKVM_INTEL */ #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 96677576c836..903822bce814 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -303,6 +303,24 @@ static inline void vmcs_load(struct vmcs *vmcs) vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr); } +static inline u64 vmcs_store(void) +{ + u64 phys_addr = INVALID_PAGE; + + asm goto("1: vmptrst (%0)\n\t" + _ASM_EXTABLE(1b, %l[do_exception]) + : + : "r" (&phys_addr) + : "cc", "memory" + : do_exception); + + return phys_addr; + +do_exception: + kvm_spurious_fault(); + return INVALID_PAGE; +} + static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) { struct { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 13398d68a0aa..a7d223ffb76f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -83,6 +83,7 @@ #include #include #include +#include #include #ifdef __PKVM_HYP__ @@ -146,9 +147,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static DEFINE_MUTEX(vendor_module_lock); +#endif /* !__PKVM_HYP__ */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); -#endif /* !__PKVM_HYP__ */ struct kvm_x86_ops kvm_x86_ops __read_mostly; #ifndef __PKVM_HYP__ @@ -163,11 +164,13 @@ EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, 0644); +#endif /* !__PKVM_HYP__ */ bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, 0644); EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); +#ifndef 
__PKVM_HYP__ unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); @@ -177,11 +180,13 @@ module_param(kvmclock_periodic_sync, bool, 0444); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); +#endif /* !__PKVM_HYP__ */ bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor); +#ifndef __PKVM_HYP__ /* * Flags to manipulate forced emulation behavior (any non-zero value will * enable forced emulation). @@ -662,14 +667,15 @@ void kvm_user_return_msr_cpu_online(void) } } -#ifndef __PKVM_HYP__ static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) { +#ifndef __PKVM_HYP__ if (!msrs->registered) { msrs->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&msrs->urn); msrs->registered = true; } +#endif } int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) @@ -690,6 +696,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); +#ifndef __PKVM_HYP__ void kvm_user_return_msr_update_cache(unsigned int slot, u64 value) { struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); @@ -938,6 +945,7 @@ static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, { kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +#endif /* !__PKVM_HYP__ */ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, bool has_error_code, u32 error_code) @@ -968,6 +976,7 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception); +#ifndef __PKVM_HYP__ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -1271,6 +1280,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state); +#endif /* !__PKVM_HYP__ */ #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) @@ -1325,6 +1335,7 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr); +#ifndef __PKVM_HYP__ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. 
*/ @@ -1532,6 +1543,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) return vcpu->arch.cr8; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); +#endif /* !__PKVM_HYP__ */ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { @@ -1558,6 +1570,7 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); +#ifndef __PKVM_HYP__ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; @@ -1631,6 +1644,7 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); +#endif /* !__PKVM_HYP__ */ /* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM @@ -1651,7 +1665,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) -static u64 kvm_get_arch_capabilities(void) +u64 kvm_get_arch_capabilities(void) { u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; @@ -1663,6 +1677,7 @@ static u64 kvm_get_arch_capabilities(void) */ data |= ARCH_CAP_PSCHANGE_MC_NO; +#ifndef __PKVM_HYP__ /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. @@ -1674,6 +1689,18 @@ static u64 kvm_get_arch_capabilities(void) */ if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; +#else + /* + * CPUs that can run the pKVM hypervisor don't have the L1TF CPU bug. + * This is guaranteed by pkvm_mitigate_cpu_bugs(), which currently + * doesn't mitigate L1TF and thus would fail pKVM initialization if L1TF + * were present, so we can set ARCH_CAP_SKIP_VMENTRY_L1DFLUSH for the + * guest. As the pKVM hypervisor doesn't support nested virtualization, + * passing this cap to the guest is not strictly necessary, but pass it + * anyway in case nested support is added in the future. + */ + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; +#endif if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) data |= ARCH_CAP_RDCL_NO; @@ -1705,12 +1732,23 @@ static u64 kvm_get_arch_capabilities(void) */ } +#ifndef __PKVM_HYP__ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) data |= ARCH_CAP_GDS_NO; +#else + /* + * CPUs that can run the pKVM hypervisor don't have the GDS bug. This + * is guaranteed by pkvm_mitigate_cpu_bugs(), which currently doesn't + * mitigate GDS and thus would fail pKVM initialization if GDS were + * present, so we can set ARCH_CAP_GDS_NO. + */ + data |= ARCH_CAP_GDS_NO; +#endif return data; } +#ifndef __PKVM_HYP__ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1740,6 +1778,7 @@ static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, kvm_get_feature_msr); } +#endif /* !__PKVM_HYP__ */ static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { @@ -1773,7 +1812,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { +#ifndef __PKVM_HYP__ u64 old_efer = vcpu->arch.efer; +#endif u64 efer = msr_info->data; int r; @@ -1798,16 +1839,18 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return r; } + /* TODO: Notify the host VMM to reset the KVM MMU.
*/ +#ifndef __PKVM_HYP__ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); if (!static_cpu_has(X86_FEATURE_XSAVES) && (efer & EFER_SVME)) kvm_hv_xsaves_xsavec_maybe_warn(vcpu); +#endif return 0; } -#endif /* !__PKVM_HYP__ */ void kvm_enable_efer_bits(u64 mask) { @@ -1867,7 +1910,6 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); -#ifndef __PKVM_HYP__ /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1974,6 +2016,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_call(set_msr)(vcpu, &msr); } +#ifndef __PKVM_HYP__ static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1986,6 +2029,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, _kvm_set_msr); } +#endif /* !__PKVM_HYP__ */ /* * Read the MSR specified by @index into @data. Select MSR specific fault @@ -2044,6 +2088,7 @@ int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) return __kvm_get_msr(vcpu, index, data, true); } +#ifndef __PKVM_HYP__ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -3586,6 +3631,7 @@ static void kvmclock_sync_fn(struct work_struct *work) schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } +#endif /* !__PKVM_HYP__ */ /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) @@ -3681,6 +3727,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } +#ifndef __PKVM_HYP__ static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; @@ -3962,12 +4009,14 @@ static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); } +#endif /* !__PKVM_HYP__ */ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; u64 data = msr_info->data; +#ifndef __PKVM_HYP__ /* * Do not allow host-initiated writes to trigger the Xen hypercall * page setup; it could incur locking paths which are not expected @@ -3976,6 +4025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); +#endif switch (msr) { case MSR_AMD64_NB_CFG: @@ -4015,7 +4065,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; vcpu->arch.perf_capabilities = data; + /* + * The pkvm hypervisor doesn't provide X86_FEATURE_PDCM to the + * guest thus no need to do PMU refresh. + */ +#ifndef __PKVM_HYP__ kvm_pmu_refresh(vcpu); +#endif break; case MSR_IA32_PRED_CMD: { u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); @@ -4092,6 +4148,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); +#ifndef __PKVM_HYP__ case MSR_IA32_APICBASE: return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: @@ -4112,6 +4169,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr = data; } break; +#endif case MSR_IA32_MISC_ENABLE: { u64 old_val = vcpu->arch.ia32_misc_enable_msr; @@ -4144,6 +4202,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: vcpu->arch.msr_ia32_power_ctl = data; break; +#ifndef __PKVM_HYP__ case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); @@ -4153,6 +4212,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.ia32_tsc_adjust_msr += adj; } break; +#endif case MSR_IA32_XSS: if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return KVM_MSR_RET_UNSUPPORTED; @@ -4169,6 +4229,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smi_count = data; break; +#ifndef __PKVM_HYP__ case MSR_KVM_WALL_CLOCK_NEW: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) return 1; @@ -4253,6 +4314,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_kvm_poll_control = data; break; +#endif case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -4264,8 +4326,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); +#endif if (data) kvm_pr_unimpl_wrmsr(vcpu, msr, data); @@ -4280,7 +4344,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4344,13 +4408,25 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + /* + * These CET related MSRs are passed-through to the guest, and this code + * is not for the KVM to emulate rdmsr/wrmsr instruction, but for the + * KVM (or the userspace VMM) to access the guest CET MSRs for managing + * the guest FPU state, or emulating some other instructions (e.g., task + * switch). For a pVM these MSRs are inaccessible to the host anyway and + * the pKVM hypervisor itself doesn't need to access them either. + */ +#ifndef __PKVM_HYP__ case MSR_IA32_U_CET: case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: kvm_set_xstate_msr(vcpu, msr_info); break; +#endif default: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); +#endif return KVM_MSR_RET_UNSUPPORTED; } @@ -4446,8 +4522,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... 
MSR_P6_EVNTSEL1: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); +#endif msr_info->data = 0; break; case MSR_IA32_UCODE_REV: @@ -4466,6 +4544,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; +#ifndef __PKVM_HYP__ case MSR_IA32_TSC: { /* * Intel SDM states that MSR_IA32_TSC read adds the TSC offset @@ -4489,6 +4568,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } +#endif case MSR_IA32_CR_PAT: msr_info->data = vcpu->arch.pat; break; @@ -4513,6 +4593,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EBC_FREQUENCY_ID: msr_info->data = 1 << 24; break; +#ifndef __PKVM_HYP__ case MSR_IA32_APICBASE: msr_info->data = vcpu->arch.apic_base; break; @@ -4524,6 +4605,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; +#endif case MSR_IA32_MISC_ENABLE: msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; @@ -4544,6 +4626,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: msr_info->data = vcpu->arch.efer; break; +#ifndef __PKVM_HYP__ case MSR_KVM_WALL_CLOCK: if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) return 1; @@ -4604,6 +4687,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.msr_kvm_poll_control; break; +#endif case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -4631,7 +4715,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; -#ifdef CONFIG_KVM_HYPERV +#if defined(CONFIG_KVM_HYPERV) && !defined(__PKVM_HYP__) case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -4697,13 +4781,25 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; #endif + /* + * These CET related MSRs are passed-through to the guest, and this code + * is not for the KVM to emulate rdmsr/wrmsr instruction, but for the + * KVM (or the userspace VMM) to access the guest CET MSRs for managing + * the guest FPU state, or emulating some other instructions (e.g., task + * switch). For a pVM these MSRs are inaccessible to the host anyway and + * the pKVM hypervisor itself doesn't need to access them either. + */ +#ifndef __PKVM_HYP__ case MSR_IA32_U_CET: case MSR_IA32_PL0_SSP ... 
MSR_IA32_PL3_SSP: kvm_get_xstate_msr(vcpu, msr_info); break; +#endif default: +#ifndef __PKVM_HYP__ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); +#endif return KVM_MSR_RET_UNSUPPORTED; } @@ -4711,6 +4807,57 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); +#ifdef CONFIG_PKVM_X86 +bool pkvm_host_has_emulated_msr(struct kvm *kvm, u32 msr) +{ + switch (msr) { + case MSR_KVM_WALL_CLOCK: + case MSR_KVM_WALL_CLOCK_NEW: + case MSR_KVM_SYSTEM_TIME: + case MSR_KVM_SYSTEM_TIME_NEW: + case MSR_KVM_ASYNC_PF_EN: + case MSR_KVM_ASYNC_PF_INT: + case MSR_KVM_ASYNC_PF_ACK: + case MSR_KVM_STEAL_TIME: + case MSR_KVM_PV_EOI_EN: + case MSR_KVM_POLL_CONTROL: +#if defined(CONFIG_KVM_HYPERV) + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_CRASH_CTL: + case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: + case HV_X64_MSR_REENLIGHTENMENT_CONTROL: + case HV_X64_MSR_TSC_EMULATION_CONTROL: + case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: +#endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (pkvm_is_protected_vm(kvm)) + return false; + fallthrough; + case MSR_IA32_TSC_ADJUST: + case MSR_IA32_TSC: + case MSR_IA32_APICBASE: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: + case MSR_IA32_TSC_DEADLINE: + return true; + default: + /* + * All other emulated MSRs are directly emulated by the pKVM + * hypervisor. + */ + break; + } + + return false; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(pkvm_host_has_emulated_msr); +#endif + +#ifndef __PKVM_HYP__ /* * Read or write a bunch of msrs. All parameters are kernel addresses. * @@ -5461,9 +5608,9 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, vcpu->arch.tpr_access_reporting = !!tac->enabled; return 0; } +#endif /* !__PKVM_HYP__ */ -static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, - u64 mcg_cap) +int kvm_vcpu_x86_setup_mce(struct kvm_vcpu *vcpu, u64 mcg_cap) { int r; unsigned bank_num = mcg_cap & 0xff, bank; @@ -5492,6 +5639,13 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, return r; } +#ifndef __PKVM_HYP__ +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + return kvm_vcpu_x86_setup_mce(vcpu, mcg_cap); +} + /* * Validate this is an UCNA (uncorrectable no action) error by checking the * MCG_STATUS and MCi_STATUS registers: @@ -8509,12 +8663,14 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, { return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); } +#endif /* !__PKVM_HYP__ */ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_call(get_segment_base)(vcpu, seg); } +#ifndef __PKVM_HYP__ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); @@ -11828,6 +11984,7 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; } +#endif /* !__PKVM_HYP__ */ /* Swap (qemu) user FPU context for the guest FPU context. 
*/ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) @@ -11851,6 +12008,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) trace_kvm_fpu(0); } +#ifndef __PKVM_HYP__ static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) { /* @@ -12806,6 +12964,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); } +#endif /* !__PKVM_HYP__ */ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) { @@ -12821,6 +12980,15 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event || !fpstate) return; +#ifdef __PKVM_HYP__ + /* + * An npVM's FPU state is managed by the host, so the pKVM hypervisor + * doesn't need to reset it here. + */ + if (!pkvm_is_protected_vcpu(vcpu)) + return; +#endif + /* * On INIT, only select XSTATE components are zeroed, most components * are unchanged. Currently, the only components that are zeroed and @@ -12879,7 +13047,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (is_guest_mode(vcpu)) kvm_leave_nested(vcpu); + /* The virtual APIC is emulated by the host rather than by the pKVM hypervisor. */ +#ifndef __PKVM_HYP__ kvm_lapic_reset(vcpu, init_event); +#endif WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); vcpu->arch.hflags = 0; @@ -12905,11 +13076,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; + /* + * In the pKVM hypervisor build, kvmclock and async_pf are emulated by + * the host. + */ +#ifndef __PKVM_HYP__ kvmclock_reset(vcpu); kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; +#endif kvm_xstate_reset(vcpu, init_event); @@ -12974,7 +13151,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) { kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + /* The host will reset the KVM MMU context.
*/ +#ifndef __PKVM_HYP__ kvm_mmu_reset_context(vcpu); +#endif } /* @@ -12991,6 +13171,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset); +#ifndef __PKVM_HYP__ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { struct kvm_segment cs; @@ -13679,6 +13860,7 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { return kvm_x86_call(interrupt_allowed)(vcpu, false); } +#endif /* !__PKVM_HYP__ */ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { @@ -13725,6 +13907,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); +#ifndef __PKVM_HYP__ static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); @@ -13998,6 +14181,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) } #endif #endif +#endif /* !__PKVM_HYP__ */ int kvm_spec_ctrl_test_value(u64 value) { @@ -14007,10 +14191,14 @@ int kvm_spec_ctrl_test_value(u64 value) */ u64 saved_value; +#ifndef __PKVM_HYP__ unsigned long flags; +#endif int ret = 0; +#ifndef __PKVM_HYP__ local_irq_save(flags); +#endif if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; @@ -14019,12 +14207,15 @@ int kvm_spec_ctrl_test_value(u64 value) else wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); +#ifndef __PKVM_HYP__ local_irq_restore(flags); +#endif return ret; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); +#ifndef __PKVM_HYP__ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 416f8570bb14..8a6dad689b03 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -737,6 +737,7 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); void kvm_user_return_msr_cpu_online(void); +u64 kvm_get_arch_capabilities(void); #define CET_US_RESERVED_BITS GENMASK(9, 6) #define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index ff44ec346162..5f3302bf2cf4 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -134,8 +134,9 @@ struct ddebug_class_param { * pr_debug() and friends are globally enabled or modules have selectively * enabled them. 
*/ -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#if (defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))) && \ + (!defined(__PKVM_HYP__) || defined(CONFIG_PKVM_X86_DEBUG)) extern __printf(2, 3) void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 57b074e0cfbb..440f2dbe8335 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -39,7 +39,7 @@ struct task_struct; struct task_struct *idle) {} #endif -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(__PKVM_HYP__) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); @@ -127,7 +127,7 @@ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif -#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) && !defined(__PKVM_HYP__) # define lockdep_softirq_enter() \ do { \ current->softirq_context++; \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..435701d5e1e5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -878,6 +878,18 @@ struct kvm { char stats_id[KVM_STATS_NAME_SIZE]; }; +#ifdef __PKVM_HYP__ +#define kvm_err(fmt, ...) \ + pr_err("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_info(fmt, ...) \ + pr_info("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_debug(fmt, ...) \ + pr_debug("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_debug_ratelimited(fmt, ...) \ + pr_debug_ratelimited("pkvm: " fmt, ## __VA_ARGS__) +#define kvm_pr_unimpl(fmt, ...) \ + pr_err_ratelimited("pkvm: " fmt, ## __VA_ARGS__) +#else #define kvm_err(fmt, ...) \ pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__) #define kvm_info(fmt, ...) \ @@ -890,6 +902,7 @@ struct kvm { #define kvm_pr_unimpl(fmt, ...) \ pr_err_ratelimited("kvm [%i]: " fmt, \ task_tgid_nr(current), ## __VA_ARGS__) +#endif /* The guest did something we don't support. */ #define vcpu_unimpl(vcpu, fmt, ...) \ @@ -907,7 +920,11 @@ struct kvm { static inline void kvm_vm_dead(struct kvm *kvm) { kvm->vm_dead = true; +#ifndef __PKVM_HYP__ kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); +#else + /* TODO: Handle VM dead in the pKVM. */ +#endif } static inline void kvm_vm_bugged(struct kvm *kvm) diff --git a/include/linux/printk.h b/include/linux/printk.h index 45c663124c9b..c15f1faf962c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -735,8 +735,9 @@ struct pi_entry { #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) || \ - (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) +#if (defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))) && \ + (!defined(__PKVM_HYP__) || defined(CONFIG_PKVM_X86_DEBUG)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \