
Commit d96c77b

KVM: x86: switch hugepage recovery thread to vhost_task
kvm_vm_create_worker_thread() is meant to be used for kthreads that can
consume significant amounts of CPU time on behalf of a VM or in response
to how the VM behaves (for example how it accesses its memory). Therefore
it wants to charge the CPU time consumed by that work to the VM's
container.

However, because of these threads, cgroups which have KVM instances
inside never complete freezing. This can be trivially reproduced:

  root@test ~# mkdir /sys/fs/cgroup/test
  root@test ~# echo $$ > /sys/fs/cgroup/test/cgroup.procs
  root@test ~# qemu-system-x86_64 -nographic -enable-kvm

and in another terminal:

  root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze
  root@test ~# cat /sys/fs/cgroup/test/cgroup.events
  populated 1
  frozen 0

The cgroup freezing happens in the signal delivery path, but
kvm_nx_huge_page_recovery_worker, while joining non-root cgroups, never
calls into the signal delivery path and thus never gets frozen. Because
the cgroup freezer determines whether a given cgroup is frozen by
comparing the number of frozen threads to the total number of threads in
the cgroup, the cgroup never becomes frozen and users waiting for the
state transition may hang indefinitely.

Since the worker kthread is tied to a user process, it's better if it
behaves similarly to user tasks as much as possible, including being able
to receive SIGSTOP and SIGCONT. In fact, vhost_task is all that
kvm_vm_create_worker_thread() wanted to be and more: not only does it
inherit the userspace process's cgroups, it has other niceties like being
parented properly in the process tree. Use it instead of the homegrown
alternative.

Incidentally, the new code is also better behaved when you flip recovery
back and forth between disabled and enabled. If your recovery period is
1 minute, the next recovery will run 1 minute after the last one,
independent of how many times you flipped the parameter.

(Commit message based on emails from Tejun.)

Reported-by: Tejun Heo <[email protected]>
Reported-by: Luca Boccassi <[email protected]>
Acked-by: Tejun Heo <[email protected]>
Tested-by: Luca Boccassi <[email protected]>
Cc: [email protected]
Reviewed-by: Sean Christopherson <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
1 parent 0586ade commit d96c77b
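For reference, these are the vhost_task entry points the patch switches to. The signatures below are a sketch based on the calls in this diff (the API is declared in <linux/sched/vhost_task.h>); the parameter names are illustrative:

  struct vhost_task;

  /*
   * fn is invoked in a loop by the vhost_task infrastructure: returning
   * true asks to be called again (after a check for pending signals),
   * returning false sleeps until vhost_task_wake(). handle_sigkill runs
   * when the owning process is killed, before the task exits.
   */
  struct vhost_task *vhost_task_create(bool (*fn)(void *),
                                       void (*handle_sigkill)(void *),
                                       void *arg, const char *name);
  void vhost_task_start(struct vhost_task *vtsk);
  void vhost_task_wake(struct vhost_task *vtsk);
  void vhost_task_stop(struct vhost_task *vtsk);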

File tree

5 files changed: +35 -147 lines changed

arch/x86/include/asm/kvm_host.h

+3-1
@@ -26,6 +26,7 @@
 #include <linux/irqbypass.h>
 #include <linux/hyperv.h>
 #include <linux/kfifo.h>
+#include <linux/sched/vhost_task.h>
 
 #include <asm/apic.h>
 #include <asm/pvclock-abi.h>
@@ -1442,7 +1443,8 @@ struct kvm_arch {
         bool sgx_provisioning_allowed;
 
         struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
-        struct task_struct *nx_huge_page_recovery_thread;
+        struct vhost_task *nx_huge_page_recovery_thread;
+        u64 nx_huge_page_last;
 
 #ifdef CONFIG_X86_64
         /* The number of TDP MMU pages across all roots. */

arch/x86/kvm/Kconfig

+1
@@ -30,6 +30,7 @@ config KVM_X86
         select HAVE_KVM_IRQ_BYPASS
         select HAVE_KVM_IRQ_ROUTING
         select HAVE_KVM_READONLY_MEM
+        select VHOST_TASK
         select KVM_ASYNC_PF
         select USER_RETURN_NOTIFIER
         select KVM_MMIO

arch/x86/kvm/mmu/mmu.c

+31-37
@@ -7162,7 +7162,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
                         kvm_mmu_zap_all_fast(kvm);
                         mutex_unlock(&kvm->slots_lock);
 
-                        wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+                        vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
                 }
                 mutex_unlock(&kvm_lock);
         }
@@ -7291,7 +7291,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
                 mutex_lock(&kvm_lock);
 
                 list_for_each_entry(kvm, &vm_list, vm_list)
                        wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
-                        wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
+                        vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread);
 
                 mutex_unlock(&kvm_lock);
         }
@@ -7394,62 +7394,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
         srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
-static long get_nx_huge_page_recovery_timeout(u64 start_time)
+static void kvm_nx_huge_page_recovery_worker_kill(void *data)
 {
-        bool enabled;
-        uint period;
-
-        enabled = calc_nx_huge_pages_recovery_period(&period);
-
-        return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
-                       : MAX_SCHEDULE_TIMEOUT;
 }
 
-static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
+static bool kvm_nx_huge_page_recovery_worker(void *data)
 {
-        u64 start_time;
+        struct kvm *kvm = data;
+        bool enabled;
+        uint period;
         long remaining_time;
 
-        while (true) {
-                start_time = get_jiffies_64();
-                remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-
-                set_current_state(TASK_INTERRUPTIBLE);
-                while (!kthread_should_stop() && remaining_time > 0) {
-                        schedule_timeout(remaining_time);
-                        remaining_time = get_nx_huge_page_recovery_timeout(start_time);
-                        set_current_state(TASK_INTERRUPTIBLE);
-                }
-
-                set_current_state(TASK_RUNNING);
-
-                if (kthread_should_stop())
-                        return 0;
+        enabled = calc_nx_huge_pages_recovery_period(&period);
+        if (!enabled)
+                return false;
 
-                kvm_recover_nx_huge_pages(kvm);
+        remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period)
+                - get_jiffies_64();
+        if (remaining_time > 0) {
+                schedule_timeout(remaining_time);
+                /* check for signals and come back */
+                return true;
         }
+
+        __set_current_state(TASK_RUNNING);
+        kvm_recover_nx_huge_pages(kvm);
+        kvm->arch.nx_huge_page_last = get_jiffies_64();
+        return true;
 }
 
 int kvm_mmu_post_init_vm(struct kvm *kvm)
 {
-        int err;
-
         if (nx_hugepage_mitigation_hard_disabled)
                 return 0;
 
-        err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
-                                          "kvm-nx-lpage-recovery",
-                                          &kvm->arch.nx_huge_page_recovery_thread);
-        if (!err)
-                kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
+        kvm->arch.nx_huge_page_last = get_jiffies_64();
+        kvm->arch.nx_huge_page_recovery_thread = vhost_task_create(
+                kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill,
+                kvm, "kvm-nx-lpage-recovery");
 
-        return err;
+        if (!kvm->arch.nx_huge_page_recovery_thread)
+                return -ENOMEM;
+
+        vhost_task_start(kvm->arch.nx_huge_page_recovery_thread);
+        return 0;
 }
 
 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
 {
         if (kvm->arch.nx_huge_page_recovery_thread)
-                kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
+                vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread);
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
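The bool return value of the new worker is the vhost_task contract in action: returning false (recovery disabled) parks the task until vhost_task_wake(), while returning true asks to be invoked again after the infrastructure checks for pending signals, which is why the schedule_timeout() branch above can simply return. A minimal sketch of a worker following the same contract; struct my_ctx and the work_is_due()/do_one_round() helpers are hypothetical:

  static bool my_worker(void *data)
  {
          struct my_ctx *ctx = data;      /* hypothetical per-VM context */
  
          if (!work_is_due(ctx))          /* hypothetical predicate */
                  return false;           /* sleep until vhost_task_wake() */
  
          do_one_round(ctx);              /* hypothetical bounded work item */
          return true;                    /* re-invoked after a signal check */
  }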

include/linux/kvm_host.h

-6
@@ -2425,12 +2425,6 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
 
-typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data);
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-                                uintptr_t data, const char *name,
-                                struct task_struct **thread_ptr);
-
 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
 static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 {

virt/kvm/kvm_main.c

-103
@@ -6426,106 +6426,3 @@ void kvm_exit(void)
         kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-
-struct kvm_vm_worker_thread_context {
-        struct kvm *kvm;
-        struct task_struct *parent;
-        struct completion init_done;
-        kvm_vm_thread_fn_t thread_fn;
-        uintptr_t data;
-        int err;
-};
-
-static int kvm_vm_worker_thread(void *context)
-{
-        /*
-         * The init_context is allocated on the stack of the parent thread, so
-         * we have to locally copy anything that is needed beyond initialization
-         */
-        struct kvm_vm_worker_thread_context *init_context = context;
-        struct task_struct *parent;
-        struct kvm *kvm = init_context->kvm;
-        kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
-        uintptr_t data = init_context->data;
-        int err;
-
-        err = kthread_park(current);
-        /* kthread_park(current) is never supposed to return an error */
-        WARN_ON(err != 0);
-        if (err)
-                goto init_complete;
-
-        err = cgroup_attach_task_all(init_context->parent, current);
-        if (err) {
-                kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
-                        __func__, err);
-                goto init_complete;
-        }
-
-        set_user_nice(current, task_nice(init_context->parent));
-
-init_complete:
-        init_context->err = err;
-        complete(&init_context->init_done);
-        init_context = NULL;
-
-        if (err)
-                goto out;
-
-        /* Wait to be woken up by the spawner before proceeding. */
-        kthread_parkme();
-
-        if (!kthread_should_stop())
-                err = thread_fn(kvm, data);
-
-out:
-        /*
-         * Move kthread back to its original cgroup to prevent it lingering in
-         * the cgroup of the VM process, after the latter finishes its
-         * execution.
-         *
-         * kthread_stop() waits on the 'exited' completion condition which is
-         * set in exit_mm(), via mm_release(), in do_exit(). However, the
-         * kthread is removed from the cgroup in the cgroup_exit() which is
-         * called after the exit_mm(). This causes the kthread_stop() to return
-         * before the kthread actually quits the cgroup.
-         */
-        rcu_read_lock();
-        parent = rcu_dereference(current->real_parent);
-        get_task_struct(parent);
-        rcu_read_unlock();
-        cgroup_attach_task_all(parent, current);
-        put_task_struct(parent);
-
-        return err;
-}
-
-int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
-                                uintptr_t data, const char *name,
-                                struct task_struct **thread_ptr)
-{
-        struct kvm_vm_worker_thread_context init_context = {};
-        struct task_struct *thread;
-
-        *thread_ptr = NULL;
-        init_context.kvm = kvm;
-        init_context.parent = current;
-        init_context.thread_fn = thread_fn;
-        init_context.data = data;
-        init_completion(&init_context.init_done);
-
-        thread = kthread_run(kvm_vm_worker_thread, &init_context,
-                             "%s-%d", name, task_pid_nr(current));
-        if (IS_ERR(thread))
-                return PTR_ERR(thread);
-
-        /* kthread_run is never supposed to return NULL */
-        WARN_ON(thread == NULL);
-
-        wait_for_completion(&init_context.init_done);
-
-        if (!init_context.err)
-                *thread_ptr = thread;
-
-        return init_context.err;
-}
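Note that no replacement for the removed cgroup handling is needed: per the commit message, a vhost_task is created as a child of the owning userspace process, so it starts out in that process's cgroups and is parented properly in the process tree, which is the property the cgroup_attach_task_all() round-trip above (with its kthread_stop()/cgroup_exit() ordering caveat) was approximating by hand.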
