#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-#include <asm-generic/bitops/le.h>
#include "coalesced_mmio.h"
#include "async_pf.h"
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/
-DEFINE_SPINLOCK(kvm_lock);
+DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled;
int cpu;
mutex_lock(&vcpu->mutex);
+ if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
+ /* The thread running this VCPU changed. */
+ struct pid *oldpid = vcpu->pid;
+ struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
+ rcu_assign_pointer(vcpu->pid, newpid);
+ synchronize_rcu();
+ put_pid(oldpid);
+ }
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
zalloc_cpumask_var(&cpus, GFP_ATOMIC);
- raw_spin_lock(&kvm->requests_lock);
- me = smp_processor_id();
+ me = get_cpu();
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (kvm_make_check_request(req, vcpu))
- continue;
+ kvm_make_request(req, vcpu);
cpu = vcpu->cpu;
/* Set ->requests bit before we read ->mode */
smp_call_function_many(cpus, ack_flush, NULL, 1);
else
called = false;
- raw_spin_unlock(&kvm->requests_lock);
+ put_cpu();
free_cpumask_var(cpus);
return called;
}
vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
+ vcpu->pid = NULL;
init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
+ put_pid(vcpu->pid);
kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run);
}
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock);
- raw_spin_lock_init(&kvm->requests_lock);
kvm_eventfd_init(kvm);
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return kvm;
struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm);
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_del(&kvm->vm_list);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++)
kvm_io_bus_destroy(kvm->buses[i]);
return fault_pfn;
}
+static inline int check_user_page_hwpoison(unsigned long addr)
+{
+ int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
+
+ rc = __get_user_pages(current, current->mm, addr, 1,
+ flags, NULL, NULL, NULL);
+ return rc == -EHWPOISON;
+}
+
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable)
{
return get_fault_pfn();
down_read(¤t->mm->mmap_sem);
- if (is_hwpoison_address(addr)) {
+ if (check_user_page_hwpoison(addr)) {
up_read(¤t->mm->mmap_sem);
get_page(hwpoison_page);
return page_to_pfn(hwpoison_page);
if (memslot && memslot->dirty_bitmap) {
unsigned long rel_gfn = gfn - memslot->base_gfn;
- generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
+ __set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
}
}
EXPORT_SYMBOL_GPL(kvm_resched);
-void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
+void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
- ktime_t expires;
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
-
- /* Sleep for 100 us, and hope lock-holder got scheduled */
- expires = ktime_add_ns(ktime_get(), 100000UL);
- schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ struct kvm *kvm = me->kvm;
+ struct kvm_vcpu *vcpu;
+ int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
+ int yielded = 0;
+ int pass;
+ int i;
- finish_wait(&vcpu->wq, &wait);
+ /*
+ * We boost the priority of a VCPU that is runnable but not
+ * currently running, because it got preempted by something
+ * else and called schedule in __vcpu_run. Hopefully that
+ * VCPU is holding the lock that we need and will release it.
+ * We approximate round-robin by starting at the last boosted VCPU.
+ */
+ for (pass = 0; pass < 2 && !yielded; pass++) {
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct task_struct *task = NULL;
+ struct pid *pid;
+ if (!pass && i < last_boosted_vcpu) {
+ i = last_boosted_vcpu;
+ continue;
+ } else if (pass && i > last_boosted_vcpu)
+ break;
+ if (vcpu == me)
+ continue;
+ if (waitqueue_active(&vcpu->wq))
+ continue;
+ rcu_read_lock();
+ pid = rcu_dereference(vcpu->pid);
+ if (pid)
+ task = get_pid_task(vcpu->pid, PIDTYPE_PID);
+ rcu_read_unlock();
+ if (!task)
+ continue;
+ if (task->flags & PF_VCPU) {
+ put_task_struct(task);
+ continue;
+ }
+ if (yield_to(task, 1)) {
+ put_task_struct(task);
+ kvm->last_boosted_vcpu = i;
+ yielded = 1;
+ break;
+ }
+ put_task_struct(task);
+ }
+ }
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
static void hardware_enable(void *junk)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_enable_nolock(junk);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static void hardware_disable_nolock(void *junk)
static void hardware_disable(void *junk)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_disable_nolock(junk);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static void hardware_disable_all_nolock(void)
static void hardware_disable_all(void)
{
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
hardware_disable_all_nolock();
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
}
static int hardware_enable_all(void)
{
int r = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
kvm_usage_count++;
if (kvm_usage_count == 1) {
}
}
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return r;
}
struct kvm *kvm;
*val = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
*val += *(u32 *)((void *)kvm + offset);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return 0;
}
int i;
*val = 0;
- spin_lock(&kvm_lock);
+ raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
*val += *(u32 *)((void *)vcpu + offset);
- spin_unlock(&kvm_lock);
+ raw_spin_unlock(&kvm_lock);
return 0;
}
debugfs_remove(kvm_debugfs_dir);
}
-static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+static int kvm_suspend(void)
{
if (kvm_usage_count)
hardware_disable_nolock(NULL);
return 0;
}
-static int kvm_resume(struct sys_device *dev)
+static void kvm_resume(void)
{
if (kvm_usage_count) {
- WARN_ON(spin_is_locked(&kvm_lock));
+ WARN_ON(raw_spin_is_locked(&kvm_lock));
hardware_enable_nolock(NULL);
}
- return 0;
}
-static struct sysdev_class kvm_sysdev_class = {
- .name = "kvm",
+static struct syscore_ops kvm_syscore_ops = {
.suspend = kvm_suspend,
.resume = kvm_resume,
};
-static struct sys_device kvm_sysdev = {
- .id = 0,
- .cls = &kvm_sysdev_class,
-};
-
struct page *bad_page;
pfn_t bad_pfn;
goto out_free_2;
register_reboot_notifier(&kvm_reboot_notifier);
- r = sysdev_class_register(&kvm_sysdev_class);
- if (r)
- goto out_free_3;
-
- r = sysdev_register(&kvm_sysdev);
- if (r)
- goto out_free_4;
-
/* A kmem cache lets us meet the alignment requirements of fx_save. */
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
0, NULL);
if (!kvm_vcpu_cache) {
r = -ENOMEM;
- goto out_free_5;
+ goto out_free_3;
}
r = kvm_async_pf_init();
goto out_unreg;
}
+ register_syscore_ops(&kvm_syscore_ops);
+
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;
kvm_async_pf_deinit();
out_free:
kmem_cache_destroy(kvm_vcpu_cache);
-out_free_5:
- sysdev_unregister(&kvm_sysdev);
-out_free_4:
- sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
kvm_async_pf_deinit();
- sysdev_unregister(&kvm_sysdev);
- sysdev_class_unregister(&kvm_sysdev_class);
+ unregister_syscore_ops(&kvm_syscore_ops);
unregister_reboot_notifier(&kvm_reboot_notifier);
unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(hardware_disable_nolock, NULL, 1);