/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *      Gregory Haskins <ghaskins@novell.com>
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <trace/events/kvm.h>

#ifdef __KVM_HAVE_IOAPIC
#include "ioapic.h"
#endif
#include "iodev.h"

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * --------------------------------------------------------------------
 * irqfd: Allows an fd to be used to inject an interrupt to the guest
 *
 * Credit goes to Avi Kivity for the original idea.
 * --------------------------------------------------------------------
 */
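
/*
 * For orientation only: a minimal userspace sketch (not part of this file;
 * vm_fd, event_fd and gsi are placeholders) of how an irqfd is wired up
 * through the KVM_IRQFD ioctl on a VM file descriptor:
 *
 *      struct kvm_irqfd irqfd = {
 *              .fd  = event_fd,        // eventfd(2) file descriptor
 *              .gsi = gsi,             // guest interrupt line to drive
 *      };
 *      ioctl(vm_fd, KVM_IRQFD, &irqfd);
 *
 * A write of a non-zero count to event_fd then reaches irqfd_wakeup() below
 * and injects the interrupt without bouncing out to userspace.
 */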

/*
 * Resampling irqfds are a special variety of irqfds used to emulate
 * level triggered interrupts.  The interrupt is asserted on eventfd
 * trigger.  On acknowledgement through the irq ack notifier, the
 * interrupt is de-asserted and userspace is notified through the
 * resamplefd.  All resamplers on the same gsi are de-asserted
 * together, so we don't need to track the state of each individual
 * user.  We can also therefore share the same irq source ID.
 */
struct _irqfd_resampler {
        struct kvm *kvm;
        /*
         * List of resampling struct _irqfd objects sharing this gsi.
         * RCU list modified under kvm->irqfds.resampler_lock
         */
        struct list_head list;
        struct kvm_irq_ack_notifier notifier;
        /*
         * Entry in list of kvm->irqfds.resampler_list.  Used for sharing
         * resamplers among irqfds on the same gsi.
         * Accessed and modified under kvm->irqfds.resampler_lock
         */
        struct list_head link;
};

struct _irqfd {
        /* Used for MSI fast-path */
        struct kvm *kvm;
        wait_queue_t wait;
        /* Update side is protected by irqfds.lock */
        struct kvm_kernel_irq_routing_entry irq_entry;
        seqcount_t irq_entry_sc;
        /* Used for level IRQ fast-path */
        int gsi;
        struct work_struct inject;
        /* The resampler used by this irqfd (resampler-only) */
        struct _irqfd_resampler *resampler;
        /* Eventfd notified on resample (resampler-only) */
        struct eventfd_ctx *resamplefd;
        /* Entry in list of irqfds for a resampler (resampler-only) */
        struct list_head resampler_link;
        /* Used for setup/shutdown */
        struct eventfd_ctx *eventfd;
        struct list_head list;
        poll_table pt;
        struct work_struct shutdown;
};

static struct workqueue_struct *irqfd_cleanup_wq;

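/*
 * Work item that performs the actual injection once the eventfd fires.
 * A non-resampling irqfd pulses the GSI (assert followed by an immediate
 * de-assert); a resampling irqfd only asserts, leaving the de-assert to
 * irqfd_resampler_ack() on guest acknowledgement.
 */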
static void
irqfd_inject(struct work_struct *work)
{
        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
        struct kvm *kvm = irqfd->kvm;

        if (!irqfd->resampler) {
                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
                                false);
                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
                                false);
        } else
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            irqfd->gsi, 1, false);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
        struct _irqfd_resampler *resampler;
        struct kvm *kvm;
        struct _irqfd *irqfd;
        int idx;

        resampler = container_of(kian, struct _irqfd_resampler, notifier);
        kvm = resampler->kvm;

        kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                    resampler->notifier.gsi, 0, false);

        idx = srcu_read_lock(&kvm->irq_srcu);

        list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
                eventfd_signal(irqfd->resamplefd, 1);

        srcu_read_unlock(&kvm->irq_srcu, idx);
}

static void
irqfd_resampler_shutdown(struct _irqfd *irqfd)
{
        struct _irqfd_resampler *resampler = irqfd->resampler;
        struct kvm *kvm = resampler->kvm;

        mutex_lock(&kvm->irqfds.resampler_lock);

        list_del_rcu(&irqfd->resampler_link);
        synchronize_srcu(&kvm->irq_srcu);

        if (list_empty(&resampler->list)) {
                list_del(&resampler->link);
                kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            resampler->notifier.gsi, 0, false);
                kfree(resampler);
        }

        mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
        u64 cnt;

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work(&irqfd->inject);

        if (irqfd->resampler) {
                irqfd_resampler_shutdown(irqfd);
                eventfd_ctx_put(irqfd->resamplefd);
        }

        /*
         * It is now safe to release the object's resources
         */
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
{
        return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct _irqfd *irqfd)
{
        BUG_ON(!irqfd_is_active(irqfd));

        list_del_init(&irqfd->list);

        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
        struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
        unsigned long flags = (unsigned long)key;
        struct kvm_kernel_irq_routing_entry irq;
        struct kvm *kvm = irqfd->kvm;
        unsigned seq;
        int idx;

        if (flags & POLLIN) {
                idx = srcu_read_lock(&kvm->irq_srcu);
                do {
                        seq = read_seqcount_begin(&irqfd->irq_entry_sc);
                        irq = irqfd->irq_entry;
                } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
                /* An event has been signaled, inject an interrupt */
                if (irq.type == KVM_IRQ_ROUTING_MSI)
                        kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
                                        false);
                else
                        schedule_work(&irqfd->inject);
                srcu_read_unlock(&kvm->irq_srcu, idx);
        }

        if (flags & POLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long flags;

                spin_lock_irqsave(&kvm->irqfds.lock, flags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will cleanup for us.
                 * We cannot race against the irqfd going away since the
                 * other side is required to acquire wqh->lock, which we hold
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
        }

        return 0;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
                        poll_table *pt)
{
        struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
        add_wait_queue(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
{
        struct kvm_kernel_irq_routing_entry *e;
        struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
        int i, n_entries;

        n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

        write_seqcount_begin(&irqfd->irq_entry_sc);

        irqfd->irq_entry.type = 0;

        e = entries;
        for (i = 0; i < n_entries; ++i, ++e) {
                /* Only fast-path MSI. */
                if (e->type == KVM_IRQ_ROUTING_MSI)
                        irqfd->irq_entry = *e;
        }

        write_seqcount_end(&irqfd->irq_entry_sc);
}

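/*
 * Attach a new irqfd to the eventfd named by args->fd: take a reference on
 * the eventfd (and, for KVM_IRQFD_FLAG_RESAMPLE, on the resample eventfd,
 * creating or sharing a per-GSI resampler), hook our wakeup function into
 * the eventfd's wait queue, snapshot the current routing for the GSI and
 * add the irqfd to kvm->irqfds.items.  Returns -EBUSY if the eventfd is
 * already bound to another irqfd.
 */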
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct _irqfd *irqfd, *tmp;
        struct fd f;
        struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
        int ret;
        unsigned int events;
        int idx;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
        seqcount_init(&irqfd->irq_entry_sc);

        f = fdget(args->fd);
        if (!f.file) {
                ret = -EBADF;
                goto out;
        }

        eventfd = eventfd_ctx_fileget(f.file);
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto fail;
        }

        irqfd->eventfd = eventfd;

        if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
                struct _irqfd_resampler *resampler;

                resamplefd = eventfd_ctx_fdget(args->resamplefd);
                if (IS_ERR(resamplefd)) {
                        ret = PTR_ERR(resamplefd);
                        goto fail;
                }

                irqfd->resamplefd = resamplefd;
                INIT_LIST_HEAD(&irqfd->resampler_link);

                mutex_lock(&kvm->irqfds.resampler_lock);

                list_for_each_entry(resampler,
                                    &kvm->irqfds.resampler_list, link) {
                        if (resampler->notifier.gsi == irqfd->gsi) {
                                irqfd->resampler = resampler;
                                break;
                        }
                }

                if (!irqfd->resampler) {
                        resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
                        if (!resampler) {
                                ret = -ENOMEM;
                                mutex_unlock(&kvm->irqfds.resampler_lock);
                                goto fail;
                        }

                        resampler->kvm = kvm;
                        INIT_LIST_HEAD(&resampler->list);
                        resampler->notifier.gsi = irqfd->gsi;
                        resampler->notifier.irq_acked = irqfd_resampler_ack;
                        INIT_LIST_HEAD(&resampler->link);

                        list_add(&resampler->link, &kvm->irqfds.resampler_list);
                        kvm_register_irq_ack_notifier(kvm,
                                                      &resampler->notifier);
                        irqfd->resampler = resampler;
                }

                list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
                synchronize_srcu(&kvm->irq_srcu);

                mutex_unlock(&kvm->irqfds.resampler_lock);
        }

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_update(kvm, irqfd);
        srcu_read_unlock(&kvm->irq_srcu, idx);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        events = f.file->f_op->poll(f.file, &irqfd->pt);

        if (events & POLLIN)
                schedule_work(&irqfd->inject);

        /*
         * do not drop the file until the irqfd is fully initialized, otherwise
         * we might race against the POLLHUP
         */
        fdput(f);

        return 0;

fail:
        if (irqfd->resampler)
                irqfd_resampler_shutdown(irqfd);

        if (resamplefd && !IS_ERR(resamplefd))
                eventfd_ctx_put(resamplefd);

        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

        fdput(f);

out:
        kfree(irqfd);
        return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        struct kvm_irq_ack_notifier *kian;
        int gsi, idx;

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
                                         link)
                        if (kian->gsi == gsi) {
                                srcu_read_unlock(&kvm->irq_srcu, idx);
                                return true;
                        }

        srcu_read_unlock(&kvm->irq_srcu, idx);

        return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        struct kvm_irq_ack_notifier *kian;
        int gsi, idx;

        trace_kvm_ack_irq(irqchip, pin);

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
                                         link)
                        if (kian->gsi == gsi)
                                kian->irq_acked(kian);
        srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
        mutex_unlock(&kvm->irq_lock);
#ifdef __KVM_HAVE_IOAPIC
        kvm_vcpu_request_scan_ioapic(kvm);
#endif
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                    struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
        synchronize_srcu(&kvm->irq_srcu);
#ifdef __KVM_HAVE_IOAPIC
        kvm_vcpu_request_scan_ioapic(kvm);
#endif
}
#endif

void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQFD
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
        mutex_init(&kvm->irqfds.resampler_lock);
#endif
        INIT_LIST_HEAD(&kvm->ioeventfds);
}

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * Shut down any irqfds that match fd+gsi.
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct _irqfd *irqfd, *tmp;
        struct eventfd_ctx *eventfd;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
                        /*
                         * This clearing of irq_entry.type is needed for when
                         * another thread calls kvm_irq_routing_update before
                         * we flush workqueue below (we synchronize with
                         * kvm_irq_routing_update using irqfds.lock).
                         */
                        write_seqcount_begin(&irqfd->irq_entry_sc);
                        irqfd->irq_entry.type = 0;
                        write_seqcount_end(&irqfd->irq_entry_sc);
                        irqfd_deactivate(irqfd);
                }
        }

        spin_unlock_irq(&kvm->irqfds.lock);
        eventfd_ctx_put(eventfd);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * so that we guarantee there will not be any more interrupts on this
         * gsi once this deassign function returns.
         */
        flush_workqueue(irqfd_cleanup_wq);

        return 0;
}

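/*
 * Entry point for the KVM_IRQFD ioctl: reject unknown flags, then either
 * tear down an existing binding (KVM_IRQFD_FLAG_DEASSIGN) or establish a
 * new one.
 */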
int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                return -EINVAL;

        if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
                return kvm_irqfd_deassign(kvm, args);

        return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released.  Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
        struct _irqfd *irqfd, *tmp;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
                irqfd_deactivate(irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
        flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
        struct _irqfd *irqfd;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry(irqfd, &kvm->irqfds.items, list)
                irqfd_update(kvm, irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated single-thread
 * queue to prevent deadlock against flushing the normal work-queue.
 */
int kvm_irqfd_init(void)
{
        irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
        if (!irqfd_cleanup_wq)
                return -ENOMEM;

        return 0;
}

void kvm_irqfd_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */
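
/*
 * For orientation only: a minimal userspace sketch (not part of this file;
 * vm_fd, event_fd and doorbell_gpa are placeholders) of registering such a
 * doorbell through the KVM_IOEVENTFD ioctl:
 *
 *      struct kvm_ioeventfd ioeventfd = {
 *              .addr  = doorbell_gpa,  // guest physical (or PIO) address
 *              .len   = 4,             // 0, 1, 2, 4 or 8 bytes
 *              .fd    = event_fd,      // eventfd to signal on a hit
 *              .flags = 0,             // optionally KVM_IOEVENTFD_FLAG_PIO
 *                                      // and/or _FLAG_DATAMATCH
 *      };
 *      ioctl(vm_fd, KVM_IOEVENTFD, &ioeventfd);
 *
 * A matching guest write is then handled by ioeventfd_write() below and
 * only signals event_fd instead of exiting to userspace.
 */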

struct _ioeventfd {
        struct list_head     list;
        u64                  addr;
        int                  length;
        struct eventfd_ctx  *eventfd;
        u64                  datamatch;
        struct kvm_io_device dev;
        u8                   bus_idx;
        bool                 wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
        eventfd_ctx_put(p->eventfd);
        list_del(&p->list);
        kfree(p);
}

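/*
 * Decide whether a guest write of @len bytes of @val at @addr should fire
 * this ioeventfd: the address must match exactly, a zero-length registration
 * matches any access at that address, and otherwise the length (and, unless
 * the ioeventfd is a wildcard, the written value) must match as well.
 */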
static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
        u64 _val;

        if (addr != p->addr)
                /* address must be precise for a hit */
                return false;

        if (!p->length)
                /* length = 0 means only look at the address, so always a hit */
                return true;

        if (len != p->length)
                /* address-range must be precise for a hit */
                return false;

        if (p->wildcard)
                /* all else equal, wildcard is always a hit */
                return true;

        /* otherwise, we have to actually compare the data */

        BUG_ON(!IS_ALIGNED((unsigned long)val, len));

        switch (len) {
        case 1:
                _val = *(u8 *)val;
                break;
        case 2:
                _val = *(u16 *)val;
                break;
        case 4:
                _val = *(u32 *)val;
                break;
        case 8:
                _val = *(u64 *)val;
                break;
        default:
                return false;
        }

        return _val == p->datamatch ? true : false;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
                const void *val)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        if (!ioeventfd_in_range(p, addr, len, val))
                return -EOPNOTSUPP;

        eventfd_signal(p->eventfd, 1);
        return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
        .write      = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
        struct _ioeventfd *_p;

        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->bus_idx == p->bus_idx &&
                    _p->addr == p->addr &&
                    (!_p->length || !p->length ||
                     (_p->length == p->length &&
                      (_p->wildcard || p->wildcard ||
                       _p->datamatch == p->datamatch))))
                        return true;

        return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
        if (flags & KVM_IOEVENTFD_FLAG_PIO)
                return KVM_PIO_BUS;
        if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
                return KVM_VIRTIO_CCW_NOTIFY_BUS;
        return KVM_MMIO_BUS;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus              bus_idx;
        struct _ioeventfd        *p;
        struct eventfd_ctx       *eventfd;
        int                       ret;

        bus_idx = ioeventfd_bus_from_flags(args->flags);
        /* must be natural-word sized, or 0 to ignore length */
        switch (args->len) {
        case 0:
        case 1:
        case 2:
        case 4:
        case 8:
                break;
        default:
                return -EINVAL;
        }

        /* check for range overflow */
        if (args->addr + args->len < args->addr)
                return -EINVAL;

        /* check for extra flags that we don't understand */
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;

        /* ioeventfd with no length can't be combined with DATAMATCH */
        if (!args->len &&
            args->flags & (KVM_IOEVENTFD_FLAG_PIO |
                           KVM_IOEVENTFD_FLAG_DATAMATCH))
                return -EINVAL;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        p = kzalloc(sizeof(*p), GFP_KERNEL);
        if (!p) {
                ret = -ENOMEM;
                goto fail;
        }

        INIT_LIST_HEAD(&p->list);
        p->addr    = args->addr;
        p->bus_idx = bus_idx;
        p->length  = args->len;
        p->eventfd = eventfd;

        /* The datamatch feature is optional, otherwise this is a wildcard */
        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
                p->datamatch = args->datamatch;
        else
                p->wildcard = true;

        mutex_lock(&kvm->slots_lock);

        /* Verify that there isn't a match already */
        if (ioeventfd_check_collision(kvm, p)) {
                ret = -EEXIST;
                goto unlock_fail;
        }

        kvm_iodevice_init(&p->dev, &ioeventfd_ops);

        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
                                      &p->dev);
        if (ret < 0)
                goto unlock_fail;

        /* When length is ignored, MMIO is also put on a separate bus, for
         * faster lookups.
         */
        if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) {
                ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS,
                                              p->addr, 0, &p->dev);
                if (ret < 0)
                        goto register_fail;
        }

        kvm->buses[bus_idx]->ioeventfd_count++;
        list_add_tail(&p->list, &kvm->ioeventfds);

        mutex_unlock(&kvm->slots_lock);

        return 0;

register_fail:
        kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
unlock_fail:
        mutex_unlock(&kvm->slots_lock);

fail:
        kfree(p);
        eventfd_ctx_put(eventfd);

        return ret;
}

static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus              bus_idx;
        struct _ioeventfd        *p, *tmp;
        struct eventfd_ctx       *eventfd;
        int                       ret = -ENOENT;

        bus_idx = ioeventfd_bus_from_flags(args->flags);
        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
                bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

                if (p->bus_idx != bus_idx ||
                    p->eventfd != eventfd  ||
                    p->addr != args->addr  ||
                    p->length != args->len ||
                    p->wildcard != wildcard)
                        continue;

                if (!p->wildcard && p->datamatch != args->datamatch)
                        continue;

                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
                if (!p->length) {
                        kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS,
                                                  &p->dev);
                }
                kvm->buses[bus_idx]->ioeventfd_count--;
                ioeventfd_release(p);
                ret = 0;
                break;
        }

        mutex_unlock(&kvm->slots_lock);

        eventfd_ctx_put(eventfd);

        return ret;
}

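/*
 * Entry point for the KVM_IOEVENTFD ioctl: dispatch to deassign or assign
 * depending on KVM_IOEVENTFD_FLAG_DEASSIGN.
 */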
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
                return kvm_deassign_ioeventfd(kvm, args);

        return kvm_assign_ioeventfd(kvm, args);
}