/*
 * Captured from the CoCalc viewer page for eventfd.c.
 * GitHub repository: torvalds/linux
 * Path: blob/master/virt/kvm/eventfd.c
 * (26278 views at capture time)
 */
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
 * kvm eventfd support - use eventfd objects to signal various KVM events
4
 *
5
 * Copyright 2009 Novell.  All Rights Reserved.
6
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
7
 *
8
 * Author:
9
 *	Gregory Haskins <ghaskins@novell.com>
10
 */
11

12
#include <linux/kvm_host.h>
13
#include <linux/kvm.h>
14
#include <linux/kvm_irqfd.h>
15
#include <linux/workqueue.h>
16
#include <linux/syscalls.h>
17
#include <linux/wait.h>
18
#include <linux/poll.h>
19
#include <linux/file.h>
20
#include <linux/list.h>
21
#include <linux/eventfd.h>
22
#include <linux/kernel.h>
23
#include <linux/srcu.h>
24
#include <linux/slab.h>
25
#include <linux/seqlock.h>
26
#include <linux/irqbypass.h>
27
#include <trace/events/kvm.h>
28

29
#include <kvm/iodev.h>
30

31
#ifdef CONFIG_HAVE_KVM_IRQCHIP
32

33
/*
 * Workqueue on which irqfd shutdown work is queued (see irqfd_deactivate());
 * allocated in kvm_irqfd_init() and destroyed in kvm_irqfd_exit().
 */
static struct workqueue_struct *irqfd_cleanup_wq;
34

35
bool __attribute__((weak))
36
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
37
{
38
	return true;
39
}
40

41
static void
42
irqfd_inject(struct work_struct *work)
43
{
44
	struct kvm_kernel_irqfd *irqfd =
45
		container_of(work, struct kvm_kernel_irqfd, inject);
46
	struct kvm *kvm = irqfd->kvm;
47

48
	if (!irqfd->resampler) {
49
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
50
				false);
51
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
52
				false);
53
	} else
54
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
55
			    irqfd->gsi, 1, false);
56
}
57

58
static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
59
{
60
	struct kvm_kernel_irqfd *irqfd;
61

62
	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
63
				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
64
		eventfd_signal(irqfd->resamplefd);
65
}
66

67
/*
68
 * Since resampler irqfds share an IRQ source ID, we de-assert once
69
 * then notify all of the resampler irqfds using this GSI.  We can't
70
 * do multiple de-asserts or we risk racing with incoming re-asserts.
71
 */
72
static void
73
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
74
{
75
	struct kvm_kernel_irqfd_resampler *resampler;
76
	struct kvm *kvm;
77
	int idx;
78

79
	resampler = container_of(kian,
80
			struct kvm_kernel_irqfd_resampler, notifier);
81
	kvm = resampler->kvm;
82

83
	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
84
		    resampler->notifier.gsi, 0, false);
85

86
	idx = srcu_read_lock(&kvm->irq_srcu);
87
	irqfd_resampler_notify(resampler);
88
	srcu_read_unlock(&kvm->irq_srcu, idx);
89
}
90

91
static void
92
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
93
{
94
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
95
	struct kvm *kvm = resampler->kvm;
96

97
	mutex_lock(&kvm->irqfds.resampler_lock);
98

99
	list_del_rcu(&irqfd->resampler_link);
100

101
	if (list_empty(&resampler->list)) {
102
		list_del_rcu(&resampler->link);
103
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
104
		/*
105
		 * synchronize_srcu_expedited(&kvm->irq_srcu) already called
106
		 * in kvm_unregister_irq_ack_notifier().
107
		 */
108
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
109
			    resampler->notifier.gsi, 0, false);
110
		kfree(resampler);
111
	} else {
112
		synchronize_srcu_expedited(&kvm->irq_srcu);
113
	}
114

115
	mutex_unlock(&kvm->irqfds.resampler_lock);
116
}
117

118
/*
119
 * Race-free decouple logic (ordering is critical)
120
 */
121
static void
122
irqfd_shutdown(struct work_struct *work)
123
{
124
	struct kvm_kernel_irqfd *irqfd =
125
		container_of(work, struct kvm_kernel_irqfd, shutdown);
126
	struct kvm *kvm = irqfd->kvm;
127
	u64 cnt;
128

129
	/* Make sure irqfd has been initialized in assign path. */
130
	synchronize_srcu_expedited(&kvm->irq_srcu);
131

132
	/*
133
	 * Synchronize with the wait-queue and unhook ourselves to prevent
134
	 * further events.
135
	 */
136
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);
137

138
	/*
139
	 * We know no new events will be scheduled at this point, so block
140
	 * until all previously outstanding events have completed
141
	 */
142
	flush_work(&irqfd->inject);
143

144
	if (irqfd->resampler) {
145
		irqfd_resampler_shutdown(irqfd);
146
		eventfd_ctx_put(irqfd->resamplefd);
147
	}
148

149
	/*
150
	 * It is now safe to release the object's resources
151
	 */
152
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
153
	irq_bypass_unregister_consumer(&irqfd->consumer);
154
#endif
155
	eventfd_ctx_put(irqfd->eventfd);
156
	kfree(irqfd);
157
}
158

159

160
/* assumes kvm->irqfds.lock is held */
161
static bool
162
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
163
{
164
	return list_empty(&irqfd->list) ? false : true;
165
}
166

167
/*
168
 * Mark the irqfd as inactive and schedule it for removal
169
 *
170
 * assumes kvm->irqfds.lock is held
171
 */
172
static void
173
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
174
{
175
	BUG_ON(!irqfd_is_active(irqfd));
176

177
	list_del_init(&irqfd->list);
178

179
	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
180
}
181

182
int __attribute__((weak)) kvm_arch_set_irq_inatomic(
183
				struct kvm_kernel_irq_routing_entry *irq,
184
				struct kvm *kvm, int irq_source_id,
185
				int level,
186
				bool line_status)
187
{
188
	return -EWOULDBLOCK;
189
}
190

191
/*
192
 * Called with wqh->lock held and interrupts disabled
193
 */
194
static int
195
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
196
{
197
	struct kvm_kernel_irqfd *irqfd =
198
		container_of(wait, struct kvm_kernel_irqfd, wait);
199
	__poll_t flags = key_to_poll(key);
200
	struct kvm_kernel_irq_routing_entry irq;
201
	struct kvm *kvm = irqfd->kvm;
202
	unsigned seq;
203
	int idx;
204
	int ret = 0;
205

206
	if (flags & EPOLLIN) {
207
		/*
208
		 * WARNING: Do NOT take irqfds.lock in any path except EPOLLHUP,
209
		 * as KVM holds irqfds.lock when registering the irqfd with the
210
		 * eventfd.
211
		 */
212
		u64 cnt;
213
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);
214

215
		idx = srcu_read_lock(&kvm->irq_srcu);
216
		do {
217
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
218
			irq = irqfd->irq_entry;
219
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
220
		/* An event has been signaled, inject an interrupt */
221
		if (kvm_arch_set_irq_inatomic(&irq, kvm,
222
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
223
					      false) == -EWOULDBLOCK)
224
			schedule_work(&irqfd->inject);
225
		srcu_read_unlock(&kvm->irq_srcu, idx);
226
		ret = 1;
227
	}
228

229
	if (flags & EPOLLHUP) {
230
		/* The eventfd is closing, detach from KVM */
231
		unsigned long iflags;
232

233
		/*
234
		 * Taking irqfds.lock is safe here, as KVM holds a reference to
235
		 * the eventfd when registering the irqfd, i.e. this path can't
236
		 * be reached while kvm_irqfd_add() is running.
237
		 */
238
		spin_lock_irqsave(&kvm->irqfds.lock, iflags);
239

240
		/*
241
		 * We must check if someone deactivated the irqfd before
242
		 * we could acquire the irqfds.lock since the item is
243
		 * deactivated from the KVM side before it is unhooked from
244
		 * the wait-queue.  If it is already deactivated, we can
245
		 * simply return knowing the other side will cleanup for us.
246
		 * We cannot race against the irqfd going away since the
247
		 * other side is required to acquire wqh->lock, which we hold
248
		 */
249
		if (irqfd_is_active(irqfd))
250
			irqfd_deactivate(irqfd);
251

252
		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
253
	}
254

255
	return ret;
256
}
257

258
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
259
{
260
	struct kvm_kernel_irq_routing_entry *e;
261
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
262
	int n_entries;
263

264
	lockdep_assert_held(&kvm->irqfds.lock);
265

266
	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
267

268
	write_seqcount_begin(&irqfd->irq_entry_sc);
269

270
	e = entries;
271
	if (n_entries == 1)
272
		irqfd->irq_entry = *e;
273
	else
274
		irqfd->irq_entry.type = 0;
275

276
	write_seqcount_end(&irqfd->irq_entry_sc);
277
}
278

279
struct kvm_irqfd_pt {
280
	struct kvm_kernel_irqfd *irqfd;
281
	struct kvm *kvm;
282
	poll_table pt;
283
	int ret;
284
};
285

286
static void kvm_irqfd_register(struct file *file, wait_queue_head_t *wqh,
287
			       poll_table *pt)
288
{
289
	struct kvm_irqfd_pt *p = container_of(pt, struct kvm_irqfd_pt, pt);
290
	struct kvm_kernel_irqfd *irqfd = p->irqfd;
291
	struct kvm *kvm = p->kvm;
292

293
	/*
294
	 * Note, irqfds.lock protects the irqfd's irq_entry, i.e. its routing,
295
	 * and irqfds.items.  It does NOT protect registering with the eventfd.
296
	 */
297
	spin_lock_irq(&kvm->irqfds.lock);
298

299
	/*
300
	 * Initialize the routing information prior to adding the irqfd to the
301
	 * eventfd's waitqueue, as irqfd_wakeup() can be invoked as soon as the
302
	 * irqfd is registered.
303
	 */
304
	irqfd_update(kvm, irqfd);
305

306
	/*
307
	 * Add the irqfd as a priority waiter on the eventfd, with a custom
308
	 * wake-up handler, so that KVM *and only KVM* is notified whenever the
309
	 * underlying eventfd is signaled.
310
	 */
311
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
312

313
	/*
314
	 * Temporarily lie to lockdep about holding irqfds.lock to avoid a
315
	 * false positive regarding potential deadlock with irqfd_wakeup()
316
	 * (see irqfd_wakeup() for details).
317
	 *
318
	 * Adding to the wait queue will fail if there is already a priority
319
	 * waiter, i.e. if the eventfd is associated with another irqfd (in any
320
	 * VM).  Note, kvm_irqfd_deassign() waits for all in-flight shutdown
321
	 * jobs to complete, i.e. ensures the irqfd has been removed from the
322
	 * eventfd's waitqueue before returning to userspace.
323
	 */
324
	spin_release(&kvm->irqfds.lock.dep_map, _RET_IP_);
325
	p->ret = add_wait_queue_priority_exclusive(wqh, &irqfd->wait);
326
	spin_acquire(&kvm->irqfds.lock.dep_map, 0, 0, _RET_IP_);
327
	if (p->ret)
328
		goto out;
329

330
	list_add_tail(&irqfd->list, &kvm->irqfds.items);
331

332
out:
333
	spin_unlock_irq(&kvm->irqfds.lock);
334
}
335

336
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
337
/*
 * Weak no-op default; installed as the irq bypass consumer's ->stop callback
 * by kvm_irqfd_assign().
 */
void __attribute__((weak))
kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
{
}
341

342
/*
 * Weak no-op default; installed as the irq bypass consumer's ->start callback
 * by kvm_irqfd_assign().
 */
void __attribute__((weak))
kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
{
}
346

347
/*
 * Weak no-op default.  kvm_irq_routing_update() calls this hook for irqfds
 * that have a producer, passing the routing entry from before (@old) and
 * after (@new) the update.
 */
void __weak
kvm_arch_update_irqfd_routing(struct kvm_kernel_irqfd *irqfd,
			      struct kvm_kernel_irq_routing_entry *old,
			      struct kvm_kernel_irq_routing_entry *new)
{
}
353
#endif
354

355
static int
356
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
357
{
358
	struct kvm_kernel_irqfd *irqfd;
359
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
360
	struct kvm_irqfd_pt irqfd_pt;
361
	int ret;
362
	__poll_t events;
363
	int idx;
364

365
	if (!kvm_arch_intc_initialized(kvm))
366
		return -EAGAIN;
367

368
	if (!kvm_arch_irqfd_allowed(kvm, args))
369
		return -EINVAL;
370

371
	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
372
	if (!irqfd)
373
		return -ENOMEM;
374

375
	irqfd->kvm = kvm;
376
	irqfd->gsi = args->gsi;
377
	INIT_LIST_HEAD(&irqfd->list);
378
	INIT_WORK(&irqfd->inject, irqfd_inject);
379
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
380
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
381

382
	CLASS(fd, f)(args->fd);
383
	if (fd_empty(f)) {
384
		ret = -EBADF;
385
		goto out;
386
	}
387

388
	eventfd = eventfd_ctx_fileget(fd_file(f));
389
	if (IS_ERR(eventfd)) {
390
		ret = PTR_ERR(eventfd);
391
		goto out;
392
	}
393

394
	irqfd->eventfd = eventfd;
395

396
	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
397
		struct kvm_kernel_irqfd_resampler *resampler;
398

399
		resamplefd = eventfd_ctx_fdget(args->resamplefd);
400
		if (IS_ERR(resamplefd)) {
401
			ret = PTR_ERR(resamplefd);
402
			goto fail;
403
		}
404

405
		irqfd->resamplefd = resamplefd;
406
		INIT_LIST_HEAD(&irqfd->resampler_link);
407

408
		mutex_lock(&kvm->irqfds.resampler_lock);
409

410
		list_for_each_entry(resampler,
411
				    &kvm->irqfds.resampler_list, link) {
412
			if (resampler->notifier.gsi == irqfd->gsi) {
413
				irqfd->resampler = resampler;
414
				break;
415
			}
416
		}
417

418
		if (!irqfd->resampler) {
419
			resampler = kzalloc(sizeof(*resampler),
420
					    GFP_KERNEL_ACCOUNT);
421
			if (!resampler) {
422
				ret = -ENOMEM;
423
				mutex_unlock(&kvm->irqfds.resampler_lock);
424
				goto fail;
425
			}
426

427
			resampler->kvm = kvm;
428
			INIT_LIST_HEAD(&resampler->list);
429
			resampler->notifier.gsi = irqfd->gsi;
430
			resampler->notifier.irq_acked = irqfd_resampler_ack;
431
			INIT_LIST_HEAD(&resampler->link);
432

433
			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
434
			kvm_register_irq_ack_notifier(kvm,
435
						      &resampler->notifier);
436
			irqfd->resampler = resampler;
437
		}
438

439
		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
440
		synchronize_srcu_expedited(&kvm->irq_srcu);
441

442
		mutex_unlock(&kvm->irqfds.resampler_lock);
443
	}
444

445
	/*
446
	 * Set the irqfd routing and add it to KVM's list before registering
447
	 * the irqfd with the eventfd, so that the routing information is valid
448
	 * and stays valid, e.g. if there are GSI routing changes, prior to
449
	 * making the irqfd visible, i.e. before it might be signaled.
450
	 *
451
	 * Note, holding SRCU ensures a stable read of routing information, and
452
	 * also prevents irqfd_shutdown() from freeing the irqfd before it's
453
	 * fully initialized.
454
	 */
455
	idx = srcu_read_lock(&kvm->irq_srcu);
456

457
	/*
458
	 * Register the irqfd with the eventfd by polling on the eventfd, and
459
	 * simultaneously and the irqfd to KVM's list.  If there was en event
460
	 * pending on the eventfd prior to registering, manually trigger IRQ
461
	 * injection.
462
	 */
463
	irqfd_pt.irqfd = irqfd;
464
	irqfd_pt.kvm = kvm;
465
	init_poll_funcptr(&irqfd_pt.pt, kvm_irqfd_register);
466

467
	events = vfs_poll(fd_file(f), &irqfd_pt.pt);
468

469
	ret = irqfd_pt.ret;
470
	if (ret)
471
		goto fail_poll;
472

473
	if (events & EPOLLIN)
474
		schedule_work(&irqfd->inject);
475

476
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
477
	if (kvm_arch_has_irq_bypass()) {
478
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
479
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
480
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
481
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
482
		ret = irq_bypass_register_consumer(&irqfd->consumer, irqfd->eventfd);
483
		if (ret)
484
			pr_info("irq bypass consumer (eventfd %p) registration fails: %d\n",
485
				irqfd->eventfd, ret);
486
	}
487
#endif
488

489
	srcu_read_unlock(&kvm->irq_srcu, idx);
490
	return 0;
491

492
fail_poll:
493
	srcu_read_unlock(&kvm->irq_srcu, idx);
494
fail:
495
	if (irqfd->resampler)
496
		irqfd_resampler_shutdown(irqfd);
497

498
	if (resamplefd && !IS_ERR(resamplefd))
499
		eventfd_ctx_put(resamplefd);
500

501
	if (eventfd && !IS_ERR(eventfd))
502
		eventfd_ctx_put(eventfd);
503

504
out:
505
	kfree(irqfd);
506
	return ret;
507
}
508

509
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
510
{
511
	struct kvm_irq_ack_notifier *kian;
512
	int gsi, idx;
513

514
	idx = srcu_read_lock(&kvm->irq_srcu);
515
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
516
	if (gsi != -1)
517
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
518
					  link, srcu_read_lock_held(&kvm->irq_srcu))
519
			if (kian->gsi == gsi) {
520
				srcu_read_unlock(&kvm->irq_srcu, idx);
521
				return true;
522
			}
523

524
	srcu_read_unlock(&kvm->irq_srcu, idx);
525

526
	return false;
527
}
528
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
529

530
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
531
{
532
	struct kvm_irq_ack_notifier *kian;
533

534
	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
535
				  link, srcu_read_lock_held(&kvm->irq_srcu))
536
		if (kian->gsi == gsi)
537
			kian->irq_acked(kian);
538
}
539

540
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
541
{
542
	int gsi, idx;
543

544
	trace_kvm_ack_irq(irqchip, pin);
545

546
	idx = srcu_read_lock(&kvm->irq_srcu);
547
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
548
	if (gsi != -1)
549
		kvm_notify_acked_gsi(kvm, gsi);
550
	srcu_read_unlock(&kvm->irq_srcu, idx);
551
}
552

553
void kvm_register_irq_ack_notifier(struct kvm *kvm,
554
				   struct kvm_irq_ack_notifier *kian)
555
{
556
	mutex_lock(&kvm->irq_lock);
557
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
558
	mutex_unlock(&kvm->irq_lock);
559
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
560
}
561

562
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
563
				    struct kvm_irq_ack_notifier *kian)
564
{
565
	mutex_lock(&kvm->irq_lock);
566
	hlist_del_init_rcu(&kian->link);
567
	mutex_unlock(&kvm->irq_lock);
568
	synchronize_srcu_expedited(&kvm->irq_srcu);
569
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
570
}
571

572
/*
573
 * shutdown any irqfd's that match fd+gsi
574
 */
575
static int
576
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
577
{
578
	struct kvm_kernel_irqfd *irqfd, *tmp;
579
	struct eventfd_ctx *eventfd;
580

581
	eventfd = eventfd_ctx_fdget(args->fd);
582
	if (IS_ERR(eventfd))
583
		return PTR_ERR(eventfd);
584

585
	spin_lock_irq(&kvm->irqfds.lock);
586

587
	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
588
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
589
			/*
590
			 * This clearing of irq_entry.type is needed for when
591
			 * another thread calls kvm_irq_routing_update before
592
			 * we flush workqueue below (we synchronize with
593
			 * kvm_irq_routing_update using irqfds.lock).
594
			 */
595
			write_seqcount_begin(&irqfd->irq_entry_sc);
596
			irqfd->irq_entry.type = 0;
597
			write_seqcount_end(&irqfd->irq_entry_sc);
598
			irqfd_deactivate(irqfd);
599
		}
600
	}
601

602
	spin_unlock_irq(&kvm->irqfds.lock);
603
	eventfd_ctx_put(eventfd);
604

605
	/*
606
	 * Block until we know all outstanding shutdown jobs have completed
607
	 * so that we guarantee there will not be any more interrupts on this
608
	 * gsi once this deassign function returns.
609
	 */
610
	flush_workqueue(irqfd_cleanup_wq);
611

612
	return 0;
613
}
614

615
int
616
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
617
{
618
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
619
		return -EINVAL;
620

621
	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
622
		return kvm_irqfd_deassign(kvm, args);
623

624
	return kvm_irqfd_assign(kvm, args);
625
}
626

627
/*
628
 * This function is called as the kvm VM fd is being released. Shutdown all
629
 * irqfds that still remain open
630
 */
631
void
632
kvm_irqfd_release(struct kvm *kvm)
633
{
634
	struct kvm_kernel_irqfd *irqfd, *tmp;
635

636
	spin_lock_irq(&kvm->irqfds.lock);
637

638
	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
639
		irqfd_deactivate(irqfd);
640

641
	spin_unlock_irq(&kvm->irqfds.lock);
642

643
	/*
644
	 * Block until we know all outstanding shutdown jobs have completed
645
	 * since we do not take a kvm* reference.
646
	 */
647
	flush_workqueue(irqfd_cleanup_wq);
648

649
}
650

651
/*
652
 * Take note of a change in irq routing.
653
 * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
654
 */
655
void kvm_irq_routing_update(struct kvm *kvm)
656
{
657
	struct kvm_kernel_irqfd *irqfd;
658

659
	spin_lock_irq(&kvm->irqfds.lock);
660

661
	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
662
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
663
		/* Under irqfds.lock, so can read irq_entry safely */
664
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
665
#endif
666

667
		irqfd_update(kvm, irqfd);
668

669
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
670
		if (irqfd->producer)
671
			kvm_arch_update_irqfd_routing(irqfd, &old, &irqfd->irq_entry);
672
#endif
673
	}
674

675
	spin_unlock_irq(&kvm->irqfds.lock);
676
}
677

678
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
679
				unsigned int irqchip,
680
				unsigned int pin)
681
{
682
	struct kvm_kernel_irqfd_resampler *resampler;
683
	int gsi, idx;
684

685
	idx = srcu_read_lock(&kvm->irq_srcu);
686
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
687
	if (gsi != -1) {
688
		list_for_each_entry_srcu(resampler,
689
					 &kvm->irqfds.resampler_list, link,
690
					 srcu_read_lock_held(&kvm->irq_srcu)) {
691
			if (resampler->notifier.gsi == gsi) {
692
				irqfd_resampler_notify(resampler);
693
				srcu_read_unlock(&kvm->irq_srcu, idx);
694
				return true;
695
			}
696
		}
697
	}
698
	srcu_read_unlock(&kvm->irq_srcu, idx);
699

700
	return false;
701
}
702

703
/*
704
 * create a host-wide workqueue for issuing deferred shutdown requests
705
 * aggregated from all vm* instances. We need our own isolated
706
 * queue to ease flushing work items when a VM exits.
707
 */
708
int kvm_irqfd_init(void)
709
{
710
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
711
	if (!irqfd_cleanup_wq)
712
		return -ENOMEM;
713

714
	return 0;
715
}
716

717
void kvm_irqfd_exit(void)
718
{
719
	destroy_workqueue(irqfd_cleanup_wq);
720
}
721
#endif
722

723
/*
724
 * --------------------------------------------------------------------
725
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
726
 *
727
 * userspace can register a PIO/MMIO address with an eventfd for receiving
728
 * notification when the memory has been touched.
729
 * --------------------------------------------------------------------
730
 */
731

732
struct _ioeventfd {
733
	struct list_head     list;
734
	u64                  addr;
735
	int                  length;
736
	struct eventfd_ctx  *eventfd;
737
	u64                  datamatch;
738
	struct kvm_io_device dev;
739
	u8                   bus_idx;
740
	bool                 wildcard;
741
};
742

743
static inline struct _ioeventfd *
744
to_ioeventfd(struct kvm_io_device *dev)
745
{
746
	return container_of(dev, struct _ioeventfd, dev);
747
}
748

749
static void
750
ioeventfd_release(struct _ioeventfd *p)
751
{
752
	eventfd_ctx_put(p->eventfd);
753
	list_del(&p->list);
754
	kfree(p);
755
}
756

757
static bool
758
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
759
{
760
	u64 _val;
761

762
	if (addr != p->addr)
763
		/* address must be precise for a hit */
764
		return false;
765

766
	if (!p->length)
767
		/* length = 0 means only look at the address, so always a hit */
768
		return true;
769

770
	if (len != p->length)
771
		/* address-range must be precise for a hit */
772
		return false;
773

774
	if (p->wildcard)
775
		/* all else equal, wildcard is always a hit */
776
		return true;
777

778
	/* otherwise, we have to actually compare the data */
779

780
	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
781

782
	switch (len) {
783
	case 1:
784
		_val = *(u8 *)val;
785
		break;
786
	case 2:
787
		_val = *(u16 *)val;
788
		break;
789
	case 4:
790
		_val = *(u32 *)val;
791
		break;
792
	case 8:
793
		_val = *(u64 *)val;
794
		break;
795
	default:
796
		return false;
797
	}
798

799
	return _val == p->datamatch;
800
}
801

802
/* MMIO/PIO writes trigger an event if the addr/val match */
803
static int
804
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
805
		int len, const void *val)
806
{
807
	struct _ioeventfd *p = to_ioeventfd(this);
808

809
	if (!ioeventfd_in_range(p, addr, len, val))
810
		return -EOPNOTSUPP;
811

812
	eventfd_signal(p->eventfd);
813
	return 0;
814
}
815

816
/*
 * ->destructor handler of the ioeventfd device.  Called as KVM is shutting
 * down completely, so locking is not a concern here: just release what we
 * have as quickly as possible.
 */
static void ioeventfd_destructor(struct kvm_io_device *this)
{
	ioeventfd_release(to_ioeventfd(this));
}
827

828
static const struct kvm_io_device_ops ioeventfd_ops = {
829
	.write      = ioeventfd_write,
830
	.destructor = ioeventfd_destructor,
831
};
832

833
/* assumes kvm->slots_lock held */
834
static bool
835
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
836
{
837
	struct _ioeventfd *_p;
838

839
	list_for_each_entry(_p, &kvm->ioeventfds, list)
840
		if (_p->bus_idx == p->bus_idx &&
841
		    _p->addr == p->addr &&
842
		    (!_p->length || !p->length ||
843
		     (_p->length == p->length &&
844
		      (_p->wildcard || p->wildcard ||
845
		       _p->datamatch == p->datamatch))))
846
			return true;
847

848
	return false;
849
}
850

851
/*
 * Pick the I/O bus selected by the ioeventfd @flags.  The PIO flag is checked
 * first, then the virtio-ccw notify flag; MMIO is the default.
 */
static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
	enum kvm_bus bus = KVM_MMIO_BUS;

	if (flags & KVM_IOEVENTFD_FLAG_PIO)
		bus = KVM_PIO_BUS;
	else if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
		bus = KVM_VIRTIO_CCW_NOTIFY_BUS;

	return bus;
}
859

860
static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
861
				enum kvm_bus bus_idx,
862
				struct kvm_ioeventfd *args)
863
{
864

865
	struct eventfd_ctx *eventfd;
866
	struct _ioeventfd *p;
867
	int ret;
868

869
	eventfd = eventfd_ctx_fdget(args->fd);
870
	if (IS_ERR(eventfd))
871
		return PTR_ERR(eventfd);
872

873
	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
874
	if (!p) {
875
		ret = -ENOMEM;
876
		goto fail;
877
	}
878

879
	INIT_LIST_HEAD(&p->list);
880
	p->addr    = args->addr;
881
	p->bus_idx = bus_idx;
882
	p->length  = args->len;
883
	p->eventfd = eventfd;
884

885
	/* The datamatch feature is optional, otherwise this is a wildcard */
886
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
887
		p->datamatch = args->datamatch;
888
	else
889
		p->wildcard = true;
890

891
	mutex_lock(&kvm->slots_lock);
892

893
	/* Verify that there isn't a match already */
894
	if (ioeventfd_check_collision(kvm, p)) {
895
		ret = -EEXIST;
896
		goto unlock_fail;
897
	}
898

899
	kvm_iodevice_init(&p->dev, &ioeventfd_ops);
900

901
	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
902
				      &p->dev);
903
	if (ret < 0)
904
		goto unlock_fail;
905

906
	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
907
	list_add_tail(&p->list, &kvm->ioeventfds);
908

909
	mutex_unlock(&kvm->slots_lock);
910

911
	return 0;
912

913
unlock_fail:
914
	mutex_unlock(&kvm->slots_lock);
915
	kfree(p);
916

917
fail:
918
	eventfd_ctx_put(eventfd);
919

920
	return ret;
921
}
922

923
static int
924
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
925
			   struct kvm_ioeventfd *args)
926
{
927
	struct _ioeventfd        *p;
928
	struct eventfd_ctx       *eventfd;
929
	struct kvm_io_bus	 *bus;
930
	int                       ret = -ENOENT;
931
	bool                      wildcard;
932

933
	eventfd = eventfd_ctx_fdget(args->fd);
934
	if (IS_ERR(eventfd))
935
		return PTR_ERR(eventfd);
936

937
	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
938

939
	mutex_lock(&kvm->slots_lock);
940

941
	list_for_each_entry(p, &kvm->ioeventfds, list) {
942
		if (p->bus_idx != bus_idx ||
943
		    p->eventfd != eventfd  ||
944
		    p->addr != args->addr  ||
945
		    p->length != args->len ||
946
		    p->wildcard != wildcard)
947
			continue;
948

949
		if (!p->wildcard && p->datamatch != args->datamatch)
950
			continue;
951

952
		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
953
		bus = kvm_get_bus(kvm, bus_idx);
954
		if (bus)
955
			bus->ioeventfd_count--;
956
		ret = 0;
957
		break;
958
	}
959

960
	mutex_unlock(&kvm->slots_lock);
961

962
	eventfd_ctx_put(eventfd);
963

964
	return ret;
965
}
966

967
static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
968
{
969
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
970
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
971

972
	if (!args->len && bus_idx == KVM_MMIO_BUS)
973
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
974

975
	return ret;
976
}
977

978
static int
979
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
980
{
981
	enum kvm_bus              bus_idx;
982
	int ret;
983

984
	bus_idx = ioeventfd_bus_from_flags(args->flags);
985
	/* must be natural-word sized, or 0 to ignore length */
986
	switch (args->len) {
987
	case 0:
988
	case 1:
989
	case 2:
990
	case 4:
991
	case 8:
992
		break;
993
	default:
994
		return -EINVAL;
995
	}
996

997
	/* check for range overflow */
998
	if (args->addr + args->len < args->addr)
999
		return -EINVAL;
1000

1001
	/* check for extra flags that we don't understand */
1002
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
1003
		return -EINVAL;
1004

1005
	/* ioeventfd with no length can't be combined with DATAMATCH */
1006
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
1007
		return -EINVAL;
1008

1009
	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
1010
	if (ret)
1011
		goto fail;
1012

1013
	/* When length is ignored, MMIO is also put on a separate bus, for
1014
	 * faster lookups.
1015
	 */
1016
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
1017
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
1018
		if (ret < 0)
1019
			goto fast_fail;
1020
	}
1021

1022
	return 0;
1023

1024
fast_fail:
1025
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
1026
fail:
1027
	return ret;
1028
}
1029

1030
int
1031
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
1032
{
1033
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
1034
		return kvm_deassign_ioeventfd(kvm, args);
1035

1036
	return kvm_assign_ioeventfd(kvm, args);
1037
}
1038

1039
void
1040
kvm_eventfd_init(struct kvm *kvm)
1041
{
1042
#ifdef CONFIG_HAVE_KVM_IRQCHIP
1043
	spin_lock_init(&kvm->irqfds.lock);
1044
	INIT_LIST_HEAD(&kvm->irqfds.items);
1045
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
1046
	mutex_init(&kvm->irqfds.resampler_lock);
1047
#endif
1048
	INIT_LIST_HEAD(&kvm->ioeventfds);
1049
}
1050

1051
/* Viewer page footer navigation: Product | Resources | Company */