CoCalc -- kern

GitHub Repository: freebsd/freebsd-src
Path: blob/main/sys/kern/kern_event.c
³⁹⁴⁷⁵ views
1
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <[email protected]>
 * Copyright 2004 John-Mark Gurney <[email protected]>
 * Copyright (c) 2009 Apple, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
30

31
#include <sys/cdefs.h>
32
#include "opt_ktrace.h"
33
#include "opt_kqueue.h"
34

35
#ifdef COMPAT_FREEBSD11
36
#define	_WANT_FREEBSD11_KEVENT
37
#endif
38

39
#include <sys/param.h>
40
#include <sys/systm.h>
41
#include <sys/capsicum.h>
42
#include <sys/kernel.h>
43
#include <sys/limits.h>
44
#include <sys/lock.h>
45
#include <sys/mutex.h>
46
#include <sys/proc.h>
47
#include <sys/malloc.h>
48
#include <sys/unistd.h>
49
#include <sys/file.h>
50
#include <sys/filedesc.h>
51
#include <sys/filio.h>
52
#include <sys/fcntl.h>
53
#include <sys/jail.h>
54
#include <sys/jaildesc.h>
55
#include <sys/kthread.h>
56
#include <sys/selinfo.h>
57
#include <sys/queue.h>
58
#include <sys/event.h>
59
#include <sys/eventvar.h>
60
#include <sys/poll.h>
61
#include <sys/protosw.h>
62
#include <sys/resourcevar.h>
63
#include <sys/sbuf.h>
64
#include <sys/sigio.h>
65
#include <sys/signalvar.h>
66
#include <sys/socket.h>
67
#include <sys/socketvar.h>
68
#include <sys/stat.h>
69
#include <sys/sysctl.h>
70
#include <sys/sysent.h>
71
#include <sys/sysproto.h>
72
#include <sys/syscallsubr.h>
73
#include <sys/taskqueue.h>
74
#include <sys/uio.h>
75
#include <sys/user.h>
76
#ifdef KTRACE
77
#include <sys/ktrace.h>
78
#endif
79
#include <machine/atomic.h>
80
#ifdef COMPAT_FREEBSD32
81
#include <compat/freebsd32/freebsd32.h>
82
#include <compat/freebsd32/freebsd32_util.h>
83
#endif
84

85
#include <vm/uma.h>
86

87
static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
88

89
/*
90
 * This lock is used if multiple kq locks are required.  This possibly
91
 * should be made into a per proc lock.
92
 */
93
static struct mtx	kq_global;
94
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
95
#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
96
	if (!haslck)				\
97
		mtx_lock(lck);			\
98
	haslck = 1;				\
99
} while (0)
100
#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
101
	if (haslck)				\
102
		mtx_unlock(lck);			\
103
	haslck = 0;				\
104
} while (0)
105

106
TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
107

108
static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
109
static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
110
static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
111
		    struct thread *td, int mflag);
112
static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
113
static void	kqueue_release(struct kqueue *kq, int locked);
114
static void	kqueue_destroy(struct kqueue *kq);
115
static void	kqueue_drain(struct kqueue *kq, struct thread *td);
116
static int	kqueue_expand(struct kqueue *kq, const struct filterops *fops,
117
		    uintptr_t ident, int mflag);
118
static void	kqueue_task(void *arg, int pending);
119
static int	kqueue_scan(struct kqueue *kq, int maxevents,
120
		    struct kevent_copyops *k_ops,
121
		    const struct timespec *timeout,
122
		    struct kevent *keva, struct thread *td);
123
static void 	kqueue_wakeup(struct kqueue *kq);
124
static const struct filterops *kqueue_fo_find(int filt);
125
static void	kqueue_fo_release(int filt);
126
struct g_kevent_args;
127
static int	kern_kevent_generic(struct thread *td,
128
		    struct g_kevent_args *uap,
129
		    struct kevent_copyops *k_ops, const char *struct_name);
130

131
static fo_ioctl_t	kqueue_ioctl;
132
static fo_poll_t	kqueue_poll;
133
static fo_kqfilter_t	kqueue_kqfilter;
134
static fo_stat_t	kqueue_stat;
135
static fo_close_t	kqueue_close;
136
static fo_fill_kinfo_t	kqueue_fill_kinfo;
137

138
static const struct fileops kqueueops = {
139
	.fo_read = invfo_rdwr,
140
	.fo_write = invfo_rdwr,
141
	.fo_truncate = invfo_truncate,
142
	.fo_ioctl = kqueue_ioctl,
143
	.fo_poll = kqueue_poll,
144
	.fo_kqfilter = kqueue_kqfilter,
145
	.fo_stat = kqueue_stat,
146
	.fo_close = kqueue_close,
147
	.fo_chmod = invfo_chmod,
148
	.fo_chown = invfo_chown,
149
	.fo_sendfile = invfo_sendfile,
150
	.fo_cmp = file_kcmp_generic,
151
	.fo_fill_kinfo = kqueue_fill_kinfo,
152
};
153

154
static int 	knote_attach(struct knote *kn, struct kqueue *kq);
155
static void 	knote_drop(struct knote *kn, struct thread *td);
156
static void 	knote_drop_detached(struct knote *kn, struct thread *td);
157
static void 	knote_enqueue(struct knote *kn);
158
static void 	knote_dequeue(struct knote *kn);
159
static void 	knote_init(void);
160
static struct 	knote *knote_alloc(int mflag);
161
static void 	knote_free(struct knote *kn);
162

163
static void	filt_kqdetach(struct knote *kn);
164
static int	filt_kqueue(struct knote *kn, long hint);
165
static int	filt_procattach(struct knote *kn);
166
static void	filt_procdetach(struct knote *kn);
167
static int	filt_proc(struct knote *kn, long hint);
168
static int	filt_jailattach(struct knote *kn);
169
static void	filt_jaildetach(struct knote *kn);
170
static int	filt_jail(struct knote *kn, long hint);
171
static int	filt_fileattach(struct knote *kn);
172
static void	filt_timerexpire(void *knx);
173
static void	filt_timerexpire_l(struct knote *kn, bool proc_locked);
174
static int	filt_timerattach(struct knote *kn);
175
static void	filt_timerdetach(struct knote *kn);
176
static void	filt_timerstart(struct knote *kn, sbintime_t to);
177
static void	filt_timertouch(struct knote *kn, struct kevent *kev,
178
		    u_long type);
179
static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
180
static int	filt_timer(struct knote *kn, long hint);
181
static int	filt_userattach(struct knote *kn);
182
static void	filt_userdetach(struct knote *kn);
183
static int	filt_user(struct knote *kn, long hint);
184
static void	filt_usertouch(struct knote *kn, struct kevent *kev,
185
		    u_long type);
186

187
static const struct filterops file_filtops = {
188
	.f_isfd = 1,
189
	.f_attach = filt_fileattach,
190
};
191
static const struct filterops kqread_filtops = {
192
	.f_isfd = 1,
193
	.f_detach = filt_kqdetach,
194
	.f_event = filt_kqueue,
195
};
196
/* XXX - move to kern_proc.c?  */
197
static const struct filterops proc_filtops = {
198
	.f_isfd = 0,
199
	.f_attach = filt_procattach,
200
	.f_detach = filt_procdetach,
201
	.f_event = filt_proc,
202
};
203
static const struct filterops jail_filtops = {
204
	.f_isfd = 0,
205
	.f_attach = filt_jailattach,
206
	.f_detach = filt_jaildetach,
207
	.f_event = filt_jail,
208
};
209
static const struct filterops timer_filtops = {
210
	.f_isfd = 0,
211
	.f_attach = filt_timerattach,
212
	.f_detach = filt_timerdetach,
213
	.f_event = filt_timer,
214
	.f_touch = filt_timertouch,
215
};
216
static const struct filterops user_filtops = {
217
	.f_attach = filt_userattach,
218
	.f_detach = filt_userdetach,
219
	.f_event = filt_user,
220
	.f_touch = filt_usertouch,
221
};
222

223
static uma_zone_t	knote_zone;
224
static unsigned int __exclusive_cache_line	kq_ncallouts;
225
static unsigned int 	kq_calloutmax = 4 * 1024;
226
SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
227
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
228

229
/* XXX - ensure not influx ? */
230
#define KNOTE_ACTIVATE(kn, islock) do { 				\
231
	if ((islock))							\
232
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
233
	else								\
234
		KQ_LOCK((kn)->kn_kq);					\
235
	(kn)->kn_status |= KN_ACTIVE;					\
236
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
237
		knote_enqueue((kn));					\
238
	if (!(islock))							\
239
		KQ_UNLOCK((kn)->kn_kq);					\
240
} while (0)
241
#define KQ_LOCK(kq) do {						\
242
	mtx_lock(&(kq)->kq_lock);					\
243
} while (0)
244
#define KQ_FLUX_WAKEUP(kq) do {						\
245
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
246
		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
247
		wakeup((kq));						\
248
	}								\
249
} while (0)
250
#define KQ_UNLOCK_FLUX(kq) do {						\
251
	KQ_FLUX_WAKEUP(kq);						\
252
	mtx_unlock(&(kq)->kq_lock);					\
253
} while (0)
254
#define KQ_UNLOCK(kq) do {						\
255
	mtx_unlock(&(kq)->kq_lock);					\
256
} while (0)
257
#define KQ_OWNED(kq) do {						\
258
	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
259
} while (0)
260
#define KQ_NOTOWNED(kq) do {						\
261
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
262
} while (0)
263

264
static struct knlist *
265
kn_list_lock(struct knote *kn)
266
{
267
	struct knlist *knl;
268

269
	knl = kn->kn_knlist;
270
	if (knl != NULL)
271
		knl->kl_lock(knl->kl_lockarg);
272
	return (knl);
273
}
274

275
static void
276
kn_list_unlock(struct knlist *knl)
277
{
278
	bool do_free;
279

280
	if (knl == NULL)
281
		return;
282
	do_free = knl->kl_autodestroy && knlist_empty(knl);
283
	knl->kl_unlock(knl->kl_lockarg);
284
	if (do_free) {
285
		knlist_destroy(knl);
286
		free(knl, M_KQUEUE);
287
	}
288
}
289

290
static bool
291
kn_in_flux(struct knote *kn)
292
{
293

294
	return (kn->kn_influx > 0);
295
}
296

297
static void
298
kn_enter_flux(struct knote *kn)
299
{
300

301
	KQ_OWNED(kn->kn_kq);
302
	MPASS(kn->kn_influx < INT_MAX);
303
	kn->kn_influx++;
304
}
305

306
static bool
307
kn_leave_flux(struct knote *kn)
308
{
309

310
	KQ_OWNED(kn->kn_kq);
311
	MPASS(kn->kn_influx > 0);
312
	kn->kn_influx--;
313
	return (kn->kn_influx == 0);
314
}
315

316
/*
 * Assert the lock state of a knlist through its kl_assert_lock method.
 * KNL_ASSERT_LOCK() picks the locked or unlocked check from 'islocked'.
 * Without INVARIANTS the checks expand to nothing and their argument is
 * not evaluated.
 *
 * Every use of the macro argument is parenthesized so that any pointer
 * expression, not just a plain identifier, can be passed safely.
 */
#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
	if ((islocked))							\
		KNL_ASSERT_LOCKED(knl);					\
	else								\
		KNL_ASSERT_UNLOCKED(knl);				\
} while (0)
#ifdef INVARIANTS
#define	KNL_ASSERT_LOCKED(knl) do {					\
	(knl)->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED);		\
} while (0)
#define	KNL_ASSERT_UNLOCKED(knl) do {					\
	(knl)->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED);		\
} while (0)
#else /* !INVARIANTS */
#define	KNL_ASSERT_LOCKED(knl) do {} while (0)
#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
333

334
/* Default hash size for knotes; may be overridden at build time. */
#ifndef	KN_HASHSIZE
#define	KN_HASHSIZE		64		/* XXX should be tunable */
#endif

/*
 * Fold 'val' with itself shifted right by 8 bits, then reduce with 'mask'.
 * Both uses of 'val' are parenthesized so that an argument containing a
 * low-precedence operator (e.g. a | b) is hashed as a single value.
 */
#define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
339

340
/*
 * Attach method for filters that have no implementation registered:
 * always refuses the attach with ENXIO.
 */
static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}
346

347
static const struct filterops null_filtops = {
348
	.f_isfd = 0,
349
	.f_attach = filt_nullattach,
350
};
351

352
/* XXX - make SYSINIT to add these, and move into respective modules. */
353
extern const struct filterops sig_filtops;
354
extern const struct filterops fs_filtops;
355

356
/*
357
 * Table for all system-defined filters.
358
 */
359
static struct mtx	filterops_lock;
360
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops", MTX_DEF);
361
static struct {
362
	const struct filterops *for_fop;
363
	int for_nolock;
364
	int for_refcnt;
365
} sysfilt_ops[EVFILT_SYSCOUNT] = {
366
	[~EVFILT_READ] = { &file_filtops, 1 },
367
	[~EVFILT_WRITE] = { &file_filtops, 1 },
368
	[~EVFILT_AIO] = { &null_filtops },
369
	[~EVFILT_VNODE] = { &file_filtops, 1 },
370
	[~EVFILT_PROC] = { &proc_filtops, 1 },
371
	[~EVFILT_SIGNAL] = { &sig_filtops, 1 },
372
	[~EVFILT_TIMER] = { &timer_filtops, 1 },
373
	[~EVFILT_PROCDESC] = { &file_filtops, 1 },
374
	[~EVFILT_FS] = { &fs_filtops, 1 },
375
	[~EVFILT_LIO] = { &null_filtops },
376
	[~EVFILT_USER] = { &user_filtops, 1 },
377
	[~EVFILT_SENDFILE] = { &null_filtops },
378
	[~EVFILT_EMPTY] = { &file_filtops, 1 },
379
	[~EVFILT_JAIL] = { &jail_filtops, 1 },
380
	[~EVFILT_JAILDESC] = { &file_filtops, 1 },
381
};
382

383
/*
384
 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
385
 * method.
386
 */
387
static int
388
filt_fileattach(struct knote *kn)
389
{
390

391
	return (fo_kqfilter(kn->kn_fp, kn));
392
}
393

394
/*ARGSUSED*/
395
static int
396
kqueue_kqfilter(struct file *fp, struct knote *kn)
397
{
398
	struct kqueue *kq = kn->kn_fp->f_data;
399

400
	if (kn->kn_filter != EVFILT_READ)
401
		return (EINVAL);
402

403
	kn->kn_status |= KN_KQUEUE;
404
	kn->kn_fop = &kqread_filtops;
405
	knlist_add(&kq->kq_sel.si_note, kn, 0);
406

407
	return (0);
408
}
409

410
static void
411
filt_kqdetach(struct knote *kn)
412
{
413
	struct kqueue *kq = kn->kn_fp->f_data;
414

415
	knlist_remove(&kq->kq_sel.si_note, kn, 0);
416
}
417

418
/*ARGSUSED*/
419
static int
420
filt_kqueue(struct knote *kn, long hint)
421
{
422
	struct kqueue *kq = kn->kn_fp->f_data;
423

424
	kn->kn_data = kq->kq_count;
425
	return (kn->kn_data > 0);
426
}
427

428
/* XXX - move to kern_proc.c?  */
429
static int
430
filt_procattach(struct knote *kn)
431
{
432
	struct proc *p;
433
	int error;
434
	bool exiting, immediate;
435

436
	exiting = immediate = false;
437
	if (kn->kn_sfflags & NOTE_EXIT)
438
		p = pfind_any(kn->kn_id);
439
	else
440
		p = pfind(kn->kn_id);
441
	if (p == NULL)
442
		return (ESRCH);
443
	if (p->p_flag & P_WEXIT)
444
		exiting = true;
445

446
	if ((error = p_cansee(curthread, p))) {
447
		PROC_UNLOCK(p);
448
		return (error);
449
	}
450

451
	kn->kn_ptr.p_proc = p;
452
	kn->kn_flags |= EV_CLEAR;		/* automatically set */
453

454
	/*
455
	 * Internal flag indicating registration done by kernel for the
456
	 * purposes of getting a NOTE_CHILD notification.
457
	 */
458
	if (kn->kn_flags & EV_FLAG2) {
459
		kn->kn_flags &= ~EV_FLAG2;
460
		kn->kn_data = kn->kn_sdata;		/* ppid */
461
		kn->kn_fflags = NOTE_CHILD;
462
		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
463
		immediate = true; /* Force immediate activation of child note. */
464
	}
465
	/*
466
	 * Internal flag indicating registration done by kernel (for other than
467
	 * NOTE_CHILD).
468
	 */
469
	if (kn->kn_flags & EV_FLAG1) {
470
		kn->kn_flags &= ~EV_FLAG1;
471
	}
472

473
	knlist_add(p->p_klist, kn, 1);
474

475
	/*
476
	 * Immediately activate any child notes or, in the case of a zombie
477
	 * target process, exit notes.  The latter is necessary to handle the
478
	 * case where the target process, e.g. a child, dies before the kevent
479
	 * is registered.
480
	 */
481
	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
482
		KNOTE_ACTIVATE(kn, 0);
483

484
	PROC_UNLOCK(p);
485

486
	return (0);
487
}
488

489
/*
490
 * The knote may be attached to a different process, which may exit,
491
 * leaving nothing for the knote to be attached to.  So when the process
492
 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
493
 * it will be deleted when read out.  However, as part of the knote deletion,
494
 * this routine is called, so a check is needed to avoid actually performing
495
 * a detach, because the original process does not exist any more.
496
 */
497
/* XXX - move to kern_proc.c?  */
498
static void
499
filt_procdetach(struct knote *kn)
500
{
501

502
	knlist_remove(kn->kn_knlist, kn, 0);
503
	kn->kn_ptr.p_proc = NULL;
504
}
505

506
/* XXX - move to kern_proc.c?  */
507
static int
508
filt_proc(struct knote *kn, long hint)
509
{
510
	struct proc *p;
511
	u_int event;
512

513
	p = kn->kn_ptr.p_proc;
514
	if (p == NULL) /* already activated, from attach filter */
515
		return (0);
516

517
	/* Mask off extra data. */
518
	event = (u_int)hint & NOTE_PCTRLMASK;
519

520
	/* If the user is interested in this event, record it. */
521
	if (kn->kn_sfflags & event)
522
		kn->kn_fflags |= event;
523

524
	/* Process is gone, so flag the event as finished. */
525
	if (event == NOTE_EXIT) {
526
		kn->kn_flags |= EV_EOF | EV_ONESHOT;
527
		kn->kn_ptr.p_proc = NULL;
528
		if (kn->kn_fflags & NOTE_EXIT)
529
			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
530
		if (kn->kn_fflags == 0)
531
			kn->kn_flags |= EV_DROP;
532
		return (1);
533
	}
534

535
	return (kn->kn_fflags != 0);
536
}
537

538
/*
539
 * Called when the process forked. It mostly does the same as the
540
 * knote(), activating all knotes registered to be activated when the
541
 * process forked. Additionally, for each knote attached to the
542
 * parent, check whether user wants to track the new process. If so
543
 * attach a new knote to it, and immediately report an event with the
544
 * child's pid.
545
 */
546
void
547
knote_fork(struct knlist *list, int pid)
548
{
549
	struct kqueue *kq;
550
	struct knote *kn;
551
	struct kevent kev;
552
	int error;
553

554
	MPASS(list != NULL);
555
	KNL_ASSERT_LOCKED(list);
556
	if (SLIST_EMPTY(&list->kl_list))
557
		return;
558

559
	memset(&kev, 0, sizeof(kev));
560
	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
561
		kq = kn->kn_kq;
562
		KQ_LOCK(kq);
563
		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
564
			KQ_UNLOCK(kq);
565
			continue;
566
		}
567

568
		/*
569
		 * The same as knote(), activate the event.
570
		 */
571
		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
572
			if (kn->kn_fop->f_event(kn, NOTE_FORK))
573
				KNOTE_ACTIVATE(kn, 1);
574
			KQ_UNLOCK(kq);
575
			continue;
576
		}
577

578
		/*
579
		 * The NOTE_TRACK case. In addition to the activation
580
		 * of the event, we need to register new events to
581
		 * track the child. Drop the locks in preparation for
582
		 * the call to kqueue_register().
583
		 */
584
		kn_enter_flux(kn);
585
		KQ_UNLOCK(kq);
586
		list->kl_unlock(list->kl_lockarg);
587

588
		/*
589
		 * Activate existing knote and register tracking knotes with
590
		 * new process.
591
		 *
592
		 * First register a knote to get just the child notice. This
593
		 * must be a separate note from a potential NOTE_EXIT
594
		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
595
		 * to use the data field (in conflicting ways).
596
		 */
597
		kev.ident = pid;
598
		kev.filter = kn->kn_filter;
599
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
600
		    EV_FLAG2;
601
		kev.fflags = kn->kn_sfflags;
602
		kev.data = kn->kn_id;		/* parent */
603
		kev.udata = kn->kn_kevent.udata;/* preserve udata */
604
		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
605
		if (error)
606
			kn->kn_fflags |= NOTE_TRACKERR;
607

608
		/*
609
		 * Then register another knote to track other potential events
610
		 * from the new process.
611
		 */
612
		kev.ident = pid;
613
		kev.filter = kn->kn_filter;
614
		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
615
		kev.fflags = kn->kn_sfflags;
616
		kev.data = kn->kn_id;		/* parent */
617
		kev.udata = kn->kn_kevent.udata;/* preserve udata */
618
		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
619
		if (error)
620
			kn->kn_fflags |= NOTE_TRACKERR;
621
		if (kn->kn_fop->f_event(kn, NOTE_FORK))
622
			KNOTE_ACTIVATE(kn, 0);
623
		list->kl_lock(list->kl_lockarg);
624
		KQ_LOCK(kq);
625
		kn_leave_flux(kn);
626
		KQ_UNLOCK_FLUX(kq);
627
	}
628
}
629

630
int
631
filt_jailattach(struct knote *kn)
632
{
633
	struct prison *pr;
634

635
	if (kn->kn_id == 0) {
636
		/* Let jid=0 watch the current prison (including prison0). */
637
		pr = curthread->td_ucred->cr_prison;
638
		mtx_lock(&pr->pr_mtx);
639
	} else {
640
		sx_slock(&allprison_lock);
641
		pr = prison_find_child(curthread->td_ucred->cr_prison,
642
		    kn->kn_id);
643
		sx_sunlock(&allprison_lock);
644
		if (pr == NULL)
645
			return (ENOENT);
646
		if (!prison_isalive(pr)) {
647
			mtx_unlock(&pr->pr_mtx);
648
			return (ENOENT);
649
		}
650
	}
651
	kn->kn_ptr.p_prison = pr;
652
	kn->kn_flags |= EV_CLEAR;
653
	knlist_add(pr->pr_klist, kn, 1);
654
	mtx_unlock(&pr->pr_mtx);
655
	return (0);
656
}
657

658
void
659
filt_jaildetach(struct knote *kn)
660
{
661
	if (kn->kn_ptr.p_prison != NULL) {
662
		knlist_remove(kn->kn_knlist, kn, 0);
663
		kn->kn_ptr.p_prison = NULL;
664
	} else
665
		kn->kn_status |= KN_DETACHED;
666
}
667

668
int
669
filt_jail(struct knote *kn, long hint)
670
{
671
	struct prison *pr;
672
	u_int event;
673

674
	pr = kn->kn_ptr.p_prison;
675
	if (pr == NULL) /* already activated, from attach filter */
676
		return (0);
677

678
	/*
679
	 * Mask off extra data.  In the NOTE_JAIL_CHILD case, that's
680
	 * everything except the NOTE_JAIL_CHILD bit itself, since a
681
	 * JID is any positive integer.
682
	 */
683
	event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD :
684
	    (u_int)hint & NOTE_JAIL_CTRLMASK;
685

686
	/* If the user is interested in this event, record it. */
687
	if (kn->kn_sfflags & event) {
688
		kn->kn_fflags |= event;
689
		/* Report the created jail id or attached process id. */
690
		if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) {
691
			if (kn->kn_data != 0)
692
				kn->kn_fflags |= NOTE_JAIL_MULTI;
693
			kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U :
694
			    (u_int)hint & ~event;
695
		}
696
	}
697

698
	/* Prison is gone, so flag the event as finished. */
699
	if (event == NOTE_JAIL_REMOVE) {
700
		kn->kn_flags |= EV_EOF | EV_ONESHOT;
701
		kn->kn_ptr.p_prison = NULL;
702
		if (kn->kn_fflags == 0)
703
			kn->kn_flags |= EV_DROP;
704
		return (1);
705
	}
706

707
	return (kn->kn_fflags != 0);
708
}
709

710
/*
711
 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
712
 * interval timer support code.
713
 */
714

715
#define NOTE_TIMER_PRECMASK						\
716
    (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
717

718
static sbintime_t
719
timer2sbintime(int64_t data, int flags)
720
{
721
	int64_t secs;
722

723
        /*
724
         * Macros for converting to the fractional second portion of an
725
         * sbintime_t using 64bit multiplication to improve precision.
726
         */
727
#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
728
#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
729
#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
730
	switch (flags & NOTE_TIMER_PRECMASK) {
731
	case NOTE_SECONDS:
732
#ifdef __LP64__
733
		if (data > (SBT_MAX / SBT_1S))
734
			return (SBT_MAX);
735
#endif
736
		return ((sbintime_t)data << 32);
737
	case NOTE_MSECONDS: /* FALLTHROUGH */
738
	case 0:
739
		if (data >= 1000) {
740
			secs = data / 1000;
741
#ifdef __LP64__
742
			if (secs > (SBT_MAX / SBT_1S))
743
				return (SBT_MAX);
744
#endif
745
			return (secs << 32 | MS_TO_SBT(data % 1000));
746
		}
747
		return (MS_TO_SBT(data));
748
	case NOTE_USECONDS:
749
		if (data >= 1000000) {
750
			secs = data / 1000000;
751
#ifdef __LP64__
752
			if (secs > (SBT_MAX / SBT_1S))
753
				return (SBT_MAX);
754
#endif
755
			return (secs << 32 | US_TO_SBT(data % 1000000));
756
		}
757
		return (US_TO_SBT(data));
758
	case NOTE_NSECONDS:
759
		if (data >= 1000000000) {
760
			secs = data / 1000000000;
761
#ifdef __LP64__
762
			if (secs > (SBT_MAX / SBT_1S))
763
				return (SBT_MAX);
764
#endif
765
			return (secs << 32 | NS_TO_SBT(data % 1000000000));
766
		}
767
		return (NS_TO_SBT(data));
768
	default:
769
		break;
770
	}
771
	return (-1);
772
}
773

774
struct kq_timer_cb_data {
775
	struct callout c;
776
	struct proc *p;
777
	struct knote *kn;
778
	int cpuid;
779
	int flags;
780
	TAILQ_ENTRY(kq_timer_cb_data) link;
781
	sbintime_t next;	/* next timer event fires at */
782
	sbintime_t to;		/* precalculated timer period, 0 for abs */
783
};
784

785
#define	KQ_TIMER_CB_ENQUEUED	0x01
786

787
static void
788
kqtimer_sched_callout(struct kq_timer_cb_data *kc)
789
{
790
	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn,
791
	    kc->cpuid, C_ABSOLUTE);
792
}
793

794
void
795
kqtimer_proc_continue(struct proc *p)
796
{
797
	struct kq_timer_cb_data *kc, *kc1;
798
	struct bintime bt;
799
	sbintime_t now;
800

801
	PROC_LOCK_ASSERT(p, MA_OWNED);
802

803
	getboottimebin(&bt);
804
	now = bttosbt(bt);
805

806
	TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) {
807
		TAILQ_REMOVE(&p->p_kqtim_stop, kc, link);
808
		kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
809
		if (kc->next <= now)
810
			filt_timerexpire_l(kc->kn, true);
811
		else
812
			kqtimer_sched_callout(kc);
813
	}
814
}
815

816
static void
817
filt_timerexpire_l(struct knote *kn, bool proc_locked)
818
{
819
	struct kq_timer_cb_data *kc;
820
	struct proc *p;
821
	uint64_t delta;
822
	sbintime_t now;
823

824
	kc = kn->kn_ptr.p_v;
825

826
	if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) {
827
		kn->kn_data++;
828
		KNOTE_ACTIVATE(kn, 0);
829
		return;
830
	}
831

832
	now = sbinuptime();
833
	if (now >= kc->next) {
834
		delta = (now - kc->next) / kc->to;
835
		if (delta == 0)
836
			delta = 1;
837
		kn->kn_data += delta;
838
		kc->next += delta * kc->to;
839
		if (now >= kc->next)	/* overflow */
840
			kc->next = now + kc->to;
841
		KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
842
	}
843

844
	/*
845
	 * Initial check for stopped kc->p is racy.  It is fine to
846
	 * miss the set of the stop flags, at worst we would schedule
847
	 * one more callout.  On the other hand, it is not fine to not
848
	 * schedule when we we missed clearing of the flags, we
849
	 * recheck them under the lock and observe consistent state.
850
	 */
851
	p = kc->p;
852
	if (P_SHOULDSTOP(p) || P_KILLED(p)) {
853
		if (!proc_locked)
854
			PROC_LOCK(p);
855
		if (P_SHOULDSTOP(p) || P_KILLED(p)) {
856
			if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) {
857
				kc->flags |= KQ_TIMER_CB_ENQUEUED;
858
				TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link);
859
			}
860
			if (!proc_locked)
861
				PROC_UNLOCK(p);
862
			return;
863
		}
864
		if (!proc_locked)
865
			PROC_UNLOCK(p);
866
	}
867
	kqtimer_sched_callout(kc);
868
}
869

870
static void
871
filt_timerexpire(void *knx)
872
{
873
	filt_timerexpire_l(knx, false);
874
}
875

876
/*
877
 * data contains amount of time to sleep
878
 */
879
static int
880
filt_timervalidate(struct knote *kn, sbintime_t *to)
881
{
882
	struct bintime bt;
883
	sbintime_t sbt;
884

885
	if (kn->kn_sdata < 0)
886
		return (EINVAL);
887
	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
888
		kn->kn_sdata = 1;
889
	/*
890
	 * The only fflags values supported are the timer unit
891
	 * (precision) and the absolute time indicator.
892
	 */
893
	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
894
		return (EINVAL);
895

896
	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
897
	if (*to < 0)
898
		return (EINVAL);
899
	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
900
		getboottimebin(&bt);
901
		sbt = bttosbt(bt);
902
		*to = MAX(0, *to - sbt);
903
	}
904
	return (0);
905
}
906

907
static int
908
filt_timerattach(struct knote *kn)
909
{
910
	struct kq_timer_cb_data *kc;
911
	sbintime_t to;
912
	int error;
913

914
	to = -1;
915
	error = filt_timervalidate(kn, &to);
916
	if (error != 0)
917
		return (error);
918
	KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 ||
919
	    (kn->kn_sfflags & NOTE_ABSTIME) != 0,
920
	    ("%s: periodic timer has a calculated zero timeout", __func__));
921
	KASSERT(to >= 0,
922
	    ("%s: timer has a calculated negative timeout", __func__));
923

924
	if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
925
		atomic_subtract_int(&kq_ncallouts, 1);
926
		return (ENOMEM);
927
	}
928

929
	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
930
		kn->kn_flags |= EV_CLEAR;	/* automatically set */
931
	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
932
	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
933
	kc->kn = kn;
934
	kc->p = curproc;
935
	kc->cpuid = PCPU_GET(cpuid);
936
	kc->flags = 0;
937
	callout_init(&kc->c, 1);
938
	filt_timerstart(kn, to);
939

940
	return (0);
941
}
942

943
static void
944
filt_timerstart(struct knote *kn, sbintime_t to)
945
{
946
	struct kq_timer_cb_data *kc;
947

948
	kc = kn->kn_ptr.p_v;
949
	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
950
		kc->next = to;
951
		kc->to = 0;
952
	} else {
953
		kc->next = to + sbinuptime();
954
		kc->to = to;
955
	}
956
	kqtimer_sched_callout(kc);
957
}
958

959
static void
960
filt_timerdetach(struct knote *kn)
961
{
962
	struct kq_timer_cb_data *kc;
963
	unsigned int old __unused;
964
	bool pending;
965

966
	kc = kn->kn_ptr.p_v;
967
	do {
968
		callout_drain(&kc->c);
969

970
		/*
971
		 * kqtimer_proc_continue() might have rescheduled this callout.
972
		 * Double-check, using the process mutex as an interlock.
973
		 */
974
		PROC_LOCK(kc->p);
975
		if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) {
976
			kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
977
			TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link);
978
		}
979
		pending = callout_pending(&kc->c);
980
		PROC_UNLOCK(kc->p);
981
	} while (pending);
982
	free(kc, M_KQUEUE);
983
	old = atomic_fetchadd_int(&kq_ncallouts, -1);
984
	KASSERT(old > 0, ("Number of callouts cannot become negative"));
985
	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
986
}
987

988
static void
989
filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
990
{
991
	struct kq_timer_cb_data *kc;
992
	struct kqueue *kq;
993
	sbintime_t to;
994
	int error;
995

996
	switch (type) {
997
	case EVENT_REGISTER:
998
		/* Handle re-added timers that update data/fflags */
999
		if (kev->flags & EV_ADD) {
1000
			kc = kn->kn_ptr.p_v;
1001

1002
			/* Drain any existing callout. */
1003
			callout_drain(&kc->c);
1004

1005
			/* Throw away any existing undelivered record
1006
			 * of the timer expiration. This is done under
1007
			 * the presumption that if a process is
1008
			 * re-adding this timer with new parameters,
1009
			 * it is no longer interested in what may have
1010
			 * happened under the old parameters. If it is
1011
			 * interested, it can wait for the expiration,
1012
			 * delete the old timer definition, and then
1013
			 * add the new one.
1014
			 *
1015
			 * This has to be done while the kq is locked:
1016
			 *   - if enqueued, dequeue
1017
			 *   - make it no longer active
1018
			 *   - clear the count of expiration events
1019
			 */
1020
			kq = kn->kn_kq;
1021
			KQ_LOCK(kq);
1022
			if (kn->kn_status & KN_QUEUED)
1023
				knote_dequeue(kn);
1024

1025
			kn->kn_status &= ~KN_ACTIVE;
1026
			kn->kn_data = 0;
1027
			KQ_UNLOCK(kq);
1028

1029
			/* Reschedule timer based on new data/fflags */
1030
			kn->kn_sfflags = kev->fflags;
1031
			kn->kn_sdata = kev->data;
1032
			error = filt_timervalidate(kn, &to);
1033
			if (error != 0) {
1034
			  	kn->kn_flags |= EV_ERROR;
1035
				kn->kn_data = error;
1036
			} else
1037
			  	filt_timerstart(kn, to);
1038
		}
1039
		break;
1040

1041
        case EVENT_PROCESS:
1042
		*kev = kn->kn_kevent;
1043
		if (kn->kn_flags & EV_CLEAR) {
1044
			kn->kn_data = 0;
1045
			kn->kn_fflags = 0;
1046
		}
1047
		break;
1048

1049
	default:
1050
		panic("filt_timertouch() - invalid type (%ld)", type);
1051
		break;
1052
	}
1053
}
1054

1055
static int
1056
filt_timer(struct knote *kn, long hint)
1057
{
1058

1059
	return (kn->kn_data != 0);
1060
}
1061

1062
static int
1063
filt_userattach(struct knote *kn)
1064
{
1065

1066
	/*
1067
	 * EVFILT_USER knotes are not attached to anything in the kernel.
1068
	 */
1069
	kn->kn_hook = NULL;
1070
	if (kn->kn_fflags & NOTE_TRIGGER)
1071
		kn->kn_hookid = 1;
1072
	else
1073
		kn->kn_hookid = 0;
1074
	return (0);
1075
}
1076

1077
static void
1078
filt_userdetach(__unused struct knote *kn)
1079
{
1080

1081
	/*
1082
	 * EVFILT_USER knotes are not attached to anything in the kernel.
1083
	 */
1084
}
1085

1086
static int
1087
filt_user(struct knote *kn, __unused long hint)
1088
{
1089

1090
	return (kn->kn_hookid);
1091
}
1092

1093
static void
1094
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
1095
{
1096
	u_int ffctrl;
1097

1098
	switch (type) {
1099
	case EVENT_REGISTER:
1100
		if (kev->fflags & NOTE_TRIGGER)
1101
			kn->kn_hookid = 1;
1102

1103
		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1104
		kev->fflags &= NOTE_FFLAGSMASK;
1105
		switch (ffctrl) {
1106
		case NOTE_FFNOP:
1107
			break;
1108

1109
		case NOTE_FFAND:
1110
			kn->kn_sfflags &= kev->fflags;
1111
			break;
1112

1113
		case NOTE_FFOR:
1114
			kn->kn_sfflags |= kev->fflags;
1115
			break;
1116

1117
		case NOTE_FFCOPY:
1118
			kn->kn_sfflags = kev->fflags;
1119
			break;
1120

1121
		default:
1122
			/* XXX Return error? */
1123
			break;
1124
		}
1125
		kn->kn_sdata = kev->data;
1126
		if (kev->flags & EV_CLEAR) {
1127
			kn->kn_hookid = 0;
1128
			kn->kn_data = 0;
1129
			kn->kn_fflags = 0;
1130
		}
1131
		break;
1132

1133
        case EVENT_PROCESS:
1134
		*kev = kn->kn_kevent;
1135
		kev->fflags = kn->kn_sfflags;
1136
		kev->data = kn->kn_sdata;
1137
		if (kn->kn_flags & EV_CLEAR) {
1138
			kn->kn_hookid = 0;
1139
			kn->kn_data = 0;
1140
			kn->kn_fflags = 0;
1141
		}
1142
		break;
1143

1144
	default:
1145
		panic("filt_usertouch() - invalid type (%ld)", type);
1146
		break;
1147
	}
1148
}
1149

1150
int
1151
sys_kqueue(struct thread *td, struct kqueue_args *uap)
1152
{
1153

1154
	return (kern_kqueue(td, 0, NULL));
1155
}
1156

1157
int
1158
sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
1159
{
1160
	int flags;
1161

1162
	if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
1163
		return (EINVAL);
1164
	flags = 0;
1165
	if ((uap->flags & KQUEUE_CLOEXEC) != 0)
1166
		flags |= O_CLOEXEC;
1167
	return (kern_kqueue(td, flags, NULL));
1168
}
1169

1170
static void
1171
kqueue_init(struct kqueue *kq)
1172
{
1173

1174
	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
1175
	TAILQ_INIT(&kq->kq_head);
1176
	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
1177
	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
1178
}
1179

1180
int
1181
kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
1182
{
1183
	struct filedesc *fdp;
1184
	struct kqueue *kq;
1185
	struct file *fp;
1186
	struct ucred *cred;
1187
	int fd, error;
1188

1189
	fdp = td->td_proc->p_fd;
1190
	cred = td->td_ucred;
1191
	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
1192
		return (ENOMEM);
1193

1194
	error = falloc_caps(td, &fp, &fd, flags, fcaps);
1195
	if (error != 0) {
1196
		chgkqcnt(cred->cr_ruidinfo, -1, 0);
1197
		return (error);
1198
	}
1199

1200
	/* An extra reference on `fp' has been held for us by falloc(). */
1201
	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
1202
	kqueue_init(kq);
1203
	kq->kq_fdp = fdp;
1204
	kq->kq_cred = crhold(cred);
1205

1206
	FILEDESC_XLOCK(fdp);
1207
	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
1208
	FILEDESC_XUNLOCK(fdp);
1209

1210
	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
1211
	fdrop(fp, td);
1212

1213
	td->td_retval[0] = fd;
1214
	return (0);
1215
}
1216

1217
/*
 * Layout-independent copy of the kevent system call arguments, filled in
 * by the native and compat entry points and consumed by
 * kern_kevent_generic().
 */
struct g_kevent_args {
	int	fd;			/* kqueue descriptor */
	const void *changelist;		/* changes to apply (user address) */
	int	nchanges;		/* entries in changelist */
	void	*eventlist;		/* where events are returned */
	int	nevents;		/* capacity of eventlist */
	const struct timespec *timeout;	/* optional timeout, may be NULL */
};
1225

1226
int
1227
sys_kevent(struct thread *td, struct kevent_args *uap)
1228
{
1229
	struct kevent_copyops k_ops = {
1230
		.arg = uap,
1231
		.k_copyout = kevent_copyout,
1232
		.k_copyin = kevent_copyin,
1233
		.kevent_size = sizeof(struct kevent),
1234
	};
1235
	struct g_kevent_args gk_args = {
1236
		.fd = uap->fd,
1237
		.changelist = uap->changelist,
1238
		.nchanges = uap->nchanges,
1239
		.eventlist = uap->eventlist,
1240
		.nevents = uap->nevents,
1241
		.timeout = uap->timeout,
1242
	};
1243

1244
	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
1245
}
1246

1247
static int
1248
kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
1249
    struct kevent_copyops *k_ops, const char *struct_name)
1250
{
1251
	struct timespec ts, *tsp;
1252
#ifdef KTRACE
1253
	struct kevent *eventlist = uap->eventlist;
1254
#endif
1255
	int error;
1256

1257
	if (uap->timeout != NULL) {
1258
		error = copyin(uap->timeout, &ts, sizeof(ts));
1259
		if (error)
1260
			return (error);
1261
		tsp = &ts;
1262
	} else
1263
		tsp = NULL;
1264

1265
#ifdef KTRACE
1266
	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
1267
		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
1268
		    uap->nchanges, k_ops->kevent_size);
1269
#endif
1270

1271
	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
1272
	    k_ops, tsp);
1273

1274
#ifdef KTRACE
1275
	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
1276
		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
1277
		    td->td_retval[0], k_ops->kevent_size);
1278
#endif
1279

1280
	return (error);
1281
}
1282

1283
/*
1284
 * Copy 'count' items into the destination list pointed to by uap->eventlist.
1285
 */
1286
static int
1287
kevent_copyout(void *arg, struct kevent *kevp, int count)
1288
{
1289
	struct kevent_args *uap;
1290
	int error;
1291

1292
	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1293
	uap = (struct kevent_args *)arg;
1294

1295
	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
1296
	if (error == 0)
1297
		uap->eventlist += count;
1298
	return (error);
1299
}
1300

1301
/*
1302
 * Copy 'count' items from the list pointed to by uap->changelist.
1303
 */
1304
static int
1305
kevent_copyin(void *arg, struct kevent *kevp, int count)
1306
{
1307
	struct kevent_args *uap;
1308
	int error;
1309

1310
	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1311
	uap = (struct kevent_args *)arg;
1312

1313
	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
1314
	if (error == 0)
1315
		uap->changelist += count;
1316
	return (error);
1317
}
1318

1319
#ifdef COMPAT_FREEBSD11
1320
static int
1321
kevent11_copyout(void *arg, struct kevent *kevp, int count)
1322
{
1323
	struct freebsd11_kevent_args *uap;
1324
	struct freebsd11_kevent kev11;
1325
	int error, i;
1326

1327
	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1328
	uap = (struct freebsd11_kevent_args *)arg;
1329

1330
	for (i = 0; i < count; i++) {
1331
		kev11.ident = kevp->ident;
1332
		kev11.filter = kevp->filter;
1333
		kev11.flags = kevp->flags;
1334
		kev11.fflags = kevp->fflags;
1335
		kev11.data = kevp->data;
1336
		kev11.udata = kevp->udata;
1337
		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
1338
		if (error != 0)
1339
			break;
1340
		uap->eventlist++;
1341
		kevp++;
1342
	}
1343
	return (error);
1344
}
1345

1346
/*
1347
 * Copy 'count' items from the list pointed to by uap->changelist.
1348
 */
1349
static int
1350
kevent11_copyin(void *arg, struct kevent *kevp, int count)
1351
{
1352
	struct freebsd11_kevent_args *uap;
1353
	struct freebsd11_kevent kev11;
1354
	int error, i;
1355

1356
	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
1357
	uap = (struct freebsd11_kevent_args *)arg;
1358

1359
	for (i = 0; i < count; i++) {
1360
		error = copyin(uap->changelist, &kev11, sizeof(kev11));
1361
		if (error != 0)
1362
			break;
1363
		kevp->ident = kev11.ident;
1364
		kevp->filter = kev11.filter;
1365
		kevp->flags = kev11.flags;
1366
		kevp->fflags = kev11.fflags;
1367
		kevp->data = (uintptr_t)kev11.data;
1368
		kevp->udata = kev11.udata;
1369
		bzero(&kevp->ext, sizeof(kevp->ext));
1370
		uap->changelist++;
1371
		kevp++;
1372
	}
1373
	return (error);
1374
}
1375

1376
int
1377
freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
1378
{
1379
	struct kevent_copyops k_ops = {
1380
		.arg = uap,
1381
		.k_copyout = kevent11_copyout,
1382
		.k_copyin = kevent11_copyin,
1383
		.kevent_size = sizeof(struct freebsd11_kevent),
1384
	};
1385
	struct g_kevent_args gk_args = {
1386
		.fd = uap->fd,
1387
		.changelist = uap->changelist,
1388
		.nchanges = uap->nchanges,
1389
		.eventlist = uap->eventlist,
1390
		.nevents = uap->nevents,
1391
		.timeout = uap->timeout,
1392
	};
1393

1394
	return (kern_kevent_generic(td, &gk_args, &k_ops, "freebsd11_kevent"));
1395
}
1396
#endif
1397

1398
int
1399
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
1400
    struct kevent_copyops *k_ops, const struct timespec *timeout)
1401
{
1402
	cap_rights_t rights;
1403
	struct file *fp;
1404
	int error;
1405

1406
	cap_rights_init_zero(&rights);
1407
	if (nchanges > 0)
1408
		cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
1409
	if (nevents > 0)
1410
		cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
1411
	error = fget(td, fd, &rights, &fp);
1412
	if (error != 0)
1413
		return (error);
1414

1415
	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
1416
	fdrop(fp, td);
1417

1418
	return (error);
1419
}
1420

1421
static int
1422
kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
1423
    struct kevent_copyops *k_ops, const struct timespec *timeout)
1424
{
1425
	struct kevent keva[KQ_NEVENTS];
1426
	struct kevent *kevp, *changes;
1427
	int i, n, nerrors, error;
1428

1429
	if (nchanges < 0)
1430
		return (EINVAL);
1431

1432
	nerrors = 0;
1433
	while (nchanges > 0) {
1434
		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1435
		error = k_ops->k_copyin(k_ops->arg, keva, n);
1436
		if (error)
1437
			return (error);
1438
		changes = keva;
1439
		for (i = 0; i < n; i++) {
1440
			kevp = &changes[i];
1441
			if (!kevp->filter)
1442
				continue;
1443
			kevp->flags &= ~EV_SYSFLAGS;
1444
			error = kqueue_register(kq, kevp, td, M_WAITOK);
1445
			if (error || (kevp->flags & EV_RECEIPT)) {
1446
				if (nevents == 0)
1447
					return (error);
1448
				kevp->flags = EV_ERROR;
1449
				kevp->data = error;
1450
				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
1451
				nevents--;
1452
				nerrors++;
1453
			}
1454
		}
1455
		nchanges -= n;
1456
	}
1457
	if (nerrors) {
1458
		td->td_retval[0] = nerrors;
1459
		return (0);
1460
	}
1461

1462
	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
1463
}
1464

1465
/*
 * Perform a kevent operation on an already referenced file.
 *
 * Takes a kqueue reference with kqueue_acquire() (failing with its
 * error if fp is not a usable kqueue), runs kqueue_kevent() and drops
 * the reference again.
 */
int
kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
    struct kevent_copyops *k_ops, const struct timespec *timeout)
{
	struct kqueue *kq;
	int error;

	error = kqueue_acquire(fp, &kq);
	if (error == 0) {
		error = kqueue_kevent(kq, td, nchanges, nevents, k_ops,
		    timeout);
		kqueue_release(kq, 0);
	}
	return (error);
}
1479

1480
/*
1481
 * Performs a kevent() call on a temporarily created kqueue. This can be
1482
 * used to perform one-shot polling, similar to poll() and select().
1483
 */
1484
int
1485
kern_kevent_anonymous(struct thread *td, int nevents,
1486
    struct kevent_copyops *k_ops)
1487
{
1488
	struct kqueue kq = {};
1489
	int error;
1490

1491
	kqueue_init(&kq);
1492
	kq.kq_refcnt = 1;
1493
	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
1494
	kqueue_drain(&kq, td);
1495
	kqueue_destroy(&kq);
1496
	return (error);
1497
}
1498

1499
int
1500
kqueue_add_filteropts(int filt, const struct filterops *filtops)
1501
{
1502
	int error;
1503

1504
	error = 0;
1505
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1506
		printf(
1507
"trying to add a filterop that is out of range: %d is beyond %d\n",
1508
		    ~filt, EVFILT_SYSCOUNT);
1509
		return EINVAL;
1510
	}
1511
	mtx_lock(&filterops_lock);
1512
	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1513
	    sysfilt_ops[~filt].for_fop != NULL)
1514
		error = EEXIST;
1515
	else {
1516
		sysfilt_ops[~filt].for_fop = filtops;
1517
		sysfilt_ops[~filt].for_refcnt = 0;
1518
	}
1519
	mtx_unlock(&filterops_lock);
1520

1521
	return (error);
1522
}
1523

1524
int
1525
kqueue_del_filteropts(int filt)
1526
{
1527
	int error;
1528

1529
	error = 0;
1530
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1531
		return EINVAL;
1532

1533
	mtx_lock(&filterops_lock);
1534
	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1535
	    sysfilt_ops[~filt].for_fop == NULL)
1536
		error = EINVAL;
1537
	else if (sysfilt_ops[~filt].for_refcnt != 0)
1538
		error = EBUSY;
1539
	else {
1540
		sysfilt_ops[~filt].for_fop = &null_filtops;
1541
		sysfilt_ops[~filt].for_refcnt = 0;
1542
	}
1543
	mtx_unlock(&filterops_lock);
1544

1545
	return error;
1546
}
1547

1548
static const struct filterops *
1549
kqueue_fo_find(int filt)
1550
{
1551

1552
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1553
		return NULL;
1554

1555
	if (sysfilt_ops[~filt].for_nolock)
1556
		return sysfilt_ops[~filt].for_fop;
1557

1558
	mtx_lock(&filterops_lock);
1559
	sysfilt_ops[~filt].for_refcnt++;
1560
	if (sysfilt_ops[~filt].for_fop == NULL)
1561
		sysfilt_ops[~filt].for_fop = &null_filtops;
1562
	mtx_unlock(&filterops_lock);
1563

1564
	return sysfilt_ops[~filt].for_fop;
1565
}
1566

1567
static void
1568
kqueue_fo_release(int filt)
1569
{
1570

1571
	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1572
		return;
1573

1574
	if (sysfilt_ops[~filt].for_nolock)
1575
		return;
1576

1577
	mtx_lock(&filterops_lock);
1578
	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1579
	    ("filter object refcount not valid on release"));
1580
	sysfilt_ops[~filt].for_refcnt--;
1581
	mtx_unlock(&filterops_lock);
1582
}
1583

1584
/*
1585
 * A ref to kq (obtained via kqueue_acquire) must be held.
1586
 */
1587
static int
1588
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
1589
    int mflag)
1590
{
1591
	const struct filterops *fops;
1592
	struct file *fp;
1593
	struct knote *kn, *tkn;
1594
	struct knlist *knl;
1595
	int error, filt, event;
1596
	int haskqglobal, filedesc_unlock;
1597

1598
	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
1599
		return (EINVAL);
1600

1601
	fp = NULL;
1602
	kn = NULL;
1603
	knl = NULL;
1604
	error = 0;
1605
	haskqglobal = 0;
1606
	filedesc_unlock = 0;
1607

1608
	filt = kev->filter;
1609
	fops = kqueue_fo_find(filt);
1610
	if (fops == NULL)
1611
		return EINVAL;
1612

1613
	if (kev->flags & EV_ADD) {
1614
		/* Reject an invalid flag pair early */
1615
		if (kev->flags & EV_KEEPUDATA) {
1616
			tkn = NULL;
1617
			error = EINVAL;
1618
			goto done;
1619
		}
1620

1621
		/*
1622
		 * Prevent waiting with locks.  Non-sleepable
1623
		 * allocation failures are handled in the loop, only
1624
		 * if the spare knote appears to be actually required.
1625
		 */
1626
		tkn = knote_alloc(mflag);
1627
	} else {
1628
		tkn = NULL;
1629
	}
1630

1631
findkn:
1632
	if (fops->f_isfd) {
1633
		KASSERT(td != NULL, ("td is NULL"));
1634
		if (kev->ident > INT_MAX)
1635
			error = EBADF;
1636
		else
1637
			error = fget(td, kev->ident, &cap_event_rights, &fp);
1638
		if (error)
1639
			goto done;
1640

1641
		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1642
		    kev->ident, M_NOWAIT) != 0) {
1643
			/* try again */
1644
			fdrop(fp, td);
1645
			fp = NULL;
1646
			error = kqueue_expand(kq, fops, kev->ident, mflag);
1647
			if (error)
1648
				goto done;
1649
			goto findkn;
1650
		}
1651

1652
		if (fp->f_type == DTYPE_KQUEUE) {
1653
			/*
1654
			 * If we add some intelligence about what we are doing,
1655
			 * we should be able to support events on ourselves.
1656
			 * We need to know when we are doing this to prevent
1657
			 * getting both the knlist lock and the kq lock since
1658
			 * they are the same thing.
1659
			 */
1660
			if (fp->f_data == kq) {
1661
				error = EINVAL;
1662
				goto done;
1663
			}
1664

1665
			/*
1666
			 * Pre-lock the filedesc before the global
1667
			 * lock mutex, see the comment in
1668
			 * kqueue_close().
1669
			 */
1670
			FILEDESC_XLOCK(td->td_proc->p_fd);
1671
			filedesc_unlock = 1;
1672
			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1673
		}
1674

1675
		KQ_LOCK(kq);
1676
		if (kev->ident < kq->kq_knlistsize) {
1677
			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1678
				if (kev->filter == kn->kn_filter)
1679
					break;
1680
		}
1681
	} else {
1682
		if ((kev->flags & EV_ADD) == EV_ADD) {
1683
			error = kqueue_expand(kq, fops, kev->ident, mflag);
1684
			if (error != 0)
1685
				goto done;
1686
		}
1687

1688
		KQ_LOCK(kq);
1689

1690
		/*
1691
		 * If possible, find an existing knote to use for this kevent.
1692
		 */
1693
		if (kev->filter == EVFILT_PROC &&
1694
		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
1695
			/* This is an internal creation of a process tracking
1696
			 * note. Don't attempt to coalesce this with an
1697
			 * existing note.
1698
			 */
1699
			;
1700
		} else if (kq->kq_knhashmask != 0) {
1701
			struct klist *list;
1702

1703
			list = &kq->kq_knhash[
1704
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1705
			SLIST_FOREACH(kn, list, kn_link)
1706
				if (kev->ident == kn->kn_id &&
1707
				    kev->filter == kn->kn_filter)
1708
					break;
1709
		}
1710
	}
1711

1712
	/* knote is in the process of changing, wait for it to stabilize. */
1713
	if (kn != NULL && kn_in_flux(kn)) {
1714
		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1715
		if (filedesc_unlock) {
1716
			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1717
			filedesc_unlock = 0;
1718
		}
1719
		kq->kq_state |= KQ_FLUXWAIT;
1720
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1721
		if (fp != NULL) {
1722
			fdrop(fp, td);
1723
			fp = NULL;
1724
		}
1725
		goto findkn;
1726
	}
1727

1728
	/*
1729
	 * kn now contains the matching knote, or NULL if no match
1730
	 */
1731
	if (kn == NULL) {
1732
		if (kev->flags & EV_ADD) {
1733
			kn = tkn;
1734
			tkn = NULL;
1735
			if (kn == NULL) {
1736
				KQ_UNLOCK(kq);
1737
				error = ENOMEM;
1738
				goto done;
1739
			}
1740
			kn->kn_fp = fp;
1741
			kn->kn_kq = kq;
1742
			kn->kn_fop = fops;
1743
			/*
1744
			 * apply reference counts to knote structure, and
1745
			 * do not release it at the end of this routine.
1746
			 */
1747
			fops = NULL;
1748
			fp = NULL;
1749

1750
			kn->kn_sfflags = kev->fflags;
1751
			kn->kn_sdata = kev->data;
1752
			kev->fflags = 0;
1753
			kev->data = 0;
1754
			kn->kn_kevent = *kev;
1755
			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1756
			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
1757
			kn->kn_status = KN_DETACHED;
1758
			if ((kev->flags & EV_DISABLE) != 0)
1759
				kn->kn_status |= KN_DISABLED;
1760
			kn_enter_flux(kn);
1761

1762
			error = knote_attach(kn, kq);
1763
			KQ_UNLOCK(kq);
1764
			if (error != 0) {
1765
				tkn = kn;
1766
				goto done;
1767
			}
1768

1769
			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1770
				knote_drop_detached(kn, td);
1771
				goto done;
1772
			}
1773
			knl = kn_list_lock(kn);
1774
			goto done_ev_add;
1775
		} else {
1776
			/* No matching knote and the EV_ADD flag is not set. */
1777
			KQ_UNLOCK(kq);
1778
			error = ENOENT;
1779
			goto done;
1780
		}
1781
	}
1782

1783
	if (kev->flags & EV_DELETE) {
1784
		kn_enter_flux(kn);
1785
		KQ_UNLOCK(kq);
1786
		knote_drop(kn, td);
1787
		goto done;
1788
	}
1789

1790
	if (kev->flags & EV_FORCEONESHOT) {
1791
		kn->kn_flags |= EV_ONESHOT;
1792
		KNOTE_ACTIVATE(kn, 1);
1793
	}
1794

1795
	if ((kev->flags & EV_ENABLE) != 0)
1796
		kn->kn_status &= ~KN_DISABLED;
1797
	else if ((kev->flags & EV_DISABLE) != 0)
1798
		kn->kn_status |= KN_DISABLED;
1799

1800
	/*
1801
	 * The user may change some filter values after the initial EV_ADD,
1802
	 * but doing so will not reset any filter which has already been
1803
	 * triggered.
1804
	 */
1805
	kn->kn_status |= KN_SCAN;
1806
	kn_enter_flux(kn);
1807
	KQ_UNLOCK(kq);
1808
	knl = kn_list_lock(kn);
1809
	if ((kev->flags & EV_KEEPUDATA) == 0)
1810
		kn->kn_kevent.udata = kev->udata;
1811
	if (!fops->f_isfd && fops->f_touch != NULL) {
1812
		fops->f_touch(kn, kev, EVENT_REGISTER);
1813
	} else {
1814
		kn->kn_sfflags = kev->fflags;
1815
		kn->kn_sdata = kev->data;
1816
	}
1817

1818
done_ev_add:
1819
	/*
1820
	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1821
	 * the initial attach event decides that the event is "completed"
1822
	 * already, e.g., filt_procattach() is called on a zombie process.  It
1823
	 * will call filt_proc() which will remove it from the list, and NULL
1824
	 * kn_knlist.
1825
	 *
1826
	 * KN_DISABLED will be stable while the knote is in flux, so the
1827
	 * unlocked read will not race with an update.
1828
	 */
1829
	if ((kn->kn_status & KN_DISABLED) == 0)
1830
		event = kn->kn_fop->f_event(kn, 0);
1831
	else
1832
		event = 0;
1833

1834
	KQ_LOCK(kq);
1835
	if (event)
1836
		kn->kn_status |= KN_ACTIVE;
1837
	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
1838
	    KN_ACTIVE)
1839
		knote_enqueue(kn);
1840
	kn->kn_status &= ~KN_SCAN;
1841
	kn_leave_flux(kn);
1842
	kn_list_unlock(knl);
1843
	KQ_UNLOCK_FLUX(kq);
1844

1845
done:
1846
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1847
	if (filedesc_unlock)
1848
		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1849
	if (fp != NULL)
1850
		fdrop(fp, td);
1851
	knote_free(tkn);
1852
	if (fops != NULL)
1853
		kqueue_fo_release(filt);
1854
	return (error);
1855
}
1856

1857
static int
1858
kqueue_acquire(struct file *fp, struct kqueue **kqp)
1859
{
1860
	int error;
1861
	struct kqueue *kq;
1862

1863
	error = 0;
1864

1865
	kq = fp->f_data;
1866
	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1867
		return (EINVAL);
1868
	*kqp = kq;
1869
	KQ_LOCK(kq);
1870
	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1871
		KQ_UNLOCK(kq);
1872
		return (EBADF);
1873
	}
1874
	kq->kq_refcnt++;
1875
	KQ_UNLOCK(kq);
1876

1877
	return error;
1878
}
1879

1880
static void
1881
kqueue_release(struct kqueue *kq, int locked)
1882
{
1883
	if (locked)
1884
		KQ_OWNED(kq);
1885
	else
1886
		KQ_LOCK(kq);
1887
	kq->kq_refcnt--;
1888
	if (kq->kq_refcnt == 1)
1889
		wakeup(&kq->kq_refcnt);
1890
	if (!locked)
1891
		KQ_UNLOCK(kq);
1892
}
1893

1894
static void
1895
ast_kqueue(struct thread *td, int tda __unused)
1896
{
1897
	taskqueue_quiesce(taskqueue_kqueue_ctx);
1898
}
1899

1900
static void
1901
kqueue_schedtask(struct kqueue *kq)
1902
{
1903
	KQ_OWNED(kq);
1904
	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1905
	    ("scheduling kqueue task while draining"));
1906

1907
	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1908
		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
1909
		kq->kq_state |= KQ_TASKSCHED;
1910
		ast_sched(curthread, TDA_KQUEUE);
1911
	}
1912
}
1913

1914
/*
1915
 * Expand the kq to make sure we have storage for fops/ident pair.
1916
 *
1917
 * Return 0 on success (or no work necessary), return errno on failure.
1918
 */
1919
static int
1920
kqueue_expand(struct kqueue *kq, const struct filterops *fops, uintptr_t ident,
1921
    int mflag)
1922
{
1923
	struct klist *list, *tmp_knhash, *to_free;
1924
	u_long tmp_knhashmask;
1925
	int error, fd, size;
1926

1927
	KQ_NOTOWNED(kq);
1928

1929
	error = 0;
1930
	to_free = NULL;
1931
	if (fops->f_isfd) {
1932
		fd = ident;
1933
		if (kq->kq_knlistsize <= fd) {
1934
			size = kq->kq_knlistsize;
1935
			while (size <= fd)
1936
				size += KQEXTENT;
1937
			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1938
			if (list == NULL)
1939
				return ENOMEM;
1940
			KQ_LOCK(kq);
1941
			if ((kq->kq_state & KQ_CLOSING) != 0) {
1942
				to_free = list;
1943
				error = EBADF;
1944
			} else if (kq->kq_knlistsize > fd) {
1945
				to_free = list;
1946
			} else {
1947
				if (kq->kq_knlist != NULL) {
1948
					bcopy(kq->kq_knlist, list,
1949
					    kq->kq_knlistsize * sizeof(*list));
1950
					to_free = kq->kq_knlist;
1951
					kq->kq_knlist = NULL;
1952
				}
1953
				bzero((caddr_t)list +
1954
				    kq->kq_knlistsize * sizeof(*list),
1955
				    (size - kq->kq_knlistsize) * sizeof(*list));
1956
				kq->kq_knlistsize = size;
1957
				kq->kq_knlist = list;
1958
			}
1959
			KQ_UNLOCK(kq);
1960
		}
1961
	} else {
1962
		if (kq->kq_knhashmask == 0) {
1963
			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
1964
			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
1965
			    HASH_WAITOK : HASH_NOWAIT);
1966
			if (tmp_knhash == NULL)
1967
				return (ENOMEM);
1968
			KQ_LOCK(kq);
1969
			if ((kq->kq_state & KQ_CLOSING) != 0) {
1970
				to_free = tmp_knhash;
1971
				error = EBADF;
1972
			} else if (kq->kq_knhashmask == 0) {
1973
				kq->kq_knhash = tmp_knhash;
1974
				kq->kq_knhashmask = tmp_knhashmask;
1975
			} else {
1976
				to_free = tmp_knhash;
1977
			}
1978
			KQ_UNLOCK(kq);
1979
		}
1980
	}
1981
	free(to_free, M_KQUEUE);
1982

1983
	KQ_NOTOWNED(kq);
1984
	return (error);
1985
}
1986

1987
static void
1988
kqueue_task(void *arg, int pending)
1989
{
1990
	struct kqueue *kq;
1991
	int haskqglobal;
1992

1993
	haskqglobal = 0;
1994
	kq = arg;
1995

1996
	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1997
	KQ_LOCK(kq);
1998

1999
	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
2000

2001
	kq->kq_state &= ~KQ_TASKSCHED;
2002
	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
2003
		wakeup(&kq->kq_state);
2004
	}
2005
	KQ_UNLOCK(kq);
2006
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2007
}
2008

2009
/*
2010
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
2011
 * We treat KN_MARKER knotes as if they are in flux.
2012
 */
2013
static int
2014
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
2015
    const struct timespec *tsp, struct kevent *keva, struct thread *td)
2016
{
2017
	struct kevent *kevp;
2018
	struct knote *kn, *marker;
2019
	struct knlist *knl;
2020
	sbintime_t asbt, rsbt;
2021
	int count, error, haskqglobal, influx, nkev, touch;
2022

2023
	count = maxevents;
2024
	nkev = 0;
2025
	error = 0;
2026
	haskqglobal = 0;
2027

2028
	if (maxevents == 0)
2029
		goto done_nl;
2030
	if (maxevents < 0) {
2031
		error = EINVAL;
2032
		goto done_nl;
2033
	}
2034

2035
	rsbt = 0;
2036
	if (tsp != NULL) {
2037
		if (!timespecvalid_interval(tsp)) {
2038
			error = EINVAL;
2039
			goto done_nl;
2040
		}
2041
		if (timespecisset(tsp)) {
2042
			if (tsp->tv_sec <= INT32_MAX) {
2043
				rsbt = tstosbt(*tsp);
2044
				if (TIMESEL(&asbt, rsbt))
2045
					asbt += tc_tick_sbt;
2046
				if (asbt <= SBT_MAX - rsbt)
2047
					asbt += rsbt;
2048
				else
2049
					asbt = 0;
2050
				rsbt >>= tc_precexp;
2051
			} else
2052
				asbt = 0;
2053
		} else
2054
			asbt = -1;
2055
	} else
2056
		asbt = 0;
2057
	marker = knote_alloc(M_WAITOK);
2058
	marker->kn_status = KN_MARKER;
2059
	KQ_LOCK(kq);
2060

2061
retry:
2062
	kevp = keva;
2063
	if (kq->kq_count == 0) {
2064
		if (asbt == -1) {
2065
			error = EWOULDBLOCK;
2066
		} else {
2067
			kq->kq_state |= KQ_SLEEP;
2068
			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
2069
			    "kqread", asbt, rsbt, C_ABSOLUTE);
2070
		}
2071
		if (error == 0)
2072
			goto retry;
2073
		/* don't restart after signals... */
2074
		if (error == ERESTART)
2075
			error = EINTR;
2076
		else if (error == EWOULDBLOCK)
2077
			error = 0;
2078
		goto done;
2079
	}
2080

2081
	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
2082
	influx = 0;
2083
	while (count) {
2084
		KQ_OWNED(kq);
2085
		kn = TAILQ_FIRST(&kq->kq_head);
2086

2087
		if ((kn->kn_status == KN_MARKER && kn != marker) ||
2088
		    kn_in_flux(kn)) {
2089
			if (influx) {
2090
				influx = 0;
2091
				KQ_FLUX_WAKEUP(kq);
2092
			}
2093
			kq->kq_state |= KQ_FLUXWAIT;
2094
			error = msleep(kq, &kq->kq_lock, PSOCK,
2095
			    "kqflxwt", 0);
2096
			continue;
2097
		}
2098

2099
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2100
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
2101
			kn->kn_status &= ~KN_QUEUED;
2102
			kq->kq_count--;
2103
			continue;
2104
		}
2105
		if (kn == marker) {
2106
			KQ_FLUX_WAKEUP(kq);
2107
			if (count == maxevents)
2108
				goto retry;
2109
			goto done;
2110
		}
2111
		KASSERT(!kn_in_flux(kn),
2112
		    ("knote %p is unexpectedly in flux", kn));
2113

2114
		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
2115
			kn->kn_status &= ~KN_QUEUED;
2116
			kn_enter_flux(kn);
2117
			kq->kq_count--;
2118
			KQ_UNLOCK(kq);
2119
			/*
2120
			 * We don't need to lock the list since we've
2121
			 * marked it as in flux.
2122
			 */
2123
			knote_drop(kn, td);
2124
			KQ_LOCK(kq);
2125
			continue;
2126
		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
2127
			kn->kn_status &= ~KN_QUEUED;
2128
			kn_enter_flux(kn);
2129
			kq->kq_count--;
2130
			KQ_UNLOCK(kq);
2131
			/*
2132
			 * We don't need to lock the list since we've
2133
			 * marked the knote as being in flux.
2134
			 */
2135
			*kevp = kn->kn_kevent;
2136
			knote_drop(kn, td);
2137
			KQ_LOCK(kq);
2138
			kn = NULL;
2139
		} else {
2140
			kn->kn_status |= KN_SCAN;
2141
			kn_enter_flux(kn);
2142
			KQ_UNLOCK(kq);
2143
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
2144
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
2145
			knl = kn_list_lock(kn);
2146
			if (kn->kn_fop->f_event(kn, 0) == 0) {
2147
				KQ_LOCK(kq);
2148
				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2149
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
2150
				    KN_SCAN);
2151
				kn_leave_flux(kn);
2152
				kq->kq_count--;
2153
				kn_list_unlock(knl);
2154
				influx = 1;
2155
				continue;
2156
			}
2157
			touch = (!kn->kn_fop->f_isfd &&
2158
			    kn->kn_fop->f_touch != NULL);
2159
			if (touch)
2160
				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
2161
			else
2162
				*kevp = kn->kn_kevent;
2163
			KQ_LOCK(kq);
2164
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
2165
			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
2166
				/*
2167
				 * Manually clear knotes who weren't
2168
				 * 'touch'ed.
2169
				 */
2170
				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
2171
					kn->kn_data = 0;
2172
					kn->kn_fflags = 0;
2173
				}
2174
				if (kn->kn_flags & EV_DISPATCH)
2175
					kn->kn_status |= KN_DISABLED;
2176
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
2177
				kq->kq_count--;
2178
			} else
2179
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2180

2181
			kn->kn_status &= ~KN_SCAN;
2182
			kn_leave_flux(kn);
2183
			kn_list_unlock(knl);
2184
			influx = 1;
2185
		}
2186

2187
		/* we are returning a copy to the user */
2188
		kevp++;
2189
		nkev++;
2190
		count--;
2191

2192
		if (nkev == KQ_NEVENTS) {
2193
			influx = 0;
2194
			KQ_UNLOCK_FLUX(kq);
2195
			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2196
			nkev = 0;
2197
			kevp = keva;
2198
			KQ_LOCK(kq);
2199
			if (error)
2200
				break;
2201
		}
2202
	}
2203
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
2204
done:
2205
	KQ_OWNED(kq);
2206
	KQ_UNLOCK_FLUX(kq);
2207
	knote_free(marker);
2208
done_nl:
2209
	KQ_NOTOWNED(kq);
2210
	if (nkev != 0)
2211
		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
2212
	td->td_retval[0] = maxevents - count;
2213
	return (error);
2214
}
2215

2216
/*ARGSUSED*/
2217
static int
2218
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
2219
	struct ucred *active_cred, struct thread *td)
2220
{
2221
	/*
2222
	 * Enabling sigio causes two major problems:
2223
	 * 1) infinite recursion:
2224
	 * Synopsys: kevent is being used to track signals and have FIOASYNC
2225
	 * set.  On receipt of a signal this will cause a kqueue to recurse
2226
	 * into itself over and over.  Sending the sigio causes the kqueue
2227
	 * to become ready, which in turn posts sigio again, forever.
2228
	 * Solution: this can be solved by setting a flag in the kqueue that
2229
	 * we have a SIGIO in progress.
2230
	 * 2) locking problems:
2231
	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
2232
	 * us above the proc and pgrp locks.
2233
	 * Solution: Post a signal using an async mechanism, being sure to
2234
	 * record a generation count in the delivery so that we do not deliver
2235
	 * a signal to the wrong process.
2236
	 *
2237
	 * Note, these two mechanisms are somewhat mutually exclusive!
2238
	 */
2239
#if 0
2240
	struct kqueue *kq;
2241

2242
	kq = fp->f_data;
2243
	switch (cmd) {
2244
	case FIOASYNC:
2245
		if (*(int *)data) {
2246
			kq->kq_state |= KQ_ASYNC;
2247
		} else {
2248
			kq->kq_state &= ~KQ_ASYNC;
2249
		}
2250
		return (0);
2251

2252
	case FIOSETOWN:
2253
		return (fsetown(*(int *)data, &kq->kq_sigio));
2254

2255
	case FIOGETOWN:
2256
		*(int *)data = fgetown(&kq->kq_sigio);
2257
		return (0);
2258
	}
2259
#endif
2260

2261
	return (ENOTTY);
2262
}
2263

2264
/*ARGSUSED*/
2265
static int
2266
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
2267
	struct thread *td)
2268
{
2269
	struct kqueue *kq;
2270
	int revents = 0;
2271
	int error;
2272

2273
	if ((error = kqueue_acquire(fp, &kq)))
2274
		return POLLERR;
2275

2276
	KQ_LOCK(kq);
2277
	if (events & (POLLIN | POLLRDNORM)) {
2278
		if (kq->kq_count) {
2279
			revents |= events & (POLLIN | POLLRDNORM);
2280
		} else {
2281
			selrecord(td, &kq->kq_sel);
2282
			if (SEL_WAITING(&kq->kq_sel))
2283
				kq->kq_state |= KQ_SEL;
2284
		}
2285
	}
2286
	kqueue_release(kq, 1);
2287
	KQ_UNLOCK(kq);
2288
	return (revents);
2289
}
2290

2291
/*ARGSUSED*/
2292
static int
2293
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
2294
{
2295

2296
	bzero((void *)st, sizeof *st);
2297
	/*
2298
	 * We no longer return kq_count because the unlocked value is useless.
2299
	 * If you spent all this time getting the count, why not spend your
2300
	 * syscall better by calling kevent?
2301
	 *
2302
	 * XXX - This is needed for libc_r.
2303
	 */
2304
	st->st_mode = S_IFIFO;
2305
	return (0);
2306
}
2307

2308
static void
2309
kqueue_drain(struct kqueue *kq, struct thread *td)
2310
{
2311
	struct knote *kn;
2312
	int i;
2313

2314
	KQ_LOCK(kq);
2315

2316
	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
2317
	    ("kqueue already closing"));
2318
	kq->kq_state |= KQ_CLOSING;
2319
	if (kq->kq_refcnt > 1)
2320
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
2321

2322
	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
2323

2324
	KASSERT(knlist_empty(&kq->kq_sel.si_note),
2325
	    ("kqueue's knlist not empty"));
2326

2327
	for (i = 0; i < kq->kq_knlistsize; i++) {
2328
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
2329
			if (kn_in_flux(kn)) {
2330
				kq->kq_state |= KQ_FLUXWAIT;
2331
				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
2332
				continue;
2333
			}
2334
			kn_enter_flux(kn);
2335
			KQ_UNLOCK(kq);
2336
			knote_drop(kn, td);
2337
			KQ_LOCK(kq);
2338
		}
2339
	}
2340
	if (kq->kq_knhashmask != 0) {
2341
		for (i = 0; i <= kq->kq_knhashmask; i++) {
2342
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
2343
				if (kn_in_flux(kn)) {
2344
					kq->kq_state |= KQ_FLUXWAIT;
2345
					msleep(kq, &kq->kq_lock, PSOCK,
2346
					       "kqclo2", 0);
2347
					continue;
2348
				}
2349
				kn_enter_flux(kn);
2350
				KQ_UNLOCK(kq);
2351
				knote_drop(kn, td);
2352
				KQ_LOCK(kq);
2353
			}
2354
		}
2355
	}
2356

2357
	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
2358
		kq->kq_state |= KQ_TASKDRAIN;
2359
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
2360
	}
2361

2362
	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2363
		selwakeuppri(&kq->kq_sel, PSOCK);
2364
		if (!SEL_WAITING(&kq->kq_sel))
2365
			kq->kq_state &= ~KQ_SEL;
2366
	}
2367

2368
	KQ_UNLOCK(kq);
2369
}
2370

2371
static void
2372
kqueue_destroy(struct kqueue *kq)
2373
{
2374

2375
	KASSERT(kq->kq_fdp == NULL,
2376
	    ("kqueue still attached to a file descriptor"));
2377
	seldrain(&kq->kq_sel);
2378
	knlist_destroy(&kq->kq_sel.si_note);
2379
	mtx_destroy(&kq->kq_lock);
2380

2381
	if (kq->kq_knhash != NULL)
2382
		free(kq->kq_knhash, M_KQUEUE);
2383
	if (kq->kq_knlist != NULL)
2384
		free(kq->kq_knlist, M_KQUEUE);
2385

2386
	funsetown(&kq->kq_sigio);
2387
}
2388

2389
/*ARGSUSED*/
2390
static int
2391
kqueue_close(struct file *fp, struct thread *td)
2392
{
2393
	struct kqueue *kq = fp->f_data;
2394
	struct filedesc *fdp;
2395
	int error;
2396
	int filedesc_unlock;
2397

2398
	if ((error = kqueue_acquire(fp, &kq)))
2399
		return error;
2400
	kqueue_drain(kq, td);
2401

2402
	/*
2403
	 * We could be called due to the knote_drop() doing fdrop(),
2404
	 * called from kqueue_register().  In this case the global
2405
	 * lock is owned, and filedesc sx is locked before, to not
2406
	 * take the sleepable lock after non-sleepable.
2407
	 */
2408
	fdp = kq->kq_fdp;
2409
	kq->kq_fdp = NULL;
2410
	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
2411
		FILEDESC_XLOCK(fdp);
2412
		filedesc_unlock = 1;
2413
	} else
2414
		filedesc_unlock = 0;
2415
	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
2416
	if (filedesc_unlock)
2417
		FILEDESC_XUNLOCK(fdp);
2418

2419
	kqueue_destroy(kq);
2420
	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
2421
	crfree(kq->kq_cred);
2422
	free(kq, M_KQUEUE);
2423
	fp->f_data = NULL;
2424

2425
	return (0);
2426
}
2427

2428
static int
2429
kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2430
{
2431
	struct kqueue *kq = fp->f_data;
2432

2433
	kif->kf_type = KF_TYPE_KQUEUE;
2434
	kif->kf_un.kf_kqueue.kf_kqueue_addr = (uintptr_t)kq;
2435
	kif->kf_un.kf_kqueue.kf_kqueue_count = kq->kq_count;
2436
	kif->kf_un.kf_kqueue.kf_kqueue_state = kq->kq_state;
2437
	return (0);
2438
}
2439

2440
static void
2441
kqueue_wakeup(struct kqueue *kq)
2442
{
2443
	KQ_OWNED(kq);
2444

2445
	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
2446
		kq->kq_state &= ~KQ_SLEEP;
2447
		wakeup(kq);
2448
	}
2449
	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
2450
		selwakeuppri(&kq->kq_sel, PSOCK);
2451
		if (!SEL_WAITING(&kq->kq_sel))
2452
			kq->kq_state &= ~KQ_SEL;
2453
	}
2454
	if (!knlist_empty(&kq->kq_sel.si_note))
2455
		kqueue_schedtask(kq);
2456
	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
2457
		pgsigio(&kq->kq_sigio, SIGIO, 0);
2458
	}
2459
}
2460

2461
/*
2462
 * Walk down a list of knotes, activating them if their event has triggered.
2463
 *
2464
 * There is a possibility to optimize in the case of one kq watching another.
2465
 * Instead of scheduling a task to wake it up, you could pass enough state
2466
 * down the chain to make up the parent kqueue.  Make this code functional
2467
 * first.
2468
 */
2469
void
2470
knote(struct knlist *list, long hint, int lockflags)
2471
{
2472
	struct kqueue *kq;
2473
	struct knote *kn, *tkn;
2474
	int error;
2475

2476
	if (list == NULL)
2477
		return;
2478

2479
	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
2480

2481
	if ((lockflags & KNF_LISTLOCKED) == 0)
2482
		list->kl_lock(list->kl_lockarg); 
2483

2484
	/*
2485
	 * If we unlock the list lock (and enter influx), we can
2486
	 * eliminate the kqueue scheduling, but this will introduce
2487
	 * four lock/unlock's for each knote to test.  Also, marker
2488
	 * would be needed to keep iteration position, since filters
2489
	 * or other threads could remove events.
2490
	 */
2491
	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
2492
		kq = kn->kn_kq;
2493
		KQ_LOCK(kq);
2494
		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
2495
			/*
2496
			 * Do not process the influx notes, except for
2497
			 * the influx coming from the kq unlock in the
2498
			 * kqueue_scan().  In the later case, we do
2499
			 * not interfere with the scan, since the code
2500
			 * fragment in kqueue_scan() locks the knlist,
2501
			 * and cannot proceed until we finished.
2502
			 */
2503
			KQ_UNLOCK(kq);
2504
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2505
			kn_enter_flux(kn);
2506
			KQ_UNLOCK(kq);
2507
			error = kn->kn_fop->f_event(kn, hint);
2508
			KQ_LOCK(kq);
2509
			kn_leave_flux(kn);
2510
			if (error)
2511
				KNOTE_ACTIVATE(kn, 1);
2512
			KQ_UNLOCK_FLUX(kq);
2513
		} else {
2514
			if (kn->kn_fop->f_event(kn, hint))
2515
				KNOTE_ACTIVATE(kn, 1);
2516
			KQ_UNLOCK(kq);
2517
		}
2518
	}
2519
	if ((lockflags & KNF_LISTLOCKED) == 0)
2520
		list->kl_unlock(list->kl_lockarg); 
2521
}
2522

2523
/*
2524
 * add a knote to a knlist
2525
 */
2526
void
2527
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2528
{
2529

2530
	KNL_ASSERT_LOCK(knl, islocked);
2531
	KQ_NOTOWNED(kn->kn_kq);
2532
	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
2533
	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2534
	    ("knote %p was not detached", kn));
2535
	if (!islocked)
2536
		knl->kl_lock(knl->kl_lockarg);
2537
	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2538
	if (!islocked)
2539
		knl->kl_unlock(knl->kl_lockarg);
2540
	KQ_LOCK(kn->kn_kq);
2541
	kn->kn_knlist = knl;
2542
	kn->kn_status &= ~KN_DETACHED;
2543
	KQ_UNLOCK(kn->kn_kq);
2544
}
2545

2546
static void
2547
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
2548
    int kqislocked)
2549
{
2550

2551
	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
2552
	KNL_ASSERT_LOCK(knl, knlislocked);
2553
	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2554
	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
2555
	KASSERT((kn->kn_status & KN_DETACHED) == 0,
2556
	    ("knote %p was already detached", kn));
2557
	if (!knlislocked)
2558
		knl->kl_lock(knl->kl_lockarg);
2559
	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2560
	kn->kn_knlist = NULL;
2561
	if (!knlislocked)
2562
		kn_list_unlock(knl);
2563
	if (!kqislocked)
2564
		KQ_LOCK(kn->kn_kq);
2565
	kn->kn_status |= KN_DETACHED;
2566
	if (!kqislocked)
2567
		KQ_UNLOCK(kn->kn_kq);
2568
}
2569

2570
/*
 * Remove a knote from the specified knlist.  islocked tells whether the
 * caller already holds the knlist lock; the knote's kq lock is always
 * treated as not held.
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
	const int kq_is_locked = 0;

	knlist_remove_kq(knl, kn, islocked, kq_is_locked);
}
2579

2580
int
2581
knlist_empty(struct knlist *knl)
2582
{
2583

2584
	KNL_ASSERT_LOCKED(knl);
2585
	return (SLIST_EMPTY(&knl->kl_list));
2586
}
2587

2588
static struct mtx knlist_lock;
2589
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2590
    MTX_DEF);
2591
static void knlist_mtx_lock(void *arg);
2592
static void knlist_mtx_unlock(void *arg);
2593

2594
/* Default knlist lock callback: arg is the struct mtx to acquire. */
static void
knlist_mtx_lock(void *arg)
{
	struct mtx *m;

	m = arg;
	mtx_lock(m);
}
2600

2601
/* Default knlist unlock callback: arg is the struct mtx to release. */
static void
knlist_mtx_unlock(void *arg)
{
	struct mtx *m;

	m = arg;
	mtx_unlock(m);
}
2607

2608
static void
2609
knlist_mtx_assert_lock(void *arg, int what)
2610
{
2611

2612
	if (what == LA_LOCKED)
2613
		mtx_assert((struct mtx *)arg, MA_OWNED);
2614
	else
2615
		mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2616
}
2617

2618
void
2619
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2620
    void (*kl_unlock)(void *),
2621
    void (*kl_assert_lock)(void *, int))
2622
{
2623

2624
	if (lock == NULL)
2625
		knl->kl_lockarg = &knlist_lock;
2626
	else
2627
		knl->kl_lockarg = lock;
2628

2629
	if (kl_lock == NULL)
2630
		knl->kl_lock = knlist_mtx_lock;
2631
	else
2632
		knl->kl_lock = kl_lock;
2633
	if (kl_unlock == NULL)
2634
		knl->kl_unlock = knlist_mtx_unlock;
2635
	else
2636
		knl->kl_unlock = kl_unlock;
2637
	if (kl_assert_lock == NULL)
2638
		knl->kl_assert_lock = knlist_mtx_assert_lock;
2639
	else
2640
		knl->kl_assert_lock = kl_assert_lock;
2641

2642
	knl->kl_autodestroy = 0;
2643
	SLIST_INIT(&knl->kl_list);
2644
}
2645

2646
void
2647
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2648
{
2649

2650
	knlist_init(knl, lock, NULL, NULL, NULL);
2651
}
2652

2653
struct knlist *
2654
knlist_alloc(struct mtx *lock)
2655
{
2656
	struct knlist *knl;
2657

2658
	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
2659
	knlist_init_mtx(knl, lock);
2660
	return (knl);
2661
}
2662

2663
/*
 * Tear down a knlist.  Nothing is released here; the function only
 * asserts that no knotes remain on the list.
 */
void
knlist_destroy(struct knlist *knl)
{

	KASSERT(KNLIST_EMPTY(knl),
	    ("destroying knlist %p with knotes on it", knl));
}
2670

2671
void
2672
knlist_detach(struct knlist *knl)
2673
{
2674

2675
	KNL_ASSERT_LOCKED(knl);
2676
	knl->kl_autodestroy = 1;
2677
	if (knlist_empty(knl)) {
2678
		knlist_destroy(knl);
2679
		free(knl, M_KQUEUE);
2680
	}
2681
}
2682

2683
/*
2684
 * Even if we are locked, we may need to drop the lock to allow any influx
2685
 * knotes time to "settle".
2686
 */
2687
void
2688
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2689
{
2690
	struct knote *kn, *kn2;
2691
	struct kqueue *kq;
2692

2693
	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
2694
	if (islocked)
2695
		KNL_ASSERT_LOCKED(knl);
2696
	else {
2697
		KNL_ASSERT_UNLOCKED(knl);
2698
again:		/* need to reacquire lock since we have dropped it */
2699
		knl->kl_lock(knl->kl_lockarg);
2700
	}
2701

2702
	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2703
		kq = kn->kn_kq;
2704
		KQ_LOCK(kq);
2705
		if (kn_in_flux(kn)) {
2706
			KQ_UNLOCK(kq);
2707
			continue;
2708
		}
2709
		knlist_remove_kq(knl, kn, 1, 1);
2710
		if (killkn) {
2711
			kn_enter_flux(kn);
2712
			KQ_UNLOCK(kq);
2713
			knote_drop_detached(kn, td);
2714
		} else {
2715
			/* Make sure cleared knotes disappear soon */
2716
			kn->kn_flags |= EV_EOF | EV_ONESHOT;
2717
			KQ_UNLOCK(kq);
2718
		}
2719
		kq = NULL;
2720
	}
2721

2722
	if (!SLIST_EMPTY(&knl->kl_list)) {
2723
		/* there are still in flux knotes remaining */
2724
		kn = SLIST_FIRST(&knl->kl_list);
2725
		kq = kn->kn_kq;
2726
		KQ_LOCK(kq);
2727
		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
2728
		knl->kl_unlock(knl->kl_lockarg);
2729
		kq->kq_state |= KQ_FLUXWAIT;
2730
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2731
		kq = NULL;
2732
		goto again;
2733
	}
2734

2735
	if (islocked)
2736
		KNL_ASSERT_LOCKED(knl);
2737
	else {
2738
		knl->kl_unlock(knl->kl_lockarg);
2739
		KNL_ASSERT_UNLOCKED(knl);
2740
	}
2741
}
2742

2743
/*
2744
 * Remove all knotes referencing a specified fd must be called with FILEDESC
2745
 * lock.  This prevents a race where a new fd comes along and occupies the
2746
 * entry and we attach a knote to the fd.
2747
 */
2748
void
2749
knote_fdclose(struct thread *td, int fd)
2750
{
2751
	struct filedesc *fdp = td->td_proc->p_fd;
2752
	struct kqueue *kq;
2753
	struct knote *kn;
2754
	int influx;
2755

2756
	FILEDESC_XLOCK_ASSERT(fdp);
2757

2758
	/*
2759
	 * We shouldn't have to worry about new kevents appearing on fd
2760
	 * since filedesc is locked.
2761
	 */
2762
	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2763
		KQ_LOCK(kq);
2764

2765
again:
2766
		influx = 0;
2767
		while (kq->kq_knlistsize > fd &&
2768
		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2769
			if (kn_in_flux(kn)) {
2770
				/* someone else might be waiting on our knote */
2771
				if (influx)
2772
					wakeup(kq);
2773
				kq->kq_state |= KQ_FLUXWAIT;
2774
				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2775
				goto again;
2776
			}
2777
			kn_enter_flux(kn);
2778
			KQ_UNLOCK(kq);
2779
			influx = 1;
2780
			knote_drop(kn, td);
2781
			KQ_LOCK(kq);
2782
		}
2783
		KQ_UNLOCK_FLUX(kq);
2784
	}
2785
}
2786

2787
static int
2788
knote_attach(struct knote *kn, struct kqueue *kq)
2789
{
2790
	struct klist *list;
2791

2792
	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
2793
	KQ_OWNED(kq);
2794

2795
	if ((kq->kq_state & KQ_CLOSING) != 0)
2796
		return (EBADF);
2797
	if (kn->kn_fop->f_isfd) {
2798
		if (kn->kn_id >= kq->kq_knlistsize)
2799
			return (ENOMEM);
2800
		list = &kq->kq_knlist[kn->kn_id];
2801
	} else {
2802
		if (kq->kq_knhash == NULL)
2803
			return (ENOMEM);
2804
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2805
	}
2806
	SLIST_INSERT_HEAD(list, kn, kn_link);
2807
	return (0);
2808
}
2809

2810
static void
2811
knote_drop(struct knote *kn, struct thread *td)
2812
{
2813

2814
	if ((kn->kn_status & KN_DETACHED) == 0)
2815
		kn->kn_fop->f_detach(kn);
2816
	knote_drop_detached(kn, td);
2817
}
2818

2819
static void
2820
knote_drop_detached(struct knote *kn, struct thread *td)
2821
{
2822
	struct kqueue *kq;
2823
	struct klist *list;
2824

2825
	kq = kn->kn_kq;
2826

2827
	KASSERT((kn->kn_status & KN_DETACHED) != 0,
2828
	    ("knote %p still attached", kn));
2829
	KQ_NOTOWNED(kq);
2830

2831
	KQ_LOCK(kq);
2832
	for (;;) {
2833
		KASSERT(kn->kn_influx >= 1,
2834
		    ("knote_drop called on %p with influx %d",
2835
		    kn, kn->kn_influx));
2836
		if (kn->kn_influx == 1)
2837
			break;
2838
		kq->kq_state |= KQ_FLUXWAIT;
2839
		msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2840
	}
2841

2842
	if (kn->kn_fop->f_isfd)
2843
		list = &kq->kq_knlist[kn->kn_id];
2844
	else
2845
		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2846

2847
	if (!SLIST_EMPTY(list))
2848
		SLIST_REMOVE(list, kn, knote, kn_link);
2849
	if (kn->kn_status & KN_QUEUED)
2850
		knote_dequeue(kn);
2851
	KQ_UNLOCK_FLUX(kq);
2852

2853
	if (kn->kn_fop->f_isfd) {
2854
		fdrop(kn->kn_fp, td);
2855
		kn->kn_fp = NULL;
2856
	}
2857
	kqueue_fo_release(kn->kn_kevent.filter);
2858
	kn->kn_fop = NULL;
2859
	knote_free(kn);
2860
}
2861

2862
static void
2863
knote_enqueue(struct knote *kn)
2864
{
2865
	struct kqueue *kq = kn->kn_kq;
2866

2867
	KQ_OWNED(kn->kn_kq);
2868
	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2869

2870
	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2871
	kn->kn_status |= KN_QUEUED;
2872
	kq->kq_count++;
2873
	kqueue_wakeup(kq);
2874
}
2875

2876
static void
2877
knote_dequeue(struct knote *kn)
2878
{
2879
	struct kqueue *kq = kn->kn_kq;
2880

2881
	KQ_OWNED(kn->kn_kq);
2882
	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2883

2884
	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2885
	kn->kn_status &= ~KN_QUEUED;
2886
	kq->kq_count--;
2887
}
2888

2889
static void
2890
knote_init(void)
2891
{
2892

2893
	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2894
	    NULL, NULL, UMA_ALIGN_PTR, 0);
2895
	ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
2896
	prison0.pr_klist = knlist_alloc(&prison0.pr_mtx);
2897
}
2898
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2899

2900
static struct knote *
2901
knote_alloc(int mflag)
2902
{
2903

2904
	return (uma_zalloc(knote_zone, mflag | M_ZERO));
2905
}
2906

2907
static void
2908
knote_free(struct knote *kn)
2909
{
2910

2911
	uma_zfree(knote_zone, kn);
2912
}
2913

2914
/*
2915
 * Register the kev w/ the kq specified by fd.
2916
 */
2917
int
2918
kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
2919
{
2920
	struct kqueue *kq;
2921
	struct file *fp;
2922
	cap_rights_t rights;
2923
	int error;
2924

2925
	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
2926
	    &fp);
2927
	if (error != 0)
2928
		return (error);
2929
	if ((error = kqueue_acquire(fp, &kq)) != 0)
2930
		goto noacquire;
2931

2932
	error = kqueue_register(kq, kev, td, mflag);
2933
	kqueue_release(kq, 0);
2934

2935
noacquire:
2936
	fdrop(fp, td);
2937
	return (error);
2938
}
2939

2940
struct knote_status_export_bit {
2941
	int kn_status_bit;
2942
	int knt_status_bit;
2943
};
2944

2945
#define	ST(name) \
2946
    { .kn_status_bit = KN_##name, .knt_status_bit = KNOTE_STATUS_##name }
2947
static const struct knote_status_export_bit knote_status_export_bits[] = {
2948
	ST(ACTIVE),
2949
	ST(QUEUED),
2950
	ST(DISABLED),
2951
	ST(DETACHED),
2952
	ST(KQUEUE),
2953
};
2954
#undef ST
2955

2956
static int
2957
knote_status_export(int kn_status)
2958
{
2959
	const struct knote_status_export_bit *b;
2960
	unsigned i;
2961
	int res;
2962

2963
	res = 0;
2964
	for (i = 0; i < nitems(knote_status_export_bits); i++) {
2965
		b = &knote_status_export_bits[i];
2966
		if ((kn_status & b->kn_status_bit) != 0)
2967
			res |= b->knt_status_bit;
2968
	}
2969
	return (res);
2970
}
2971

2972
static int
2973
kern_proc_kqueue_report_one(struct sbuf *s, struct proc *p,
2974
    int kq_fd, struct kqueue *kq, struct knote *kn, bool compat32 __unused)
2975
{
2976
	struct kinfo_knote kin;
2977
#ifdef COMPAT_FREEBSD32
2978
	struct kinfo_knote32 kin32;
2979
#endif
2980
	int error;
2981

2982
	if (kn->kn_status == KN_MARKER)
2983
		return (0);
2984

2985
	memset(&kin, 0, sizeof(kin));
2986
	kin.knt_kq_fd = kq_fd;
2987
	memcpy(&kin.knt_event, &kn->kn_kevent, sizeof(struct kevent));
2988
	kin.knt_status = knote_status_export(kn->kn_status);
2989
	kn_enter_flux(kn);
2990
	KQ_UNLOCK_FLUX(kq);
2991
	if (kn->kn_fop->f_userdump != NULL)
2992
		(void)kn->kn_fop->f_userdump(p, kn, &kin);
2993
#ifdef COMPAT_FREEBSD32
2994
	if (compat32) {
2995
		freebsd32_kinfo_knote_to_32(&kin, &kin32);
2996
		error = sbuf_bcat(s, &kin32, sizeof(kin32));
2997
	} else
2998
#endif
2999
		error = sbuf_bcat(s, &kin, sizeof(kin));
3000
	KQ_LOCK(kq);
3001
	kn_leave_flux(kn);
3002
	return (error);
3003
}
3004

3005
static int
3006
kern_proc_kqueue_report(struct sbuf *s, struct proc *p, int kq_fd,
3007
    struct kqueue *kq, bool compat32)
3008
{
3009
	struct knote *kn;
3010
	int error, i;
3011

3012
	error = 0;
3013
	KQ_LOCK(kq);
3014
	for (i = 0; i < kq->kq_knlistsize; i++) {
3015
		SLIST_FOREACH(kn, &kq->kq_knlist[i], kn_link) {
3016
			error = kern_proc_kqueue_report_one(s, p, kq_fd,
3017
			    kq, kn, compat32);
3018
			if (error != 0)
3019
				goto out;
3020
		}
3021
	}
3022
	if (kq->kq_knhashmask == 0)
3023
		goto out;
3024
	for (i = 0; i <= kq->kq_knhashmask; i++) {
3025
		SLIST_FOREACH(kn, &kq->kq_knhash[i], kn_link) {
3026
			error = kern_proc_kqueue_report_one(s, p, kq_fd,
3027
			    kq, kn, compat32);
3028
			if (error != 0)
3029
				goto out;
3030
		}
3031
	}
3032
out:
3033
	KQ_UNLOCK_FLUX(kq);
3034
	return (error);
3035
}
3036

3037
struct kern_proc_kqueues_out1_cb_args {
3038
	struct sbuf *s;
3039
	bool compat32;
3040
};
3041

3042
static int
3043
kern_proc_kqueues_out1_cb(struct proc *p, int fd, struct file *fp, void *arg)
3044
{
3045
	struct kqueue *kq;
3046
	struct kern_proc_kqueues_out1_cb_args *a;
3047

3048
	if (fp->f_type != DTYPE_KQUEUE)
3049
		return (0);
3050
	a = arg;
3051
	kq = fp->f_data;
3052
	return (kern_proc_kqueue_report(a->s, p, fd, kq, a->compat32));
3053
}
3054

3055
static int
3056
kern_proc_kqueues_out1(struct thread *td, struct proc *p, struct sbuf *s,
3057
    bool compat32)
3058
{
3059
	struct kern_proc_kqueues_out1_cb_args a;
3060

3061
	a.s = s;
3062
	a.compat32 = compat32;
3063
	return (fget_remote_foreach(td, p, kern_proc_kqueues_out1_cb, &a));
3064
}
3065

3066
int
3067
kern_proc_kqueues_out(struct proc *p, struct sbuf *sb, size_t maxlen,
3068
    bool compat32)
3069
{
3070
	struct sbuf *s, sm;
3071
	size_t sb_len;
3072
	int error;
3073

3074
	if (maxlen == -1 || maxlen == 0)
3075
		sb_len = 128;
3076
	else
3077
		sb_len = maxlen;
3078
	s = sbuf_new(&sm, NULL, sb_len, maxlen == -1 ? SBUF_AUTOEXTEND :
3079
	    SBUF_FIXEDLEN);
3080
	error = kern_proc_kqueues_out1(curthread, p, s, compat32);
3081
	sbuf_finish(s);
3082
	if (error == 0) {
3083
		sbuf_bcat(sb, sbuf_data(s), MIN(sbuf_len(s), maxlen == -1 ?
3084
		    SIZE_T_MAX : maxlen));
3085
	}
3086
	sbuf_delete(s);
3087
	return (error);
3088
}
3089

3090
static int
3091
sysctl_kern_proc_kqueue_one(struct thread *td, struct sbuf *s, struct proc *p,
3092
    int kq_fd, bool compat32)
3093
{
3094
	struct file *fp;
3095
	struct kqueue *kq;
3096
	int error;
3097

3098
	error = fget_remote(td, p, kq_fd, &fp);
3099
	if (error == 0) {
3100
		if (fp->f_type != DTYPE_KQUEUE) {
3101
			error = EINVAL;
3102
		} else {
3103
			kq = fp->f_data;
3104
			error = kern_proc_kqueue_report(s, p, kq_fd, kq,
3105
			    compat32);
3106
		}
3107
		fdrop(fp, td);
3108
	}
3109
	return (error);
3110
}
3111

3112
static int
3113
sysctl_kern_proc_kqueue(SYSCTL_HANDLER_ARGS)
3114
{
3115
	struct thread *td;
3116
	struct proc *p;
3117
	struct sbuf *s, sm;
3118
	int error, error1, *name;
3119
	bool compat32;
3120

3121
	name = (int *)arg1;
3122
	if ((u_int)arg2 > 2 || (u_int)arg2 == 0)
3123
		return (EINVAL);
3124

3125
	error = pget((pid_t)name[0], PGET_HOLD | PGET_CANDEBUG, &p);
3126
	if (error != 0)
3127
		return (error);
3128

3129
	td = curthread;
3130
#ifdef COMPAT_FREEBSD32
3131
	compat32 = SV_CURPROC_FLAG(SV_ILP32);
3132
#else
3133
	compat32 = false;
3134
#endif
3135

3136
	s = sbuf_new_for_sysctl(&sm, NULL, 0, req);
3137
	if (s == NULL) {
3138
		error = ENOMEM;
3139
		goto out;
3140
	}
3141
	sbuf_clear_flags(s, SBUF_INCLUDENUL);
3142

3143
	if ((u_int)arg2 == 1) {
3144
		error = kern_proc_kqueues_out1(td, p, s, compat32);
3145
	} else {
3146
		error = sysctl_kern_proc_kqueue_one(td, s, p,
3147
		    name[1] /* kq_fd */, compat32);
3148
	}
3149

3150
	error1 = sbuf_finish(s);
3151
	if (error == 0)
3152
		error = error1;
3153
	sbuf_delete(s);
3154

3155
out:
3156
	PRELE(p);
3157
	return (error);
3158
}
3159

3160
static SYSCTL_NODE(_kern_proc, KERN_PROC_KQUEUE, kq,
3161
    CTLFLAG_RD | CTLFLAG_MPSAFE,
3162
    sysctl_kern_proc_kqueue, "KQueue events");
3163

3164
Product

Resources

Company