FreeBSD kernel kern code
kern_event.c
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4  * Copyright (c) 2009 Apple, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$BSDSUniX$");
31 
32 #include "opt_ktrace.h"
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/capability.h>
37 #include <sys/kernel.h>
38 #include <sys/lock.h>
39 #include <sys/mutex.h>
40 #include <sys/proc.h>
41 #include <sys/malloc.h>
42 #include <sys/unistd.h>
43 #include <sys/file.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/fcntl.h>
47 #include <sys/kthread.h>
48 #include <sys/selinfo.h>
49 #include <sys/queue.h>
50 #include <sys/event.h>
51 #include <sys/eventvar.h>
52 #include <sys/poll.h>
53 #include <sys/protosw.h>
54 #include <sys/sigio.h>
55 #include <sys/signalvar.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/stat.h>
59 #include <sys/sysctl.h>
60 #include <sys/sysproto.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/taskqueue.h>
63 #include <sys/uio.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 
68 #include <vm/uma.h>
69 
70 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
71 
72 /*
73  * This lock is used if multiple kq locks are required. This possibly
74  * should be made into a per proc lock.
75  */
76 static struct mtx kq_global;
77 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
78 #define KQ_GLOBAL_LOCK(lck, haslck) do { \
79  if (!haslck) \
80  mtx_lock(lck); \
81  haslck = 1; \
82 } while (0)
83 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
84  if (haslck) \
85  mtx_unlock(lck); \
86  haslck = 0; \
87 } while (0)
88 
89 TASKQUEUE_DEFINE_THREAD(kqueue);
90 
91 static int kevent_copyout(void *arg, struct kevent *kevp, int count);
92 static int kevent_copyin(void *arg, struct kevent *kevp, int count);
93 static int kqueue_register(struct kqueue *kq, struct kevent *kev,
94  struct thread *td, int waitok);
95 static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
96 static void kqueue_release(struct kqueue *kq, int locked);
97 static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
98  uintptr_t ident, int waitok);
99 static void kqueue_task(void *arg, int pending);
100 static int kqueue_scan(struct kqueue *kq, int maxevents,
101  struct kevent_copyops *k_ops,
102  const struct timespec *timeout,
103  struct kevent *keva, struct thread *td);
104 static void kqueue_wakeup(struct kqueue *kq);
105 static struct filterops *kqueue_fo_find(int filt);
106 static void kqueue_fo_release(int filt);
107 
108 static fo_rdwr_t kqueue_read;
109 static fo_rdwr_t kqueue_write;
110 static fo_truncate_t kqueue_truncate;
111 static fo_ioctl_t kqueue_ioctl;
112 static fo_poll_t kqueue_poll;
113 static fo_kqfilter_t kqueue_kqfilter;
114 static fo_stat_t kqueue_stat;
115 static fo_close_t kqueue_close;
116 
117 static struct fileops kqueueops = {
118  .fo_read = kqueue_read,
119  .fo_write = kqueue_write,
120  .fo_truncate = kqueue_truncate,
121  .fo_ioctl = kqueue_ioctl,
122  .fo_poll = kqueue_poll,
123  .fo_kqfilter = kqueue_kqfilter,
124  .fo_stat = kqueue_stat,
125  .fo_close = kqueue_close,
126  .fo_chmod = invfo_chmod,
127  .fo_chown = invfo_chown,
128 };
129 
130 static int knote_attach(struct knote *kn, struct kqueue *kq);
131 static void knote_drop(struct knote *kn, struct thread *td);
132 static void knote_enqueue(struct knote *kn);
133 static void knote_dequeue(struct knote *kn);
134 static void knote_init(void);
135 static struct knote *knote_alloc(int waitok);
136 static void knote_free(struct knote *kn);
137 
138 static void filt_kqdetach(struct knote *kn);
139 static int filt_kqueue(struct knote *kn, long hint);
140 static int filt_procattach(struct knote *kn);
141 static void filt_procdetach(struct knote *kn);
142 static int filt_proc(struct knote *kn, long hint);
143 static int filt_fileattach(struct knote *kn);
144 static void filt_timerexpire(void *knx);
145 static int filt_timerattach(struct knote *kn);
146 static void filt_timerdetach(struct knote *kn);
147 static int filt_timer(struct knote *kn, long hint);
148 static int filt_userattach(struct knote *kn);
149 static void filt_userdetach(struct knote *kn);
150 static int filt_user(struct knote *kn, long hint);
151 static void filt_usertouch(struct knote *kn, struct kevent *kev,
152  u_long type);
153 
154 static struct filterops file_filtops = {
155  .f_isfd = 1,
156  .f_attach = filt_fileattach,
157 };
158 static struct filterops kqread_filtops = {
159  .f_isfd = 1,
160  .f_detach = filt_kqdetach,
161  .f_event = filt_kqueue,
162 };
163 /* XXX - move to kern_proc.c? */
164 static struct filterops proc_filtops = {
165  .f_isfd = 0,
166  .f_attach = filt_procattach,
167  .f_detach = filt_procdetach,
168  .f_event = filt_proc,
169 };
170 static struct filterops timer_filtops = {
171  .f_isfd = 0,
172  .f_attach = filt_timerattach,
173  .f_detach = filt_timerdetach,
174  .f_event = filt_timer,
175 };
176 static struct filterops user_filtops = {
177  .f_attach = filt_userattach,
178  .f_detach = filt_userdetach,
179  .f_event = filt_user,
180  .f_touch = filt_usertouch,
181 };
182 
183 static uma_zone_t knote_zone;
184 static int kq_ncallouts = 0;
185 static int kq_calloutmax = (4 * 1024);
186 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
187  &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
188 
189 /* XXX - ensure not KN_INFLUX?? */
190 #define KNOTE_ACTIVATE(kn, islock) do { \
191  if ((islock)) \
192  mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
193  else \
194  KQ_LOCK((kn)->kn_kq); \
195  (kn)->kn_status |= KN_ACTIVE; \
196  if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
197  knote_enqueue((kn)); \
198  if (!(islock)) \
199  KQ_UNLOCK((kn)->kn_kq); \
200 } while(0)
201 #define KQ_LOCK(kq) do { \
202  mtx_lock(&(kq)->kq_lock); \
203 } while (0)
204 #define KQ_FLUX_WAKEUP(kq) do { \
205  if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
206  (kq)->kq_state &= ~KQ_FLUXWAIT; \
207  wakeup((kq)); \
208  } \
209 } while (0)
210 #define KQ_UNLOCK_FLUX(kq) do { \
211  KQ_FLUX_WAKEUP(kq); \
212  mtx_unlock(&(kq)->kq_lock); \
213 } while (0)
214 #define KQ_UNLOCK(kq) do { \
215  mtx_unlock(&(kq)->kq_lock); \
216 } while (0)
217 #define KQ_OWNED(kq) do { \
218  mtx_assert(&(kq)->kq_lock, MA_OWNED); \
219 } while (0)
220 #define KQ_NOTOWNED(kq) do { \
221  mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
222 } while (0)
223 #define KN_LIST_LOCK(kn) do { \
224  if (kn->kn_knlist != NULL) \
225  kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
226 } while (0)
227 #define KN_LIST_UNLOCK(kn) do { \
228  if (kn->kn_knlist != NULL) \
229  kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
230 } while (0)
231 #define KNL_ASSERT_LOCK(knl, islocked) do { \
232  if (islocked) \
233  KNL_ASSERT_LOCKED(knl); \
234  else \
235  KNL_ASSERT_UNLOCKED(knl); \
236 } while (0)
237 #ifdef INVARIANTS
238 #define KNL_ASSERT_LOCKED(knl) do { \
239  knl->kl_assert_locked((knl)->kl_lockarg); \
240 } while (0)
241 #define KNL_ASSERT_UNLOCKED(knl) do { \
242  knl->kl_assert_unlocked((knl)->kl_lockarg); \
243 } while (0)
244 #else /* !INVARIANTS */
245 #define KNL_ASSERT_LOCKED(knl) do {} while(0)
246 #define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
247 #endif /* INVARIANTS */
248 
249 #define KN_HASHSIZE 64 /* XXX should be tunable */
250 #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
251 
252 static int
253 filt_nullattach(struct knote *kn)
254 {
255 
256  return (ENXIO);
257 };
258 
259 struct filterops null_filtops = {
260  .f_isfd = 0,
261  .f_attach = filt_nullattach,
262 };
263 
264 /* XXX - make SYSINIT to add these, and move into respective modules. */
265 extern struct filterops sig_filtops;
266 extern struct filterops fs_filtops;
267 
268 /*
269  * Table for all system-defined filters.
270  */
271 static struct mtx filterops_lock;
272 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
273  MTX_DEF);
274 static struct {
275  struct filterops *for_fop;
276  int for_refcnt;
277 } sysfilt_ops[EVFILT_SYSCOUNT] = {
278  { &file_filtops }, /* EVFILT_READ */
279  { &file_filtops }, /* EVFILT_WRITE */
280  { &null_filtops }, /* EVFILT_AIO */
281  { &file_filtops }, /* EVFILT_VNODE */
282  { &proc_filtops }, /* EVFILT_PROC */
283  { &sig_filtops }, /* EVFILT_SIGNAL */
284  { &timer_filtops }, /* EVFILT_TIMER */
285  { &null_filtops }, /* former EVFILT_NETDEV */
286  { &fs_filtops }, /* EVFILT_FS */
287  { &null_filtops }, /* EVFILT_LIO */
288  { &user_filtops }, /* EVFILT_USER */
289 };
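/*
 * Illustrative note on the indexing scheme (a sketch, assuming the standard
 * <sys/event.h> filter values): filters are small negative constants, so the
 * table above is indexed with ~filt.  For example:
 *
 *      EVFILT_READ == -1   ->  ~(-1)  == 0    (file_filtops)
 *      EVFILT_PROC == -5   ->  ~(-5)  == 4    (proc_filtops)
 *      EVFILT_USER == -11  ->  ~(-11) == 10   (user_filtops)
 *
 * This is why kqueue_fo_find() and friends reject filt > 0 and dereference
 * sysfilt_ops[~filt].
 */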
290 
291 /*
292  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
293  * method.
294  */
295 static int
296 filt_fileattach(struct knote *kn)
297 {
298 
299  return (fo_kqfilter(kn->kn_fp, kn));
300 }
301 
302 /*ARGSUSED*/
303 static int
304 kqueue_kqfilter(struct file *fp, struct knote *kn)
305 {
306  struct kqueue *kq = kn->kn_fp->f_data;
307 
308  if (kn->kn_filter != EVFILT_READ)
309  return (EINVAL);
310 
311  kn->kn_status |= KN_KQUEUE;
312  kn->kn_fop = &kqread_filtops;
313  knlist_add(&kq->kq_sel.si_note, kn, 0);
314 
315  return (0);
316 }
317 
318 static void
319 filt_kqdetach(struct knote *kn)
320 {
321  struct kqueue *kq = kn->kn_fp->f_data;
322 
323  knlist_remove(&kq->kq_sel.si_note, kn, 0);
324 }
325 
326 /*ARGSUSED*/
327 static int
328 filt_kqueue(struct knote *kn, long hint)
329 {
330  struct kqueue *kq = kn->kn_fp->f_data;
331 
332  kn->kn_data = kq->kq_count;
333  return (kn->kn_data > 0);
334 }
335 
336 /* XXX - move to kern_proc.c? */
337 static int
338 filt_procattach(struct knote *kn)
339 {
340  struct proc *p;
341  int immediate;
342  int error;
343 
344  immediate = 0;
345  p = pfind(kn->kn_id);
346  if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
347  p = zpfind(kn->kn_id);
348  immediate = 1;
349  } else if (p != NULL && (p->p_flag & P_WEXIT)) {
350  immediate = 1;
351  }
352 
353  if (p == NULL)
354  return (ESRCH);
355  if ((error = p_cansee(curthread, p))) {
356  PROC_UNLOCK(p);
357  return (error);
358  }
359 
360  kn->kn_ptr.p_proc = p;
361  kn->kn_flags |= EV_CLEAR; /* automatically set */
362 
363  /*
364  * internal flag indicating registration done by kernel
365  */
366  if (kn->kn_flags & EV_FLAG1) {
367  kn->kn_data = kn->kn_sdata; /* ppid */
368  kn->kn_fflags = NOTE_CHILD;
369  kn->kn_flags &= ~EV_FLAG1;
370  }
371 
372  if (immediate == 0)
373  knlist_add(&p->p_klist, kn, 1);
374 
375  /*
376  * Immediately activate any exit notes if the target process is a
377  * zombie. This is necessary to handle the case where the target
378  * process, e.g. a child, dies before the kevent is registered.
379  */
380  if (immediate && filt_proc(kn, NOTE_EXIT))
381  KNOTE_ACTIVATE(kn, 0);
382 
383  PROC_UNLOCK(p);
384 
385  return (0);
386 }
387 
388 /*
389  * The knote may be attached to a different process, which may exit,
390  * leaving nothing for the knote to be attached to. So when the process
391  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
392  * it will be deleted when read out. However, as part of the knote deletion,
393  * this routine is called, so a check is needed to avoid actually performing
394  * a detach, because the original process does not exist any more.
395  */
396 /* XXX - move to kern_proc.c? */
397 static void
398 filt_procdetach(struct knote *kn)
399 {
400  struct proc *p;
401 
402  p = kn->kn_ptr.p_proc;
403  knlist_remove(&p->p_klist, kn, 0);
404  kn->kn_ptr.p_proc = NULL;
405 }
406 
407 /* XXX - move to kern_proc.c? */
408 static int
409 filt_proc(struct knote *kn, long hint)
410 {
411  struct proc *p = kn->kn_ptr.p_proc;
412  u_int event;
413 
414  /*
415  * mask off extra data
416  */
417  event = (u_int)hint & NOTE_PCTRLMASK;
418 
419  /*
420  * if the user is interested in this event, record it.
421  */
422  if (kn->kn_sfflags & event)
423  kn->kn_fflags |= event;
424 
425  /*
426  * process is gone, so flag the event as finished.
427  */
428  if (event == NOTE_EXIT) {
429  if (!(kn->kn_status & KN_DETACHED))
430  knlist_remove_inevent(&p->p_klist, kn);
431  kn->kn_flags |= (EV_EOF | EV_ONESHOT);
432  kn->kn_ptr.p_proc = NULL;
433  if (kn->kn_fflags & NOTE_EXIT)
434  kn->kn_data = p->p_xstat;
435  if (kn->kn_fflags == 0)
436  kn->kn_flags |= EV_DROP;
437  return (1);
438  }
439 
440  return (kn->kn_fflags != 0);
441 }
442 
443 /*
444  * Called when a process forks. It mostly does the same as knote(),
445  * activating all knotes registered to be activated when the process
446  * forks. Additionally, for each knote attached to the parent, check
447  * whether the user wants to track the new process. If so, attach a new
448  * knote to the child, and immediately report an event with the child's
449  * pid.
450  */
451 void
452 knote_fork(struct knlist *list, int pid)
453 {
454  struct kqueue *kq;
455  struct knote *kn;
456  struct kevent kev;
457  int error;
458 
459  if (list == NULL)
460  return;
461  list->kl_lock(list->kl_lockarg);
462 
463  SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
464  if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
465  continue;
466  kq = kn->kn_kq;
467  KQ_LOCK(kq);
468  if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
469  KQ_UNLOCK(kq);
470  continue;
471  }
472 
473  /*
474  * The same as knote(), activate the event.
475  */
476  if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
477  kn->kn_status |= KN_HASKQLOCK;
478  if (kn->kn_fop->f_event(kn, NOTE_FORK))
479  KNOTE_ACTIVATE(kn, 1);
480  kn->kn_status &= ~KN_HASKQLOCK;
481  KQ_UNLOCK(kq);
482  continue;
483  }
484 
485  /*
486  * The NOTE_TRACK case. In addition to the activation
487  * of the event, we need to register a new event to
488  * track the child. Drop the locks in preparation for
489  * the call to kqueue_register().
490  */
491  kn->kn_status |= KN_INFLUX;
492  KQ_UNLOCK(kq);
493  list->kl_unlock(list->kl_lockarg);
494 
495  /*
496  * Activate existing knote and register a knote with
497  * new process.
498  */
499  kev.ident = pid;
500  kev.filter = kn->kn_filter;
501  kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
502  kev.fflags = kn->kn_sfflags;
503  kev.data = kn->kn_id; /* parent */
504  kev.udata = kn->kn_kevent.udata;/* preserve udata */
505  error = kqueue_register(kq, &kev, NULL, 0);
506  if (error)
507  kn->kn_fflags |= NOTE_TRACKERR;
508  if (kn->kn_fop->f_event(kn, NOTE_FORK))
509  KNOTE_ACTIVATE(kn, 0);
510  KQ_LOCK(kq);
511  kn->kn_status &= ~KN_INFLUX;
512  KQ_UNLOCK_FLUX(kq);
513  list->kl_lock(list->kl_lockarg);
514  }
515  list->kl_unlock(list->kl_lockarg);
516 }
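/*
 * A minimal userland sketch of the NOTE_TRACK path handled above
 * (illustrative only; assumes kq is an existing kqueue descriptor and omits
 * error handling).  Registering a tracking kevent on a parent makes
 * knote_fork() attach a knote to each child and deliver an event carrying
 * the child's pid:
 *
 *      struct kevent kev;
 *      EV_SET(&kev, getpid(), EVFILT_PROC, EV_ADD,
 *          NOTE_TRACK | NOTE_FORK | NOTE_EXIT, 0, NULL);
 *      kevent(kq, &kev, 1, NULL, 0, NULL);
 */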
517 
518 /*
519  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
520  * interval timer support code.
521  */
522 static int
523 timertoticks(intptr_t data)
524 {
525  struct timeval tv;
526  int tticks;
527 
528  tv.tv_sec = data / 1000;
529  tv.tv_usec = (data % 1000) * 1000;
530  tticks = tvtohz(&tv);
531 
532  return tticks;
533 }
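/*
 * Worked example (assuming hz = 1000): kn_sdata = 1500 ms yields
 * tv = { .tv_sec = 1, .tv_usec = 500000 }, and tvtohz() returns roughly
 * 1501 ticks; tvtohz() adds one tick to cover the partial current tick,
 * which is why filt_timerexpire() below subtracts 1 when re-arming.
 */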
534 
535 static void
536 filt_timerexpire(void *knx)
537 {
538  struct knote *kn = knx;
539  struct callout *calloutp;
540 
541  kn->kn_data++;
542  KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
543 
544  /*
545  * timertoticks() uses tvtohz() which always adds 1 to allow
546  * for the time until the next clock interrupt being strictly
547  * less than 1 clock tick. We don't want that here since we
548  * want to appear to be in sync with the clock interrupt even
549  * when we're delayed.
550  */
551  if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
552  calloutp = (struct callout *)kn->kn_hook;
553  callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
554  filt_timerexpire, kn);
555  }
556 }
557 
558 /*
559  * data contains amount of time to sleep, in milliseconds
560  */
561 static int
562 filt_timerattach(struct knote *kn)
563 {
564  struct callout *calloutp;
565 
566  atomic_add_int(&kq_ncallouts, 1);
567 
568  if (kq_ncallouts >= kq_calloutmax) {
569  atomic_add_int(&kq_ncallouts, -1);
570  return (ENOMEM);
571  }
572 
573  kn->kn_flags |= EV_CLEAR; /* automatically set */
574  kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */
575  calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
576  callout_init(calloutp, CALLOUT_MPSAFE);
577  kn->kn_hook = calloutp;
578  callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
579  filt_timerexpire, kn);
580 
581  return (0);
582 }
583 
584 static void
585 filt_timerdetach(struct knote *kn)
586 {
587  struct callout *calloutp;
588 
589  calloutp = (struct callout *)kn->kn_hook;
590  callout_drain(calloutp);
591  free(calloutp, M_KQUEUE);
592  atomic_add_int(&kq_ncallouts, -1);
593  kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */
594 }
595 
596 static int
597 filt_timer(struct knote *kn, long hint)
598 {
599 
600  return (kn->kn_data != 0);
601 }
602 
603 static int
604 filt_userattach(struct knote *kn)
605 {
606 
607  /*
608  * EVFILT_USER knotes are not attached to anything in the kernel.
609  */
610  kn->kn_hook = NULL;
611  if (kn->kn_fflags & NOTE_TRIGGER)
612  kn->kn_hookid = 1;
613  else
614  kn->kn_hookid = 0;
615  return (0);
616 }
617 
618 static void
619 filt_userdetach(__unused struct knote *kn)
620 {
621 
622  /*
623  * EVFILT_USER knotes are not attached to anything in the kernel.
624  */
625 }
626 
627 static int
628 filt_user(struct knote *kn, __unused long hint)
629 {
630 
631  return (kn->kn_hookid);
632 }
633 
634 static void
635 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
636 {
637  u_int ffctrl;
638 
639  switch (type) {
640  case EVENT_REGISTER:
641  if (kev->fflags & NOTE_TRIGGER)
642  kn->kn_hookid = 1;
643 
644  ffctrl = kev->fflags & NOTE_FFCTRLMASK;
645  kev->fflags &= NOTE_FFLAGSMASK;
646  switch (ffctrl) {
647  case NOTE_FFNOP:
648  break;
649 
650  case NOTE_FFAND:
651  kn->kn_sfflags &= kev->fflags;
652  break;
653 
654  case NOTE_FFOR:
655  kn->kn_sfflags |= kev->fflags;
656  break;
657 
658  case NOTE_FFCOPY:
659  kn->kn_sfflags = kev->fflags;
660  break;
661 
662  default:
663  /* XXX Return error? */
664  break;
665  }
666  kn->kn_sdata = kev->data;
667  if (kev->flags & EV_CLEAR) {
668  kn->kn_hookid = 0;
669  kn->kn_data = 0;
670  kn->kn_fflags = 0;
671  }
672  break;
673 
674  case EVENT_PROCESS:
675  *kev = kn->kn_kevent;
676  kev->fflags = kn->kn_sfflags;
677  kev->data = kn->kn_sdata;
678  if (kn->kn_flags & EV_CLEAR) {
679  kn->kn_hookid = 0;
680  kn->kn_data = 0;
681  kn->kn_fflags = 0;
682  }
683  break;
684 
685  default:
686  panic("filt_usertouch() - invalid type (%ld)", type);
687  break;
688  }
689 }
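/*
 * A minimal userland sketch of EVFILT_USER (illustrative only; assumes kq
 * is an existing kqueue descriptor and omits error handling).  One kevent()
 * call adds the event, a later one fires it via NOTE_TRIGGER:
 *
 *      struct kevent kev;
 *      EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *      kevent(kq, &kev, 1, NULL, 0, NULL);
 *      ...
 *      EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *      kevent(kq, &kev, 1, NULL, 0, NULL);
 */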
690 
691 int
692 sys_kqueue(struct thread *td, struct kqueue_args *uap)
693 {
694  struct filedesc *fdp;
695  struct kqueue *kq;
696  struct file *fp;
697  int fd, error;
698 
699  fdp = td->td_proc->p_fd;
700  error = falloc(td, &fp, &fd, 0);
701  if (error)
702  goto done2;
703 
704  /* An extra reference on `fp' has been held for us by falloc(). */
705  kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
706  mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
707  TAILQ_INIT(&kq->kq_head);
708  kq->kq_fdp = fdp;
709  knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
710  TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
711 
712  FILEDESC_XLOCK(fdp);
713  TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
714  FILEDESC_XUNLOCK(fdp);
715 
716  finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
717  fdrop(fp, td);
718 
719  td->td_retval[0] = fd;
720 done2:
721  return (error);
722 }
723 
724 #ifndef _SYS_SYSPROTO_H_
725 struct kevent_args {
726  int fd;
727  const struct kevent *changelist;
728  int nchanges;
729  struct kevent *eventlist;
730  int nevents;
731  const struct timespec *timeout;
732 };
733 #endif
734 int
735 sys_kevent(struct thread *td, struct kevent_args *uap)
736 {
737  struct timespec ts, *tsp;
738  struct kevent_copyops k_ops = { uap,
739  kevent_copyout,
740  kevent_copyin};
741  int error;
742 #ifdef KTRACE
743  struct uio ktruio;
744  struct iovec ktriov;
745  struct uio *ktruioin = NULL;
746  struct uio *ktruioout = NULL;
747 #endif
748 
749  if (uap->timeout != NULL) {
750  error = copyin(uap->timeout, &ts, sizeof(ts));
751  if (error)
752  return (error);
753  tsp = &ts;
754  } else
755  tsp = NULL;
756 
757 #ifdef KTRACE
758  if (KTRPOINT(td, KTR_GENIO)) {
759  ktriov.iov_base = uap->changelist;
760  ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
761  ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
762  .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
763  .uio_td = td };
764  ktruioin = cloneuio(&ktruio);
765  ktriov.iov_base = uap->eventlist;
766  ktriov.iov_len = uap->nevents * sizeof(struct kevent);
767  ktruioout = cloneuio(&ktruio);
768  }
769 #endif
770 
771  error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
772  &k_ops, tsp);
773 
774 #ifdef KTRACE
775  if (ktruioin != NULL) {
776  ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
777  ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
778  ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
779  ktrgenio(uap->fd, UIO_READ, ktruioout, error);
780  }
781 #endif
782 
783  return (error);
784 }
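/*
 * A minimal userland sketch of the syscall pair above (illustrative only;
 * error handling omitted): watch a descriptor fd for readability and block
 * until data arrives.
 *
 *      struct kevent change, event;
 *      int kq = kqueue();
 *      EV_SET(&change, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *      if (kevent(kq, &change, 1, &event, 1, NULL) > 0)
 *              printf("%jd bytes ready on fd %d\n",
 *                  (intmax_t)event.data, (int)event.ident);
 */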
785 
786 /*
787  * Copy 'count' items into the destination list pointed to by uap->eventlist.
788  */
789 static int
790 kevent_copyout(void *arg, struct kevent *kevp, int count)
791 {
792  struct kevent_args *uap;
793  int error;
794 
795  KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
796  uap = (struct kevent_args *)arg;
797 
798  error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
799  if (error == 0)
800  uap->eventlist += count;
801  return (error);
802 }
803 
804 /*
805  * Copy 'count' items from the list pointed to by uap->changelist.
806  */
807 static int
808 kevent_copyin(void *arg, struct kevent *kevp, int count)
809 {
810  struct kevent_args *uap;
811  int error;
812 
813  KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
814  uap = (struct kevent_args *)arg;
815 
816  error = copyin(uap->changelist, kevp, count * sizeof *kevp);
817  if (error == 0)
818  uap->changelist += count;
819  return (error);
820 }
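/*
 * The kevent_copyops indirection used above exists so that other in-kernel
 * callers (compatibility layers, for instance) can drive kern_kevent() with
 * their own copy routines.  A hedged sketch, in which struct kevent32,
 * struct my_kevent_args and kevent32_to_kevent() are hypothetical:
 *
 *      static int
 *      my_kevent_copyin(void *arg, struct kevent *kevp, int count)
 *      {
 *              struct my_kevent_args *a = arg;
 *              struct kevent32 k32;
 *              int error, i;
 *
 *              for (i = 0; i < count; i++) {
 *                      error = copyin(a->changelist + i, &k32, sizeof(k32));
 *                      if (error != 0)
 *                              return (error);
 *                      kevent32_to_kevent(&k32, &kevp[i]);
 *              }
 *              a->changelist += count;
 *              return (0);
 *      }
 */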
821 
822 int
823 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
824  struct kevent_copyops *k_ops, const struct timespec *timeout)
825 {
826  struct kevent keva[KQ_NEVENTS];
827  struct kevent *kevp, *changes;
828  struct kqueue *kq;
829  struct file *fp;
830  int i, n, nerrors, error;
831 
832  if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
833  return (error);
834  if ((error = kqueue_acquire(fp, &kq)) != 0)
835  goto done_norel;
836 
837  nerrors = 0;
838 
839  while (nchanges > 0) {
840  n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
841  error = k_ops->k_copyin(k_ops->arg, keva, n);
842  if (error)
843  goto done;
844  changes = keva;
845  for (i = 0; i < n; i++) {
846  kevp = &changes[i];
847  if (!kevp->filter)
848  continue;
849  kevp->flags &= ~EV_SYSFLAGS;
850  error = kqueue_register(kq, kevp, td, 1);
851  if (error || (kevp->flags & EV_RECEIPT)) {
852  if (nevents != 0) {
853  kevp->flags = EV_ERROR;
854  kevp->data = error;
855  (void) k_ops->k_copyout(k_ops->arg,
856  kevp, 1);
857  nevents--;
858  nerrors++;
859  } else {
860  goto done;
861  }
862  }
863  }
864  nchanges -= n;
865  }
866  if (nerrors) {
867  td->td_retval[0] = nerrors;
868  error = 0;
869  goto done;
870  }
871 
872  error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
873 done:
874  kqueue_release(kq, 0);
875 done_norel:
876  fdrop(fp, td);
877  return (error);
878 }
879 
880 int
881 kqueue_add_filteropts(int filt, struct filterops *filtops)
882 {
883  int error;
884 
885  error = 0;
886  if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
887  printf(
888 "trying to add a filterop that is out of range: %d is beyond %d\n",
889  ~filt, EVFILT_SYSCOUNT);
890  return EINVAL;
891  }
892  mtx_lock(&filterops_lock);
893  if (sysfilt_ops[~filt].for_fop != &null_filtops &&
894  sysfilt_ops[~filt].for_fop != NULL)
895  error = EEXIST;
896  else {
897  sysfilt_ops[~filt].for_fop = filtops;
898  sysfilt_ops[~filt].for_refcnt = 0;
899  }
900  mtx_unlock(&filterops_lock);
901 
902  return (error);
903 }
904 
905 int
906 kqueue_del_filteropts(int filt)
907 {
908  int error;
909 
910  error = 0;
911  if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
912  return EINVAL;
913 
914  mtx_lock(&filterops_lock);
915  if (sysfilt_ops[~filt].for_fop == &null_filtops ||
916  sysfilt_ops[~filt].for_fop == NULL)
917  error = EINVAL;
918  else if (sysfilt_ops[~filt].for_refcnt != 0)
919  error = EBUSY;
920  else {
921  sysfilt_ops[~filt].for_fop = &null_filtops;
922  sysfilt_ops[~filt].for_refcnt = 0;
923  }
924  mtx_unlock(&filterops_lock);
925 
926  return error;
927 }
928 
929 static struct filterops *
930 kqueue_fo_find(int filt)
931 {
932 
933  if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
934  return NULL;
935 
936  mtx_lock(&filterops_lock);
937  sysfilt_ops[~filt].for_refcnt++;
938  if (sysfilt_ops[~filt].for_fop == NULL)
939  sysfilt_ops[~filt].for_fop = &null_filtops;
940  mtx_unlock(&filterops_lock);
941 
942  return sysfilt_ops[~filt].for_fop;
943 }
944 
945 static void
946 kqueue_fo_release(int filt)
947 {
948 
949  if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
950  return;
951 
952  mtx_lock(&filterops_lock);
953  KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
954  ("filter object refcount not valid on release"));
955  sysfilt_ops[~filt].for_refcnt--;
956  mtx_unlock(&filterops_lock);
957 }
958 
959 /*
960  * A ref to kq (obtained via kqueue_acquire) must be held. waitok
961  * determines whether memory allocation may wait; make sure it is 0 if
962  * you hold any mutexes.
963  */
964 static int
965 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
966 {
967  struct filterops *fops;
968  struct file *fp;
969  struct knote *kn, *tkn;
970  int error, filt, event;
971  int haskqglobal, filedesc_unlock;
972 
973  fp = NULL;
974  kn = NULL;
975  error = 0;
976  haskqglobal = 0;
977  filedesc_unlock = 0;
978 
979  filt = kev->filter;
980  fops = kqueue_fo_find(filt);
981  if (fops == NULL)
982  return EINVAL;
983 
984  tkn = knote_alloc(waitok); /* prevent waiting with locks */
985 
986 findkn:
987  if (fops->f_isfd) {
988  KASSERT(td != NULL, ("td is NULL"));
989  error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
990  if (error)
991  goto done;
992 
993  if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
994  kev->ident, 0) != 0) {
995  /* try again */
996  fdrop(fp, td);
997  fp = NULL;
998  error = kqueue_expand(kq, fops, kev->ident, waitok);
999  if (error)
1000  goto done;
1001  goto findkn;
1002  }
1003 
1004  if (fp->f_type == DTYPE_KQUEUE) {
1005  /*
1006  * if we add some intelligence about what we are doing,
1007  * we should be able to support events on ourselves.
1008  * We need to know when we are doing this to prevent
1009  * getting both the knlist lock and the kq lock since
1010  * they are the same thing.
1011  */
1012  if (fp->f_data == kq) {
1013  error = EINVAL;
1014  goto done;
1015  }
1016 
1017  /*
1018  * Pre-lock the filedesc before the global
1019  * lock mutex, see the comment in
1020  * kqueue_close().
1021  */
1022  FILEDESC_XLOCK(td->td_proc->p_fd);
1023  filedesc_unlock = 1;
1024  KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1025  }
1026 
1027  KQ_LOCK(kq);
1028  if (kev->ident < kq->kq_knlistsize) {
1029  SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1030  if (kev->filter == kn->kn_filter)
1031  break;
1032  }
1033  } else {
1034  if ((kev->flags & EV_ADD) == EV_ADD)
1035  kqueue_expand(kq, fops, kev->ident, waitok);
1036 
1037  KQ_LOCK(kq);
1038  if (kq->kq_knhashmask != 0) {
1039  struct klist *list;
1040 
1041  list = &kq->kq_knhash[
1042  KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1043  SLIST_FOREACH(kn, list, kn_link)
1044  if (kev->ident == kn->kn_id &&
1045  kev->filter == kn->kn_filter)
1046  break;
1047  }
1048  }
1049 
1050  /* knote is in the process of changing, wait for it to stabilize. */
1051  if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1052  KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1053  if (filedesc_unlock) {
1054  FILEDESC_XUNLOCK(td->td_proc->p_fd);
1055  filedesc_unlock = 0;
1056  }
1057  kq->kq_state |= KQ_FLUXWAIT;
1058  msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1059  if (fp != NULL) {
1060  fdrop(fp, td);
1061  fp = NULL;
1062  }
1063  goto findkn;
1064  }
1065 
1066  /*
1067  * kn now contains the matching knote, or NULL if no match
1068  */
1069  if (kn == NULL) {
1070  if (kev->flags & EV_ADD) {
1071  kn = tkn;
1072  tkn = NULL;
1073  if (kn == NULL) {
1074  KQ_UNLOCK(kq);
1075  error = ENOMEM;
1076  goto done;
1077  }
1078  kn->kn_fp = fp;
1079  kn->kn_kq = kq;
1080  kn->kn_fop = fops;
1081  /*
1082  * apply reference counts to knote structure, and
1083  * do not release it at the end of this routine.
1084  */
1085  fops = NULL;
1086  fp = NULL;
1087 
1088  kn->kn_sfflags = kev->fflags;
1089  kn->kn_sdata = kev->data;
1090  kev->fflags = 0;
1091  kev->data = 0;
1092  kn->kn_kevent = *kev;
1093  kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1094  EV_ENABLE | EV_DISABLE);
1095  kn->kn_status = KN_INFLUX|KN_DETACHED;
1096 
1097  error = knote_attach(kn, kq);
1098  KQ_UNLOCK(kq);
1099  if (error != 0) {
1100  tkn = kn;
1101  goto done;
1102  }
1103 
1104  if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1105  knote_drop(kn, td);
1106  goto done;
1107  }
1108  KN_LIST_LOCK(kn);
1109  goto done_ev_add;
1110  } else {
1111  /* No matching knote and the EV_ADD flag is not set. */
1112  KQ_UNLOCK(kq);
1113  error = ENOENT;
1114  goto done;
1115  }
1116  }
1117 
1118  if (kev->flags & EV_DELETE) {
1119  kn->kn_status |= KN_INFLUX;
1120  KQ_UNLOCK(kq);
1121  if (!(kn->kn_status & KN_DETACHED))
1122  kn->kn_fop->f_detach(kn);
1123  knote_drop(kn, td);
1124  goto done;
1125  }
1126 
1127  /*
1128  * The user may change some filter values after the initial EV_ADD,
1129  * but doing so will not reset any filter which has already been
1130  * triggered.
1131  */
1132  kn->kn_status |= KN_INFLUX | KN_SCAN;
1133  KQ_UNLOCK(kq);
1134  KN_LIST_LOCK(kn);
1135  kn->kn_kevent.udata = kev->udata;
1136  if (!fops->f_isfd && fops->f_touch != NULL) {
1137  fops->f_touch(kn, kev, EVENT_REGISTER);
1138  } else {
1139  kn->kn_sfflags = kev->fflags;
1140  kn->kn_sdata = kev->data;
1141  }
1142 
1143  /*
1144  * We can get here with kn->kn_knlist == NULL. This can happen when
1145  * the initial attach event decides that the event is "completed"
1146  * already. i.e. filt_procattach is called on a zombie process. It
1147  * will call filt_proc which will remove it from the list, and NULL
1148  * kn_knlist.
1149  */
1150 done_ev_add:
1151  event = kn->kn_fop->f_event(kn, 0);
1152  KQ_LOCK(kq);
1153  if (event)
1154  KNOTE_ACTIVATE(kn, 1);
1155  kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
1156  KN_LIST_UNLOCK(kn);
1157 
1158  if ((kev->flags & EV_DISABLE) &&
1159  ((kn->kn_status & KN_DISABLED) == 0)) {
1160  kn->kn_status |= KN_DISABLED;
1161  }
1162 
1163  if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
1164  kn->kn_status &= ~KN_DISABLED;
1165  if ((kn->kn_status & KN_ACTIVE) &&
1166  ((kn->kn_status & KN_QUEUED) == 0))
1167  knote_enqueue(kn);
1168  }
1169  KQ_UNLOCK_FLUX(kq);
1170 
1171 done:
1172  KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1173  if (filedesc_unlock)
1174  FILEDESC_XUNLOCK(td->td_proc->p_fd);
1175  if (fp != NULL)
1176  fdrop(fp, td);
1177  if (tkn != NULL)
1178  knote_free(tkn);
1179  if (fops != NULL)
1180  kqueue_fo_release(filt);
1181  return (error);
1182 }
1183 
1184 static int
1185 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1186 {
1187  int error;
1188  struct kqueue *kq;
1189 
1190  error = 0;
1191 
1192  kq = fp->f_data;
1193  if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1194  return (EBADF);
1195  *kqp = kq;
1196  KQ_LOCK(kq);
1197  if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1198  KQ_UNLOCK(kq);
1199  return (EBADF);
1200  }
1201  kq->kq_refcnt++;
1202  KQ_UNLOCK(kq);
1203 
1204  return error;
1205 }
1206 
1207 static void
1208 kqueue_release(struct kqueue *kq, int locked)
1209 {
1210  if (locked)
1211  KQ_OWNED(kq);
1212  else
1213  KQ_LOCK(kq);
1214  kq->kq_refcnt--;
1215  if (kq->kq_refcnt == 1)
1216  wakeup(&kq->kq_refcnt);
1217  if (!locked)
1218  KQ_UNLOCK(kq);
1219 }
1220 
1221 static void
1222 kqueue_schedtask(struct kqueue *kq)
1223 {
1224 
1225  KQ_OWNED(kq);
1226  KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1227  ("scheduling kqueue task while draining"));
1228 
1229  if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1230  taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
1231  kq->kq_state |= KQ_TASKSCHED;
1232  }
1233 }
1234 
1235 /*
1236  * Expand the kq to make sure we have storage for fops/ident pair.
1237  *
1238  * Return 0 on success (or no work necessary), return errno on failure.
1239  *
1240  * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
1241  * If kqueue_register is called from a non-fd context, there usually/should
1242  * be no locks held.
1243  */
1244 static int
1245 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1246  int waitok)
1247 {
1248  struct klist *list, *tmp_knhash, *to_free;
1249  u_long tmp_knhashmask;
1250  int size;
1251  int fd;
1252  int mflag = waitok ? M_WAITOK : M_NOWAIT;
1253 
1254  KQ_NOTOWNED(kq);
1255 
1256  to_free = NULL;
1257  if (fops->f_isfd) {
1258  fd = ident;
1259  if (kq->kq_knlistsize <= fd) {
1260  size = kq->kq_knlistsize;
1261  while (size <= fd)
1262  size += KQEXTENT;
1263  list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1264  if (list == NULL)
1265  return ENOMEM;
1266  KQ_LOCK(kq);
1267  if (kq->kq_knlistsize > fd) {
1268  to_free = list;
1269  list = NULL;
1270  } else {
1271  if (kq->kq_knlist != NULL) {
1272  bcopy(kq->kq_knlist, list,
1273  kq->kq_knlistsize * sizeof(*list));
1274  to_free = kq->kq_knlist;
1275  kq->kq_knlist = NULL;
1276  }
1277  bzero((caddr_t)list +
1278  kq->kq_knlistsize * sizeof(*list),
1279  (size - kq->kq_knlistsize) * sizeof(*list));
1280  kq->kq_knlistsize = size;
1281  kq->kq_knlist = list;
1282  }
1283  KQ_UNLOCK(kq);
1284  }
1285  } else {
1286  if (kq->kq_knhashmask == 0) {
1287  tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1288  &tmp_knhashmask);
1289  if (tmp_knhash == NULL)
1290  return ENOMEM;
1291  KQ_LOCK(kq);
1292  if (kq->kq_knhashmask == 0) {
1293  kq->kq_knhash = tmp_knhash;
1294  kq->kq_knhashmask = tmp_knhashmask;
1295  } else {
1296  to_free = tmp_knhash;
1297  }
1298  KQ_UNLOCK(kq);
1299  }
1300  }
1301  free(to_free, M_KQUEUE);
1302 
1303  KQ_NOTOWNED(kq);
1304  return 0;
1305 }
1306 
1307 static void
1308 kqueue_task(void *arg, int pending)
1309 {
1310  struct kqueue *kq;
1311  int haskqglobal;
1312 
1313  haskqglobal = 0;
1314  kq = arg;
1315 
1316  KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1317  KQ_LOCK(kq);
1318 
1319  KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1320 
1321  kq->kq_state &= ~KQ_TASKSCHED;
1322  if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1323  wakeup(&kq->kq_state);
1324  }
1325  KQ_UNLOCK(kq);
1326  KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1327 }
1328 
1329 /*
1330  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1331  * We treat KN_MARKER knotes as if they are INFLUX.
1332  */
1333 static int
1334 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1335  const struct timespec *tsp, struct kevent *keva, struct thread *td)
1336 {
1337  struct kevent *kevp;
1338  struct timeval atv, rtv, ttv;
1339  struct knote *kn, *marker;
1340  int count, timeout, nkev, error, influx;
1341  int haskqglobal, touch;
1342 
1343  count = maxevents;
1344  nkev = 0;
1345  error = 0;
1346  haskqglobal = 0;
1347 
1348  if (maxevents == 0)
1349  goto done_nl;
1350 
1351  if (tsp != NULL) {
1352  TIMESPEC_TO_TIMEVAL(&atv, tsp);
1353  if (itimerfix(&atv)) {
1354  error = EINVAL;
1355  goto done_nl;
1356  }
1357  if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
1358  timeout = -1;
1359  else
1360  timeout = atv.tv_sec > 24 * 60 * 60 ?
1361  24 * 60 * 60 * hz : tvtohz(&atv);
1362  getmicrouptime(&rtv);
1363  timevaladd(&atv, &rtv);
1364  } else {
1365  atv.tv_sec = 0;
1366  atv.tv_usec = 0;
1367  timeout = 0;
1368  }
1369  marker = knote_alloc(1);
1370  if (marker == NULL) {
1371  error = ENOMEM;
1372  goto done_nl;
1373  }
1374  marker->kn_status = KN_MARKER;
1375  KQ_LOCK(kq);
1376  goto start;
1377 
1378 retry:
1379  if (atv.tv_sec || atv.tv_usec) {
1380  getmicrouptime(&rtv);
1381  if (timevalcmp(&rtv, &atv, >=))
1382  goto done;
1383  ttv = atv;
1384  timevalsub(&ttv, &rtv);
1385  timeout = ttv.tv_sec > 24 * 60 * 60 ?
1386  24 * 60 * 60 * hz : tvtohz(&ttv);
1387  }
1388 
1389 start:
1390  kevp = keva;
1391  if (kq->kq_count == 0) {
1392  if (timeout < 0) {
1393  error = EWOULDBLOCK;
1394  } else {
1395  kq->kq_state |= KQ_SLEEP;
1396  error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
1397  "kqread", timeout);
1398  }
1399  if (error == 0)
1400  goto retry;
1401  /* don't restart after signals... */
1402  if (error == ERESTART)
1403  error = EINTR;
1404  else if (error == EWOULDBLOCK)
1405  error = 0;
1406  goto done;
1407  }
1408 
1409  TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1410  influx = 0;
1411  while (count) {
1412  KQ_OWNED(kq);
1413  kn = TAILQ_FIRST(&kq->kq_head);
1414 
1415  if ((kn->kn_status == KN_MARKER && kn != marker) ||
1416  (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1417  if (influx) {
1418  influx = 0;
1419  KQ_FLUX_WAKEUP(kq);
1420  }
1421  kq->kq_state |= KQ_FLUXWAIT;
1422  error = msleep(kq, &kq->kq_lock, PSOCK,
1423  "kqflxwt", 0);
1424  continue;
1425  }
1426 
1427  TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1428  if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1429  kn->kn_status &= ~KN_QUEUED;
1430  kq->kq_count--;
1431  continue;
1432  }
1433  if (kn == marker) {
1434  KQ_FLUX_WAKEUP(kq);
1435  if (count == maxevents)
1436  goto retry;
1437  goto done;
1438  }
1439  KASSERT((kn->kn_status & KN_INFLUX) == 0,
1440  ("KN_INFLUX set when not suppose to be"));
1441 
1442  if ((kn->kn_flags & EV_DROP) == EV_DROP) {
1443  kn->kn_status &= ~KN_QUEUED;
1444  kn->kn_status |= KN_INFLUX;
1445  kq->kq_count--;
1446  KQ_UNLOCK(kq);
1447  /*
1448  * We don't need to lock the list since we've marked
1449  * it _INFLUX.
1450  */
1451  if (!(kn->kn_status & KN_DETACHED))
1452  kn->kn_fop->f_detach(kn);
1453  knote_drop(kn, td);
1454  KQ_LOCK(kq);
1455  continue;
1456  } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1457  kn->kn_status &= ~KN_QUEUED;
1458  kn->kn_status |= KN_INFLUX;
1459  kq->kq_count--;
1460  KQ_UNLOCK(kq);
1461  /*
1462  * We don't need to lock the list since we've marked
1463  * it _INFLUX.
1464  */
1465  *kevp = kn->kn_kevent;
1466  if (!(kn->kn_status & KN_DETACHED))
1467  kn->kn_fop->f_detach(kn);
1468  knote_drop(kn, td);
1469  KQ_LOCK(kq);
1470  kn = NULL;
1471  } else {
1472  kn->kn_status |= KN_INFLUX | KN_SCAN;
1473  KQ_UNLOCK(kq);
1474  if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1475  KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1476  KN_LIST_LOCK(kn);
1477  if (kn->kn_fop->f_event(kn, 0) == 0) {
1478  KQ_LOCK(kq);
1479  KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1480  kn->kn_status &=
1481  ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
1482  KN_SCAN);
1483  kq->kq_count--;
1484  KN_LIST_UNLOCK(kn);
1485  influx = 1;
1486  continue;
1487  }
1488  touch = (!kn->kn_fop->f_isfd &&
1489  kn->kn_fop->f_touch != NULL);
1490  if (touch)
1491  kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
1492  else
1493  *kevp = kn->kn_kevent;
1494  KQ_LOCK(kq);
1495  KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1496  if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
1497  /*
1498  * Manually clear knotes that weren't
1499  * 'touch'ed.
1500  */
1501  if (touch == 0 && kn->kn_flags & EV_CLEAR) {
1502  kn->kn_data = 0;
1503  kn->kn_fflags = 0;
1504  }
1505  if (kn->kn_flags & EV_DISPATCH)
1506  kn->kn_status |= KN_DISABLED;
1507  kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1508  kq->kq_count--;
1509  } else
1510  TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1511 
1512  kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
1513  KN_LIST_UNLOCK(kn);
1514  influx = 1;
1515  }
1516 
1517  /* we are returning a copy to the user */
1518  kevp++;
1519  nkev++;
1520  count--;
1521 
1522  if (nkev == KQ_NEVENTS) {
1523  influx = 0;
1524  KQ_UNLOCK_FLUX(kq);
1525  error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1526  nkev = 0;
1527  kevp = keva;
1528  KQ_LOCK(kq);
1529  if (error)
1530  break;
1531  }
1532  }
1533  TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1534 done:
1535  KQ_OWNED(kq);
1536  KQ_UNLOCK_FLUX(kq);
1537  knote_free(marker);
1538 done_nl:
1539  KQ_NOTOWNED(kq);
1540  if (nkev != 0)
1541  error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1542  td->td_retval[0] = maxevents - count;
1543  return (error);
1544 }
1545 
1546 /*
1547  * XXX
1548  * This could be expanded to call kqueue_scan, if desired.
1549  */
1550 /*ARGSUSED*/
1551 static int
1552 kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
1553  int flags, struct thread *td)
1554 {
1555  return (ENXIO);
1556 }
1557 
1558 /*ARGSUSED*/
1559 static int
1560 kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
1561  int flags, struct thread *td)
1562 {
1563  return (ENXIO);
1564 }
1565 
1566 /*ARGSUSED*/
1567 static int
1568 kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1569  struct thread *td)
1570 {
1571 
1572  return (EINVAL);
1573 }
1574 
1575 /*ARGSUSED*/
1576 static int
1577 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1578  struct ucred *active_cred, struct thread *td)
1579 {
1580  /*
1581  * Enabling sigio causes two major problems:
1582  * 1) infinite recursion:
1583  * Synopsis: kevent is being used to track signals and has FIOASYNC
1584  * set. On receipt of a signal this will cause a kqueue to recurse
1585  * into itself over and over. Sending the sigio causes the kqueue
1586  * to become ready, which in turn posts sigio again, forever.
1587  * Solution: this can be solved by setting a flag in the kqueue that
1588  * we have a SIGIO in progress.
1589  * 2) locking problems:
1590  * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
1591  * us above the proc and pgrp locks.
1592  * Solution: Post a signal using an async mechanism, being sure to
1593  * record a generation count in the delivery so that we do not deliver
1594  * a signal to the wrong process.
1595  *
1596  * Note, these two mechanisms are somewhat mutually exclusive!
1597  */
1598 #if 0
1599  struct kqueue *kq;
1600 
1601  kq = fp->f_data;
1602  switch (cmd) {
1603  case FIOASYNC:
1604  if (*(int *)data) {
1605  kq->kq_state |= KQ_ASYNC;
1606  } else {
1607  kq->kq_state &= ~KQ_ASYNC;
1608  }
1609  return (0);
1610 
1611  case FIOSETOWN:
1612  return (fsetown(*(int *)data, &kq->kq_sigio));
1613 
1614  case FIOGETOWN:
1615  *(int *)data = fgetown(&kq->kq_sigio);
1616  return (0);
1617  }
1618 #endif
1619 
1620  return (ENOTTY);
1621 }
1622 
1623 /*ARGSUSED*/
1624 static int
1625 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1626  struct thread *td)
1627 {
1628  struct kqueue *kq;
1629  int revents = 0;
1630  int error;
1631 
1632  if ((error = kqueue_acquire(fp, &kq)))
1633  return POLLERR;
1634 
1635  KQ_LOCK(kq);
1636  if (events & (POLLIN | POLLRDNORM)) {
1637  if (kq->kq_count) {
1638  revents |= events & (POLLIN | POLLRDNORM);
1639  } else {
1640  selrecord(td, &kq->kq_sel);
1641  if (SEL_WAITING(&kq->kq_sel))
1642  kq->kq_state |= KQ_SEL;
1643  }
1644  }
1645  kqueue_release(kq, 1);
1646  KQ_UNLOCK(kq);
1647  return (revents);
1648 }
1649 
1650 /*ARGSUSED*/
1651 static int
1652 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1653  struct thread *td)
1654 {
1655 
1656  bzero((void *)st, sizeof *st);
1657  /*
1658  * We no longer return kq_count because the unlocked value is useless.
1659  * If you spent all this time getting the count, why not spend your
1660  * syscall better by calling kevent?
1661  *
1662  * XXX - This is needed for libc_r.
1663  */
1664  st->st_mode = S_IFIFO;
1665  return (0);
1666 }
1667 
1668 /*ARGSUSED*/
1669 static int
1670 kqueue_close(struct file *fp, struct thread *td)
1671 {
1672  struct kqueue *kq = fp->f_data;
1673  struct filedesc *fdp;
1674  struct knote *kn;
1675  int i;
1676  int error;
1677  int filedesc_unlock;
1678 
1679  if ((error = kqueue_acquire(fp, &kq)))
1680  return error;
1681 
1682  filedesc_unlock = 0;
1683  KQ_LOCK(kq);
1684 
1685  KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1686  ("kqueue already closing"));
1687  kq->kq_state |= KQ_CLOSING;
1688  if (kq->kq_refcnt > 1)
1689  msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1690 
1691  KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1692  fdp = kq->kq_fdp;
1693 
1694  KASSERT(knlist_empty(&kq->kq_sel.si_note),
1695  ("kqueue's knlist not empty"));
1696 
1697  for (i = 0; i < kq->kq_knlistsize; i++) {
1698  while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1699  if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1700  kq->kq_state |= KQ_FLUXWAIT;
1701  msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
1702  continue;
1703  }
1704  kn->kn_status |= KN_INFLUX;
1705  KQ_UNLOCK(kq);
1706  if (!(kn->kn_status & KN_DETACHED))
1707  kn->kn_fop->f_detach(kn);
1708  knote_drop(kn, td);
1709  KQ_LOCK(kq);
1710  }
1711  }
1712  if (kq->kq_knhashmask != 0) {
1713  for (i = 0; i <= kq->kq_knhashmask; i++) {
1714  while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
1715  if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1716  kq->kq_state |= KQ_FLUXWAIT;
1717  msleep(kq, &kq->kq_lock, PSOCK,
1718  "kqclo2", 0);
1719  continue;
1720  }
1721  kn->kn_status |= KN_INFLUX;
1722  KQ_UNLOCK(kq);
1723  if (!(kn->kn_status & KN_DETACHED))
1724  kn->kn_fop->f_detach(kn);
1725  knote_drop(kn, td);
1726  KQ_LOCK(kq);
1727  }
1728  }
1729  }
1730 
1731  if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
1732  kq->kq_state |= KQ_TASKDRAIN;
1733  msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
1734  }
1735 
1736  if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1737  selwakeuppri(&kq->kq_sel, PSOCK);
1738  if (!SEL_WAITING(&kq->kq_sel))
1739  kq->kq_state &= ~KQ_SEL;
1740  }
1741 
1742  KQ_UNLOCK(kq);
1743 
1744  /*
1745  * We could be called due to the knote_drop() doing fdrop(),
1746  * called from kqueue_register(). In this case the global
1747  * lock is owned, and the filedesc sx is locked beforehand, so that we
1748  * do not take the sleepable lock after the non-sleepable one.
1749  */
1750  if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
1751  FILEDESC_XLOCK(fdp);
1752  filedesc_unlock = 1;
1753  } else
1754  filedesc_unlock = 0;
1755  TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
1756  if (filedesc_unlock)
1757  FILEDESC_XUNLOCK(fdp);
1758 
1759  seldrain(&kq->kq_sel);
1760  knlist_destroy(&kq->kq_sel.si_note);
1761  mtx_destroy(&kq->kq_lock);
1762  kq->kq_fdp = NULL;
1763 
1764  if (kq->kq_knhash != NULL)
1765  free(kq->kq_knhash, M_KQUEUE);
1766  if (kq->kq_knlist != NULL)
1767  free(kq->kq_knlist, M_KQUEUE);
1768 
1769  funsetown(&kq->kq_sigio);
1770  free(kq, M_KQUEUE);
1771  fp->f_data = NULL;
1772 
1773  return (0);
1774 }
1775 
1776 static void
1777 kqueue_wakeup(struct kqueue *kq)
1778 {
1779  KQ_OWNED(kq);
1780 
1781  if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1782  kq->kq_state &= ~KQ_SLEEP;
1783  wakeup(kq);
1784  }
1785  if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1786  selwakeuppri(&kq->kq_sel, PSOCK);
1787  if (!SEL_WAITING(&kq->kq_sel))
1788  kq->kq_state &= ~KQ_SEL;
1789  }
1790  if (!knlist_empty(&kq->kq_sel.si_note))
1791  kqueue_schedtask(kq);
1792  if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1793  pgsigio(&kq->kq_sigio, SIGIO, 0);
1794  }
1795 }
1796 
1797 /*
1798  * Walk down a list of knotes, activating them if their event has triggered.
1799  *
1800  * There is a possibility to optimize in the case of one kq watching another.
1801  * Instead of scheduling a task to wake it up, you could pass enough state
1802  * down the chain to make up the parent kqueue. Make this code functional
1803  * first.
1804  */
1805 void
1806 knote(struct knlist *list, long hint, int lockflags)
1807 {
1808  struct kqueue *kq;
1809  struct knote *kn;
1810  int error;
1811 
1812  if (list == NULL)
1813  return;
1814 
1815  KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
1816 
1817  if ((lockflags & KNF_LISTLOCKED) == 0)
1818  list->kl_lock(list->kl_lockarg);
1819 
1820  /*
1821  * If we unlock the list lock (and set KN_INFLUX), we can eliminate
1822  * the kqueue scheduling, but this will introduce four
1823  * lock/unlock's for each knote to test. If we do, continue to use
1824  * SLIST_FOREACH; SLIST_FOREACH_SAFE is not safe in our case, as it is
1825  * only safe if you want to remove the current item, which we are
1826  * not doing.
1827  */
1828  SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
1829  kq = kn->kn_kq;
1830  KQ_LOCK(kq);
1831  if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
1832  /*
1833  * Do not process the influx notes, except for
1834  * the influx coming from the kq unlock in the
1835  * kqueue_scan(). In the latter case, we do
1836  * not interfere with the scan, since the code
1837  * fragment in kqueue_scan() locks the knlist,
1838  * and cannot proceed until we finish.
1839  */
1840  KQ_UNLOCK(kq);
1841  } else if ((lockflags & KNF_NOKQLOCK) != 0) {
1842  kn->kn_status |= KN_INFLUX;
1843  KQ_UNLOCK(kq);
1844  error = kn->kn_fop->f_event(kn, hint);
1845  KQ_LOCK(kq);
1846  kn->kn_status &= ~KN_INFLUX;
1847  if (error)
1848  KNOTE_ACTIVATE(kn, 1);
1849  KQ_UNLOCK_FLUX(kq);
1850  } else {
1851  kn->kn_status |= KN_HASKQLOCK;
1852  if (kn->kn_fop->f_event(kn, hint))
1853  KNOTE_ACTIVATE(kn, 1);
1854  kn->kn_status &= ~KN_HASKQLOCK;
1855  KQ_UNLOCK(kq);
1856  }
1857  }
1858  if ((lockflags & KNF_LISTLOCKED) == 0)
1859  list->kl_unlock(list->kl_lockarg);
1860 }
1861 
1862 /*
1863  * add a knote to a knlist
1864  */
1865 void
1866 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
1867 {
1868  KNL_ASSERT_LOCK(knl, islocked);
1869  KQ_NOTOWNED(kn->kn_kq);
1870  KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
1871  (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
1872  if (!islocked)
1873  knl->kl_lock(knl->kl_lockarg);
1874  SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
1875  if (!islocked)
1876  knl->kl_unlock(knl->kl_lockarg);
1877  KQ_LOCK(kn->kn_kq);
1878  kn->kn_knlist = knl;
1879  kn->kn_status &= ~KN_DETACHED;
1880  KQ_UNLOCK(kn->kn_kq);
1881 }
1882 
1883 static void
1884 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
1885 {
1886  KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
1887  KNL_ASSERT_LOCK(knl, knlislocked);
1888  mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
1889  if (!kqislocked)
1890  KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
1891  ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
1892  if (!knlislocked)
1893  knl->kl_lock(knl->kl_lockarg);
1894  SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
1895  kn->kn_knlist = NULL;
1896  if (!knlislocked)
1897  knl->kl_unlock(knl->kl_lockarg);
1898  if (!kqislocked)
1899  KQ_LOCK(kn->kn_kq);
1900  kn->kn_status |= KN_DETACHED;
1901  if (!kqislocked)
1902  KQ_UNLOCK(kn->kn_kq);
1903 }
1904 
1905 /*
1906  * remove a knote from the specified knlist
1907  */
1908 void
1909 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
1910 {
1911 
1912  knlist_remove_kq(knl, kn, islocked, 0);
1913 }
1914 
1915 /*
1916  * remove knote from a specified klist while in f_event handler.
1917  */
1918 void
1919 knlist_remove_inevent(struct knlist *knl, struct knote *kn)
1920 {
1921 
1922  knlist_remove_kq(knl, kn, 1,
1923  (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
1924 }
1925 
1926 int
1927 knlist_empty(struct knlist *knl)
1928 {
1929  KNL_ASSERT_LOCKED(knl);
1930  return SLIST_EMPTY(&knl->kl_list);
1931 }
1932 
1933 static struct mtx knlist_lock;
1934 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
1935  MTX_DEF);
1936 static void knlist_mtx_lock(void *arg);
1937 static void knlist_mtx_unlock(void *arg);
1938 
1939 static void
1940 knlist_mtx_lock(void *arg)
1941 {
1942  mtx_lock((struct mtx *)arg);
1943 }
1944 
1945 static void
1946 knlist_mtx_unlock(void *arg)
1947 {
1948  mtx_unlock((struct mtx *)arg);
1949 }
1950 
1951 static void
1952 knlist_mtx_assert_locked(void *arg)
1953 {
1954  mtx_assert((struct mtx *)arg, MA_OWNED);
1955 }
1956 
1957 static void
1958 knlist_mtx_assert_unlocked(void *arg)
1959 {
1960  mtx_assert((struct mtx *)arg, MA_NOTOWNED);
1961 }
1962 
1963 void
1964 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
1965  void (*kl_unlock)(void *),
1966  void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
1967 {
1968 
1969  if (lock == NULL)
1970  knl->kl_lockarg = &knlist_lock;
1971  else
1972  knl->kl_lockarg = lock;
1973 
1974  if (kl_lock == NULL)
1975  knl->kl_lock = knlist_mtx_lock;
1976  else
1977  knl->kl_lock = kl_lock;
1978  if (kl_unlock == NULL)
1979  knl->kl_unlock = knlist_mtx_unlock;
1980  else
1981  knl->kl_unlock = kl_unlock;
1982  if (kl_assert_locked == NULL)
1983  knl->kl_assert_locked = knlist_mtx_assert_locked;
1984  else
1985  knl->kl_assert_locked = kl_assert_locked;
1986  if (kl_assert_unlocked == NULL)
1987  knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
1988  else
1989  knl->kl_assert_unlocked = kl_assert_unlocked;
1990 
1991  SLIST_INIT(&knl->kl_list);
1992 }
1993 
1994 void
1995 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
1996 {
1997 
1998  knlist_init(knl, lock, NULL, NULL, NULL, NULL);
1999 }
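/*
 * A minimal sketch of how a driver typically uses this API (illustrative
 * only; sc and its members are hypothetical):
 *
 *      mtx_init(&sc->sc_mtx, "mydev", NULL, MTX_DEF);
 *      knlist_init_mtx(&sc->sc_rsel.si_note, &sc->sc_mtx);
 *
 *      (on data arrival, with sc_mtx held)
 *      knote(&sc->sc_rsel.si_note, 0, KNF_LISTLOCKED);
 *
 *      (on detach)
 *      knlist_clear(&sc->sc_rsel.si_note, 0);
 *      knlist_destroy(&sc->sc_rsel.si_note);
 */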
2000 
2001 void
2002 knlist_destroy(struct knlist *knl)
2003 {
2004 
2005 #ifdef INVARIANTS
2006  /*
2007  * if we run across this error, we need to find the offending
2008  * driver and have it call knlist_clear.
2009  */
2010  if (!SLIST_EMPTY(&knl->kl_list))
2011  printf("WARNING: destroying knlist w/ knotes on it!\n");
2012 #endif
2013 
2014  knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
2015  SLIST_INIT(&knl->kl_list);
2016 }
2017 
2018 /*
2019  * Even if we are locked, we may need to drop the lock to allow any influx
2020  * knotes time to "settle".
2021  */
2022 void
2023 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2024 {
2025  struct knote *kn, *kn2;
2026  struct kqueue *kq;
2027 
2028  if (islocked)
2029  KNL_ASSERT_LOCKED(knl);
2030  else {
2031  KNL_ASSERT_UNLOCKED(knl);
2032 again: /* need to reacquire lock since we have dropped it */
2033  knl->kl_lock(knl->kl_lockarg);
2034  }
2035 
2036  SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2037  kq = kn->kn_kq;
2038  KQ_LOCK(kq);
2039  if ((kn->kn_status & KN_INFLUX)) {
2040  KQ_UNLOCK(kq);
2041  continue;
2042  }
2043  knlist_remove_kq(knl, kn, 1, 1);
2044  if (killkn) {
2045  kn->kn_status |= KN_INFLUX | KN_DETACHED;
2046  KQ_UNLOCK(kq);
2047  knote_drop(kn, td);
2048  } else {
2049  /* Make sure cleared knotes disappear soon */
2050  kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2051  KQ_UNLOCK(kq);
2052  }
2053  kq = NULL;
2054  }
2055 
2056  if (!SLIST_EMPTY(&knl->kl_list)) {
2057  /* there are still KN_INFLUX remaining */
2058  kn = SLIST_FIRST(&knl->kl_list);
2059  kq = kn->kn_kq;
2060  KQ_LOCK(kq);
2061  KASSERT(kn->kn_status & KN_INFLUX,
2062  ("knote removed w/o list lock"));
2063  knl->kl_unlock(knl->kl_lockarg);
2064  kq->kq_state |= KQ_FLUXWAIT;
2065  msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2066  kq = NULL;
2067  goto again;
2068  }
2069 
2070  if (islocked)
2071  KNL_ASSERT_LOCKED(knl);
2072  else {
2073  knl->kl_unlock(knl->kl_lockarg);
2074  KNL_ASSERT_UNLOCKED(knl);
2075  }
2076 }
2077 
2078 /*
2079  * Remove all knotes referencing a specified fd; must be called with the
2080  * FILEDESC lock held. This prevents a race where a new fd comes along,
2081  * occupies the entry, and we attach a knote to it.
2082  */
2083 void
2084 knote_fdclose(struct thread *td, int fd)
2085 {
2086  struct filedesc *fdp = td->td_proc->p_fd;
2087  struct kqueue *kq;
2088  struct knote *kn;
2089  int influx;
2090 
2091  FILEDESC_XLOCK_ASSERT(fdp);
2092 
2093  /*
2094  * We shouldn't have to worry about new kevents appearing on fd
2095  * since filedesc is locked.
2096  */
2097  TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2098  KQ_LOCK(kq);
2099 
2100 again:
2101  influx = 0;
2102  while (kq->kq_knlistsize > fd &&
2103  (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2104  if (kn->kn_status & KN_INFLUX) {
2105  /* someone else might be waiting on our knote */
2106  if (influx)
2107  wakeup(kq);
2108  kq->kq_state |= KQ_FLUXWAIT;
2109  msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2110  goto again;
2111  }
2112  kn->kn_status |= KN_INFLUX;
2113  KQ_UNLOCK(kq);
2114  if (!(kn->kn_status & KN_DETACHED))
2115  kn->kn_fop->f_detach(kn);
2116  knote_drop(kn, td);
2117  influx = 1;
2118  KQ_LOCK(kq);
2119  }
2120  KQ_UNLOCK_FLUX(kq);
2121  }
2122 }
2123 
2124 static int
2125 knote_attach(struct knote *kn, struct kqueue *kq)
2126 {
2127  struct klist *list;
2128 
2129  KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
2130  KQ_OWNED(kq);
2131 
2132  if (kn->kn_fop->f_isfd) {
2133  if (kn->kn_id >= kq->kq_knlistsize)
2134  return ENOMEM;
2135  list = &kq->kq_knlist[kn->kn_id];
2136  } else {
2137  if (kq->kq_knhash == NULL)
2138  return ENOMEM;
2139  list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2140  }
2141 
2142  SLIST_INSERT_HEAD(list, kn, kn_link);
2143 
2144  return 0;
2145 }
2146 
2147 /*
2148  * The knote must already have been detached using the f_detach method.
2149  * No lock needs to be held; it is assumed that the KN_INFLUX flag is set
2150  * to prevent other removal.
2151  */
2152 static void
2153 knote_drop(struct knote *kn, struct thread *td)
2154 {
2155  struct kqueue *kq;
2156  struct klist *list;
2157 
2158  kq = kn->kn_kq;
2159 
2160  KQ_NOTOWNED(kq);
2161  KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
2162  ("knote_drop called without KN_INFLUX set in kn_status"));
2163 
2164  KQ_LOCK(kq);
2165  if (kn->kn_fop->f_isfd)
2166  list = &kq->kq_knlist[kn->kn_id];
2167  else
2168  list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2169 
2170  if (!SLIST_EMPTY(list))
2171  SLIST_REMOVE(list, kn, knote, kn_link);
2172  if (kn->kn_status & KN_QUEUED)
2173  knote_dequeue(kn);
2174  KQ_UNLOCK_FLUX(kq);
2175 
2176  if (kn->kn_fop->f_isfd) {
2177  fdrop(kn->kn_fp, td);
2178  kn->kn_fp = NULL;
2179  }
2180  kqueue_fo_release(kn->kn_kevent.filter);
2181  kn->kn_fop = NULL;
2182  knote_free(kn);
2183 }
2184 
2185 static void
2186 knote_enqueue(struct knote *kn)
2187 {
2188  struct kqueue *kq = kn->kn_kq;
2189 
2190  KQ_OWNED(kn->kn_kq);
2191  KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2192 
2193  TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2194  kn->kn_status |= KN_QUEUED;
2195  kq->kq_count++;
2196  kqueue_wakeup(kq);
2197 }
2198 
2199 static void
2200 knote_dequeue(struct knote *kn)
2201 {
2202  struct kqueue *kq = kn->kn_kq;
2203 
2204  KQ_OWNED(kn->kn_kq);
2205  KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2206 
2207  TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2208  kn->kn_status &= ~KN_QUEUED;
2209  kq->kq_count--;
2210 }
2211 
2212 static void
2213 knote_init(void)
2214 {
2215 
2216  knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2217  NULL, NULL, UMA_ALIGN_PTR, 0);
2218 }
2219 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2220 
2221 static struct knote *
2222 knote_alloc(int waitok)
2223 {
2224  return ((struct knote *)uma_zalloc(knote_zone,
2225  (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
2226 }
2227 
2228 static void
2229 knote_free(struct knote *kn)
2230 {
2231  if (kn != NULL)
2232  uma_zfree(knote_zone, kn);
2233 }
2234 
2235 /*
2236  * Register the kev w/ the kq specified by fd.
2237  */
2238 int
2239 kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
2240 {
2241  struct kqueue *kq;
2242  struct file *fp;
2243  int error;
2244 
2245  if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
2246  return (error);
2247  if ((error = kqueue_acquire(fp, &kq)) != 0)
2248  goto noacquire;
2249 
2250  error = kqueue_register(kq, kev, td, waitok);
2251 
2252  kqueue_release(kq, 0);
2253 
2254 noacquire:
2255  fdrop(fp, td);
2256 
2257  return error;
2258 }
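 
/*
 * Illustrative sketch (not part of this file): how an in-kernel consumer
 * might use kqfd_register() to post a one-shot EVFILT_USER event to a
 * kqueue descriptor handed in from userland.  The descriptor argument
 * "ukqfd" and the identifier MYDEV_EVENT_ID are assumptions made for the
 * example only; a real subsystem would typically register its own filter
 * type instead.
 */
#include <sys/param.h>
#include <sys/event.h>
#include <sys/proc.h>

#define	MYDEV_EVENT_ID	1	/* hypothetical user-event identifier */

static int
mydev_post_event(struct thread *td, int ukqfd)
{
	struct kevent kev;

	EV_SET(&kev, MYDEV_EVENT_ID, EVFILT_USER, EV_ADD | EV_ONESHOT,
	    NOTE_TRIGGER, 0, NULL);
	/* Sleeping is permitted in this context, so pass waitok != 0. */
	return (kqfd_register(ukqfd, &kev, td, 1));
}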