sys_pipe.c
1 /*-
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice immediately at the beginning of the file, without modification,
10  * this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  * notice, this list of conditions and the following disclaimer in the
13  * documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  * John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  * are met.
18  */
19 
20 /*
21  * This file contains a high-performance replacement for the socket-based
22  * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
23  * all features of sockets, but does do everything that pipes normally
24  * do.
25  */
26 
27 /*
28  * This code has two modes of operation, a small write mode and a large
29  * write mode. The small write mode acts like conventional pipes with
30  * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
31  * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
32  * and PIPE_SIZE in size, the sending process pins the underlying pages in
33  * memory, and the receiving process copies directly from these pinned pages
34  * in the sending process.
35  *
36  * If the sending process receives a signal, it is possible that it will
37  * go away, and certainly its address space can change, because control
38  * is returned back to the user-mode side. In that case, the pipe code
39  * arranges to copy the buffer supplied by the user process, to a pageable
40  * kernel buffer, and the receiving process will grab the data from the
41  * pageable kernel buffer. Since signals don't happen all that often,
42  * the copy operation is normally eliminated.
43  *
44  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45  * happen for small transfers so that the system will not spend all of
46  * its time context switching.
47  *
48  * In order to limit the resource use of pipes, two sysctls exist:
49  *
50  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51  * address space available to us in pipe_map. This value is normally
52  * autotuned, but may also be loader tuned.
53  *
54  * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
55  * memory in use by pipes.
56  *
57  * Based on how large pipekva is relative to maxpipekva, the following
58  * will happen:
59  *
60  * 0% - 50%:
61  * New pipes are given 16K of memory backing, pipes may dynamically
62  * grow to as large as 64K where needed.
63  * 50% - 75%:
64  * New pipes are given 4K (or PAGE_SIZE) of memory backing,
65  * existing pipes may NOT grow.
66  * 75% - 100%:
67  * New pipes are given 4K (or PAGE_SIZE) of memory backing,
68  * existing pipes will be shrunk down to 4K whenever possible.
69  *
70  * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If
71  * that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
72  * resize which MUST occur for reverse-direction pipes when they are
73  * first used.
74  *
75  * Additional information about the current state of pipes may be obtained
76  * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
77  * and kern.ipc.piperesizefail.
78  *
79  * Locking rules: There are two locks present here: A mutex, used via
80  * PIPE_LOCK, and a flag, used via pipelock(). All locking is done via
81  * the flag, as mutexes can not persist over uiomove. The mutex
82  * exists only to guard access to the flag, and is not in itself a
83  * locking mechanism. Also note that there is only a single mutex for
84  * both directions of a pipe.
85  *
86  * As pipelock() may have to sleep before it can acquire the flag, it
87  * is important to reread all data after a call to pipelock(); everything
88  * in the structure may have changed.
89  */
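/*
 * Illustrative sketch (not part of sys_pipe.c): a minimal userland
 * program reading the two sysctls described above via the standard
 * sysctlbyname(3) interface.  The program itself is hypothetical.
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	long kva, maxkva;
	size_t len;

	len = sizeof(maxkva);
	if (sysctlbyname("kern.ipc.maxpipekva", &maxkva, &len, NULL, 0) != 0)
		return (1);
	len = sizeof(kva);
	if (sysctlbyname("kern.ipc.pipekva", &kva, &len, NULL, 0) != 0)
		return (1);
	/* Both values are longs, matching the SYSCTL_LONG declarations. */
	printf("pipe KVA in use: %ld of %ld bytes\n", kva, maxkva);
	return (0);
}
#endif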
90 
91 #include <sys/cdefs.h>
92 __FBSDID("$BSDSUniX$");
93 
94 #include <sys/param.h>
95 #include <sys/systm.h>
96 #include <sys/conf.h>
97 #include <sys/fcntl.h>
98 #include <sys/file.h>
99 #include <sys/filedesc.h>
100 #include <sys/filio.h>
101 #include <sys/kernel.h>
102 #include <sys/lock.h>
103 #include <sys/mutex.h>
104 #include <sys/ttycom.h>
105 #include <sys/stat.h>
106 #include <sys/malloc.h>
107 #include <sys/poll.h>
108 #include <sys/selinfo.h>
109 #include <sys/signalvar.h>
110 #include <sys/syscallsubr.h>
111 #include <sys/sysctl.h>
112 #include <sys/sysproto.h>
113 #include <sys/pipe.h>
114 #include <sys/proc.h>
115 #include <sys/vnode.h>
116 #include <sys/uio.h>
117 #include <sys/event.h>
118 
119 #include <security/mac/mac_framework.h>
120 
121 #include <vm/vm.h>
122 #include <vm/vm_param.h>
123 #include <vm/vm_object.h>
124 #include <vm/vm_kern.h>
125 #include <vm/vm_extern.h>
126 #include <vm/pmap.h>
127 #include <vm/vm_map.h>
128 #include <vm/vm_page.h>
129 #include <vm/uma.h>
130 
131 /* XXX */
132 int do_pipe(struct thread *td, int fildes[2], int flags);
133 
134 /*
135  * Use this define if you want to disable *fancy* VM things. Expect an
136  * approx 30% decrease in transfer rate. This could be useful for
137  * NetBSD or OpenBSD.
138  */
139 /* #define PIPE_NODIRECT */
140 
141 /*
142  * interfaces to the outside world
143  */
144 static fo_rdwr_t pipe_read;
145 static fo_rdwr_t pipe_write;
146 static fo_truncate_t pipe_truncate;
147 static fo_ioctl_t pipe_ioctl;
148 static fo_poll_t pipe_poll;
149 static fo_kqfilter_t pipe_kqfilter;
150 static fo_stat_t pipe_stat;
151 static fo_close_t pipe_close;
152 
153 static struct fileops pipeops = {
154  .fo_read = pipe_read,
155  .fo_write = pipe_write,
156  .fo_truncate = pipe_truncate,
157  .fo_ioctl = pipe_ioctl,
158  .fo_poll = pipe_poll,
159  .fo_kqfilter = pipe_kqfilter,
160  .fo_stat = pipe_stat,
161  .fo_close = pipe_close,
162  .fo_chmod = invfo_chmod,
163  .fo_chown = invfo_chown,
164  .fo_flags = DFLAG_PASSABLE
165 };
166 
167 static void filt_pipedetach(struct knote *kn);
168 static int filt_piperead(struct knote *kn, long hint);
169 static int filt_pipewrite(struct knote *kn, long hint);
170 
171 static struct filterops pipe_rfiltops = {
172  .f_isfd = 1,
173  .f_detach = filt_pipedetach,
174  .f_event = filt_piperead
175 };
176 static struct filterops pipe_wfiltops = {
177  .f_isfd = 1,
178  .f_detach = filt_pipedetach,
179  .f_event = filt_pipewrite
180 };
181 
182 /*
183  * Default pipe buffer size(s); this can be kind-of large now because pipe
184  * space is pageable. The pipe code will try to maintain locality of
185  * reference for performance reasons, so small amounts of outstanding I/O
186  * will not wipe the cache.
187  */
188 #define MINPIPESIZE (PIPE_SIZE/3)
189 #define MAXPIPESIZE (2*PIPE_SIZE/3)
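/*
 * Worked example (editor's assumption that PIPE_SIZE is the usual
 * 16384 bytes from sys/pipe.h): MINPIPESIZE = 16384/3 = 5461 and
 * MAXPIPESIZE = 2*16384/3 = 10922.  MINPIPESIZE is the low-water
 * mark pipe_read() uses before waking a writer blocked on a full
 * buffer.
 */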
190 
191 static long amountpipekva;
192 static int pipefragretry;
193 static int pipeallocfail;
194 static int piperesizefail;
195 static int piperesizeallowed = 1;
196 
197 SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
198  &maxpipekva, 0, "Pipe KVA limit");
199 SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
200  &amountpipekva, 0, "Pipe KVA usage");
201 SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
202  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
203 SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
204  &pipeallocfail, 0, "Pipe allocation failures");
205 SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
206  &piperesizefail, 0, "Pipe resize failures");
207 SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
208  &piperesizeallowed, 0, "Pipe resizing allowed");
209 
210 static void pipeinit(void *dummy __unused);
211 static void pipeclose(struct pipe *cpipe);
212 static void pipe_free_kmem(struct pipe *cpipe);
213 static int pipe_create(struct pipe *pipe, int backing);
214 static __inline int pipelock(struct pipe *cpipe, int catch);
215 static __inline void pipeunlock(struct pipe *cpipe);
216 static __inline void pipeselwakeup(struct pipe *cpipe);
217 #ifndef PIPE_NODIRECT
218 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
219 static void pipe_destroy_write_buffer(struct pipe *wpipe);
220 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
221 static void pipe_clone_write_buffer(struct pipe *wpipe);
222 #endif
223 static int pipespace(struct pipe *cpipe, int size);
224 static int pipespace_new(struct pipe *cpipe, int size);
225 
226 static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
227 static int pipe_zone_init(void *mem, int size, int flags);
228 static void pipe_zone_fini(void *mem, int size);
229 
230 static uma_zone_t pipe_zone;
231 static struct unrhdr *pipeino_unr;
232 static dev_t pipedev_ino;
233 
234 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
235 
236 static void
237 pipeinit(void *dummy __unused)
238 {
239 
240  pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
241  pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
242  UMA_ALIGN_PTR, 0);
243  KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
244  pipeino_unr = new_unrhdr(1, INT32_MAX, NULL);
245  KASSERT(pipeino_unr != NULL, ("pipe fake inodes not initialized"));
246  pipedev_ino = devfs_alloc_cdp_inode();
247  KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
248 }
249 
250 static int
251 pipe_zone_ctor(void *mem, int size, void *arg, int flags)
252 {
253  struct pipepair *pp;
254  struct pipe *rpipe, *wpipe;
255 
256  KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
257 
258  pp = (struct pipepair *)mem;
259 
260  /*
261  * We zero both pipe endpoints to make sure all the kmem pointers
262  * are NULL, flag fields are zero'd, etc. We timestamp both
263  * endpoints with the same time.
264  */
265  rpipe = &pp->pp_rpipe;
266  bzero(rpipe, sizeof(*rpipe));
267  vfs_timestamp(&rpipe->pipe_ctime);
268  rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
269 
270  wpipe = &pp->pp_wpipe;
271  bzero(wpipe, sizeof(*wpipe));
272  wpipe->pipe_ctime = rpipe->pipe_ctime;
273  wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
274 
275  rpipe->pipe_peer = wpipe;
276  rpipe->pipe_pair = pp;
277  wpipe->pipe_peer = rpipe;
278  wpipe->pipe_pair = pp;
279 
280  /*
281  * Mark both endpoints as present; they will later get free'd
282  * one at a time. When both are free'd, the whole pair
283  * is released.
284  */
285  rpipe->pipe_present = PIPE_ACTIVE;
286  wpipe->pipe_present = PIPE_ACTIVE;
287 
288  /*
289  * Eventually, the MAC Framework may initialize the label
290  * in ctor or init, but for now we do it elswhere to avoid
291  * blocking in ctor or init.
292  */
293  pp->pp_label = NULL;
294 
295  return (0);
296 }
297 
298 static int
299 pipe_zone_init(void *mem, int size, int flags)
300 {
301  struct pipepair *pp;
302 
303  KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
304 
305  pp = (struct pipepair *)mem;
306 
307  mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
308  return (0);
309 }
310 
311 static void
312 pipe_zone_fini(void *mem, int size)
313 {
314  struct pipepair *pp;
315 
316  KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
317 
318  pp = (struct pipepair *)mem;
319 
320  mtx_destroy(&pp->pp_mtx);
321 }
322 
323 /*
324  * The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let
325  * the zone pick up the pieces via pipeclose().
326  */
327 int
328 kern_pipe(struct thread *td, int fildes[2])
329 {
330 
331  return (do_pipe(td, fildes, 0));
332 }
333 
334 int
335 do_pipe(struct thread *td, int fildes[2], int flags)
336 {
337  struct filedesc *fdp = td->td_proc->p_fd;
338  struct file *rf, *wf;
339  struct pipepair *pp;
340  struct pipe *rpipe, *wpipe;
341  int fd, fflags, error;
342 
343  pp = uma_zalloc(pipe_zone, M_WAITOK);
344 #ifdef MAC
345  /*
346  * The MAC label is shared between the connected endpoints. As a
347  * result mac_pipe_init() and mac_pipe_create() are called once
348  * for the pair, and not on the endpoints.
349  */
350  mac_pipe_init(pp);
351  mac_pipe_create(td->td_ucred, pp);
352 #endif
353  rpipe = &pp->pp_rpipe;
354  wpipe = &pp->pp_wpipe;
355 
356  knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
357  knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
358 
359  /* Only the forward direction pipe is backed by default */
360  if ((error = pipe_create(rpipe, 1)) != 0 ||
361  (error = pipe_create(wpipe, 0)) != 0) {
362  pipeclose(rpipe);
363  pipeclose(wpipe);
364  return (error);
365  }
366 
367  rpipe->pipe_state |= PIPE_DIRECTOK;
368  wpipe->pipe_state |= PIPE_DIRECTOK;
369 
370  error = falloc(td, &rf, &fd, flags);
371  if (error) {
372  pipeclose(rpipe);
373  pipeclose(wpipe);
374  return (error);
375  }
376  /* An extra reference on `rf' has been held for us by falloc(). */
377  fildes[0] = fd;
378 
379  fflags = FREAD | FWRITE;
380  if ((flags & O_NONBLOCK) != 0)
381  fflags |= FNONBLOCK;
382 
383  /*
384  * Warning: once we've gotten past allocation of the fd for the
385  * read-side, we can only drop the read side via fdrop() in order
386  * to avoid races against processes which manage to dup() the read
387  * side while we are blocked trying to allocate the write side.
388  */
389  finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
390  error = falloc(td, &wf, &fd, flags);
391  if (error) {
392  fdclose(fdp, rf, fildes[0], td);
393  fdrop(rf, td);
394  /* rpipe has been closed by fdrop(). */
395  pipeclose(wpipe);
396  return (error);
397  }
398  /* An extra reference on `wf' has been held for us by falloc(). */
399  finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
400  fdrop(wf, td);
401  fildes[1] = fd;
402  fdrop(rf, td);
403 
404  return (0);
405 }
406 
407 /* ARGSUSED */
408 int
409 sys_pipe(struct thread *td, struct pipe_args *uap)
410 {
411  int error;
412  int fildes[2];
413 
414  error = kern_pipe(td, fildes);
415  if (error)
416  return (error);
417 
418  td->td_retval[0] = fildes[0];
419  td->td_retval[1] = fildes[1];
420 
421  return (0);
422 }
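/*
 * Illustrative sketch (not part of sys_pipe.c): minimal userland use
 * of the pipe(2) syscall implemented above; fildes[0] is the read
 * side, fildes[1] the write side.
 */
#if 0	/* example only */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	char buf[6];

	if (pipe(fds) == -1)
		return (1);
	(void)write(fds[1], "hello", 5);	/* small write: buffered path */
	(void)read(fds[0], buf, 5);
	buf[5] = '\0';
	printf("%s\n", buf);
	close(fds[0]);
	close(fds[1]);
	return (0);
}
#endif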
423 
424 /*
425  * Allocate kva for the pipe circular buffer; the space is pageable.
426  * This routine will 'realloc' the size of a pipe safely; if it fails,
427  * it will retain the old buffer.
428  * If it fails, it will return ENOMEM.
429  */
430 static int
431 pipespace_new(cpipe, size)
432  struct pipe *cpipe;
433  int size;
434 {
435  caddr_t buffer;
436  int error, cnt, firstseg;
437  static int curfail = 0;
438  static struct timeval lastfail;
439 
440  KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
441  KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
442  ("pipespace: resize of direct writes not allowed"));
443 retry:
444  cnt = cpipe->pipe_buffer.cnt;
445  if (cnt > size)
446  size = cnt;
447 
448  size = round_page(size);
449  buffer = (caddr_t) vm_map_min(pipe_map);
450 
451  error = vm_map_find(pipe_map, NULL, 0,
452  (vm_offset_t *) &buffer, size, 1,
453  VM_PROT_ALL, VM_PROT_ALL, 0);
454  if (error != KERN_SUCCESS) {
455  if ((cpipe->pipe_buffer.buffer == NULL) &&
456  (size > SMALL_PIPE_SIZE)) {
457  size = SMALL_PIPE_SIZE;
458  pipefragretry++;
459  goto retry;
460  }
461  if (cpipe->pipe_buffer.buffer == NULL) {
462  pipeallocfail++;
463  if (ppsratecheck(&lastfail, &curfail, 1))
464  printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
465  } else {
466  piperesizefail++;
467  }
468  return (ENOMEM);
469  }
470 
471  /* copy data, then free old resources if we're resizing */
472  if (cnt > 0) {
473  if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
474  firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
475  bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
476  buffer, firstseg);
477  if ((cnt - firstseg) > 0)
478  bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
479  cpipe->pipe_buffer.in);
480  } else {
481  bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
482  buffer, cnt);
483  }
484  }
485  pipe_free_kmem(cpipe);
486  cpipe->pipe_buffer.buffer = buffer;
487  cpipe->pipe_buffer.size = size;
488  cpipe->pipe_buffer.in = cnt;
489  cpipe->pipe_buffer.out = 0;
490  cpipe->pipe_buffer.cnt = cnt;
491  atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
492  return (0);
493 }
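/*
 * Illustrative sketch (not part of sys_pipe.c): the wraparound copy
 * above, restated for a generic ring buffer.  ring_unwrap() is a
 * hypothetical helper; "out" is the read index, "in" the write
 * index, and when in <= out with cnt > 0 the live data wraps past
 * the end of the ring.
 */
#if 0	/* example only */
#include <string.h>

static void
ring_unwrap(char *dst, const char *ring, int size, int in, int out, int cnt)
{
	int firstseg;

	if (cnt > 0 && in <= out) {
		/* Wrapped: copy out..size-1, then 0..in-1. */
		firstseg = size - out;
		memcpy(dst, ring + out, firstseg);
		memcpy(dst + firstseg, ring, cnt - firstseg);
	} else if (cnt > 0) {
		/* Contiguous: copy out..out+cnt-1. */
		memcpy(dst, ring + out, cnt);
	}
}
#endif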
494 
495 /*
496  * Wrapper for pipespace_new() that performs locking assertions.
497  */
498 static int
499 pipespace(cpipe, size)
500  struct pipe *cpipe;
501  int size;
502 {
503 
504  KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
505  ("Unlocked pipe passed to pipespace"));
506  return (pipespace_new(cpipe, size));
507 }
508 
509 /*
510  * lock a pipe for I/O, blocking other access
511  */
512 static __inline int
513 pipelock(cpipe, catch)
514  struct pipe *cpipe;
515  int catch;
516 {
517  int error;
518 
519  PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
520  while (cpipe->pipe_state & PIPE_LOCKFL) {
521  cpipe->pipe_state |= PIPE_LWANT;
522  error = msleep(cpipe, PIPE_MTX(cpipe),
523  catch ? (PRIBIO | PCATCH) : PRIBIO,
524  "pipelk", 0);
525  if (error != 0)
526  return (error);
527  }
528  cpipe->pipe_state |= PIPE_LOCKFL;
529  return (0);
530 }
531 
532 /*
533  * unlock a pipe I/O lock
534  */
535 static __inline void
536 pipeunlock(cpipe)
537  struct pipe *cpipe;
538 {
539 
540  PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
541  KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
542  ("Unlocked pipe passed to pipeunlock"));
543  cpipe->pipe_state &= ~PIPE_LOCKFL;
544  if (cpipe->pipe_state & PIPE_LWANT) {
545  cpipe->pipe_state &= ~PIPE_LWANT;
546  wakeup(cpipe);
547  }
548 }
549 
550 static __inline void
551 pipeselwakeup(cpipe)
552  struct pipe *cpipe;
553 {
554 
555  PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
556  if (cpipe->pipe_state & PIPE_SEL) {
557  selwakeuppri(&cpipe->pipe_sel, PSOCK);
558  if (!SEL_WAITING(&cpipe->pipe_sel))
559  cpipe->pipe_state &= ~PIPE_SEL;
560  }
561  if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
562  pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
563  KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
564 }
565 
566 /*
567  * Initialize and allocate VM and memory for pipe. The structure
568  * will start out zero'd from the ctor, so we just manage the kmem.
569  */
570 static int
571 pipe_create(pipe, backing)
572  struct pipe *pipe;
573  int backing;
574 {
575  int error;
576 
577  if (backing) {
578  if (amountpipekva > maxpipekva / 2)
579  error = pipespace_new(pipe, SMALL_PIPE_SIZE);
580  else
581  error = pipespace_new(pipe, PIPE_SIZE);
582  } else {
583  /* If we're not backing this pipe, no need to do anything. */
584  error = 0;
585  }
586  pipe->pipe_ino = -1;
587  return (error);
588 }
589 
590 /* ARGSUSED */
591 static int
592 pipe_read(fp, uio, active_cred, flags, td)
593  struct file *fp;
594  struct uio *uio;
595  struct ucred *active_cred;
596  struct thread *td;
597  int flags;
598 {
599  struct pipe *rpipe = fp->f_data;
600  int error;
601  int nread = 0;
602  int size;
603 
604  PIPE_LOCK(rpipe);
605  ++rpipe->pipe_busy;
606  error = pipelock(rpipe, 1);
607  if (error)
608  goto unlocked_error;
609 
610 #ifdef MAC
611  error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
612  if (error)
613  goto locked_error;
614 #endif
615  if (amountpipekva > (3 * maxpipekva) / 4) {
616  if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
617  (rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
618  (rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
619  (piperesizeallowed == 1)) {
620  PIPE_UNLOCK(rpipe);
621  pipespace(rpipe, SMALL_PIPE_SIZE);
622  PIPE_LOCK(rpipe);
623  }
624  }
625 
626  while (uio->uio_resid) {
627  /*
628  * normal pipe buffer receive
629  */
630  if (rpipe->pipe_buffer.cnt > 0) {
631  size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
632  if (size > rpipe->pipe_buffer.cnt)
633  size = rpipe->pipe_buffer.cnt;
634  if (size > uio->uio_resid)
635  size = uio->uio_resid;
636 
637  PIPE_UNLOCK(rpipe);
638  error = uiomove(
639  &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
640  size, uio);
641  PIPE_LOCK(rpipe);
642  if (error)
643  break;
644 
645  rpipe->pipe_buffer.out += size;
646  if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
647  rpipe->pipe_buffer.out = 0;
648 
649  rpipe->pipe_buffer.cnt -= size;
650 
651  /*
652  * If there is no more to read in the pipe, reset
653  * its pointers to the beginning. This improves
654  * cache hit stats.
655  */
656  if (rpipe->pipe_buffer.cnt == 0) {
657  rpipe->pipe_buffer.in = 0;
658  rpipe->pipe_buffer.out = 0;
659  }
660  nread += size;
661 #ifndef PIPE_NODIRECT
662  /*
663  * Direct copy, bypassing a kernel buffer.
664  */
665  } else if ((size = rpipe->pipe_map.cnt) &&
666  (rpipe->pipe_state & PIPE_DIRECTW)) {
667  if (size > uio->uio_resid)
668  size = (u_int) uio->uio_resid;
669 
670  PIPE_UNLOCK(rpipe);
671  error = uiomove_fromphys(rpipe->pipe_map.ms,
672  rpipe->pipe_map.pos, size, uio);
673  PIPE_LOCK(rpipe);
674  if (error)
675  break;
676  nread += size;
677  rpipe->pipe_map.pos += size;
678  rpipe->pipe_map.cnt -= size;
679  if (rpipe->pipe_map.cnt == 0) {
680  rpipe->pipe_state &= ~PIPE_DIRECTW;
681  wakeup(rpipe);
682  }
683 #endif
684  } else {
685  /*
686  * detect EOF condition
687  * read returns 0 on EOF, no need to set error
688  */
689  if (rpipe->pipe_state & PIPE_EOF)
690  break;
691 
692  /*
693  * If the "write-side" has been blocked, wake it up now.
694  */
695  if (rpipe->pipe_state & PIPE_WANTW) {
696  rpipe->pipe_state &= ~PIPE_WANTW;
697  wakeup(rpipe);
698  }
699 
700  /*
701  * Break if some data was read.
702  */
703  if (nread > 0)
704  break;
705 
706  /*
707  * Unlock the pipe buffer for our remaining processing.
708  * We will either break out with an error or we will
709  * sleep and relock to loop.
710  */
711  pipeunlock(rpipe);
712 
713  /*
714  * Handle non-blocking mode operation or
715  * wait for more data.
716  */
717  if (fp->f_flag & FNONBLOCK) {
718  error = EAGAIN;
719  } else {
720  rpipe->pipe_state |= PIPE_WANTR;
721  if ((error = msleep(rpipe, PIPE_MTX(rpipe),
722  PRIBIO | PCATCH,
723  "piperd", 0)) == 0)
724  error = pipelock(rpipe, 1);
725  }
726  if (error)
727  goto unlocked_error;
728  }
729  }
730 #ifdef MAC
731 locked_error:
732 #endif
733  pipeunlock(rpipe);
734 
735  /* XXX: should probably do this before getting any locks. */
736  if (error == 0)
737  vfs_timestamp(&rpipe->pipe_atime);
738 unlocked_error:
739  --rpipe->pipe_busy;
740 
741  /*
742  * PIPE_WANT processing only makes sense if pipe_busy is 0.
743  */
744  if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
745  rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
746  wakeup(rpipe);
747  } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
748  /*
749  * Handle write blocking hysteresis.
750  */
751  if (rpipe->pipe_state & PIPE_WANTW) {
752  rpipe->pipe_state &= ~PIPE_WANTW;
753  wakeup(rpipe);
754  }
755  }
756 
757  if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
758  pipeselwakeup(rpipe);
759 
760  PIPE_UNLOCK(rpipe);
761  return (error);
762 }
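/*
 * Illustrative sketch (not part of sys_pipe.c): the FNONBLOCK path
 * above, observed from userland.  Reading an empty non-blocking
 * pipe fails with EAGAIN instead of sleeping in "piperd".
 */
#if 0	/* example only */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	char c;

	if (pipe(fds) == -1)
		return (1);
	(void)fcntl(fds[0], F_SETFL, O_NONBLOCK);
	if (read(fds[0], &c, 1) == -1 && errno == EAGAIN)
		printf("empty pipe: EAGAIN\n");
	return (0);
}
#endif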
763 
764 #ifndef PIPE_NODIRECT
765 /*
766  * Map the sending processes' buffer into kernel space and wire it.
767  * This is similar to a physical write operation.
768  */
769 static int
770 pipe_build_write_buffer(wpipe, uio)
771  struct pipe *wpipe;
772  struct uio *uio;
773 {
774  u_int size;
775  int i;
776 
777  PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
778  KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
779  ("Clone attempt on non-direct write pipe!"));
780 
781  if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
782  size = wpipe->pipe_buffer.size;
783  else
784  size = uio->uio_iov->iov_len;
785 
786  if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
787  (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
788  wpipe->pipe_map.ms, PIPENPAGES)) < 0)
789  return (EFAULT);
790 
791 /*
792  * set up the control block
793  */
794  wpipe->pipe_map.npages = i;
795  wpipe->pipe_map.pos =
796  ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
797  wpipe->pipe_map.cnt = size;
798 
799 /*
800  * and update the uio data
801  */
802 
803  uio->uio_iov->iov_len -= size;
804  uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
805  if (uio->uio_iov->iov_len == 0)
806  uio->uio_iov++;
807  uio->uio_resid -= size;
808  uio->uio_offset += size;
809  return (0);
810 }
811 
812 /*
813  * unmap and unwire the process buffer
814  */
815 static void
816 pipe_destroy_write_buffer(wpipe)
817  struct pipe *wpipe;
818 {
819 
820  PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
821  vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
822  wpipe->pipe_map.npages = 0;
823 }
824 
825 /*
826  * In the case of a signal, the writing process might go away. This
827  * code copies the data into the circular buffer so that the source
828  * pages can be freed without loss of data.
829  */
830 static void
831 pipe_clone_write_buffer(wpipe)
832  struct pipe *wpipe;
833 {
834  struct uio uio;
835  struct iovec iov;
836  int size;
837  int pos;
838 
839  PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
840  size = wpipe->pipe_map.cnt;
841  pos = wpipe->pipe_map.pos;
842 
843  wpipe->pipe_buffer.in = size;
844  wpipe->pipe_buffer.out = 0;
845  wpipe->pipe_buffer.cnt = size;
846  wpipe->pipe_state &= ~PIPE_DIRECTW;
847 
848  PIPE_UNLOCK(wpipe);
849  iov.iov_base = wpipe->pipe_buffer.buffer;
850  iov.iov_len = size;
851  uio.uio_iov = &iov;
852  uio.uio_iovcnt = 1;
853  uio.uio_offset = 0;
854  uio.uio_resid = size;
855  uio.uio_segflg = UIO_SYSSPACE;
856  uio.uio_rw = UIO_READ;
857  uio.uio_td = curthread;
858  uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
859  PIPE_LOCK(wpipe);
860  pipe_destroy_write_buffer(wpipe);
861 }
862 
863 /*
864  * This implements the pipe buffer write mechanism. Note that only
865  * a direct write OR a normal pipe write can be pending at any given time.
866  * If there are any characters in the pipe buffer, the direct write will
867  * be deferred until the receiving process grabs all of the bytes from
868  * the pipe buffer. Then the direct mapping write is set-up.
869  */
870 static int
871 pipe_direct_write(wpipe, uio)
872  struct pipe *wpipe;
873  struct uio *uio;
874 {
875  int error;
876 
877 retry:
878  PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
879  error = pipelock(wpipe, 1);
880  if (wpipe->pipe_state & PIPE_EOF)
881  error = EPIPE;
882  if (error) {
883  pipeunlock(wpipe);
884  goto error1;
885  }
886  while (wpipe->pipe_state & PIPE_DIRECTW) {
887  if (wpipe->pipe_state & PIPE_WANTR) {
888  wpipe->pipe_state &= ~PIPE_WANTR;
889  wakeup(wpipe);
890  }
891  pipeselwakeup(wpipe);
892  wpipe->pipe_state |= PIPE_WANTW;
893  pipeunlock(wpipe);
894  error = msleep(wpipe, PIPE_MTX(wpipe),
895  PRIBIO | PCATCH, "pipdww", 0);
896  if (error)
897  goto error1;
898  else
899  goto retry;
900  }
901  wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
902  if (wpipe->pipe_buffer.cnt > 0) {
903  if (wpipe->pipe_state & PIPE_WANTR) {
904  wpipe->pipe_state &= ~PIPE_WANTR;
905  wakeup(wpipe);
906  }
907  pipeselwakeup(wpipe);
908  wpipe->pipe_state |= PIPE_WANTW;
909  pipeunlock(wpipe);
910  error = msleep(wpipe, PIPE_MTX(wpipe),
911  PRIBIO | PCATCH, "pipdwc", 0);
912  if (error)
913  goto error1;
914  else
915  goto retry;
916  }
917 
918  wpipe->pipe_state |= PIPE_DIRECTW;
919 
920  PIPE_UNLOCK(wpipe);
921  error = pipe_build_write_buffer(wpipe, uio);
922  PIPE_LOCK(wpipe);
923  if (error) {
924  wpipe->pipe_state &= ~PIPE_DIRECTW;
925  pipeunlock(wpipe);
926  goto error1;
927  }
928 
929  error = 0;
930  while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
931  if (wpipe->pipe_state & PIPE_EOF) {
932  pipe_destroy_write_buffer(wpipe);
933  pipeselwakeup(wpipe);
934  pipeunlock(wpipe);
935  error = EPIPE;
936  goto error1;
937  }
938  if (wpipe->pipe_state & PIPE_WANTR) {
939  wpipe->pipe_state &= ~PIPE_WANTR;
940  wakeup(wpipe);
941  }
942  pipeselwakeup(wpipe);
943  pipeunlock(wpipe);
944  error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
945  "pipdwt", 0);
946  pipelock(wpipe, 0);
947  }
948 
949  if (wpipe->pipe_state & PIPE_EOF)
950  error = EPIPE;
951  if (wpipe->pipe_state & PIPE_DIRECTW) {
952  /*
953  * this bit of trickery substitutes a kernel buffer for
954  * the process that might be going away.
955  */
956  pipe_clone_write_buffer(wpipe);
957  } else {
958  pipe_destroy_write_buffer(wpipe);
959  }
960  pipeunlock(wpipe);
961  return (error);
962 
963 error1:
964  wakeup(wpipe);
965  return (error);
966 }
967 #endif
968 
969 static int
970 pipe_write(fp, uio, active_cred, flags, td)
971  struct file *fp;
972  struct uio *uio;
973  struct ucred *active_cred;
974  struct thread *td;
975  int flags;
976 {
977  int error = 0;
978  int desiredsize;
979  ssize_t orig_resid;
980  struct pipe *wpipe, *rpipe;
981 
982  rpipe = fp->f_data;
983  wpipe = rpipe->pipe_peer;
984 
985  PIPE_LOCK(rpipe);
986  error = pipelock(wpipe, 1);
987  if (error) {
988  PIPE_UNLOCK(rpipe);
989  return (error);
990  }
991  /*
992  * detect loss of pipe read side, issue SIGPIPE if lost.
993  */
994  if (wpipe->pipe_present != PIPE_ACTIVE ||
995  (wpipe->pipe_state & PIPE_EOF)) {
996  pipeunlock(wpipe);
997  PIPE_UNLOCK(rpipe);
998  return (EPIPE);
999  }
1000 #ifdef MAC
1001  error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
1002  if (error) {
1003  pipeunlock(wpipe);
1004  PIPE_UNLOCK(rpipe);
1005  return (error);
1006  }
1007 #endif
1008  ++wpipe->pipe_busy;
1009 
1010  /* Choose a larger size if it's advantageous */
1011  desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
1012  while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
1013  if (piperesizeallowed != 1)
1014  break;
1015  if (amountpipekva > maxpipekva / 2)
1016  break;
1017  if (desiredsize == BIG_PIPE_SIZE)
1018  break;
1019  desiredsize = desiredsize * 2;
1020  }
1021 
1022  /* Choose a smaller size if we're in an OOM situation */
1023  if ((amountpipekva > (3 * maxpipekva) / 4) &&
1024  (wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
1025  (wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
1026  (piperesizeallowed == 1))
1027  desiredsize = SMALL_PIPE_SIZE;
1028 
1029  /* Resize if the above determined that a new size was necessary */
1030  if ((desiredsize != wpipe->pipe_buffer.size) &&
1031  ((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
1032  PIPE_UNLOCK(wpipe);
1033  pipespace(wpipe, desiredsize);
1034  PIPE_LOCK(wpipe);
1035  }
1036  if (wpipe->pipe_buffer.size == 0) {
1037  /*
1038  * This can only happen for reverse direction use of pipes
1039  * in a complete OOM situation.
1040  */
1041  error = ENOMEM;
1042  --wpipe->pipe_busy;
1043  pipeunlock(wpipe);
1044  PIPE_UNLOCK(wpipe);
1045  return (error);
1046  }
1047 
1048  pipeunlock(wpipe);
1049 
1050  orig_resid = uio->uio_resid;
1051 
1052  while (uio->uio_resid) {
1053  int space;
1054 
1055  pipelock(wpipe, 0);
1056  if (wpipe->pipe_state & PIPE_EOF) {
1057  pipeunlock(wpipe);
1058  error = EPIPE;
1059  break;
1060  }
1061 #ifndef PIPE_NODIRECT
1062  /*
1063  * If the transfer is large, we can gain performance if
1064  * we do process-to-process copies directly.
1065  * If the write is non-blocking, we don't use the
1066  * direct write mechanism.
1067  *
1068  * The direct write mechanism will detect the reader going
1069  * away on us.
1070  */
1071  if (uio->uio_segflg == UIO_USERSPACE &&
1072  uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
1073  wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
1074  (fp->f_flag & FNONBLOCK) == 0) {
1075  pipeunlock(wpipe);
1076  error = pipe_direct_write(wpipe, uio);
1077  if (error)
1078  break;
1079  continue;
1080  }
1081 #endif
1082 
1083  /*
1084  * Pipe buffered writes cannot be coincidental with
1085  * direct writes. We wait until the currently executing
1086  * direct write is completed before we start filling the
1087  * pipe buffer. We break out if a signal occurs or the
1088  * reader goes away.
1089  */
1090  if (wpipe->pipe_state & PIPE_DIRECTW) {
1091  if (wpipe->pipe_state & PIPE_WANTR) {
1092  wpipe->pipe_state &= ~PIPE_WANTR;
1093  wakeup(wpipe);
1094  }
1095  pipeselwakeup(wpipe);
1096  wpipe->pipe_state |= PIPE_WANTW;
1097  pipeunlock(wpipe);
1098  error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1099  "pipbww", 0);
1100  if (error)
1101  break;
1102  else
1103  continue;
1104  }
1105 
1106  space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1107 
1108  /* Writes of size <= PIPE_BUF must be atomic. */
1109  if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1110  space = 0;
1111 
1112  if (space > 0) {
1113  int size; /* Transfer size */
1114  int segsize; /* first segment to transfer */
1115 
1116  /*
1117  * Transfer size is minimum of uio transfer
1118  * and free space in pipe buffer.
1119  */
1120  if (space > uio->uio_resid)
1121  size = uio->uio_resid;
1122  else
1123  size = space;
1124  /*
1125  * First segment to transfer is minimum of
1126  * transfer size and contiguous space in
1127  * pipe buffer. If first segment to transfer
1128  * is less than the transfer size, we've got
1129  * a wraparound in the buffer.
1130  */
1131  segsize = wpipe->pipe_buffer.size -
1132  wpipe->pipe_buffer.in;
1133  if (segsize > size)
1134  segsize = size;
1135 
1136  /* Transfer first segment */
1137 
1138  PIPE_UNLOCK(rpipe);
1139  error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1140  segsize, uio);
1141  PIPE_LOCK(rpipe);
1142 
1143  if (error == 0 && segsize < size) {
1144  KASSERT(wpipe->pipe_buffer.in + segsize ==
1145  wpipe->pipe_buffer.size,
1146  ("Pipe buffer wraparound disappeared"));
1147  /*
1148  * Transfer remaining part now, to
1149  * support atomic writes. Wraparound
1150  * happened.
1151  */
1152 
1153  PIPE_UNLOCK(rpipe);
1154  error = uiomove(
1155  &wpipe->pipe_buffer.buffer[0],
1156  size - segsize, uio);
1157  PIPE_LOCK(rpipe);
1158  }
1159  if (error == 0) {
1160  wpipe->pipe_buffer.in += size;
1161  if (wpipe->pipe_buffer.in >=
1162  wpipe->pipe_buffer.size) {
1163  KASSERT(wpipe->pipe_buffer.in ==
1164  size - segsize +
1165  wpipe->pipe_buffer.size,
1166  ("Expected wraparound bad"));
1167  wpipe->pipe_buffer.in = size - segsize;
1168  }
1169 
1170  wpipe->pipe_buffer.cnt += size;
1171  KASSERT(wpipe->pipe_buffer.cnt <=
1172  wpipe->pipe_buffer.size,
1173  ("Pipe buffer overflow"));
1174  }
1175  pipeunlock(wpipe);
1176  if (error != 0)
1177  break;
1178  } else {
1179  /*
1180  * If the "read-side" has been blocked, wake it up now.
1181  */
1182  if (wpipe->pipe_state & PIPE_WANTR) {
1183  wpipe->pipe_state &= ~PIPE_WANTR;
1184  wakeup(wpipe);
1185  }
1186 
1187  /*
1188  * don't block on non-blocking I/O
1189  */
1190  if (fp->f_flag & FNONBLOCK) {
1191  error = EAGAIN;
1192  pipeunlock(wpipe);
1193  break;
1194  }
1195 
1196  /*
1197  * We have no more space and have something to offer,
1198  * wake up select/poll.
1199  */
1200  pipeselwakeup(wpipe);
1201 
1202  wpipe->pipe_state |= PIPE_WANTW;
1203  pipeunlock(wpipe);
1204  error = msleep(wpipe, PIPE_MTX(rpipe),
1205  PRIBIO | PCATCH, "pipewr", 0);
1206  if (error != 0)
1207  break;
1208  }
1209  }
1210 
1211  pipelock(wpipe, 0);
1212  --wpipe->pipe_busy;
1213 
1214  if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1215  wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1216  wakeup(wpipe);
1217  } else if (wpipe->pipe_buffer.cnt > 0) {
1218  /*
1219  * If we have put any characters in the buffer, we wake up
1220  * the reader.
1221  */
1222  if (wpipe->pipe_state & PIPE_WANTR) {
1223  wpipe->pipe_state &= ~PIPE_WANTR;
1224  wakeup(wpipe);
1225  }
1226  }
1227 
1228  /*
1229  * Don't return EPIPE if I/O was successful
1230  */
1231  if ((wpipe->pipe_buffer.cnt == 0) &&
1232  (uio->uio_resid == 0) &&
1233  (error == EPIPE)) {
1234  error = 0;
1235  }
1236 
1237  if (error == 0)
1238  vfs_timestamp(&wpipe->pipe_mtime);
1239 
1240  /*
1241  * We have something to offer,
1242  * wake up select/poll.
1243  */
1244  if (wpipe->pipe_buffer.cnt)
1245  pipeselwakeup(wpipe);
1246 
1247  pipeunlock(wpipe);
1248  PIPE_UNLOCK(rpipe);
1249  return (error);
1250 }
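/*
 * Illustrative sketch (not part of sys_pipe.c): the EPIPE path
 * above.  With the read side closed and SIGPIPE ignored, write(2)
 * fails with EPIPE.
 */
#if 0	/* example only */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];

	if (pipe(fds) == -1)
		return (1);
	(void)signal(SIGPIPE, SIG_IGN);
	close(fds[0]);			/* lose the read side */
	if (write(fds[1], "x", 1) == -1 && errno == EPIPE)
		printf("write failed: EPIPE\n");
	return (0);
}
#endif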
1251 
1252 /* ARGSUSED */
1253 static int
1254 pipe_truncate(fp, length, active_cred, td)
1255  struct file *fp;
1256  off_t length;
1257  struct ucred *active_cred;
1258  struct thread *td;
1259 {
1260 
1261  return (EINVAL);
1262 }
1263 
1264 /*
1265  * we implement a very minimal set of ioctls for compatibility with sockets.
1266  */
1267 static int
1268 pipe_ioctl(fp, cmd, data, active_cred, td)
1269  struct file *fp;
1270  u_long cmd;
1271  void *data;
1272  struct ucred *active_cred;
1273  struct thread *td;
1274 {
1275  struct pipe *mpipe = fp->f_data;
1276  int error;
1277 
1278  PIPE_LOCK(mpipe);
1279 
1280 #ifdef MAC
1281  error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1282  if (error) {
1283  PIPE_UNLOCK(mpipe);
1284  return (error);
1285  }
1286 #endif
1287 
1288  error = 0;
1289  switch (cmd) {
1290 
1291  case FIONBIO:
1292  break;
1293 
1294  case FIOASYNC:
1295  if (*(int *)data) {
1296  mpipe->pipe_state |= PIPE_ASYNC;
1297  } else {
1298  mpipe->pipe_state &= ~PIPE_ASYNC;
1299  }
1300  break;
1301 
1302  case FIONREAD:
1303  if (mpipe->pipe_state & PIPE_DIRECTW)
1304  *(int *)data = mpipe->pipe_map.cnt;
1305  else
1306  *(int *)data = mpipe->pipe_buffer.cnt;
1307  break;
1308 
1309  case FIOSETOWN:
1310  PIPE_UNLOCK(mpipe);
1311  error = fsetown(*(int *)data, &mpipe->pipe_sigio);
1312  goto out_unlocked;
1313 
1314  case FIOGETOWN:
1315  *(int *)data = fgetown(&mpipe->pipe_sigio);
1316  break;
1317 
1318  /* This is deprecated, FIOSETOWN should be used instead. */
1319  case TIOCSPGRP:
1320  PIPE_UNLOCK(mpipe);
1321  error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1322  goto out_unlocked;
1323 
1324  /* This is deprecated, FIOGETOWN should be used instead. */
1325  case TIOCGPGRP:
1326  *(int *)data = -fgetown(&mpipe->pipe_sigio);
1327  break;
1328 
1329  default:
1330  error = ENOTTY;
1331  break;
1332  }
1333  PIPE_UNLOCK(mpipe);
1334 out_unlocked:
1335  return (error);
1336 }
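/*
 * Illustrative sketch (not part of sys_pipe.c): querying buffered
 * data with the FIONREAD ioctl handled above.
 */
#if 0	/* example only */
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2], nread;

	if (pipe(fds) == -1)
		return (1);
	(void)write(fds[1], "abc", 3);
	if (ioctl(fds[0], FIONREAD, &nread) == -1)
		return (1);
	printf("%d bytes buffered\n", nread);	/* prints 3 */
	return (0);
}
#endif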
1337 
1338 static int
1339 pipe_poll(fp, events, active_cred, td)
1340  struct file *fp;
1341  int events;
1342  struct ucred *active_cred;
1343  struct thread *td;
1344 {
1345  struct pipe *rpipe = fp->f_data;
1346  struct pipe *wpipe;
1347  int revents = 0;
1348 #ifdef MAC
1349  int error;
1350 #endif
1351 
1352  wpipe = rpipe->pipe_peer;
1353  PIPE_LOCK(rpipe);
1354 #ifdef MAC
1355  error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
1356  if (error)
1357  goto locked_error;
1358 #endif
1359  if (events & (POLLIN | POLLRDNORM))
1360  if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1361  (rpipe->pipe_buffer.cnt > 0))
1362  revents |= events & (POLLIN | POLLRDNORM);
1363 
1364  if (events & (POLLOUT | POLLWRNORM))
1365  if (wpipe->pipe_present != PIPE_ACTIVE ||
1366  (wpipe->pipe_state & PIPE_EOF) ||
1367  (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1368  ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
1369  wpipe->pipe_buffer.size == 0)))
1370  revents |= events & (POLLOUT | POLLWRNORM);
1371 
1372  if ((events & POLLINIGNEOF) == 0) {
1373  if (rpipe->pipe_state & PIPE_EOF) {
1374  revents |= (events & (POLLIN | POLLRDNORM));
1375  if (wpipe->pipe_present != PIPE_ACTIVE ||
1376  (wpipe->pipe_state & PIPE_EOF))
1377  revents |= POLLHUP;
1378  }
1379  }
1380 
1381  if (revents == 0) {
1382  if (events & (POLLIN | POLLRDNORM)) {
1383  selrecord(td, &rpipe->pipe_sel);
1384  if (SEL_WAITING(&rpipe->pipe_sel))
1385  rpipe->pipe_state |= PIPE_SEL;
1386  }
1387 
1388  if (events & (POLLOUT | POLLWRNORM)) {
1389  selrecord(td, &wpipe->pipe_sel);
1390  if (SEL_WAITING(&wpipe->pipe_sel))
1391  wpipe->pipe_state |= PIPE_SEL;
1392  }
1393  }
1394 #ifdef MAC
1395 locked_error:
1396 #endif
1397  PIPE_UNLOCK(rpipe);
1398 
1399  return (revents);
1400 }
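/*
 * Illustrative sketch (not part of sys_pipe.c): the POLLHUP logic
 * above.  Once the write side is gone, polling the read side
 * reports POLLHUP alongside readability.
 */
#if 0	/* example only */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	struct pollfd pfd;

	if (pipe(fds) == -1)
		return (1);
	close(fds[1]);			/* EOF the read side */
	pfd.fd = fds[0];
	pfd.events = POLLIN;
	if (poll(&pfd, 1, 0) == 1 && (pfd.revents & POLLHUP) != 0)
		printf("POLLHUP\n");
	return (0);
}
#endif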
1401 
1402 /*
1403  * We shouldn't need locks here as we're doing a read and this should
1404  * be a natural race.
1405  */
1406 static int
1407 pipe_stat(fp, ub, active_cred, td)
1408  struct file *fp;
1409  struct stat *ub;
1410  struct ucred *active_cred;
1411  struct thread *td;
1412 {
1413  struct pipe *pipe;
1414  int new_unr;
1415 #ifdef MAC
1416  int error;
1417 #endif
1418 
1419  pipe = fp->f_data;
1420  PIPE_LOCK(pipe);
1421 #ifdef MAC
1422  error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
1423  if (error) {
1424  PIPE_UNLOCK(pipe);
1425  return (error);
1426  }
1427 #endif
1428  /*
1429  * Lazily allocate an inode number for the pipe. Most pipe
1430  * users do not call fstat(2) on the pipe, which means that
1431  * postponing the inode allocation until it must be
1432  * returned to userland is useful. If alloc_unr failed,
1433  * assign st_ino zero instead of returning an error.
1434  * Special pipe_ino values:
1435  * -1 - not yet initialized;
1436  * 0 - alloc_unr failed, return 0 as st_ino forever.
1437  */
1438  if (pipe->pipe_ino == (ino_t)-1) {
1439  new_unr = alloc_unr(pipeino_unr);
1440  if (new_unr != -1)
1441  pipe->pipe_ino = new_unr;
1442  else
1443  pipe->pipe_ino = 0;
1444  }
1445  PIPE_UNLOCK(pipe);
1446 
1447  bzero(ub, sizeof(*ub));
1448  ub->st_mode = S_IFIFO;
1449  ub->st_blksize = PAGE_SIZE;
1450  if (pipe->pipe_state & PIPE_DIRECTW)
1451  ub->st_size = pipe->pipe_map.cnt;
1452  else
1453  ub->st_size = pipe->pipe_buffer.cnt;
1454  ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1455  ub->st_atim = pipe->pipe_atime;
1456  ub->st_mtim = pipe->pipe_mtime;
1457  ub->st_ctim = pipe->pipe_ctime;
1458  ub->st_uid = fp->f_cred->cr_uid;
1459  ub->st_gid = fp->f_cred->cr_gid;
1460  ub->st_dev = pipedev_ino;
1461  ub->st_ino = pipe->pipe_ino;
1462  /*
1463  * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
1464  */
1465  return (0);
1466 }
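/*
 * Illustrative sketch (not part of sys_pipe.c): fstat(2) on a pipe
 * reports S_IFIFO and, per the code above, the number of buffered
 * bytes in st_size.
 */
#if 0	/* example only */
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	struct stat sb;

	if (pipe(fds) == -1)
		return (1);
	(void)write(fds[1], "xyz", 3);
	if (fstat(fds[0], &sb) == -1)
		return (1);
	printf("fifo: %d size: %jd\n", S_ISFIFO(sb.st_mode) != 0,
	    (intmax_t)sb.st_size);
	return (0);
}
#endif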
1467 
1468 /* ARGSUSED */
1469 static int
1470 pipe_close(fp, td)
1471  struct file *fp;
1472  struct thread *td;
1473 {
1474  struct pipe *cpipe = fp->f_data;
1475 
1476  fp->f_ops = &badfileops;
1477  fp->f_data = NULL;
1478  funsetown(&cpipe->pipe_sigio);
1479  pipeclose(cpipe);
1480  return (0);
1481 }
1482 
1483 static void
1484 pipe_free_kmem(cpipe)
1485  struct pipe *cpipe;
1486 {
1487 
1488  KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1489  ("pipe_free_kmem: pipe mutex locked"));
1490 
1491  if (cpipe->pipe_buffer.buffer != NULL) {
1492  atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
1493  vm_map_remove(pipe_map,
1494  (vm_offset_t)cpipe->pipe_buffer.buffer,
1495  (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1496  cpipe->pipe_buffer.buffer = NULL;
1497  }
1498 #ifndef PIPE_NODIRECT
1499  {
1500  cpipe->pipe_map.cnt = 0;
1501  cpipe->pipe_map.pos = 0;
1502  cpipe->pipe_map.npages = 0;
1503  }
1504 #endif
1505 }
1506 
1507 /*
1508  * shutdown the pipe
1509  */
1510 static void
1511 pipeclose(cpipe)
1512  struct pipe *cpipe;
1513 {
1514  struct pipepair *pp;
1515  struct pipe *ppipe;
1516  ino_t ino;
1517 
1518  KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1519 
1520  PIPE_LOCK(cpipe);
1521  pipelock(cpipe, 0);
1522  pp = cpipe->pipe_pair;
1523 
1524  pipeselwakeup(cpipe);
1525 
1526  /*
1527  * If the other side is blocked, wake it up saying that
1528  * we want to close it down.
1529  */
1530  cpipe->pipe_state |= PIPE_EOF;
1531  while (cpipe->pipe_busy) {
1532  wakeup(cpipe);
1533  cpipe->pipe_state |= PIPE_WANT;
1534  pipeunlock(cpipe);
1535  msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1536  pipelock(cpipe, 0);
1537  }
1538 
1539 
1540  /*
1541  * Disconnect from peer, if any.
1542  */
1543  ppipe = cpipe->pipe_peer;
1544  if (ppipe->pipe_present == PIPE_ACTIVE) {
1545  pipeselwakeup(ppipe);
1546 
1547  ppipe->pipe_state |= PIPE_EOF;
1548  wakeup(ppipe);
1549  KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
1550  }
1551 
1552  /*
1553  * Mark this endpoint as free. Release kmem resources. We
1554  * don't mark this endpoint as unused until we've finished
1555  * doing that, or the pipe might disappear out from under
1556  * us.
1557  */
1558  PIPE_UNLOCK(cpipe);
1559  pipe_free_kmem(cpipe);
1560  PIPE_LOCK(cpipe);
1561  cpipe->pipe_present = PIPE_CLOSING;
1562  pipeunlock(cpipe);
1563 
1564  /*
1565  * knlist_clear() may sleep dropping the PIPE_MTX. Set
1566  * PIPE_FINALIZED, which allows the other end to free the
1567  * pipe_pair, only after the knotes are completely dismantled.
1568  */
1569  knlist_clear(&cpipe->pipe_sel.si_note, 1);
1570  cpipe->pipe_present = PIPE_FINALIZED;
1571  seldrain(&cpipe->pipe_sel);
1572  knlist_destroy(&cpipe->pipe_sel.si_note);
1573 
1574  /*
1575  * Postpone the destroy of the fake inode number allocated for
1576  * our end, until pipe mtx is unlocked.
1577  */
1578  ino = cpipe->pipe_ino;
1579 
1580  /*
1581  * If both endpoints are now closed, release the memory for the
1582  * pipe pair. If not, unlock.
1583  */
1584  if (ppipe->pipe_present == PIPE_FINALIZED) {
1585  PIPE_UNLOCK(cpipe);
1586 #ifdef MAC
1587  mac_pipe_destroy(pp);
1588 #endif
1589  uma_zfree(pipe_zone, cpipe->pipe_pair);
1590  } else
1591  PIPE_UNLOCK(cpipe);
1592 
1593  if (ino != 0 && ino != (ino_t)-1)
1594  free_unr(pipeino_unr, ino);
1595 }
1596 
1597 /*ARGSUSED*/
1598 static int
1599 pipe_kqfilter(struct file *fp, struct knote *kn)
1600 {
1601  struct pipe *cpipe;
1602 
1603  cpipe = kn->kn_fp->f_data;
1604  PIPE_LOCK(cpipe);
1605  switch (kn->kn_filter) {
1606  case EVFILT_READ:
1607  kn->kn_fop = &pipe_rfiltops;
1608  break;
1609  case EVFILT_WRITE:
1610  kn->kn_fop = &pipe_wfiltops;
1611  if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
1612  /* other end of pipe has been closed */
1613  PIPE_UNLOCK(cpipe);
1614  return (EPIPE);
1615  }
1616  cpipe = cpipe->pipe_peer;
1617  break;
1618  default:
1619  PIPE_UNLOCK(cpipe);
1620  return (EINVAL);
1621  }
1622 
1623  knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1624  PIPE_UNLOCK(cpipe);
1625  return (0);
1626 }
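/*
 * Illustrative sketch (not part of sys_pipe.c): registering an
 * EVFILT_READ knote on the read side of a pipe with kqueue(2),
 * which lands in pipe_kqfilter() above; kev.data reports the
 * readable byte count computed by filt_piperead().
 */
#if 0	/* example only */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2], kq;
	struct kevent kev;

	if (pipe(fds) == -1 || (kq = kqueue()) == -1)
		return (1);
	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);
	(void)write(fds[1], "x", 1);
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
		printf("%jd byte(s) readable\n", (intmax_t)kev.data);
	return (0);
}
#endif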
1627 
1628 static void
1629 filt_pipedetach(struct knote *kn)
1630 {
1631  struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1632 
1633  PIPE_LOCK(cpipe);
1634  if (kn->kn_filter == EVFILT_WRITE)
1635  cpipe = cpipe->pipe_peer;
1636  knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1637  PIPE_UNLOCK(cpipe);
1638 }
1639 
1640 /*ARGSUSED*/
1641 static int
1642 filt_piperead(struct knote *kn, long hint)
1643 {
1644  struct pipe *rpipe = kn->kn_fp->f_data;
1645  struct pipe *wpipe = rpipe->pipe_peer;
1646  int ret;
1647 
1648  PIPE_LOCK(rpipe);
1649  kn->kn_data = rpipe->pipe_buffer.cnt;
1650  if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1651  kn->kn_data = rpipe->pipe_map.cnt;
1652 
1653  if ((rpipe->pipe_state & PIPE_EOF) ||
1654  wpipe->pipe_present != PIPE_ACTIVE ||
1655  (wpipe->pipe_state & PIPE_EOF)) {
1656  kn->kn_flags |= EV_EOF;
1657  PIPE_UNLOCK(rpipe);
1658  return (1);
1659  }
1660  ret = kn->kn_data > 0;
1661  PIPE_UNLOCK(rpipe);
1662  return ret;
1663 }
1664 
1665 /*ARGSUSED*/
1666 static int
1667 filt_pipewrite(struct knote *kn, long hint)
1668 {
1669  struct pipe *rpipe = kn->kn_fp->f_data;
1670  struct pipe *wpipe = rpipe->pipe_peer;
1671 
1672  PIPE_LOCK(rpipe);
1673  if (wpipe->pipe_present != PIPE_ACTIVE ||
1674  (wpipe->pipe_state & PIPE_EOF)) {
1675  kn->kn_data = 0;
1676  kn->kn_flags |= EV_EOF;
1677  PIPE_UNLOCK(rpipe);
1678  return (1);
1679  }
1680  kn->kn_data = (wpipe->pipe_buffer.size > 0) ?
1681  (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) : PIPE_BUF;
1682  if (wpipe->pipe_state & PIPE_DIRECTW)
1683  kn->kn_data = 0;
1684 
1685  PIPE_UNLOCK(rpipe);
1686  return (kn->kn_data >= PIPE_BUF);
1687 }