FreeBSD kernel kern code
vfs_vnops.c
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1993
3  * The Regents of the University of California. All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
11  * Copyright (c) 2013 The FreeBSD Foundation
12  *
13  * Portions of this software were developed by Konstantin Belousov
14  * under sponsorship from the FreeBSD Foundation.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  * notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  * notice, this list of conditions and the following disclaimer in the
23  * documentation and/or other materials provided with the distribution.
24  * 4. Neither the name of the University nor the names of its contributors
25  * may be used to endorse or promote products derived from this software
26  * without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$BSDSUniX$");
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/fcntl.h>
49 #include <sys/file.h>
50 #include <sys/kdb.h>
51 #include <sys/stat.h>
52 #include <sys/priv.h>
53 #include <sys/proc.h>
54 #include <sys/limits.h>
55 #include <sys/lock.h>
56 #include <sys/mount.h>
57 #include <sys/mutex.h>
58 #include <sys/namei.h>
59 #include <sys/vnode.h>
60 #include <sys/bio.h>
61 #include <sys/buf.h>
62 #include <sys/filio.h>
63 #include <sys/resourcevar.h>
64 #include <sys/sx.h>
65 #include <sys/sysctl.h>
66 #include <sys/ttycom.h>
67 #include <sys/conf.h>
68 #include <sys/syslog.h>
69 #include <sys/unistd.h>
70 
71 #include <security/audit/audit.h>
72 #include <security/mac/mac_framework.h>
73 
74 #include <vm/vm.h>
75 #include <vm/vm_extern.h>
76 #include <vm/pmap.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_object.h>
79 #include <vm/vm_page.h>
80 
81 static fo_rdwr_t vn_read;
82 static fo_rdwr_t vn_write;
83 static fo_rdwr_t vn_io_fault;
84 static fo_truncate_t vn_truncate;
85 static fo_ioctl_t vn_ioctl;
86 static fo_poll_t vn_poll;
87 static fo_kqfilter_t vn_kqfilter;
88 static fo_stat_t vn_statfile;
89 static fo_close_t vn_closefile;
90 
91 struct fileops vnops = {
92  .fo_read = vn_io_fault,
93  .fo_write = vn_io_fault,
94  .fo_truncate = vn_truncate,
95  .fo_ioctl = vn_ioctl,
96  .fo_poll = vn_poll,
97  .fo_kqfilter = vn_kqfilter,
98  .fo_stat = vn_statfile,
99  .fo_close = vn_closefile,
100  .fo_chmod = vn_chmod,
101  .fo_chown = vn_chown,
102  .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
103 };
104 
105 int
106 vn_open(ndp, flagp, cmode, fp)
107  struct nameidata *ndp;
108  int *flagp, cmode;
109  struct file *fp;
110 {
111  struct thread *td = ndp->ni_cnd.cn_thread;
112 
113  return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
114 }
115 
116 /*
117  * Common code for vnode open operations.
118  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
119  *
120  * Note that this does NOT free nameidata for the successful case,
121  * due to the NDINIT being done elsewhere.
122  */
123 int
124 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
125  struct ucred *cred, struct file *fp)
126 {
127  struct vnode *vp;
128  struct mount *mp;
129  struct thread *td = ndp->ni_cnd.cn_thread;
130  struct vattr vat;
131  struct vattr *vap = &vat;
132  int fmode, error;
133  accmode_t accmode;
134  int vfslocked, mpsafe;
135 
136  mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
137 restart:
138  vfslocked = 0;
139  fmode = *flagp;
140  if (fmode & O_CREAT) {
141  ndp->ni_cnd.cn_nameiop = CREATE;
142  ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
143  MPSAFE;
144  if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
145  ndp->ni_cnd.cn_flags |= FOLLOW;
146  if (!(vn_open_flags & VN_OPEN_NOAUDIT))
147  ndp->ni_cnd.cn_flags |= AUDITVNODE1;
148  bwillwrite();
149  if ((error = namei(ndp)) != 0)
150  return (error);
151  vfslocked = NDHASGIANT(ndp);
152  if (!mpsafe)
153  ndp->ni_cnd.cn_flags &= ~MPSAFE;
154  if (ndp->ni_vp == NULL) {
155  VATTR_NULL(vap);
156  vap->va_type = VREG;
157  vap->va_mode = cmode;
158  if (fmode & O_EXCL)
159  vap->va_vaflags |= VA_EXCLUSIVE;
160  if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
161  NDFREE(ndp, NDF_ONLY_PNBUF);
162  vput(ndp->ni_dvp);
163  VFS_UNLOCK_GIANT(vfslocked);
164  if ((error = vn_start_write(NULL, &mp,
165  V_XSLEEP | PCATCH)) != 0)
166  return (error);
167  goto restart;
168  }
169 #ifdef MAC
170  error = mac_vnode_check_create(cred, ndp->ni_dvp,
171  &ndp->ni_cnd, vap);
172  if (error == 0)
173 #endif
174  error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
175  &ndp->ni_cnd, vap);
176  vput(ndp->ni_dvp);
177  vn_finished_write(mp);
178  if (error) {
179  VFS_UNLOCK_GIANT(vfslocked);
180  NDFREE(ndp, NDF_ONLY_PNBUF);
181  return (error);
182  }
183  fmode &= ~O_TRUNC;
184  vp = ndp->ni_vp;
185  } else {
186  if (ndp->ni_dvp == ndp->ni_vp)
187  vrele(ndp->ni_dvp);
188  else
189  vput(ndp->ni_dvp);
190  ndp->ni_dvp = NULL;
191  vp = ndp->ni_vp;
192  if (fmode & O_EXCL) {
193  error = EEXIST;
194  goto bad;
195  }
196  fmode &= ~O_CREAT;
197  }
198  } else {
199  ndp->ni_cnd.cn_nameiop = LOOKUP;
200  ndp->ni_cnd.cn_flags = ISOPEN |
201  ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
202  LOCKLEAF | MPSAFE;
203  if (!(fmode & FWRITE))
204  ndp->ni_cnd.cn_flags |= LOCKSHARED;
205  if (!(vn_open_flags & VN_OPEN_NOAUDIT))
206  ndp->ni_cnd.cn_flags |= AUDITVNODE1;
207  if ((error = namei(ndp)) != 0)
208  return (error);
209  if (!mpsafe)
210  ndp->ni_cnd.cn_flags &= ~MPSAFE;
211  vfslocked = NDHASGIANT(ndp);
212  vp = ndp->ni_vp;
213  }
214  if (vp->v_type == VLNK) {
215  error = EMLINK;
216  goto bad;
217  }
218  if (vp->v_type == VSOCK) {
219  error = EOPNOTSUPP;
220  goto bad;
221  }
222  if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
223  error = ENOTDIR;
224  goto bad;
225  }
226  accmode = 0;
227  if (fmode & (FWRITE | O_TRUNC)) {
228  if (vp->v_type == VDIR) {
229  error = EISDIR;
230  goto bad;
231  }
232  accmode |= VWRITE;
233  }
234  if (fmode & FREAD)
235  accmode |= VREAD;
236  if (fmode & FEXEC)
237  accmode |= VEXEC;
238  if ((fmode & O_APPEND) && (fmode & FWRITE))
239  accmode |= VAPPEND;
240 #ifdef MAC
241  error = mac_vnode_check_open(cred, vp, accmode);
242  if (error)
243  goto bad;
244 #endif
245  if ((fmode & O_CREAT) == 0) {
246  if (accmode & VWRITE) {
247  error = vn_writechk(vp);
248  if (error)
249  goto bad;
250  }
251  if (accmode) {
252  error = VOP_ACCESS(vp, accmode, cred, td);
253  if (error)
254  goto bad;
255  }
256  }
257  if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
258  vn_lock(vp, LK_UPGRADE | LK_RETRY);
259  if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
260  goto bad;
261 
262  if (fmode & FWRITE)
263  VOP_ADD_WRITECOUNT(vp, 1);
264  *flagp = fmode;
265  ASSERT_VOP_LOCKED(vp, "vn_open_cred");
266  if (!mpsafe)
267  VFS_UNLOCK_GIANT(vfslocked);
268  return (0);
269 bad:
270  NDFREE(ndp, NDF_ONLY_PNBUF);
271  vput(vp);
272  VFS_UNLOCK_GIANT(vfslocked);
273  *flagp = fmode;
274  ndp->ni_vp = NULL;
275  return (error);
276 }
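/*
 * Illustrative sketch (not part of this file): the error paths in
 * vn_open_cred() above are what a userspace open(2) caller observes.
 * The paths used below are hypothetical examples.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	/* O_CREAT|O_EXCL on an existing file fails with EEXIST
	 * (the "fmode & O_EXCL" branch above). */
	if (open("/etc/hosts", O_RDONLY | O_CREAT | O_EXCL, 0644) == -1 &&
	    errno == EEXIST)
		printf("O_EXCL on existing file -> EEXIST\n");

	/* Opening a directory for writing fails with EISDIR
	 * (the VDIR check when accmode is computed above). */
	if (open("/tmp", O_WRONLY) == -1 && errno == EISDIR)
		printf("O_WRONLY on a directory -> EISDIR\n");

	/* O_DIRECTORY on a plain file fails with ENOTDIR. */
	if (open("/etc/hosts", O_RDONLY | O_DIRECTORY) == -1 &&
	    errno == ENOTDIR)
		printf("O_DIRECTORY on a file -> ENOTDIR\n");
	return (0);
}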
277 
278 /*
279  * Check for write permissions on the specified vnode.
280  * Prototype text segments cannot be written.
281  */
282 int
283 vn_writechk(vp)
284  register struct vnode *vp;
285 {
286 
287  ASSERT_VOP_LOCKED(vp, "vn_writechk");
288  /*
289  * If there's shared text associated with
290  * the vnode, try to free it up once. If
291  * we fail, we can't allow writing.
292  */
293  if (VOP_IS_TEXT(vp))
294  return (ETXTBSY);
295 
296  return (0);
297 }
298 
299 /*
300  * Vnode close call
301  */
302 int
303 vn_close(vp, flags, file_cred, td)
304  register struct vnode *vp;
305  int flags;
306  struct ucred *file_cred;
307  struct thread *td;
308 {
309  struct mount *mp;
310  int error, lock_flags;
311 
312  if (vp->v_type != VFIFO && !(flags & FWRITE) && vp->v_mount != NULL &&
313  vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
314  lock_flags = LK_SHARED;
315  else
316  lock_flags = LK_EXCLUSIVE;
317 
318  VFS_ASSERT_GIANT(vp->v_mount);
319 
320  vn_start_write(vp, &mp, V_WAIT);
321  vn_lock(vp, lock_flags | LK_RETRY);
322  if (flags & FWRITE) {
323  VNASSERT(vp->v_writecount > 0, vp,
324  ("vn_close: negative writecount"));
325  VOP_ADD_WRITECOUNT(vp, -1);
326  }
327  error = VOP_CLOSE(vp, flags, file_cred, td);
328  vput(vp);
329  vn_finished_write(mp);
330  return (error);
331 }
332 
333 /*
334  * Heuristic to detect sequential operation.
335  */
336 static int
337 sequential_heuristic(struct uio *uio, struct file *fp)
338 {
339 
340  if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
341  return (fp->f_seqcount << IO_SEQSHIFT);
342 
343  /*
344  * Offset 0 is handled specially. open() sets f_seqcount to 1 so
345  * that the first I/O is normally considered to be slightly
346  * sequential. Seeking to offset 0 doesn't change sequentiality
347  * unless previous seeks have reduced f_seqcount to 0, in which
348  * case offset 0 is not special.
349  */
350  if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
351  uio->uio_offset == fp->f_nextoff) {
352  /*
353  * f_seqcount is in units of fixed-size blocks so that it
354  * depends mainly on the amount of sequential I/O and not
355  * much on the number of sequential I/O's. The fixed size
356  * of 16384 is hard-coded here since it is (not quite) just
357  * a magic size that works well here. This size is more
358  * closely related to the best I/O size for real disks than
359  * to any block size used by software.
360  */
361  fp->f_seqcount += howmany(uio->uio_resid, 16384);
362  if (fp->f_seqcount > IO_SEQMAX)
363  fp->f_seqcount = IO_SEQMAX;
364  return (fp->f_seqcount << IO_SEQSHIFT);
365  }
366 
367  /* Not sequential. Quickly draw-down sequentiality. */
368  if (fp->f_seqcount > 1)
369  fp->f_seqcount = 1;
370  else
371  fp->f_seqcount = 0;
372  return (0);
373 }
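/*
 * Worked example (illustrative, not part of this file): a userspace
 * simulation of the f_seqcount arithmetic above.  The IO_SEQMAX and
 * IO_SEQSHIFT values are assumptions taken from sys/vnode.h (0x7F
 * and 16); the 16384 block size is the hard-coded constant above.
 */
#include <stdio.h>

#define SIM_SEQMAX	0x7F	/* assumed IO_SEQMAX */
#define SIM_SEQSHIFT	16	/* assumed IO_SEQSHIFT */
#define SIM_HOWMANY(x, y) (((x) + ((y) - 1)) / (y))

int
main(void)
{
	int seqcount = 1;	/* open() starts f_seqcount at 1 */
	int i;

	/* Eight sequential 64KB reads: seqcount grows by
	 * howmany(65536, 16384) == 4 per read, capped at SEQMAX. */
	for (i = 0; i < 8; i++) {
		seqcount += SIM_HOWMANY(65536, 16384);
		if (seqcount > SIM_SEQMAX)
			seqcount = SIM_SEQMAX;
		printf("read %d: seqcount=%d ioflag=0x%x\n",
		    i, seqcount, seqcount << SIM_SEQSHIFT);
	}
	return (0);
}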
374 
375 /*
376  * Package up an I/O request on a vnode into a uio and do it.
377  */
378 int
379 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
380  enum uio_seg segflg, int ioflg, struct ucred *active_cred,
381  struct ucred *file_cred, ssize_t *aresid, struct thread *td)
382 {
383  struct uio auio;
384  struct iovec aiov;
385  struct mount *mp;
386  struct ucred *cred;
387  void *rl_cookie;
388  int error, lock_flags;
389 
390  VFS_ASSERT_GIANT(vp->v_mount);
391 
392  auio.uio_iov = &aiov;
393  auio.uio_iovcnt = 1;
394  aiov.iov_base = base;
395  aiov.iov_len = len;
396  auio.uio_resid = len;
397  auio.uio_offset = offset;
398  auio.uio_segflg = segflg;
399  auio.uio_rw = rw;
400  auio.uio_td = td;
401  error = 0;
402 
403  if ((ioflg & IO_NODELOCKED) == 0) {
404  if (rw == UIO_READ) {
405  rl_cookie = vn_rangelock_rlock(vp, offset,
406  offset + len);
407  } else {
408  rl_cookie = vn_rangelock_wlock(vp, offset,
409  offset + len);
410  }
411  mp = NULL;
412  if (rw == UIO_WRITE) {
413  if (vp->v_type != VCHR &&
414  (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
415  != 0)
416  goto out;
417  if (MNT_SHARED_WRITES(mp) ||
418  ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
419  lock_flags = LK_SHARED;
420  else
421  lock_flags = LK_EXCLUSIVE;
422  } else
423  lock_flags = LK_SHARED;
424  vn_lock(vp, lock_flags | LK_RETRY);
425  } else
426  rl_cookie = NULL;
427 
428  ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
429 #ifdef MAC
430  if ((ioflg & IO_NOMACCHECK) == 0) {
431  if (rw == UIO_READ)
432  error = mac_vnode_check_read(active_cred, file_cred,
433  vp);
434  else
435  error = mac_vnode_check_write(active_cred, file_cred,
436  vp);
437  }
438 #endif
439  if (error == 0) {
440  if (file_cred != NULL)
441  cred = file_cred;
442  else
443  cred = active_cred;
444  if (rw == UIO_READ)
445  error = VOP_READ(vp, &auio, ioflg, cred);
446  else
447  error = VOP_WRITE(vp, &auio, ioflg, cred);
448  }
449  if (aresid)
450  *aresid = auio.uio_resid;
451  else
452  if (auio.uio_resid && error == 0)
453  error = EIO;
454  if ((ioflg & IO_NODELOCKED) == 0) {
455  VOP_UNLOCK(vp, 0);
456  if (mp != NULL)
457  vn_finished_write(mp);
458  }
459  out:
460  if (rl_cookie != NULL)
461  vn_rangelock_unlock(vp, rl_cookie);
462  return (error);
463 }
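/*
 * Usage sketch (illustrative, not part of this file): an in-kernel
 * caller packaging a read through vn_rdwr() above.  The function
 * name is hypothetical and error handling is abbreviated.
 */
static int
read_first_block(struct vnode *vp, struct thread *td)
{
	char buf[512];
	ssize_t resid;
	int error;

	/* Read 512 bytes at offset 0 into a kernel buffer.  Because
	 * IO_NODELOCKED is not passed, vn_rdwr() takes the range
	 * lock and the vnode lock itself. */
	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0,
	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
	if (error == 0 && resid != 0)
		error = EIO;	/* short read */
	return (error);
}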
464 
465 /*
466  * Package up an I/O request on a vnode into a uio and do it. The I/O
467  * request is split up into smaller chunks and we try to avoid saturating
468  * the buffer cache while potentially holding a vnode locked, so we
469  * check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
470  * to give other processes a chance to lock the vnode (either other processes
471  * core'ing the same binary, or unrelated processes scanning the directory).
472  */
473 int
474 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
475  file_cred, aresid, td)
476  enum uio_rw rw;
477  struct vnode *vp;
478  void *base;
479  size_t len;
480  off_t offset;
481  enum uio_seg segflg;
482  int ioflg;
483  struct ucred *active_cred;
484  struct ucred *file_cred;
485  size_t *aresid;
486  struct thread *td;
487 {
488  int error = 0;
489  ssize_t iaresid;
490 
491  VFS_ASSERT_GIANT(vp->v_mount);
492 
493  do {
494  int chunk;
495 
496  /*
497  * Force `offset' to a multiple of MAXBSIZE except possibly
498  * for the first chunk, so that filesystems only need to
499  * write full blocks except possibly for the first and last
500  * chunks.
501  */
502  chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
503 
504  if (chunk > len)
505  chunk = len;
506  if (rw != UIO_READ && vp->v_type == VREG)
507  bwillwrite();
508  iaresid = 0;
509  error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
510  ioflg, active_cred, file_cred, &iaresid, td);
511  len -= chunk; /* aresid calc already includes length */
512  if (error)
513  break;
514  offset += chunk;
515  base = (char *)base + chunk;
516  kern_yield(PRI_USER);
517  } while (len);
518  if (aresid)
519  *aresid = len + iaresid;
520  return (error);
521 }
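/*
 * Worked example (illustrative, not part of this file): the chunk
 * computation above aligns everything after the first chunk to
 * MAXBSIZE boundaries.  Assuming MAXBSIZE is 65536:
 *
 *	offset = 100000, len = 200000
 *	chunk 1: 65536 - 100000 % 65536 = 31072	(ends at 131072)
 *	chunk 2: 65536				(ends at 196608)
 *	chunk 3: 65536				(ends at 262144)
 *	chunk 4: remaining 37856		(ends at 300000)
 *
 * Only the first and last chunks can be partial blocks.
 */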
522 
523 off_t
524 foffset_lock(struct file *fp, int flags)
525 {
526  struct mtx *mtxp;
527  off_t res;
528 
529  KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
530 
531 #if OFF_MAX <= LONG_MAX
532  /*
533  * Caller only wants the current f_offset value. Assume that
534  * the long and shorter integer types reads are atomic.
535  */
536  if ((flags & FOF_NOLOCK) != 0)
537  return (fp->f_offset);
538 #endif
539 
540  /*
541  * According to McKusick the vn lock was protecting f_offset here.
542  * It is now protected by the FOFFSET_LOCKED flag.
543  */
544  mtxp = mtx_pool_find(mtxpool_sleep, fp);
545  mtx_lock(mtxp);
546  if ((flags & FOF_NOLOCK) == 0) {
547  while (fp->f_vnread_flags & FOFFSET_LOCKED) {
548  fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
549  msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
550  "vofflock", 0);
551  }
552  fp->f_vnread_flags |= FOFFSET_LOCKED;
553  }
554  res = fp->f_offset;
555  mtx_unlock(mtxp);
556  return (res);
557 }
558 
559 void
560 foffset_unlock(struct file *fp, off_t val, int flags)
561 {
562  struct mtx *mtxp;
563 
564  KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
565 
566 #if OFF_MAX <= LONG_MAX
567  if ((flags & FOF_NOLOCK) != 0) {
568  if ((flags & FOF_NOUPDATE) == 0)
569  fp->f_offset = val;
570  if ((flags & FOF_NEXTOFF) != 0)
571  fp->f_nextoff = val;
572  return;
573  }
574 #endif
575 
576  mtxp = mtx_pool_find(mtxpool_sleep, fp);
577  mtx_lock(mtxp);
578  if ((flags & FOF_NOUPDATE) == 0)
579  fp->f_offset = val;
580  if ((flags & FOF_NEXTOFF) != 0)
581  fp->f_nextoff = val;
582  if ((flags & FOF_NOLOCK) == 0) {
583  KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
584  ("Lost FOFFSET_LOCKED"));
585  if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
586  wakeup(&fp->f_vnread_flags);
587  fp->f_vnread_flags = 0;
588  }
589  mtx_unlock(mtxp);
590 }
591 
592 void
593 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
594 {
595 
596  if ((flags & FOF_OFFSET) == 0)
597  uio->uio_offset = foffset_lock(fp, flags);
598 }
599 
600 void
601 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
602 {
603 
604  if ((flags & FOF_OFFSET) == 0)
605  foffset_unlock(fp, uio->uio_offset, flags);
606 }
607 
608 static int
609 get_advice(struct file *fp, struct uio *uio)
610 {
611  struct mtx *mtxp;
612  int ret;
613 
614  ret = POSIX_FADV_NORMAL;
615  if (fp->f_advice == NULL)
616  return (ret);
617 
618  mtxp = mtx_pool_find(mtxpool_sleep, fp);
619  mtx_lock(mtxp);
620  if (uio->uio_offset >= fp->f_advice->fa_start &&
621  uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
622  ret = fp->f_advice->fa_advice;
623  mtx_unlock(mtxp);
624  return (ret);
625 }
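/*
 * Illustrative sketch (not part of this file): get_advice() above
 * returns the advice registered by posix_fadvise(2) when the I/O
 * falls inside the [fa_start, fa_end] range.  A userspace caller
 * sets that range like this; the path is a hypothetical example.
 */
#include <fcntl.h>

static int
open_random_window(void)
{
	int fd;

	fd = open("/var/db/example.db", O_RDONLY);	/* hypothetical */
	if (fd == -1)
		return (-1);
	/* Reads within [0, 1MB) now take the POSIX_FADV_RANDOM branch
	 * in vn_read() below, bypassing the sequential heuristic. */
	(void)posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_RANDOM);
	return (fd);
}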
626 
627 /*
628  * File table vnode read routine.
629  */
630 static int
631 vn_read(fp, uio, active_cred, flags, td)
632  struct file *fp;
633  struct uio *uio;
634  struct ucred *active_cred;
635  int flags;
636  struct thread *td;
637 {
638  struct vnode *vp;
639  struct mtx *mtxp;
640  int error, ioflag;
641  int advice, vfslocked;
642  off_t offset, start, end;
643 
644  KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
645  uio->uio_td, td));
646  KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
647  vp = fp->f_vnode;
648  ioflag = 0;
649  if (fp->f_flag & FNONBLOCK)
650  ioflag |= IO_NDELAY;
651  if (fp->f_flag & O_DIRECT)
652  ioflag |= IO_DIRECT;
653  advice = get_advice(fp, uio);
654  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
655  vn_lock(vp, LK_SHARED | LK_RETRY);
656 
657  switch (advice) {
658  case POSIX_FADV_NORMAL:
659  case POSIX_FADV_SEQUENTIAL:
660  case POSIX_FADV_NOREUSE:
661  ioflag |= sequential_heuristic(uio, fp);
662  break;
663  case POSIX_FADV_RANDOM:
664  /* Disable read-ahead for random I/O. */
665  break;
666  }
667  offset = uio->uio_offset;
668 
669 #ifdef MAC
670  error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
671  if (error == 0)
672 #endif
673  error = VOP_READ(vp, uio, ioflag, fp->f_cred);
674  fp->f_nextoff = uio->uio_offset;
675  VOP_UNLOCK(vp, 0);
676  if (error == 0 && advice == POSIX_FADV_NOREUSE &&
677  offset != uio->uio_offset) {
678  /*
679  * Use POSIX_FADV_DONTNEED to flush clean pages and
680  * buffers for the backing file after a
681  * POSIX_FADV_NOREUSE read(2). To optimize the common
682  * case of using POSIX_FADV_NOREUSE with sequential
683  * access, track the previous implicit DONTNEED
684  * request and grow this request to include the
685  * current read(2) in addition to the previous
686  * DONTNEED. With purely sequential access this will
687  * cause the DONTNEED requests to continuously grow to
688  * cover all of the previously read regions of the
689  * file. This allows filesystem blocks that are
690  * accessed by multiple calls to read(2) to be flushed
691  * once the last read(2) finishes.
692  */
693  start = offset;
694  end = uio->uio_offset - 1;
695  mtxp = mtx_pool_find(mtxpool_sleep, fp);
696  mtx_lock(mtxp);
697  if (fp->f_advice != NULL &&
698  fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
699  if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
700  start = fp->f_advice->fa_prevstart;
701  else if (fp->f_advice->fa_prevstart != 0 &&
702  fp->f_advice->fa_prevstart == end + 1)
703  end = fp->f_advice->fa_prevend;
704  fp->f_advice->fa_prevstart = start;
705  fp->f_advice->fa_prevend = end;
706  }
707  mtx_unlock(mtxp);
708  error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
709  }
710  VFS_UNLOCK_GIANT(vfslocked);
711  return (error);
712 }
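/*
 * Worked example (illustrative, not part of this file): the
 * DONTNEED-range coalescing above, for two back-to-back 64KB
 * POSIX_FADV_NOREUSE reads:
 *
 *	read 1: offset 0,     ends at 65536  -> flush [0, 65535]
 *	        fa_prevstart = 0, fa_prevend = 65535
 *	read 2: offset 65536, ends at 131072
 *	        fa_prevend + 1 == start, so start = fa_prevstart = 0
 *	        -> flush the grown range [0, 131071]
 */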
713 
714 /*
715  * File table vnode write routine.
716  */
717 static int
718 vn_write(fp, uio, active_cred, flags, td)
719  struct file *fp;
720  struct uio *uio;
721  struct ucred *active_cred;
722  int flags;
723  struct thread *td;
724 {
725  struct vnode *vp;
726  struct mount *mp;
727  struct mtx *mtxp;
728  int error, ioflag, lock_flags;
729  int advice, vfslocked;
730  off_t offset, start, end;
731 
732  KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
733  uio->uio_td, td));
734  KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
735  vp = fp->f_vnode;
736  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
737  if (vp->v_type == VREG)
738  bwillwrite();
739  ioflag = IO_UNIT;
740  if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
741  ioflag |= IO_APPEND;
742  if (fp->f_flag & FNONBLOCK)
743  ioflag |= IO_NDELAY;
744  if (fp->f_flag & O_DIRECT)
745  ioflag |= IO_DIRECT;
746  if ((fp->f_flag & O_FSYNC) ||
747  (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
748  ioflag |= IO_SYNC;
749  mp = NULL;
750  if (vp->v_type != VCHR &&
751  (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
752  goto unlock;
753 
754  advice = get_advice(fp, uio);
755 
756  if ((MNT_SHARED_WRITES(mp) ||
757  ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
758  (flags & FOF_OFFSET) != 0) {
759  lock_flags = LK_SHARED;
760  } else {
761  lock_flags = LK_EXCLUSIVE;
762  }
763 
764  vn_lock(vp, lock_flags | LK_RETRY);
765  switch (advice) {
766  case POSIX_FADV_NORMAL:
767  case POSIX_FADV_SEQUENTIAL:
768  case POSIX_FADV_NOREUSE:
769  ioflag |= sequential_heuristic(uio, fp);
770  break;
771  case POSIX_FADV_RANDOM:
772  /* XXX: Is this correct? */
773  break;
774  }
775  offset = uio->uio_offset;
776 
777 #ifdef MAC
778  error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
779  if (error == 0)
780 #endif
781  error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
782  fp->f_nextoff = uio->uio_offset;
783  VOP_UNLOCK(vp, 0);
784  if (vp->v_type != VCHR)
785  vn_finished_write(mp);
786  if (error == 0 && advice == POSIX_FADV_NOREUSE &&
787  offset != uio->uio_offset) {
788  /*
789  * Use POSIX_FADV_DONTNEED to flush clean pages and
790  * buffers for the backing file after a
791  * POSIX_FADV_NOREUSE write(2). To optimize the
792  * common case of using POSIX_FADV_NOREUSE with
793  * sequential access, track the previous implicit
794  * DONTNEED request and grow this request to include
795  * the current write(2) in addition to the previous
796  * DONTNEED. With purely sequential access this will
797  * cause the DONTNEED requests to continuously grow to
798  * cover all of the previously written regions of the
799  * file.
800  *
801  * Note that the blocks just written are almost
802  * certainly still dirty, so this only works when
803  * VOP_ADVISE() calls from subsequent writes push out
804  * the data written by this write(2) once the backing
805  * buffers are clean. However, as compared to forcing
806  * IO_DIRECT, this gives much saner behavior. Write
807  * clustering is still allowed, and clean pages are
808  * merely moved to the cache page queue rather than
809  * outright thrown away. This means a subsequent
810  * read(2) can still avoid hitting the disk if the
811  * pages have not been reclaimed.
812  *
813  * This does make POSIX_FADV_NOREUSE largely useless
814  * with non-sequential access. However, sequential
815  * access is the more common use case and the flag is
816  * merely advisory.
817  */
818  start = offset;
819  end = uio->uio_offset - 1;
820  mtxp = mtx_pool_find(mtxpool_sleep, fp);
821  mtx_lock(mtxp);
822  if (fp->f_advice != NULL &&
823  fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
824  if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
825  start = fp->f_advice->fa_prevstart;
826  else if (fp->f_advice->fa_prevstart != 0 &&
827  fp->f_advice->fa_prevstart == end + 1)
828  end = fp->f_advice->fa_prevend;
829  fp->f_advice->fa_prevstart = start;
830  fp->f_advice->fa_prevend = end;
831  }
832  mtx_unlock(mtxp);
833  error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
834  }
835 
836 unlock:
837  VFS_UNLOCK_GIANT(vfslocked);
838  return (error);
839 }
840 
841 static const int io_hold_cnt = 16;
842 static int vn_io_fault_enable = 0;
843 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
844  &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
845 static u_long vn_io_faults_cnt;
846 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
847  &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
848 
849 /*
850  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
851  * prevent the following deadlock:
852  *
853  * Assume that the thread A reads from the vnode vp1 into userspace
854  * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is
855  * currently not resident, then system ends up with the call chain
856  * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
857  * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
858  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
859  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
860  * backed by the pages of vnode vp1, and some page in buf2 is not
861  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
862  *
863  * To prevent the lock order reversal and deadlock, vn_io_fault() does
864  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
865  * Instead, it first tries to do the whole range i/o with pagefaults
866  * disabled. If all pages in the i/o buffer are resident and mapped,
867  * VOP will succeed (ignoring the genuine filesystem errors).
868  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
869  * i/o in chunks, with all pages in the chunk prefaulted and held
870  * using vm_fault_quick_hold_pages().
871  *
872  * Filesystems using this deadlock avoidance scheme should use the
873  * array of the held pages from uio, saved in the curthread->td_ma,
874  * instead of doing uiomove(). A helper function
875  * vn_io_fault_uiomove() converts uiomove request into
876  * uiomove_fromphys() over td_ma array.
877  *
878  * Since vnode locks do not cover the whole i/o anymore, rangelocks
879  * make the current i/o request atomic with respect to other i/os and
880  * truncations.
881  */
882 static int
883 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
884  int flags, struct thread *td)
885 {
886  vm_page_t ma[io_hold_cnt + 2];
887  struct uio *uio_clone, short_uio;
888  struct iovec short_iovec[1];
889  fo_rdwr_t *doio;
890  struct vnode *vp;
891  void *rl_cookie;
892  struct mount *mp;
893  vm_page_t *prev_td_ma;
894  int error, cnt, save, saveheld, prev_td_ma_cnt;
895  vm_offset_t addr, end;
896  vm_prot_t prot;
897  size_t len, resid;
898  ssize_t adv;
899 
900  if (uio->uio_rw == UIO_READ)
901  doio = vn_read;
902  else
903  doio = vn_write;
904  vp = fp->f_vnode;
905  foffset_lock_uio(fp, uio, flags);
906 
907  if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
908  ((mp = vp->v_mount) != NULL &&
909  (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
910  !vn_io_fault_enable) {
911  error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
912  goto out_last;
913  }
914 
915  /*
916  * The UFS follows IO_UNIT directive and replays back both
917  * uio_offset and uio_resid if an error is encountered during the
918  * operation. But, since the iovec may be already advanced,
919  * uio is still in an inconsistent state.
920  *
921  * Cache a copy of the original uio, which is advanced to the redo
922  * point using UIO_NOCOPY below.
923  */
924  uio_clone = cloneuio(uio);
925  resid = uio->uio_resid;
926 
927  short_uio.uio_segflg = UIO_USERSPACE;
928  short_uio.uio_rw = uio->uio_rw;
929  short_uio.uio_td = uio->uio_td;
930 
931  if (uio->uio_rw == UIO_READ) {
932  prot = VM_PROT_WRITE;
933  rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
934  uio->uio_offset + uio->uio_resid);
935  } else {
936  prot = VM_PROT_READ;
937  if ((fp->f_flag & O_APPEND) != 0 || (flags & FOF_OFFSET) == 0)
938  /* For appenders, punt and lock the whole range. */
939  rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
940  else
941  rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
942  uio->uio_offset + uio->uio_resid);
943  }
944 
945  save = vm_fault_disable_pagefaults();
946  error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
947  if (error != EFAULT)
948  goto out;
949 
950  atomic_add_long(&vn_io_faults_cnt, 1);
951  uio_clone->uio_segflg = UIO_NOCOPY;
952  uiomove(NULL, resid - uio->uio_resid, uio_clone);
953  uio_clone->uio_segflg = uio->uio_segflg;
954 
955  saveheld = curthread_pflags_set(TDP_UIOHELD);
956  prev_td_ma = td->td_ma;
957  prev_td_ma_cnt = td->td_ma_cnt;
958 
959  while (uio_clone->uio_resid != 0) {
960  len = uio_clone->uio_iov->iov_len;
961  if (len == 0) {
962  KASSERT(uio_clone->uio_iovcnt >= 1,
963  ("iovcnt underflow"));
964  uio_clone->uio_iov++;
965  uio_clone->uio_iovcnt--;
966  continue;
967  }
968  if (len > io_hold_cnt * PAGE_SIZE)
969  len = io_hold_cnt * PAGE_SIZE;
970  addr = (uintptr_t)uio_clone->uio_iov->iov_base;
971  end = round_page(addr + len);
972  if (end < addr) {
973  error = EFAULT;
974  break;
975  }
976  cnt = atop(end - trunc_page(addr));
977  /*
978  * A perfectly misaligned address and length could cause
979  * both the start and the end of the chunk to use partial
980  * page. +2 accounts for such a situation.
981  */
982  cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
983  addr, len, prot, ma, io_hold_cnt + 2);
984  if (cnt == -1) {
985  error = EFAULT;
986  break;
987  }
988  short_uio.uio_iov = &short_iovec[0];
989  short_iovec[0].iov_base = (void *)addr;
990  short_uio.uio_iovcnt = 1;
991  short_uio.uio_resid = short_iovec[0].iov_len = len;
992  short_uio.uio_offset = uio_clone->uio_offset;
993  td->td_ma = ma;
994  td->td_ma_cnt = cnt;
995 
996  error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
997  td);
998  vm_page_unhold_pages(ma, cnt);
999  adv = len - short_uio.uio_resid;
1000 
1001  uio_clone->uio_iov->iov_base =
1002  (char *)uio_clone->uio_iov->iov_base + adv;
1003  uio_clone->uio_iov->iov_len -= adv;
1004  uio_clone->uio_resid -= adv;
1005  uio_clone->uio_offset += adv;
1006 
1007  uio->uio_resid -= adv;
1008  uio->uio_offset += adv;
1009 
1010  if (error != 0 || adv == 0)
1011  break;
1012  }
1013  td->td_ma = prev_td_ma;
1014  td->td_ma_cnt = prev_td_ma_cnt;
1015  curthread_pflags_restore(saveheld);
1016 out:
1017  vm_fault_enable_pagefaults(save);
1018  vn_rangelock_unlock(vp, rl_cookie);
1019  free(uio_clone, M_IOV);
1020 out_last:
1021  foffset_unlock_uio(fp, uio, flags);
1022  return (error);
1023 }
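/*
 * Illustrative sketch (not part of this file): the scenario the big
 * comment above describes, driven from userspace.  Reading file A
 * into a buffer that is an mmap(2) of file B makes the page fault
 * inside VOP_READ() touch B's vnode; vn_io_fault() is what keeps
 * two such threads (with A and B swapped) from deadlocking.  Paths
 * are hypothetical examples.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void
cross_file_read(void)
{
	int fda, fdb;
	char *buf;

	fda = open("/tmp/file_a", O_RDONLY);	/* hypothetical */
	fdb = open("/tmp/file_b", O_RDWR);	/* hypothetical */
	if (fda == -1 || fdb == -1)
		return;
	/* buf is backed by file B's pages... */
	buf = mmap(NULL, 65536, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fdb, 0);
	if (buf != MAP_FAILED) {
		/* ...so this read(2) from file A faults on B's vnode
		 * if the pages are not resident. */
		(void)read(fda, buf, 65536);
		munmap(buf, 65536);
	}
	close(fda);
	close(fdb);
}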
1024 
1025 /*
1026  * Helper function to perform the requested uiomove operation using
1027  * the held pages for io->uio_iov[0].iov_base buffer instead of
1028  * copyin/copyout. Access to the pages with uiomove_fromphys()
1029  * instead of iov_base prevents page faults that could occur due to
1030  * pmap_collect() invalidating the mapping created by
1031  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1032  * object cleanup revoking the write access from page mappings.
1033  *
1034  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1035  * instead of plain uiomove().
1036  */
1037 int
1038 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1039 {
1040  struct uio transp_uio;
1041  struct iovec transp_iov[1];
1042  struct thread *td;
1043  size_t adv;
1044  int error, pgadv;
1045 
1046  td = curthread;
1047  if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1048  uio->uio_segflg != UIO_USERSPACE)
1049  return (uiomove(data, xfersize, uio));
1050 
1051  KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1052  transp_iov[0].iov_base = data;
1053  transp_uio.uio_iov = &transp_iov[0];
1054  transp_uio.uio_iovcnt = 1;
1055  if (xfersize > uio->uio_resid)
1056  xfersize = uio->uio_resid;
1057  transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1058  transp_uio.uio_offset = 0;
1059  transp_uio.uio_segflg = UIO_SYSSPACE;
1060  /*
1061  * Since transp_iov points to data, and td_ma page array
1062  * corresponds to original uio->uio_iov, we need to invert the
1063  * direction of the i/o operation as passed to
1064  * uiomove_fromphys().
1065  */
1066  switch (uio->uio_rw) {
1067  case UIO_WRITE:
1068  transp_uio.uio_rw = UIO_READ;
1069  break;
1070  case UIO_READ:
1071  transp_uio.uio_rw = UIO_WRITE;
1072  break;
1073  }
1074  transp_uio.uio_td = uio->uio_td;
1075  error = uiomove_fromphys(td->td_ma,
1076  ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1077  xfersize, &transp_uio);
1078  adv = xfersize - transp_uio.uio_resid;
1079  pgadv =
1080  (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1081  (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1082  td->td_ma += pgadv;
1083  KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1084  pgadv));
1085  td->td_ma_cnt -= pgadv;
1086  uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1087  uio->uio_iov->iov_len -= adv;
1088  uio->uio_resid -= adv;
1089  uio->uio_offset += adv;
1090  return (error);
1091 }
1092 
1093 int
1094 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1095  struct uio *uio)
1096 {
1097  struct thread *td;
1098  vm_offset_t iov_base;
1099  int cnt, pgadv;
1100 
1101  td = curthread;
1102  if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1103  uio->uio_segflg != UIO_USERSPACE)
1104  return (uiomove_fromphys(ma, offset, xfersize, uio));
1105 
1106  KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1107  cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1108  iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1109  switch (uio->uio_rw) {
1110  case UIO_WRITE:
1111  pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1112  offset, cnt);
1113  break;
1114  case UIO_READ:
1115  pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1116  cnt);
1117  break;
1118  }
1119  pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1120  td->td_ma += pgadv;
1121  KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1122  pgadv));
1123  td->td_ma_cnt -= pgadv;
1124  uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1125  uio->uio_iov->iov_len -= cnt;
1126  uio->uio_resid -= cnt;
1127  uio->uio_offset += cnt;
1128  return (0);
1129 }
1130 
1131 
1132 /*
1133  * File table truncate routine.
1134  */
1135 static int
1136 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1137  struct thread *td)
1138 {
1139  struct vattr vattr;
1140  struct mount *mp;
1141  struct vnode *vp;
1142  void *rl_cookie;
1143  int vfslocked;
1144  int error;
1145 
1146  vp = fp->f_vnode;
1147 
1148  /*
1149  * Lock the whole range for truncation. Otherwise split i/o
1150  * might happen partly before and partly after the truncation.
1151  */
1152  rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1153  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1154  error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1155  if (error)
1156  goto out1;
1157  vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1158  if (vp->v_type == VDIR) {
1159  error = EISDIR;
1160  goto out;
1161  }
1162 #ifdef MAC
1163  error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1164  if (error)
1165  goto out;
1166 #endif
1167  error = vn_writechk(vp);
1168  if (error == 0) {
1169  VATTR_NULL(&vattr);
1170  vattr.va_size = length;
1171  error = VOP_SETATTR(vp, &vattr, fp->f_cred);
1172  }
1173 out:
1174  VOP_UNLOCK(vp, 0);
1175  vn_finished_write(mp);
1176 out1:
1177  VFS_UNLOCK_GIANT(vfslocked);
1178  vn_rangelock_unlock(vp, rl_cookie);
1179  return (error);
1180 }
1181 
1182 /*
1183  * File table vnode stat routine.
1184  */
1185 static int
1186 vn_statfile(fp, sb, active_cred, td)
1187  struct file *fp;
1188  struct stat *sb;
1189  struct ucred *active_cred;
1190  struct thread *td;
1191 {
1192  struct vnode *vp = fp->f_vnode;
1193  int vfslocked;
1194  int error;
1195 
1196  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1197  vn_lock(vp, LK_SHARED | LK_RETRY);
1198  error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1199  VOP_UNLOCK(vp, 0);
1200  VFS_UNLOCK_GIANT(vfslocked);
1201 
1202  return (error);
1203 }
1204 
1205 /*
1206  * Stat a vnode; implementation for the stat syscall
1207  */
1208 int
1209 vn_stat(vp, sb, active_cred, file_cred, td)
1210  struct vnode *vp;
1211  register struct stat *sb;
1212  struct ucred *active_cred;
1213  struct ucred *file_cred;
1214  struct thread *td;
1215 {
1216  struct vattr vattr;
1217  register struct vattr *vap;
1218  int error;
1219  u_short mode;
1220 
1221 #ifdef MAC
1222  error = mac_vnode_check_stat(active_cred, file_cred, vp);
1223  if (error)
1224  return (error);
1225 #endif
1226 
1227  vap = &vattr;
1228 
1229  /*
1230  * Initialize defaults for new and unusual fields, so that file
1231  * systems which don't support these fields don't need to know
1232  * about them.
1233  */
1234  vap->va_birthtime.tv_sec = -1;
1235  vap->va_birthtime.tv_nsec = 0;
1236  vap->va_fsid = VNOVAL;
1237  vap->va_rdev = NODEV;
1238 
1239  error = VOP_GETATTR(vp, vap, active_cred);
1240  if (error)
1241  return (error);
1242 
1243  /*
1244  * Zero the spare stat fields
1245  */
1246  bzero(sb, sizeof *sb);
1247 
1248  /*
1249  * Copy from vattr table
1250  */
1251  if (vap->va_fsid != VNOVAL)
1252  sb->st_dev = vap->va_fsid;
1253  else
1254  sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1255  sb->st_ino = vap->va_fileid;
1256  mode = vap->va_mode;
1257  switch (vap->va_type) {
1258  case VREG:
1259  mode |= S_IFREG;
1260  break;
1261  case VDIR:
1262  mode |= S_IFDIR;
1263  break;
1264  case VBLK:
1265  mode |= S_IFBLK;
1266  break;
1267  case VCHR:
1268  mode |= S_IFCHR;
1269  break;
1270  case VLNK:
1271  mode |= S_IFLNK;
1272  break;
1273  case VSOCK:
1274  mode |= S_IFSOCK;
1275  break;
1276  case VFIFO:
1277  mode |= S_IFIFO;
1278  break;
1279  default:
1280  return (EBADF);
1281  };
1282  sb->st_mode = mode;
1283  sb->st_nlink = vap->va_nlink;
1284  sb->st_uid = vap->va_uid;
1285  sb->st_gid = vap->va_gid;
1286  sb->st_rdev = vap->va_rdev;
1287  if (vap->va_size > OFF_MAX)
1288  return (EOVERFLOW);
1289  sb->st_size = vap->va_size;
1290  sb->st_atim = vap->va_atime;
1291  sb->st_mtim = vap->va_mtime;
1292  sb->st_ctim = vap->va_ctime;
1293  sb->st_birthtim = vap->va_birthtime;
1294 
1295  /*
1296  * According to www.opengroup.org, the meaning of st_blksize is
1297  * "a filesystem-specific preferred I/O block size for this
1298  * object. In some filesystem types, this may vary from file
1299  * to file"
1300  * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
1301  */
1302 
1303  sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1304 
1305  sb->st_flags = vap->va_flags;
1306  if (priv_check(td, PRIV_VFS_GENERATION))
1307  sb->st_gen = 0;
1308  else
1309  sb->st_gen = vap->va_gen;
1310 
1311  sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1312  return (0);
1313 }
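/*
 * Illustrative sketch (not part of this file): the fields filled in
 * by vn_stat() above, as seen through stat(2).  The path is a
 * hypothetical example.
 */
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct stat sb;

	if (stat("/etc/hosts", &sb) == -1)
		return (1);
	/* st_blksize is clamped to at least PAGE_SIZE above;
	 * st_blocks counts S_BLKSIZE (512-byte) units. */
	printf("size=%jd blksize=%ld blocks=%jd\n",
	    (intmax_t)sb.st_size, (long)sb.st_blksize,
	    (intmax_t)sb.st_blocks);
	return (0);
}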
1314 
1315 /*
1316  * File table vnode ioctl routine.
1317  */
1318 static int
1319 vn_ioctl(fp, com, data, active_cred, td)
1320  struct file *fp;
1321  u_long com;
1322  void *data;
1323  struct ucred *active_cred;
1324  struct thread *td;
1325 {
1326  struct vnode *vp = fp->f_vnode;
1327  struct vattr vattr;
1328  int vfslocked;
1329  int error;
1330 
1331  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1332  error = ENOTTY;
1333  switch (vp->v_type) {
1334  case VREG:
1335  case VDIR:
1336  if (com == FIONREAD) {
1337  vn_lock(vp, LK_SHARED | LK_RETRY);
1338  error = VOP_GETATTR(vp, &vattr, active_cred);
1339  VOP_UNLOCK(vp, 0);
1340  if (!error)
1341  *(int *)data = vattr.va_size - fp->f_offset;
1342  } else if (com == FIONBIO || com == FIOASYNC) /* XXX */
1343  error = 0;
1344  else
1345  error = VOP_IOCTL(vp, com, data, fp->f_flag,
1346  active_cred, td);
1347  break;
1348 
1349  default:
1350  break;
1351  }
1352  VFS_UNLOCK_GIANT(vfslocked);
1353  return (error);
1354 }
1355 
1356 /*
1357  * File table vnode poll routine.
1358  */
1359 static int
1360 vn_poll(fp, events, active_cred, td)
1361  struct file *fp;
1362  int events;
1363  struct ucred *active_cred;
1364  struct thread *td;
1365 {
1366  struct vnode *vp;
1367  int vfslocked;
1368  int error;
1369 
1370  vp = fp->f_vnode;
1371  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1372 #ifdef MAC
1373  vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1374  error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1375  VOP_UNLOCK(vp, 0);
1376  if (!error)
1377 #endif
1378 
1379  error = VOP_POLL(vp, events, fp->f_cred, td);
1380  VFS_UNLOCK_GIANT(vfslocked);
1381  return (error);
1382 }
1383 
1384 /*
1385  * Acquire the requested lock and then check for validity. LK_RETRY
1386  * permits vn_lock to return doomed vnodes.
1387  */
1388 int
1389 _vn_lock(struct vnode *vp, int flags, char *file, int line)
1390 {
1391  int error;
1392 
1393  VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1394  ("vn_lock called with no locktype."));
1395  do {
1396 #ifdef DEBUG_VFS_LOCKS
1397  KASSERT(vp->v_holdcnt != 0,
1398  ("vn_lock %p: zero hold count", vp));
1399 #endif
1400  error = VOP_LOCK1(vp, flags, file, line);
1401  flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
1402  KASSERT((flags & LK_RETRY) == 0 || error == 0,
1403  ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
1404  flags, error));
1405  /*
1406  * Callers specify LK_RETRY if they wish to get dead vnodes.
1407  * If RETRY is not set, we return ENOENT instead.
1408  */
1409  if (error == 0 && vp->v_iflag & VI_DOOMED &&
1410  (flags & LK_RETRY) == 0) {
1411  VOP_UNLOCK(vp, 0);
1412  error = ENOENT;
1413  break;
1414  }
1415  } while (flags & LK_RETRY && error != 0);
1416  return (error);
1417 }
1418 
1419 /*
1420  * File table vnode close routine.
1421  */
1422 static int
1423 vn_closefile(fp, td)
1424  struct file *fp;
1425  struct thread *td;
1426 {
1427  struct vnode *vp;
1428  struct flock lf;
1429  int vfslocked;
1430  int error;
1431 
1432  vp = fp->f_vnode;
1433 
1434  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1435  if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
1436  lf.l_whence = SEEK_SET;
1437  lf.l_start = 0;
1438  lf.l_len = 0;
1439  lf.l_type = F_UNLCK;
1440  (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1441  }
1442 
1443  fp->f_ops = &badfileops;
1444 
1445  error = vn_close(vp, fp->f_flag, fp->f_cred, td);
1446  VFS_UNLOCK_GIANT(vfslocked);
1447  return (error);
1448 }
1449 
1450 /*
1451  * Preparing to start a filesystem write operation. If the operation is
1452  * permitted, then we bump the count of operations in progress and
1453  * proceed. If a suspend request is in progress, we wait until the
1454  * suspension is over, and then proceed.
1455  */
1456 static int
1457 vn_start_write_locked(struct mount *mp, int flags)
1458 {
1459  int error;
1460 
1461  mtx_assert(MNT_MTX(mp), MA_OWNED);
1462  error = 0;
1463 
1464  /*
1465  * Check on status of suspension.
1466  */
1467  if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1468  mp->mnt_susp_owner != curthread) {
1469  while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1470  if (flags & V_NOWAIT) {
1471  error = EWOULDBLOCK;
1472  goto unlock;
1473  }
1474  error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1475  (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
1476  if (error)
1477  goto unlock;
1478  }
1479  }
1480  if (flags & V_XSLEEP)
1481  goto unlock;
1482  mp->mnt_writeopcount++;
1483 unlock:
1484  if (error != 0 || (flags & V_XSLEEP) != 0)
1485  MNT_REL(mp);
1486  MNT_IUNLOCK(mp);
1487  return (error);
1488 }
1489 
1490 int
1491 vn_start_write(vp, mpp, flags)
1492  struct vnode *vp;
1493  struct mount **mpp;
1494  int flags;
1495 {
1496  struct mount *mp;
1497  int error;
1498 
1499  error = 0;
1500  /*
1501  * If a vnode is provided, get and return the mount point
1502  * to which it will write.
1503  */
1504  if (vp != NULL) {
1505  if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1506  *mpp = NULL;
1507  if (error != EOPNOTSUPP)
1508  return (error);
1509  return (0);
1510  }
1511  }
1512  if ((mp = *mpp) == NULL)
1513  return (0);
1514 
1515  /*
1516  * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1517  * a vfs_ref().
1518  * As long as a vnode is not provided we need to acquire a
1519  * refcount for the provided mountpoint too, in order to
1520  * emulate a vfs_ref().
1521  */
1522  MNT_ILOCK(mp);
1523  if (vp == NULL)
1524  MNT_REF(mp);
1525 
1526  return (vn_start_write_locked(mp, flags));
1527 }
1528 
1529 /*
1530  * Secondary suspension. Used by operations such as vop_inactive
1531  * routines that are needed by the higher level functions. These
1532  * are allowed to proceed until all the higher level functions have
1533  * completed (indicated by mnt_writeopcount dropping to zero). At that
1534  * time, these operations are halted until the suspension is over.
1535  */
1536 int
1537 vn_start_secondary_write(vp, mpp, flags)
1538  struct vnode *vp;
1539  struct mount **mpp;
1540  int flags;
1541 {
1542  struct mount *mp;
1543  int error;
1544 
1545  retry:
1546  if (vp != NULL) {
1547  if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1548  *mpp = NULL;
1549  if (error != EOPNOTSUPP)
1550  return (error);
1551  return (0);
1552  }
1553  }
1554  /*
1555  * If we are not suspended or have not yet reached suspended
1556  * mode, then let the operation proceed.
1557  */
1558  if ((mp = *mpp) == NULL)
1559  return (0);
1560 
1561  /*
1562  * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1563  * a vfs_ref().
1564  * As long as a vnode is not provided we need to acquire a
1565  * refcount for the provided mountpoint too, in order to
1566  * emulate a vfs_ref().
1567  */
1568  MNT_ILOCK(mp);
1569  if (vp == NULL)
1570  MNT_REF(mp);
1571  if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1572  mp->mnt_secondary_writes++;
1573  mp->mnt_secondary_accwrites++;
1574  MNT_IUNLOCK(mp);
1575  return (0);
1576  }
1577  if (flags & V_NOWAIT) {
1578  MNT_REL(mp);
1579  MNT_IUNLOCK(mp);
1580  return (EWOULDBLOCK);
1581  }
1582  /*
1583  * Wait for the suspension to finish.
1584  */
1585  error = msleep(&mp->mnt_flag, MNT_MTX(mp),
1586  (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
1587  vfs_rel(mp);
1588  if (error == 0)
1589  goto retry;
1590  return (error);
1591 }
1592 
1593 /*
1594  * Filesystem write operation has completed. If we are suspending and this
1595  * operation is the last one, notify the suspender that the suspension is
1596  * now in effect.
1597  */
1598 void
1599 vn_finished_write(mp)
1600  struct mount *mp;
1601 {
1602  if (mp == NULL)
1603  return;
1604  MNT_ILOCK(mp);
1605  MNT_REL(mp);
1606  mp->mnt_writeopcount--;
1607  if (mp->mnt_writeopcount < 0)
1608  panic("vn_finished_write: neg cnt");
1609  if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1610  mp->mnt_writeopcount <= 0)
1611  wakeup(&mp->mnt_writeopcount);
1612  MNT_IUNLOCK(mp);
1613 }
1614 
1615 
1616 /*
1617  * Filesystem secondary write operation has completed. If we are
1618  * suspending and this operation is the last one, notify the suspender
1619  * that the suspension is now in effect.
1620  */
1621 void
1622 vn_finished_secondary_write(mp)
1623  struct mount *mp;
1624 {
1625  if (mp == NULL)
1626  return;
1627  MNT_ILOCK(mp);
1628  MNT_REL(mp);
1629  mp->mnt_secondary_writes--;
1630  if (mp->mnt_secondary_writes < 0)
1631  panic("vn_finished_secondary_write: neg cnt");
1632  if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1633  mp->mnt_secondary_writes <= 0)
1634  wakeup(&mp->mnt_secondary_writes);
1635  MNT_IUNLOCK(mp);
1636 }
1637 
1638 
1639 
1640 /*
1641  * Request a filesystem to suspend write operations.
1642  */
1643 int
1644 vfs_write_suspend(mp)
1645  struct mount *mp;
1646 {
1647  int error;
1648 
1649  MNT_ILOCK(mp);
1650  if (mp->mnt_susp_owner == curthread) {
1651  MNT_IUNLOCK(mp);
1652  return (EALREADY);
1653  }
1654  while (mp->mnt_kern_flag & MNTK_SUSPEND)
1655  msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1656  mp->mnt_kern_flag |= MNTK_SUSPEND;
1657  mp->mnt_susp_owner = curthread;
1658  if (mp->mnt_writeopcount > 0)
1659  (void) msleep(&mp->mnt_writeopcount,
1660  MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1661  else
1662  MNT_IUNLOCK(mp);
1663  if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1664  vfs_write_resume(mp);
1665  return (error);
1666 }
1667 
1668 /*
1669  * Request a filesystem to resume write operations.
1670  */
1671 void
1672 vfs_write_resume_flags(struct mount *mp, int flags)
1673 {
1674 
1675  MNT_ILOCK(mp);
1676  if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1677  KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1678  mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1679  MNTK_SUSPENDED);
1680  mp->mnt_susp_owner = NULL;
1681  wakeup(&mp->mnt_writeopcount);
1682  wakeup(&mp->mnt_flag);
1683  curthread->td_pflags &= ~TDP_IGNSUSP;
1684  if ((flags & VR_START_WRITE) != 0) {
1685  MNT_REF(mp);
1686  mp->mnt_writeopcount++;
1687  }
1688  MNT_IUNLOCK(mp);
1689  if ((flags & VR_NO_SUSPCLR) == 0)
1690  VFS_SUSP_CLEAN(mp);
1691  } else if ((flags & VR_START_WRITE) != 0) {
1692  MNT_REF(mp);
1693  vn_start_write_locked(mp, 0);
1694  } else {
1695  MNT_IUNLOCK(mp);
1696  }
1697 }
1698 
1699 void
1700 vfs_write_resume(struct mount *mp)
1701 {
1702 
1703  vfs_write_resume_flags(mp, 0);
1704 }
1705 
1706 /*
1707  * Implement kqueues for files by translating it to vnode operation.
1708  */
1709 static int
1710 vn_kqfilter(struct file *fp, struct knote *kn)
1711 {
1712  int vfslocked;
1713  int error;
1714 
1715  vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
1716  error = VOP_KQFILTER(fp->f_vnode, kn);
1717  VFS_UNLOCK_GIANT(vfslocked);
1718 
1719  return error;
1720 }
1721 
1722 /*
1723  * Simplified in-kernel wrapper calls for extended attribute access.
1724  * Both calls pass in a NULL credential, authorizing as "kernel" access.
1725  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1726  */
1727 int
1728 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1729  const char *attrname, int *buflen, char *buf, struct thread *td)
1730 {
1731  struct uio auio;
1732  struct iovec iov;
1733  int error;
1734 
1735  iov.iov_len = *buflen;
1736  iov.iov_base = buf;
1737 
1738  auio.uio_iov = &iov;
1739  auio.uio_iovcnt = 1;
1740  auio.uio_rw = UIO_READ;
1741  auio.uio_segflg = UIO_SYSSPACE;
1742  auio.uio_td = td;
1743  auio.uio_offset = 0;
1744  auio.uio_resid = *buflen;
1745 
1746  if ((ioflg & IO_NODELOCKED) == 0)
1747  vn_lock(vp, LK_SHARED | LK_RETRY);
1748 
1749  ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1750 
1751  /* authorize attribute retrieval as kernel */
1752  error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1753  td);
1754 
1755  if ((ioflg & IO_NODELOCKED) == 0)
1756  VOP_UNLOCK(vp, 0);
1757 
1758  if (error == 0) {
1759  *buflen = *buflen - auio.uio_resid;
1760  }
1761 
1762  return (error);
1763 }
1764 
1765 /*
1766  * XXX failure mode if partially written?
1767  */
1768 int
1769 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1770  const char *attrname, int buflen, char *buf, struct thread *td)
1771 {
1772  struct uio auio;
1773  struct iovec iov;
1774  struct mount *mp;
1775  int error;
1776 
1777  iov.iov_len = buflen;
1778  iov.iov_base = buf;
1779 
1780  auio.uio_iov = &iov;
1781  auio.uio_iovcnt = 1;
1782  auio.uio_rw = UIO_WRITE;
1783  auio.uio_segflg = UIO_SYSSPACE;
1784  auio.uio_td = td;
1785  auio.uio_offset = 0;
1786  auio.uio_resid = buflen;
1787 
1788  if ((ioflg & IO_NODELOCKED) == 0) {
1789  if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1790  return (error);
1791  vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1792  }
1793 
1794  ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1795 
1796  /* authorize attribute setting as kernel */
1797  error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1798 
1799  if ((ioflg & IO_NODELOCKED) == 0) {
1800  vn_finished_write(mp);
1801  VOP_UNLOCK(vp, 0);
1802  }
1803 
1804  return (error);
1805 }
1806 
1807 int
1808 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1809  const char *attrname, struct thread *td)
1810 {
1811  struct mount *mp;
1812  int error;
1813 
1814  if ((ioflg & IO_NODELOCKED) == 0) {
1815  if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1816  return (error);
1817  vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1818  }
1819 
1820  ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1821 
1822  /* authorize attribute removal as kernel */
1823  error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1824  if (error == EOPNOTSUPP)
1825  error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1826  NULL, td);
1827 
1828  if ((ioflg & IO_NODELOCKED) == 0) {
1829  vn_finished_write(mp);
1830  VOP_UNLOCK(vp, 0);
1831  }
1832 
1833  return (error);
1834 }
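/*
 * Illustrative sketch (not part of this file): the userspace
 * counterparts of the in-kernel extattr wrappers above, via the
 * extattr(2) system calls.  The path and attribute name are
 * hypothetical examples.
 */
#include <sys/types.h>
#include <sys/extattr.h>
#include <string.h>

static int
tag_file(const char *path)
{
	const char *val = "example";
	char buf[64];

	/* Set, read back, then remove a user-namespace attribute. */
	if (extattr_set_file(path, EXTATTR_NAMESPACE_USER, "tag",
	    val, strlen(val)) == -1)
		return (-1);
	if (extattr_get_file(path, EXTATTR_NAMESPACE_USER, "tag",
	    buf, sizeof(buf)) == -1)
		return (-1);
	return (extattr_delete_file(path, EXTATTR_NAMESPACE_USER,
	    "tag"));
}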
1835 
1836 int
1837 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
1838 {
1839  struct mount *mp;
1840  int ltype, error;
1841 
1842  mp = vp->v_mount;
1843  ltype = VOP_ISLOCKED(vp);
1844  KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
1845  ("vn_vget_ino: vp not locked"));
1846  error = vfs_busy(mp, MBF_NOWAIT);
1847  if (error != 0) {
1848  vfs_ref(mp);
1849  VOP_UNLOCK(vp, 0);
1850  error = vfs_busy(mp, 0);
1851  vn_lock(vp, ltype | LK_RETRY);
1852  vfs_rel(mp);
1853  if (error != 0)
1854  return (ENOENT);
1855  if (vp->v_iflag & VI_DOOMED) {
1856  vfs_unbusy(mp);
1857  return (ENOENT);
1858  }
1859  }
1860  VOP_UNLOCK(vp, 0);
1861  error = VFS_VGET(mp, ino, lkflags, rvp);
1862  vfs_unbusy(mp);
1863  vn_lock(vp, ltype | LK_RETRY);
1864  if (vp->v_iflag & VI_DOOMED) {
1865  if (error == 0)
1866  vput(*rvp);
1867  error = ENOENT;
1868  }
1869  return (error);
1870 }
1871 
1872 int
1873 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
1874  const struct thread *td)
1875 {
1876 
1877  if (vp->v_type != VREG || td == NULL)
1878  return (0);
1879  PROC_LOCK(td->td_proc);
1880  if ((uoff_t)uio->uio_offset + uio->uio_resid >
1881  lim_cur(td->td_proc, RLIMIT_FSIZE)) {
1882  kern_psignal(td->td_proc, SIGXFSZ);
1883  PROC_UNLOCK(td->td_proc);
1884  return (EFBIG);
1885  }
1886  PROC_UNLOCK(td->td_proc);
1887  return (0);
1888 }
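/*
 * Illustrative sketch (not part of this file): triggering the
 * RLIMIT_FSIZE check above from userspace.  With SIGXFSZ ignored,
 * the write(2) fails with EFBIG instead of killing the process.
 * The path is a hypothetical example.
 */
#include <sys/resource.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl = { .rlim_cur = 4096, .rlim_max = 4096 };
	char block[8192] = { 0 };
	int fd;

	signal(SIGXFSZ, SIG_IGN);
	setrlimit(RLIMIT_FSIZE, &rl);
	fd = open("/tmp/fsize_demo", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		return (1);
	/* offset 0 + resid 8192 > 4096, so the whole write fails. */
	if (write(fd, block, sizeof(block)) == -1 && errno == EFBIG)
		printf("write past RLIMIT_FSIZE -> EFBIG\n");
	close(fd);
	return (0);
}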
1889 
1890 int
1891 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
1892  struct thread *td)
1893 {
1894  struct vnode *vp;
1895  int error, vfslocked;
1896 
1897  vp = fp->f_vnode;
1898  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1899 #ifdef AUDIT
1900  vn_lock(vp, LK_SHARED | LK_RETRY);
1901  AUDIT_ARG_VNODE1(vp);
1902  VOP_UNLOCK(vp, 0);
1903 #endif
1904  error = setfmode(td, active_cred, vp, mode);
1905  VFS_UNLOCK_GIANT(vfslocked);
1906  return (error);
1907 }
1908 
1909 int
1910 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
1911  struct thread *td)
1912 {
1913  struct vnode *vp;
1914  int error, vfslocked;
1915 
1916  vp = fp->f_vnode;
1917  vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1918 #ifdef AUDIT
1919  vn_lock(vp, LK_SHARED | LK_RETRY);
1920  AUDIT_ARG_VNODE1(vp);
1921  VOP_UNLOCK(vp, 0);
1922 #endif
1923  error = setfown(td, active_cred, vp, uid, gid);
1924  VFS_UNLOCK_GIANT(vfslocked);
1925  return (error);
1926 }
1927 
1928 void
1929 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
1930 {
1931  vm_object_t object;
1932 
1933  if ((object = vp->v_object) == NULL)
1934  return;
1935  VM_OBJECT_LOCK(object);
1936  vm_object_page_remove(object, start, end, 0);
1937  VM_OBJECT_UNLOCK(object);
1938 }
1939 
1940 int
1941 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
1942 {
1943  struct vattr va;
1944  daddr_t bn, bnp;
1945  uint64_t bsize;
1946  off_t noff;
1947  int error;
1948 
1949  KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
1950  ("Wrong command %lu", cmd));
1951 
1952  if (vn_lock(vp, LK_SHARED) != 0)
1953  return (EBADF);
1954  if (vp->v_type != VREG) {
1955  error = ENOTTY;
1956  goto unlock;
1957  }
1958  error = VOP_GETATTR(vp, &va, cred);
1959  if (error != 0)
1960  goto unlock;
1961  noff = *off;
1962  if (noff >= va.va_size) {
1963  error = ENXIO;
1964  goto unlock;
1965  }
1966  bsize = vp->v_mount->mnt_stat.f_iosize;
1967  for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
1968  error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
1969  if (error == EOPNOTSUPP) {
1970  error = ENOTTY;
1971  goto unlock;
1972  }
1973  if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
1974  (bnp != -1 && cmd == FIOSEEKDATA)) {
1975  noff = bn * bsize;
1976  if (noff < *off)
1977  noff = *off;
1978  goto unlock;
1979  }
1980  }
1981  if (noff > va.va_size)
1982  noff = va.va_size;
1983  /* noff == va.va_size. There is an implicit hole at the end of the file. */
1984  if (cmd == FIOSEEKDATA)
1985  error = ENXIO;
1986 unlock:
1987  VOP_UNLOCK(vp, 0);
1988  if (error == 0)
1989  *off = noff;
1990  return (error);
1991 }
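
/*
 * Illustrative sketch, not part of vfs_vnops.c: a filesystem VOP_IOCTL
 * delegating FIOSEEKDATA/FIOSEEKHOLE to the generic VOP_BMAP-based
 * scan above, much as ufs_ioctl() does.
 */
static int
example_ioctl(struct vop_ioctl_args *ap)
{

    switch (ap->a_command) {
    case FIOSEEKDATA:
    case FIOSEEKHOLE:
        return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
            (off_t *)ap->a_data, ap->a_cred));
    default:
        return (ENOTTY);
    }
}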