FreeBSD kernel kern code
vfs_cluster.c
1 /*-
2  * Copyright (c) 1993
3  * The Regents of the University of California. All rights reserved.
4  * Modifications/enhancements:
5  * Copyright (c) 1995 John S. Dyson. All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  * 4. Neither the name of the University nor the names of its contributors
16  * may be used to endorse or promote products derived from this software
17  * without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$BSDSUniX$");
36 
37 #include "opt_debug_cluster.h"
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
42 #include <sys/proc.h>
43 #include <sys/bio.h>
44 #include <sys/buf.h>
45 #include <sys/vnode.h>
46 #include <sys/malloc.h>
47 #include <sys/mount.h>
48 #include <sys/resourcevar.h>
49 #include <sys/vmmeter.h>
50 #include <vm/vm.h>
51 #include <vm/vm_object.h>
52 #include <vm/vm_page.h>
53 #include <sys/sysctl.h>
54 
55 #if defined(CLUSTERDEBUG)
56 static int rcluster= 0;
57 SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
58  "Debug VFS clustering code");
59 #endif
60 
61 static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
62 
63 static struct cluster_save *cluster_collectbufs(struct vnode *vp,
64  struct buf *last_bp, int gbflags);
65 static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
66  daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
67  struct buf *fbp);
68 static void cluster_callback(struct buf *);
69 
70 static int write_behind = 1;
71 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
72  "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
73 
74 static int read_max = 64;
75 SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
76  "Cluster read-ahead max block count");
77 
78 static int read_min = 1;
79 SYSCTL_INT(_vfs, OID_AUTO, read_min, CTLFLAG_RW, &read_min, 0,
80  "Cluster read min block count");
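The three knobs above are exported as read-write sysctl nodes, so with the usual sysctl(8) naming they appear as vfs.write_behind, vfs.read_max and vfs.read_min and can be tuned at run time, for example "sysctl vfs.read_max=128" to widen read-ahead (128 is only an illustrative value).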
81 
82 /* Page expended to mark partially backed buffers */
83 extern vm_page_t bogus_page;
84 
85 /*
86  * Read data to a buf, including read-ahead if we find this to be beneficial.
87  * cluster_read replaces bread.
88  */
89 int
90 cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
91  struct ucred *cred, long totread, int seqcount, struct buf **bpp)
92 {
93 
94  return (cluster_read_gb(vp, filesize, lblkno, size, cred, totread,
95  seqcount, 0, bpp));
96 }
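As a usage illustration only: the comment above notes that cluster_read() replaces bread() for sequential access, so a block-based filesystem's read path typically chooses between the two roughly as in the following sketch. The example_read_block() wrapper and its MNT_NOCLUSTERR check are modeled on FFS-style callers and are assumptions, not code from this file; it presumes the same kernel headers this file already includes.

/*
 * Hedged sketch of a caller, not code from this file: read logical
 * block "lbn" of a regular file, clustering only when the mount
 * permits it.
 */
static int
example_read_block(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
    long bsize, long resid, int seqcount, struct buf **bpp)
{
	int error;

	/* Fall back to a plain, unclustered read if the mount forbids it. */
	if (vp->v_mount->mnt_flag & MNT_NOCLUSTERR)
		error = bread(vp, lbn, (int)bsize, NOCRED, bpp);
	else
		error = cluster_read(vp, filesize, lbn, bsize, NOCRED,
		    resid, seqcount, bpp);
	return (error);
}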
97 
98 int
99 cluster_read_gb(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
100  struct ucred *cred, long totread, int seqcount, int gbflags,
101  struct buf **bpp)
102 {
103  struct buf *bp, *rbp, *reqbp;
104  struct bufobj *bo;
105  daddr_t blkno, origblkno;
106  int maxra, racluster;
107  int error, ncontig;
108  int i;
109 
110  error = 0;
111  bo = &vp->v_bufobj;
112  if (!unmapped_buf_allowed)
113  gbflags &= ~GB_UNMAPPED;
114 
115  /*
116  * Try to limit the amount of read-ahead by a few
117  * ad-hoc parameters. This needs work!!!
118  */
119  racluster = vp->v_mount->mnt_iosize_max / size;
120  maxra = seqcount;
121  maxra = min(read_max, maxra);
122  maxra = min(nbuf/8, maxra);
123  if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
124  maxra = (filesize / size) - lblkno;
125 
126  /*
127  * get the requested block
128  */
129  *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
130  origblkno = lblkno;
131 
132  /*
133  * if it is in the cache, then check to see if the reads have been
134  * sequential. If they have, then try some read-ahead, otherwise
135  * back-off on prospective read-aheads.
136  */
137  if (bp->b_flags & B_CACHE) {
138  if (!seqcount) {
139  return 0;
140  } else if ((bp->b_flags & B_RAM) == 0) {
141  return 0;
142  } else {
143  bp->b_flags &= ~B_RAM;
144  BO_LOCK(bo);
145  for (i = 1; i < maxra; i++) {
146  /*
147  * Stop if the buffer does not exist or it
148  * is invalid (about to go away?)
149  */
150  rbp = gbincore(&vp->v_bufobj, lblkno+i);
151  if (rbp == NULL || (rbp->b_flags & B_INVAL))
152  break;
153 
154  /*
155  * Set another read-ahead mark so we know
156  * to check again. (If we can lock the
157  * buffer without waiting)
158  */
159  if ((((i % racluster) == (racluster - 1)) ||
160  (i == (maxra - 1)))
161  && (0 == BUF_LOCK(rbp,
162  LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
163  rbp->b_flags |= B_RAM;
164  BUF_UNLOCK(rbp);
165  }
166  }
167  BO_UNLOCK(bo);
168  if (i >= maxra) {
169  return 0;
170  }
171  lblkno += i;
172  }
173  reqbp = bp = NULL;
174  /*
175  * If it isn't in the cache, then get a chunk from
176  * disk if sequential, otherwise just get the block.
177  */
178  } else {
179  off_t firstread = bp->b_offset;
180  int nblks;
181  long minread;
182 
183  KASSERT(bp->b_offset != NOOFFSET,
184  ("cluster_read: no buffer offset"));
185 
186  ncontig = 0;
187 
188  /*
189  * Adjust totread if needed
190  */
191  minread = read_min * size;
192  if (minread > totread)
193  totread = minread;
194 
195  /*
196  * Compute the total number of blocks that we should read
197  * synchronously.
198  */
199  if (firstread + totread > filesize)
200  totread = filesize - firstread;
201  nblks = howmany(totread, size);
202  if (nblks > racluster)
203  nblks = racluster;
204 
205  /*
206  * Now compute the number of contiguous blocks.
207  */
208  if (nblks > 1) {
209  error = VOP_BMAP(vp, lblkno, NULL,
210  &blkno, &ncontig, NULL);
211  /*
212  * If this failed to map just do the original block.
213  */
214  if (error || blkno == -1)
215  ncontig = 0;
216  }
217 
218  /*
219  * If we have contiguous data available do a cluster
220  * otherwise just read the requested block.
221  */
222  if (ncontig) {
223  /* Account for our first block. */
224  ncontig = min(ncontig + 1, nblks);
225  if (ncontig < nblks)
226  nblks = ncontig;
227  bp = cluster_rbuild(vp, filesize, lblkno,
228  blkno, size, nblks, gbflags, bp);
229  lblkno += (bp->b_bufsize / size);
230  } else {
231  bp->b_flags |= B_RAM;
232  bp->b_iocmd = BIO_READ;
233  lblkno += 1;
234  }
235  }
236 
237  /*
238  * handle the synchronous read so that it is available ASAP.
239  */
240  if (bp) {
241  if ((bp->b_flags & B_CLUSTER) == 0) {
242  vfs_busy_pages(bp, 0);
243  }
244  bp->b_flags &= ~B_INVAL;
245  bp->b_ioflags &= ~BIO_ERROR;
246  if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
247  BUF_KERNPROC(bp);
248  bp->b_iooffset = dbtob(bp->b_blkno);
249  bstrategy(bp);
250  curthread->td_ru.ru_inblock++;
251  }
252 
253  /*
254  * If we have been doing sequential I/O, then do some read-ahead.
255  */
256  while (lblkno < (origblkno + maxra)) {
257  error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
258  if (error)
259  break;
260 
261  if (blkno == -1)
262  break;
263 
264  /*
265  * We could throttle ncontig here by maxra but we might as
266  * well read the data if it is contiguous. We're throttled
267  * by racluster anyway.
268  */
269  if (ncontig) {
270  ncontig = min(ncontig + 1, racluster);
271  rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
272  size, ncontig, gbflags, NULL);
273  lblkno += (rbp->b_bufsize / size);
274  if (rbp->b_flags & B_DELWRI) {
275  bqrelse(rbp);
276  continue;
277  }
278  } else {
279  rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
280  lblkno += 1;
281  if (rbp->b_flags & B_DELWRI) {
282  bqrelse(rbp);
283  continue;
284  }
285  rbp->b_flags |= B_ASYNC | B_RAM;
286  rbp->b_iocmd = BIO_READ;
287  rbp->b_blkno = blkno;
288  }
289  if (rbp->b_flags & B_CACHE) {
290  rbp->b_flags &= ~B_ASYNC;
291  bqrelse(rbp);
292  continue;
293  }
294  if ((rbp->b_flags & B_CLUSTER) == 0) {
295  vfs_busy_pages(rbp, 0);
296  }
297  rbp->b_flags &= ~B_INVAL;
298  rbp->b_ioflags &= ~BIO_ERROR;
299  if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
300  BUF_KERNPROC(rbp);
301  rbp->b_iooffset = dbtob(rbp->b_blkno);
302  bstrategy(rbp);
303  curthread->td_ru.ru_inblock++;
304  }
305 
306  if (reqbp)
307  return (bufwait(reqbp));
308  else
309  return (error);
310 }
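To make the ad-hoc read-ahead sizing at the top of cluster_read_gb() concrete, here is a small stand-alone userland sketch of the same clamping arithmetic, using purely illustrative numbers (128 KiB mnt_iosize_max, 16 KiB blocks, the default read_max of 64); it is not kernel code.

#include <stdio.h>

/* Mirror of the clamp sequence in cluster_read_gb(), example inputs only. */
static long
lmin(long a, long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	long iosize_max = 128 * 1024;		/* stands in for mnt_iosize_max */
	long size = 16 * 1024;			/* filesystem block size */
	long filesize = 10 * 1024 * 1024;	/* 10 MiB file */
	long lblkno = 0, seqcount = 127, read_max = 64, nbuf = 8192;
	long racluster, maxra;

	racluster = iosize_max / size;		/* 8 blocks per cluster I/O */
	maxra = lmin(read_max, seqcount);
	maxra = lmin(nbuf / 8, maxra);		/* 64 blocks of read-ahead here */
	if ((lblkno + maxra + 1) * size > filesize)
		maxra = filesize / size - lblkno;
	printf("racluster=%ld maxra=%ld\n", racluster, maxra);
	return (0);
}

With these inputs the program prints racluster=8 maxra=64: read-ahead covers at most 64 blocks, issued as cluster I/Os of up to 8 blocks (128 KiB) each.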
311 
312 /*
313  * If blocks are contiguous on disk, use this to provide clustered
314  * read ahead. We will read as many blocks as possible sequentially
315  * and then parcel them up into logical blocks in the buffer hash table.
316  */
317 static struct buf *
318 cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
319  daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
320 {
321  struct bufobj *bo;
322  struct buf *bp, *tbp;
323  daddr_t bn;
324  off_t off;
325  long tinc, tsize;
326  int i, inc, j, toff;
327 
328  KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
329  ("cluster_rbuild: size %ld != f_iosize %jd\n",
330  size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
331 
332  /*
333  * avoid a division
334  */
335  while ((u_quad_t) size * (lbn + run) > filesize) {
336  --run;
337  }
338 
339  if (fbp) {
340  tbp = fbp;
341  tbp->b_iocmd = BIO_READ;
342  } else {
343  tbp = getblk(vp, lbn, size, 0, 0, gbflags);
344  if (tbp->b_flags & B_CACHE)
345  return tbp;
346  tbp->b_flags |= B_ASYNC | B_RAM;
347  tbp->b_iocmd = BIO_READ;
348  }
349  tbp->b_blkno = blkno;
350  if( (tbp->b_flags & B_MALLOC) ||
351  ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) )
352  return tbp;
353 
354  bp = trypbuf(&cluster_pbuf_freecnt);
355  if (bp == 0)
356  return tbp;
357 
358  /*
359  * We are synthesizing a buffer out of vm_page_t's, but
360  * if the block size is not page aligned then the starting
361  * address may not be either. Inherit the b_data offset
362  * from the original buffer.
363  */
364  bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
365  if ((gbflags & GB_UNMAPPED) != 0) {
366  bp->b_flags |= B_UNMAPPED;
367  bp->b_data = unmapped_buf;
368  } else {
369  bp->b_data = (char *)((vm_offset_t)bp->b_data |
370  ((vm_offset_t)tbp->b_data & PAGE_MASK));
371  }
372  bp->b_iocmd = BIO_READ;
373  bp->b_iodone = cluster_callback;
374  bp->b_blkno = blkno;
375  bp->b_lblkno = lbn;
376  bp->b_offset = tbp->b_offset;
377  KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));
378  pbgetvp(vp, bp);
379 
380  TAILQ_INIT(&bp->b_cluster.cluster_head);
381 
382  bp->b_bcount = 0;
383  bp->b_bufsize = 0;
384  bp->b_npages = 0;
385 
386  inc = btodb(size);
387  bo = &vp->v_bufobj;
388  for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
389  if (i != 0) {
390  if ((bp->b_npages * PAGE_SIZE) +
391  round_page(size) > vp->v_mount->mnt_iosize_max) {
392  break;
393  }
394 
395  tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
396  (gbflags & GB_UNMAPPED));
397 
398  /* Don't wait around for locked bufs. */
399  if (tbp == NULL)
400  break;
401 
402  /*
403  * Stop scanning if the buffer is fully valid
404  * (marked B_CACHE), or locked (may be doing a
405  * background write), or if the buffer is not
406  * VMIO backed. The clustering code can only deal
407  * with VMIO-backed buffers.
408  */
409  BO_LOCK(bo);
410  if ((tbp->b_vflags & BV_BKGRDINPROG) ||
411  (tbp->b_flags & B_CACHE) ||
412  (tbp->b_flags & B_VMIO) == 0) {
413  BO_UNLOCK(bo);
414  bqrelse(tbp);
415  break;
416  }
417  BO_UNLOCK(bo);
418 
419  /*
420  * The buffer must be completely invalid in order to
421  * take part in the cluster. If it is partially valid
422  * then we stop.
423  */
424  off = tbp->b_offset;
425  tsize = size;
426  VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
427  for (j = 0; tsize > 0; j++) {
428  toff = off & PAGE_MASK;
429  tinc = tsize;
430  if (toff + tinc > PAGE_SIZE)
431  tinc = PAGE_SIZE - toff;
432  VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object,
433  MA_OWNED);
434  if ((tbp->b_pages[j]->valid &
435  vm_page_bits(toff, tinc)) != 0)
436  break;
437  off += tinc;
438  tsize -= tinc;
439  }
440  VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
441  if (tsize > 0) {
442  bqrelse(tbp);
443  break;
444  }
445 
446  /*
447  * Set a read-ahead mark as appropriate
448  */
449  if ((fbp && (i == 1)) || (i == (run - 1)))
450  tbp->b_flags |= B_RAM;
451 
452  /*
453  * Set the buffer up for an async read (XXX should
454  * we do this only if we do not wind up brelse()ing?).
455  * Set the block number if it isn't set, otherwise
456  * if it is make sure it matches the block number we
457  * expect.
458  */
459  tbp->b_flags |= B_ASYNC;
460  tbp->b_iocmd = BIO_READ;
461  if (tbp->b_blkno == tbp->b_lblkno) {
462  tbp->b_blkno = bn;
463  } else if (tbp->b_blkno != bn) {
464  brelse(tbp);
465  break;
466  }
467  }
468  /*
469  * XXX fbp from caller may not be B_ASYNC, but we are going
470  * to biodone() it in cluster_callback() anyway
471  */
472  BUF_KERNPROC(tbp);
473  TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
474  tbp, b_cluster.cluster_entry);
475  VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
476  for (j = 0; j < tbp->b_npages; j += 1) {
477  vm_page_t m;
478  m = tbp->b_pages[j];
479  vm_page_io_start(m);
480  vm_object_pip_add(m->object, 1);
481  if ((bp->b_npages == 0) ||
482  (bp->b_pages[bp->b_npages-1] != m)) {
483  bp->b_pages[bp->b_npages] = m;
484  bp->b_npages++;
485  }
486  if (m->valid == VM_PAGE_BITS_ALL)
487  tbp->b_pages[j] = bogus_page;
488  }
489  VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
490  /*
491  * Don't inherit tbp->b_bufsize as it may be larger due to
492  * a non-page-aligned size. Instead just aggregate using
493  * 'size'.
494  */
495  if (tbp->b_bcount != size)
496  printf("warning: tbp->b_bcount wrong %ld vs %ld\n", tbp->b_bcount, size);
497  if (tbp->b_bufsize != size)
498  printf("warning: tbp->b_bufsize wrong %ld vs %ld\n", tbp->b_bufsize, size);
499  bp->b_bcount += size;
500  bp->b_bufsize += size;
501  }
502 
503  /*
504  * Fully valid pages in the cluster are already good and do not need
505  * to be re-read from disk. Replace the page with bogus_page
506  */
507  VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
508  for (j = 0; j < bp->b_npages; j++) {
509  VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED);
510  if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
511  bp->b_pages[j] = bogus_page;
512  }
513  VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
514  if (bp->b_bufsize > bp->b_kvasize)
515  panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
516  bp->b_bufsize, bp->b_kvasize);
517  bp->b_kvasize = bp->b_bufsize;
518 
519  if ((bp->b_flags & B_UNMAPPED) == 0) {
520  pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
521  (vm_page_t *)bp->b_pages, bp->b_npages);
522  }
523  return (bp);
524 }
525 
526 /*
527  * Cleanup after a clustered read or write.
528  * This is complicated by the fact that any of the buffers might have
529  * extra memory (if there were no empty buffer headers at allocbuf time)
530  * that we will need to shift around.
531  */
532 static void
533 cluster_callback(bp)
534  struct buf *bp;
535 {
536  struct buf *nbp, *tbp;
537  int error = 0;
538 
539  /*
540  * Must propagate errors to all the components.
541  */
542  if (bp->b_ioflags & BIO_ERROR)
543  error = bp->b_error;
544 
545  if ((bp->b_flags & B_UNMAPPED) == 0) {
546  pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
547  bp->b_npages);
548  }
549  /*
550  * Move memory from the large cluster buffer into the component
551  * buffers and mark IO as done on these.
552  */
553  for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
554  tbp; tbp = nbp) {
555  nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
556  if (error) {
557  tbp->b_ioflags |= BIO_ERROR;
558  tbp->b_error = error;
559  } else {
560  tbp->b_dirtyoff = tbp->b_dirtyend = 0;
561  tbp->b_flags &= ~B_INVAL;
562  tbp->b_ioflags &= ~BIO_ERROR;
563  /*
564  * XXX the bdwrite()/bqrelse() issued during
565  * cluster building clears B_RELBUF (see bqrelse()
566  * comment). If direct I/O was specified, we have
567  * to restore it here to allow the buffer and VM
568  * to be freed.
569  */
570  if (tbp->b_flags & B_DIRECT)
571  tbp->b_flags |= B_RELBUF;
572  }
573  bufdone(tbp);
574  }
575  pbrelvp(bp);
576  relpbuf(bp, &cluster_pbuf_freecnt);
577 }
578 
579 /*
580  * cluster_wbuild_wb:
581  *
582  * Implement modified write build for cluster.
583  *
584  * write_behind = 0 write behind disabled
585  * write_behind = 1 write behind normal (default)
586  * write_behind = 2 write behind backed-off
587  */
588 
589 static __inline int
590 cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
591  int gbflags)
592 {
593  int r = 0;
594 
595  switch (write_behind) {
596  case 2:
597  if (start_lbn < len)
598  break;
599  start_lbn -= len;
600  /* FALLTHROUGH */
601  case 1:
602  r = cluster_wbuild_gb(vp, size, start_lbn, len, gbflags);
603  /* FALLTHROUGH */
604  default:
605  /* FALLTHROUGH */
606  break;
607  }
608  return(r);
609 }
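In other words: with write_behind at its default of 1 the requested range [start_lbn, start_lbn + len) is pushed immediately; with write_behind set to 2 the call is shifted back by one full cluster, so the range [start_lbn - len, start_lbn) is pushed instead (or nothing at all if start_lbn < len), keeping the writes one cluster behind the active one; and with write_behind set to 0 the switch falls through to the default case and nothing is written.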
610 
611 /*
612  * Do clustered write for FFS.
613  *
614  * Four cases:
615  * 1. Write is not sequential (write asynchronously)
616  * Write is sequential:
617  * 2. beginning of cluster - begin cluster
618  * 3. middle of a cluster - add to cluster
619  * 4. end of a cluster - asynchronously write cluster
620  */
621 void
622 cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
623 {
624 
625  cluster_write_gb(vp, bp, filesize, seqcount, 0);
626 }
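Again purely as a hedged caller sketch: the decision to hand a freshly dirtied block to cluster_write() usually sits next to the synchronous and unclustered alternatives, roughly as below. The example_write_block() name and the ioflag handling are assumptions modeled on FFS-like write paths, not code from this file.

static void
example_write_block(struct vnode *vp, struct buf *bp, u_quad_t newsize,
    int seqcount, int ioflag)
{
	if (ioflag & IO_SYNC)
		(void)bwrite(bp);		/* synchronous write, wait for it */
	else if (vp->v_mount->mnt_flag & MNT_NOCLUSTERW)
		bawrite(bp);			/* asynchronous but unclustered */
	else
		cluster_write(vp, bp, newsize, seqcount);
}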
627 
628 void
629 cluster_write_gb(struct vnode *vp, struct buf *bp, u_quad_t filesize,
630  int seqcount, int gbflags)
631 {
632  daddr_t lbn;
633  int maxclen, cursize;
634  int lblocksize;
635  int async;
636 
637  if (!unmapped_buf_allowed)
638  gbflags &= ~GB_UNMAPPED;
639 
640  if (vp->v_type == VREG) {
641  async = DOINGASYNC(vp);
642  lblocksize = vp->v_mount->mnt_stat.f_iosize;
643  } else {
644  async = 0;
645  lblocksize = bp->b_bufsize;
646  }
647  lbn = bp->b_lblkno;
648  KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
649 
650  /* Initialize vnode to beginning of file. */
651  if (lbn == 0)
652  vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
653 
654  if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
655  (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
656  maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
657  if (vp->v_clen != 0) {
658  /*
659  * Next block is not sequential.
660  *
661  * If we are not writing at end of file, the process
662  * seeked to another point in the file since its last
663  * write, or we have reached our maximum cluster size,
664  * then push the previous cluster. Otherwise try
665  * reallocating to make it sequential.
666  *
667  * Change to algorithm: only push previous cluster if
668  * it was sequential from the point of view of the
669  * seqcount heuristic, otherwise leave the buffer
670  * intact so we can potentially optimize the I/O
671  * later on in the buf_daemon or update daemon
672  * flush.
673  */
674  cursize = vp->v_lastw - vp->v_cstart + 1;
675  if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
676  lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
677  if (!async && seqcount > 0) {
678  cluster_wbuild_wb(vp, lblocksize,
679  vp->v_cstart, cursize, gbflags);
680  }
681  } else {
682  struct buf **bpp, **endbp;
683  struct cluster_save *buflist;
684 
685  buflist = cluster_collectbufs(vp, bp, gbflags);
686  endbp = &buflist->bs_children
687  [buflist->bs_nchildren - 1];
688  if (VOP_REALLOCBLKS(vp, buflist)) {
689  /*
690  * Failed, push the previous cluster
691  * if *really* writing sequentially
692  * in the logical file (seqcount > 1),
693  * otherwise delay it in the hopes that
694  * the low level disk driver can
695  * optimize the write ordering.
696  */
697  for (bpp = buflist->bs_children;
698  bpp < endbp; bpp++)
699  brelse(*bpp);
700  free(buflist, M_SEGMENT);
701  if (seqcount > 1) {
702  cluster_wbuild_wb(vp,
703  lblocksize, vp->v_cstart,
704  cursize, gbflags);
705  }
706  } else {
707  /*
708  * Succeeded, keep building cluster.
709  */
710  for (bpp = buflist->bs_children;
711  bpp <= endbp; bpp++)
712  bdwrite(*bpp);
713  free(buflist, M_SEGMENT);
714  vp->v_lastw = lbn;
715  vp->v_lasta = bp->b_blkno;
716  return;
717  }
718  }
719  }
720  /*
721  * Consider beginning a cluster. If at end of file, make
722  * cluster as large as possible, otherwise find size of
723  * existing cluster.
724  */
725  if ((vp->v_type == VREG) &&
726  ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
727  (bp->b_blkno == bp->b_lblkno) &&
728  (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
729  bp->b_blkno == -1)) {
730  bawrite(bp);
731  vp->v_clen = 0;
732  vp->v_lasta = bp->b_blkno;
733  vp->v_cstart = lbn + 1;
734  vp->v_lastw = lbn;
735  return;
736  }
737  vp->v_clen = maxclen;
738  if (!async && maxclen == 0) { /* I/O not contiguous */
739  vp->v_cstart = lbn + 1;
740  bawrite(bp);
741  } else { /* Wait for rest of cluster */
742  vp->v_cstart = lbn;
743  bdwrite(bp);
744  }
745  } else if (lbn == vp->v_cstart + vp->v_clen) {
746  /*
747  * At end of cluster, write it out if seqcount tells us we
748  * are operating sequentially, otherwise let the buf or
749  * update daemon handle it.
750  */
751  bdwrite(bp);
752  if (seqcount > 1) {
753  cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
754  vp->v_clen + 1, gbflags);
755  }
756  vp->v_clen = 0;
757  vp->v_cstart = lbn + 1;
758  } else if (vm_page_count_severe()) {
759  /*
760  * We are low on memory, get it going NOW
761  */
762  bawrite(bp);
763  } else {
764  /*
765  * In the middle of a cluster, so just delay the I/O for now.
766  */
767  bdwrite(bp);
768  }
769  vp->v_lastw = lbn;
770  vp->v_lasta = bp->b_blkno;
771 }
772 
773 
774 /*
775  * This is an awful lot like cluster_rbuild...wish they could be combined.
776  * The last lbn argument is the current block on which I/O is being
777  * performed. Check to see that it doesn't fall in the middle of
778  * the current block (if last_bp == NULL).
779  */
780 int
781 cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len)
782 {
783 
784  return (cluster_wbuild_gb(vp, size, start_lbn, len, 0));
785 }
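For completeness, a minimal hedged sketch of a flush-path caller; example_flush_run() is hypothetical, but the pattern of handing cluster_wbuild() a run of logically contiguous delayed-write blocks and using its return value as a byte count matches how the routine is driven from the buffer layer.

static int
example_flush_run(struct vnode *vp, daddr_t start_lbn, int len)
{
	long size = vp->v_mount->mnt_stat.f_iosize;

	/* Returns the number of bytes queued for write. */
	return (cluster_wbuild(vp, size, start_lbn, len));
}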
786 
787 int
788 cluster_wbuild_gb(struct vnode *vp, long size, daddr_t start_lbn, int len,
789  int gbflags)
790 {
791  struct buf *bp, *tbp;
792  struct bufobj *bo;
793  int i, j;
794  int totalwritten = 0;
795  int dbsize = btodb(size);
796 
797  if (!unmapped_buf_allowed)
798  gbflags &= ~GB_UNMAPPED;
799 
800  bo = &vp->v_bufobj;
801  while (len > 0) {
802  /*
803  * If the buffer is not delayed-write (i.e. dirty), or it
804  * is delayed-write but either locked or inval, it cannot
805  * partake in the clustered write.
806  */
807  BO_LOCK(bo);
808  if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
809  (tbp->b_vflags & BV_BKGRDINPROG)) {
810  BO_UNLOCK(bo);
811  ++start_lbn;
812  --len;
813  continue;
814  }
815  if (BUF_LOCK(tbp,
816  LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_MTX(bo))) {
817  ++start_lbn;
818  --len;
819  continue;
820  }
821  if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
822  BUF_UNLOCK(tbp);
823  ++start_lbn;
824  --len;
825  continue;
826  }
827  if (tbp->b_pin_count > 0) {
828  BUF_UNLOCK(tbp);
829  ++start_lbn;
830  --len;
831  continue;
832  }
833  bremfree(tbp);
834  tbp->b_flags &= ~B_DONE;
835 
836  /*
837  * Extra memory in the buffer, punt on this buffer.
838  * XXX we could handle this in most cases, but we would
839  * have to push the extra memory down to after our max
840  * possible cluster size and then potentially pull it back
841  * up if the cluster was terminated prematurely--too much
842  * hassle.
843  */
844  if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
845  (B_CLUSTEROK | B_VMIO)) ||
846  (tbp->b_bcount != tbp->b_bufsize) ||
847  (tbp->b_bcount != size) ||
848  (len == 1) ||
849  ((bp = (vp->v_vflag & VV_MD) != 0 ?
850  trypbuf(&cluster_pbuf_freecnt) :
851  getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
852  totalwritten += tbp->b_bufsize;
853  bawrite(tbp);
854  ++start_lbn;
855  --len;
856  continue;
857  }
858 
859  /*
860  * We got a pbuf to make the cluster in,
861  * so initialize it.
862  */
863  TAILQ_INIT(&bp->b_cluster.cluster_head);
864  bp->b_bcount = 0;
865  bp->b_bufsize = 0;
866  bp->b_npages = 0;
867  if (tbp->b_wcred != NOCRED)
868  bp->b_wcred = crhold(tbp->b_wcred);
869 
870  bp->b_blkno = tbp->b_blkno;
871  bp->b_lblkno = tbp->b_lblkno;
872  bp->b_offset = tbp->b_offset;
873 
874  /*
875  * We are synthesizing a buffer out of vm_page_t's, but
876  * if the block size is not page aligned then the starting
877  * address may not be either. Inherit the b_data offset
878  * from the original buffer.
879  */
880  if ((gbflags & GB_UNMAPPED) == 0 ||
881  (tbp->b_flags & B_VMIO) == 0) {
882  bp->b_data = (char *)((vm_offset_t)bp->b_data |
883  ((vm_offset_t)tbp->b_data & PAGE_MASK));
884  } else {
885  bp->b_flags |= B_UNMAPPED;
886  bp->b_data = unmapped_buf;
887  }
888  bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
889  B_NEEDCOMMIT));
890  bp->b_iodone = cluster_callback;
891  pbgetvp(vp, bp);
892  /*
893  * From this location in the file, scan forward to see
894  * if there are buffers with adjacent data that need to
895  * be written as well.
896  */
897  for (i = 0; i < len; ++i, ++start_lbn) {
898  if (i != 0) { /* If not the first buffer */
899  /*
900  * If the adjacent data is not even in core it
901  * can't need to be written.
902  */
903  BO_LOCK(bo);
904  if ((tbp = gbincore(bo, start_lbn)) == NULL ||
905  (tbp->b_vflags & BV_BKGRDINPROG)) {
906  BO_UNLOCK(bo);
907  break;
908  }
909 
910  /*
911  * If it IS in core, but has different
912  * characteristics, or is locked (which
913  * means it could be undergoing a background
914  * I/O or be in a weird state), then don't
915  * cluster with it.
916  */
917  if (BUF_LOCK(tbp,
918  LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
919  BO_MTX(bo)))
920  break;
921 
922  if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
923  B_INVAL | B_DELWRI | B_NEEDCOMMIT))
924  != (B_DELWRI | B_CLUSTEROK |
925  (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
926  tbp->b_wcred != bp->b_wcred) {
927  BUF_UNLOCK(tbp);
928  break;
929  }
930 
931  /*
932  * Check that the combined cluster
933  * would make sense with regard to pages
934  * and would not be too large
935  */
936  if ((tbp->b_bcount != size) ||
937  ((bp->b_blkno + (dbsize * i)) !=
938  tbp->b_blkno) ||
939  ((tbp->b_npages + bp->b_npages) >
940  (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
941  BUF_UNLOCK(tbp);
942  break;
943  }
944 
945  /*
946  * Do not pull in pinned buffers.
947  */
948  if (tbp->b_pin_count > 0) {
949  BUF_UNLOCK(tbp);
950  break;
951  }
952 
953  /*
954  * Ok, it's passed all the tests,
955  * so remove it from the free list
956  * and mark it busy. We will use it.
957  */
958  bremfree(tbp);
959  tbp->b_flags &= ~B_DONE;
960  } /* end of code for non-first buffers only */
961  /*
962  * If the IO is via the VM then we do some
963  * special VM hackery (yuck). Since the buffer's
964  * block size may not be page-aligned it is possible
965  * for a page to be shared between two buffers. We
966  * have to get rid of the duplication when building
967  * the cluster.
968  */
969  if (tbp->b_flags & B_VMIO) {
970  vm_page_t m;
971 
972  VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
973  if (i != 0) { /* if not first buffer */
974  for (j = 0; j < tbp->b_npages; j += 1) {
975  m = tbp->b_pages[j];
976  if (m->oflags & VPO_BUSY) {
977  VM_OBJECT_UNLOCK(
978  tbp->b_bufobj->bo_object);
979  bqrelse(tbp);
980  goto finishcluster;
981  }
982  }
983  }
984  for (j = 0; j < tbp->b_npages; j += 1) {
985  m = tbp->b_pages[j];
986  vm_page_io_start(m);
987  vm_object_pip_add(m->object, 1);
988  if ((bp->b_npages == 0) ||
989  (bp->b_pages[bp->b_npages - 1] != m)) {
990  bp->b_pages[bp->b_npages] = m;
991  bp->b_npages++;
992  }
993  }
994  VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
995  }
996  bp->b_bcount += size;
997  bp->b_bufsize += size;
998  /*
999  * If any of the clustered buffers have their
1000  * B_BARRIER flag set, transfer that request to
1001  * the cluster.
1002  */
1003  bp->b_flags |= (tbp->b_flags & B_BARRIER);
1004  tbp->b_flags &= ~(B_DONE | B_BARRIER);
1005  tbp->b_flags |= B_ASYNC;
1006  tbp->b_ioflags &= ~BIO_ERROR;
1007  tbp->b_iocmd = BIO_WRITE;
1008  bundirty(tbp);
1009  reassignbuf(tbp); /* put on clean list */
1010  bufobj_wref(tbp->b_bufobj);
1011  BUF_KERNPROC(tbp);
1012  TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
1013  tbp, b_cluster.cluster_entry);
1014  }
1015  finishcluster:
1016  if ((bp->b_flags & B_UNMAPPED) == 0) {
1017  pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
1018  (vm_page_t *)bp->b_pages, bp->b_npages);
1019  }
1020  if (bp->b_bufsize > bp->b_kvasize)
1021  panic(
1022  "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
1023  bp->b_bufsize, bp->b_kvasize);
1024  bp->b_kvasize = bp->b_bufsize;
1025  totalwritten += bp->b_bufsize;
1026  bp->b_dirtyoff = 0;
1027  bp->b_dirtyend = bp->b_bufsize;
1028  bawrite(bp);
1029 
1030  len -= i;
1031  }
1032  return totalwritten;
1033 }
1034 
1035 /*
1036  * Collect together all the buffers in a cluster.
1037  * Plus add one additional buffer.
1038  */
1039 static struct cluster_save *
1040 cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
1041 {
1042  struct cluster_save *buflist;
1043  struct buf *bp;
1044  daddr_t lbn;
1045  int i, len;
1046 
1047  len = vp->v_lastw - vp->v_cstart + 1;
1048  buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
1049  M_SEGMENT, M_WAITOK);
1050  buflist->bs_nchildren = 0;
1051  buflist->bs_children = (struct buf **) (buflist + 1);
1052  for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
1053  (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
1054  gbflags, &bp);
1055  buflist->bs_children[i] = bp;
1056  if (bp->b_blkno == bp->b_lblkno)
1057  VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
1058  NULL, NULL);
1059  }
1060  buflist->bs_children[i] = bp = last_bp;
1061  if (bp->b_blkno == bp->b_lblkno)
1062  VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
1063  buflist->bs_nchildren = i + 1;
1064  return (buflist);
1065 }