FreeBSD kernel kern code
kern_fork.c
Go to the documentation of this file.
1 /*-
2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
3  * The Regents of the University of California. All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  * may be used to endorse or promote products derived from this software
20  * without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$BSDSUniX$");
39 
40 #include "opt_kdtrace.h"
41 #include "opt_ktrace.h"
42 #include "opt_kstack_pages.h"
43 #include "opt_procdesc.h"
44 
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/eventhandler.h>
49 #include <sys/fcntl.h>
50 #include <sys/filedesc.h>
51 #include <sys/jail.h>
52 #include <sys/kernel.h>
53 #include <sys/kthread.h>
54 #include <sys/sysctl.h>
55 #include <sys/lock.h>
56 #include <sys/malloc.h>
57 #include <sys/mutex.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/procdesc.h>
61 #include <sys/pioctl.h>
62 #include <sys/ptrace.h>
63 #include <sys/racct.h>
64 #include <sys/resourcevar.h>
65 #include <sys/sched.h>
66 #include <sys/syscall.h>
67 #include <sys/vmmeter.h>
68 #include <sys/vnode.h>
69 #include <sys/acct.h>
70 #include <sys/ktr.h>
71 #include <sys/ktrace.h>
72 #include <sys/unistd.h>
73 #include <sys/sdt.h>
74 #include <sys/sx.h>
75 #include <sys/sysent.h>
76 #include <sys/signalvar.h>
77 
78 #include <security/audit/audit.h>
79 #include <security/mac/mac_framework.h>
80 
81 #include <vm/vm.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_extern.h>
85 #include <vm/uma.h>
86 
87 #ifdef KDTRACE_HOOKS
88 #include <sys/dtrace_bsd.h>
89 dtrace_fork_func_t dtrace_fasttrap_fork;
90 #endif
91 
93 SDT_PROBE_DEFINE3(proc, kernel, , create, "struct proc *",
94  "struct proc *", "int");
95 
/*
 * Fallback definition of the fork(2) argument structure, used only when
 * <sys/sysproto.h> has not already been seen.  sys_fork() never reads its
 * uap argument, so the structure holds just a placeholder member.
 */
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int dummy;	/* placeholder; not read by sys_fork() */
};
#endif
101 
102 /* ARGSUSED */
103 int
104 sys_fork(struct thread *td, struct fork_args *uap)
105 {
106  int error;
107  struct proc *p2;
108 
109  error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
110  if (error == 0) {
111  td->td_retval[0] = p2->p_pid;
112  td->td_retval[1] = 0;
113  }
114  return (error);
115 }
116 
117 /* ARGUSED */
118 int
119 sys_pdfork(td, uap)
120  struct thread *td;
121  struct pdfork_args *uap;
122 {
123 #ifdef PROCDESC
124  int error, fd;
125  struct proc *p2;
126 
127  /*
128  * It is necessary to return fd by reference because 0 is a valid file
129  * descriptor number, and the child needs to be able to distinguish
130  * itself from the parent using the return value.
131  */
132  error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
133  &fd, uap->flags);
134  if (error == 0) {
135  td->td_retval[0] = p2->p_pid;
136  td->td_retval[1] = 0;
137  error = copyout(&fd, uap->fdp, sizeof(fd));
138  }
139  return (error);
140 #else
141  return (ENOSYS);
142 #endif
143 }
144 
145 /* ARGSUSED */
146 int
147 sys_vfork(struct thread *td, struct vfork_args *uap)
148 {
149  int error, flags;
150  struct proc *p2;
151 
152 #ifdef XEN
153  flags = RFFDG | RFPROC; /* validate that this is still an issue */
154 #else
155  flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
156 #endif
157  error = fork1(td, flags, 0, &p2, NULL, 0);
158  if (error == 0) {
159  td->td_retval[0] = p2->p_pid;
160  td->td_retval[1] = 0;
161  }
162  return (error);
163 }
164 
165 int
166 sys_rfork(struct thread *td, struct rfork_args *uap)
167 {
168  struct proc *p2;
169  int error;
170 
171  /* Don't allow kernel-only flags. */
172  if ((uap->flags & RFKERNELONLY) != 0)
173  return (EINVAL);
174 
175  AUDIT_ARG_FFLAGS(uap->flags);
176  error = fork1(td, uap->flags, 0, &p2, NULL, 0);
177  if (error == 0) {
178  td->td_retval[0] = p2 ? p2->p_pid : 0;
179  td->td_retval[1] = 0;
180  }
181  return (error);
182 }
183 
/*
 * nprocs counts existing processes; it is incremented in do_fork() and
 * compared against maxproc in fork1().
 * lastpid is the pid most recently handed out by fork_findpid() (calls
 * made with RFHIGHPID do not update it); it is exported read-only as the
 * kern.lastpid sysctl.
 */
int	nprocs = 1;		/* process 0 */
int	lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
    "Last used PID");
188 
/*
 * Random component to lastpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste randomness entropy, and don't let it be impossibly large.  Using a
 * modulus that is too big causes a LOT more process table scans and slows
 * down fork processing as the pidchecked caching is defeated.
 *
 * A value of 0 disables the random component (fork_findpid() only adds it
 * when randompid is non-zero).  The value is set through the
 * kern.randompid sysctl handler below, which clamps it.
 */
static int randompid = 0;
198 
199 static int
200 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
201 {
202  int error, pid;
203 
204  error = sysctl_wire_old_buffer(req, sizeof(int));
205  if (error != 0)
206  return(error);
207  sx_xlock(&allproc_lock);
208  pid = randompid;
209  error = sysctl_handle_int(oidp, &pid, 0, req);
210  if (error == 0 && req->newptr != NULL) {
211  if (pid < 0 || pid > pid_max - 100) /* out of range */
212  pid = pid_max - 100;
213  else if (pid < 2) /* NOP */
214  pid = 0;
215  else if (pid < 100) /* Make it reasonable */
216  pid = 100;
217  randompid = pid;
218  }
219  sx_xunlock(&allproc_lock);
220  return (error);
221 }
222 
/*
 * kern.randompid: read/write integer sysctl served by
 * sysctl_kern_randompid(), which validates and stores the modulus.
 */
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
225 
/*
 * Pick a process ID that is not currently in use as a pid, process group
 * id or session id by any process on the allproc or zombproc lists.
 *
 * flags is the RF* flag word of the fork request; only RFHIGHPID is
 * examined here.  Returns the chosen pid.  Unless RFHIGHPID is set,
 * lastpid is updated to the returned value.
 *
 * The static pidchecked caches an upper bound: ids in the range
 * [lastpid + 1, pidchecked - 1] are known to be free, so the list scan is
 * skipped while trypid stays below it.
 */
static int
fork_findpid(int flags)
{
	struct proc *p;
	int trypid;
	static int pidchecked = 0;

	/*
	 * Requires allproc_lock in order to iterate over the list
	 * of processes, and proctree_lock to access p_pgrp.
	 */
	sx_assert(&allproc_lock, SX_LOCKED);
	sx_assert(&proctree_lock, SX_LOCKED);

	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from lastpid+1 through pidchecked-1).
	 *
	 * If RFHIGHPID is set (used during system boot), do not allocate
	 * low-numbered pids.
	 */
	trypid = lastpid + 1;
	if (flags & RFHIGHPID) {
		if (trypid < 10)
			trypid = 10;
	} else {
		if (randompid)
			trypid += arc4random() % randompid;
	}
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (trypid >= pid_max) {
		trypid = trypid % pid_max;
		if (trypid < 100)
			trypid += 100;
		/* Invalidate the cached free range to force a rescan. */
		pidchecked = 0;
	}
	if (trypid >= pidchecked) {
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than trypid, so we can avoid checking for a while.
		 */
		p = LIST_FIRST(&allproc);
again:
		for (; p != NULL; p = LIST_NEXT(p, p_list)) {
			/*
			 * Collision with a pid, pgrp id or session id: try
			 * the next value; restart the whole scan once we
			 * run past the range verified so far.
			 */
			while (p->p_pid == trypid ||
			    (p->p_pgrp != NULL &&
			    (p->p_pgrp->pg_id == trypid ||
			    (p->p_session != NULL &&
			    p->p_session->s_sid == trypid)))) {
				trypid++;
				if (trypid >= pidchecked)
					goto retry;
			}
			/* Tighten the upper bound of the known-free range. */
			if (p->p_pid > trypid && pidchecked > p->p_pid)
				pidchecked = p->p_pid;
			if (p->p_pgrp != NULL) {
				if (p->p_pgrp->pg_id > trypid &&
				    pidchecked > p->p_pgrp->pg_id)
					pidchecked = p->p_pgrp->pg_id;
				if (p->p_session != NULL &&
				    p->p_session->s_sid > trypid &&
				    pidchecked > p->p_session->s_sid)
					pidchecked = p->p_session->s_sid;
			}
		}
		/* Second pass: repeat the scan over the zombie list. */
		if (!doingzomb) {
			doingzomb = 1;
			p = LIST_FIRST(&zombproc);
			goto again;
		}
	}

	/*
	 * RFHIGHPID does not mess with the lastpid counter during boot.
	 */
	if (flags & RFHIGHPID)
		pidchecked = 0;
	else
		lastpid = trypid;

	return (trypid);
}
317 
318 static int
319 fork_norfproc(struct thread *td, int flags)
320 {
321  int error;
322  struct proc *p1;
323 
324  KASSERT((flags & RFPROC) == 0,
325  ("fork_norfproc called with RFPROC set"));
326  p1 = td->td_proc;
327 
328  if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
329  (flags & (RFCFDG | RFFDG))) {
330  PROC_LOCK(p1);
331  if (thread_single(SINGLE_BOUNDARY)) {
332  PROC_UNLOCK(p1);
333  return (ERESTART);
334  }
335  PROC_UNLOCK(p1);
336  }
337 
338  error = vm_forkproc(td, NULL, NULL, NULL, flags);
339  if (error)
340  goto fail;
341 
342  /*
343  * Close all file descriptors.
344  */
345  if (flags & RFCFDG) {
346  struct filedesc *fdtmp;
347  fdtmp = fdinit(td->td_proc->p_fd);
348  fdfree(td);
349  p1->p_fd = fdtmp;
350  }
351 
352  /*
353  * Unshare file descriptors (from parent).
354  */
355  if (flags & RFFDG)
356  fdunshare(p1, td);
357 
358 fail:
359  if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
360  (flags & (RFCFDG | RFFDG))) {
361  PROC_LOCK(p1);
363  PROC_UNLOCK(p1);
364  }
365  return (error);
366 }
367 
/*
 * Second half of fork1(): turn the pre-allocated proc p2 / thread td2 into
 * a child of the process that owns td.
 *
 *   td       forking (parent) thread
 *   flags    RF* flags of the request
 *   p2, td2  new process and its first thread, allocated by the caller
 *   vm2      new vmspace handed on to vm_forkproc() (the caller passes
 *            NULL when RFMEM is set)
 *   pdflags  flags passed to procdesc_new() when RFPROCDESC is set
 *
 * Entered with proctree_lock share-locked and allproc_lock exclusively
 * locked (both asserted); both are released in here.  This function has
 * no failure return.
 *
 * In order: a pid is allocated and p2 is linked into allproc and the pid
 * hash; the copied/zeroed regions of struct proc and struct thread are
 * initialized; credentials, descriptor table, signal actions, limits and
 * statistics are set up; p2 joins the parent's process group and is
 * attached to its parent (or to initproc with RFNOWAIT); vm_forkproc()
 * finishes the child; process_fork event handlers run; p2 becomes
 * PRS_NORMAL and, unless RFSTOPPED, runnable; finally the parent waits
 * while TDB_STOPATFORK is set on td2.
 */
static void
do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
    struct vmspace *vm2, int pdflags)
{
	struct proc *p1, *pptr;
	int p2_held, trypid;
	struct filedesc *fd;
	struct filedesc_to_leader *fdtol;
	struct sigacts *newsigacts;

	sx_assert(&proctree_lock, SX_SLOCKED);
	sx_assert(&allproc_lock, SX_XLOCKED);

	p2_held = 0;
	p1 = td->td_proc;

	/*
	 * Increment the nprocs resource before blocking can occur.  There
	 * are hard-limits as to the number of processes that can run.
	 */
	nprocs++;

	trypid = fork_findpid(flags);

	sx_sunlock(&proctree_lock);

	p2->p_state = PRS_NEW;		/* protect against others */
	p2->p_pid = trypid;
	AUDIT_ARG_PID(p2->p_pid);
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
	tidhash_add(td2);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	sx_xunlock(&allproc_lock);

	/* Inherit the p_startcopy..p_endcopy region from the parent. */
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    __rangeof(struct proc, p_startcopy, p_endcopy));
	pargs_hold(p2->p_args);
	PROC_UNLOCK(p1);

	/* Clear the p_startzero..p_endzero region of the child. */
	bzero(&p2->p_startzero,
	    __rangeof(struct proc, p_startzero, p_endzero));
	p2->p_treeflag = 0;

	p2->p_ucred = crhold(td->td_ucred);

	/* Tell the prison that we exist. */
	prison_proc_hold(p2->p_ucred->cr_prison);

	PROC_UNLOCK(p2);

	/*
	 * Malloc things while we don't hold any locks.
	 */
	if (flags & RFSIGSHARE)
		newsigacts = NULL;
	else
		newsigacts = sigacts_alloc();

	/*
	 * Copy filedesc.
	 */
	if (flags & RFCFDG) {
		/* RFCFDG: the child starts from a fresh table. */
		fd = fdinit(p1->p_fd);
		fdtol = NULL;
	} else if (flags & RFFDG) {
		/* RFFDG: the child gets its own copy of the table. */
		fd = fdcopy(p1->p_fd);
		fdtol = NULL;
	} else {
		/* Neither flag: parent and child share one table. */
		fd = fdshare(p1->p_fd);
		if (p1->p_fdtol == NULL)
			p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
			    p1->p_leader);
		if ((flags & RFTHREAD) != 0) {
			/*
			 * Shared file descriptor table, and shared
			 * process leaders.
			 */
			fdtol = p1->p_fdtol;
			FILEDESC_XLOCK(p1->p_fd);
			fdtol->fdl_refcount++;
			FILEDESC_XUNLOCK(p1->p_fd);
		} else {
			/*
			 * Shared file descriptor table, and different
			 * process leaders.
			 */
			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
			    p1->p_fd, p2);
		}
	}
	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */

	PROC_LOCK(p2);
	PROC_LOCK(p1);

	bzero(&td2->td_startzero,
	    __rangeof(struct thread, td_startzero, td_endzero));

	bcopy(&td->td_startcopy, &td2->td_startcopy,
	    __rangeof(struct thread, td_startcopy, td_endcopy));

	/* The child's first thread is named after the process (p_comm). */
	bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
	td2->td_sigstk = td->td_sigstk;
	td2->td_sigmask = td->td_sigmask;
	td2->td_flags = TDF_INMEM;
	td2->td_lend_user_pri = PRI_MAX;
	td2->td_dbg_sc_code = td->td_dbg_sc_code;
	td2->td_dbg_sc_narg = td->td_dbg_sc_narg;

#ifdef VIMAGE
	td2->td_vnet = NULL;
	td2->td_vnet_lpush = NULL;
#endif

	/*
	 * Allow the scheduler to initialize the child.
	 */
	thread_lock(td);
	sched_fork(td, td2);
	thread_unlock(td);

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 */
	p2->p_flag = P_INMEM;
	p2->p_flag2 = 0;
	p2->p_swtick = ticks;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);
	td2->td_ucred = crhold(p2->p_ucred);

	if (flags & RFSIGSHARE) {
		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
	} else {
		sigacts_copy(newsigacts, p1->p_sigacts);
		p2->p_sigacts = newsigacts;
	}

	/* Choose the signal stored in the child's p_sigparent. */
	if (flags & RFTSIGZMB)
		p2->p_sigparent = RFTSIGNUM(flags);
	else if (flags & RFLINUXTHPN)
		p2->p_sigparent = SIGUSR1;
	else
		p2->p_sigparent = SIGCHLD;

	p2->p_textvp = p1->p_textvp;
	p2->p_fd = fd;
	p2->p_fdtol = fdtol;

	if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
		p2->p_flag |= P_PROTECTED;
		p2->p_flag2 |= P2_INHERIT_PROTECTED;
	}

	/*
	 * p_limit is copy-on-write.  Bump its refcount.
	 */
	lim_fork(p1, p2);

	pstats_fork(p1->p_stats, p2->p_stats);

	PROC_UNLOCK(p1);
	PROC_UNLOCK(p2);

	/* Bump references to the text vnode (for procfs). */
	if (p2->p_textvp)
		vref(p2->p_textvp);

	/*
	 * Set up linkage for kernel based threading.
	 */
	if ((flags & RFTHREAD) != 0) {
		mtx_lock(&ppeers_lock);
		p2->p_peers = p1->p_peers;
		p1->p_peers = p2;
		p2->p_leader = p1->p_leader;
		mtx_unlock(&ppeers_lock);
		PROC_LOCK(p1->p_leader);
		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
			PROC_UNLOCK(p1->p_leader);
			/*
			 * The task leader is exiting, so process p1 is
			 * going to be killed shortly.  Since p1 obviously
			 * isn't dead yet, we know that the leader is either
			 * sending SIGKILL's to all the processes in this
			 * task or is sleeping waiting for all the peers to
			 * exit.  We let p1 complete the fork, but we need
			 * to go ahead and kill the new process p2 since
			 * the task leader may not get a chance to send
			 * SIGKILL to it.  We leave it on the list so that
			 * the task leader will wait for this new process
			 * to commit suicide.
			 */
			PROC_LOCK(p2);
			kern_psignal(p2, SIGKILL);
			PROC_UNLOCK(p2);
		} else
			PROC_UNLOCK(p1->p_leader);
	} else {
		p2->p_peers = NULL;
		p2->p_leader = p2;
	}

	sx_xlock(&proctree_lock);
	PGRP_LOCK(p1->p_pgrp);
	PROC_LOCK(p2);
	PROC_LOCK(p1);

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & P_SUGID;
	td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
	SESS_LOCK(p1->p_session);
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	SESS_UNLOCK(p1->p_session);
	if (flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	/* The child joins the parent's process group, right after p1. */
	p2->p_pgrp = p1->p_pgrp;
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	PGRP_UNLOCK(p1->p_pgrp);
	LIST_INIT(&p2->p_children);
	LIST_INIT(&p2->p_orphans);

	callout_init_mtx(&p2->p_itcallout, &p2->p_mtx, 0);

	/*
	 * If PF_FORK is set, the child process inherits the
	 * procfs ioctl flags from its parent.
	 */
	if (p1->p_pfsflags & PF_FORK) {
		p2->p_stops = p1->p_stops;
		p2->p_pfsflags = p1->p_pfsflags;
	}

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	_PHOLD(p1);
	PROC_UNLOCK(p1);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if (flags & RFNOWAIT)
		pptr = initproc;
	else
		pptr = p1;
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	sx_xunlock(&proctree_lock);

	/* Inform accounting that we have forked. */
	p2->p_acflag = AFORK;
	PROC_UNLOCK(p2);

#ifdef KTRACE
	ktrprocfork(p1, p2);
#endif

	/*
	 * Finish creating the child process.  It will return via a different
	 * execution path later.  (ie: directly into user mode)
	 */
	vm_forkproc(td, p2, td2, vm2, flags);

	/*
	 * Account the fork in the per-CPU counters, classified by the exact
	 * flag combination: fork, vfork, kernel thread (parent is proc0) or
	 * generic rfork.
	 */
	if (flags == (RFFDG | RFPROC)) {
		PCPU_INC(cnt.v_forks);
		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
		PCPU_INC(cnt.v_vforks);
		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else if (p1 == &proc0) {
		PCPU_INC(cnt.v_kthreads);
		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	} else {
		PCPU_INC(cnt.v_rforks);
		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
		    p2->p_vmspace->vm_ssize);
	}

#ifdef PROCDESC
	/*
	 * Associate the process descriptor with the process before anything
	 * can happen that might cause that process to need the descriptor.
	 * However, don't do this until after fork(2) can no longer fail.
	 */
	if (flags & RFPROCDESC)
		procdesc_new(p2, pdflags);
#endif

	/*
	 * Both processes are set up, now check if any loadable modules want
	 * to adjust anything.
	 */
	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);

	/*
	 * Set the child start time and mark the process as being complete.
	 */
	PROC_LOCK(p2);
	PROC_LOCK(p1);
	microuptime(&p2->p_stats->p_start);
	PROC_SLOCK(p2);
	p2->p_state = PRS_NORMAL;
	PROC_SUNLOCK(p2);

#ifdef KDTRACE_HOOKS
	/*
	 * Tell the DTrace fasttrap provider about the new process so that any
	 * tracepoints inherited from the parent can be removed.  We have to do
	 * this only after p_state is PRS_NORMAL since the fasttrap module will
	 * use pfind() later on.
	 */
	if ((flags & RFMEM) == 0 && dtrace_fasttrap_fork)
		dtrace_fasttrap_fork(p1, p2);
#endif
	if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
	    P_FOLLOWFORK)) {
		/*
		 * Arrange for debugger to receive the fork event.
		 *
		 * We can report PL_FLAG_FORKED regardless of
		 * P_FOLLOWFORK settings, but it does not make a sense
		 * for runaway child.
		 */
		td->td_dbgflags |= TDB_FORK;
		td->td_dbg_forked = p2->p_pid;
		td2->td_dbgflags |= TDB_STOPATFORK;
		/* Hold p2 until the wait at the end of this function. */
		_PHOLD(p2);
		p2_held = 1;
	}
	if (flags & RFPPWAIT) {
		td->td_pflags |= TDP_RFPPWAIT;
		td->td_rfppwait_p = p2;
	}
	PROC_UNLOCK(p2);
	if ((flags & RFSTOPPED) == 0) {
		/*
		 * If RFSTOPPED not requested, make child runnable and
		 * add to run queue.
		 */
		thread_lock(td2);
		TD_SET_CAN_RUN(td2);
		sched_add(td2, SRQ_BORING);
		thread_unlock(td2);
	}

	/*
	 * Now can be swapped.
	 */
	_PRELE(p1);
	PROC_UNLOCK(p1);

	/*
	 * Tell any interested parties about the new process.
	 */
	knote_fork(&p1->p_klist, p2->p_pid);
	SDT_PROBE3(proc, kernel, , create, p2, p1, flags);

	/*
	 * Wait until debugger is attached to child.
	 */
	PROC_LOCK(p2);
	while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
		cv_wait(&p2->p_dbgwait, &p2->p_mtx);
	if (p2_held)
		_PRELE(p2);
	PROC_UNLOCK(p2);
}
757 
758 int
759 fork1(struct thread *td, int flags, int pages, struct proc **procp,
760  int *procdescp, int pdflags)
761 {
762  struct proc *p1;
763  struct proc *newproc;
764  int ok;
765  struct thread *td2;
766  struct vmspace *vm2;
767  vm_ooffset_t mem_charged;
768  int error;
769  static int curfail;
770  static struct timeval lastfail;
771 #ifdef PROCDESC
772  struct file *fp_procdesc = NULL;
773 #endif
774 
775  /* Check for the undefined or unimplemented flags. */
776  if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
777  return (EINVAL);
778 
779  /* Signal value requires RFTSIGZMB. */
780  if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
781  return (EINVAL);
782 
783  /* Can't copy and clear. */
784  if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
785  return (EINVAL);
786 
787  /* Check the validity of the signal number. */
788  if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
789  return (EINVAL);
790 
791 #ifdef PROCDESC
792  if ((flags & RFPROCDESC) != 0) {
793  /* Can't not create a process yet get a process descriptor. */
794  if ((flags & RFPROC) == 0)
795  return (EINVAL);
796 
797  /* Must provide a place to put a procdesc if creating one. */
798  if (procdescp == NULL)
799  return (EINVAL);
800  }
801 #endif
802 
803  p1 = td->td_proc;
804 
805  /*
806  * Here we don't create a new process, but we divorce
807  * certain parts of a process from itself.
808  */
809  if ((flags & RFPROC) == 0) {
810  *procp = NULL;
811  return (fork_norfproc(td, flags));
812  }
813 
814 #ifdef PROCDESC
815  /*
816  * If required, create a process descriptor in the parent first; we
817  * will abandon it if something goes wrong. We don't finit() until
818  * later.
819  */
820  if (flags & RFPROCDESC) {
821  error = falloc(td, &fp_procdesc, procdescp, 0);
822  if (error != 0)
823  return (error);
824  }
825 #endif
826 
827  mem_charged = 0;
828  vm2 = NULL;
829  if (pages == 0)
830  pages = KSTACK_PAGES;
831  /* Allocate new proc. */
832  newproc = uma_zalloc(proc_zone, M_WAITOK);
833  td2 = FIRST_THREAD_IN_PROC(newproc);
834  if (td2 == NULL) {
835  td2 = thread_alloc(pages);
836  if (td2 == NULL) {
837  error = ENOMEM;
838  goto fail1;
839  }
840  proc_linkup(newproc, td2);
841  } else {
842  if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
843  if (td2->td_kstack != 0)
844  vm_thread_dispose(td2);
845  if (!thread_alloc_stack(td2, pages)) {
846  error = ENOMEM;
847  goto fail1;
848  }
849  }
850  }
851 
852  if ((flags & RFMEM) == 0) {
853  vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
854  if (vm2 == NULL) {
855  error = ENOMEM;
856  goto fail1;
857  }
858  if (!swap_reserve(mem_charged)) {
859  /*
860  * The swap reservation failed. The accounting
861  * from the entries of the copied vm2 will be
862  * substracted in vmspace_free(), so force the
863  * reservation there.
864  */
865  swap_reserve_force(mem_charged);
866  error = ENOMEM;
867  goto fail1;
868  }
869  } else
870  vm2 = NULL;
871 
872  /*
873  * XXX: This is ugly; when we copy resource usage, we need to bump
874  * per-cred resource counters.
875  */
876  newproc->p_ucred = p1->p_ucred;
877 
878  /*
879  * Initialize resource accounting for the child process.
880  */
881  error = racct_proc_fork(p1, newproc);
882  if (error != 0) {
883  error = EAGAIN;
884  goto fail1;
885  }
886 
887 #ifdef MAC
888  mac_proc_init(newproc);
889 #endif
890  knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
891  STAILQ_INIT(&newproc->p_ktr);
892 
893  /* We have to lock the process tree while we look for a pid. */
894  sx_slock(&proctree_lock);
895 
896  /*
897  * Although process entries are dynamically created, we still keep
898  * a global limit on the maximum number we will create. Don't allow
899  * a nonprivileged user to use the last ten processes; don't let root
900  * exceed the limit. The variable nprocs is the current number of
901  * processes, maxproc is the limit.
902  */
903  sx_xlock(&allproc_lock);
904  if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
905  PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
906  error = EAGAIN;
907  goto fail;
908  }
909 
910  /*
911  * Increment the count of procs running with this uid. Don't allow
912  * a nonprivileged user to exceed their current limit.
913  *
914  * XXXRW: Can we avoid privilege here if it's not needed?
915  */
916  error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
917  if (error == 0)
918  ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
919  else {
920  PROC_LOCK(p1);
921  ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
922  lim_cur(p1, RLIMIT_NPROC));
923  PROC_UNLOCK(p1);
924  }
925  if (ok) {
926  do_fork(td, flags, newproc, td2, vm2, pdflags);
927 
928  /*
929  * Return child proc pointer to parent.
930  */
931  *procp = newproc;
932 #ifdef PROCDESC
933  if (flags & RFPROCDESC)
934  procdesc_finit(newproc->p_procdesc, fp_procdesc);
935 #endif
936  racct_proc_fork_done(newproc);
937  return (0);
938  }
939 
940  error = EAGAIN;
941 fail:
942  sx_sunlock(&proctree_lock);
943  if (ppsratecheck(&lastfail, &curfail, 1))
944  printf("maxproc limit exceeded by uid %u (pid %d); see tuning(7) and login.conf(5)\n",
945  td->td_ucred->cr_ruid, p1->p_pid);
946  sx_xunlock(&allproc_lock);
947 #ifdef MAC
948  mac_proc_destroy(newproc);
949 #endif
950  racct_proc_exit(newproc);
951 fail1:
952  if (vm2 != NULL)
953  vmspace_free(vm2);
954  uma_zfree(proc_zone, newproc);
955 #ifdef PROCDESC
956  if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
957  fdrop(fp_procdesc, td);
958 #endif
959  pause("fork", hz / 2);
960  return (error);
961 }
962 
963 /*
964  * Handle the return of a child process from fork1(). This function
965  * is called from the MD fork_trampoline() entry point.
966  */
967 void
968 fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
969  struct trapframe *frame)
970 {
971  struct proc *p;
972  struct thread *td;
973  struct thread *dtd;
974 
975  td = curthread;
976  p = td->td_proc;
977  KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
978 
979  CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
980  td, td->td_sched, p->p_pid, td->td_name);
981 
982  sched_fork_exit(td);
983  /*
984  * Processes normally resume in mi_switch() after being
985  * cpu_switch()'ed to, but when children start up they arrive here
986  * instead, so we must do much the same things as mi_switch() would.
987  */
988  if ((dtd = PCPU_GET(deadthread))) {
989  PCPU_SET(deadthread, NULL);
990  thread_stash(dtd);
991  }
992  thread_unlock(td);
993 
994  /*
995  * cpu_set_fork_handler intercepts this function call to
996  * have this call a non-return function to stay in kernel mode.
997  * initproc has its own fork handler, but it does return.
998  */
999  KASSERT(callout != NULL, ("NULL callout in fork_exit"));
1000  callout(arg, frame);
1001 
1002  /*
1003  * Check if a kernel thread misbehaved and returned from its main
1004  * function.
1005  */
1006  if (p->p_flag & P_KTHREAD) {
1007  printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
1008  td->td_name, p->p_pid);
1009  kthread_exit();
1010  }
1011  mtx_assert(&Giant, MA_NOTOWNED);
1012 
1013  if (p->p_sysent->sv_schedtail != NULL)
1014  (p->p_sysent->sv_schedtail)(td);
1015 }
1016 
/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  Giant is not held on entry, and must not
 * be held on return.  This function is passed in to fork_exit() as the
 * first parameter and is called when returning to a new userland process.
 *
 *   td     the new thread
 *   frame  trap frame handed on to userret()
 *
 * Before returning to user mode it handles the debugger hand-shake:
 * if do_fork() marked the thread TDB_STOPATFORK, the child either attaches
 * itself to its parent's debugger and stops, or clears the request and
 * wakes the waiting parent; otherwise, in an already traced process, a
 * system call exit event is reported.
 */
void
fork_return(struct thread *td, struct trapframe *frame)
{
	struct proc *p, *dbg;

	p = td->td_proc;
	if (td->td_dbgflags & TDB_STOPATFORK) {
		/* proctree_lock is taken before the proc lock. */
		sx_xlock(&proctree_lock);
		PROC_LOCK(p);
		if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
		    (P_TRACED | P_FOLLOWFORK)) {
			/*
			 * If debugger still wants auto-attach for the
			 * parent's children, do it now.
			 */
			/* The debugger is taken to be the parent's parent. */
			dbg = p->p_pptr->p_pptr;
			p->p_flag |= P_TRACED;
			/* Remember the original parent's pid in p_oppid. */
			p->p_oppid = p->p_pptr->p_pid;
			CTR2(KTR_PTRACE,
		    "fork_return: attaching to new child pid %d: oppid %d",
			    p->p_pid, p->p_oppid);
			proc_reparent(p, dbg);
			sx_xunlock(&proctree_lock);
			td->td_dbgflags |= TDB_CHILD | TDB_SCX;
			ptracestop(td, SIGSTOP);
			td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
		} else {
			/*
			 * ... otherwise clear the request.
			 */
			sx_xunlock(&proctree_lock);
			td->td_dbgflags &= ~TDB_STOPATFORK;
			/* Wake the parent sleeping on p_dbgwait in do_fork(). */
			cv_broadcast(&p->p_dbgwait);
		}
		PROC_UNLOCK(p);
	} else if (p->p_flag & P_TRACED) {
 		/*
		 * This is the start of a new thread in a traced
		 * process.  Report a system call exit event.
		 */
		PROC_LOCK(p);
		td->td_dbgflags |= TDB_SCX;
		_STOPEVENT(p, S_SCX, td->td_dbg_sc_code);
		if ((p->p_stops & S_PT_SCX) != 0)
			ptracestop(td, SIGTRAP);
		td->td_dbgflags &= ~TDB_SCX;
		PROC_UNLOCK(p);
	}

	userret(td, frame);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_SYSRET))
		ktrsysret(SYS_fork, 0, 0);
#endif
	mtx_assert(&Giant, MA_NOTOWNED);
}
struct proclist allproc
Definition: kern_proc.c:134
void sigacts_copy(struct sigacts *dest, struct sigacts *src)
Definition: kern_sig.c:3485
void userret(struct thread *td, struct trapframe *frame)
Definition: subr_trap.c:101
static int fork_norfproc(struct thread *td, int flags)
Definition: kern_fork.c:319
struct sigacts * sigacts_alloc(void)
Definition: kern_sig.c:3452
int fd
Definition: kern_exec.c:199
int nprocs
Definition: kern_fork.c:184
void thread_single_end(void)
Definition: kern_thread.c:958
__FBSDID("$BSDSUniX$")
rlim_t lim_cur(struct proc *p, int which)
pid_t pid_max
Definition: subr_param.c:97
static int fork_findpid(int flags)
Definition: kern_fork.c:227
int priv_check_cred(struct ucred *cred, int priv, int flags)
Definition: kern_priv.c:76
int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
Definition: kern_time.c:948
void pstats_fork(struct pstats *src, struct pstats *dst)
Definition: kern_proc.c:1040
struct filedesc_to_leader * filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
void sched_fork_exit(struct thread *td)
Definition: sched_4bsd.c:1685
struct proc proc0
Definition: init_main.c:99
void prison_proc_hold(struct prison *pr)
Definition: kern_jail.c:2641
struct mtx ppeers_lock
Definition: kern_proc.c:138
void proc_linkup(struct proc *p, struct thread *td)
Definition: kern_thread.c:252
int racct_proc_fork(struct proc *parent, struct proc *child)
Definition: kern_racct.c:1273
struct proc * initproc
Definition: init_main.c:102
int maxproc
Definition: subr_param.c:87
int falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
void kern_psignal(struct proc *p, int sig)
Definition: kern_sig.c:1975
uma_zone_t proc_zone
Definition: kern_proc.c:139
int fork1(struct thread *td, int flags, int pages, struct proc **procp, int *procdescp, int pdflags)
Definition: kern_fork.c:759
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid,"I","Random PID modulus")
void vref(struct vnode *vp)
Definition: vfs_subr.c:2302
void sched_add(struct thread *td, int flags)
Definition: sched_4bsd.c:1258
void startprofclock(struct proc *p)
Definition: kern_clock.c:652
struct thread * thread_alloc(int pages)
Definition: kern_thread.c:342
void fdunshare(struct proc *p, struct thread *td)
struct sx allproc_lock
Definition: kern_proc.c:136
int sys_pdfork(struct thread *td, struct pdfork_args *uap)
Definition: kern_fork.c:119
int dummy
Definition: kern_fork.c:98
void racct_proc_fork_done(struct proc *child)
Definition: kern_racct.c:1280
void fork_return(struct thread *td, struct trapframe *frame)
Definition: kern_fork.c:1024
struct filedesc * fdinit(struct filedesc *fdp)
struct mtx Giant
Definition: kern_mutex.c:140
SDT_PROVIDER_DECLARE(proc)
void sched_fork(struct thread *td, struct thread *childtd)
Definition: sched_4bsd.c:786
void lim_fork(struct proc *p1, struct proc *p2)
int chgproccnt(struct uidinfo *uip, int diff, rlim_t max)
static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
Definition: kern_fork.c:200
int thread_alloc_stack(struct thread *td, int pages)
Definition: kern_thread.c:359
int ptracestop(struct thread *td, int sig)
Definition: kern_sig.c:2434
int sysctl_handle_int(SYSCTL_HANDLER_ARGS)
Definition: kern_sysctl.c:986
void tidhash_add(struct thread *td)
Definition: kern_thread.c:1043
int thread_single(int mode)
Definition: kern_thread.c:616
int sys_fork(struct thread *td, struct fork_args *uap)
Definition: kern_fork.c:104
struct ucred * crhold(struct ucred *cr)
Definition: kern_prot.c:1824
struct proclist zombproc
Definition: kern_proc.c:135
int pause(const char *wmesg, int timo)
Definition: kern_synch.c:350
static int randompid
Definition: kern_fork.c:197
int printf(const char *fmt,...)
Definition: subr_prf.c:367
void fdfree(struct thread *td)
static void do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2, struct vmspace *vm2, int pdflags)
Definition: kern_fork.c:369
void fork_exit(void(*callout)(void *, struct trapframe *), void *arg, struct trapframe *frame)
Definition: kern_fork.c:968
int sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
Definition: kern_sysctl.c:1364
SDT_PROBE_DEFINE3(proc, kernel,, create,"struct proc *","struct proc *","int")
int sys_rfork(struct thread *td, struct rfork_args *uap)
Definition: kern_fork.c:166
struct filedesc * fdcopy(struct filedesc *fdp)
void racct_proc_exit(struct proc *p)
Definition: kern_racct.c:1285
struct sigacts * sigacts_hold(struct sigacts *ps)
Definition: kern_sig.c:3476
void thread_stash(struct thread *td)
Definition: kern_thread.c:304
volatile int ticks
Definition: kern_clock.c:387
void microuptime(struct timeval *tvp)
Definition: kern_tc.c:194
void knote_fork(struct knlist *list, int pid)
Definition: kern_event.c:452
void proc_reparent(struct proc *child, struct proc *parent)
Definition: kern_exit.c:1306
struct sx proctree_lock
Definition: kern_proc.c:137
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD,&lastpid, 0,"Last used PID")
int sys_vfork(struct thread *td, struct vfork_args *uap)
Definition: kern_fork.c:147
void kthread_exit(void)
Definition: kern_kthread.c:322
void knlist_init_mtx(struct knlist *knl, struct mtx *lock)
Definition: kern_event.c:1995
void pargs_hold(struct pargs *pa)
Definition: kern_proc.c:1455
struct filedesc * fdshare(struct filedesc *fdp)
int hz
Definition: subr_param.c:84
int lastpid
Definition: kern_fork.c:185