原来进程是这么创建的

AshinZ

发布于 2023-11-01 17:03:16

2840

发布于 2023-11-01 17:03:16

文章被收录于专栏：程栩的性能优化笔记

大家好，我是程栩，一个专注于性能的大厂程序员，分享包括但不限于计算机体系结构、性能优化、云原生的知识。

引

前面我们介绍了一些关于进程的知识，今天我们来聊一聊进程是如何创建的。今天的内容基于《Linux内核设计与实现》以及Linux v6.3版本。

进程创建

许多操作系统都提供了产生进程的机制，Linux内核中，采取了组合的方式来实现这样的机制，通过fork和exec的组合，将进程的生成分为两个步骤：简单来说就是fork负责生成一个进程，然后exec读入可执行文件执行：

fork与exec的简化过程

当然，以上只是简化的步骤。进程的创建并不是复制进程描述符即可，需要做许多细节的操作。

在内核中，通过kernel_clone来实现fork系统调用，而与fork类似的系统调用，例如vfork、__clone等，都是通过给kernel_clone传入不同的参数来实现：

// kernel/fork.c L2999
#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork() system call, declared as a zero-argument syscall through the
 * SYSCALL_DEFINE0 macro. It only fills in a kernel_clone_args and hands
 * it to kernel_clone(); no clone flags are set.
 *
 * Returns whatever kernel_clone() returns when CONFIG_MMU is defined,
 * and -EINVAL otherwise.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
 /* The only field set explicitly is the exit signal: SIGCHLD. */
 struct kernel_clone_args args = {
  .exit_signal = SIGCHLD,
 }; 
 /* kernel_clone()'s result becomes the syscall's return value. */
 return kernel_clone(&args);
#else
 /* can not support in nommu mode */
 return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork() system call, declared as a zero-argument syscall. Like fork()
 * it goes through kernel_clone() with exit_signal = SIGCHLD, but it
 * additionally passes CLONE_VFORK | CLONE_VM.
 */
SYSCALL_DEFINE0(vfork)
{
 struct kernel_clone_args args = {
  /*
   * CLONE_VFORK: kernel_clone() waits via wait_for_vfork_done() after
   * waking the child. CLONE_VM: documented in the flag table as
   * "VM shared between processes".
   */
  .flags  = CLONE_VFORK | CLONE_VM,
  .exit_signal = SIGCHLD,
 };

 return kernel_clone(&args);
}
#endif

我们来看看kernel_clone的实现：

// kernel/fork.c L2869
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 *
 * Returns the new task's pid number as computed by pid_vnr(), or a
 * negative error: -EINVAL for the CLONE_PIDFD/CLONE_PARENT_SETTID
 * conflict below, or the error encoded in copy_process()'s result.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
 u64 clone_flags = args->flags;
 struct completion vfork;
 struct pid *pid;
 struct task_struct *p;
 int trace = 0;
 pid_t nr;
 // Sanity-check the argument combination before doing any work.
 /*
  * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
  * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
  * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
  * field in struct clone_args and it still doesn't make sense to have
  * them both point at the same memory location. Performing this check
  * here has the advantage that we don't need to have a separate helper
  * to check for legacy clone().
  */
 if ((args->flags & CLONE_PIDFD) &&
     (args->flags & CLONE_PARENT_SETTID) &&
     (args->pidfd == args->parent_tid))
  return -EINVAL;
 
 /*
  * Determine whether and which event to report to ptracer.  When
  * called from kernel_thread or CLONE_UNTRACED is explicitly
  * requested, no event is reported; otherwise, report if the event
  * for the type of forking is enabled.
  */
 if (!(clone_flags & CLONE_UNTRACED)) {
  if (clone_flags & CLONE_VFORK)
   trace = PTRACE_EVENT_VFORK;
  else if (args->exit_signal != SIGCHLD)
   trace = PTRACE_EVENT_CLONE;
  else
   trace = PTRACE_EVENT_FORK;

  /* Keep the chosen event only if it is enabled for current. */
  if (likely(!ptrace_event_enabled(current, trace)))
   trace = 0;
 }
 // Create the child through copy_process(); the result is either the
 // new task_struct or an ERR_PTR-encoded error (checked just below).
 p = copy_process(NULL, trace, NUMA_NO_NODE, args);
 add_latent_entropy();

 if (IS_ERR(p))
  return PTR_ERR(p);

 /*
  * Do this prior waking up the new thread - the thread pointer
  * might get invalid after that point, if the thread exits quickly.
  */
 trace_sched_process_fork(current, p);

 /* pid is handed back with put_pid() right before returning. */
 pid = get_task_pid(p, PIDTYPE_PID);
 nr = pid_vnr(pid);

 /* On request, store the child's id at the user-supplied address. */
 if (clone_flags & CLONE_PARENT_SETTID)
  put_user(nr, args->parent_tid);

 /*
  * vfork: hook the on-stack completion into the child before it is
  * woken up; it is waited on further down.
  */
 if (clone_flags & CLONE_VFORK) {
  p->vfork_done = &vfork;
  init_completion(&vfork);
  get_task_struct(p);
 }

 if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
  /* lock the task to synchronize with memcg migration */
  task_lock(p);
  lru_gen_add_mm(p->mm);
  task_unlock(p);
 }
 // Wake up the newly created child task.
 // NOTE(review): the article says the intent is to let the child run
 // first where possible; that policy is not visible in this function.
 wake_up_new_task(p);

 /* forking complete and child started to run, tell ptracer */
 if (unlikely(trace))
  ptrace_event_pid(trace, pid);

 /*
  * vfork: wait on the completion set up above; a zero return from
  * wait_for_vfork_done() is reported as PTRACE_EVENT_VFORK_DONE.
  */
 if (clone_flags & CLONE_VFORK) {
  if (!wait_for_vfork_done(p, &vfork))
   ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
 }

 /* Paired with get_task_pid() above. */
 put_pid(pid);
 return nr;
}

为了简化，我们尝试画一个简单的图：

kernel_clone

copy_process

那么copy_process所做的事情就成为了重中之重了，复制进程的时候到底复制了什么呢？我们来看：

// kernel/fork.c L2238
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
__latent_entropy struct task_struct *copy_process(
     struct pid *pid,
     int trace,
     int node,
     struct kernel_clone_args *args)

该函数是一个非常长的函数（L2238-L2808），因为涉及到针对各种参数的处理。copy_process的大致执行过程如下：

首先copy_process会进行各种权限的校验，如：

// kernel/fork.c L2304
if (clone_flags & CLONE_PIDFD) {
  /*
   * - CLONE_DETACHED is blocked so that we can potentially
   *   reuse it later for CLONE_PIDFD.
   * - CLONE_THREAD is blocked until someone really needs it.
   */
  if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
   return ERR_PTR(-EINVAL);
 }

完成校验后，copy_process会进行信号的相关处理：

// kernel/fork.c L2314
 /*
  * Force any signals received before this point to be delivered
  * before the fork happens.  Collect up signals sent to multiple
  * processes that happen during the fork and delay them so that
  * they appear to happen after the fork.
  */
 sigemptyset(&delayed.signal);
 INIT_HLIST_NODE(&delayed.node);

 spin_lock_irq(&current->sighand->siglock);
 if (!(clone_flags & CLONE_THREAD))
  hlist_add_head(&delayed.node, &current->signal->multiprocess);
 recalc_sigpending();
 spin_unlock_irq(&current->sighand->siglock);
 retval = -ERESTARTNOINTR;
 if (task_sigpending(current))
  goto fork_out;

从注释中我们可以看出，这里会将fork前收到的信号传送出去，而fork执行过程中的信号则做一些延迟。

接着，copy_process会调用dup_task_struct为新进程创建内核栈、thread_info等结构体，这时候子进程和父进程的进程描述符是完全一样的：

// kernel/fork.c L2333
 p = dup_task_struct(current, node);
 if (!p)
  goto fork_out;

在执行完这一步后，子进程会设置部分flags的值并进行诸多成员的清零和初始化：

// kernel/fork.c L2336
 p->flags &= ~PF_KTHREAD;
 if (args->kthread)
  p->flags |= PF_KTHREAD;
 if (args->user_worker)
  p->flags |= PF_USER_WORKER;
 if (args->io_thread) {
  /*
   * Mark us an IO worker, and block any signal that isn't
   * fatal or STOP
   */
  p->flags |= PF_IO_WORKER;
  siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
 }

 if (args->name)
  strscpy_pad(p->comm, args->name, sizeof(p->comm));

 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 /*
  * Clear TID on mm_release()?
  */
 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

之后会将该进程分配到某个CPU上去：

// kernel/fork.c L2476
 /* Perform scheduler related setup. Assign this task to a CPU. */
 retval = sched_fork(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_policy;

 retval = perf_event_init_task(p, clone_flags);
 if (retval)
  goto bad_fork_cleanup_policy;
 retval = audit_alloc(p);
 if (retval)
  goto bad_fork_cleanup_perf;

接着，根据传递给kernel_clone的参数，copy_process拷贝或者共享打开的文件、文件系统信息等内容：

// kernel/fork.c L2487
 /* copy all the process information */
 shm_init_task(p);
 retval = security_task_alloc(p, clone_flags);
 if (retval)
  goto bad_fork_cleanup_audit;
 retval = copy_semundo(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_security;
 retval = copy_files(clone_flags, p, args->no_files);
 if (retval)
  goto bad_fork_cleanup_semundo;
 retval = copy_fs(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_files;
 retval = copy_sighand(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_fs;
 retval = copy_signal(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_sighand;
 retval = copy_mm(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_signal;
 retval = copy_namespaces(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_mm;
 retval = copy_io(clone_flags, p);
 if (retval)
  goto bad_fork_cleanup_namespaces;
 retval = copy_thread(p, args);
 if (retval)
  goto bad_fork_cleanup_io;

接着，调用alloc_pid为新进程分配一个有效的pid：

// kernel/fork.c L2525
 if (pid != &init_struct_pid) {
  pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
    args->set_tid_size);
  if (IS_ERR(pid)) {
   retval = PTR_ERR(pid);
   goto bad_fork_cleanup_thread;
  }
 }

最后，copy_process做一些扫尾的工作并返回相应的指针。

在阅读《Linux内核设计与实现》一书过程中，其在这里讲解的进程创建过程与笔者记录的并不完全一致。「简单的说，copy_process就是对当前进程做了一个复制，并且基于传入的参数对这个进程描述符做或多或少的修改，在以一个新的pid作为进程的标记之后就返回。」

接着，我们就需要尽可能地让子进程优先于父进程运行。一般子进程在执行之后就会立刻调用exec函数，如果我们让子进程先运行的话，就可以避免写时拷贝的额外开销；而如果父进程先执行，则可能立马就会对地址空间做写入，从而触发写时拷贝。

线程创建

首先我们需要知道，在Linux中，我们并没有对线程thread做更细节的描述，而是把线程看成是一个特殊的进程来实现。「也即线程是一个与其他进程共享某些资源的进程。」而在线程创建的过程中，也就自然而然地复用了进程创建的过程，只不过在传入的参数上有所区别：

// kernel/fork.c L2964
/*
 * Create a kernel thread.
 *
 * @fn:    stored in args.fn
 * @arg:   stored in args.fn_arg
 * @name:  stored in args.name (copy_process copies it into p->comm)
 * @flags: clone flags; only the lower 32 bits are used. The CSIGNAL
 *         bits are split off as the exit signal.
 *
 * Returns the result of kernel_clone().
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
      unsigned long flags)
{
 struct kernel_clone_args args = {
  /* Always add CLONE_VM | CLONE_UNTRACED; strip the signal bits. */
  .flags  = ((lower_32_bits(flags) | CLONE_VM |
        CLONE_UNTRACED) & ~CSIGNAL),
  /* The CSIGNAL bits of flags carry the exit signal. */
  .exit_signal = (lower_32_bits(flags) & CSIGNAL),
  .fn  = fn,
  .fn_arg  = arg,
  .name  = name,
  /* copy_process sets PF_KTHREAD on the new task when this is set. */
  .kthread = 1,
 };

 return kernel_clone(&args);
}

/*
 * Create a user mode thread.
 *
 * Builds the same kernel_clone_args as kernel_thread() except that
 * neither .name nor .kthread is set.
 *
 * @fn:    stored in args.fn
 * @arg:   stored in args.fn_arg
 * @flags: clone flags; only the lower 32 bits are used. The CSIGNAL
 *         bits are split off as the exit signal.
 *
 * Returns the result of kernel_clone().
 */
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
 struct kernel_clone_args args = {
  /* Always add CLONE_VM | CLONE_UNTRACED; strip the signal bits. */
  .flags  = ((lower_32_bits(flags) | CLONE_VM |
        CLONE_UNTRACED) & ~CSIGNAL),
  /* The CSIGNAL bits of flags carry the exit signal. */
  .exit_signal = (lower_32_bits(flags) & CSIGNAL),
  .fn  = fn,
  .fn_arg  = arg,
 };

 return kernel_clone(&args);
}

可以看到，无论是内核线程还是用户线程，都是通过调用kernel_clone来进行实现的。这里的诸如CLONE_VM、CLONE_UNTRACED等标志都是来告诉内核到底这个线程共享了哪些内容的，例如CLONE_VM就是指父子共享地址空间。相关参数定义可以在include/uapi/linux/sched.h中找到：

// include/uapi/linux/sched.h L7
/*
 * cloning flags:
 */
#define CSIGNAL  0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED  0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED  0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_NEWCGROUP  0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS  0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC  0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER  0x10000000 /* New user namespace */
#define CLONE_NEWPID  0x20000000 /* New pid namespace */
#define CLONE_NEWNET  0x40000000 /* New network namespace */
#define CLONE_IO  0x80000000 /* Clone io context */

/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */

/*
 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
 * syscalls only:
 */
#define CLONE_NEWTIME 0x00000080 /* New time namespace */

值得注意的是，内核的内核线程是在kernel/kthread.c中实现的，但是其底层也是调用我们前面说的kernel_thread函数：

// kernel/kthread.c L394
/*
 * Spawn the kernel thread described by @create by calling
 * kernel_thread() with kthread as the thread function, @create as its
 * argument and create->full_name as its name.
 *
 * On success nothing else is done here. On failure (pid < 0) the
 * function frees create->full_name and then either frees @create (when
 * create->done was already NULL) or stores ERR_PTR(pid) in
 * create->result and completes create->done.
 */
static void create_kthread(struct kthread_create_info *create)
{
 int pid;

#ifdef CONFIG_NUMA
 /* Record the requested node on the current task before forking. */
 current->pref_node_fork = create->node;
#endif
 /* We want our own signal handler (we take no signals by default). */
 /* SIGCHLD sits in the CSIGNAL bits, so kernel_thread() uses it as exit_signal. */
 pid = kernel_thread(kthread, create, create->full_name,
       CLONE_FS | CLONE_FILES | SIGCHLD);
 if (pid < 0) {
  /* Release the structure when caller killed by a fatal signal. */
  /* xchg() takes the completion pointer and leaves NULL behind in one step. */
  struct completion *done = xchg(&create->done, NULL);

  kfree(create->full_name);
  if (!done) {
   kfree(create);
   return;
  }
  /* Otherwise report the error through create->result and signal done. */
  create->result = ERR_PTR(pid);
  complete(done);
 }
}