当前位置：首页 > news >正文

Linux内核进程创建流程

news 2025/8/21 19:02:14

本文代码基于Linux5.10
内容主要参考《Linux内核深度解析》余华兵

当Linux内核要创建一个新进程时，流程大致如下

ret = fork();
if (ret == 0) {/* 子进程装载程序 */ret = execve(filename, argv, envp);
} else if (ret > 0) {/* 父进程 */
}

大致可以分为创建新进程和装载程序这两个过程。

创建新进程

Linux中创建新进程常用的系统调用有两个，分别是clone和fork（此外还有vfork和clone3），其中clone和fork的定义如下：

kernel/fork.c
SYSCALL_DEFINE0(fork) {#ifdef CONFIG_MMUstruct kernel_clone_args args = {.exit_signal = SIGCHLD,};return kernel_clone(&args);
#else/* can not support in nommu mode */return -EINVAL;
#endif
}
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,int, stack_size,int __user *, parent_tidptr,int __user *, child_tidptr,unsigned long, tls) {struct kernel_clone_args args = {.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),.pidfd		= parent_tidptr,.child_tid	= child_tidptr,.parent_tid	= parent_tidptr,.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),.stack		= newsp,.tls		= tls,};return kernel_clone(&args);
}

可以理解为fork是clone的简化版本，clone可以更精确地控制创建进程的行为。我们在创建线程时，使用的就是clone（没错，在Linux里面，线程实际上也是进程）。

clone 和 fork 都会调用kernel_clone 这个函数去创建进程，只不过两者传递的参数不同。

Linux 目前通过kernel_clone_args 这个数据结构来传递参数。

include/linux/sched/task.h
struct kernel_clone_args {u64 flags;int __user *pidfd;int __user *child_tid;int __user *parent_tid;int exit_signal;unsigned long stack;unsigned long stack_size;unsigned long tls;pid_t *set_tid;/* Number of elements in *set_tid */size_t set_tid_size;int cgroup;struct cgroup *cgrp;struct css_set *cset;
};

flags : clone 标志。

stack ：只在创建线程时有意义，用来指定线程的用户栈的地址

stack_size：只在创建线程时有意义，用来指定线程的用户栈的大小

创建新进程的流程大致如下：

调用函数copy_process 创建新进程
调用函数wake_up_new_task 唤醒新进程。

copy_process

copy_process的流程如下：

1.检查标志是否合法。

kernel/fork.c/** Don't allow sharing the root directory with processes in a different* namespace*/if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))return ERR_PTR(-EINVAL);if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))return ERR_PTR(-EINVAL);.....

2.dup_task_struct。以当前进程为模板，创建task_struct数据结构

这里面会分配task_struct 的数据结构，并分配内核栈。

在下面展示的实现中，内核栈是从名为thread_stack的slab缓存（thread_stack_cache）中分配的。

kernel/fork.c
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,int node)
{unsigned long *stack;stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);stack = kasan_reset_tag(stack);tsk->stack = stack;return stack;
}
void thread_stack_cache_init(void)
{thread_stack_cache = kmem_cache_create_usercopy("thread_stack",THREAD_SIZE, THREAD_SIZE, 0, 0,THREAD_SIZE, NULL);BUG_ON(thread_stack_cache == NULL);
}

3.检查用户的进程数量限制

kernel/fork.cif (atomic_read(&p->real_cred->user->processes) >=task_rlimit(p, RLIMIT_NPROC)) {if (p->real_cred->user != INIT_USER &&!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))goto bad_fork_free;}

在用户空间，可以通过ulimit -u 来设置用户最大可以创建的进程数量。

4.copy_creds

调用copy_creds 复制或者共享证书（credentials），如果新进程和当前进程属于同一个线程组，那么它们共享证书。

5. 检查线程数量限制

kernel/fork.cif (data_race(nr_threads >= max_threads))goto bad_fork_cleanup_count;

nr_threads 会在每次创建进程/线程后加1。

6.sched_fork

设置调度器相关的参数

7.复制或者共享资源

这里会复制虚拟内存，文件，文件系统数据，信号处理数据等各种资源。这里重点介绍一下copy_thread 这个流程，这里会复制进程的各种寄存器。

arch/arm64/kernel/process.c
int copy_thread(unsigned long clone_flags, unsigned long stack_start,unsigned long stk_sz, struct task_struct *p, unsigned long tls)
{struct pt_regs *childregs = task_pt_regs(p);memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));   /*        1         *//** In case p was allocated the same task_struct pointer as some* other recently-exited task, make sure p is disassociated from* any cpu that may have run that now-exited task recently.* Otherwise we could erroneously skip reloading the FPSIMD* registers for p.*/fpsimd_flush_task_state(p);ptrauth_thread_init_kernel(p);if (likely(!(p->flags & PF_KTHREAD))) {            /*        2         */         *childregs = *current_pt_regs();childregs->regs[0] = 0;													/*        3         *//** Read the current TLS pointer from tpidr_el0 as it may be* out-of-sync with the saved value.*/*task_user_tls(p) = read_sysreg(tpidr_el0);if (stack_start) {                         /*        4         */if (is_compat_thread(task_thread_info(p)))childregs->compat_sp = stack_start;elsechildregs->sp = stack_start;}/** If a TLS pointer was passed to clone, use it for the new* thread.*/if (clone_flags & CLONE_SETTLS)p->thread.uw.tp_value = tls;} else {                /*        5         *//*  * A kthread has no context to ERET to, so ensure any buggy* ERET is treated as an illegal exception return.** When a user task is created from a kthread, childregs will* be initialized by start_thread() or start_compat_thread().*/memset(childregs, 0, sizeof(struct pt_regs));childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT;p->thread.cpu_context.x19 = stack_start;p->thread.cpu_context.x20 = stk_sz;}p->thread.cpu_context.pc = (unsigned long)ret_from_fork; /*        6         */p->thread.cpu_context.sp = (unsigned long)childregs;ptrace_hw_copy_thread(p);return 0;
}

用户态相关的运行环境（寄存器现场）保存在pt_regs 中，内核态的运行环境保存在thread结构体（thread.cpu_context）中。

(1) 获取pt_regs，并初始化thread 结构体

(2) 对于用户进程的处理

(3) 设置返回值为0。(子进程fork返回0就是在这里设置)

(4) 设置线程的用户栈

(5) 对于内核线程的处理，这里X19存储线程函数的地址，X20存放线程函数的参数

(6) 设置内核态的PC和SP值，在发生进程切换、新进程第一次被调度运行时，会跳转到这里设置的PC（即ret_from_fork）处开始执行

wake up new task

在新进程创建之后，会尝试去唤醒它，让它尽快得到执行，其流程大致如下：

新进程第一次运行

前文说到，copy_thread会把新进程的PC设置为ret_from_fork。

arch/arm64/kernel/entry.S
/** This is how we return from a fork.*/
SYM_CODE_START(ret_from_fork)bl	schedule_tailcbz	x19, 1f				// not a kernel threadmov	x0, x20blr	x19
1:	get_current_task tskb	ret_to_user
SYM_CODE_END(ret_from_fork)

在ret_from_fork中，首先进行调度切换的清理工作(schedule_tail)。如果是用户进程（x19为0），调用ret_to_user返回用户空间；如果是内核线程，X19存储线程函数的地址，X20存放线程函数的参数，这里会跳转到x19所存储的函数地址执行。

装载程序

一般来说，用户层会调用execve或者execveat 执行某个具体的程序。

int execve(const char *filename, char *const argv[ ], char *const envp[ ]);

用户程序一般是一个elf文件，内核会按照elf文件的格式去解析它，并设置PC到对应的entry。这部分内容不在此详细说明。

实例： init 进程的创建和运行

init 是kernel启动后运行的第一个用户态进程，我们来看看它是怎么创建和运行起来的。

在rest_init中，会调用kernel_thread 创建init进程

init/main.c
noinline void __ref rest_init(void)
{.....pid = kernel_thread(kernel_init, NULL, CLONE_FS);.....
}pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{struct kernel_clone_args args = {.flags		= ((lower_32_bits(flags) | CLONE_VM |CLONE_UNTRACED) & ~CSIGNAL),.exit_signal	= (lower_32_bits(flags) & CSIGNAL),.stack		= (unsigned long)fn,.stack_size	= (unsigned long)arg,};return kernel_clone(&args);
}

可以看到kernel_thread其实也是调用kernel_clone创建线程，其中stack被设置成了入口函数，stack_size被设置成了参数。

在kernel_init中，会尝试装载init进程。

init/main.c
static int __ref kernel_init(void *unused)
{
.....
if (ramdisk_execute_command) {ret = run_init_process(ramdisk_execute_command);if (!ret)return 0;pr_err("Failed to execute %s (error %d)\n",ramdisk_execute_command, ret);}
....
}

装载完成之后，就会跳转到用户态的init进程执行了。

查看全文

http://www.lryc.cn/news/91615.html