前言
本文基于Linux内核2.6.11.1版本,不同内核版本的实现会有所差异。代码中已添加注释,因此不再做过多的文字说明。
1、do_fork()
do_fork()函数负责处理clone()、fork()、vfork()系统调用,下面是这三个系统调用在ARM体系结构上的入口函数:
/*
 * fork() entry point: create a child via do_fork().
 *
 * The child is given the caller's current user stack pointer and uses
 * SIGCHLD as its exit signal; no TID pointers are supplied.
 * This is called indirectly via a small wrapper.
 */
asmlinkage int sys_fork(struct pt_regs *regs)
{
	unsigned long parent_sp = regs->ARM_sp;

	return do_fork(SIGCHLD, parent_sp, regs, 0, NULL, NULL);
}
/*
 * clone() entry point: clone the calling program thread via do_fork().
 *
 * A zero newsp means "reuse the caller's stack pointer". tls_val is
 * accepted but not forwarded by this wrapper.
 * This is called indirectly via a small wrapper.
 */
asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp,
			 int __user *parent_tidptr, int tls_val,
			 int __user *child_tidptr, struct pt_regs *regs)
{
	unsigned long child_sp = newsp ? newsp : regs->ARM_sp;

	return do_fork(clone_flags, child_sp, regs, 0,
		       parent_tidptr, child_tidptr);
}
/*
 * vfork() entry point: like fork(), but the request carries CLONE_VFORK
 * and CLONE_VM in addition to the SIGCHLD exit signal.
 */
asmlinkage int sys_vfork(struct pt_regs *regs)
{
	const unsigned long vfork_flags = CLONE_VFORK | CLONE_VM | SIGCHLD;

	return do_fork(vfork_flags, regs->ARM_sp, regs, 0, NULL, NULL);
}
这三个创建进程的函数最终都会调用到do_fork()。
do_fork()函数原型如下:
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * clone_flags:   CLONE_* flags OR-ed with the child's exit signal
 *                (the sys_fork/sys_vfork wrappers pass SIGCHLD).
 * stack_start:   user-mode stack pointer for the child.
 * regs:          the caller's saved register set.
 * stack_size:    every wrapper in this file passes 0.
 * parent_tidptr: user address that receives the child's PID when
 *                CLONE_PARENT_SETTID is set.
 * child_tidptr:  user address recorded in the child when
 *                CLONE_CHILD_SETTID / CLONE_CHILD_CLEARTID is set.
 */
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
参数列表如下:
参数 | 描述 |
clone_flags | 低字节指定了进程结束时发送到父进程的信号代码,通常选择SIGCHLD信号。剩余的3个字节给clone标志组用于编码。 |
stack_start | 表示把用户态堆栈指针赋值给子进程的栈指针寄存器(x86上为esp;本文的ARM代码中,该值来自regs->ARM_sp或clone()的newsp参数)。 |
regs | 指向通用寄存器的指针,通用寄存器的值是在用户态切换到内核态时被保存到内核堆栈中的 |
stack_size | 未使用(总被设置为0) |
parent_tidptr | 表示父进程的用户态变量地址,该变量用于保存新轻量级进程的 PID。只有在CLONE_PARENT_SETTID标志被设置时才有意义 |
child_tidptr | 表示新轻量级进程的用户态变量地址,该变量用于保存该进程自身的 PID。只有在CLONE_CHILD_SETTID标志被设置时才有意义 |
2、do_fork()的处理流程
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * Returns the child's PID on success. Returns -EAGAIN when no PID can
 * be allocated, or the error code encoded in copy_process()'s result
 * when the copy fails.
 */
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
struct pt_regs *regs,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
struct task_struct *p;
int trace = 0;
/* Allocate the child's PID up front; it is released again below if the copy fails. */
long pid = alloc_pidmap();
if (pid < 0)
return -EAGAIN;
/*
 * If the calling task is being ptraced (current->ptrace != 0), ask
 * fork_traceflag() which ptrace event, if any, applies to this request.
 * A non-zero event forces CLONE_PTRACE on so the child is traced too.
 */
if (unlikely(current->ptrace)) {
trace = fork_traceflag (clone_flags);
if (trace)
clone_flags |= CLONE_PTRACE;
}
/*
 * Copy the process. The result is either the new task_struct or an
 * ERR_PTR()-encoded error (checked with IS_ERR() below).
 */
p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
/*
 * Do this prior waking up the new thread - the thread pointer
 * might get invalid after that point, if the thread exits quickly.
 */
if (!IS_ERR(p)) {
struct completion vfork;
/* For vfork, hook an on-stack completion into the child so the parent can wait on it below. */
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
init_completion(&vfork);
}
/*
 * If the child is being ptraced (PT_PTRACED set in p->ptrace) or
 * CLONE_STOPPED was requested, queue a SIGSTOP for the child so that
 * it starts out stopped.
 */
if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {
/*
 * We'll start up with an immediate SIGSTOP.
 */
/* Add SIGSTOP to the child's set of pending signals. */
sigaddset(&p->pending.signal, SIGSTOP);
/* Flag the child as having a signal pending so the SIGSTOP is noticed. */
set_tsk_thread_flag(p, TIF_SIGPENDING);
}
if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags); /* start the child running */
else
/*
 * CLONE_STOPPED: do not wake the child; leave it in
 * TASK_STOPPED until something else resumes it.
 */
p->state = TASK_STOPPED;
/*
 * A ptrace event was selected above: publish the child's PID in
 * current->ptrace_message and report the event through
 * ptrace_notify(). The event code sits above the low 8 bits, which
 * carry SIGTRAP.
 * NOTE(review): per the original author's note, this stops current
 * and signals the debugger, which then reads the PID from
 * ptrace_message - confirm against ptrace_notify().
 */
if (unlikely (trace)) {
current->ptrace_message = pid;
ptrace_notify ((trace << 8) | SIGTRAP);
}
/*
 * vfork: block the parent on the completion installed in
 * p->vfork_done above.
 * NOTE(review): presumably completed when the child exits or execs;
 * that code is not visible here.
 * Afterwards, report PTRACE_EVENT_VFORK_DONE if the tracer asked for
 * it (PT_TRACE_VFORK_DONE).
 */
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
}
} else {
free_pidmap(pid);
pid = PTR_ERR(p);
}
/* On success this is the child's PID; on failure the error from copy_process(). */
return pid;
}
3、do_fork中其他重要函数
3.1、fork_traceflag
/*
 * Pick the ptrace event to report for a fork-family request, based on
 * the clone flags and on the events enabled in current->ptrace.
 *
 * Returns PTRACE_EVENT_VFORK, PTRACE_EVENT_CLONE or PTRACE_EVENT_FORK,
 * or 0 when no event should be reported.
 */
static inline int fork_traceflag (unsigned clone_flags)
{
	/* CLONE_UNTRACED overrides everything: never report an event. */
	if (clone_flags & CLONE_UNTRACED)
		return 0;

	/* vfork-style request: report only if vfork tracing is enabled. */
	if (clone_flags & CLONE_VFORK)
		return (current->ptrace & PT_TRACE_VFORK) ?
			PTRACE_EVENT_VFORK : 0;

	/* Exit signal other than SIGCHLD: treated as a clone. */
	if ((clone_flags & CSIGNAL) != SIGCHLD)
		return (current->ptrace & PT_TRACE_CLONE) ?
			PTRACE_EVENT_CLONE : 0;

	/* Otherwise a plain fork. */
	return (current->ptrace & PT_TRACE_FORK) ? PTRACE_EVENT_FORK : 0;
}
3.2、copy_process
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @pid is the PID the caller has already allocated for the child.
 *
 * Returns the new task_struct on success, or an ERR_PTR()-encoded
 * negative errno on failure. On failure, everything acquired up to
 * that point is released through the bad_fork_* labels, which unwind
 * in reverse order of acquisition.
 */
static task_t *copy_process(unsigned long clone_flags,
				 unsigned long stack_start,
				 struct pt_regs *regs,
				 unsigned long stack_size,
				 int __user *parent_tidptr,
				 int __user *child_tidptr,
				 int pid)
{
	int retval;
	struct task_struct *p = NULL;

	/* CLONE_NEWNS and CLONE_FS may not be requested together. */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/* Security-module hook; a non-zero result vetoes the fork. */
	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	/* Get a task descriptor for the child, duplicated from current. */
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	retval = -EAGAIN;
	/*
	 * Per-user process limit: once the owning user already has at least
	 * RLIMIT_NPROC processes, the fork is refused unless the caller has
	 * CAP_SYS_ADMIN or CAP_SYS_RESOURCE, or the user is root_user.
	 */
	if (atomic_read(&p->user->processes) >=
			p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
				p->user != &root_user)
			goto bad_fork_free;
	}

	/* Take a reference on the user_struct ... */
	atomic_inc(&p->user->__count);
	/* ... and count the new process against that user. */
	atomic_inc(&p->user->processes);
	get_group_info(p->group_info);

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(p->thread_info->exec_domain->module))
		goto bad_fork_cleanup_count;

	if (p->binfmt && !try_module_get(p->binfmt->module))
		goto bad_fork_cleanup_put_domain;

	/* The child has not exec'ed anything yet. */
	p->did_exec = 0;
	/* Derive the child's flags from the inherited ones. */
	copy_flags(clone_flags, p);
	/* Record the pre-allocated PID in the child. */
	p->pid = pid;
	retval = -EFAULT;
	/* CLONE_PARENT_SETTID: store the child's PID at parent_tidptr. */
	if (clone_flags & CLONE_PARENT_SETTID)
		if (put_user(p->pid, parent_tidptr))
			goto bad_fork_cleanup;

	p->proc_dentry = NULL;

	/* Reset the child/sibling lists and the per-task spinlocks. */
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);
	spin_lock_init(&p->proc_lock);

	/* The child starts with no pending signals. */
	clear_tsk_thread_flag(p, TIF_SIGPENDING);
	init_sigpending(&p->pending);

	/* Interval timers and CPU-time accounting start from zero. */
	p->it_real_value = 0;
	p->it_real_incr = 0;
	p->it_virt_value = cputime_zero;
	p->it_virt_incr = cputime_zero;
	p->it_prof_value = cputime_zero;
	p->it_prof_incr = cputime_zero;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->utime = cputime_zero;
	p->stime = cputime_zero;
	p->rchar = 0;		/* I/O counter: bytes read */
	p->wchar = 0;		/* I/O counter: bytes written */
	p->syscr = 0;		/* I/O counter: read syscalls */
	p->syscw = 0;		/* I/O counter: write syscalls */
	acct_clear_integrals(p);

	p->lock_depth = -1;		/* -1 = no lock */
	do_posix_clock_monotonic_gettime(&p->start_time);
	p->security = NULL;
	p->io_context = NULL;
	p->io_wait = NULL;
	p->audit_context = NULL;
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_copy(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup;
	}
#endif

	/* A new thread joins the caller's thread group; otherwise tgid == pid. */
	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	if ((retval = security_task_alloc(p)))
		goto bad_fork_cleanup_policy;
	if ((retval = audit_alloc(p)))
		goto bad_fork_cleanup_security;
	/* copy all the process information */
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_audit;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_keys(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	if ((retval = copy_namespace(clone_flags, p)))
		goto bad_fork_cleanup_keys;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_namespace;

	/* CLONE_CHILD_SETTID: remember child_tidptr in set_child_tid. */
	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	/* CLONE_CHILD_CLEARTID: remember child_tidptr in clear_child_tid. */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;

	/*
	 * Syscall tracing should be turned off in the child regardless
	 * of CLONE_PTRACE.
	 */
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

	/* Our parent execution domain becomes current domain
	   These must match for thread signalling to apply */

	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	/* Threads get exit_signal -1; otherwise it is taken from the CSIGNAL bits. */
	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
	p->pdeath_signal = 0;
	p->exit_state = 0;

	/* Perform scheduler related setup */
	sched_fork(p);

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->ptrace_children);
	INIT_LIST_HEAD(&p->ptrace_list);

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/*
	 * The task hasn't been attached yet, so cpus_allowed mask cannot
	 * have changed. The cpus_allowed mask of the parent may have
	 * changed after it was copied first time, and it may then move to
	 * another CPU - so we re-copy it here and set the child's CPU to
	 * the parent's CPU. This avoids alot of nasty races.
	 */
	p->cpus_allowed = current->cpus_allowed;
	set_task_cpu(p, smp_processor_id());

	/*
	 * Check for pending SIGKILL! The new thread should not be allowed
	 * to slip out of an OOM kill. (or normal SIGKILL.)
	 */
	if (sigismember(&current->pending.signal, SIGKILL)) {
		write_unlock_irq(&tasklist_lock);
		retval = -EINTR;
		goto bad_fork_cleanup_namespace;
	}

	/* CLONE_PARENT re-uses the old parent */
	/*
	 * With CLONE_PARENT or CLONE_THREAD the child's real parent is the
	 * caller's real parent; otherwise it is the caller itself.
	 */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
		p->real_parent = current->real_parent;
	else
		p->real_parent = current;
	p->parent = p->real_parent;

	if (clone_flags & CLONE_THREAD) {
		spin_lock(&current->sighand->siglock);
		/*
		 * Important: if an exit-all has been started then
		 * do not create this new thread - the whole thread
		 * group is supposed to exit anyway.
		 */
		if (current->signal->flags & SIGNAL_GROUP_EXIT) {
			spin_unlock(&current->sighand->siglock);
			write_unlock_irq(&tasklist_lock);
			retval = -EAGAIN;
			goto bad_fork_cleanup_namespace;
		}
		p->group_leader = current->group_leader;

		if (current->signal->group_stop_count > 0) {
			/*
			 * There is an all-stop in progress for the group.
			 * We ourselves will stop as soon as we check signals.
			 * Make the new thread part of that group stop too.
			 */
			current->signal->group_stop_count++;
			set_tsk_thread_flag(p, TIF_SIGPENDING);
		}

		spin_unlock(&current->sighand->siglock);
	}

	/* Link the new task into the process list. */
	SET_LINKS(p);
	/*
	 * A traced child (PT_PTRACED) is additionally linked under the
	 * caller's current parent via __ptrace_link().
	 */
	if (unlikely(p->ptrace & PT_PTRACED))
		__ptrace_link(p, current->parent);

	/* Register the task under its PID and TGID. */
	attach_pid(p, PIDTYPE_PID, p->pid);
	attach_pid(p, PIDTYPE_TGID, p->tgid);
	/* Thread-group leaders are also registered under PGID and SID. */
	if (thread_group_leader(p)) {
		attach_pid(p, PIDTYPE_PGID, process_group(p));
		attach_pid(p, PIDTYPE_SID, p->signal->session);
		if (p->pid)
			__get_cpu_var(process_counts)++;
	}

	/* The task is now part of the system: bump the global counters. */
	nr_threads++;
	total_forks++;
	write_unlock_irq(&tasklist_lock);
	retval = 0;

	/* Common exit: the new task on success, ERR_PTR(retval) on failure. */
fork_out:
	if (retval)
		return ERR_PTR(retval);
	return p;

bad_fork_cleanup_namespace:
	exit_namespace(p);
bad_fork_cleanup_keys:
	exit_keys(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	exit_signal(p);
bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_free(p->mempolicy);
#endif
bad_fork_cleanup:
	if (p->binfmt)
		module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
	module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
	put_group_info(p->group_info);
	atomic_dec(&p->user->processes);
	free_uid(p->user);
bad_fork_free:
	free_task(p);
	goto fork_out;
}
3.3、dup_task_struct
/*
 * Allocate a task_struct and a thread_info for a child and initialise
 * both as copies of @orig's, then cross-link the two new structures.
 *
 * Returns the new task_struct, or NULL if either allocation fails.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
	struct task_struct *child;
	struct thread_info *child_ti;

	/*
	 * Let prepare_to_copy() run on the parent before it is copied.
	 * NOTE(review): the original author's note says this saves
	 * FPU/MMX/SSE state on x86 and is an empty hook here - confirm
	 * for the target architecture.
	 */
	prepare_to_copy(orig);

	/* Task descriptor for the child. */
	child = alloc_task_struct();
	if (child == NULL)
		return NULL;

	/* thread_info for the child; undo the first allocation on failure. */
	child_ti = alloc_thread_info(child);
	if (child_ti == NULL) {
		free_task_struct(child);
		return NULL;
	}

	/* Both structures start out as copies of the parent's. */
	*child_ti = *orig->thread_info;
	*child = *orig;

	/* Point the copies at each other rather than at the parent. */
	child->thread_info = child_ti;
	child_ti->task = child;

	/* One for us, one for whoever does the "release_task()" (usually parent) */
	atomic_set(&child->usage, 2);

	return child;
}
3.4、 copy_flags
/*
 * Set up the child's flags from the ones it inherited:
 * PF_SUPERPRIV is cleared and PF_FORKNOEXEC is set. Unless CLONE_PTRACE
 * was requested, the child's ptrace state is reset to 0 as well.
 */
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	/* Tracing is inherited only when explicitly asked for. */
	if ((clone_flags & CLONE_PTRACE) == 0)
		p->ptrace = 0;

	p->flags = (p->flags & ~PF_SUPERPRIV) | PF_FORKNOEXEC;
}