do_fork
int do_fork(unsigned long clone_flags, unsigned long stack_start,
            struct pt_regs *regs, unsigned long stack_size)
{
    int retval = -ENOMEM;
    struct task_struct *p;
    DECLARE_MUTEX_LOCKED(sem);

    if (clone_flags & CLONE_PID) {
        if (current->pid)
            return -EPERM;
    }
    current->vfork_sem = &sem;

    p = alloc_task_struct();
    *p = *current;

    if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
        goto bad_fork_free;
    atomic_inc(&p->user->__count);
    atomic_inc(&p->user->processes);

    if (nr_threads >= max_threads)
        goto bad_fork_cleanup_count;

    get_exec_domain(p->exec_domain);
    if (p->binfmt && p->binfmt->module)
        __MOD_INC_USE_COUNT(p->binfmt->module);

    p->did_exec = 0;
    p->swappable = 0;
    p->state = TASK_UNINTERRUPTIBLE;

    copy_flags(clone_flags, p);
    p->pid = get_pid(clone_flags);

    p->run_list.next = NULL;
    p->run_list.prev = NULL;

    if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
        p->p_opptr = current;
        if (!(p->ptrace & PT_PTRACED))
            p->p_pptr = current;
    }
    p->p_cptr = NULL;
    init_waitqueue_head(&p->wait_chldexit);
    p->vfork_sem = NULL;
    spin_lock_init(&p->alloc_lock);

    p->sigpending = 0;
    init_sigpending(&p->pending);

    p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
    p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
    init_timer(&p->real_timer);
    p->real_timer.data = (unsigned long) p;

    p->leader = 0;
    p->tty_old_pgrp = 0;
    p->times.tms_utime = p->times.tms_stime = 0;
    p->times.tms_cutime = p->times.tms_cstime = 0;
    p->lock_depth = -1;
    p->start_time = jiffies;

    retval = -ENOMEM;
    if (copy_files(clone_flags, p))
        goto bad_fork_cleanup;
    if (copy_fs(clone_flags, p))
        goto bad_fork_cleanup_files;
    if (copy_sighand(clone_flags, p))
        goto bad_fork_cleanup_fs;
    if (copy_mm(clone_flags, p))
        goto bad_fork_cleanup_sighand;
    retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    if (retval)
        goto bad_fork_cleanup_sighand;
    p->semundo = NULL;

    p->parent_exec_id = p->self_exec_id;
    p->swappable = 1;
    p->exit_signal = clone_flags & CSIGNAL;
    p->pdeath_signal = 0;

    p->counter = (current->counter + 1) >> 1;
    current->counter >>= 1;
    if (!current->counter)
        current->need_resched = 1;

    retval = p->pid;
    p->tgid = retval;
    INIT_LIST_HEAD(&p->thread_group);
    write_lock_irq(&tasklist_lock);
    if (clone_flags & CLONE_THREAD) {
        p->tgid = current->tgid;
        list_add(&p->thread_group, &current->thread_group);
    }
    SET_LINKS(p);
    hash_pid(p);
    nr_threads++;
    write_unlock_irq(&tasklist_lock);

    if (p->ptrace & PT_PTRACED)
        send_sig(SIGSTOP, p, 1);

    wake_up_process(p);
    ++total_forks;
}
1) unsigned long clone_flags
The flags control how deeply resources are copied during fork (a small user-space illustration follows at the end of this list):

#define CSIGNAL         0x000000ff  /* signal mask to be sent at exit */
#define CLONE_VM        0x00000100  /* set if VM shared between processes */
#define CLONE_FS        0x00000200  /* set if fs info shared between processes */
#define CLONE_FILES     0x00000400  /* set if open files shared between processes */
#define CLONE_SIGHAND   0x00000800  /* set if signal handlers and blocked signals shared */
#define CLONE_PID       0x00001000  /* set if pid shared */
#define CLONE_PTRACE    0x00002000  /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK     0x00004000  /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT    0x00008000  /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD    0x00010000  /* Same thread group? */
#define CLONE_SIGNAL    (CLONE_SIGHAND | CLONE_THREAD)

2) p = alloc_task_struct(); *p = *current;
Allocates two pages: the start of the area holds the task_struct and the remainder is used as the kernel stack. The parent's task_struct is then copied into the new one.

3) if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
Checks whether this user has exceeded its process limit.

4) if (nr_threads >= max_threads)
Checks whether the total number of processes has exceeded the system limit.

5) p->state = TASK_UNINTERRUPTIBLE;
The new process is set to TASK_UNINTERRUPTIBLE because get_pid below may sleep, and that sleep must not be interrupted by signals.

6) copy_flags(clone_flags, p); p->pid = get_pid(clone_flags);
Copy the flags and obtain the new process's pid.

7) init_waitqueue_head(&p->wait_chldexit);
Initialize the wait queue used by wait4.

8) p->sigpending = 0; init_sigpending(&p->pending);
Initialize the pending-signal state.

9) p->start_time = jiffies;
Record the process start time.

10) if (copy_files(clone_flags, p))
Conditionally copy the open files; copying only happens when CLONE_FILES is not set.

11) if (copy_fs(clone_flags, p)) / if (copy_sighand(clone_flags, p)) / if (copy_mm(clone_flags, p))
Copy the remaining resources hanging off task_struct.

12) retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
Copy (set up) the kernel stack.
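As a quick user-space illustration of the flag semantics (my own minimal sketch, not kernel code, assuming a Linux/glibc environment): glibc's fork() effectively passes SIGCHLD in the CSIGNAL byte of clone_flags, so p->exit_signal becomes SIGCHLD and the parent is notified when the child exits, while eax = 0 is what the child sees as fork's return value.

#include <signal.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void on_sigchld(int sig)
{
    (void)sig;
    /* async-signal-safe notification that the child has exited */
    write(1, "parent: got SIGCHLD\n", 20);
}

int main(void)
{
    signal(SIGCHLD, on_sigchld);

    pid_t pid = fork();          /* roughly: clone_flags with SIGCHLD in the CSIGNAL byte */
    if (pid == 0) {
        /* child: copy_thread set eax (the syscall return value) to 0 */
        _exit(0);
    }
    /* parent: fork() returned the child's pid (retval = p->pid) */
    wait(NULL);                  /* the SIGCHLD handler fires around here */
    return 0;
}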
copy_files
#define BITS_PER_LONG 32

struct files_struct {
    atomic_t count;
    rwlock_t file_lock;
    int max_fds;
    int max_fdset;
    int next_fd;
    struct file ** fd;
    fd_set *close_on_exec;
    fd_set *open_fds;
    fd_set close_on_exec_init;
    fd_set open_fds_init;
    struct file * fd_array[NR_OPEN_DEFAULT];
};
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
    struct files_struct *oldf, *newf;
    struct file **old_fds, **new_fds;
    int open_files, nfds, size, i, error = 0;

    oldf = current->files;
    if (clone_flags & CLONE_FILES) {
        atomic_inc(&oldf->count);
        goto out;
    }

    tsk->files = NULL;
    error = -ENOMEM;
    newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
    if (!newf)
        goto out;

    atomic_set(&newf->count, 1);
    newf->file_lock = RW_LOCK_UNLOCKED;
    newf->next_fd = 0;
    newf->max_fds = NR_OPEN_DEFAULT;
    newf->max_fdset = __FD_SETSIZE;
    newf->close_on_exec = &newf->close_on_exec_init;
    newf->open_fds = &newf->open_fds_init;
    newf->fd = &newf->fd_array[0];

    size = oldf->max_fdset;
    if (size > __FD_SETSIZE) {
        newf->max_fdset = 0;
        write_lock(&newf->file_lock);
        error = expand_fdset(newf, size);
        write_unlock(&newf->file_lock);
        if (error)
            goto out_release;
    }
    read_lock(&oldf->file_lock);

    open_files = count_open_files(oldf, size);
    nfds = NR_OPEN_DEFAULT;
    if (open_files > nfds) {
        read_unlock(&oldf->file_lock);
        newf->max_fds = 0;
        write_lock(&newf->file_lock);
        error = expand_fd_array(newf, open_files);
        write_unlock(&newf->file_lock);
        if (error)
            goto out_release;
        nfds = newf->max_fds;
        read_lock(&oldf->file_lock);
    }

    old_fds = oldf->fd;
    new_fds = newf->fd;

    memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
    memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

    for (i = open_files; i != 0; i--) {
        struct file *f = *old_fds++;
        if (f)
            get_file(f);
        *new_fds++ = f;
    }
    read_unlock(&oldf->file_lock);

    size = (newf->max_fds - open_files) * sizeof(struct file *);
    memset(new_fds, 0, size);

    if (newf->max_fdset > open_files) {
        int left = (newf->max_fdset-open_files)/8;
        int start = open_files / (8 * sizeof(unsigned long));
        memset(&newf->open_fds->fds_bits[start], 0, left);
        memset(&newf->close_on_exec->fds_bits[start], 0, left);
    }

    tsk->files = newf;
    error = 0;
}
1) if (clone_flags & CLONE_FILES)
If CLONE_FILES is set, the open-file table is shared rather than copied: atomic_inc(&oldf->count) bumps the reference count and the function returns.
2) newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL)
Allocate a new files_struct from the slab cache.
3) Copy the open-fd bitmaps and the file array:
memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
4) The bitmaps and the file array have fixed default sizes; if the parent's table exceeds the defaults (NR_OPEN_DEFAULT, i.e. 32 slots, for the fd array; __FD_SETSIZE for the bitmaps), expand_fd_array and expand_fdset are called to enlarge them.
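A small user-space sketch of what this copying means in practice (my own example; /etc/hostname is just an assumed readable file): the fd numbers are duplicated into a new files_struct, but each slot still points at the same struct file (get_file only bumps its refcount), so the file offset is shared between parent and child.

#include <fcntl.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    char buf[4];
    int fd = open("/etc/hostname", O_RDONLY);   /* assumed test file */
    if (fd < 0)
        return 1;

    if (fork() == 0) {
        /* the child's read advances the offset stored in the shared struct file */
        if (read(fd, buf, sizeof(buf)) < 0)
            _exit(1);
        _exit(0);
    }
    wait(NULL);
    /* prints 4 (or less if the file is shorter): the child's read moved
       the offset that the parent sees through its own copied fd table */
    printf("offset seen by parent: %ld\n", (long)lseek(fd, 0, SEEK_CUR));
    return 0;
}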
copy_fs
Copies the process's root directory, current working directory, umask, and so on. Since this function is not the focus of do_fork, it is skipped here.
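For completeness, a minimal user-space sketch (my own, not from the source) of what copying rather than sharing fs_struct means: with plain fork() (CLONE_FS not set), a chdir() in the child does not move the parent's working directory.

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    char buf[256];

    if (fork() == 0) {
        chdir("/tmp");           /* only the child's copy of fs_struct changes */
        _exit(0);
    }
    wait(NULL);
    if (getcwd(buf, sizeof(buf)))
        printf("parent cwd is still: %s\n", buf);
    return 0;
}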
copy_sighand
Signals are to a process what interrupts are to a processor. If a process has installed signal handlers, the sig member of its task_struct is non-NULL.
#define _NSIG 64

struct signal_struct {
    atomic_t            count;
    struct k_sigaction  action[_NSIG];
    spinlock_t          siglock;
};
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
    struct signal_struct *sig;

    if (clone_flags & CLONE_SIGHAND) {
        atomic_inc(&current->sig->count);
        return 0;
    }
    sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
    tsk->sig = sig;
    if (!sig)
        return -1;
    spin_lock_init(&sig->siglock);
    atomic_set(&sig->count, 1);
    memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
    return 0;
}
1) if (clone_flags & CLONE_SIGHAND)
As before, the copy is only made when CLONE_SIGHAND is not set; otherwise the parent's signal_struct is shared by bumping its reference count.
2) sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
Allocate a signal_struct from the slab cache.
3) spin_lock_init(&sig->siglock);
Initialize the lock.
4) atomic_set(&sig->count, 1);
Set the reference count.
5) memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
Copy the action array.
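A minimal user-space sketch of the copy semantics (my own example): because the action[] table is memcpy'd rather than shared, the child inherits the parent's handlers at fork time, but changing a disposition afterwards only affects its own copy. (With CLONE_SIGHAND, as threads use it, the table would be shared instead.)

#include <signal.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static void handler(int sig) { (void)sig; }

int main(void)
{
    struct sigaction sa = { 0 }, cur;
    sa.sa_handler = handler;
    sigaction(SIGUSR1, &sa, NULL);           /* installed before fork */

    if (fork() == 0) {
        signal(SIGUSR1, SIG_DFL);            /* child edits its own copied action[] */
        _exit(0);
    }
    wait(NULL);
    sigaction(SIGUSR1, NULL, &cur);
    printf("parent handler still installed: %s\n",
           cur.sa_handler == handler ? "yes" : "no");   /* prints "yes" */
    return 0;
}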
copy_mm
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
    struct mm_struct * mm, *oldmm;
    int retval;

    tsk->min_flt = tsk->maj_flt = 0;
    tsk->cmin_flt = tsk->cmaj_flt = 0;
    tsk->nswap = tsk->cnswap = 0;

    tsk->mm = NULL;
    tsk->active_mm = NULL;

    oldmm = current->mm;
    if (!oldmm)
        return 0;

    if (clone_flags & CLONE_VM) {
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }

    retval = -ENOMEM;
    mm = allocate_mm();
    memcpy(mm, oldmm, sizeof(*mm));
    if (!mm_init(mm))
        goto fail_nomem;

    down(&oldmm->mmap_sem);
    retval = dup_mmap(mm);
    up(&oldmm->mmap_sem);

    spin_lock(&mmlist_lock);
    list_add(&mm->mmlist, &oldmm->mmlist);
    spin_unlock(&mmlist_lock);

    if (retval)
        goto free_pt;

    copy_segments(tsk, mm);

    if (init_new_context(tsk, mm))
        goto free_pt;

good_mm:
    tsk->mm = mm;
    tsk->active_mm = mm;
    return 0;

free_pt:
    mmput(mm);
fail_nomem:
    return retval;
}
1) tsk->min_flt = tsk->maj_flt = 0;
Reset the page-fault and swap counters used when weighing swap decisions.
2) oldmm = current->mm; if (!oldmm) return 0;
A kernel thread has no mm, so return immediately.
3) if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; goto good_mm; }
As before, the copy is only made when CLONE_VM is not set; if it is set, the parent's mm is shared by bumping mm_users.
4) mm = allocate_mm();
Allocate a new mm_struct.
5) memcpy(mm, oldmm, sizeof(*mm));
Shallow-copy the mm_struct: this copies the pointer to the vma list and fields such as start_code, end_code, start_brk and brk, but it does not copy the vmas themselves.
6) retval = dup_mmap(mm);
Deep-copy the vmas.
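A minimal user-space sketch (my own) of the difference this makes: without CLONE_VM the child gets its own mm, and copy-on-write means its writes land in private pages that the parent never sees.

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

static int shared_looking = 42;   /* lives in a COW data page after fork */

int main(void)
{
    if (fork() == 0) {
        shared_looking = 100;     /* triggers a COW fault; the child gets its own page */
        _exit(0);
    }
    wait(NULL);
    printf("parent still sees %d\n", shared_looking);   /* prints 42 */
    return 0;
}

With CLONE_VM set (the thread case), both tasks would use the same mm and the parent would see 100 instead.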
dup_mmap
static inline int dup_mmap(struct mm_struct * mm)
{
    struct vm_area_struct * mpnt, *tmp, **pprev;
    int retval;

    flush_cache_mm(current->mm);
    mm->locked_vm = 0;
    mm->mmap = NULL;
    mm->mmap_avl = NULL;
    mm->mmap_cache = NULL;
    mm->map_count = 0;
    mm->cpu_vm_mask = 0;
    mm->swap_cnt = 0;
    mm->swap_address = 0;
    pprev = &mm->mmap;

    for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
        struct file *file;

        retval = -ENOMEM;
        if (mpnt->vm_flags & VM_DONTCOPY)
            continue;
        tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!tmp)
            goto fail_nomem;
        *tmp = *mpnt;
        tmp->vm_flags &= ~VM_LOCKED;
        tmp->vm_mm = mm;
        mm->map_count++;
        tmp->vm_next = NULL;
        file = tmp->vm_file;
        if (file) {
            struct inode *inode = file->f_dentry->d_inode;
            get_file(file);
            if (tmp->vm_flags & VM_DENYWRITE)
                atomic_dec(&inode->i_writecount);

            spin_lock(&inode->i_mapping->i_shared_lock);
            if ((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                mpnt->vm_next_share->vm_pprev_share = &tmp->vm_next_share;
            mpnt->vm_next_share = tmp;
            tmp->vm_pprev_share = &mpnt->vm_next_share;
            spin_unlock(&inode->i_mapping->i_shared_lock);
        }

        retval = copy_page_range(mm, current->mm, tmp);

        if (!retval && tmp->vm_ops && tmp->vm_ops->open)
            tmp->vm_ops->open(tmp);

        *pprev = tmp;
        pprev = &tmp->vm_next;

        if (retval)
            goto fail_nomem;
    }
    retval = 0;
    if (mm->map_count >= AVL_MIN_MAP_COUNT)
        build_mmap_avl(mm);
}
1) for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next)
Walk every vma of the parent and copy them one by one.
2) tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
Allocate a vma from the slab cache.
3) *tmp = *mpnt;
Copy the vma; this is why the forked child's virtual address layout is exactly the same as the parent's.
4) retval = copy_page_range(mm, current->mm, tmp);
Copy all the page-table entries covering the address range of this vma.
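One observable consequence (my own user-space sketch, not kernel code): a MAP_SHARED mapping is duplicated into the child as a vma like any other, but because VM_SHARED is set, copy_page_range below will not apply COW to its pages, so writes in the child stay visible to the parent.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int *p = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return 1;
    *p = 1;

    if (fork() == 0) {
        *p = 2;                   /* same physical page as the parent's */
        _exit(0);
    }
    wait(NULL);
    printf("parent sees %d\n", *p);   /* prints 2 */
    return 0;
}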
copy_page_range
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                    struct vm_area_struct *vma)
{
    pgd_t * src_pgd, * dst_pgd;
    unsigned long address = vma->vm_start;
    unsigned long end = vma->vm_end;
    unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

    src_pgd = pgd_offset(src, address)-1;
    dst_pgd = pgd_offset(dst, address)-1;

    for (;;) {
        pmd_t * src_pmd, * dst_pmd;

        src_pgd++; dst_pgd++;

        if (pgd_none(*src_pgd))
            goto skip_copy_pmd_range;
        if (pgd_bad(*src_pgd)) {
            pgd_ERROR(*src_pgd);
            pgd_clear(src_pgd);
skip_copy_pmd_range:
            address = (address + PGDIR_SIZE) & PGDIR_MASK;
            if (!address || (address >= end))
                goto out;
            continue;
        }
        if (pgd_none(*dst_pgd)) {
            if (!pmd_alloc(dst_pgd, 0))
                goto nomem;
        }

        src_pmd = pmd_offset(src_pgd, address);
        dst_pmd = pmd_offset(dst_pgd, address);

        do {
            pte_t * src_pte, * dst_pte;

            if (pmd_none(*src_pmd))
                goto skip_copy_pte_range;
            if (pmd_bad(*src_pmd)) {
                pmd_ERROR(*src_pmd);
                pmd_clear(src_pmd);
skip_copy_pte_range:
                address = (address + PMD_SIZE) & PMD_MASK;
                if (address >= end)
                    goto out;
                goto cont_copy_pmd_range;
            }
            if (pmd_none(*dst_pmd)) {
                if (!pte_alloc(dst_pmd, 0))
                    goto nomem;
            }

            src_pte = pte_offset(src_pmd, address);
            dst_pte = pte_offset(dst_pmd, address);

            do {
                pte_t pte = *src_pte;
                struct page *ptepage;

                if (pte_none(pte))
                    goto cont_copy_pte_range_noset;
                if (!pte_present(pte)) {
                    swap_duplicate(pte_to_swp_entry(pte));
                    goto cont_copy_pte_range;
                }
                ptepage = pte_page(pte);
                if ((!VALID_PAGE(ptepage)) || PageReserved(ptepage))
                    goto cont_copy_pte_range;

                if (cow) {
                    ptep_set_wrprotect(src_pte);
                    pte = *src_pte;
                }
                if (vma->vm_flags & VM_SHARED)
                    pte = pte_mkclean(pte);
                pte = pte_mkold(pte);
                get_page(ptepage);

cont_copy_pte_range:
                set_pte(dst_pte, pte);
cont_copy_pte_range_noset:
                address += PAGE_SIZE;
                if (address >= end)
                    goto out;
                src_pte++; dst_pte++;
            } while ((unsigned long)src_pte & PTE_TABLE_MASK);

cont_copy_pmd_range:
            src_pmd++; dst_pmd++;
        } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
    }
out:
    return 0;
nomem:
    return -ENOMEM;
}
1) for (;;)
Loop over all the pgd entries covering the vma.
2) if (!pmd_alloc(dst_pgd, 0))
Allocate a pmd for the child.
3) if (pmd_none(*dst_pmd)) { if (!pte_alloc(dst_pmd, 0)) goto nomem; }
Allocate a pte table for the child.
4) if (cow) { ptep_set_wrprotect(src_pte); pte = *src_pte; }
For a COW mapping, both the parent's and the child's ptes are made read-only. When either process later writes to the page, a page fault is raised; the fault handler allocates a new page, copies the contents into it, and turns the faulting process's pte from read-only back to read-write.
5) Holes in the parent's vma are skipped:
if (pgd_none(*src_pgd))
if (pmd_none(*src_pmd))
if (pte_none(pte))
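A rough user-space sketch of the COW behaviour in point 4 (my own example; fault counts are approximate and depend on the kernel and libc): each first write to an inherited private page after fork shows up as a minor fault in the child, one per page that gets copied.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

#define NPAGES 64

int main(void)
{
    long pagesz = sysconf(_SC_PAGESIZE);
    char *buf = malloc(NPAGES * pagesz);
    if (!buf)
        return 1;
    memset(buf, 1, NPAGES * pagesz);          /* fault the pages in before fork */

    if (fork() == 0) {
        struct rusage before, after;
        getrusage(RUSAGE_SELF, &before);
        memset(buf, 2, NPAGES * pagesz);      /* each first write COWs one page */
        getrusage(RUSAGE_SELF, &after);
        printf("child minor faults during writes: %ld (expect around %d)\n",
               after.ru_minflt - before.ru_minflt, NPAGES);
        _exit(0);
    }
    wait(NULL);
    return 0;
}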
copy_thread — this is a very important and very interesting function!
int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
                unsigned long unused, struct task_struct * p, struct pt_regs * regs)
{
    struct pt_regs * childregs;

    childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
    struct_cpy(childregs, regs);
    childregs->eax = 0;
    childregs->esp = esp;

    p->thread.esp = (unsigned long) childregs;
    p->thread.esp0 = (unsigned long) (childregs+1);

    p->thread.eip = (unsigned long) ret_from_fork;

    savesegment(fs, p->thread.fs);
    savesegment(gs, p->thread.gs);

    unlazy_fpu(current);
    struct_cpy(&p->thread.i387, &current->thread.i387);

    return 0;
}
1) The series of copy_* functions called earlier in do_fork all build up the new process's task_struct. This function builds the new process's kernel stack.

2) childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
Find the start of the pt_regs area on the new process's kernel stack. On entry to kernel mode, the register state is saved by SAVE_ALL at the top of the kernel stack.
Note: THREAD_SIZE + p is the top of the kernel stack; subtracting one struct pt_regs lands exactly at the start of the pt_regs area.

3) struct_cpy(childregs, regs);
Copy the parent's saved register state to the new child.

4) childregs->eax = 0;
Set eax in the child's saved state to 0. eax is the return value of the fork system call, i.e. what the child process returns to user space (the parent instead gets the child's pid).

5) childregs->esp = esp;
Set esp in the child's saved state to the esp passed in as a parameter.
For fork, esp is regs.esp, so parent and child have the same user-mode esp: they see the same user-mode stack. The obvious question is: when parent and child both return to user mode, doesn't sharing one stack cause chaos? This is where COW comes in: the page fault handler allocates a new page and copies the page that the parent and child were sharing.
For clone, esp is passed in from user space. The first thing the library function pthread_create does when creating a thread is to mmap an 8M+4K area and use that area as the thread's stack (see the sketch after this list).

6) p->thread.esp = (unsigned long) childregs;
   p->thread.esp0 = (unsigned long) (childregs+1);
The thread member of task_struct is very important: it records the kernel stack pointer, eip and so on at the moment the process is switched out.
thread.esp is set to the start of the pt_regs area; thread.esp0 is set to pt_regs+1, i.e. the top of the kernel stack. When this process is scheduled to run, the kernel writes this value into the TSS's esp0, and that value is exactly the kernel stack address of the process that has just been scheduled in.
thread_struct is CPU-architecture specific:

struct thread_struct {
    unsigned long esp0;
    unsigned long eip;
    unsigned long esp;
    unsigned long fs;
    unsigned long gs;
    unsigned long debugreg[8];
    unsigned long cr2, trap_no, error_code;
    union i387_union i387;
    int ioperm;
    unsigned long io_bitmap[IO_BITMAP_SIZE+1];
};

7) p->thread.eip = (unsigned long) ret_from_fork;
Set the child's next instruction to ret_from_fork.
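To mirror the clone/pthread_create path described in point 5, here is a minimal user-space sketch (my own; the flag combination and stack size are illustrative, not what glibc actually uses): the caller mmaps a stack and passes its top as the child's user-mode esp.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)

static int child_fn(void *arg)
{
    printf("child running on its own stack, arg=%s\n", (char *)arg);
    return 0;
}

int main(void)
{
    char *stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (stack == MAP_FAILED)
        return 1;

    /* the stack grows down on x86, so pass the top of the mapping as esp */
    pid_t pid = clone(child_fn, stack + STACK_SIZE,
                      CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, "hello");

    waitpid(pid, NULL, 0);
    munmap(stack, STACK_SIZE);
    return 0;
}

Because CLONE_VM is set here, copy_mm takes the sharing branch, and the only thing that separates the two tasks' user-mode execution is the esp that copy_thread stores into the child's pt_regs.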