Kernel Code Reading (21) - do_fork

do_fork

int do_fork(unsigned long clone_flags, unsigned long stack_start,
            struct pt_regs *regs, unsigned long stack_size)
    {
        int retval = -ENOMEM;
        struct task_struct *p;
        DECLARE_MUTEX_LOCKED(sem);
        if (clone_flags & CLONE_PID) {
                if (current->pid)
                        return -EPERM;
        }
        
        current->vfork_sem = &sem;
        p = alloc_task_struct();
        *p = *current;
        if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
                goto bad_fork_free;
        atomic_inc(&p->user->__count);
        atomic_inc(&p->user->processes);
        if (nr_threads >= max_threads)
                goto bad_fork_cleanup_count;
        
        get_exec_domain(p->exec_domain);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);
        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;
        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);
        p->run_list.next = NULL;
        p->run_list.prev = NULL;
        if ((clone_flags & CLONE_VFORK) || !(clone_flags & CLONE_PARENT)) {
                p->p_opptr = current;
                if (!(p->ptrace & PT_PTRACED))
                        p->p_pptr = current;
        }
        p->p_cptr = NULL;
        
        init_waitqueue_head(&p->wait_chldexit);
        
        p->vfork_sem = NULL;
        spin_lock_init(&p->alloc_lock);
        p->sigpending = 0;
        init_sigpending(&p->pending);
        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;
        p->leader = 0;
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
        p->lock_depth = -1;
        p->start_time = jiffies;
        retval = -ENOMEM;
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
        
        if (retval)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;
        
        p->parent_exec_id = p->self_exec_id;
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;
        p->counter = (current->counter + 1) >> 1;
        current->counter >>= 1;
        if (!current->counter)
                current->need_resched = 1;
        retval = p->pid;
        p->tgid = retval;
        INIT_LIST_HEAD(&p->thread_group);
        write_lock_irq(&tasklist_lock);
        if (clone_flags & CLONE_THREAD) {
                p->tgid = current->tgid;
                list_add(&p->thread_group, &current->thread_group);
        }
        
        SET_LINKS(p);
        hash_pid(p);
        nr_threads++;
        write_unlock_irq(&tasklist_lock);
        if (p->ptrace & PT_PTRACED)
                send_sig(SIGSTOP, p, 1);
        wake_up_process(p);
        ++total_forks;
        /* the bad_fork_* error-handling labels are omitted in this excerpt */
        if ((clone_flags & CLONE_VFORK) && (retval > 0))
                down(&sem);
        return retval;
    }
1) unsigned long clone_flags
   The flags control how deeply resources are shared or copied at fork time (see the syscall-stub sketch after this list for the combinations that fork/vfork/clone actually pass).
   #define CSIGNAL                0x000000ff        /* signal mask to be sent at exit */
   #define CLONE_VM        0x00000100        /* set if VM shared between processes */
   #define CLONE_FS        0x00000200        /* set if fs info shared between processes */
   #define CLONE_FILES        0x00000400        /* set if open files shared between processes */
   #define CLONE_SIGHAND        0x00000800        /* set if signal handlers and blocked signals shared */
   #define CLONE_PID        0x00001000        /* set if pid shared */
   #define CLONE_PTRACE        0x00002000        /* set if we want to let tracing continue on the child too */
   #define CLONE_VFORK        0x00004000        /* set if the parent wants the child to wake it up on mm_release */
   #define CLONE_PARENT        0x00008000        /* set if we want to have the same parent as the cloner */
   #define CLONE_THREAD        0x00010000        /* Same thread group? */
   #define CLONE_SIGNAL        (CLONE_SIGHAND | CLONE_THREAD)
2) p = alloc_task_struct();
   *p = *current;
   Allocate two pages: the start of the area holds the task_struct and the rest serves as the kernel stack.
   The current process's task_struct is then copied wholesale into the new one.
3) if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur)
   Check whether this user's process count exceeds its RLIMIT_NPROC limit.
4) if (nr_threads >= max_threads)
   Check whether the system-wide thread/process count exceeds max_threads.
5) p->state = TASK_UNINTERRUPTIBLE;
   The new process is put into TASK_UNINTERRUPTIBLE because get_pid() below may sleep, and that sleep must not be interrupted by signals.
6) copy_flags(clone_flags, p);
   p->pid = get_pid(clone_flags);
   Copy the flags and obtain a pid for the new process.
7) init_waitqueue_head(&p->wait_chldexit);
   Initialize the wait queue used by wait4().
8) p->sigpending = 0;
   init_sigpending(&p->pending);
   Initialize the pending-signal state.
9) p->start_time = jiffies;
   Record the process start time.
10) if (copy_files(clone_flags, p))
    Conditionally copy the open-file table.
    The copy is made only when CLONE_FILES is clear.
11) if (copy_fs(clone_flags, p))
    if (copy_sighand(clone_flags, p))
    if (copy_mm(clone_flags, p))
    Copy the remaining resources referenced by the task_struct: filesystem info, signal handlers and the memory descriptor.
12) retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
    Set up the child's kernel stack (the saved register context).
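
For reference, here is a sketch (from memory, roughly following 2.4's arch/i386/kernel/process.c; treat the details as approximate) of how the fork-family system calls reach do_fork and which clone_flags each one passes:

    asmlinkage int sys_fork(struct pt_regs regs)
    {
            /* plain fork: only the exit signal, everything else copied/COWed */
            return do_fork(SIGCHLD, regs.esp, &regs, 0);
    }

    asmlinkage int sys_clone(struct pt_regs regs)
    {
            unsigned long clone_flags = regs.ebx;   /* flags chosen by user space */
            unsigned long newsp = regs.ecx;         /* user stack for the child */

            if (!newsp)
                    newsp = regs.esp;
            return do_fork(clone_flags, newsp, &regs, 0);
    }

    asmlinkage int sys_vfork(struct pt_regs regs)
    {
            /* share the mm and make the parent wait on vfork_sem until mm_release */
            return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0);
    }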

copy_files

#define BITS_PER_LONG 32
     struct files_struct {
        atomic_t count;
        rwlock_t file_lock;
        int max_fds;
        int max_fdset;
        int next_fd;
        struct file ** fd;
        fd_set *close_on_exec;
        fd_set *open_fds;
        fd_set close_on_exec_init;
        fd_set open_fds_init;
        struct file * fd_array[NR_OPEN_DEFAULT];
     };
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
     {
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int open_files, nfds, size, i, error = 0;
        oldf = current->files;
        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }
        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf) 
                goto out;
        atomic_set(&newf->count, 1);
        newf->file_lock            = RW_LOCK_UNLOCKED;
        newf->next_fd            = 0;
        newf->max_fds            = NR_OPEN_DEFAULT;
        newf->max_fdset            = __FD_SETSIZE;
        newf->close_on_exec = &newf->close_on_exec_init;
        newf->open_fds            = &newf->open_fds_init;
        newf->fd            = &newf->fd_array[0];
        size = oldf->max_fdset;
        if (size > __FD_SETSIZE) {
                newf->max_fdset = 0;
                write_lock(&newf->file_lock);
                error = expand_fdset(newf, size);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
        }
        read_lock(&oldf->file_lock);
        open_files = count_open_files(oldf, size);
        nfds = NR_OPEN_DEFAULT;
        if (open_files > nfds) {
                read_unlock(&oldf->file_lock);
                newf->max_fds = 0;
                write_lock(&newf->file_lock);
                error = expand_fd_array(newf, open_files);
                write_unlock(&newf->file_lock);
                if (error) 
                        goto out_release;
                nfds = newf->max_fds;
                read_lock(&oldf->file_lock);
        }
        old_fds = oldf->fd;
        new_fds = newf->fd;
        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f)
                        get_file(f);
                *new_fds++ = f;
        }
        read_unlock(&oldf->file_lock);
        size = (newf->max_fds - open_files) * sizeof(struct file *);
        memset(new_fds, 0, size); 
        if (newf->max_fdset > open_files) {
                int left = (newf->max_fdset-open_files)/8;
                int start = open_files / (8 * sizeof(unsigned long));
                
                memset(&newf->open_fds->fds_bits[start], 0, left);
                memset(&newf->close_on_exec->fds_bits[start], 0, left);
        }
        tsk->files = newf;
        error = 0;
out:
        return error;
        /* the out_release error path (freeing the fdsets, fd array and newf) is omitted in this excerpt */
   }

1) if (clone_flags & CLONE_FILES)

If CLONE_FILES is set, the open-file table is shared rather than copied: atomic_inc(&oldf->count) bumps the reference count and the function returns.

2) newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL)

Allocate a new files_struct from the slab cache.

3) Copy the open_fds/close_on_exec bitmaps and the file pointer array. Only the pointers are duplicated; each struct file itself is shared via get_file() (see the user-space sketch after this list).

memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);

memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

4) The bitmaps and the file array embedded in files_struct have fixed default sizes; if the parent's table is larger (NR_OPEN_DEFAULT = 32 slots for the fd array, __FD_SETSIZE bits for the fdsets), expand_fd_array and expand_fdset are called to allocate bigger ones.
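
A minimal user-space sketch (modern C, not kernel code) of what the pointer-only copy means in practice: after fork() parent and child have separate fd tables, but each fd still refers to the same struct file, so the file offset is shared:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4];
            int fd = open("/etc/passwd", O_RDONLY);   /* any readable file */

            if (fork() == 0) {                /* child: advance the shared offset */
                    read(fd, buf, sizeof(buf));
                    _exit(0);
            }
            wait(NULL);
            /* the parent sees the offset moved by the child, because copy_files()
             * duplicated only the pointer and get_file() just bumped the refcount */
            printf("offset after child's read: %ld\n", (long)lseek(fd, 0, SEEK_CUR));
            return 0;
    }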

copy_fs

Copies the process's root directory, current working directory, umask, and so on.
 Since it is not central to do_fork, it is skipped here.

copy_sighand

Signals are to a process what interrupts are to a processor.
 If the process has installed signal handlers, the sig member of its task_struct is non-NULL.
#define _NSIG                64
     
     struct signal_struct {
        atomic_t                count;
        struct k_sigaction        action[_NSIG];
        spinlock_t                siglock;
     };
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
     {
        struct signal_struct *sig;
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
        tsk->sig = sig;
        if (!sig)
                return -1;
        spin_lock_init(&sig->siglock);
        atomic_set(&sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
     }
1) if (clone_flags & CLONE_SIGHAND)
    Again, the copy is made only when CLONE_SIGHAND is clear; otherwise the parent's signal_struct is shared and its count incremented.
 2) sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
    Allocate a signal_struct from the slab cache.
 3) spin_lock_init(&sig->siglock);
    Initialize the lock.
 4) atomic_set(&sig->count, 1);
    Set the reference count to 1.
 5) memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
    Copy the action array: the child starts with a snapshot of the parent's handlers (see the user-space sketch after this list).
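
A small user-space illustration (modern C, not kernel code, just a sketch): because fork() copies the action table, a handler installed before fork() is also in effect in the child:

    #include <signal.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static void on_usr1(int sig)
    {
            (void)sig;
            write(1, "handler ran in child\n", 21);
    }

    int main(void)
    {
            signal(SIGUSR1, on_usr1);          /* fills one slot of the action array */
            if (fork() == 0) {
                    raise(SIGUSR1);            /* child uses its copied action table */
                    _exit(0);
            }
            wait(NULL);
            return 0;
    }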

copy_mm

static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
     {
        struct mm_struct * mm, *oldmm;
        int retval;
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;
        tsk->mm = NULL;
        tsk->active_mm = NULL;
        oldmm = current->mm;
        if (!oldmm)
                return 0;
        if (clone_flags & CLONE_VM) {
                atomic_inc(&oldmm->mm_users);
                mm = oldmm;
                goto good_mm;
        }
        retval = -ENOMEM;
        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        memcpy(mm, oldmm, sizeof(*mm));
        if (!mm_init(mm))
                goto fail_nomem;
        down(&oldmm->mmap_sem);
        retval = dup_mmap(mm);
        up(&oldmm->mmap_sem);
        spin_lock(&mmlist_lock);
        list_add(&mm->mmlist, &oldmm->mmlist);
        spin_unlock(&mmlist_lock);
        if (retval)
                goto free_pt;
        copy_segments(tsk, mm);
        if (init_new_context(tsk,mm))
                goto free_pt;
good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        return 0;
free_pt:
        mmput(mm);
fail_nomem:
        return retval;
     }
1) tsk->min_flt = tsk->maj_flt = 0;
    Initialize the page-fault and swap accounting counters.
 2) oldmm = current->mm;
    if (!oldmm)
        return 0;
    A kernel thread has no mm, so return immediately.
 3) if (clone_flags & CLONE_VM) {
        atomic_inc(&oldmm->mm_users);
        mm = oldmm;
        goto good_mm;
    }
    Again, a real copy is made only when CLONE_VM is clear; otherwise the parent's mm is shared and mm_users incremented (see the user-space sketch after this list).
 4) mm = allocate_mm();
    Allocate an mm_struct.
 5) memcpy(mm, oldmm, sizeof(*mm));
    Shallow-copy the mm_struct: this copies the pointer to the vma list and fields such as start_code, end_code, start_brk, brk, etc.
    The vmas themselves are not copied.
 6) retval = dup_mmap(mm);
    Deep-copy the vmas.
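
A minimal user-space sketch (modern C, not kernel code) of the difference CLONE_VM makes: with plain fork() the flag is clear, so the child gets its own mm, and a write in the child is not visible to the parent:

    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static int value = 1;

    int main(void)
    {
            if (fork() == 0) {
                    value = 42;        /* COW fault gives the child its own page */
                    _exit(0);
            }
            wait(NULL);
            printf("parent still sees %d\n", value);   /* prints 1 */
            return 0;
    }

With clone(CLONE_VM | ...) the same write would be visible in the parent, since both tasks would share one mm_struct.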

dup_mmap

static inline int dup_mmap(struct mm_struct * mm)
    {
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;
        flush_cache_mm(current->mm);
        mm->locked_vm = 0;
        mm->mmap = NULL;
        mm->mmap_avl = NULL;
        mm->mmap_cache = NULL;
        mm->map_count = 0;
        mm->cpu_vm_mask = 0;
        mm->swap_cnt = 0;
        mm->swap_address = 0;
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;
                retval = -ENOMEM;
                if(mpnt->vm_flags & VM_DONTCOPY)
                        continue;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        struct inode *inode = file->f_dentry->d_inode;
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&inode->i_writecount);
      
                        spin_lock(&inode->i_mapping->i_shared_lock);
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                        spin_unlock(&inode->i_mapping->i_shared_lock);
                }
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);
                *pprev = tmp;
                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;
        if (mm->map_count >= AVL_MIN_MAP_COUNT)
                build_mmap_avl(mm);
fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
     }
1) for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next)
    Walk every vma of the parent and copy them one by one.
 2) tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
    Allocate a vma from the slab cache.
 3) *tmp = *mpnt;
    Copy the vma; this is why the forked child's virtual address layout is identical to the parent's.
 4) retval = copy_page_range(mm, current->mm, tmp);
    Copy all the page-table entries covering this vma's address range. (The child's vma list itself is rebuilt in order with the pprev idiom; a small sketch of that idiom follows this list.)
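
A minimal sketch of the pprev idiom dup_mmap uses to rebuild the child's vma list in the original order (plain C with a hypothetical node type, not kernel code): keep a pointer to the slot where the next copy should be linked, store the new node there, then advance the slot pointer.

    #include <stdlib.h>

    struct node { int val; struct node *next; };

    static struct node *dup_list(const struct node *src)
    {
            struct node *head = NULL;
            struct node **pprev = &head;        /* slot where the next copy is linked */

            for (; src; src = src->next) {
                    struct node *tmp = malloc(sizeof(*tmp));
                    if (!tmp)
                            break;              /* keep whatever was copied so far */
                    tmp->val = src->val;
                    tmp->next = NULL;
                    *pprev = tmp;               /* link after the previous copy */
                    pprev = &tmp->next;         /* advance the slot pointer */
            }
            return head;
    }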

copy_page_range

int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma)
    {
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;
        
        for (;;) {
                pmd_t * src_pmd, * dst_pmd;
                src_pgd++; dst_pgd++;
                
                if (pgd_none(*src_pgd))
                        goto skip_copy_pmd_range;
                if (pgd_bad(*src_pgd)) {
                        pgd_ERROR(*src_pgd);
                        pgd_clear(src_pgd);
skip_copy_pmd_range:        address = (address + PGDIR_SIZE) & PGDIR_MASK;
                        if (!address || (address >= end))
                                goto out;
                        continue;
                }
                if (pgd_none(*dst_pgd)) {
                        if (!pmd_alloc(dst_pgd, 0))
                                goto nomem;
                }
                
                src_pmd = pmd_offset(src_pgd, address);
                dst_pmd = pmd_offset(dst_pgd, address);
                do {
                        pte_t * src_pte, * dst_pte;
                
                
                        if (pmd_none(*src_pmd))
                                goto skip_copy_pte_range;
                        if (pmd_bad(*src_pmd)) {
                                pmd_ERROR(*src_pmd);
                                pmd_clear(src_pmd);
skip_copy_pte_range:                address = (address + PMD_SIZE) & PMD_MASK;
                                if (address >= end)
                                        goto out;
                                goto cont_copy_pmd_range;
                        }
                        if (pmd_none(*dst_pmd)) {
                                if (!pte_alloc(dst_pmd, 0))
                                        goto nomem;
                        }
                        
                        src_pte = pte_offset(src_pmd, address);
                        dst_pte = pte_offset(dst_pmd, address);
                        
                        do {
                                pte_t pte = *src_pte;
                                struct page *ptepage;
                                
                                if (pte_none(pte))
                                        goto cont_copy_pte_range_noset;
                                if (!pte_present(pte)) {
                                        swap_duplicate(pte_to_swp_entry(pte));
                                        goto cont_copy_pte_range;
                                }
                                ptepage = pte_page(pte);
                                if ((!VALID_PAGE(ptepage)) || 
                                    PageReserved(ptepage))
                                        goto cont_copy_pte_range;
        
                                if (cow) {
                                        ptep_set_wrprotect(src_pte);
                                        pte = *src_pte;
                                }
                                if (vma->vm_flags & VM_SHARED)
                                        pte = pte_mkclean(pte);
                                pte = pte_mkold(pte);
                                get_page(ptepage);
cont_copy_pte_range:                set_pte(dst_pte, pte);
cont_copy_pte_range_noset:        address += PAGE_SIZE;
                                if (address >= end)
                                        goto out;
                                src_pte++;
                                dst_pte++;
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);
                
cont_copy_pmd_range:        src_pmd++;
                        dst_pmd++;
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
        }
out:
        return 0;
nomem:
        return -ENOMEM;
    }
1) for (;;)
   Loop over every pgd entry spanned by the vma.
2) if (!pmd_alloc(dst_pgd, 0))
   Allocate a pmd for the destination.
3) if (pmd_none(*dst_pmd)) {
       if (!pte_alloc(dst_pmd, 0))
           goto nomem;
   }
   Allocate a pte table for the destination.
4) if (cow) {
      ptep_set_wrprotect(src_pte);
      pte = *src_pte;
   }
   For copy-on-write, the pte is write-protected in the parent and the same protection is copied into the child.
   When either process later writes the page, a page fault is raised; the fault handler allocates a new page, copies the contents into it, and turns the faulting pte back into a writable one.
5) Holes in the parent's address range are simply skipped:
   if (pgd_none(*src_pgd))
   if (pmd_none(*src_pmd))
   if (pte_none(pte))
   (A sketch of how a virtual address is split into these table indices follows this list.)
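
As a side note on the pgd/pmd/pte walk above, a small sketch (assuming the classic 32-bit x86 non-PAE layout, where the pmd is folded into the pgd) of how a virtual address splits into the indices these loops advance through:

    #include <stdio.h>

    #define PAGE_SHIFT   12                    /* 4 KB pages */
    #define PGDIR_SHIFT  22                    /* 1024 pgd entries, 4 MB each */
    #define PTRS_PER_PTE 1024

    int main(void)
    {
            unsigned long address = 0x0804a123UL;   /* hypothetical user address */
            unsigned long pgd_index = address >> PGDIR_SHIFT;
            unsigned long pte_index = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
            unsigned long offset    = address & ((1UL << PAGE_SHIFT) - 1);

            printf("pgd index %lu, pte index %lu, page offset %lu\n",
                   pgd_index, pte_index, offset);
            return 0;
    }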

copy_thread: a very important and very interesting function!

int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
        unsigned long unused,
        struct task_struct * p, struct pt_regs * regs)
    {
        struct pt_regs * childregs;
        childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
        struct_cpy(childregs, regs);
        childregs->eax = 0;
        childregs->esp = esp;
        p->thread.esp = (unsigned long) childregs;
        p->thread.esp0 = (unsigned long) (childregs+1);
        p->thread.eip = (unsigned long) ret_from_fork;
        savesegment(fs,p->thread.fs);
        savesegment(gs,p->thread.gs);
        unlazy_fpu(current);
        struct_cpy(&p->thread.i387, &current->thread.i387);
        return 0;
    }
1) The copy_* functions called earlier in do_fork all build up the new process's task_struct.
   This function builds the new process's kernel stack.
2) childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1;
   Locate the pt_regs area on the new process's kernel stack. On entry to kernel mode, SAVE_ALL stores the user-mode register context at the top of the kernel stack.
   Note: THREAD_SIZE + p is the top of the kernel stack, and subtracting one struct pt_regs lands exactly at the start of pt_regs.
3) struct_cpy(childregs, regs);
   Copy the parent's saved register context to the new child.
4) childregs->eax = 0;
   Set eax in the child's saved context to 0. eax carries the return value of the fork system call, so fork appears to return 0 in the child, while the parent receives the child's pid.
5) childregs->esp = esp;
   Set esp in the child's saved context to the esp passed in as a parameter.
   For fork, esp is regs.esp, so parent and child have the same user-mode esp; in other words, they see the same user-mode stack.
   The obvious question: when both return to user mode, doesn't sharing one stack cause chaos? This is where COW comes in: the first write triggers a page fault, which allocates a new page and copies the page shared by parent and child.
   For clone, esp is supplied from user space. The first thing the library function pthread_create does is mmap an 8 MB + 4 KB region and use it as the new thread's stack (see the clone() sketch after this list).
6) p->thread.esp = (unsigned long) childregs;
   p->thread.esp0 = (unsigned long) (childregs+1);
   The thread member of task_struct is very important: it records the kernel-stack pointer, eip and so on at the moment the process is switched out.
   thread.esp is set to the start address of pt_regs.
   thread.esp0 is set to childregs + 1, i.e. the top of the kernel stack. When this process is scheduled to run, the kernel writes this value into the TSS esp0 field, so esp0 always points at the kernel stack of the process that has just been scheduled in.
   
   thread_struct is CPU-architecture specific.
   struct thread_struct {
       unsigned long        esp0;
       unsigned long        eip;
       unsigned long        esp;
       unsigned long        fs;
       unsigned long        gs;
       unsigned long        debugreg[8];
       unsigned long        cr2, trap_no, error_code;
       union i387_union        i387;
       int                ioperm;
       unsigned long        io_bitmap[IO_BITMAP_SIZE+1];
   };
7) p->thread.eip = (unsigned long) ret_from_fork;
   Set the child's next instruction (the eip it resumes at when first scheduled) to ret_from_fork.
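
To make point 5 concrete, a user-space sketch (modern glibc clone(), not 2.4 code) of what pthread_create boils down to: pass CLONE_VM and friends plus a caller-allocated stack, whose top ends up in childregs->esp:

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdlib.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static int child_fn(void *arg)
    {
            (void)arg;
            write(1, "running on the caller-provided stack\n", 37);
            return 0;
    }

    int main(void)
    {
            size_t stack_size = 64 * 1024;
            char *stack = malloc(stack_size);           /* pthread_create uses mmap */

            /* the stack grows down on x86, so pass the top of the region */
            pid_t pid = clone(child_fn, stack + stack_size,
                              CLONE_VM | CLONE_FS | CLONE_FILES | SIGCHLD, NULL);
            waitpid(pid, NULL, 0);
            free(stack);
            return 0;
    }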