linux用户栈内核栈的设置---进程的创建: fork/execve【转】

转自:http://blog.csdn.net/u011279649/article/details/18795547

 

版权声明:本文为博主原创文章,未经博主允许不得转载。

目录(?)[-]
应用层怎样使用fork and execve
fork的返回值怎样区分0/pid
fork系统调用的入口参数来自哪里
how to implement do_fork
copy_process
How to check the kernel stack correctness
How to set the new process entry
new process entry point
sys_execve
对elf 格式文件而言
应用层怎样使用fork and execve
/**************************************************************************/

/*
 * Minimal shell-style example: fork a child, let the child exec the
 * requested command, and let the parent wait until that child has exited.
 *
 * NOTE(review): arglist and exitstatus are not declared in this snippet;
 * presumably arglist is a NULL-terminated argument vector and exitstatus
 * an int, both defined elsewhere in the program.
 */
int main(void)
{
    int ret_from_fork, mypid;

    mypid = getpid();
    printf("before:my pid is %d\n", mypid);

    /*
     * fork() returns in both processes, at the same point in the code.
     * The parent receives the pid of the new child, the child receives 0,
     * and -1 means no child was created. The return value is therefore
     * what lets the two processes take different paths below.
     */
    ret_from_fork = fork();

    switch (ret_from_fork) {
    case -1:
        perror(" fork failed");
        exit(1);

    case 0:
        /*
         * Child: replace this process image with the program the user
         * asked for. arglist[0] names the command to execute.
         * execvp() only returns if it failed.
         */
        execvp(arglist[0], arglist);
        perror("execvp failed");
        exit(1);

    default:
        /*
         * Parent (the shell): wait for this particular child to finish
         * before accepting more user input.
         */
        while (wait(&exitstatus) != ret_from_fork)
            ;
    }

    return 0;
}

fork的返回值怎样区分0/pid
/*
 *用户空间调用fork时,子进程得到的返回值0并不是内核的do_fork返回的,do_fork只会(在父进程中)返回新进程的pid。
 *在x86上,子进程的返回值0是内核在ret_from_fork之后、进入用户空间之前执行RESTORE_ALL时pop到eax中的,
 *然后库实现的fork将eax作为返回值;
 *在本文分析的ARM上,对应的是copy_thread中的childregs->ARM_r0 = 0,返回用户空间时该值被恢复到r0。
 *实际上,fork出的子进程在进入用户空间之前从来不经过do_fork这条路径:它的thread中保存的pc(x86上为eip)是ret_from_fork,
 *也就是说,子进程一旦被调度运行,就会经switch_to切换到ret_from_fork开始执行,顺着ret_from_fork往下看,
 *一直到RESTORE_ALL(ARM上为ret_slow_syscall),从而返回用户空间。
 */

fork系统调用的入口,参数来自哪里?
入口参数保存在当前的内核栈中:结构为struct pt_regs
系统调用的入口:
arch/arm/kernel/entry-common.S
@ Small wrapper in front of sys_fork: computes the argument sys_fork takes
@ (a struct pt_regs pointer) from the current kernel stack pointer.
sys_fork_wrapper:
    add    r0, sp, #S_OFF        @ r0 (first argument) = sp + S_OFF
    b    sys_fork            @ plain branch, lr is left untouched
ENDPROC(sys_fork_wrapper)

crash> dis sys_fork_wrapper
0xc000e800 <sys_fork_wrapper>:      add     r0, sp, #8
0xc000e804 <sys_fork_wrapper+4>:        b       0xc0011d28 <sys_fork>

arch/arm/kernel/sys_arm.c
/*
 * sys_fork - fork system call entry (reached through a small wrapper).
 * @regs: saved user register frame of the calling task
 *
 * With an MMU, hands off to do_fork() with SIGCHLD as the clone flags and
 * the caller's saved user stack pointer as the child's stack start, and
 * returns do_fork()'s result. Without an MMU, fork is not supported and
 * -EINVAL is returned.
 */
asmlinkage int sys_fork(struct pt_regs *regs)
{
#ifndef CONFIG_MMU
    /* can not support in nommu mode */
    return -EINVAL;
#else
    return do_fork(SIGCHLD, regs->ARM_sp, regs, 0, NULL, NULL);
#endif
}

crash> dis sys_fork
0xc0011d28 <sys_fork>:  mov     r12, sp
0xc0011d2c <sys_fork+4>:        push    {r11, r12, lr, pc}
0xc0011d30 <sys_fork+8>:        sub     r11, r12, #4
0xc0011d34 <sys_fork+12>:       sub     sp, sp, #8
0xc0011d38 <sys_fork+16>:       mov     r12, #0
0xc0011d3c <sys_fork+20>:       mov     r1, r0
0xc0011d40 <sys_fork+24>:       ldr     r1, [r1, #52]   ; 0x34
0xc0011d44 <sys_fork+28>:       mov     r2, r0
0xc0011d48 <sys_fork+32>:       mov     r3, r12
0xc0011d4c <sys_fork+36>:       mov     r0, #17
0xc0011d50 <sys_fork+40>:       str     r12, [sp]
0xc0011d54 <sys_fork+44>:       str     r12, [sp, #4]
0xc0011d58 <sys_fork+48>:       bl      0xc0027550 <do_fork>
0xc0011d5c <sys_fork+52>:       sub     sp, r11, #12
0xc0011d60 <sys_fork+56>:       ldm     sp, {r11, sp, pc}

/**************************************************************/
/arch/arm/kernel/entry-header.S
@
@ Most of the stack format comes from struct pt_regs, but with
@ the addition of 8 bytes for storing syscall args 5 and 6.
@ This _must_ remain a multiple of 8 for EABI.
@
#define S_OFF        8

/**************************************************************/
/arch/arm/include/asm/ptrace.h
/*
 * This struct defines the way the registers are stored on the
 * stack during a system call.  Note that sizeof(struct pt_regs)
 * has to be a multiple of 8.
 */

/* Saved register frame: 18 words, accessed through the ARM_* aliases below. */
struct pt_regs {
    unsigned long uregs[18];
};


/* Named aliases for the individual uregs[] slots. */
#define ARM_cpsr    uregs[16]
#define ARM_pc        uregs[15]
#define ARM_lr        uregs[14]
#define ARM_sp        uregs[13]
#define ARM_ip        uregs[12]/* r12 (ip) slot */
#define ARM_fp        uregs[11]/* frame pointer */
#define ARM_r10        uregs[10]
#define ARM_r9        uregs[9]
#define ARM_r8        uregs[8]
#define ARM_r7        uregs[7]
#define ARM_r6        uregs[6]
#define ARM_r5        uregs[5]
#define ARM_r4        uregs[4]
#define ARM_r3        uregs[3]
#define ARM_r2        uregs[2]
#define ARM_r1        uregs[1]
#define ARM_r0        uregs[0]
/* NOTE(review): presumably r0 as it was on syscall entry - confirm */
#define ARM_ORIG_r0    uregs[17]

how to implement do_fork

/**************************************************************/
do_fork(SIGCHLD, regs->ARM_sp, regs, 0, NULL, NULL);
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process and, if that succeeded, wakes the new task up.
 *
 * @clone_flags:   clone flags, passed through to copy_process()
 * @stack_start:   user stack pointer for the child
 * @regs:          saved user registers of the caller
 * @stack_size:    passed through to copy_process()
 * @parent_tidptr: not used in this abridged excerpt
 * @child_tidptr:  passed through to copy_process()
 *
 * Returns the new task's pid as reported by task_pid_vnr(), or the
 * negative error code encoded in copy_process()'s ERR_PTR result.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    p = copy_process(clone_flags, stack_start, regs, stack_size,
             child_tidptr, NULL, trace);

    /* On failure p carries an error code; never return nr uninitialised. */
    if (IS_ERR(p))
        return PTR_ERR(p);

    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    nr = task_pid_vnr(p);
    wake_up_new_task(p);

    return nr;
}

copy_process
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * NOTE(review): abridged excerpt - the "----" line marks omitted code, and
 * the declaration of retval, the bad_fork_* cleanup labels and the final
 * return are not shown here.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{/* Allocates memory for the relevant structures and fills them from the original's values */
    struct task_struct *p;
    /* Duplicate the calling task's task_struct (and its thread_info). */
    p = dup_task_struct(current);
    ----
    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(p);

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    /* copy all the process information */
    /* Each step that fails jumps to a label named after the previous step. */
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    /* Architecture part: set up the child's saved registers and kernel entry. */
    retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
}


static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;
    unsigned long *stackend;
    int node = tsk_fork_get_node(orig);
    int err;

    /*分配了memory for task_struct and thread_info*/
    tsk = alloc_task_struct_node(node);
    if (!tsk)
        return NULL;

    ti = alloc_thread_info_node(tsk, node);
    if (!ti) {
        free_task_struct(tsk);
        return NULL;
    }
    /*
    int arch_dup_task_struct(struct task_struct *dst,
              struct task_struct *src)
    {
        *dst = *src;
        return 0;
    }
    */
    err = arch_dup_task_struct(tsk, orig);
    if (err)
        goto out;

    tsk->stack = ti;

    setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
    clear_tsk_need_resched(tsk);
    stackend = end_of_stack(tsk);
    *stackend = STACK_END_MAGIC;    /* for overflow detection */


    /*
     * One for us, one for whoever does the "release_task()" (usually
     * parent)
     */
    atomic_set(&tsk->usage, 2);

    tsk->splice_pipe = NULL;

    account_kernel_stack(ti, 1);

    return tsk;

out:
    free_thread_info(ti);
    free_task_struct(tsk);
    return NULL;
}

How to check the kernel stack correctness
static inline unsigned long *end_of_stack(struct task_struct *p)
{
    return (unsigned long *)(task_thread_info(p) + 1);
}
#define STACK_END_MAGIC        0x57AC6E9D

COMMAND: "dwc_otg"
   TASK: ee1a3420  [THREAD_INFO: ee1c6000]
    CPU: 0
  STATE: TASK_INTERRUPTIBLE 
crash> thread_info ee1c6000
struct thread_info {
  flags = 0, 
  preempt_count = 1, 
  addr_limit = 0, 
  task = 0xee1a3420,

crash> struct task_struct.stack 0xee1a3420
  stack = 0xee1c6000

crash> bt -r
PID: 760    TASK: ee1a3420  CPU: 0   COMMAND: "dwc_otg"
ee1c6000:  00000000 00000001 00000000 ee1a3420 
ee1c6010:  default_exec_domain 00000000 00000015 ee1a3420 
ee1c6020:  c0f88420 init_task ee1c6000 00000000 
ee1c6030:  00000001 init_mm  ee1c7f5c ee1c7f18 
ee1c6040:  __schedule+1412 00000000 00000000 00000000 
ee1c6050:  00000000 00000000 00000000 00000000 
ee1c6060:  00000000 00000000 00000000 00000000 
ee1c6070:  00000000 00000000 00000000 00000000 
ee1c6080:  00000000 00000000 00000000 00000000 
ee1c6090:  00000000 00000000 00000000 00000000 
ee1c60a0:  00000000 00000000 00000000 00000000 
ee1c60b0:  00000000 00000000 00000000 00000000 
ee1c60c0:  00000000 00000000 00000000 00000000 
ee1c60d0:  00000000 00000000 00000000 00000000 
ee1c60e0:  00000000 00000000 00000000 00000000 
ee1c60f0:  00000000 00000000 00000000 00000000 
ee1c6100:  00000000 00000000 00000000 00000000 
ee1c6110:  00000000 00000000 00000000 00000000 
ee1c6120:  00000000 00000000 00000000 00000000 
ee1c6130:  00000000 00000000 00000000 00000000 
ee1c6140:  00000000 00000000 00000000 00000000 
ee1c6150:  00000000 00000000 00000000 00000000 
ee1c6160:  00000000 00000000 00000000 00000000 
ee1c6170:  00000000 00000000 00000000 00000000 
ee1c6180:  00000000 00000000 00000000 00000000 
ee1c6190:  00000000 00000000 00000000 00000000 
ee1c61a0:  00000000 00000000 00000000 00000000 
ee1c61b0:  00000000 00000000 00000000 00000000 
ee1c61c0:  00000000 00000000 00000000 00000000 
ee1c61d0:  00000000 00000000 00000000 00000000 
ee1c61e0:  00000000 00000000 00000000 00000000 
ee1c61f0:  00000000 00000000 00000000 00000000 
ee1c6200:  00000000 00000000 00000000 00000000 
ee1c6210:  00000000 00000000 00000000 00000000 
ee1c6220:  00000000 00000000 00000000 00000000 
ee1c6230:  00000000 00000000 00000000 00000000 
ee1c6240:  00000000 00000000 00000000 00000000 
ee1c6250:  00000000 00000000 00000000 00000000 
ee1c6260:  00000000 00000000 00000000 00000000 
ee1c6270:  00000000 00000000 00000000 00000000 
ee1c6280:  00000000 00000000 00000000 00000000 
ee1c6290:  00000000 00000000 00000000 00000000 
ee1c62a0:  00000000 00000000 00000000 00000000 
ee1c62b0:  00000000 00000000 00000000 00000000 
ee1c62c0:  00000000 00000000 do_no_restart_syscall 00000000 
ee1c62d0:  00000000 00000000 00000000 00000000 
ee1c62e0:  00000000 00000000 00000000 00000000 
ee1c62f0:  57ac6e9d/*STACK_END_MAGIC*/

asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

How to set the new process entry
/*
 * copy_thread - architecture-specific part of creating a new task.
 *
 * Builds the child's saved user register frame from the parent's @regs and
 * sets up the saved kernel context so that the child starts executing at
 * ret_from_fork with its kernel stack pointer at that register frame.
 * Always returns 0.
 */
int copy_thread(unsigned long clone_flags, unsigned long stack_start,
        unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs)
{
    struct pt_regs *child_regs = task_pt_regs(p);
    struct thread_info *info = task_thread_info(p);

    /* Child's user frame: the parent's registers, but r0 = 0 and its own sp. */
    *child_regs = *regs;
    child_regs->ARM_sp = stack_start;
    child_regs->ARM_r0 = 0;

    /* Fresh kernel context: resume at ret_from_fork on top of child_regs. */
    memset(&info->cpu_context, 0, sizeof(info->cpu_context));
    info->cpu_context.pc = (unsigned long)ret_from_fork;
    info->cpu_context.sp = (unsigned long)child_regs;

    clear_ptrace_hw_breakpoint(p);

    /* With CLONE_SETTLS the TLS value is taken from the caller's r3. */
    if (clone_flags & CLONE_SETTLS)
        info->tp_value = regs->ARM_r3;

    thread_notify(THREAD_NOTIFY_COPY, info);

    return 0;
}

/*
 * The struct pt_regs sits at the very end of the 8K kernel stack.
 * Assigning to it sets what is used after returning to user space:
 * the user stack and the return address.
 */
#define task_pt_regs(p) \
    ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)


/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
crash> struct thread_info -o
struct thread_info {
    [0] unsigned long flags;
    [4] int preempt_count;
    [8] mm_segment_t addr_limit;
   [12] struct task_struct *task;
   [16] struct exec_domain *exec_domain;
   [20] __u32 cpu;
   [24] __u32 cpu_domain;
   [28] struct cpu_context_save cpu_context;
   [76] __u32 syscall;
   [80] __u8 used_cp[16];
   [96] unsigned long tp_value;
  [100] struct crunch_state crunchstate;
  [288] union fp_state fpstate;
  [432] union vfp_state vfpstate;
  [712] struct restart_block restart_block;
}

new process entry point
/*
 * This is how we return from a fork.
 *
 * copy_thread() stores this address in the child's cpu_context.pc.
 * Both exits go through ret_slow_syscall; when syscall-work flags are set,
 * syscall_trace is called first.
 */
ENTRY(ret_from_fork)
    bl    schedule_tail
    get_thread_info tsk
    ldr    r1, [tsk, #TI_FLAGS]        @ check for syscall tracing
    mov    why, #1
    tst    r1, #_TIF_SYSCALL_WORK        @ are we tracing syscalls?
    beq    ret_slow_syscall        @ no work flags set: return directly
    mov    r1, sp                @ NOTE(review): presumably the pt_regs pointer for syscall_trace - confirm
    mov    r0, #1                @ trace exit [IP = 1]
    bl    syscall_trace
    b    ret_slow_syscall
ENDPROC(ret_from_fork)


sys_execve

/**************************************************************/
arch/arm/kernel/sys_arm.c

/*
 * sys_execve() executes a new program.
 * This is called indirectly via a small wrapper.
 *
 * Copies the file name in with getname(), runs do_execve() on it and
 * releases the name again. Returns do_execve()'s result, or the error
 * encoded in getname()'s return value if the name could not be obtained.
 */
asmlinkage int sys_execve(const char __user *filenamei,
              const char __user *const __user *argv,
              const char __user *const __user *envp, struct pt_regs *regs)
{
    char *filename = getname(filenamei);
    int error;

    if (IS_ERR(filename))
        return PTR_ERR(filename);

    error = do_execve(filename, argv, envp, regs);
    putname(filename);

    return error;
}

/*
 * do_execve - wrap the raw user-space argv/envp pointers in
 * struct user_arg_ptr and hand everything on to do_execve_common().
 * Returns whatever do_execve_common() returns.
 */
int do_execve(const char *filename,
    const char __user *const __user *__argv,
    const char __user *const __user *__envp,
    struct pt_regs *regs)
{
    struct user_arg_ptr argv = { .ptr.native = __argv };
    struct user_arg_ptr envp = { .ptr.native = __envp };
    return do_execve_common(filename, argv, envp, regs);
}

/**************************************************************/

/*
 * sys_execve() executes a new program.
 *
 * NOTE(review): abridged excerpt - none of the calls below is checked for
 * failure and the function has no return statement as shown.
 */
static int do_execve_common(const char *filename,
                struct user_arg_ptr argv,
                struct user_arg_ptr envp,
                struct pt_regs *regs)
{
    struct linux_binprm *bprm;
    struct file *file;
    /* Zeroed descriptor for the program being loaded. */
    bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
    file = open_exec(filename);
    sched_exec();

    bprm->file = file;
    bprm->filename = filename;
    bprm->interp = filename;

    /* Give bprm a new mm_struct (see bprm_mm_init). */
    bprm_mm_init(bprm);

    /* Count the argument and environment strings. */
    bprm->argc = count(argv, MAX_ARG_STRINGS);

    bprm->envc = count(envp, MAX_ARG_STRINGS);
    prepare_binprm(bprm);
    /* Hand over to a binary format handler's load_binary method. */
    search_binary_handler(bprm,regs);
}

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
int bprm_mm_init(struct linux_binprm *bprm)
{
    int err;
    struct mm_struct *mm = NULL;
    /*mm_struct*/
    bprm->mm = mm = mm_alloc();
    /*vma_struct*/
    err = __bprm_mm_init(bprm);


    return 0;
}

/*
 * cycle the list of binary formats handler, until one recognizes the image
 */
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
    struct linux_binfmt *fmt;
    list_for_each_entry(fmt, &formats, lh)
    int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
    fn(bprm, regs);
}

对elf 格式文件而言
fs/binfmt_elf.c
/* Binary format descriptor for ELF: the callbacks the ELF loader provides. */
static struct linux_binfmt elf_format = {
    .module        = THIS_MODULE,
    /* Invoked through fmt->load_binary by search_binary_handler(). */
    .load_binary    = load_elf_binary,
    .load_shlib    = load_elf_library,
    .core_dump    = elf_core_dump,
    .min_coredump    = ELF_EXEC_PAGESIZE,
};

/*
 * ELF load_binary handler.
 *
 * NOTE(review): heavily abridged - "----" marks omitted code, kernel_read()
 * is shown without its arguments, and no return statement is shown.
 */
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
    ----
    kernel_read();
    /* Point the saved user registers at the new image: pc = elf_entry, sp = bprm->p. */
    start_thread(regs, elf_entry, bprm->p);
}

/*
 * start_thread - rewrite the saved user registers so that the return to
 * user space starts the newly loaded program.
 *
 * Clears all of regs->uregs, then sets:
 *  - cpsr: USR_MODE or USR26_MODE depending on ADDR_LIMIT_32BIT in the
 *    personality, plus PSR_T_BIT when Thumb is available and bit 0 of pc is
 *    set, plus PSR_ENDSTATE;
 *  - pc:   the entry address with bit 0 cleared;
 *  - sp:   the initial user stack pointer;
 *  - r0..r2: the first three words found at sp.
 *
 * Note that the macro arguments are used unparenthesised and more than once.
 */
#define start_thread(regs,pc,sp)                    \
({                                    \
    unsigned long *stack = (unsigned long *)sp;            \
    memset(regs->uregs, 0, sizeof(regs->uregs));            \
    if (current->personality & ADDR_LIMIT_32BIT)            \
        regs->ARM_cpsr = USR_MODE;                \
    else                                \
        regs->ARM_cpsr = USR26_MODE;                \
    if (elf_hwcap & HWCAP_THUMB && pc & 1)                \
        regs->ARM_cpsr |= PSR_T_BIT;                \
    regs->ARM_cpsr |= PSR_ENDSTATE;                    \
    regs->ARM_pc = pc & ~1;        /* pc */            \
    regs->ARM_sp = sp;        /* sp */            \
    regs->ARM_r2 = stack[2];    /* r2 (envp) */            \
    regs->ARM_r1 = stack[1];    /* r1 (argv) */            \
    regs->ARM_r0 = stack[0];    /* r0 (argc) */            \
})

总结:调用execve时,fork创建的新进程已经在运行;execve是在这个新进程自身的上下文中加载并替换程序映像,而不是先在旧(父)进程中把程序加载好之后再去启动新进程。

 

 

 
 

应用层怎样使用fork and execve

/**************************************************************************/

main()
{
    int ret_from_fork,mypid;
    mypid = getPid();
    printf("before:my pid is d%\n",mypid);
    ret_from_fork = fork();
    /*该方法返回生成的子进程的进程id号。用于复制出一个进程后,他们都运行到同样的地方,
     *所以父进程中的ret_from_fork的值是id值,而不时初值0,
     *而子进程的ret_from_fork却没有获得值,还是0.通过这样就可以区别两个进程改变两个进程的走向
         **/

    switch(ret_from_fork){
        case -1:
        perror(" fork failed");
        exit(1);
        /*以下就是子进程要执行的代码,他调用exec载入用户输入的命令指定的程序,
         *清除进程空间执行用户指定的程序
         **/

        case 0:
        execvp(arglist[0],arglist);//arglist[0]中指定用户想执行的命令名
        perror("execvp failed");
        exit(1);

        default:
        while(wait(&exitstatus)!=ret_from_fork);
        /*shell程序,等待子进程运行结束后,再接受用户输入*/
    }

}

 

fork的返回值怎样区分0/pid

/*
 *用户空间调用fork时,子进程得到的返回值0并不是内核的do_fork返回的,do_fork只会(在父进程中)返回新进程的pid。
 *在x86上,子进程的返回值0是内核在ret_from_fork之后、进入用户空间之前执行RESTORE_ALL时pop到eax中的,
 *然后库实现的fork将eax作为返回值;
 *在本文分析的ARM上,对应的是copy_thread中的childregs->ARM_r0 = 0,返回用户空间时该值被恢复到r0。
 *实际上,fork出的子进程在进入用户空间之前从来不经过do_fork这条路径:它的thread中保存的pc(x86上为eip)是ret_from_fork,
 *也就是说,子进程一旦被调度运行,就会经switch_to切换到ret_from_fork开始执行,顺着ret_from_fork往下看,
 *一直到RESTORE_ALL(ARM上为ret_slow_syscall),从而返回用户空间。
 */

fork系统调用的入口,参数来自哪里?

入口参数保存在当前的内核栈中:结构为struct pt_regs
系统调用的入口:
arch/arm/kernel/entry-common.S
@ Small wrapper in front of sys_fork: computes the argument sys_fork takes
@ (a struct pt_regs pointer) from the current kernel stack pointer.
sys_fork_wrapper:
    add    r0, sp, #S_OFF        @ r0 (first argument) = sp + S_OFF
    b    sys_fork            @ plain branch, lr is left untouched
ENDPROC(sys_fork_wrapper)

crash> dis sys_fork_wrapper
0xc000e800 <sys_fork_wrapper>:      add     r0, sp, #8
0xc000e804 <sys_fork_wrapper+4>:        b       0xc0011d28 <sys_fork>

arch/arm/kernel/sys_arm.c
/*
 * sys_fork - fork system call entry (reached through a small wrapper).
 * @regs: saved user register frame of the calling task
 *
 * With an MMU, hands off to do_fork() with SIGCHLD as the clone flags and
 * the caller's saved user stack pointer as the child's stack start, and
 * returns do_fork()'s result. Without an MMU, fork is not supported and
 * -EINVAL is returned.
 */
asmlinkage int sys_fork(struct pt_regs *regs)
{
#ifndef CONFIG_MMU
    /* can not support in nommu mode */
    return -EINVAL;
#else
    return do_fork(SIGCHLD, regs->ARM_sp, regs, 0, NULL, NULL);
#endif
}

crash> dis sys_fork
0xc0011d28 <sys_fork>:  mov     r12, sp
0xc0011d2c <sys_fork+4>:        push    {r11, r12, lr, pc}
0xc0011d30 <sys_fork+8>:        sub     r11, r12, #4
0xc0011d34 <sys_fork+12>:       sub     sp, sp, #8
0xc0011d38 <sys_fork+16>:       mov     r12, #0
0xc0011d3c <sys_fork+20>:       mov     r1, r0
0xc0011d40 <sys_fork+24>:       ldr     r1, [r1, #52]   ; 0x34
0xc0011d44 <sys_fork+28>:       mov     r2, r0
0xc0011d48 <sys_fork+32>:       mov     r3, r12
0xc0011d4c <sys_fork+36>:       mov     r0, #17
0xc0011d50 <sys_fork+40>:       str     r12, [sp]
0xc0011d54 <sys_fork+44>:       str     r12, [sp, #4]
0xc0011d58 <sys_fork+48>:       bl      0xc0027550 <do_fork>
0xc0011d5c <sys_fork+52>:       sub     sp, r11, #12
0xc0011d60 <sys_fork+56>:       ldm     sp, {r11, sp, pc}

/**************************************************************/
/arch/arm/kernel/entry-header.S
@
@ Most of the stack format comes from struct pt_regs, but with
@ the addition of 8 bytes for storing syscall args 5 and 6.
@ This _must_ remain a multiple of 8 for EABI.
@
#define S_OFF        8

/**************************************************************/
/arch/arm/include/asm/ptrace.h
/*
 * This struct defines the way the registers are stored on the
 * stack during a system call.  Note that sizeof(struct pt_regs)
 * has to be a multiple of 8.
 */

/* Saved register frame: 18 words, accessed through the ARM_* aliases below. */
struct pt_regs {
    unsigned long uregs[18];
};


/* Named aliases for the individual uregs[] slots. */
#define ARM_cpsr    uregs[16]
#define ARM_pc        uregs[15]
#define ARM_lr        uregs[14]
#define ARM_sp        uregs[13]
#define ARM_ip        uregs[12]/* r12 (ip) slot */
#define ARM_fp        uregs[11]/* frame pointer */
#define ARM_r10        uregs[10]
#define ARM_r9        uregs[9]
#define ARM_r8        uregs[8]
#define ARM_r7        uregs[7]
#define ARM_r6        uregs[6]
#define ARM_r5        uregs[5]
#define ARM_r4        uregs[4]
#define ARM_r3        uregs[3]
#define ARM_r2        uregs[2]
#define ARM_r1        uregs[1]
#define ARM_r0        uregs[0]
/* NOTE(review): presumably r0 as it was on syscall entry - confirm */
#define ARM_ORIG_r0    uregs[17]

 

how to implement do_fork


/**************************************************************/
do_fork(SIGCHLD, regs->ARM_sp, regs, 0, NULL, NULL);
/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
/
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    p = copy_process(clone_flags, stack_start, regs, stack_size,
             child_tidptr, NULL, trace);

    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        nr = task_pid_vnr(p);
        wake_up_new_task(p);
    }
    return nr;
}

 

copy_process

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
/
static struct task_struct *copy_process(unsigned long clone_flags,
                    unsigned long stack_start,
                    struct pt_regs *regs,
                    unsigned long stack_size,
                    int __user *child_tidptr,
                    struct pid *pid,
                    int trace)
{/*分配了相关结构体的memory;并用原来的赋值*/
    struct task_struct *p;
    p = dup_task_struct(current);
    ----
    /* Perform scheduler related setup. Assign this task to a CPU. */
    sched_fork(p);

    retval = perf_event_init_task(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    retval = audit_alloc(p);
    if (retval)
        goto bad_fork_cleanup_policy;
    /* copy all the process information */
    retval = copy_semundo(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_audit;
    retval = copy_files(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_semundo;
    retval = copy_fs(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_files;
    retval = copy_sighand(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_fs;
    retval = copy_signal(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_sighand;
    retval = copy_mm(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_signal;
    retval = copy_namespaces(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_mm;
    retval = copy_io(clone_flags, p);
    if (retval)
        goto bad_fork_cleanup_namespaces;
    retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
}


static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;
    unsigned long *stackend;
    int node = tsk_fork_get_node(orig);
    int err;

    /*分配了memory for task_struct and thread_info*/
    tsk = alloc_task_struct_node(node);
    if (!tsk)
        return NULL;

    ti = alloc_thread_info_node(tsk, node);
    if (!ti) {
        free_task_struct(tsk);
        return NULL;
    }
    /*
    int arch_dup_task_struct(struct task_struct *dst,
              struct task_struct *src)
    {
        *dst = *src;
        return 0;
    }
    */
    err = arch_dup_task_struct(tsk, orig);
    if (err)
        goto out;

    tsk->stack = ti;

    setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
    clear_tsk_need_resched(tsk);
    stackend = end_of_stack(tsk);
    *stackend = STACK_END_MAGIC;    /* for overflow detection */


    /*
     * One for us, one for whoever does the "release_task()" (usually
     * parent)
     */
    atomic_set(&tsk->usage, 2);

    tsk->splice_pipe = NULL;

    account_kernel_stack(ti, 1);

    return tsk;

out:
    free_thread_info(ti);
    free_task_struct(tsk);
    return NULL;
}

 

How to check the kernel stack correctness

static inline unsigned long *end_of_stack(struct task_struct *p)
{
    return (unsigned long *)(task_thread_info(p) + 1);
}
#define STACK_END_MAGIC        0x57AC6E9D

COMMAND: "dwc_otg"
   TASK: ee1a3420  [THREAD_INFO: ee1c6000]
    CPU: 0
  STATE: TASK_INTERRUPTIBLE 
crash> thread_info ee1c6000
struct thread_info {
  flags = 0, 
  preempt_count = 1, 
  addr_limit = 0, 
  task = 0xee1a3420,

crash> struct task_struct.stack 0xee1a3420
  stack = 0xee1c6000

crash> bt -r
PID: 760    TASK: ee1a3420  CPU: 0   COMMAND: "dwc_otg"
ee1c6000:  00000000 00000001 00000000 ee1a3420 
ee1c6010:  default_exec_domain 00000000 00000015 ee1a3420 
ee1c6020:  c0f88420 init_task ee1c6000 00000000 
ee1c6030:  00000001 init_mm  ee1c7f5c ee1c7f18 
ee1c6040:  __schedule+1412 00000000 00000000 00000000 
ee1c6050:  00000000 00000000 00000000 00000000 
ee1c6060:  00000000 00000000 00000000 00000000 
ee1c6070:  00000000 00000000 00000000 00000000 
ee1c6080:  00000000 00000000 00000000 00000000 
ee1c6090:  00000000 00000000 00000000 00000000 
ee1c60a0:  00000000 00000000 00000000 00000000 
ee1c60b0:  00000000 00000000 00000000 00000000 
ee1c60c0:  00000000 00000000 00000000 00000000 
ee1c60d0:  00000000 00000000 00000000 00000000 
ee1c60e0:  00000000 00000000 00000000 00000000 
ee1c60f0:  00000000 00000000 00000000 00000000 
ee1c6100:  00000000 00000000 00000000 00000000 
ee1c6110:  00000000 00000000 00000000 00000000 
ee1c6120:  00000000 00000000 00000000 00000000 
ee1c6130:  00000000 00000000 00000000 00000000 
ee1c6140:  00000000 00000000 00000000 00000000 
ee1c6150:  00000000 00000000 00000000 00000000 
ee1c6160:  00000000 00000000 00000000 00000000 
ee1c6170:  00000000 00000000 00000000 00000000 
ee1c6180:  00000000 00000000 00000000 00000000 
ee1c6190:  00000000 00000000 00000000 00000000 
ee1c61a0:  00000000 00000000 00000000 00000000 
ee1c61b0:  00000000 00000000 00000000 00000000 
ee1c61c0:  00000000 00000000 00000000 00000000 
ee1c61d0:  00000000 00000000 00000000 00000000 
ee1c61e0:  00000000 00000000 00000000 00000000 
ee1c61f0:  00000000 00000000 00000000 00000000 
ee1c6200:  00000000 00000000 00000000 00000000 
ee1c6210:  00000000 00000000 00000000 00000000 
ee1c6220:  00000000 00000000 00000000 00000000 
ee1c6230:  00000000 00000000 00000000 00000000 
ee1c6240:  00000000 00000000 00000000 00000000 
ee1c6250:  00000000 00000000 00000000 00000000 
ee1c6260:  00000000 00000000 00000000 00000000 
ee1c6270:  00000000 00000000 00000000 00000000 
ee1c6280:  00000000 00000000 00000000 00000000 
ee1c6290:  00000000 00000000 00000000 00000000 
ee1c62a0:  00000000 00000000 00000000 00000000 
ee1c62b0:  00000000 00000000 00000000 00000000 
ee1c62c0:  00000000 00000000 do_no_restart_syscall 00000000 
ee1c62d0:  00000000 00000000 00000000 00000000 
ee1c62e0:  00000000 00000000 00000000 00000000 
ee1c62f0:  57ac6e9d/*STACK_END_MAGIC*/

asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

How to set the new process entry

int

copy_thread(unsigned long clone_flags, unsigned long stack_start,
        unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs)
{
    struct thread_info *thread = task_thread_info(p);
    struct pt_regs *childregs = task_pt_regs(p);

    *childregs = *regs;
    childregs->ARM_r0 = 0;
    childregs->ARM_sp = stack_start;

    memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
    thread->cpu_context.sp = (unsigned long)childregs;
    thread->cpu_context.pc = (unsigned long)ret_from_fork;

    clear_ptrace_hw_breakpoint(p);

    if (clone_flags & CLONE_SETTLS)
        thread->tp_value = regs->ARM_r3;

    thread_notify(THREAD_NOTIFY_COPY, thread);

    return 0;
}

/*
 * The struct pt_regs sits at the very end of the 8K kernel stack.
 * Assigning to it sets what is used after returning to user space:
 * the user stack and the return address.
 */

#define task_pt_regs(p) \
    ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)


/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
crash> struct thread_info -o
struct thread_info {
    [0] unsigned long flags;
    [4] int preempt_count;
    [8] mm_segment_t addr_limit;
   [12] struct task_struct *task;
   [16] struct exec_domain *exec_domain;
   [20] __u32 cpu;
   [24] __u32 cpu_domain;
   [28] struct cpu_context_save cpu_context;
   [76] __u32 syscall;
   [80] __u8 used_cp[16];
   [96] unsigned long tp_value;
  [100] struct crunch_state crunchstate;
  [288] union fp_state fpstate;
  [432] union vfp_state vfpstate;
  [712] struct restart_block restart_block;
}

 

new process entry point

/*
 * This is how we return from a fork.
 */
@ copy_thread() stores this address in the child's cpu_context.pc.
@ Both exits go through ret_slow_syscall; when syscall-work flags are set,
@ syscall_trace is called first.
ENTRY(ret_from_fork)
    bl    schedule_tail
    get_thread_info tsk
    ldr    r1, [tsk, #TI_FLAGS]        @ check for syscall tracing
    mov    why, #1
    tst    r1, #_TIF_SYSCALL_WORK        @ are we tracing syscalls?
    beq    ret_slow_syscall        @ no work flags set: return directly
    mov    r1, sp                @ NOTE(review): presumably the pt_regs pointer for syscall_trace - confirm
    mov    r0, #1                @ trace exit [IP = 1]
    bl    syscall_trace
    b    ret_slow_syscall

ENDPROC(ret_from_fork)

 

sys_execve


/**************************************************************/
arch/arm/kernel/sys_arm.c

/* sys_execve() executes a new program.
 * This is called indirectly via a small wrapper
 *
/
asmlinkage int sys_execve(const char __user *filenamei,
              const char __user *const __user *argv,
              const char __user *const __user *envp, struct pt_regs *regs)
{
    int error;
    char * filename;

    filename = getname(filenamei);
    error = PTR_ERR(filename);
    if (IS_ERR(filename))
        goto out;
    error = do_execve(filename, argv, envp, regs);
    putname(filename);
out:
    return error;
}

int do_execve(const char *filename,
    const char __user *const __user *__argv,
    const char __user *const __user *__envp,
    struct pt_regs *regs)
{
    struct user_arg_ptr argv = { .ptr.native = __argv };
    struct user_arg_ptr envp = { .ptr.native = __envp };
    return do_execve_common(filename, argv, envp, regs);
}

/**************************************************************/

/*
 * sys_execve() executes a new program.
 *
 * NOTE(review): abridged excerpt - none of the calls below is checked for
 * failure and the function has no return statement as shown.
 */
static int do_execve_common(const char *filename,
                struct user_arg_ptr argv,
                struct user_arg_ptr envp,
                struct pt_regs *regs)
{
    struct linux_binprm *bprm;
    struct file *file;
    /* Zeroed descriptor for the program being loaded. */
    bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
    file = open_exec(filename);
    sched_exec();

    bprm->file = file;
    bprm->filename = filename;
    bprm->interp = filename;

    /* Give bprm a new mm_struct (see bprm_mm_init). */
    bprm_mm_init(bprm);

    /* Count the argument and environment strings. */
    bprm->argc = count(argv, MAX_ARG_STRINGS);

    bprm->envc = count(envp, MAX_ARG_STRINGS);
    prepare_binprm(bprm);
    /* Hand over to a binary format handler's load_binary method. */
    search_binary_handler(bprm,regs);
}

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
int bprm_mm_init(struct linux_binprm *bprm)
{
    int err;
    struct mm_struct *mm = NULL;
    /*mm_struct*/
    bprm->mm = mm = mm_alloc();
    /*vma_struct*/
    err = __bprm_mm_init(bprm);


    return 0;
}

/*
 * cycle the list of binary formats handler, until one recognizes the image
 */
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
    struct linux_binfmt *fmt;
    list_for_each_entry(fmt, &formats, lh)
    int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
    fn(bprm, regs);
}

 

对elf 格式文件而言

fs/binfmt_elf.c
/* Binary format descriptor for ELF: the callbacks the ELF loader provides. */
static struct linux_binfmt elf_format = {
    .module        = THIS_MODULE,
    /* Invoked through fmt->load_binary by search_binary_handler(). */
    .load_binary    = load_elf_binary,
    .load_shlib    = load_elf_library,
    .core_dump    = elf_core_dump,
    .min_coredump    = ELF_EXEC_PAGESIZE,
};

/*
 * ELF load_binary handler.
 *
 * NOTE(review): heavily abridged - "----" marks omitted code, kernel_read()
 * is shown without its arguments, and no return statement is shown.
 */
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
    ----
    kernel_read();
    /* Point the saved user registers at the new image: pc = elf_entry, sp = bprm->p. */
    start_thread(regs, elf_entry, bprm->p);
}

/*
 * start_thread - rewrite the saved user registers so that the return to
 * user space starts the newly loaded program.
 *
 * Clears all of regs->uregs, then sets:
 *  - cpsr: USR_MODE or USR26_MODE depending on ADDR_LIMIT_32BIT in the
 *    personality, plus PSR_T_BIT when Thumb is available and bit 0 of pc is
 *    set, plus PSR_ENDSTATE;
 *  - pc:   the entry address with bit 0 cleared;
 *  - sp:   the initial user stack pointer;
 *  - r0..r2: the first three words found at sp.
 *
 * Note that the macro arguments are used unparenthesised and more than once.
 */
#define start_thread(regs,pc,sp)                    \
({                                    \
    unsigned long *stack = (unsigned long *)sp;            \
    memset(regs->uregs, 0, sizeof(regs->uregs));            \
    if (current->personality & ADDR_LIMIT_32BIT)            \
        regs->ARM_cpsr = USR_MODE;                \
    else                                \
        regs->ARM_cpsr = USR26_MODE;                \
    if (elf_hwcap & HWCAP_THUMB && pc & 1)                \
        regs->ARM_cpsr |= PSR_T_BIT;                \
    regs->ARM_cpsr |= PSR_ENDSTATE;                    \
    regs->ARM_pc = pc & ~1;        /* pc */            \
    regs->ARM_sp = sp;        /* sp */            \
    regs->ARM_r2 = stack[2];    /* r2 (envp) */            \
    regs->ARM_r1 = stack[1];    /* r1 (argv) */            \
    regs->ARM_r0 = stack[0];    /* r0 (argc) */            \
})

 

总结:调用execve时,fork创建的新进程已经在运行;execve是在这个新进程自身的上下文中加载并替换程序映像,而不是先在旧(父)进程中把程序加载好之后再去启动新进程。

【作者】张昺华
【新浪微博】 张昺华--sky
【twitter】 @sky2030_
【facebook】 张昺华 zhangbinghua
本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利.
上一篇:通过EmbeddedServletContainerCustomizer接口调优Tomcat


下一篇:Powershell管理系列(十一)Exchange完全访问权限邮箱的设置