Linux如何创建一个新进程

2016-03-31

张超《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

Linux如何创建一个新进程

1.我们先阅读理解task_struct数据结构

1235struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace; #ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts; int wake_cpu;
#endif
int on_rq; int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif #ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
bool rcu_tasks_holdout;
struct list_head rcu_tasks_holdout_list;
int rcu_tasks_idle_cpu;
#endif /* #ifdef CONFIG_TASKS_RCU */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif struct mm_struct *mm, *active_mm;
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:;
#endif
/* per-thread vma caching */
u32 vmacache_seqnum;
struct vm_area_struct *vmacache[VMACACHE_SIZE];
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
unsigned int jobctl; /* JOBCTL_*, siglock protected */ /* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality; unsigned in_execve:; /* Tell the LSMs that the process is doing an
1321 * execve */
unsigned in_iowait:; /* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:;
unsigned sched_contributes_to_load:; unsigned long atomic_flags; /* Flags needing atomic access. */ pid_t pid;
pid_t tgid; #ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
1338 * pointers to (original) parent process, youngest child, younger sibling,
1339 * older sibling, respectively. (p->father can be replaced with
1340 * p->real_parent->pid)
1341 */
struct task_struct __rcu *real_parent; /* real parent process */
struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
/*
1345 * children/sibling forms the list of my natural children
1346 */
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */ /*
1352 * ptraced is the list of tasks this task is using ptrace on.
1353 * This includes both natural children and PTRACE_ATTACH targets.
1354 * p->ptrace_entry is p's link on the p->parent->ptraced list.
1355 */
struct list_head ptraced;
struct list_head ptrace_entry; /* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node; struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
struct cputime prev_cputime;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_t vtime_seqlock;
unsigned long long vtime_snap;
enum {
VTIME_SLEEPING = ,
VTIME_USER,
VTIME_SYS,
} vtime_snap_whence;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
u64 start_time; /* monotonic time in nsec */
u64 real_start_time; /* boot based time in nsec */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt; struct task_cputime cputime_expires;
struct list_head cpu_timers[]; /* process credentials */
const struct cred __rcu *real_cred; /* objective and real subjective task
1393 * credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
1395 * credentials (COW) */
char comm[TASK_COMM_LEN]; /* executable name excluding path
1397 - access with [gs]et_task_comm (which lock
1398 it with task_lock())
1399 - initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand; sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending; unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct callback_head *task_works; struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp; /* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1445 * mempolicy */
spinlock_t alloc_lock; /* Protection of the PI data structures: */
raw_spinlock_t pi_lock; #ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif #ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
gfp_t lockdep_reclaim_gfp;
#endif /* journalling filesystem info */
void *journal_info; /* stacked block device info */
struct bio_list *bio_list; #ifdef CONFIG_BLOCK
/* stack plugging */
struct blk_plug *plug;
#endif /* VM state */
struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; struct io_context *io_context; unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed; /* Protected by alloc_lock */
seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work; struct list_head numa_entry;
struct numa_group *numa_group; /*
1561 * Exponential decaying average of faults on a per-node basis.
1562 * Scheduling placement decisions are made based on the these counts.
1563 * The values remain static for the duration of a PTE scan
1564 */
unsigned long *numa_faults_memory;
unsigned long total_numa_faults; /*
1569 * numa_faults_buffer records faults per node during the current
1570 * scan window. When the scan completes, the counts in
1571 * numa_faults_memory decay and these values are copied.
1572 */
unsigned long *numa_faults_buffer_memory; /*
1576 * Track the nodes the process was running on when a NUMA hinting
1577 * fault was incurred.
1578 */
unsigned long *numa_faults_cpu;
unsigned long *numa_faults_buffer_cpu; /*
1583 * numa_faults_locality tracks if faults recorded during the last
1584 * scan window were remote/local. The task scan period is adapted
1585 * based on the locality of the faults with different weights
1586 * depending on whether they were shared or private faults
1587 */
unsigned long numa_faults_locality[]; unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; /*
1596 * cache last used pipe for splice
1597 */
struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
/*
1609 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1610 * balance_dirty_pages() for some dirty throttling pause
1611 */
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when; /* start of a write-and-pause period */ #ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
1621 * time slack values; these are used to round up poll() and
1622 * select() etc timeout values. These are in nanoseconds.
1623 */
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns; #ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
1635 * Number of functions that haven't been traced
1636 * because of depth overrun.
1637 */
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
struct mem_cgroup *memcg;
gfp_t gfp_mask;
int order;
unsigned int may_oom:;
} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
};

task_struct

关于task_struct的具体介绍,见

http://blog.csdn.net/npy_lp/article/details/7292563


它定义在linux-3.18.6/include/linux/sched.h文件中。

进程(Process)是系统进行资源分配和调度的基本单位,一个进程是一个程序的运行实例。而在Linux中,可以使用一个进程来创建另外一个进程。这样的话,Linux的进程的组织结构其实有点像Linux目录树,是个层次结构的,可以使用 pstree命令来查看。在最上面是init程序的执行进程。它是所有进程的老祖宗。Linux提供了两个函数来创建进程。

1.fork()

fork()提供了创建进程的基本操作,可以说它是Linux系统多任务的基础。该函数的内核实现位于linux-3.18.6/kernel/fork.c文件中。

2.exec系列函数

如果只有fork(),肯定是不完美的,因为fork()只能产生一个父进程的副本。而exec系列函数则可以用一个全新的程序替换当前进程的映像,二者配合使用(先fork再exec)就可以让新进程运行一个全新的程序。

在Linux系统中,一个进程的PCB是一个C语言的结构体task_struct来表示,而多个PCB之间是由一个双向链表组织起来的,在《Understanding the Linux Kernel》中,则是进一步描述这个链表是一个双向循环链表。

在Linux中创建一个新进程的方法是使用fork函数,fork()执行一次但有两个返回值。

在父进程中,返回值是子进程的进程号;在子进程中,返回值为0。因此可通过返回值来判断当前进程是父进程还是子进程。

使用fork函数得到的子进程是父进程的一个复制品,它从父进程处复制了整个进程的地址空间,包括进程上下文,进程堆栈,内存信息,打开的文件描述符,信号控制设定,进程优先级,进程组号,当前工作目录,根目录,资源限制,控制终端等。而子进程所独有的只是它的进程号,资源使用和计时器等。可以看出,使用fork函数的代价是很大的,它复制了父进程中的代码段,数据段和堆栈段里的大部分内容,使得fork函数的执行速度并不快。

创建一个进程,至少涉及的函数:

sys_clone, do_fork, dup_task_struct, copy_process, copy_thread, ret_from_fork

Linux如何创建一个新进程

这只是图中的fork一个分支

学习笔记

进程的描述

1.进程描述符task_struct数据结构(一)

为了管理进程,内核必须对每个进程进行清晰的描述,进程描述符提供了内核所需了解的进程信息。

  • struct task_struct数据结构很庞大
  • Linux进程的状态与操作系统原理中的描述的进程状态似乎有所不同,比如就绪状态和运行状态都是TASK_RUNNING,为什么呢?
  • 进程的标识pid
  • 所有进程链表struct list_head tasks;     内核的双向循环链表的实现方法 - 一个更简略的双向循环链表
  • 程序创建的进程具有父子关系,在编程时往往需要引用这样的父子关系。进程描述符中有几个域用来表示这样的关系
  • Linux为每个进程分配一个8KB大小的内存区域,用于存放该进程两个不同的数据结构:thread_info和进程的内核堆栈

      进程处于内核态时使用,不同于用户态堆栈,即PCB中指定了内核栈,那为什么PCB中没有用户态堆栈?用户态堆栈是怎么设定的?

      内核控制路径所用的堆栈很少,因此对栈和thread_info来说,8KB足够了

  • struct thread_struct thread; //CPU-specific state of this task
  • 文件系统和文件描述符
  • 内存管理——进程的地址空间

进程状态的切换过程和原因大致如下图:

Linux如何创建一个新进程

双向循环链表图如下:

Linux如何创建一个新进程

进程的父子关系直观图:

Linux如何创建一个新进程

进程的创建

1.进程的创建概览及fork一个进程的用户态代码

(1)进程的起源再回顾

  • 道生一(start_kernel...cpu_idle)
  • 一生二(kernel_init和kthreadd)
  • 二生三(即前面的0、1、2三个进程)
  • 三生万物(1号进程是所有用户态进程的祖先,2号进程是所有内核线程的祖先)

(2)0号进程手工写,1号进程复制、加载init程序

(3)shell命令行是如何启动进程的

fork一个子进程的代码:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
/*
 * Minimal fork() demonstration.
 *
 * fork() is called once but returns twice: in the parent it returns the
 * child's pid (> 0), in the child it returns 0, and on failure it returns
 * a negative value in the caller only.  The parent waits for the child to
 * terminate before printing its final message.
 *
 * Returns 0 on success; exits with status -1 if fork() fails.
 */
int main(int argc, char *argv[])
{
	pid_t pid;

	/* command-line arguments are not used by this demo */
	(void)argc;
	(void)argv;

	/* fork another process */
	pid = fork();
	if (pid < 0) {
		/* error handling: no child was created */
		fprintf(stderr, "Fork Failed!");
		exit(-1);
	} else if (pid == 0) {
		/*
		 * child process: fork() returned 0 here.  Because fork()
		 * returns once in the parent and once in the child, this
		 * branch and the else branch below both run, each in a
		 * different process.
		 */
		printf("This is Child Process!\n");
	} else {
		/* parent process */
		printf("This is Parent Process!\n");
		/* parent will wait for the child to complete */
		wait(NULL);
		printf("Child Complete!\n");
	}

	return 0;
}

2.理解进程创建过程复杂代码的方法

(1)系统调用再回顾

Linux如何创建一个新进程

Linux如何创建一个新进程

(2)fork的子进程是从哪里开始执行的?

与基于mykernel写的精简内核对照起来。

(3)创建一个新进程在内核中的执行过程

  • fork、vfork和clone三个系统调用都可以创建一个新进程,而且都是通过调用do_fork来实现进程的创建;
  • Linux通过复制父进程来创建一个新进程,那么这就给我们理解这一个过程提供一个想象的框架:
  • 复制一个PCB——task_struct
    err = arch_dup_task_struct(tsk, orig);
  • 要给新进程分配一个新的内核堆栈
ti = alloc_thread_info_node(tsk, node);
tsk->stack = ti;
setup_thread_stack(tsk, orig); //这里只是复制thread_info,而非复制内核堆栈
  • 要修改复制过来的进程数据,比如pid、进程链表等等都要改改吧,见copy_process内部。
  • 从用户态的代码看fork();函数返回了两次,即在父子进程中各返回一次,父进程从系统调用中返回比较容易理解,子进程从系统调用中返回,那它在系统调用处理过程中的哪里开始执行的呢?这就涉及子进程的内核堆栈数据状态和task_struct中thread记录的sp和ip的一致性问题,这是在哪里设定的?copy_thread in copy_process
*childregs = *current_pt_regs(); //复制内核堆栈
childregs->ax = 0; //为什么子进程的fork返回0,这里就是原因!

p->thread.sp = (unsigned long) childregs; //调度到子进程时的内核栈顶
p->thread.ip = (unsigned long) ret_from_fork; //调度到子进程时的第一条指令地址

(4)理解复杂事物要预设一个大致的框架。

(5)创建新进程是通过复制当前进程来实现的。

(6)设想创建新进程过程中需要做哪些事

3.浏览进程创建过程相关的关键代码

(1)系统调用内核处理函数sys_fork、sys_clone、sys_vfork

Linux如何创建一个新进程

最终都是执行do_fork()。

Linux如何创建一个新进程

do_fork()里的复制进程的函数:

Linux如何创建一个新进程

具体:

Linux如何创建一个新进程

打开复制PCB的具体函数:

Linux如何创建一个新进程

打开alloc_thread_info():

Linux如何创建一个新进程

Linux如何创建一个新进程

拷贝内核堆栈数据和指定新进程的第一条指令地址。

4.创建的新进程是从哪里开始执行的?

(1)复制内核堆栈时

Linux如何创建一个新进程

打开pt_regs:

Linux如何创建一个新进程

int指令和SAVE_ALL压到内核栈的内容。

下面分析entry_32.S,也就是总控程序。

Linux如何创建一个新进程

Linux如何创建一个新进程

5.使用gdb跟踪创建新进程的过程(见作业)

实验:

1、流程

添加fork()到MenuOS

编译并启动MenuOS

用GDB连接,添加breakpoints,

根据观察copy_process是建立新进程,

wake_up_new_task则是运行这个新进程,所以要尝试添加这样一个断点

breakpoints list:

b sys_clone

b do_fork
b copy_process
b dup_task_struct
b alloc_task_struct_node
b arch_dup_task_struct
b copy_thread
b ret_from_fork
b wake_up_new_task

跟踪fork执行

2、实验记录

2.1 添加并验证fork()可用

2.2 跟踪fork

Linux如何创建一个新进程

Linux如何创建一个新进程

Linux如何创建一个新进程

四、总结

fork创建的新进程是和父进程(除了PID和PPID)一样的副本,包括真实和有效的UID和GID、进程组和会话ID、环境、资源限制、打开的文件以及共享内存段。

根据代码的分析,do_fork中,copy_process管子进程运行的准备,wake_up_new_task作为子进程forking的完成。

上一篇:初探APT 攻击


下一篇:MUI功能列表