idle的启动(一)

注:本文主要基于展锐Android R代码,并结合对IngresGe大佬分析文章的学习总结而成
简要流程图:
idle的启动(一)

kernel启动init

从源码解析
bsp/kernel/kernel4.14/init/main.c

head.S中的汇编指令最终跳转到此文件中的start_kernel()函数;start_kernel()完成各种初始化后调用rest_init(),由rest_init()创建init和kthreadd进程

/*
 * Kernel C entry point reached from head.S (abridged excerpt): after the
 * elided initialization steps it calls rest_init().
 * NOTE(review): the ">>== ... ==<<" wrapper below appears to be the article's
 * highlight markup rather than C syntax — strip it before compiling.
 */
asmlinkage __visible void __init start_kernel(void)
{
	// Various initialization steps (elided in this excerpt)
	...
	/* Do the rest non-__init'ed, we're now alive */
	>>==rest_init()==<<;

	prevent_tail_call_optimization();
}

rest_init()具体实现如下:

/*
 * Last step of start_kernel(): spawns the kernel_init and kthreadd threads,
 * stores kthreadd's task_struct in kthreadd_task, signals kthreadd_done,
 * yields the CPU once and finally enters cpu_startup_entry().
 * NOTE(review): the ">>== ... ==<<" wrappers below appear to be the article's
 * highlight markup rather than C syntax — strip them before compiling.
 */
static noinline void __ref rest_init(void)
{
	struct task_struct *tsk;
	int pid;
    
	// Mark the RCU scheduler as started; the rcu_read_lock()/rcu_read_unlock()
	// pairs used further down rely on RCU being set up
	rcu_scheduler_starting();

	/*
	 * A function name can denote either the function itself or a pointer to it;
	 * passing kernel_init as an argument passes a function pointer.
	 * Create the init thread through kernel_thread().
	 * CLONE_FS: the child shares filesystem information (root, current directory,
	 * umask) with its parent. CLONE_SIGHAND (shared signal-handler table) is NOT
	 * passed at this call site.
	 */
	>>==pid = kernel_thread(kernel_init, NULL, CLONE_FS);==<<
	
	// Look up the new task under the RCU read lock and restrict it to the CPU
	// this code is currently running on
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
	rcu_read_unlock();
	// Reset the NUMA memory policy of the current task to the default
	numa_default_policy();
	// Create the kthreadd thread through kernel_thread(); CLONE_FILES makes the
	// child share the parent's file-descriptor table
	>>==pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);==<<
	// Enter an RCU read-side section
	rcu_read_lock();
	// Fetch kthreadd's task_struct; find_task_by_pid_ns() must be called under
	// rcu_read_lock(), hence the surrounding lock/unlock pair
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
	// Leave the RCU read-side section
	rcu_read_unlock();
	system_state = SYSTEM_SCHEDULING;
	// complete() and wait_for_completion() form a paired synchronization
	// mechanism, roughly comparable to Java's notify()/wait().
	// Per the article, the kernel_init side waits on kthreadd_done (not shown
	// here); complete() tells it that kthreadd now exists so it may continue.
	complete(&kthreadd_done);
	// Task 0 voluntarily gives up the CPU so that other tasks (per the article,
	// kernel_init) can run; the call returns with preemption disabled
	schedule_preempt_disabled();
	/* Call into cpu_idle with preempt disabled */
	// cpu_startup_entry() loops forever calling do_idle(), so this task becomes
	// the idle task and never returns from here
	cpu_startup_entry(CPUHP_ONLINE);
}

下面解析rest_init()中的各个函数:


bsp/kernel/kernel4.14/kernel/rcu/tree.c

/*
 * Moves RCU into its scheduler-initialized state: checks that exactly one CPU
 * is online and that no context switch has happened yet, then sets
 * rcu_scheduler_active to RCU_SCHEDULER_INIT, calling rcu_test_sync_prims()
 * both before and after the change.
 */
void rcu_scheduler_starting(void)
{
	// WARN_ON only warns (per the article it prints the current stack and does
	// not reboot); num_online_cpus() is the number of CPUs currently online
	WARN_ON(num_online_cpus() != 1);
	// nr_context_switches() is the number of context switches performed so far
	WARN_ON(nr_context_switches() > 0);
	rcu_test_sync_prims();
	// Mark the RCU scheduler as initialized
	rcu_scheduler_active = RCU_SCHEDULER_INIT;
	rcu_test_sync_prims();
}

bsp/kernel/kernel4.14/kernel/fork.c

/*
 * Create a kernel thread.
 *
 * In C, "int (*fn)(void *)" declares a function pointer named fn: the pointed-to
 * function returns int and takes a single void * argument.
 *
 * @fn:    function passed on to _do_fork() as its stack_start argument
 * @arg:   argument passed on to _do_fork() as its stack_size argument
 * @flags: caller-supplied clone flags; CLONE_VM and CLONE_UNTRACED are always
 *         added on top of them
 *
 * Returns whatever _do_fork() returns.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	unsigned long clone_flags = flags | CLONE_VM | CLONE_UNTRACED;

	return _do_fork(clone_flags, (unsigned long)fn, (unsigned long)arg,
			NULL, NULL, 0);
}

_do_fork函数用于创建进程:它首先调用copy_process()创建新进程,然后调用wake_up_new_task()将新进程放入运行队列并唤醒它。kernel_thread的第一个参数是一个函数指针,新进程创建后会执行该函数;第二个参数是传给该函数的参数;第三个参数是创建进程时使用的clone标志,常见标志如下:

参数名 作用
CLONE_PARENT 创建的子进程的父进程是调用者的父进程,新进程与创建它的进程成了“兄弟”而不是“父子”
CLONE_FS 子进程与父进程共享相同的文件系统,包括root、当前目录、umask
CLONE_FILES 子进程与父进程共享相同的文件描述符(file descriptor)表
CLONE_NEWNS 在新的挂载命名空间(mount namespace)中启动子进程,挂载命名空间描述了进程所见的文件系统层次结构(挂载点集合)
CLONE_SIGHAND 子进程与父进程共享相同的信号处理(signal handler)表
CLONE_PTRACE 若父进程被trace,子进程也被trace
CLONE_UNTRACED 若父进程被trace,子进程不被trace
CLONE_VFORK 父进程被挂起,直至子进程释放虚拟内存资源
CLONE_VM 子进程与父进程运行于相同的内存空间
CLONE_PID 子进程在创建时PID与父进程一致(该标志仅存在于早期内核,自Linux 2.5.16起已被移除,4.14内核中不存在)
CLONE_THREAD Linux 2.4中增加以支持POSIX线程标准,子进程与父进程共享相同的线程群

_do_fork()函数如下:

/*
 * Common fork/clone implementation: creates a new task with copy_process()
 * and, on success, starts it with wake_up_new_task().
 *
 * @clone_flags:   clone flags controlling what is shared and how tracing works
 * @stack_start:   forwarded to copy_process(); kernel_thread() passes the
 *                 thread function here
 * @stack_size:    forwarded to copy_process(); kernel_thread() passes the
 *                 thread function's argument here
 * @parent_tidptr: user pointer that receives the new id when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  forwarded to copy_process()
 * @tls:           forwarded to copy_process()
 *
 * Returns the new task's id as seen through pid_vnr(), or the error code
 * carried by copy_process()'s return value.
 * NOTE(review): the ">>== ... ==<<" wrappers below appear to be the article's
 * highlight markup rather than C syntax — strip them before compiling.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	struct task_struct *p;
	int trace = 0;
	long nr;
	/*
	 * Unless the caller asked for an untraced child, pick the ptrace event
	 * matching the kind of fork, then drop it again if that event is not
	 * enabled for the current task.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}

	/* Build the new task; the result is either a task or an ERR_PTR value. */
	>>==p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);==<<
	add_latent_entropy();
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		cpufreq_task_times_alloc(p);

		trace_sched_process_fork(current, p);

		/* Take a reference on the child's pid; released by put_pid() below. */
		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		/* For vfork, set up the completion the parent waits on further down. */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

		/* Hand the new task over to the scheduler. */
		>>==wake_up_new_task(p);==<<

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		put_pid(pid);
	} else {
		/* copy_process() failed: return its error code. */
		nr = PTR_ERR(p);
	}
	return nr;
}

bsp/kernel/kernel4.14/mm/mempolicy.c
numa_default_policy():

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	// Set the memory policy to MPOL_DEFAULT (no flags, no node mask)
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Set the process memory policy
 *
 * Builds a new mempolicy from @mode, @flags and @nodes and installs it as
 * current->mempolicy, dropping the reference to the previous one.
 *
 * Returns 0 on success, -ENOMEM when the scratch nodemask is unavailable, the
 * error carried by mpol_new()'s result, or the non-zero value returned by
 * mpol_set_nodemask().
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	/* The nodemask setup and the policy swap both happen under task_lock. */
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		/* Undo on failure: unlock and release the policy just created. */
		task_unlock(current);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE)
		current->il_prev = MAX_NUMNODES-1;
	task_unlock(current);
	/* Release the previous policy only after the lock has been dropped. */
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

bsp/kernel/kernel4.14/include/linux/rcupdate.h
RCU(Read-Copy Update)是数据同步的一种方式,在当前的Linux内核中发挥着重要的作用。RCU主要针对的数据对象是链表,目的是提高遍历读取数据的效率:使用RCU机制读取数据时,不需要对链表进行耗时的加锁操作。这样,同一时间可以有多个线程同时读取该链表,并且允许同时有一个线程对链表进行修改(多个写者之间仍需加锁互斥)。

/*
 * Begin an RCU read-side section: calls __rcu_read_lock(), then records the
 * acquisition for the checkers (__acquire / rcu_lock_acquire) and warns if RCU
 * is not watching this CPU.
 * NOTE(review): __acquire(RCU) is presumably a static-analysis annotation with
 * no runtime effect — confirm against its definition.
 */
static __always_inline void rcu_read_lock(void)
{
	__rcu_read_lock();
	__acquire(RCU);
	rcu_lock_acquire(&rcu_lock_map);
	RCU_LOCKDEP_WARN(!rcu_is_watching(),
			 "rcu_read_lock() used illegally while idle");
}

/*
 * End an RCU read-side section started by rcu_read_lock(): warns if RCU is not
 * watching this CPU, then undoes the annotation, calls __rcu_read_unlock() and
 * finally releases the lockdep map entry.
 * NOTE(review): __release(RCU) is presumably a static-analysis annotation with
 * no runtime effect — confirm against its definition.
 */
static inline void rcu_read_unlock(void)
{
	RCU_LOCKDEP_WARN(!rcu_is_watching(),
			 "rcu_read_unlock() used illegally while idle");
	__release(RCU);
	__rcu_read_unlock();
	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
}

bsp/kernel/kernel4.14/kernel/pid.c

task_struct叫进程描述符,这个结构体包含了一个进程所需的所有信息
find_task_by_pid_ns的作用就是根据pid,在hash表中获得对应pid的task_struct

/*
 * Look up the task that owns pid number @nr inside namespace @ns.
 *
 * Must be called under rcu_read_lock(); a lockdep warning is raised otherwise.
 * Returns the result of pid_task() for the located struct pid.
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
	struct pid *found;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "find_task_by_pid_ns() needs rcu_read_lock() protection");

	found = find_pid_ns(nr, ns);
	return pid_task(found, PIDTYPE_PID);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
	struct upid *pnr;

	hlist_for_each_entry_rcu(pnr,
			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
		if (pnr->nr == nr && pnr->ns == ns)
			return container_of(pnr, struct pid,
					numbers[ns->level]);

	return NULL;
}

/*
 * Return the first task attached to @pid for the given pid @type, or NULL
 * when @pid is NULL or its task list for @type is empty.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
	struct hlist_node *head;

	if (!pid)
		return NULL;

	head = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
				     lockdep_tasklist_lock_is_held());
	if (!head)
		return NULL;

	return hlist_entry(head, struct task_struct, pids[(type)].node);
}

bsp/kernel/kernel4.14/kernel/sched/core.c

/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
	// Re-enable kernel preemption (without rescheduling at this point)
	sched_preempt_enable_no_resched();
	// Voluntarily invoke the scheduler and give up the CPU
	schedule();
	// Disable kernel preemption again before returning to the caller
	preempt_disable();
}

bsp/kernel/kernel4.14/kernel/sched/idle.c

/*
 * Entry into the idle loop for a CPU: performs the pre-idle preparation and
 * then calls do_idle() forever. This function never returns.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
	/*
	 * This #ifdef needs to die, but it's too late in the cycle to
	 * make this generic (arm and sh have never invoked the canary
	 * init for the non boot cpus!). Will be fixed in 3.11
	 */
#ifdef CONFIG_X86
	/*
	 * Compiled in on x86 only. On a non-boot CPU nothing has set up the
	 * stack canary yet; the boot CPU already has one, but initializing it
	 * again is harmless. Updating it here is safe because this function
	 * never returns, so stale canaries already on the stack are never
	 * checked. The canary value is used to detect stack overflows.
	 */
	boot_init_stack_canary();
#endif
	/* Preparation work before going idle. */
	arch_cpu_idle_prepare();
	cpuhp_online_idle(state);

	for (;;) {
		do_idle();
	}
}

idle进程是Linux系统的第一个进程,进程号是0。它在完成系统环境初始化工作之后,创建了两个重要的进程——init进程和kthreadd进程;创建完成后,它先通过schedule_preempt_disabled()主动让出CPU,再次被调度时进入cpu_startup_entry()中的无限循环,反复执行do_idle()。
接下来分析kthreadd与init进程


上一篇:Python运维之获取系统CPU信息


下一篇:Linux 换成国内镜像源(加快yum下载速度)代码