Process 0 initializes the scheduler's core data structures, and its scheduling class is eventually switched to idle_sched_class
On arm64, the first C function reached from the assembly startup code in arch/arm64/kernel/head.S is start_kernel:
start_kernel() -----> sched_init(). Let us look at how sched_init() is implemented, walking through the function step by step:
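For context, a heavily abridged sketch of the call site in init/main.c (4.x-era kernels); everything around the call is elided:

asmlinkage __visible void __init start_kernel(void)
{
...
/* core scheduler structures must exist before the first context switch */
sched_init();
...
}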
void __init sched_init(void)
{
int i, j;
unsigned long alloc_size = 0, ptr;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* reserve room for per-CPU cfs_rq and sched_entity pointer arrays; every CPU has its own cfs_rq and se */
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
/* likewise reserve room for per-CPU rt_rq and sched_rt_entity pointers */
alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
if (alloc_size) {
ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
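/* GFP_NOWAIT: we are early in boot, so the allocation must not sleep */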
#ifdef CONFIG_FAIR_GROUP_SCHED
/* root_task_group is the root of the whole task-group hierarchy: every
task, standalone or grouped, ultimately hangs below it. The alloc_size
block starting at ptr is now carved into per-CPU arrays: first the
cfs_rq and se pointers for each CPU, */
root_task_group.se = (struct sched_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.cfs_rq = (struct cfs_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
/* then the same carving for RT; root_task_group thus covers both CFS
tasks and RT tasks */
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
root_task_group.rt_rq = (struct rt_rq **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_RT_GROUP_SCHED */
}
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
/* Initialize the global RT bandwidth control, accounted in 1 s periods:
if RT tasks on an rt_rq run for more than 950 ms within a period, the
rt_rq is throttled, and its tasks only run again once the next period
refills the budget, repeating until they finish (defaults; the
sysctl-backed helpers are shown after this listing) */
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());
/* Same pattern for the deadline class: "dl" tasks are SCHED_DEADLINE
tasks, scheduled by dl_sched_class from per-task runtime/deadline/period
parameters; def_dl_bandwidth caps their total utilization */
init_dl_bandwidth(&def_dl_bandwidth,
global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
/* set up the default root domain, including its max_cpu_capacity
tracking, which update_cpu_capacity() consumes later */
init_defrootdomain();
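/* A root_domain describes, roughly, the set of CPUs across which a
runqueue may push/pull RT and DL tasks; every rq is attached to
def_root_domain (see rq_attach_root() below) until the scheduling-domain
hierarchy is built. */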
#endif
#ifdef CONFIG_RT_GROUP_SCHED
/* per-group RT bandwidth limit for the tasks inside root_task_group */
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
/* add root_task_group to the global task_groups list and initialize its children and siblings lists */
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
/* every CPU owns exactly one rq; now initialize the rq of each possible CPU */
for_each_possible_cpu(i) {
struct rq *rq;
rq = cpu_rq(i); /* fetch this CPU's rq from per-CPU storage */
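/*
 * cpu_rq() resolves through per-CPU data; abridged from
 * kernel/sched/sched.h (4.x kernels):
 *
 *   DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 *   #define cpu_rq(cpu)  (&per_cpu(runqueues, (cpu)))
 *   #define this_rq()    this_cpu_ptr(&runqueues)
 *   #define task_rq(p)   cpu_rq(task_cpu(p))
 */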
raw_spin_lock_init(&rq->lock);
/* at init time the rq holds no runnable tasks yet */
rq->nr_running = 0;
rq->calc_load_active = 0;
/* time of the next global load-average contribution from this rq */
rq->calc_load_update = jiffies + LOAD_FREQ;
/* Initialize the cfs_rq, rt_rq and dl_rq embedded in this rq.
For the cfs_rq this sets up the RB-tree root and min_vruntime, the key
state used later to pick the next entity to run (analyzed in a later
chapter); the rest is plain initialization of cfs_rq members */
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
* In case of task-groups formed thr' the cgroup filesystem, it
* gets 100% of the cpu resources in the system. This overall
* system cpu resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
* based on each entity's (task or task-group's) weight
* (se->load.weight).
*
* In other words, if root_task_group has 10 tasks (of weight
* 1024) and two child groups A0 and A1 (of weight 1024 each),
* then A0's share of the cpu resource is:
*
* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
*
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
/* CFS bandwidth control for root_task_group; more involved than the RT case, expanded on later */
init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
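/* Per-group CFS bandwidth is configured through the cgroup knobs
cpu.cfs_period_us and cpu.cfs_quota_us; unlike the global RT limit
above, it is disabled by default (quota == RUNTIME_INF). */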
/* As we saw when analyzing struct task_group, the tasks of one group
may be running on different CPUs, hence on different cfs_rqs with
different scheduling entities; each per-CPU entry therefore records
which cfs_rq and which se it belongs to. */
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* runtime budget for this rt_rq; 950 ms by default */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
/* hook up root_task_group's per-CPU rt_rq and rt_se */
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
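/*
 * rq->cpu_load[] keeps CPU_LOAD_IDX_MAX (= 5) decayed snapshots of this
 * rq's load: index 0 tracks the instantaneous load, higher indexes
 * decay more slowly, and the load balancer picks among them to get
 * estimates of varying conservativeness.
 */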
/* timestamp of the last cpu_load[] update */
rq->last_load_update_tick = jiffies;
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
/* start with full capacity (SCHED_CAPACITY_SCALE = 1024);
cpu_capacity_orig is later rescaled from the DT-provided CPU capacities,
and cpu_capacity additionally varies at runtime because RT and IRQ
pressure is subtracted from it */
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
/* the following fields belong to load balancing: active_balance is a
flag requesting a forced push migration, and next_balance records the
jiffy at which the next periodic balance is due */
rq->balance_callback = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
rq->push_task = NULL;
/* the CPU this rq belongs to */
rq->cpu = i;
rq->online = 0;
/* timestamp of the moment this rq, i.e. this CPU, went idle */
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
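/*
 * avg_idle estimates how long this CPU typically stays idle; newidle
 * balancing is skipped when avg_idle drops below the migration cost,
 * because the CPU is expected to receive work again almost immediately.
 */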
#ifdef CONFIG_SCHED_WALT
/* WALT accounts the time spent servicing each IRQ and folds it into load */
rq->cur_irqload = 0; /* IRQ time accumulated in the current window */
rq->avg_irqload = 0; /* IRQ handling may span several windows; a decay
algorithm averages that time across windows into avg_irqload */
/* timestamp of the last IRQ entry/exit */
rq->irqload_ts = 0;
/* a flag we added ourselves, for performance reasons */
rq->is_busy = CPU_BUSY_CLR;
#endif
/* list head linking the CFS tasks on this rq */
INIT_LIST_HEAD(&rq->cfs_tasks);
/* attach the rq to the default root domain (sched domains still deserve a careful look!) */
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
#endif
#endif
/* initialize the rq's hrtick timer */
init_rq_hrtick(rq);
/* no tasks on this rq are waiting for I/O yet */
atomic_set(&rq->nr_iowait, 0);
#ifdef CONFIG_INTEL_DWS
init_intel_dws(rq);
#endif
} /* at this point every rq is fully initialized */
/* set init_task's load weight; every task's weight is derived from its priority */
set_load_weight(&init_task);
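/*
 * set_load_weight() maps static_prio to se.load.weight via the
 * sched_prio_to_weight[] table: nice 0 maps to 1024 (NICE_0_LOAD), and
 * adjacent nice levels differ by about 25% in weight, which is what
 * gives each nice step its roughly 10% CPU-time effect.
 */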
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* initialize the preempt notifier list */
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
/*
* The boot idle thread does lazy MMU switching as well:
*/
atomic_inc(&init_mm.mm_count);
enter_lazy_tlb(&init_mm, current);
/*
* During early bootup we pretend to be a normal task:
*/
/* current, which is still the init_task thread here, starts out in the fair scheduling class */
current->sched_class = &fair_sched_class;
/*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
/* Turn the current task into this CPU's idle task. The crucial detail
is that init_idle() switches its scheduling class to idle_sched_class. */
init_idle(current, smp_processor_id());
/* time of the next global load-average update */
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
/* Mark the current task on the boot CPU as its idle thread; for the
secondary CPUs, idle_threads_init() later forks an idle thread apiece. */
idle_thread_set_boot_cpu();
/* record the rq's age_stamp, i.e. when the rq came up (wall time covering both idle and running), not its accumulated runtime */
set_cpu_rq_start_time();
#endif
init_sched_fair_class();
#ifdef CONFIG_64BIT_ONLY_CPU
arch_get_64bit_only_cpus(&b64_only_cpu_mask);
#ifdef CONFIG_SCHED_COMPAT_LIMIT
/* get cpus that support AArch32 and store in compat_32bit_cpu_mask */
cpumask_andnot(&compat_32bit_cpu_mask, cpu_present_mask,
&b64_only_cpu_mask);
#endif
#endif
/* the scheduler is now up and running */
scheduler_running = 1;
}
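The 1 s / 950 ms RT budget used above comes from two sysctl knobs, kernel.sched_rt_period_us and kernel.sched_rt_runtime_us; for reference, the helpers as they appear in kernel/sched/sched.h of 4.x kernels:

static inline u64 global_rt_period(void)
{
/* sysctl_sched_rt_period defaults to 1000000 us = 1 s */
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
/* writing -1 to the sysctl disables RT throttling altogether */
if (sysctl_sched_rt_runtime < 0)
return RUNTIME_INF;
/* sysctl_sched_rt_runtime defaults to 950000 us = 950 ms */
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}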
With initialization complete, the scheduler begins its normal work, driven by:
- scheduler_tick(), invoked periodically every TICK_NSEC (see the sketch below)
- task state changes (e.g. process creation, or a task being woken up from idle)
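A heavily abridged sketch of that periodic entry point, modeled on kernel/sched/core.c of 4.x kernels (locking and statistics elided):

void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;

update_rq_clock(rq);
/* per-class tick work, e.g. vruntime update and preemption
check for a CFS task */
curr->sched_class->task_tick(rq, curr, 0);
/* feed this rq into the global load average */
calc_global_load_tick(rq);
#ifdef CONFIG_SMP
/* kick periodic (SMP) load balancing when due */
trigger_load_balance(rq);
#endif
}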
The next chapter explains how the scheduling algorithms themselves work.