Linux cgroup的整体框架

最近因为项目原因,发现对于cgroup的知识严重匮乏,所以恶补了一下cgroup的相关知识。

cgroup是指对进程进行分组,然后控制它们的cpu、io以及memory的使用,这和系统的性能息息相关。

一、首先是cgroup的整体框架图:

Linux cgroup的整体框架

以上框图可以看出以下几点:

1. cgroup的subsys分为很多种,主要有:

acct:进行CPU资源的统计

cpuset:主要用来设置进程跑在哪个核上面

cpuctrl:主要用来设置进程在CPU上的运行时间,起作用的为cpu.shares

blkio:主要用来设置不同进程的IO量占比,可以设置为权重和绝对值两种

memory:主要用来设置进程的memory占用的最高值。

2. 每个子系统下面分为多个cgroup,以Android memory cgroup为例,其层次结构如下:

Linux cgroup的整体框架

首先其hierarchy为2(具体为什么为2没搞明白,按照理解,root为第一层,system/app为第二层,uid为第三层,pid为第四层),两层的话应该为root和system/apps两层。

其他cpu分组可以分为前后台进程占用的cpu时间,或者cpu大小核分组。

3. 一个进程对应于一个css_set,css_set又对应于多个子系统

4. 每个process可以属于多个子系统,以对该process进行多种资源的管控。

二、cgroup的初始化:

cgroup的初始化主要是对cgroup子系统进行初始化,最主要的两个函数为:

cgroup_init_early------>用来初始化root cgroup,初始化init_css_set, init_css_set_link这两个全局结构,设置init进程的cgroups指针为init_css_set,同时将一些需要进行early init的subsys进行初始化

/*
 * cgroup_init_early - first-stage cgroup initialization at boot.
 *
 * Initializes the default hierarchy root (cgrp_dfl_root) and points the
 * init task's cgroups pointer at init_css_set, then walks every
 * compiled-in subsystem, assigning its id/name and early-initializing
 * those that set ->early_init.  Always returns 0.
 */
int __init cgroup_init_early(void)
{
	static struct cgroup_sb_opts __initdata opts;
	struct cgroup_subsys *ss;
	int i;

	init_cgroup_root(&cgrp_dfl_root, &opts);
	/* The root css is never destroyed; disable refcounting on it. */
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	/* The init task starts out attached to init_css_set. */
	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	for_each_subsys(ss, i) {/* iterate over every cgroup subsystem and initialize it */
		/* A subsystem must provide css_alloc/css_free and must not
		 * have pre-set name/id — those are assigned right below. */
		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
		     ss->id, ss->name);
		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

		ss->id = i;
		ss->name = cgroup_subsys_name[i];
		if (!ss->legacy_name)
			ss->legacy_name = cgroup_subsys_name[i];

		if (ss->early_init)/* if early init is requested, call cgroup_init_subsys() here */
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

cgroup_init------>用来初始化和cgroup相关的一些全局变量,同时对剩余的没有在early init时初始化的subsys进行初始化.包括添加一些subsys指定的文件节点到文件系统中,为init_css_set设置hlist指向的hash list,最后调用register_filesystem注册一个类型为cgroup的伪文件系统,并创建/proc/cgroups。

 

cat proc/cgroups,可以查看当前cgroup的基本信息:

1. 当前cgroup subsys的名字:通过cgroup_subsys->legacy_name获取

2. 包含多少层:通过cgroup_subsys->root->hierarchy_id获取

3. 包含多少个cgroups:通过cgroup_subsys->root->nr_cgrps获取

4. 哪些子系统被enabled:调用cgroup_ssid_enabled函数来实现

cgroup_init_subsys------>用来继续初始化cgroup_sysbusys结构体,同时分配并初始化一个cgroup_subsys_state结构体,并将初始化完毕的cgroup_subsys结构体赋给cgroup_subsys_state成员ss。

其中分配css结构体主要是各subsys自己注册的回调函数css_alloc

/*
 * cgroup_init_subsys - bring one cgroup subsystem online.
 * @ss:    the subsystem to initialize
 * @early: true when called from cgroup_init_early(), before the normal
 *         allocators (idr/percpu) are usable
 *
 * Allocates the subsystem's root css through its own css_alloc()
 * callback, links it into the default hierarchy root, publishes it in
 * init_css_set, records which optional callbacks (fork/exit/free/
 * can_fork) the subsystem provides, and finally brings the css online.
 * Failures here are fatal (BUG_ON) — boot cannot proceed without them.
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	pr_debug("Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	/* Create the root cgroup state for this subsystem */
	ss->root = &cgrp_dfl_root;
	/* Allocate the css via the subsystem's registered callback
	 * (e.g. mem_cgroup_css_alloc for the memory controller). */
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);

	/*
	 * Root csses are never destroyed and we can't initialize
	 * percpu_ref during early init.  Disable refcnting.
	 */
	css->flags |= CSS_NO_REF;

	if (early) {
		/* allocation can't be done safely during early init */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's root cgroup. */
	init_css_set.subsys[ss->id] = css;

	/* Remember, per subsystem id, which optional hooks exist so the
	 * fork/exit paths can skip subsystems that don't provide them. */
	have_fork_callback |= (bool)ss->fork << ss->id;
	have_exit_callback |= (bool)ss->exit << ss->id;
	have_free_callback |= (bool)ss->free << ss->id;
	have_canfork_callback |= (bool)ss->can_fork << ss->id;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}

以memory cgroup为例:

主要分配一个memcg结构体,同时初始化此结构体相关参数,然后返回memcg成员css。

/*
 * mem_cgroup_css_alloc - css_alloc callback for the memory controller.
 * @parent_css: css of the parent cgroup, or the root css for the first call
 *
 * Allocates and initializes a mem_cgroup, inheriting swappiness and
 * oom_kill_disable from the parent when one exists, and chaining the
 * page counters to the parent's when use_hierarchy is set.  Returns the
 * embedded css on success, or an ERR_PTR on allocation failure.
 */
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
	struct mem_cgroup *memcg;
	long error = -ENOMEM;

	memcg = mem_cgroup_alloc();
	if (!memcg)
		return ERR_PTR(error);

	/* Start with no high/soft limit configured. */
	memcg->high = PAGE_COUNTER_MAX;
	memcg->soft_limit = PAGE_COUNTER_MAX;
	if (parent) {
		/* Non-root groups inherit these tunables from the parent. */
		memcg->swappiness = mem_cgroup_swappiness(parent);
		memcg->oom_kill_disable = parent->oom_kill_disable;
	}
	if (parent && parent->use_hierarchy) {
		/* Hierarchical accounting: counters charge up the tree. */
		memcg->use_hierarchy = true;
		page_counter_init(&memcg->memory, &parent->memory);
		page_counter_init(&memcg->swap, &parent->swap);
		page_counter_init(&memcg->memsw, &parent->memsw);
		page_counter_init(&memcg->kmem, &parent->kmem);
		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
	} else {
		/* Flat accounting: counters stand alone, no parent link. */
		page_counter_init(&memcg->memory, NULL);
		page_counter_init(&memcg->swap, NULL);
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
		page_counter_init(&memcg->tcpmem, NULL);
		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense so let cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
		if (parent != root_mem_cgroup)
			memory_cgrp_subsys.broken_hierarchy = true;
	}

	/* The following stuff does not apply to the root */
	if (!parent) {
		root_mem_cgroup = memcg;
		return &memcg->css;
	}

	error = memcg_online_kmem(memcg);
	if (error)
		goto fail;

	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
		static_branch_inc(&memcg_sockets_enabled_key);

	return &memcg->css;
fail:
	mem_cgroup_free(memcg);
	return ERR_PTR(-ENOMEM);
}

以上为cgroup的整体框架。

 

 

 

上一篇:关于cgroup的几个核心名词及其关系


下一篇:Docker容器实现原理及容器隔离性踩坑介绍