Linux Kernel Internals: ZONE

struct zone

From the discussion of Linux's three memory models we know that the kernel divides physical memory into different ZONE regions according to how the memory is actually used. Zones play a central role in physical memory management; the corresponding kernel structure is struct zone, which looks as follows in version 5.8.10:


struct zone {
	/* Read-mostly fields */

	/* zone watermarks, access with *_wmark_pages(zone) macros */
	unsigned long _watermark[NR_WMARK];
	unsigned long watermark_boost;

	unsigned long nr_reserved_highatomic;

	/*
	 * We don't know if the memory that we're going to allocate will be
	 * freeable or/and it will be released eventually, so to avoid totally
	 * wasting several GB of ram we must reserve some of the lower zone
	 * memory (otherwise we risk to run OOM on the lower zones despite
	 * there being tons of freeable ram on the higher zones).  This array is
	 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
	 * changes.
	 */
	long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
	int node;
#endif
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pageset __percpu *pageset;

#ifndef CONFIG_SPARSEMEM
	/*
	 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
	 * In SPARSEMEM, this map is stored in struct mem_section
	 */
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
	unsigned long		zone_start_pfn;

	/*
	 * spanned_pages is the total pages spanned by the zone, including
	 * holes, which is calculated as:
	 * 	spanned_pages = zone_end_pfn - zone_start_pfn;
	 *
	 * present_pages is physical pages existing within the zone, which
	 * is calculated as:
	 *	present_pages = spanned_pages - absent_pages(pages in holes);
	 *
	 * managed_pages is present pages managed by the buddy system, which
	 * is calculated as (reserved_pages includes pages allocated by the
	 * bootmem allocator):
	 *	managed_pages = present_pages - reserved_pages;
	 *
	 * So present_pages may be used by memory hotplug or memory power
	 * management logic to figure out unmanaged pages by checking
	 * (present_pages - managed_pages). And managed_pages should be used
	 * by page allocator and vm scanner to calculate all kinds of watermarks
	 * and thresholds.
	 *
	 * Locking rules:
	 *
	 * zone_start_pfn and spanned_pages are protected by span_seqlock.
	 * It is a seqlock because it has to be read outside of zone->lock,
	 * and it is done in the main allocator path.  But, it is written
	 * quite infrequently.
	 *
	 * The span_seq lock is declared along with zone->lock because it is
	 * frequently read in proximity to zone->lock.  It's good to
	 * give them a chance of being in the same cacheline.
	 *
	 * Write access to present_pages at runtime should be protected by
	 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
	 * present_pages should get_online_mems() to get a stable value.
	 */
	atomic_long_t		managed_pages;
	unsigned long		spanned_pages;
	unsigned long		present_pages;

	const char		*name;

#ifdef CONFIG_MEMORY_ISOLATION
	/*
	 * Number of isolated pageblock. It is used to solve incorrect
	 * freepage counting problem due to racy retrieving migratetype
	 * of pageblock. Protected by zone->lock.
	 */
	unsigned long		nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif

	int initialized;

	/* Write-intensive fields used from the page allocator */
	ZONE_PADDING(_pad1_)

	/* free areas of different sizes */
	struct free_area	free_area[MAX_ORDER];

	/* zone flags, see below */
	unsigned long		flags;

	/* Primarily protects free_area */
	spinlock_t		lock;

	/* Write-intensive fields used by compaction and vmstats. */
	ZONE_PADDING(_pad2_)

	/*
	 * When free pages are below this point, additional steps are taken
	 * when reading the number of free pages to avoid per-cpu counter
	 * drift allowing watermarks to be breached
	 */
	unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* pfn where compaction free scanner should start */
	unsigned long		compact_cached_free_pfn;
	/* pfn where async and sync compaction migration scanner should start */
	unsigned long		compact_cached_migrate_pfn[2];
	unsigned long		compact_init_migrate_pfn;
	unsigned long		compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
	/*
	 * On compaction failure, 1<<compact_defer_shift compactions
	 * are skipped before trying again. The number attempted since
	 * last failure is tracked with compact_considered.
	 */
	unsigned int		compact_considered;
	unsigned int		compact_defer_shift;
	int			compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
	/* Set to true when the PG_migrate_skip bits should be cleared */
	bool			compact_blockskip_flush;
#endif

	bool			contiguous;

	ZONE_PADDING(_pad3_)
	/* Zone statistics */
	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;

The members of this structure hold the information a zone needs to manage its physical memory. The main members are:

  • unsigned long _watermark[NR_WMARK]: the zone's watermarks. As used physical memory crosses the different watermarks, the kernel takes different actions, such as reclaiming memory to free up enough physical space, possibly culminating in an OOM kill (a minimal sketch of how these are checked follows this list)
  • long lowmem_reserve[MAX_NR_ZONES]: physical memory reserved in this lower zone for each higher zone, so that fallback allocations cannot exhaust it
  • struct pglist_data *zone_pgdat: the pglist_data node this zone belongs to
  • unsigned long zone_start_pfn: the first physical pfn of this zone
  • atomic_long_t managed_pages: the physical pages of this zone managed by the buddy allocator
  • unsigned long spanned_pages: spanned_pages = zone_end_pfn - zone_start_pfn; holes, if any, are included
  • unsigned long present_pages: present_pages = spanned_pages - absent_pages (pages in holes), i.e. the number of physical pages actually present in the zone
  • const char *name: zone name
  • struct free_area free_area[MAX_ORDER]: the buddy allocator's free lists, one per order
  • unsigned long flags: zone flags
  • atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]: per-zone memory statistics, broken down by usage state
  • atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]: NUMA-related statistics for this zone
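As a quick illustration of how the watermark fields are consumed, below is a minimal sketch (not kernel source; the helper name is made up) of the kind of check the allocator and kswapd perform through the *_wmark_pages() accessors mentioned in the struct comment:

/* Minimal sketch, assuming the v5.8 helpers: low_wmark_pages(z) expands
 * to z->_watermark[WMARK_LOW] + z->watermark_boost, and
 * zone_page_state() reads a vm_stat counter. */
static bool zone_below_low_wmark(struct zone *zone)
{
	unsigned long free_pages = zone_page_state(zone, NR_FREE_PAGES);

	return free_pages < low_wmark_pages(zone);
}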

zone_type

The kernel divides zones into different types according to their intended use. They are defined in include/linux/mmzone.h (an abridged version of the enum is shown after the list below):

  • ZONE_DMA: exists mainly for compatibility with ISA devices, whose DMA can only address memory below 16MB; that range is carved out and managed separately.
  • ZONE_DMA32: for devices whose DMA can address 32 bits. The 16MB of ZONE_DMA is too small, and many devices can reach the full 32-bit range, so on 64-bit systems ZONE_DMA32 covers physical memory below 4GB to satisfy DMA limited to 32-bit addressing. The origin of this zone is described in detail below.
  • ZONE_NORMAL: normally usable physical memory; most allocations are served from this zone.
  • ZONE_HIGHMEM: exists only on 32-bit systems, where the kernel can directly map at most 896MB of physical memory. To support machines with more than 896MB, memory above that limit becomes high memory, which is not identity-mapped permanently but mapped on demand when used. On 64-bit systems the address space is large enough, so ZONE_HIGHMEM is not needed.
  • ZONE_MOVABLE: the movable/reclaimable region, usually called a pseudo zone because the physical memory it manages comes from ZONE_NORMAL or ZONE_HIGHMEM. Its main purposes are limiting memory fragmentation and supporting hotplug; the kernel regroups the movable pages of ZONE_NORMAL or ZONE_HIGHMEM into ZONE_MOVABLE so they are easy to find.
  • ZONE_DEVICE: device memory, hot-pluggable.
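For reference, the enum itself (abridged from include/linux/mmzone.h in v5.8, with the long comments removed) shows which zones are compile-time optional:

enum zone_type {
#ifdef CONFIG_ZONE_DMA
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	ZONE_DMA32,
#endif
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	ZONE_HIGHMEM,
#endif
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	__MAX_NR_ZONES
};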

History of ZONE_DMA32

ZONE_DMA32 has quite a history, and knowing where it came from matters a great deal for understanding the overall physical memory design. LWN has an article describing the origin of this zone (see the references at the end):

Linux systems typically divide main memory into three zones. Most memory fits into the "normal" zone, ZONE_NORMAL. At the low end, however, there are 16MB of memory which are partitioned into the DMA zone ZONE_DMA; this memory is then reserved for situations where it is specifically needed. The most common user of DMA memory is older peripherals which can only address 24 bits of memory. Finally, on the high end, ZONE_HIGHMEM contains all memory which cannot be directly addressed by the kernel.

Not all systems implement all of these zones. Some newer architectures do not support ancient peripherals and leave out ZONE_DMA. In general, 64-bit systems have no addressing problems and do not need ZONE_HIGHMEM. The ia64 architecture settled on a different implementation of ZONE_DMA, defining it to cover all memory addressed below 4GB.

As it turns out, there are uses for a 4GB zone. Quite a few devices have trouble accessing memory which cannot be addressed with 32 bits. Drivers for such devices have been forced to use ZONE_DMA, the I/O memory management unit (on systems which have one), or bounce buffers. None of those solutions is ideal: ZONE_DMA is a small and scarce resource, IOMMU space can also be scarce, and bounce buffers are slow. All of these problems could be avoided if DMA memory could be reliably allocated below the 4GB boundary.

Andi Kleen has decided that the time has come for the x86-64 architecture to support a 32-bit DMA zone. So his patch adds a new zone (ZONE_DMA32) and an associated GFP flag (GFP_DMA32) for allocations. According to Andi, the reason which prevented the addition of this zone in the first place (the fact that the virtual memory subsystem had a very hard time balancing memory between zones) has gone away. Meanwhile, the lack of this zone is causing real problems.

On early devices, DMA could address at most 16MB (24 bits), so the kernel had to reserve that range especially for them. On a 64-bit system, however, confining all DMA memory to 16MB makes it scarce and insufficient, even though many devices, particularly in IOMMU setups, can address the full 32-bit (4GB) range. To solve this, Andi Kleen introduced the new ZONE_DMA32 region covering physical memory below 4GB: it satisfies DMA limited to 32-bit addressing while relieving the pressure on ZONE_DMA.
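To see the zone from a driver's point of view, here is a minimal, hypothetical sketch of asking the buddy allocator for a page guaranteed to lie below 4GB (in real drivers the DMA API, e.g. dma_set_mask() with DMA_BIT_MASK(32), is the preferred interface):

/* Minimal sketch (the wrapper is hypothetical): GFP_DMA32 steers the
 * allocation into ZONE_DMA32, i.e. physical memory below 4GB. */
static struct page *alloc_dma32_page(void)
{
	return alloc_page(GFP_KERNEL | GFP_DMA32);
}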

Origin of ZONE_MOVABLE

The Linux kernel divides memory into zones, and allocation and freeing within each zone are handled by the buddy allocator. The buddy allocator has one big weakness: after a system has been allocating and freeing memory for a long time, physical memory ends up with many fragments (memory fragmentation). A request for a large contiguous physical range can then fail even though enough total free memory exists, simply because fragmentation has left no contiguous run of that size. This problem plagued the kernel community for a long time, until Mel Gorman proposed a rather ingenious anti-fragmentation scheme that was accepted:

Mel Gorman's fragmentation avoidance patches have been discussed here a few times in the past. The core idea behind Mel's work is to identify pages which can be easily moved or reclaimed and group them together. Movable pages include those allocated to user space; moving them is just a matter of changing the relevant page table entries. Reclaimable pages include kernel caches which can be released should the need arise. Grouping these pages together makes it easy for the kernel to free large blocks of memory, which is useful for enabling high-order allocations or for vacating regions of memory entirely.

Mel Gorman's scheme classifies physical pages as movable, reclaimable, or unmovable, and regroups the movable and reclaimable pages into a new zone, ZONE_MOVABLE (which is why ZONE_MOVABLE is usually called a pseudo zone). When the kernel needs a large contiguous allocation and memory is tight, movable and reclaimable pages are migrated or reclaimed to squeeze out enough contiguous memory, and the contents of migrated pages are moved onto freshly allocated ones, so the owner of those pages notices nothing.
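The grouping is driven by GFP flags at allocation time: pages allocated with __GFP_MOVABLE are eligible for ZONE_MOVABLE and for later migration. A minimal sketch (the wrapper function is hypothetical):

/* User-visible pages (anonymous memory, page cache) are allocated with
 * __GFP_MOVABLE set, so they may land in ZONE_MOVABLE and can be
 * migrated later to defragment memory.
 * GFP_HIGHUSER_MOVABLE == GFP_HIGHUSER | __GFP_MOVABLE */
static struct page *alloc_movable_user_page(void)
{
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}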

ZONE_MOVABLE serves two important purposes:

  • it effectively limits memory fragmentation
  • it supports memory hotplug. This matters most in virtualization: when a guest does not need all of its physical memory, part of it can be released for other programs and plugged back in when new allocations require it (note that, as quoted below, Linus was in fact opposed to this idea). In some scenarios, unused banks of memory can also be powered off to save energy.

In particular, Linus is opposed to the idea. The biggest potential use for hot-unplugging is for virtualization; it allows a hypervisor to move memory resources between guests as their needs change. Linus points out that most virtualization mechanisms already have mechanisms which allow the addition and removal of individual pages from guests; there is, he says, no need for any other support for memory changes.

Another use for this technique is allowing systems to conserve power by turning off banks of memory when they are not needed. Clearly, one must be able to move all useful data out of a memory bank before powering it down. Linus is even more dismissive of this idea:

The whole DRAM power story is a bedtime story for gullible children. Don't fall for it. It's not realistic. The hardware support for it DOES NOT EXIST today, and probably won't for several years. And the real fix is elsewhere anyway...

Zone size allocation

The amount of physical memory each zone type manages is determined at system initialization. The following walks through the sizing process, focusing on the x86 platform.

zone_sizes_init()

zone_sizes_init() is the entry point for assigning zone sizes; it lives in arch/x86/mm/init.c:

void __init zone_sizes_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
#endif
	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
#ifdef CONFIG_HIGHMEM
	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
#endif

	free_area_init(max_zone_pfns);
}
  • The maximum physical pfn of ZONE_DMA, ZONE_DMA32 and ZONE_NORMAL is recorded in the max_zone_pfns array: ZONE_DMA may not exceed 16MB, ZONE_DMA32 may not exceed 4GB, and max_low_pfn is the highest physical page of ZONE_NORMAL (the two caps are shown after this list)
  • free_area_init(): performs zone initialization; the argument holds each zone's size (note that ZONE_MOVABLE has no size yet at this point)
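The two caps come from the x86 headers; as of v5.8 they are defined in arch/x86/include/asm/dma.h as the 16MB and 4GB boundaries expressed as page frame numbers:

#define MAX_DMA_PFN   ((16UL * 1024 * 1024) >> PAGE_SHIFT)	/* 16MB */
#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT))		/* 4GB */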

free_area_init()

This function initializes each zone from the max_zone_pfn array:

void __init free_area_init(unsigned long *max_zone_pfn)
{
	unsigned long start_pfn, end_pfn;
	int i, nid, zone;
	bool descending;

	/* Record where the zone boundaries are */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	start_pfn = find_min_pfn_with_active_regions();
	descending = arch_has_descending_max_zone_pfns();

	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (descending)
			zone = MAX_NR_ZONES - i - 1;
		else
			zone = i;

		if (zone == ZONE_MOVABLE)
			continue;

		end_pfn = max(max_zone_pfn[zone], start_pfn);
		arch_zone_lowest_possible_pfn[zone] = start_pfn;
		arch_zone_highest_possible_pfn[zone] = end_pfn;

		start_pfn = end_pfn;
	}

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes();

	/* Print out the zone ranges */
	pr_info("Zone ranges:\n");
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (i == ZONE_MOVABLE)
			continue;
		pr_info("  %-8s ", zone_names[i]);
		if (arch_zone_lowest_possible_pfn[i] ==
				arch_zone_highest_possible_pfn[i])
			pr_cont("empty\n");
		else
			pr_cont("[mem %#018Lx-%#018Lx]\n",
				(u64)arch_zone_lowest_possible_pfn[i]
					<< PAGE_SHIFT,
				((u64)arch_zone_highest_possible_pfn[i]
					<< PAGE_SHIFT) - 1);
	}

	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
	pr_info("Movable zone start for each node\n");
	for (i = 0; i < MAX_NUMNODES; i++) {
		if (zone_movable_pfn[i])
			pr_info("  Node %d: %#018Lx\n", i,
			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
	}

	/*
	 * Print out the early node map, and initialize the
	 * subsection-map relative to active online memory ranges to
	 * enable future "sub-section" extensions of the memory map.
	 */
	pr_info("Early memory node ranges\n");
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
		pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
			(u64)start_pfn << PAGE_SHIFT,
			((u64)end_pfn << PAGE_SHIFT) - 1);
		subsection_map_init(start_pfn, end_pfn - start_pfn);
	}

	/* Initialise every node */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	init_unavailable_mem();
	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);
		free_area_init_node(nid);

		/* Any memory on that node */
		if (pgdat->node_present_pages)
			node_set_state(nid, N_MEMORY);
		check_for_memory(pgdat, nid);
	}
}

The main steps are as follows:

  • find_min_pfn_with_active_regions() returns the base of the first memblock region, which becomes the starting pfn of the zones
  • arch_has_descending_max_zone_pfns: architecture-specific; says whether the zones cover physical addresses in ascending or descending order
  • From the max_zone_pfn array and the actual start pfn, a preliminary zone layout is computed: arch_zone_lowest_possible_pfn holds each zone's starting pfn, arch_zone_highest_possible_pfn its ending pfn
  • find_zone_movable_pfns_for_nodes: based on the configuration and the actual physical memory, determines for every node the pfn where movable memory (taken from ZONE_NORMAL) begins and stores it in the zone_movable_pfn array, to be organized into ZONE_MOVABLE later
  • Print each zone's arch_zone_lowest_possible_pfn and arch_zone_highest_possible_pfn range; at this point there is no ZONE_MOVABLE information yet
  • Then print the zone_movable_pfn array, i.e. the ZONE_MOVABLE information
  • Print the detailed physical memory ranges recorded in memblock, which is handy when reading boot logs (a sample appears after this list)
  • mminit_verify_pageflags_layout: verifies the page-flags layout
  • setup_nr_node_ids: on a NUMA system, computes the number of possible node ids
  • init_unavailable_mem: initializes physical memory covered by neither memblock.memory nor memblock.reserved, as well as memblock.reserved ranges
  • free_area_init_node: initializes the physical memory information of every node
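For reference, the pr_info() calls above produce boot-log output of the following shape (the values are illustrative, from a hypothetical 16GB x86-64 machine):

Zone ranges:
  DMA      [mem 0x0000000000001000-0x0000000000ffffff]
  DMA32    [mem 0x0000000001000000-0x00000000ffffffff]
  Normal   [mem 0x0000000100000000-0x000000043fffffff]
Movable zone start for each node
Early memory node ranges
  node   0: [mem 0x0000000000001000-0x000000000009cfff]
  node   0: [mem 0x0000000000100000-0x00000000bffdffff]
  node   0: [mem 0x0000000100000000-0x000000043fffffff]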

find_zone_movable_pfns_for_nodes()

Based on the configuration, this function carves the movable portion out of the zones that already exist. The logic is a bit involved:

static void __init find_zone_movable_pfns_for_nodes(void)
{
	int i, nid;
	unsigned long usable_startpfn;
	unsigned long kernelcore_node, kernelcore_remaining;
	/* save the state before borrow the nodemask */
	nodemask_t saved_node_state = node_states[N_MEMORY];
	unsigned long totalpages = early_calculate_totalpages();
	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
	struct memblock_region *r;

	/* Need to find movable_zone earlier when movable_node is specified. */
	find_usable_zone_for_movable();

	/*
	 * If movable_node is specified, ignore kernelcore and movablecore
	 * options.
	 */
	if (movable_node_is_enabled()) {
		for_each_memblock(memory, r) {
			if (!memblock_is_hotpluggable(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = PFN_DOWN(r->base);
			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		goto out2;
	}

	/*
	 * If kernelcore=mirror is specified, ignore movablecore option
	 */
	if (mirrored_kernelcore) {
		bool mem_below_4gb_not_mirrored = false;

		for_each_memblock(memory, r) {
			if (memblock_is_mirror(r))
				continue;

			nid = memblock_get_region_node(r);

			usable_startpfn = memblock_region_memory_base_pfn(r);

			if (usable_startpfn < 0x100000) {
				mem_below_4gb_not_mirrored = true;
				continue;
			}

			zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				min(usable_startpfn, zone_movable_pfn[nid]) :
				usable_startpfn;
		}

		if (mem_below_4gb_not_mirrored)
			pr_warn("This configuration results in unmirrored kernel memory.\n");

		goto out2;
	}

	/*
	 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
	 * amount of necessary memory.
	 */
	if (required_kernelcore_percent)
		required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				       10000UL;
	if (required_movablecore_percent)
		required_movablecore = (totalpages * 100 * required_movablecore_percent) /
					10000UL;

	/*
	 * If movablecore= was specified, calculate what size of
	 * kernelcore that corresponds so that memory usable for
	 * any allocation type is evenly spread. If both kernelcore
	 * and movablecore are specified, then the value of kernelcore
	 * will be used for required_kernelcore if it's greater than
	 * what movablecore would have allowed.
	 */
	if (required_movablecore) {
		unsigned long corepages;

		/*
		 * Round-up so that ZONE_MOVABLE is at least as large as what
		 * was requested by the user
		 */
		required_movablecore =
			roundup(required_movablecore, MAX_ORDER_NR_PAGES);
		required_movablecore = min(totalpages, required_movablecore);
		corepages = totalpages - required_movablecore;

		required_kernelcore = max(required_kernelcore, corepages);
	}

	/*
	 * If kernelcore was not specified or kernelcore size is larger
	 * than totalpages, there is no ZONE_MOVABLE.
	 */
	if (!required_kernelcore || required_kernelcore >= totalpages)
		goto out;

	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];

restart:
	/* Spread kernelcore memory as evenly as possible throughout nodes */
	kernelcore_node = required_kernelcore / usable_nodes;
	for_each_node_state(nid, N_MEMORY) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Recalculate kernelcore_node if the division per node
		 * now exceeds what is necessary to satisfy the requested
		 * amount of memory for the kernel
		 */
		if (required_kernelcore < kernelcore_node)
			kernelcore_node = required_kernelcore / usable_nodes;

		/*
		 * As the map is walked, we track how much memory is usable
		 * by the kernel using kernelcore_remaining. When it is
		 * 0, the rest of the node is usable by ZONE_MOVABLE
		 */
		kernelcore_remaining = kernelcore_node;

		/* Go through each range of PFNs within this node */
		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			unsigned long size_pages;

			start_pfn = max(start_pfn, zone_movable_pfn[nid]);
			if (start_pfn >= end_pfn)
				continue;

			/* Account for what is only usable for kernelcore */
			if (start_pfn < usable_startpfn) {
				unsigned long kernel_pages;
				kernel_pages = min(end_pfn, usable_startpfn)
								- start_pfn;

				kernelcore_remaining -= min(kernel_pages,
							kernelcore_remaining);
				required_kernelcore -= min(kernel_pages,
							required_kernelcore);

				/* Continue if range is now fully accounted */
				if (end_pfn <= usable_startpfn) {

					/*
					 * Push zone_movable_pfn to the end so
					 * that if we have to rebalance
					 * kernelcore across nodes, we will
					 * not double account here
					 */
					zone_movable_pfn[nid] = end_pfn;
					continue;
				}
				start_pfn = usable_startpfn;
			}

			/*
			 * The usable PFN range for ZONE_MOVABLE is from
			 * start_pfn->end_pfn. Calculate size_pages as the
			 * number of pages used as kernelcore
			 */
			size_pages = end_pfn - start_pfn;
			if (size_pages > kernelcore_remaining)
				size_pages = kernelcore_remaining;
			zone_movable_pfn[nid] = start_pfn + size_pages;

			/*
			 * Some kernelcore has been met, update counts and
			 * break if the kernelcore for this node has been
			 * satisfied
			 */
			required_kernelcore -= min(required_kernelcore,
								size_pages);
			kernelcore_remaining -= size_pages;
			if (!kernelcore_remaining)
				break;
		}
	}

	/*
	 * If there is still required_kernelcore, we do another pass with one
	 * less node in the count. This will push zone_movable_pfn[nid] further
	 * along on the nodes that still have memory until kernelcore is
	 * satisfied
	 */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > usable_nodes)
		goto restart;

out2:
	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		zone_movable_pfn[nid] =
			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);

out:
	/* restore the node_state */
	node_states[N_MEMORY] = saved_node_state;
}

Roughly, the function proceeds as follows:

  • early_calculate_totalpages(): computes totalpages, the total number of physical pages, from memblock
  • usable_nodes = nodes_weight(node_states[N_MEMORY]): the number of nodes in the system that have memory (NUMA case)
  • find_usable_zone_for_movable: finds the zone index that can donate memory to ZONE_MOVABLE; on 64-bit systems this is ZONE_NORMAL, on 32-bit systems preferably ZONE_HIGHMEM
  • movable_node_is_enabled: checks whether a movable node is configured. A node can be dedicated as movable, meaning all of its physical memory is movable and the node supports hotplug; this is requested before boot with the movable_node command-line parameter. If it is configured, the system does not carve ZONE_MOVABLE out of the memory of other nodes; if not, processing continues below
  • If kernelcore=mirror is given on the command line, movablecore is ignored and non-mirrored memory above pfn 0x100000 (4GB) is made movable. kernelcore tells the system how much memory must be non-movable
  • If kernelcore is given as a percentage, required_kernelcore, the amount of memory to keep non-movable, is computed from the actual totalpages (see the worked example after this list)
  • If movablecore is given as a percentage, required_movablecore, the amount of movable memory, is computed the same way
  • If both kernelcore and movablecore are configured, required_kernelcore is honored first and only the remainder becomes required_movablecore; computed this way, kernelcore + movablecore can never exceed 100%
  • kernelcore_node = required_kernelcore / usable_nodes: the non-movable memory is spread evenly across the nodes
  • for_each_node_state(nid, N_MEMORY): walks every node and splits it into a movable part and a non-movable part
  • for_each_mem_pfn_range: walks each node's memblock ranges and subtracts whatever is assigned as non-movable from required_kernelcore
  • Once a node's non-movable share has been carved out and pages remain, the remainder becomes the movable part, and its starting pfn is recorded in the zone_movable_pfn array
  • Before moving on to the next node, required_kernelcore < kernelcore_node is checked; if it holds, kernelcore_node is recomputed
  • After all nodes have been walked, zone_movable_pfn holds, per node, exactly where the movable physical memory begins
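A worked example of the percentage arithmetic above, using hypothetical numbers: a 4GB machine has totalpages = 4GB / 4KB = 1048576 pages. Booting with kernelcore=25% sets required_kernelcore_percent = 25, so

	required_kernelcore = (1048576 * 100 * 25) / 10000 = 262144 pages (1GB)

and roughly the remaining 3GB becomes eligible for ZONE_MOVABLE, after rounding to MAX_ORDER_NR_PAGES and spreading across the nodes.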

Related cmdline boot parameters

The command line passes boot-time parameters to the kernel so it can be configured as needed. The relevant parameters are documented in Documentation/admin-guide/kernel-parameters.txt.

movable_node

Makes the physical memory of entire hotpluggable NUMA nodes movable:

[KNL] Boot-time switch to make hotplugable memory NUMA nodes to be movable. This means that the memory of such nodes will be usable only for movable allocations which rules out almost all kernel allocations. Use with caution!

kernelcore

Sets the amount of non-movable memory in the system; an exact size, a percentage (nn%), or "mirror" may be given:

Format: nn[KMGTPE] | nn% | "mirror"
This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested amount is spread evenly throughout all nodes in the system as ZONE_NORMAL. The remaining memory is used for movable memory in its own zone, ZONE_MOVABLE. In the event, a node is too small to have both ZONE_NORMAL and ZONE_MOVABLE, kernelcore memory will take priority and other nodes will have a larger ZONE_MOVABLE.

ZONE_MOVABLE is used for the allocation of pages that may be reclaimed or moved by the page migration subsystem. Note that allocations like PTEs-from-HighMem still use the HighMem zone if it exists, and the Normal zone if it does not.

It is possible to specify the exact amount of memory in the form of "nn[KMGTPE]", a percentage of total system memory in the form of "nn%", or "mirror". If "mirror" option is specified, mirrored (reliable) memory is used for non-movable allocations and remaining memory is used for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms.

movablecore

Sets the amount of movable memory in the system; an exact size or a percentage may be given:

Format: nn[KMGTPE] | nn%
This parameter is the complement to kernelcore=, it specifies the amount of memory used for migratable allocations. If both kernelcore and movablecore is specified, then kernelcore will be at *least* the specified value but may be more. If movablecore on its own is specified, the administrator must be careful that the amount of memory usable for all allocations is not too small.
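Putting these parameters together, a few illustrative (hypothetical) boot lines:

# keep 25% of RAM non-movable, spread over all nodes; the rest
# becomes ZONE_MOVABLE
linux ... kernelcore=25%

# reserve exactly 2GB for movable allocations
linux ... movablecore=2G

# make every hotpluggable NUMA node fully movable (kernelcore and
# movablecore are then ignored)
linux ... movable_node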

References

https://lwn.net/Articles/152462/

https://lwn.net/Articles/224829/

https://lwn.net/Articles/843326/
