balance_pgdat()函数是kswapd回收页面的主函数。这个函数比较长,先看整体框架:函数主体是一个很长的do-while循环。
代码如下:
/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * @pgdat: the memory node whose zones are to be balanced.
 * @order: allocation order kswapd is reclaiming for; may be dropped to 0
 *         inside the loop (see the "2UL << order" check below).
 * @classzone_idx: in: zone index used for the low-watermark compaction check
 *         and for pgdat_balanced(); out: overwritten with end_zone, the
 *         highest zone index that was found to need reclaim.
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim. Mark the zone as
 * dead and from now on, only perform a short scan. Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction. It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
/*
 * struct scan_control carries the reclaim parameters and results: the
 * number of pages to reclaim (nr_to_reclaim), the allocation mask
 * (gfp_mask), the allocation order (2^order pages) and the LRU scan
 * priority. The priority starts at DEF_PRIORITY and is decremented by the
 * loop below; per the struct's own comment a pass scans
 * (total_size >> priority) pages, so a numerically lower priority means a
 * larger scan.
 */
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
};
count_vm_event(PAGEOUTRUN);
/*
 * The do-while loop is the core of the reclaim pass. It has three parts:
 * (1) find end_zone, (2) reclaim from zone 0 up to end_zone, (3) lower
 * the priority value and re-check whether the node is balanced.
 */
do {
unsigned long nr_attempted = 0;
bool raise_priority = true;
bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
/*
 * Scan in the highmem->dma direction for the highest
 * zone which needs scanning
 */
/*
 * (1) Walk from the highest zone index down to 0 and stop at the first
 * zone that zone_balanced() reports as not balanced (zone_balanced()
 * checks against the high watermark). That index is saved in end_zone.
 */
for (i = pgdat->nr_zones - 1; i >= 0; i--) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
/*
 * Once the priority has moved away from DEF_PRIORITY, zones that
 * zone_reclaimable() rejects are skipped.
 */
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
/*
 * Do some background aging of the anon list, to give
 * pages a chance to be referenced before reclaiming.
 */
age_active_anon(zone, &sc);
/*
 * If the number of buffer_heads in the machine
 * exceeds the maximum allowed level and this node
 * has a highmem zone, force kswapd to reclaim from
 * it to relieve lowmem pressure.
 */
if (buffer_heads_over_limit && is_highmem_idx(i)) {
end_zone = i;
break;
}
/* Is this zone's free memory above the high watermark? */
if (!zone_balanced(zone, order, 0, 0)) {
end_zone = i;
break;
} else {
/*
 * If balanced, clear the dirty and congested
 * flags
 */
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
}
/* The loop ran to completion: no zone needs reclaim. */
if (i < 0)
goto out;
/*
 * (2) Walk from zone 0 up to end_zone to decide whether compaction is
 * still needed. Only relevant when order > 0: as soon as one populated
 * zone passes zone_watermark_ok() at its low watermark for this order,
 * compaction is considered unnecessary.
 */
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
/* Skip zones that populated_zone() reports as empty. */
if (!populated_zone(zone))
continue;
/*
 * If any zone is currently balanced then kswapd will
 * not call compaction as it is expected that the
 * necessary pages are already available.
 */
if (pgdat_needs_compaction &&
zone_watermark_ok(zone, order,
low_wmark_pages(zone),
*classzone_idx, 0))
pgdat_needs_compaction = false;
}
/*
 * If we're getting trouble reclaiming, start doing writepage
 * even in laptop mode.
 */
if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/*
 * Now scan the zone in the dma->highmem direction, stopping
 * at the last zone which needs scanning.
 *
 * We do this because the page allocator works in the opposite
 * direction. This prevents the page allocator from allocating
 * pages behind kswapd's direction of progress, which would
 * cause too much scanning of the lower zones.
 */
/*
 * NOTE(review): the original annotation additionally argues that walking
 * in the opposite direction to the allocator reduces contention on
 * zone->lru_lock between direct reclaim and kswapd. That is not visible
 * in this function; confirm before relying on it.
 */
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
if (sc.priority != DEF_PRIORITY &&
!zone_reclaimable(zone))
continue;
sc.nr_scanned = 0;
nr_soft_scanned = 0;
/*
 * Call soft limit reclaim before calling shrink_zone.
 */
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
order, sc.gfp_mask,
&nr_soft_scanned);
sc.nr_reclaimed += nr_soft_reclaimed;
/*
 * There should be no need to raise the scanning
 * priority if enough pages are already being scanned
 * that the high watermark would be met at 100%
 * efficiency.
 */
/*
 * kswapd_shrink_zone() does the actual scanning and reclaim; its
 * parameters and results live in sc. It returns true when the zone was
 * already balanced or when at least sc.nr_to_reclaim pages were
 * scanned, in which case the priority does not need to be raised.
 */
if (kswapd_shrink_zone(zone, end_zone,
&sc, &nr_attempted))
raise_priority = false;
}
/*
 * If the low watermark is met there is no need for processes
 * to be throttled on pfmemalloc_wait as they should now be
 * able to safely make forward progress. Wake them
 */
/*
 * Tasks may be sleeping on pgdat->pfmemalloc_wait. After this reclaim
 * round they are woken only if pfmemalloc_watermark_ok() passes;
 * otherwise they stay asleep and the loop keeps reclaiming.
 * NOTE(review): the original annotation states that kswapd wakes all of
 * these waiters before going to sleep if it cannot balance the node.
 * That happens outside this function; confirm against the kswapd sleep
 * path.
 */
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/*
 * Fragmentation may mean that the system cannot be rebalanced
 * for high-order allocations in all zones. If twice the
 * allocation size has been reclaimed and the zones are still
 * not balanced then recheck the watermarks at order-0 to
 * prevent kswapd reclaiming excessively. Assume that a
 * process requested a high-order can direct reclaim/compact.
 */
/*
 * sc.nr_reclaimed is the number of pages reclaimed in this iteration.
 * The threshold is 2UL << order, i.e. twice 2^order pages. Once it is
 * reached, both order and sc.order drop to 0 so the remaining balance
 * checks are order-0 checks. Without this, a fragmented node could keep
 * the loop running until sc.priority falls below 1. Leaving the loop
 * early still requires pgdat_balanced() to succeed.
 */
if (order && sc.nr_reclaimed >= 2UL << order)
order = sc.order = 0;
/* Check if kswapd should be suspending */
/* Leave the loop if kswapd was frozen or has been asked to stop. */
if (try_to_freeze() || kthread_should_stop())
break;
/*
 * Compact if necessary and kswapd is reclaiming at least the
 * high watermark number of pages as requested
 */
/* Compact the node only if more was reclaimed than was attempted. */
if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
compact_pgdat(pgdat, order);
/*
 * Raise priority if scanning rate is too low or there was no
 * progress in reclaiming pages
 */
/*
 * raise_priority starts as true and is cleared only when
 * kswapd_shrink_zone() returned true for some zone. The priority value
 * is also decremented (making the next scan larger) when nothing at all
 * was reclaimed in this iteration.
 */
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
/*
 * (3) Keep looping with a growing scan size until the priority value
 * drops below 1 or the zones from 0 to *classzone_idx are balanced.
 */
} while (sc.priority >= 1 &&
!pgdat_balanced(pgdat, order, *classzone_idx));
/*
 * Note the classzone_idx argument of pgdat_balanced(): per the original
 * annotation it is the index of the zone the allocation path preferred,
 * handed down via wake_all_kswapds() (not visible here; confirm).
 */
out:
/*
 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 * makes a decision on the order we were last reclaiming at. However,
 * if another caller entered the allocator slow path while kswapd
 * was awake, order will remain at the higher level
 */
*classzone_idx = end_zone;
return order;
}
此函数看完之后我们需要查看kswapd_shrink_zone()函数,在后面。
struct scan_control定义如下:
[mm/vmscan.c]
/* Parameters and running results of one reclaim invocation. */
struct scan_control {
/* How many pages shrink_list() should reclaim */
unsigned long nr_to_reclaim;/* number of page frames to reclaim */
/* This context's GFP mask */
gfp_t gfp_mask;/* allocation flags of the request that triggered reclaim */
/* Allocation order */
int order;/* order of the allocation that triggered reclaim */
/*
 * Nodemask of nodes allowed by the caller. If NULL, all nodes
 * are scanned.
 */
nodemask_t *nodemask;/* mask of nodes that may be scanned */
/*
 * The memory cgroup that hit its limit and as a result is the
 * primary target of this reclaim invocation.
 */
struct mem_cgroup *target_mem_cgroup;
/* Scan (total_size >> priority) pages at once */
/*
 * Scan priority. Because the pass size is (total_size >> priority), a
 * numerically lower value scans more page frames per pass and a higher
 * value scans fewer. balance_pgdat() starts at DEF_PRIORITY (12
 * according to the original annotation; confirm against the headers).
 */
int priority;
unsigned int may_writepage:1;/* may reclaim write pages back? NOTE(review): the original annotation relates this to __GFP_IO/__GFP_FS; balance_pgdat() sets it from !laptop_mode */
/* Can mapped pages be reclaimed? */
unsigned int may_unmap:1;/* may reclaim unmap pages, i.e. clear the page table entries that map them */
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;/* may reclaim swap; NOTE(review): per the original annotation the anonymous LRU lists are not scanned when this is 0 */
/* Can cgroups be reclaimed below their normal consumption range? */
unsigned int may_thrash:1;
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
unsigned int compaction_ready:1;/* presumably set once scanning finishes, so reclaim can decide whether compaction is needed; confirm against the writers */
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;/* page frames scanned so far */
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;/* page frames reclaimed so far */
};
回到balance_pgdat()函数
pgdat_balanced()函数实现:判断一个内存节点上的物理页面是否处于平衡状态,返回true,则表示该内存节点处于平衡状态。
什么是平衡状态?
对于order为0的情况,要求从最低端zone到classzone_idx的所有zone都处于平衡状态;对于order大于0的情况,只要处于平衡状态的zone所管理的页面数不少于这些zone管理页面总数的25%,就认为该内存节点处于平衡状态。
[kswapd()->balance_pgdat()->pgdat_balanced()]
/*
 * pgdat_balanced() - decide whether a node counts as balanced.
 * @pgdat: node to inspect.
 * @order: allocation order the check is made for.
 * @classzone_idx: highest zone index to consider; handed down from the
 *                 allocation path.
 *
 * Only populated zones with index 0..classzone_idx take part.
 *
 * order == 0: every participating zone must be balanced. The first zone
 * that is reclaimable but not balanced makes the function return false.
 *
 * order > 0: the managed pages of all participating zones are summed, and
 * so are the managed pages of those zones that count as balanced. The node
 * is balanced when the balanced share is at least a quarter of the total.
 * Demanding that every zone be balanced for high orders can cause excessive
 * reclaim when some zones are imbalanced.
 *
 * Upstream rationale for the 25% figure:
 *   o a balanced 16M DMA zone alone will not balance a node on any
 *     reasonably sized machine
 *   o on other machines the top zone must be a reasonable percentage of
 *     the zones below it, e.g. highmem of at least 256M on 32-bit x86, or
 *     a Normal zone of at least 1G on x86-64, to balance a node alone.
 *
 * A zone that zone_reclaimable() rejects is counted as balanced, because
 * balance_pgdat() skips such zones once the priority has left
 * DEF_PRIORITY and so effectively treats them as balanced too.
 *
 * Return: true if the node is balanced for @order, false otherwise.
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
	unsigned long total = 0;
	unsigned long balanced = 0;
	int idx;

	for (idx = 0; idx <= classzone_idx; idx++) {
		struct zone *z = &pgdat->node_zones[idx];

		if (!populated_zone(z))
			continue;

		/* managed_pages: pages of this zone managed by the buddy allocator */
		total += z->managed_pages;

		/*
		 * Unreclaimable zones count as balanced without consulting the
		 * watermarks; otherwise zone_balanced() decides against the
		 * high watermark, and a balanced zone contributes all of its
		 * managed pages.
		 */
		if (!zone_reclaimable(z) || zone_balanced(z, order, 0, idx)) {
			balanced += z->managed_pages;
			continue;
		}

		/* For order-0 a single unbalanced zone settles the answer. */
		if (order == 0)
			return false;
	}

	if (order == 0)
		return true;

	/* High order: at least 25% of the managed pages must be balanced. */
	return balanced >= total / 4;
}
回到balance_pgdat()函数
zone_balanced()函数实现:zone的balanced由此函数来判断,这是针对于order来说的。
此函数有两个条件:
(1) zone内的空闲内存高于高水位
水位是在内存初始化的时候根据每个zone的内存大小自动计算出来的,每个zone可能有不同的水位。具体计算水位的算法在各个kernel版本中可能不尽相同,比如某个版本是这样计算的:对于非高端内存来说(64位机器上已经不存在高端内存了),min_watermark根据各个zone的内存占比,瓜分1024个page;low_watermark在此基础上增加25%;high_watermark在此基础上增加50%。(可以通过/proc/zoneinfo看到系统中每一个zone,及其free_pages和watermark的情况。)这里的高水位对于现在的大内存机器来说,其实只是九牛一毛。由这个高水位来作为判断zone_balanced的基础,可见内核在内存balance的问题上还是很注重系统性能的。
(2) 要求zone内的内存在0到给定order之间平衡分布
例如:总的内存超过高水位、order-1及以上的内存超过高水位的1/2、order-2及以上的内存超过高水位的1/4、......、一直到所要求的order。
为什么针对order的内存balanced不仅仅关心order阶的内存,而是关心0-order阶的所有内存呢?因为高order的连续内存是稀缺资源。如果内存分布不平衡,低order的内存请求可能因为低order内存的暂时缺货不得不将高order所对应的连续内存进行分拆。这种浪费是尽量避免的。并且这样的分拆可能导致高order内存耗尽,而导致满足不了对指定order的内存分配需求。
那么为什么针对order的内存balanced又仅仅关心0到order阶的所有内存、而不关心大于order阶的内存呢?当我们需要检查针对于order的zone_balanced时,其实是说明我们需要这个zone内2^order的连续页面,由于连续页面回收不易,也不是系统内最普遍的需求(给用户空间使用的内存基本上都是order-0的,不考虑hugepage这样的特殊情况),所以更高的order就不需要考虑了。后面会看到,kswapd只针对order-0进行回收。
/*
 * zone_balanced() - is @zone balanced for an allocation of @order?
 * @zone: zone to test.
 * @order: allocation order.
 * @balance_gap: extra pages required on top of the high watermark.
 * @classzone_idx: zone index passed through to the watermark and
 *                 compaction checks.
 *
 * The zone must first pass zone_watermark_ok_safe() against
 * high_wmark_pages(zone) + @balance_gap. For order > 0 with
 * CONFIG_COMPACTION enabled it must also not be reported as
 * COMPACT_SKIPPED by compaction_suitable().
 *
 * Return: true if both conditions hold, false otherwise.
 */
static bool zone_balanced(struct zone *zone, int order,
			  unsigned long balance_gap, int classzone_idx)
{
	unsigned long mark = high_wmark_pages(zone) + balance_gap;

	if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx, 0))
		return false;

	/* The compaction check only applies to high-order requests. */
	if (!IS_ENABLED(CONFIG_COMPACTION) || !order)
		return true;

	return compaction_suitable(zone, order, 0, classzone_idx) !=
	       COMPACT_SKIPPED;
}
/*
 * zone_watermark_ok_safe() - watermark check with a re-read of the free
 * page count when the first reading is low.
 * @z: zone to test.
 * @order: allocation order.
 * @mark: watermark, in pages, to test against.
 * @classzone_idx: zone index forwarded to __zone_watermark_ok().
 * @alloc_flags: allocation flags forwarded to __zone_watermark_ok().
 *
 * Reads NR_FREE_PAGES with zone_page_state(). If the zone has a non-zero
 * percpu_drift_mark and the reading is below it, the count is read again
 * with zone_page_state_snapshot() (presumably a more exact but costlier
 * read that accounts for per-cpu counter drift; confirm against vmstat).
 *
 * Return: the result of __zone_watermark_ok() for the chosen count.
 */
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
			    unsigned long mark, int classzone_idx,
			    int alloc_flags)
{
	long nr_free = zone_page_state(z, NR_FREE_PAGES);
	bool below_drift_mark = z->percpu_drift_mark &&
				nr_free < z->percpu_drift_mark;

	if (below_drift_mark)
		nr_free = zone_page_state_snapshot(z, NR_FREE_PAGES);

	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
				   nr_free);
}
回到pgdat_balanced()函数
kswapd_shrink_zone()函数实现:页面回收的真正函数
[kswapd()->balance_pgdat()->kswapd_shrink_zone()]
/*
 * kswapd_shrink_zone() - reclaim from one zone on behalf of kswapd.
 * @zone: zone to shrink.
 * @classzone_idx: zone index used for the balance checks; balance_pgdat()
 *                 passes its end_zone here.
 * @sc: scan control prepared by the caller; nr_to_reclaim is set here and
 *      the scan results are read back from it.
 * @nr_attempted: running total of pages this pass tried to reclaim; grown
 *                by sc->nr_to_reclaim whenever shrink_zone() is called.
 *
 * kswapd shrinks the zone by the number of pages required to reach the
 * high watermark.
 *
 * Return: true if the zone was already balanced (nothing reclaimed), or if
 * at least sc->nr_to_reclaim pages were scanned. The caller uses a false
 * result only to decide whether the scan priority must be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long *nr_attempted)
{
	int check_order = sc->order;
	unsigned long gap;
	bool bh_pressure;

	/*
	 * Reclaim target for this round: the larger of SWAP_CLUSTER_MAX and
	 * the zone's high watermark. (Per the original notes the watermark
	 * comes from __setup_per_zone_wmarks(), based on min_free_kbytes and
	 * the zone's managed pages; not visible here, confirm.)
	 */
	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

	/*
	 * Kswapd reclaims only single pages with compaction enabled. Trying
	 * too hard to reclaim until contiguous free pages appear can evict
	 * too much useful data, so once compaction_suitable() no longer
	 * reports COMPACT_SKIPPED the balance checks below use order 0.
	 */
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order) {
		if (compaction_suitable(zone, sc->order, 0, classzone_idx)
		    != COMPACT_SKIPPED)
			check_order = 0;
	}

	/*
	 * Every zone gets equal pressure unless it already has far too many
	 * free pages. "Too many" means the high watermark plus a gap, where
	 * the gap is the smaller of the low watermark and
	 * managed_pages / KSWAPD_ZONE_BALANCE_GAP_RATIO rounded up (described
	 * upstream as 1% of the zone). The gap makes the pre-reclaim balance
	 * test slightly stricter than the plain high watermark.
	 */
	gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
		  zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));

	/*
	 * No reclaim is needed when the zone is already balanced, unless
	 * buffer heads are over their limit and this is a highmem zone.
	 */
	bh_pressure = (buffer_heads_over_limit && is_highmem(zone));
	if (!bh_pressure) {
		if (zone_balanced(zone, check_order, gap, classzone_idx))
			return true;
	}

	/* The real work: try to reclaim pages from this zone. */
	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);

	/* Account for the number of pages attempted to reclaim */
	*nr_attempted += sc->nr_to_reclaim;

	clear_bit(ZONE_WRITEBACK, &zone->flags);

	/*
	 * If a zone reaches its high watermark, consider it to be no longer
	 * congested. There may still be dirty pages backed by congested
	 * BDIs, but as pressure is relieved, speculatively avoid congestion
	 * waits.
	 */
	if (zone_reclaimable(zone) &&
	    zone_balanced(zone, check_order, 0, classzone_idx)) {
		clear_bit(ZONE_CONGESTED, &zone->flags);
		clear_bit(ZONE_DIRTY, &zone->flags);
	}

	/*
	 * Scanning at least the target number of pages counts as enough
	 * effort, even if little or nothing was actually reclaimed.
	 */
	return sc->nr_scanned >= sc->nr_to_reclaim;
}
页面分配路径page allocator和页面回收路径kswapd之间有很多交互的地方,如下图:
-
当页面分配路径page allocator在低水位中分配内存失败时,会唤醒kswapd内核线程,把order和preferred_zone传递给kswapd,这两个参数是他们之间的纽带。
-
页面分配路径page allocator和页面回收路径kswapd在扫描zone时的方向是相反的,页面分配路径page allocator从ZONE_HIGHMEM往ZONE_NORMAL方向扫描zone,kswapd则相反。
-
如何判断kswapd应该停止页面回收呢?一个重要的条件是:从最低端zone到preferred_zone(即classzone_idx)之间的zone处于平衡状态(参见pgdat_balanced())时,就认为这个内存节点处于平衡状态,可以停止页面回收。
-
页面分配路径page allocator和页面回收路径kswapd采用的zone水位标准不同:page allocator采用低水位,即在低水位下无法分配内存时,就唤醒kswapd;而kswapd判断是否停止页面回收时采用的是高水位。