在内核中分配内存,最后要通过伙伴系统接口进行实际物理页面的分配,一个重要的接口便是alloc_page.本文介绍下alloc_page的主要流程,各个部分的执行。主要包含正常分配流程,当页面不足的时候的处理方式。先定位到核心调用
/* Allocate a single page: forwards to alloc_pages() with order 0 (2^0 = 1 page). */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
order是分配页面的阶,即2的指数个页面
/* Allocate 2^order pages, passing numa_node_id() as the node to alloc_pages_node(). */
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
nid指定了从哪个NUMA节点分配页面,如果没有指定节点,则默认从当前节点分配
/*
 * Allocate pages of the given order from NUMA node nid.
 * A negative nid is replaced by numa_node_id(); the request is then
 * forwarded to __alloc_pages() together with the zonelist that
 * node_zonelist() returns for that node and gfp_mask.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
/* Unknown node is current node */
/* If no node was specified (nid < 0), allocate from the current node */
if (nid < 0)
nid = numa_node_id();
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
zonelist是一组zone的列表,有两种,局部的和全局的。局部zonelist只包含本节点的zone,全局的包含所有节点的zone。下篇文章会详细介绍这些数据结构
/*
 * Thin wrapper: forwards gfp_mask, order and zonelist to
 * __alloc_pages_nodemask() with a NULL nodemask and returns its result.
 */
static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
到了__alloc_pages_nodemask就进入了比较正式的流程了,拨开来讲,本函数主要包含两步:
1、直接分配
2、分配失败选择另一种方式即slowpath继续处理.
首次尝试分配是调用了static struct page *get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,struct zonelist *zonelist, int high_zoneidx, int alloc_flags,struct zone *preferred_zone, int migratetype),核心机制就是遍历zonelist上的zone,找到一个page.函数代码倒是没什么可说的,比较容易理解,这里主要看下涉及到的几个机制。该函数主要实现功能:1、在zonelist中找到一个合适的zone 2、从zone中分配页面。前者由一个循环体完成,后者由static inline struct page *buffered_rmqueue(struct zone *preferred_zone,struct zone *zone, int order, gfp_t gfp_flags,int migratetype)完成。
在选定zone的阶段,在正常情况下需要进行一系列的验证,保证当前zone有足够的可用页面供分配。那么什么是非正常情况呢?就是携带ALLOC_NO_WATERMARKS标识的分配,所以这里就分为两种情况。这里涉及到一个watermark,俗称分配水位,水位有三种
- #define ALLOC_WMARK_MIN WMARK_MIN
- #define ALLOC_WMARK_LOW WMARK_LOW
- #define ALLOC_WMARK_HIGH WMARK_HIGH
在分配之前一般会指定满足哪个水位才允许分配,或者不管水位直接分配,这就对应ALLOC_NO_WATERMARKS标识。在zone结构中,有vm_stat字段,是一个数组,记录各个状态的页面的数量,其中就包含空闲页面,对应NR_FREE_PAGES,携带watermark标识的分配,需要验证空闲页面是否大于对应的水位,只有在大于水位了才允许分配,否则需要根据情况对页面进行回收reclaim,如果无法回收或者回收后仍然不满足条件,则直接返回了。在一些急迫的事务中,可以指定ALLOC_NO_WATERMARKS,这样就不会对水位进行验证,直接调用buffered_rmqueue分配页面。
buffered_rmqueue并不直接从伙伴系统分配,为了加速分配流程,每个CPU也会维护页框高速缓存,通过per_cpu_pageset管理
/*
 * Per-CPU page set of a zone. The allocation fast path reaches the embedded
 * page cache through this_cpu_ptr(zone->pageset)->pcp.
 */
struct per_cpu_pageset {
struct per_cpu_pages pcp; /* the per-CPU page lists themselves */
#ifdef CONFIG_NUMA
s8 expire; /* only present on NUMA builds */
#endif
#ifdef CONFIG_SMP
/* Only present on SMP builds */
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; /* one entry per zone stat item */
#endif
};
其中pcp维护了各种性质的页面链表,性质基本是根据可移动性来决定的。
/* Per-CPU cache of pages, kept on one list per migrate type. */
struct per_cpu_pages {
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
/* Lists of pages, one per migrate type stored on the pcp-lists */
/* Array of list heads: each migrate type has its own list */
/* Migrate type enumeration, for reference:
MIGRATE_UNMOVABLE,
MIGRATE_RECLAIMABLE,
MIGRATE_MOVABLE,
MIGRATE_PCPTYPES, // the number of types on the pcp lists
MIGRATE_RESERVE*/
struct list_head lists[MIGRATE_PCPTYPES];
};
count是链表中所有页面的数量,high是高水位,超过后需要清空(归还)缓存中的页面,而batch是当缓存不足时,每次从伙伴系统申请多少页面填充进来。
核心分配逻辑如下:
/* Order-0 request: serve it from this CPU's per-cpu page lists. */
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
/* Disable local interrupts, saving the previous state in flags */
local_irq_save(flags);
/* Per-CPU page frame cache of this zone */
pcp = &this_cpu_ptr(zone->pageset)->pcp;
/* Pick the cache list for the requested migrate type */
list = &pcp->lists[migratetype];
/* If the list is empty */
if (list_empty(list)) {
/* Try to refill the list from the buddy system (pcp->batch is the requested count) */
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
/* Still empty after the refill attempt: fail */
if (unlikely(list_empty(list)))
goto failed;
}
/* list is doubly linked: take from the tail if cold is set, otherwise from the head */
if (cold)
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);
/* Unlink the page from the list and account for it */
list_del(&page->lru);
pcp->count--;
}
slow alloc path 见__alloc_pages_slowpath函数(page_alloc.c)
/*
 * Slow path of the page allocator, shown in the article as the fallback
 * after the first allocation attempt has failed.
 *
 * In order, it: wakes kswapd (unless __GFP_NO_KSWAPD), retries
 * get_page_from_freelist() with watermarks enforced, retries without
 * watermarks when alloc_flags permits, then - only if the caller may wait -
 * tries direct compaction, direct reclaim and finally the OOM path,
 * looping back to rebalance/restart while retrying is considered useful.
 *
 * Returns the allocated page via got_pg. On the nopage path it calls
 * warn_alloc_failed() and returns page, which is NULL on every failure
 * path visible here (each "goto nopage" follows a failed attempt).
 */
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
/* Non-zero when the caller passed __GFP_WAIT */
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
/* The first compaction pass is asynchronous; set to true after that pass */
bool sync_migration = false;
bool deferred_compaction = false;
bool contended_compaction = false;
/*
 * In the slowpath, we sanity check order to avoid ever trying to
 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
 * be using allocators in order of preference for an area that is
 * too large.
 */
if (order >= MAX_ORDER) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
/*
 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
 * __GFP_NOWARN set) should not cause reclaim since the subsystem
 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
 * using a larger set of nodes after it has established that the
 * allowed per node queues are empty and that nodes are
 * over allocated.
 */
if (IS_ENABLED(CONFIG_NUMA) &&
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
/* At this point, wake all kswapd threads unless the caller set __GFP_NO_KSWAPD */
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapd(order, zonelist, high_zoneidx,
zone_idx(preferred_zone));
/*
 * OK, we're below the kswapd watermark and have kicked background
 * reclaim. Now things get more complex, so set up alloc_flags according
 * to how we want to proceed.
 */
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
 * Find the true preferred zone if the allocation is unconstrained by
 * cpusets.
 */
/* No cpuset constraint and no nodemask: look up the preferred zone in the zonelist again */
if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
first_zones_zonelist(zonelist, high_zoneidx, NULL,
&preferred_zone);
rebalance:
/* This is the last chance, in general, before the goto nopage. */
/* Try to allocate a page; ALLOC_NO_WATERMARKS is masked out for this attempt */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page)
goto got_pg;
/* If ignoring the zone watermarks is permitted */
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
/*
 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
 * the allocation is high priority and these type of
 * allocations are system rather than user orientated
 */
zonelist = node_zonelist(numa_node_id(), gfp_mask);
/* High-priority allocation attempt over the zonelist */
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
/*
 * Try direct compaction. The first pass is asynchronous. Subsequent
 * attempts after direct reclaim are synchronous
 */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
sync_migration = true;
/*
 * If compaction is deferred for high-order allocations, it is because
 * sync compaction recently failed. In this is the case and the caller
 * requested a movable allocation that does not heavily disrupt the
 * system then fail the allocation instead of entering direct reclaim.
 */
if ((deferred_compaction || contended_compaction) &&
(gfp_mask & __GFP_NO_KSWAPD))
goto nopage;
/* Try direct reclaim and then allocating */
/* If compaction still yielded no usable page, reclaim pages directly */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, &did_some_progress);
if (page)
goto got_pg;
/*
 * If we failed to make any progress reclaiming, then we are
 * running out of options and have to consider going OOM
 */
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
if ((current->flags & PF_DUMPCORE) &&
!(gfp_mask & __GFP_NOFAIL))
goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
migratetype);
if (page)
goto got_pg;
if (!(gfp_mask & __GFP_NOFAIL)) {
/*
 * The oom killer is not called for high-order
 * allocations that may fail, so if no progress
 * is being made, there are no other options and
 * retrying is unlikely to help.
 */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto nopage;
/*
 * The oom killer is not called for lowmem
 * allocations to prevent needlessly killing
 * innocent tasks.
 */
if (high_zoneidx < ZONE_NORMAL)
goto nopage;
}
/* Start over from the kswapd wake-up */
goto restart;
}
}
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
} else {
/*
 * High-order allocations do not necessarily loop after
 * direct reclaim and reclaim/compaction depends on compaction
 * being called after reclaim so call directly if necessary
 */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
migratetype, sync_migration,
&contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
}
/* Failure exit: report the failed allocation and return page (NULL here) */
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
return page;
/* Success exit: notify kmemcheck when it is enabled, then return the page */
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
该函数主要是在首次分配失败的情况下,采取的补救解决方案。由于首次分配的失败,要么是缓存不足,要么是真没有剩余的空闲页面了。那么我们还能怎么办?大家可能都能想到,回收内存呗!!不错,回收内存的确是当前最主要的解决思路,看下代码,在__alloc_pages_slowpath函数中,首次的调用便是wake_all_kswapd,kswapd都晓得伐,负责物理页面的换出的,定期执行以保证物理页面的实时可用性,但是毕竟是定期执行,不能保证任何时刻都有充足的内存使用,所以这里尝试唤醒所有的kswapd守护进程进行物理页面的回收。而物理页面的回收涉及了更多的东西,本文不打算深入,后续有空再单独分析。到这里已经唤醒了后台的回收线程,Ok,重新尝试下分配,继续调用get_page_from_freelist,不过此时着重设置了一个参数,alloc_flags & ~ALLOC_NO_WATERMARKS即本次尝试一定要符合水位的情况,不要求特殊处理。这样如果分配得到页面,就大功告成。如果还是没有分配到,好吧,降低要求,如果alloc_flags中携带了ALLOC_NO_WATERMARKS,就允许忽略水位限制,调用__alloc_pages_high_priority进行分配。到这里,如果还没有分配到怎么办?判断wait标识,如果允许等待,则接着往下走;否则返回分配失败吧!没办法
接下来既然回收页面也不行,那么我不换出页面,把内存中的页面进行适当压缩总可以吧!!恩~总算中和了双方,调用了__alloc_pages_direct_compact,额,压缩页面是最后一个解决方案了,如果再不行可真是山穷水尽了,这样离OOM就不远了……
该函数主要分两步:
1、压缩区域内的页面try_to_compact_pages
2、尝试分配页面 get_page_from_freelist
分配页面前面已经介绍,这里就不用再多说了,重点看try_to_compact_pages,该函数会按照zonelist中的顺序对各个zone进行压缩,核心逻辑看下
/* Compact the zones of the zonelist one by one, keeping the largest status seen so far in rc */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
int status;
status = compact_zone_order(zone, order, gfp_mask, sync,
contended);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
alloc_flags))
break;
}
主要是compact_zone_order
/*
 * Compact one zone on behalf of an allocation of the given order.
 * Fills in a compact_control on the stack, runs compact_zone() with it,
 * stores cc.contended into *contended and returns compact_zone()'s result.
 */
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
bool sync, bool *contended)
{
unsigned long ret;
/* Compaction control block describing this run */
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
};
/* Both work lists start out empty */
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
/* Run the compaction */
ret = compact_zone(zone, &cc);
/* Both lists are expected to be empty again once compact_zone() returns */
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
*contended = cc.contended;
return ret;
}
这里有一个结构,姑且称之为压缩控制器,记录压缩过程中需要的一些参数,设置好后又调用了compact_zone,该函数不再列举,感兴趣的参考源代码compaction.c,在该函数中,首先调用了compaction_suitable检查下当前zone是否有压缩的潜质,即通过压缩是否可以达到要求,如果可以就执行压缩,否则,呵呵……忽略吧!compaction_suitable返回三种值:
/*
 * COMPACT_SKIPPED - If there are too few free pages for compaction
 * COMPACT_PARTIAL - If the allocation would succeed without compaction
 * COMPACT_CONTINUE - If compaction should run now
 */
解释的比较明确,我就不多说了,唯一的一点是COMPACT_PARTIAL既然表示在不压缩的情况下也可以分配成功,为何不直接返回成功,然后尝试分配呢?还要走到下面的流程,真是捉急……根据上面的解释可能大家也都明白了,这里压缩并不是把几个页面压缩成一个页面,本质还是整理碎片,把可移动的页面尽量安排在一处,腾出来比较大的连续空间,这样增加满足需求的可能性。接下来就该执行压缩了,在此之前设置了压缩控制器的一些参数,所以有必要看下该结构
/* Bookkeeping for one compaction run; filled in by compact_zone_order() and passed to compact_zone(). */
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool finished_update_free; /* True when the zone cached pfns are
 * no longer being updated
 */
bool finished_update_migrate; /* NOTE(review): presumably the migrate-scanner counterpart of finished_update_free - confirm */
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone; /* zone being compacted, set by compact_zone_order() */
bool contended; /* True if a lock was contended */
};
如果看字段中的注释有点蒙,建议看下结构之前的注释
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
* at the end of a zone and migrate_pfn begins at the start. Movable pages
* are moved to the end of a zone during a compaction run and the run
* completes when free_pfn <= migrate_pfn
*/
英语不行,简要翻译下。compact_control用于在内存压缩过程中追踪正在被移动的页面和被移动的目的页面,free_pfn起始于zone的末尾,而migrate_pfn起始于zone的起始位置。在压缩过程中,可移动的页面被移动到zone的尾部。大致就这个意思,说白了,就是free_pfn从后往前扫描寻找空闲页面,migrate_pfn从前往后扫描,寻找可移动的页面,当free_pfn <= migrate_pfn,压缩就结束了!
在执行压缩之前,对压缩控制器参数的调整
/* Start both scanners from the positions cached in the zone */
cc->migrate_pfn = zone->compact_cached_migrate_pfn;
cc->free_pfn = zone->compact_cached_free_pfn;
/*
 * Cached free-scanner position lies outside [start_pfn, end_pfn]: restart it
 * at end_pfn with the low bits cleared by ~(pageblock_nr_pages-1), and store
 * the new position back in the zone.
 */
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
/* Cached migrate-scanner position is out of range: restart it at start_pfn and store it back */
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
在设置好压缩控制器之后,就调用了migrate_prep_local函数,该函数中主要调用lru_add_drain把当前LRU缓存移动到LRU链表。下面的一个while循环是压缩的主体,我们看下源码
/* Main compaction loop: keep going for as long as compact_finished() returns COMPACT_CONTINUE */
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
unsigned long nr_migrate, nr_remaining;
int err;
/* Isolate a batch of candidate pages onto cc->migratepages */
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
/* Aborted: put the isolated pages back and leave with COMPACT_PARTIAL */
ret = COMPACT_PARTIAL;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
/* Nothing isolated this round: re-evaluate the loop condition */
continue;
case ISOLATE_SUCCESS:
/* Pages were isolated: continue below to migrate them */
;
}
/* Remember how many pages we set out to migrate, for the trace event below */
nr_migrate = cc->nr_migratepages;
/* Migrate the isolated pages; compaction_alloc and cc are handed to migrate_pages() */
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc,
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
MR_COMPACTION);
/* Refresh cc after the migration; nr_migratepages is read back right after */
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
/* Release isolated pages not migrated */
if (err) {
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
/* -ENOMEM ends the run with COMPACT_PARTIAL; other errors keep looping */
if (err == -ENOMEM) {
ret = COMPACT_PARTIAL;
goto out;
}
}
}
循环的判断条件函数是compact_finished,主体就是判断压缩是否完成,主要包含以下几步
1、是否有fatal信号处于pending状态,如果有则立刻返回COMPACT_PARTIAL,否则转入2。
2、检查压缩流程是否走完,即cc->free_pfn <= cc->migrate_pfn是否满足,如果满足则返回COMPACT_COMPLETE,否则转入3。
3、cc->order == -1,返回COMPACT_CONTINUE表示继续压缩,否则转入4
4、检查zone的freepage是否满足low的watermark,如果不满足则返回COMPACT_CONTINUE,否则转入5
5、从cc->order开始,依次检查zone->free_area数组,对各个长度的连续页面集合中,检查符合cc->migratetype的链表是否为空,如果不为空,返回COMPACT_PARTIAL,否则转入6
6、返回COMPACT_CONTINUE
循环体内部调用isolate_migratepages,该函数是压缩的主体函数,其中调用了isolate_migratepages_range,该函数实现把特定的页面isolate开,即从LRU链表中摘除,添加到cc->migratepages链表,而下面执行移动的是migrate_pages,该函数实现了把刚才隔离开的页面,移动到新的地方,核心其实就是对当前页面撤销映射,然后对新的页面建立映射。接着就调用update_nr_listpages对控制器做了更新,但是如果出错了就会调用putback_movable_pages把隔离开的页面重新添加回去,保证系统正常运行。
压缩函数返回后,会再次尝试获取页面,如果获取到了,OK,一切照旧;如果没获取到,那也没办法,向上峰报告吧,该做什么做什么……
以马内利
参考:linux3.10.1源码