1: Physical page allocation
https://www.cnblogs.com/arnoldlu/p/8250734.html (reference)
Linux manages physical memory in units of pages, and the pages themselves are managed by the buddy system.
1.1: Allocating physical memory from the buddy system
The interface for allocating physical memory is alloc_pages(): it allocates one or more physically contiguous pages, and the number of pages must be a power of two. Its two parameters are the allocation (GFP) mask and the allocation order.
include/linux/gfp.h holds the GFP (Get Free Page) allocation masks. They fall into two groups: masks beginning with __GFP_ are the primitive flags, while masks beginning with GFP_ are usually combinations of __GFP_ flags. The __GFP_ flags themselves split into two classes, zone modifiers and action modifiers; the zone modifiers occupy the lowest 4 bits of the mask and select which zone the pages are allocated from.
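As a quick illustration of the two parameters, here is a minimal, hypothetical kernel-module sketch (not code from the referenced post): it requests 2^2 = 4 contiguous pages with the GFP_KERNEL mask and frees them again.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/module.h>

static int __init gfp_demo_init(void)
{
	/* order = 2: request 2^2 = 4 physically contiguous pages */
	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (!page)
		return -ENOMEM;

	/* GFP_KERNEL pages come from the linear mapping, so a kernel
	 * virtual address is available directly */
	pr_info("gfp_demo: first page mapped at %p\n", page_address(page));

	__free_pages(page, 2);	/* free with the same order */
	return 0;
}

static void __exit gfp_demo_exit(void) { }

module_init(gfp_demo_init);
module_exit(gfp_demo_exit);
MODULE_LICENSE("GPL");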
alloc_pages() eventually calls __alloc_pages_nodemask(), the core function of the buddy system; this also shows that the buddy page allocator works on a per-zone basis.
struct alloc_context is the data structure the buddy allocation path uses to carry its parameters. gfp_zone() computes the zone index (zoneidx) from the allocation mask.
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct zoneref *preferred_zoneref;
	struct page *page = NULL;
	unsigned int cpuset_mems_cookie;
	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = {
		.high_zoneidx = gfp_zone(gfp_mask),------gfp_zone() uses the low 4 bits of gfp_mask to find the zone_type: ZONE_NORMAL? ZONE_HIGHMEM?
		.nodemask = nodemask,
		.migratetype = gfpflags_to_migratetype(gfp_mask),------derive the page migratetype from gfp_mask: MIGRATE_RECLAIMABLE? MIGRATE_MOVABLE?
	};

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();

	/* We set it here, as __alloc_pages_slowpath might have changed it */
	ac.zonelist = zonelist;
	/* The preferred zone is used for statistics later */
	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
				ac.nodemask ? : &cpuset_current_mems_allowed,
				&ac.preferred_zone);
	if (!ac.preferred_zone)
		goto out;
	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

	/* First allocation attempt */
	alloc_mask = gfp_mask|__GFP_HARDWALL;
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);------fast path: decide which zone to allocate from by scanning the node's zonelist for a suitable zone, then return the page
	if (unlikely(!page)) {
		/*
		 * Runtime PM, block IO and its error handling path
		 * can deadlock because I/O on the device might not
		 * complete.
		 */
		alloc_mask = memalloc_noio_flags(gfp_mask);

		page = __alloc_pages_slowpath(alloc_mask, order, &ac);------if the fast path fails, the slow path handles the many special cases here
	}

	if (kmemcheck_enabled && page)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
	/*
	 * When updating a task's mems_allowed, it is possible to race with
	 * parallel threads in such a way that an allocation can fail while
	 * the mask is being updated. If a page allocation is about to fail,
	 * check if the cpuset changed during allocation and if so, retry.
	 */
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;------retry the page allocation

	return page;
}
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zonelist *zonelist = ac->zonelist;
	struct zoneref *z;
	struct page *page = NULL;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
				(gfp_mask & __GFP_WRITE);
	int nr_fair_skipped = 0;
	bool zonelist_rescan;

zonelist_scan:------start scanning ac->zonelist
	zonelist_rescan = false;

	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	/* decide which zone to allocate from: scan the node's zonelist for a zone suitable for this allocation */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,------walk the zonelist starting from ac->high_zoneidx; each iteration yields a zone
								ac->nodemask) {
		...------a series of checks; if any fails, skip to the next zone, otherwise fall through to the watermark check
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];------alloc_flags contains ALLOC_WMARK_LOW here
		if (!zone_watermark_ok(zone, order, mark,------so the zone's low watermark is checked; if it fails, skip the zone or try zone_reclaim
				       ac->classzone_idx, alloc_flags)) {
			int ret;

			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;
			...
			ret = zone_reclaim(zone, gfp_mask, order);------reclaim some pages through zone_reclaim
			switch (ret) {
			...
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac->classzone_idx, alloc_flags))------re-check whether the watermark is now satisfied
					goto try_this_zone;

				/*
				 * Failed to reclaim enough to meet watermark.
				 * Only mark the zone full if checking the min
				 * watermark or if we failed to reclaim just
				 * 1<<order pages or else the page allocator
				 * fastpath will prematurely mark zones full
				 * when the watermark is between the low and
				 * min watermarks.
				 */
				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
				    ret == ZONE_RECLAIM_SOME)
					goto this_zone_full;

				continue;
			}
		}

try_this_zone:------watermarks and all other conditions are satisfied: allocate from this zone
		page = buffered_rmqueue(ac->preferred_zone, zone, order,------do the actual page allocation from the zone
						gfp_mask, ac->migratetype);
		if (page) {
			if (prep_new_page(page, order, gfp_mask, alloc_flags))
				goto try_this_zone;
			return page;
		}
this_zone_full:
		if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
			zlc_mark_zone_full(zonelist, z);
	}

	/*
	 * The first pass makes sure allocations are spread fairly within the
	 * local node. However, the local node might have free pages left
	 * after the fairness batches are exhausted, and remote zones haven't
	 * even been considered yet. Try once more without fairness, and
	 * include remote zones now, before entering the slowpath and waking
	 * kswapd: prefer spilling to a remote zone over swapping locally.
	 */
	if (alloc_flags & ALLOC_FAIR) {
		alloc_flags &= ~ALLOC_FAIR;
		if (nr_fair_skipped) {
			zonelist_rescan = true;
			reset_alloc_batches(ac->preferred_zone);
		}
		if (nr_online_nodes > 1)
			zonelist_rescan = true;
	}

	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		zonelist_rescan = true;
	}

	if (zonelist_rescan)
		goto zonelist_scan;

	return NULL;
}
for_each_zone_zonelist_nodemask expands into first_zones_zonelist() and next_zones_zonelist(), which compute the struct zoneref and from it return the struct zone; in other words, the zone to allocate from is derived from the allocation mask. On a typical 32-bit system with highmem, zonerefs[0] refers to ZONE_HIGHMEM and zonerefs[1] to ZONE_NORMAL.
1.2: Calculating and checking the watermarks
__zone_watermark_ok() decides whether a zone's free pages satisfy the watermark requested in alloc_flags. Its parameters: z is the zone, order is the order of the requested allocation, mark is the watermark value, classzone_idx is the zone index, alloc_flags is the allocation flags, and free_pages is the current number of free pages.
The kernel path that allocates physical memory checks against the WMARK_LOW watermark, while the kswapd page-reclaim thread works against the WMARK_HIGH watermark; this makes the zones of a memory node age their pages at different rates.
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx, int alloc_flags,
			long free_pages)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	int o;
	long free_cma = 0;

	free_pages -= (1 << order) - 1;------subtract the pages about to be allocated from the free count (why the extra -1?)
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;
	...
	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])------the free pages must stay above min plus the zone's lowmem_reserve
		return false;
	for (o = 0; o < order; o++) {------walk every buddy order smaller than the requested order and check the remaining free pages against the watermark
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;------remove this order's free pages from the total

		/* Require fewer higher order pages to be free */
		min >>= 1;------halve the watermark

		if (free_pages <= min)------re-check against the watermark
			return false;
	}
	return true;------all of the above checks passed, return true
}
If the watermark check fails, zone_reclaim() is used to reclaim some pages; the allocation itself is then done by buffered_rmqueue(), which takes physical pages from the buddy system.
buffered_rmqueue() branches on order: for order == 0 it allocates from the per-CPU page lists in zone->pageset, and for order > 0 it allocates directly from the buddy free lists.
1.3: Freeing physical pages
The freeing entry points: free_page() --> free_pages() take a kernel virtual address, while __free_page() --> __free_pages() take a struct page. __free_pages() releases an order-0 page onto the per-CPU lists via free_hot_cold_page(), and releases larger blocks back to the buddy system via __free_pages_ok().
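The pairing matters: pages obtained as a struct page are returned with __free_pages(), while pages obtained as a virtual address are returned with free_pages(). A minimal sketch (a hypothetical kernel-module fragment, not from the referenced post):

#include <linux/gfp.h>
#include <linux/mm.h>

static int free_demo(void)
{
	/* struct page based API: alloc_pages() pairs with __free_pages() */
	struct page *page = alloc_pages(GFP_KERNEL, 1);	/* 2 pages */
	unsigned long addr;

	if (!page)
		return -ENOMEM;
	__free_pages(page, 1);

	/* virtual-address based API: __get_free_pages() pairs with free_pages() */
	addr = __get_free_pages(GFP_KERNEL, 0);		/* 1 page */
	if (!addr)
		return -ENOMEM;
	free_pages(addr, 0);

	return 0;
}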
2: The slab allocator
https://www.cnblogs.com/arnoldlu/p/8215414.html
The buddy system allocates in units of whole pages. For the many small, contiguous allocations the kernel needs, allocating whole pages would waste memory and cause fragmentation, so the slab allocator is used instead.
The slab allocator still obtains its actual physical pages from the buddy system; it simply layers its own algorithm on top of those contiguous pages in order to manage small memory blocks.
Its purposes: reduce the fragmentation that page-sized allocation would cause, cache frequently used objects so that allocating, initialising and freeing them costs less, and use colouring to adjust object placement so the hardware caches are used more effectively.
The slab allocator sets up one cache per object type; the cache can be thought of as a reserve of objects of that type. Each cache is divided into slabs, each slab consists of one or more contiguous page frames, and those pages hold a number of objects.
(An "object" here means a kernel data structure together with the operations that create and tear it down.)
Each cache is described by a struct kmem_cache, which records the cache's attributes. The kmem_cache structure does not describe the individual slabs directly; the slabs are organised through kmem_list3 (renamed struct kmem_cache_node in later kernels).
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,------create a slab descriptor (kmem_cache); no object memory is allocated yet (only the descriptor is created)
			unsigned long, void (*)(void *));
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);------allocate a slab cache object
void kmem_cache_free(struct kmem_cache *, void *);------free a slab cache object
void kmem_cache_destroy(struct kmem_cache *);------destroy a slab descriptor
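A minimal usage sketch of these four interfaces (a hypothetical kernel-module example with a made-up struct foo, not from the referenced post):

#include <linux/slab.h>
#include <linux/module.h>

struct foo {			/* hypothetical object type */
	int id;
	char name[32];
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	struct foo *obj;

	/* create the descriptor only; slabs are allocated lazily on demand */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;

	obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);	/* take one object */
	if (obj) {
		obj->id = 1;
		kmem_cache_free(foo_cache, obj);	/* return it to the cache */
	}
	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);	/* all objects must have been freed first */
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");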
2.1: Creating a slab descriptor
The slab descriptor, struct kmem_cache:
struct kmem_cache {
	struct array_cache __percpu *cpu_cache;------the local (per-CPU) object cache pool
/* 1) Cache tunables. Protected by slab_mutex */
	unsigned int batchcount;------when the local array_cache is empty, this many objects are pulled in at once from the shared pool or from the slabs_partial/slabs_free lists
	unsigned int limit;------when the local pool holds more than limit free objects, batchcount objects are released so the kernel can reclaim and destroy slabs
	unsigned int shared;

	unsigned int size;------object size after alignment
	struct reciprocal_value reciprocal_buffer_size;
/* 2) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */------allocation flags
	unsigned int num;		/* # of objs per slab */------how many objects one slab holds

/* 3) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;------this slab occupies 2^gfporder pages

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t allocflags;

	size_t colour;			/* cache colouring range */------how many different colour (cache line) offsets a slab can use
	unsigned int colour_off;	/* colour offset */------length of one colour step, equal to the L1 cache line size
	struct kmem_cache *freelist_cache;
	unsigned int freelist_size;

	/* constructor func */
	void (*ctor)(void *obj);

/* 4) cache creation/removal */
	const char *name;------name of the slab descriptor
	struct list_head list;
	int refcount;------reference count, consulted when the slab descriptor is destroyed
	int object_size;------the object's real (unaligned) size
	int align;------the alignment

/* 5) statistics */
#ifdef CONFIG_DEBUG_SLAB
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;

	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object. size contains the total
	 * object size including these internal fields, the following two
	 * variables contain the offset to the user object and its size.
	 */
	int obj_offset;
#endif /* CONFIG_DEBUG_SLAB */

#ifdef CONFIG_MEMCG_KMEM
	struct memcg_cache_params memcg_params;
#endif

	struct kmem_cache_node *node[MAX_NUMNODES];------the per-node struct kmem_cache_node structures for this slab cache
};
The local (per-CPU) object cache pool:
struct array_cache {
	unsigned int avail;------number of objects currently available in this pool
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;------set to 1 when an object is taken from the pool, cleared to 0 when the cache is shrunk
	void *entry[];------the object pointers themselves
};
The main entry point is kmem_cache_create(). Parameters: name, the name of the slab descriptor; size, the size of the cached object; align, the alignment; flags, the allocation flags; ctor, the object constructor.
Core call flow:
kmem_cache_create------sanity checks, and check whether an existing slab descriptor can be reused
  do_kmem_cache_create------fill the main parameters into the slab descriptor, then add the descriptor to the global slab_caches list
    __kmem_cache_create------the core of descriptor creation: word alignment (usually 4 bytes), handling of align, computing the pages needed, the slab's state, the number of objects, slab colouring, and so on
      calculate_slab_order------work out how large a slab of this descriptor must be, i.e. how many pages it needs
      setup_cpu_cache------continue configuring the slab descriptor (the per-CPU object caches)
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
		  unsigned long flags, void (*ctor)(void *))
{
	...
	s = __kmem_cache_alias(name, size, align, flags, ctor);------check whether an existing slab descriptor can be reused; if so, jump to out_unlock
	if (s)
		goto out_unlock;

	cache_name = kstrdup_const(name, GFP_KERNEL);
	if (!cache_name) {
		err = -ENOMEM;
		goto out_unlock;
	}

	s = do_kmem_cache_create(cache_name, size, size,------call do_kmem_cache_create to create the slab descriptor
				 calculate_alignment(flags, align, size),
				 flags, ctor, NULL, NULL);
	...
	return s;
}
Computing the size of a slab:
static size_t calculate_slab_order(struct kmem_cache *cachep,
			size_t size, size_t align, unsigned long flags)
{
	unsigned long offslab_limit;
	size_t left_over = 0;
	int gfporder;

	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {------start at gfporder = 0 and go up to KMALLOC_MAX_ORDER = 10, i.e. from 4KB to 4MB
		unsigned int num;
		size_t remainder;

		cache_estimate(gfporder, size, align, flags, &remainder, &num);------work out how many objects fit into 2^gfporder pages; the space left over is used for cache colouring
		if (!num)------num != 0 means this gfporder can hold at least one object of this size; num == 0 means try the next order
			continue;

		/* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
		if (num > SLAB_OBJ_MAX_NUM)------maximum number of objects per slab; SLAB_OBJ_MAX_NUM is 255, so a slab never holds more than 255 objects
			break;

		if (flags & CFLGS_OFF_SLAB) {
			size_t freelist_size_per_obj = sizeof(freelist_idx_t);
			/*
			 * Max number of objs-per-slab for caches which
			 * use off-slab slabs. Needed to avoid a possible
			 * looping condition in cache_grow().
			 */
			if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
				freelist_size_per_obj += sizeof(char);
			offslab_limit = size;
			offslab_limit /= freelist_size_per_obj;

			if (num > offslab_limit)
				break;
		}

		/* Found something acceptable - save it away */
		cachep->num = num;
		cachep->gfporder = gfporder;
		left_over = remainder;------record the number of objects and the order (pages) needed
		...
		if (left_over * 8 <= (PAGE_SIZE << gfporder))------the colouring condition is satisfied; stop the search
			break;
	}
	return left_over;
}
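A rough worked example, ignoring the freelist and colouring overhead that cache_estimate() really accounts for: with 4KB pages and a 128-byte object, gfporder = 0 already gives num = 4096 / 128 = 32 objects (32 <= 255) with left_over = 0, so a single-page slab is chosen. The small user-space sketch below reproduces only this simplified arithmetic; it is not the kernel algorithm:

#include <stdio.h>

/* Simplified, user-space re-creation of the calculate_slab_order() idea;
 * the overhead handled by cache_estimate() (freelist, colouring) is ignored. */
int main(void)
{
	const unsigned long page_size = 4096;
	const unsigned long obj_size = 128;	/* assumed object size */
	unsigned int gfporder;

	for (gfporder = 0; gfporder <= 10; gfporder++) {
		unsigned long slab_bytes = page_size << gfporder;
		unsigned long num = slab_bytes / obj_size;
		unsigned long left_over = slab_bytes - num * obj_size;

		if (num == 0)
			continue;	/* slab too small for even one object */
		printf("gfporder=%u: %lu objects, %lu bytes left over\n",
		       gfporder, num, left_over);
		if (left_over * 8 <= slab_bytes)
			break;		/* colouring space small enough: accept */
	}
	return 0;
}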
2.3: Allocating a slab object
kmem_cache_alloc() is the core function for allocating a slab cache object; local interrupts are disabled for the whole allocation.
The path kmem_cache_alloc() --> slab_alloc() --> __do_cache_alloc() (and from there ____cache_alloc(), shown below) runs with local interrupts off.
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
	void *objp;
	struct array_cache *ac;
	bool force_refill = false;

	check_irq_off();

	ac = cpu_cache_get(cachep);------get the local (per-CPU) object cache pool
	if (likely(ac->avail)) {------does the local pool still have free objects?
		ac->touched = 1;
		objp = ac_get_obj(cachep, ac, flags, false);------take one object from the local pool

		/*
		 * Allow for the possibility all avail objects are not allowed
		 * by the current flags
		 */
		if (objp) {
			STATS_INC_ALLOCHIT(cachep);
			goto out;------got an object, return the pointer directly
		}
		force_refill = true;
	}

	STATS_INC_ALLOCMISS(cachep);
	objp = cache_alloc_refill(cachep, flags, force_refill);------the core of slab cache allocation (refill the local pool and retry)
	/*
	 * the 'ac' may be updated by cache_alloc_refill(),
	 * and kmemleak_erase() requires its correct value.
	 */
	ac = cpu_cache_get(cachep);

out:
	/*
	 * To avoid a false negative, if an object that is in one of the
	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
	 * treat the array pointers as a reference to the object.
	 */
	if (objp)
		kmemleak_erase(&ac->entry[ac->avail]);
	return objp;
}
cache_alloc_refill() is the core of slab cache allocation: when the local object pool is empty, it refills it with batchcount objects taken from the shared pool or from the slabs_partial/slabs_free lists, growing the cache from the buddy system if necessary.
2.4: Freeing a slab object
Slab objects are freed through kmem_cache_free(); local interrupts are likewise disabled for the whole release path.
kmem_cache_free() --> __cache_free() (the core function, shown below)
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
				unsigned long caller)
{
	struct array_cache *ac = cpu_cache_get(cachep);------find the local (per-CPU) object cache pool

	check_irq_off();
	kmemleak_free_recursive(objp, cachep->flags);
	objp = cache_free_debugcheck(cachep, objp, caller);

	kmemcheck_slab_free(cachep, objp, cachep->object_size);

	if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
		return;

	if (ac->avail < ac->limit) {
		STATS_INC_FREEHIT(cachep);
	} else {
		STATS_INC_FREEMISS(cachep);
		cache_flusharray(cachep, ac);------the local pool is full: try to flush out free objects for reclaim
	}

	ac_put_obj(cachep, ac, objp);------put the object back into the local pool ac
}
2.5: The kmalloc allocation function
kmalloc() is built on the slab mechanism, and the requested size is rounded up to a 2^order-byte cache. The kmalloc caches are created in create_kmalloc_caches(), reached along the path:
start_kernel() --> mm_init() --> kmem_cache_init() --> create_kmalloc_caches().
On this configuration: KMALLOC_MIN_SIZE = 64, KMALLOC_SHIFT_LOW = 6, KMALLOC_SHIFT_HIGH = 13, KMALLOC_SHIFT_MAX = 23.
For kmalloc requests smaller than 192 bytes, a special mapping table, static s8 size_index[24], decides which slab descriptor the allocation is served from.
The values in size_index are indices into kmalloc_caches[], whose entries are created by create_kmalloc_caches().
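Finally, a minimal kmalloc/kfree sketch (a hypothetical kernel-module fragment, not from the referenced post). Following the size_index mapping above, a 100-byte request would typically land in the 128-byte kmalloc cache:

#include <linux/kernel.h>
#include <linux/slab.h>

static int kmalloc_demo(void)
{
	/* 100 bytes: rounded up to one of the fixed kmalloc caches
	 * (kmalloc-128 on a typical configuration) */
	char *buf = kmalloc(100, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	/* ksize() reports how much the cache actually reserved for us */
	pr_info("asked for 100 bytes, got %zu\n", ksize(buf));

	kfree(buf);
	return 0;
}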