[转]Linux内核最新的连续内存分配器(CMA)——避免预留大块内存

2022-06-17 04:04:55

http://blog.csdn.net/21cnbao/article/details/7309757

在我们使用ARM等嵌入式Linux系统的时候，一个头疼的问题是GPU，Camera，HDMI等都需要预留大量连续内存，这部分内存平时不用，但是一般的做法又必须先预留着。目前，Marek Szyprowski和Michal Nazarewicz实现了一套全新的Contiguous Memory Allocator。通过这套机制，我们可以做到不预留内存，这些内存平时是可用的，只有当需要的时候才被分配给Camera，HDMI等设备。下面分析它的基本代码流程。

声明连续内存

内核启动过程中arch/arm/mm/init.c中的arm_memblock_init()会调用dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

该函数位于：drivers/base/dma-contiguous.c

[cpp] view plain copy

/**
 * dma_contiguous_reserve() - reserve area for contiguous memory handling
 * @limit: End address of the reserved memory (optional, 0 for any).
 *
 * This function reserves memory from early allocator. It should be
 * called by arch specific code once the early allocator (memblock or bootmem)
 * has been activated and all other subsystems have already allocated/reserved
 * memory.
 *
 * The size of the area is size_cmdline when that is anything other than -1;
 * otherwise it is picked by whichever CONFIG_CMA_SIZE_SEL_* option is
 * compiled in. If none is, or the resulting size is 0, nothing is reserved.
 */
void __init dma_contiguous_reserve(phys_addr_t limit)
{
	unsigned long selected_size = 0;

	pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);

	if (size_cmdline != -1) {
		selected_size = size_cmdline;
	} else {
#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
		selected_size = size_bytes;
#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
		selected_size = cma_early_percent_memory();
#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
		selected_size = min(size_bytes, cma_early_percent_memory());
#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
		selected_size = max(size_bytes, cma_early_percent_memory());
#endif
	}

	if (selected_size) {
		/* selected_size is unsigned long, so print it with %lu. */
		pr_debug("%s: reserving %lu MiB for global area\n", __func__,
			 selected_size / SZ_1M);
		dma_declare_contiguous(NULL, selected_size, 0, limit);
	}
}

其中的size_bytes定义为：

static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M; 默认情况下，CMA_SIZE_MBYTES会被定义为16（单位为MB，即16MB），来源于CONFIG_CMA_SIZE_MBYTES=16

->

[cpp] view plain copy

由此可见，连续内存区域也是在内核启动的早期，通过__memblock_alloc_base()拿到的。

另外：

drivers/base/dma-contiguous.c里面的core_initcall()会导致cma_init_reserved_areas()被调用：

[cpp] view plain copy

/*
 * Walk the first cma_reserved_count entries of cma_reserved and build a CMA
 * area for each one. An entry whose area is created successfully gets it
 * attached to its device; entries for which cma_create_area() returns an
 * error pointer are skipped. Always returns 0.
 */
static int __init cma_init_reserved_areas(void)
{
	struct cma_reserved *entry = cma_reserved;
	unsigned remaining = cma_reserved_count;

	pr_debug("%s()\n", __func__);

	while (remaining) {
		struct cma *area = cma_create_area(PFN_DOWN(entry->start),
						   entry->size >> PAGE_SHIFT);

		if (!IS_ERR(area))
			dev_set_cma_area(entry->dev, area);

		--remaining;
		++entry;
	}

	return 0;
}
core_initcall(cma_init_reserved_areas);

cma_create_area()会调用cma_activate_area()，cma_activate_area()函数则会针对每个pageblock调用：

init_cma_reserved_pageblock(pfn_to_page(base_pfn));

这个函数则会通过set_pageblock_migratetype(page, MIGRATE_CMA)将页设置为MIGRATE_CMA类型的：

[cpp] view plain copy

#ifdef CONFIG_CMA
/*
 * Free a whole pageblock and set its migration type to MIGRATE_CMA.
 *
 * @page: first page of the pageblock.
 *
 * Every one of the pageblock_nr_pages pages has its Reserved flag cleared
 * and its count set to 0. The head page is then made refcounted, the block
 * is tagged MIGRATE_CMA and released via __free_pages() at pageblock_order,
 * and totalram_pages grows by the size of the block.
 */
void __init init_cma_reserved_pageblock(struct page *page)
{
	struct page *cur = page;
	unsigned remaining = pageblock_nr_pages;

	do {
		__ClearPageReserved(cur);
		set_page_count(cur, 0);
		cur++;
	} while (--remaining);

	set_page_refcounted(page);
	set_pageblock_migratetype(page, MIGRATE_CMA);
	__free_pages(page, pageblock_order);
	totalram_pages += pageblock_nr_pages;
}
#endif

同时其中调用的__free_pages(page, pageblock_order);最终会调用到__free_one_page(page, zone, order, migratetype);
相关的page会被加到MIGRATE_CMA的free_list上面去：

list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);

申请连续内存

申请连续内存仍然使用标准的arch/arm/mm/dma-mapping.c中定义的dma_alloc_coherent()和dma_alloc_writecombine()，这二者会间接调用drivers/base/dma-contiguous.c中的

[cpp] view plain copy

struct page *dma_alloc_from_contiguous(struct device *dev, int count,
unsigned int align)

->

[cpp] view plain copy

/*
 * Excerpt of dma_alloc_from_contiguous(): only the allocation retry loop is
 * shown; the parts marked "..." are elided in this listing.
 */
struct page *dma_alloc_from_contiguous(struct device *dev, int count,
unsigned int align)
{
...
for (;;) {
/* Look for a run of `count` clear bits in the area's bitmap. */
pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
start, count, mask);
/* A result at or past cma->count means no such run was found. */
if (pageno >= cma->count) {
ret = -ENOMEM;
goto error;
}
pfn = cma->base_pfn + pageno;
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
if (ret == 0) {
/* Success: record the range as used in the bitmap and stop. */
bitmap_set(cma->bitmap, pageno, count);
break;
} else if (ret != -EBUSY) {
/* Any error other than -EBUSY ends the attempt. */
goto error;
}
pr_debug("%s(): memory range at %p is busy, retrying\n",
__func__, pfn_to_page(pfn));
/* try again with a bit different memory target */
start = pageno + mask + 1;
}
...
}

->

int alloc_contig_range(unsigned long start, unsigned long end,

unsigned migratetype)

需要隔离page，隔离page的作用通过代码的注释可以体现：

[cpp] view plain copy

/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because of the way page allocator work, we
* align the range to MAX_ORDER pages so that page allocator
* won't try to merge buddies from different pageblocks and
* change MIGRATE_ISOLATE to some other migration type.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that
* we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
* When this is done, we take the pages in range from page
* allocator removing them from the buddy system. This way
* page allocator will never consider using them.
*
* This lets us mark the pageblocks back as
* MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
* MAX_ORDER aligned range but not in the unaligned, original
* range are put back to page allocator so that buddy can use
* them.
*/
ret = start_isolate_page_range(pfn_align_to_maxpage_down(start),
pfn_align_to_maxpage_up(end),
migratetype);

简单地说，就是把相关的page标记为MIGRATE_ISOLATE，这样buddy系统就不会再使用它们。

[cpp] view plain copy

/*
 * start_isolate_page_range() -- mark every pageblock in a PFN range as
 * MIGRATE_ISOLATE.
 * @start_pfn: The lower PFN of the range to be isolated.
 * @end_pfn: The upper PFN of the range to be isolated.
 * @migratetype: migrate type handed to unset_migratetype_isolate() when a
 *               partial isolation has to be rolled back.
 *
 * Once a pageblock is MIGRATE_ISOLATE, free pages in it (and pages freed
 * into it later) are not handed out by the allocator again.
 *
 * Both start_pfn and end_pfn must be pageblock aligned; this is enforced
 * with BUG_ON(). Pageblocks for which __first_valid_page() finds no page
 * are skipped.
 *
 * Returns 0 on success. If set_migratetype_isolate() reports failure for
 * any pageblock, every pageblock from start_pfn up to (not including) the
 * failing one is un-isolated and -EBUSY is returned.
 */
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
			     unsigned migratetype)
{
	unsigned long cur;
	unsigned long rollback;

	BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
	BUG_ON((end_pfn) & (pageblock_nr_pages - 1));

	for (cur = start_pfn; cur < end_pfn; cur += pageblock_nr_pages) {
		struct page *first = __first_valid_page(cur, pageblock_nr_pages);

		if (!first || !set_migratetype_isolate(first))
			continue;

		/* Isolation failed at `cur`: undo everything before it. */
		for (rollback = start_pfn; rollback < cur;
		     rollback += pageblock_nr_pages)
			unset_migratetype_isolate(pfn_to_page(rollback),
						  migratetype);
		return -EBUSY;
	}

	return 0;
}

接下来调用__alloc_contig_migrate_range()进行页面隔离和迁移:

[cpp] view plain copy

其中的函数migrate_pages()会完成页面的迁移，迁移过程中通过传入的__alloc_contig_migrate_alloc()申请新的page，并将老的page的内容赋给新的page：

[cpp] view plain copy

其中的unmap_and_move()函数较为关键，它定义在mm/migrate.c中

[cpp] view plain copy

/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*
* NOTE: this listing is an excerpt; lines consisting of "..." stand for
* code that has been left out.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
struct page *page, int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
/* The destination page comes from the caller-supplied allocator callback. */
struct page *newpage = get_new_page(page, private, &result);
int remap_swapcache = 1;
int charge = 0;
struct mem_cgroup *mem = NULL;
struct anon_vma *anon_vma = NULL;
...
/* charge against new page */
charge = mem_cgroup_prepare_migration(page, newpage, &mem);
...
/*
* A page under writeback is only waited for when both force and sync
* are set; otherwise this page is given up on.
*/
if (PageWriteback(page)) {
if (!force || !sync)
goto uncharge;
wait_on_page_writeback(page);
}
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
* we cannot notice that anon_vma is freed while we migrates a page.
* This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
if (PageAnon(page)) {
/*
* Only page_lock_anon_vma() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
anon_vma = page_lock_anon_vma(page);
if (anon_vma) {
/*
* Take a reference count on the anon_vma if the
* page is mapped so that it is guaranteed to
* exist when the page is remapped later
*/
get_anon_vma(anon_vma);
page_unlock_anon_vma(anon_vma);
} else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
* swapcache page is safe to use because we don't
* know in advance if the VMA that this page belonged
* to still exists. If the VMA and others sharing the
* data have been freed, then the anon_vma could
* already be invalid.
*
* To avoid this possibility, swapcache pages get
* migrated but are not remapped when migration
* completes
*/
remap_swapcache = 0;
} else {
goto uncharge;
}
}
...
/* Establish migration ptes or remove ptes */
try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
skip_unmap:
/* The move is only attempted once the page has no mappings left. */
if (!page_mapped(page))
rc = move_to_new_page(newpage, page, remap_swapcache);
/* Non-zero rc: call remove_migration_ptes() on the old page itself. */
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
/* Drop an anon_vma reference if we took one */
if (anon_vma)
drop_anon_vma(anon_vma);
uncharge:
/* The final argument reports whether the migration succeeded (rc == 0). */
if (!charge)
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
move_newpage:
...
}

通过unmap_and_move()，老的page就被迁移过去新的page。

接下来要回收page，回收page的作用是，不至于因为拿了连续的内存后，系统变得内存饥饿：

->

[cpp] view plain copy

/*
* Reclaim enough pages to make sure that contiguous allocation
* will not starve the system.
*/
__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);

->

[cpp] view plain copy

/*
 * Trigger memory pressure bump to reclaim some pages in order to be able to
 * allocate 'count' pages in single page units. Does similar work as
 * __alloc_pages_slowpath() function.
 *
 * The zone's watermarks are raised by 'count' for the duration of the call
 * and lowered again before returning. While the zone is below
 * low watermark + count, kswapd is woken and direct reclaim is run; if a
 * reclaim pass makes no progress, out_of_memory() is invoked and the loop
 * continues.
 *
 * Returns 'count'.
 */
static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
{
	enum zone_type top_zone_idx = gfp_zone(gfp_mask);
	struct zonelist *zl = node_zonelist(0, gfp_mask);
	int order = 1;
	int progress = 0;
	unsigned long target;

	/*
	 * Increase level of watermarks to force kswapd do his job
	 * to stabilise at new watermark level.
	 */
	__update_cma_watermarks(zone, count);

	/* Obey watermarks as if the page was being allocated */
	target = low_wmark_pages(zone) + count;

	for (;;) {
		if (zone_watermark_ok(zone, 0, target, 0, 0))
			break;

		wake_all_kswapd(order, zl, top_zone_idx, zone_idx(zone));

		progress = __perform_reclaim(gfp_mask, order, zl, NULL);
		if (progress)
			continue;

		/* Exhausted what can be done so it's blamo time */
		out_of_memory(zl, gfp_mask, order, NULL);
	}

	/* Restore original watermark levels. */
	__update_cma_watermarks(zone, -count);

	return count;
}

释放连续内存

内存释放的时候也比较简单，直接就是：

arch/arm/mm/dma-mapping.c：

[cpp] view plain copy

void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)

->

arch/arm/mm/dma-mapping.c:

[cpp] view plain copy

/*
 * Give a buffer back to the contiguous allocator.
 *
 * The range is first remapped with pgprot_kernel, then released through
 * dma_release_from_contiguous() as size >> PAGE_SHIFT pages.
 */
static void __free_from_contiguous(struct device *dev, struct page *page,
				   size_t size)
{
	int nr_pages;

	__dma_remap(page, size, pgprot_kernel);

	nr_pages = size >> PAGE_SHIFT;
	dma_release_from_contiguous(dev, page, nr_pages);
}

->

[cpp] view plain copy

bool dma_release_from_contiguous(struct device *dev, struct page *pages,
int count)
{
...
free_contig_range(pfn, count);
...
}

->

[cpp] view plain copy

/*
 * Free nr_pages consecutive pages, one page at a time, starting at the page
 * that corresponds to pfn.
 */
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	unsigned long cur = pfn;
	unsigned left = nr_pages;

	while (left--) {
		__free_page(pfn_to_page(cur));
		cur++;
	}
}

将page交还给buddy。

内核内存分配的migratetype

内核内存分配的时候，带的标志是GFP_，但是GFP_可以转化为migratetype：

[cpp] view plain copy

/*
 * Map GFP allocation flags to a migratetype.
 *
 * Warns when every bit of GFP_MOVABLE_MASK is set at once. Returns
 * MIGRATE_UNMOVABLE when grouping by mobility is disabled; otherwise the
 * result has bit 1 set iff __GFP_MOVABLE is present and bit 0 set iff
 * __GFP_RECLAIMABLE is present.
 */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	int movable;
	int reclaimable;

	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* Group based on mobility */
	movable = (gfp_flags & __GFP_MOVABLE) != 0;
	reclaimable = (gfp_flags & __GFP_RECLAIMABLE) != 0;

	return (movable << 1) | reclaimable;
}

之后申请内存的时候，会对比迁移类型匹配的free_list：

[cpp] view plain copy

page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
preferred_zone, migratetype);

另外，笔者也编写了一个测试程序，透过它随时测试CMA的功能：

[cpp] view plain copy

申请内存：

[plain] view plain copy

# echo 0 > /dev/cma_test

释放内存：

[plain] view plain copy

# cat /dev/cma_test

参考链接：

[1] http://www.spinics.net/lists/arm-kernel/msg160854.html

[2] http://www.spinics.net/lists/arm-kernel/msg162063.html

[3] http://lwn.net/Articles/447405/

[转]Linux内核最新的连续内存分配器(CMA)——避免预留大块内存,布布扣,bubuko.com

[转]Linux内核最新的连续内存分配器(CMA)——避免预留大块内存