Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】

2022-08-05 22:13:03

在我们使用ARM等嵌入式Linux系统的时候，一个头疼的问题是GPU，Camera，HDMI等都需要预留大量连续内存，这部分内存平时不用，但是一般的做法又必须先预留着。目前，Marek Szyprowski和Michal Nazarewicz实现了一套全新的Contiguous Memory Allocator。通过这套机制，我们可以做到不预留内存，这些内存平时是可用的，只有当需要的时候才被分配给Camera，HDMI等设备。下面分析它的基本代码流程。

1. 声明连续内存

内核启动过程中arch/arm/mm/init.c中的arm_memblock_init()会调用dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

该函数位于：drivers/base/dma-contiguous.c

/**
 * dma_contiguous_reserve() - reserve area for contiguous memory handling
 * @limit: End address of the reserved memory (optional, 0 for any).
 *
 * This function reserves memory from early allocator. It should be
 * called by arch specific code once the early allocator (memblock or bootmem)
 * has been activated and all other subsystems have already allocated/reserved
 * memory.
 *
 * The size of the global area is taken from size_cmdline when that is not -1,
 * otherwise from the CONFIG_CMA_SIZE_SEL_* choice compiled in.  Nothing is
 * reserved when the selected size is 0.
 */
void __init dma_contiguous_reserve(phys_addr_t limit)
{
        unsigned long selected_size = 0;

        pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);

        /* A size_cmdline other than -1 overrides the build-time selection. */
        if (size_cmdline != -1) {
                selected_size = size_cmdline;
        } else {
#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
                selected_size = size_bytes;
#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE)
                selected_size = cma_early_percent_memory();
#elif defined(CONFIG_CMA_SIZE_SEL_MIN)
                selected_size = min(size_bytes, cma_early_percent_memory());
#elif defined(CONFIG_CMA_SIZE_SEL_MAX)
                selected_size = max(size_bytes, cma_early_percent_memory());
#endif
        }

        /* dev == NULL and base == 0: global area, placed anywhere below limit. */
        if (selected_size) {
                pr_debug("%s: reserving %ld MiB for global area\n", __func__,
                         selected_size / SZ_1M);
                dma_declare_contiguous(NULL, selected_size, 0, limit);
        }
}

其中的size_bytes定义为：

/* Build-time size of the global CMA area in bytes (CMA_SIZE_MBYTES MiB). */
static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;

默认情况下，CMA_SIZE_MBYTES会被定义为16MB，来源于CONFIG_CMA_SIZE_MBYTES=16

/**
 * dma_declare_contiguous() - reserve a contiguous memory area for a device
 * @dev:   Device recorded as owner of the area; dma_contiguous_reserve()
 *         passes NULL for the global area.
 * @size:  Size of the area in bytes.
 * @base:  Fixed start address of the area, or 0 to let memblock choose one.
 * @limit: Upper bound handed to __memblock_alloc_base() when @base is 0.
 *
 * With a non-zero @base the exact region is reserved and the call fails with
 * -EBUSY if it is already reserved or cannot be reserved.  With @base == 0 a
 * region is allocated below @limit; failure yields -ENOMEM, and a region
 * whose end does not fit in an unsigned long is freed again and reported as
 * -EINVAL.
 *
 * Returns 0 on success, otherwise the negative error code described above.
 */
int __init dma_declare_contiguous(struct device *dev, unsigned long size,
                                  phys_addr_t base, phys_addr_t limit)
{
        ...

        /* Reserve memory */
        if (base) {
                if (memblock_is_region_reserved(base, size) ||
                    memblock_reserve(base, size) < 0) {
                        base = -EBUSY;
                        goto err;
                }
        } else {
                /*
                 * Use __memblock_alloc_base() since
                 * memblock_alloc_base() panic()s.
                 */
                phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);

                if (!addr) {
                        base = -ENOMEM;
                        goto err;
                } else if (addr + size > ~(unsigned long)0) {
                        /* Region end is not addressable: give it back. */
                        memblock_free(addr, size);
                        base = -EINVAL;
                        goto err;
                } else {
                        base = addr;
                }
        }

        /*
         * Each reserved area must be initialised later, when more kernel
         * subsystems (like slab allocator) are available.
         */
        r->start = base;
        r->size = size;
        r->dev = dev;
        cma_reserved_count++;
        pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
                (unsigned long)base);

        /* Architecture specific contiguous memory fixup. */
        dma_contiguous_early_fixup(base, size);
        return 0;

err:
        /* base carries the negative errno on every failure path. */
        pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
        return base;
}

由此可见，连续内存区域也是在内核启动的早期，通过__memblock_alloc_base()拿到的。

另外：

drivers/base/dma-contiguous.c里面的core_initcall()会导致cma_init_reserved_areas()被调用：

cma_init_reserved_areas()会调用cma_create_area()，cma_create_area()会调用cma_activate_area()，cma_activate_area()函数则会针对每个pageblock调用：

init_cma_reserved_pageblock(pfn_to_page(base_pfn));

这个函数则会通过set_pageblock_migratetype(page, MIGRATE_CMA)将页设置为MIGRATE_CMA类型的：

#ifdef CONFIG_CMA
/*
 * Release one complete pageblock to the page allocator and mark the
 * pageblock as MIGRATE_CMA.  @page is the first page of the pageblock.
 */
void __init init_cma_reserved_pageblock(struct page *page)
{
        struct page *cur = page;
        unsigned remaining = pageblock_nr_pages;

        /* Clear the reserved flag and zero the refcount of every page. */
        do {
                __ClearPageReserved(cur);
                set_page_count(cur, 0);
                cur++;
        } while (--remaining);

        /* Only the head page gets a reference before the block is freed. */
        set_page_refcounted(page);
        set_pageblock_migratetype(page, MIGRATE_CMA);
        __free_pages(page, pageblock_order);

        totalram_pages += pageblock_nr_pages;
}
#endif

同时其中调用的__free_pages(page, pageblock_order);最终会调用到__free_one_page(page, zone, order, migratetype);

相关的page会被加到MIGRATE_CMA的free_list上面去：

list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);

2. 申请连续内存

申请连续内存仍然使用标准的arch/arm/mm/dma-mapping.c中定义的dma_alloc_coherent()和dma_alloc_writecombine()，这二者会间接调用drivers/base/dma-contiguous.c中的

struct page *dma_alloc_from_contiguous(struct device *dev, int count,

                                       unsigned int align)

struct page *dma_alloc_from_contiguous(struct device *dev, int count,

                                       unsigned int align)

{

       ...

       for (;;) {

                pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,

                                                    start, count, mask);

                if (pageno >= cma->count) {

                        ret = -ENOMEM;

                        goto error;

                }

                pfn = cma->base_pfn + pageno;

                ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);

                if (ret == 0) {

                        bitmap_set(cma->bitmap, pageno, count);

                        break;

                } else if (ret != -EBUSY) {

                        goto error;

                }

                pr_debug("%s(): memory range at %p is busy, retrying\n",

                         __func__, pfn_to_page(pfn));

                /* try again with a bit different memory target */

                start = pageno + mask + 1;

        }

       ...

}

--》

int alloc_contig_range(unsigned long start, unsigned long end,

                       unsigned migratetype)

需要隔离page，隔离page的作用通过代码的注释可以体现：

 /*

         * What we do here is we mark all pageblocks in range as

         * MIGRATE_ISOLATE.  Because of the way page allocator work, we

         * align the range to MAX_ORDER pages so that page allocator

         * won't try to merge buddies from different pageblocks and

         * change MIGRATE_ISOLATE to some other migration type.

         *

         * Once the pageblocks are marked as MIGRATE_ISOLATE, we

         * migrate the pages from an unaligned range (ie. pages that

         * we are interested in).  This will put all the pages in

         * range back to page allocator as MIGRATE_ISOLATE.

         *

         * When this is done, we take the pages in range from page

         * allocator removing them from the buddy system.  This way

         * page allocator will never consider using them.

         *

         * This lets us mark the pageblocks back as

         * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the

         * MAX_ORDER aligned range but not in the unaligned, original

         * range are put back to page allocator so that buddy can use

         * them.

         */  

        ret = start_isolate_page_range(pfn_align_to_maxpage_down(start),

                                       pfn_align_to_maxpage_up(end),

                                       migratetype);

简单地说，就是把相关的page标记为MIGRATE_ISOLATE，这样buddy系统就不会再使用它们。

/*
 * start_isolate_page_range() -- switch every pageblock in a PFN range to
 * MIGRATE_ISOLATE.
 * @start_pfn: first PFN of the range.
 * @end_pfn: PFN just past the end of the range.
 * @migratetype: migrate type restored on pageblocks that were already
 *               isolated when a later pageblock fails.
 *
 * Free pages in an isolated pageblock are never handed out by the page
 * allocator, and pages freed later are not allocated again either.
 *
 * Both @start_pfn and @end_pfn must be pageblock aligned (BUG otherwise).
 * Returns 0 on success and -EBUSY if any part of range cannot be isolated;
 * in the latter case the pageblocks isolated so far are reverted.
 */
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
                             unsigned migratetype)
{
        unsigned long cur;
        unsigned long done;

        BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
        BUG_ON((end_pfn) & (pageblock_nr_pages - 1));

        for (cur = start_pfn; cur < end_pfn; cur += pageblock_nr_pages) {
                struct page *first = __first_valid_page(cur, pageblock_nr_pages);

                /* No valid page here, or isolation succeeded: next block. */
                if (!first || !set_migratetype_isolate(first))
                        continue;

                /* Roll back every pageblock before the one that failed. */
                for (done = start_pfn; done < cur; done += pageblock_nr_pages)
                        unset_migratetype_isolate(pfn_to_page(done), migratetype);

                return -EBUSY;
        }

        return 0;
}

接下来调用__alloc_contig_migrate_range()进行页面隔离和迁移:

/*
 * Isolate the pages found in PFN range [start, end) in batches and migrate
 * each batch away with migrate_pages(), using __alloc_contig_migrate_alloc
 * as the callback that supplies the replacement pages.
 *
 * Returns 0 when the loop completes and the last migrate_pages() result is
 * not negative (a positive result is reported as 0).  Returns -EINTR when a
 * fatal signal is pending or isolate_migratepages_range() returns 0, and
 * -EBUSY (or an earlier negative migrate_pages() result) when a batch is
 * still not empty on the fifth retry.  Pages left on the list are given to
 * putback_lru_pages() before returning.
 */
static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)

{

        /* This function is based on compact_zone() from compaction.c. */

        unsigned long pfn = start;

        /* Consecutive passes that began with the current batch not yet empty. */
        unsigned int tries = 0;

        int ret = 0; 

        struct compact_control cc = {

                .nr_migratepages = 0,

                .order = -1,

                .zone = page_zone(pfn_to_page(start)),

                .sync = true,

        };

        INIT_LIST_HEAD(&cc.migratepages);

        migrate_prep_local();

        /* Keep going while PFNs remain to scan or a batch is still pending. */
        while (pfn < end || !list_empty(&cc.migratepages)) {

                if (fatal_signal_pending(current)) {

                        ret = -EINTR;

                        break;

                }    

                if (list_empty(&cc.migratepages)) {

                        /* Previous batch is done: isolate the next one. */
                        cc.nr_migratepages = 0;

                        pfn = isolate_migratepages_range(cc.zone, &cc,

                                                         pfn, end);

                        if (!pfn) {

                                ret = -EINTR;

                                break;

                        }

                        tries = 0;

                } else if (++tries == 5) {

                        /* Same batch still pending: keep an earlier error, else -EBUSY. */
                        ret = ret < 0 ? ret : -EBUSY;

                        break;

                }    

                ret = migrate_pages(&cc.migratepages,

                                    __alloc_contig_migrate_alloc,

                                    0, false, true);

        }    

        putback_lru_pages(&cc.migratepages);

        return ret > 0 ? 0 : ret;

}

其中的函数migrate_pages()会完成页面的迁移，迁移过程中通过传入的__alloc_contig_migrate_alloc()申请新的page，并将老的page的内容复制给新的page：

int migrate_pages(struct list_head *from,

                new_page_t get_new_page, unsigned long private, bool offlining,

                bool sync)

{

        int retry = 1;

        int nr_failed = 0;

        int pass = 0;

        struct page *page;

        struct page *page2;

        int swapwrite = current->flags & PF_SWAPWRITE;

        int rc;

        if (!swapwrite)

                current->flags |= PF_SWAPWRITE;

        for(pass = 0; pass < 10 && retry; pass++) {

                retry = 0; 

                list_for_each_entry_safe(page, page2, from, lru) {

                        cond_resched();

                        rc = unmap_and_move(get_new_page, private,

                                                page, pass > 2, offlining,

                                                sync);

                        switch(rc) {

                        case -ENOMEM:

                                goto out;

                        case -EAGAIN:

                                retry++;

                                break;

                        case 0:

                                break;

                        default:

                                /* Permanent failure */

                                nr_failed++;

                                break;

                        }

                }

        }

        rc = 0;

...

}

其中的unmap_and_move()函数较为关键，它定义在mm/migrate.c中

/*

 * Obtain the lock on page, remove all ptes and migrate the page

 * to the newly allocated page in newpage.

 */

static int unmap_and_move(new_page_t get_new_page, unsigned long private,

            struct page *page, int force, bool offlining, bool sync)

{

    int rc = 0;

    int *result = NULL;

    struct page *newpage = get_new_page(page, private, &result);

    int remap_swapcache = 1;

    int charge = 0;

    struct mem_cgroup *mem = NULL;

    struct anon_vma *anon_vma = NULL;

    ...

    /* charge against new page */

    charge = mem_cgroup_prepare_migration(page, newpage, &mem);

    ...

    if (PageWriteback(page)) {

        if (!force || !sync)

            goto uncharge;

        wait_on_page_writeback(page);

    }

    /*

     * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,

     * we cannot notice that anon_vma is freed while we migrates a page.

     * This get_anon_vma() delays freeing anon_vma pointer until the end

     * of migration. File cache pages are no problem because of page_lock()

     * File Caches may use write_page() or lock_page() in migration, then,

     * just care Anon page here.

     */

    if (PageAnon(page)) {

        /*

         * Only page_lock_anon_vma() understands the subtleties of

         * getting a hold on an anon_vma from outside one of its mms.

         */

        anon_vma = page_lock_anon_vma(page);

        if (anon_vma) {

            /*

             * Take a reference count on the anon_vma if the

             * page is mapped so that it is guaranteed to

             * exist when the page is remapped later

             */

            get_anon_vma(anon_vma);

            page_unlock_anon_vma(anon_vma);

        } else if (PageSwapCache(page)) {

            /*

             * We cannot be sure that the anon_vma of an unmapped

             * swapcache page is safe to use because we don't

             * know in advance if the VMA that this page belonged

             * to still exists. If the VMA and others sharing the

             * data have been freed, then the anon_vma could

             * already be invalid.

             *

             * To avoid this possibility, swapcache pages get

             * migrated but are not remapped when migration

             * completes

             */

            remap_swapcache = 0;

        } else {

            goto uncharge;

        }

    }

    ...

    /* Establish migration ptes or remove ptes */

    try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

skip_unmap:

    if (!page_mapped(page))

        rc = move_to_new_page(newpage, page, remap_swapcache);

    if (rc && remap_swapcache)

        remove_migration_ptes(page, page);

    /* Drop an anon_vma reference if we took one */

    if (anon_vma)

        drop_anon_vma(anon_vma);

uncharge:

    if (!charge)

        mem_cgroup_end_migration(mem, page, newpage, rc == 0);

unlock:

    unlock_page(page);

move_newpage:

    ...

}

通过unmap_and_move()，老的page就被迁移过去新的page。

接下来要回收page，回收page的作用是，不至于因为拿了连续的内存后，系统变得内存饥饿：

/*

         * Reclaim enough pages to make sure that contiguous allocation

         * will not starve the system.

         */

        __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);

/*
 * Raise memory pressure on @zone until 'count' extra pages could be
 * allocated in single page units on top of the low watermark.  The work
 * done here resembles __alloc_pages_slowpath().
 *
 * Always returns @count.
 */
static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
{
        enum zone_type top_zoneidx = gfp_zone(gfp_mask);
        struct zonelist *zl = node_zonelist(0, gfp_mask);
        int reclaim_order = 1;
        unsigned long target;
        int progress;

        /*
         * Bump the watermarks so that kswapd has to settle at the
         * higher level.
         */
        __update_cma_watermarks(zone, count);

        /* Apply the watermark check as if the pages were being allocated. */
        target = low_wmark_pages(zone) + count;

        for (;;) {
                if (zone_watermark_ok(zone, 0, target, 0, 0))
                        break;

                wake_all_kswapd(reclaim_order, zl, top_zoneidx, zone_idx(zone));

                progress = __perform_reclaim(gfp_mask, reclaim_order, zl, NULL);
                if (progress)
                        continue;

                /* Direct reclaim freed nothing: fall back to the OOM killer. */
                out_of_memory(zl, gfp_mask, reclaim_order, NULL);
        }

        /* Put the watermarks back where they were. */
        __update_cma_watermarks(zone, -count);

        return count;
}

3. 释放连续内存

内存释放的时候也比较简单，直接就是：

arch/arm/mm/dma-mapping.c：

void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)

arch/arm/mm/dma-mapping.c:

/*
 * Remap the buffer with pgprot_kernel and return its pages to the
 * contiguous allocator.  @size is in bytes and is converted to a page
 * count by shifting with PAGE_SHIFT.
 */
static void __free_from_contiguous(struct device *dev, struct page *page,
                                   size_t size)
{
        int nr_pages = size >> PAGE_SHIFT;

        __dma_remap(page, size, pgprot_kernel);
        dma_release_from_contiguous(dev, page, nr_pages);
}

bool dma_release_from_contiguous(struct device *dev, struct page *pages,

                                 int count)

{

        ...

        free_contig_range(pfn, count);

        ...

}

/*
 * Free @nr_pages single pages starting at @pfn, one __free_page() call
 * per page frame.
 */
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
        while (nr_pages--) {
                __free_page(pfn_to_page(pfn));
                pfn++;
        }
}

将page交还给buddy。

4. 内核内存分配的migratetype

内核内存分配的时候，带的标志是GFP_，但是GFP_可以转化为migratetype：

/*
 * Convert GFP flags to a migrate type: bit 1 of the result is set when
 * __GFP_MOVABLE is present, bit 0 when __GFP_RECLAIMABLE is present.
 * Warns when all bits of GFP_MOVABLE_MASK are set at once, and returns
 * MIGRATE_UNMOVABLE whenever page_group_by_mobility_disabled is set.
 */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
        int movable_bit;
        int reclaimable_bit;

        WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        movable_bit = (gfp_flags & __GFP_MOVABLE) ? 1 : 0;
        reclaimable_bit = (gfp_flags & __GFP_RECLAIMABLE) ? 1 : 0;

        return (movable_bit << 1) | reclaimable_bit;
}

之后申请内存的时候，会对比迁移类型匹配的free_list：

        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,

                        zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,

                        preferred_zone, migratetype);

另外，笔者也编写了一个测试程序，透过它随时测试CMA的功能：

/*

 * kernel module helper for testing CMA

 *

 * Licensed under GPLv2 or later.

 */

#include <linux/module.h>

#include <linux/device.h>

#include <linux/fs.h>

#include <linux/miscdevice.h>

#include <linux/dma-mapping.h>

#define CMA_NUM  10

static struct device *cma_dev;

static dma_addr_t dma_phys[CMA_NUM];

static void *dma_virt[CMA_NUM];

/*
 * any read request will free coherent memory, eg.
 * cat /dev/cma_test
 *
 * Frees the first buffer that is still allocated (slot i holds a buffer of
 * (i + 1) MiB) and clears its slot.  Always returns 0, so the reader sees
 * end-of-file immediately.
 */
static ssize_t
cma_test_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	int i;

	for (i = 0; i < CMA_NUM; i++) {
		if (!dma_virt[i])
			continue;

		dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
		/*
		 * dma_addr_t may be wider than a pointer, so print it as an
		 * integer instead of casting it to void *.
		 */
		_dev_info(cma_dev, "free virt: %p phys: %#llx\n", dma_virt[i],
			  (unsigned long long)dma_phys[i]);
		dma_virt[i] = NULL;
		break;
	}

	return 0;
}

/*
 * any write request will alloc coherent memory, eg.
 * echo 0 > /dev/cma_test
 *
 * Fills the first free slot: slot i receives a buffer of (i + 1) MiB, and
 * every page of the new buffer is written once.  Returns @count on success
 * (also when all CMA_NUM slots are already in use and nothing is allocated)
 * and -ENOMEM when dma_alloc_coherent() fails.
 */
static ssize_t
cma_test_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
	int i;

	for (i = 0; i < CMA_NUM; i++) {
		size_t size = (i + 1) * SZ_1M;
		void *p;

		if (dma_virt[i])
			continue;

		dma_virt[i] = dma_alloc_coherent(cma_dev, size, &dma_phys[i], GFP_KERNEL);
		if (!dma_virt[i]) {
			dev_err(cma_dev, "no mem in CMA area\n");
			/* Report the failure to the writer instead of claiming success. */
			return -ENOMEM;
		}

		/* touch every page in the allocated memory */
		for (p = dma_virt[i]; p < dma_virt[i] + size; p += PAGE_SIZE)
			*(u32 *)p = 0;

		/*
		 * dma_addr_t may be wider than a pointer, so print it as an
		 * integer instead of casting it to void *.
		 */
		_dev_info(cma_dev, "alloc virt: %p phys: %#llx\n", dma_virt[i],
			  (unsigned long long)dma_phys[i]);
		break;
	}

	return count;
}

/*
 * File operations of the test device: reads are handled by cma_test_read()
 * and writes by cma_test_write().
 */
static const struct file_operations cma_test_fops = {

	.owner =    THIS_MODULE,

	.read  =    cma_test_read,

	.write =    cma_test_write,

};

static struct miscdevice cma_test_misc = {

	.name = "cma_test",

	.fops = &cma_test_fops,

};

/*
 * Module entry point: register the misc device, remember its struct device
 * in cma_dev and set its coherent DMA mask to all ones.
 *
 * Returns 0 on success or the error code from misc_register().
 */
static int __init cma_test_init(void)
{
	int err = misc_register(&cma_test_misc);

	if (unlikely(err)) {
		pr_err("failed to register cma test misc device!\n");
		return err;
	}

	cma_dev = cma_test_misc.this_device;
	cma_dev->coherent_dma_mask = ~0;

	_dev_info(cma_dev, "registered.\n");

	return 0;
}

module_init(cma_test_init);

/*
 * Module exit point: free every buffer that is still allocated, then remove
 * the misc device.
 */
static void __exit cma_test_exit(void)
{
	int i;

	/*
	 * Buffers must be released before misc_deregister(): they were
	 * allocated against cma_dev, which belongs to the misc device.
	 */
	for (i = 0; i < CMA_NUM; i++) {
		if (!dma_virt[i])
			continue;

		dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
		dma_virt[i] = NULL;
	}

	misc_deregister(&cma_test_misc);
}

module_exit(cma_test_exit);

MODULE_LICENSE("GPL");

MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");

MODULE_DESCRIPTION("kernel module to help the test of CMA");

MODULE_ALIAS("CMA test");

申请内存：

# echo 0 > /dev/cma_test

释放内存：

# cat /dev/cma_test

码农公寓

1. 声明连续内存

2. 申请连续内存

3. 释放连续内存

4. 内核内存分配的migratetype

相关文章