0. 内存在计算机系统的地位
如下是一张非常简单的系统关系图,描述的是处理器(CPU)、内存(DDR)以及磁盘(Disk)三者之间的关系,可以看到DDR
是作为一个CPU
与Disk
沟通的桥梁。CPU
作为计算机指令的执行中心,其运算速度非常快,但是其却不能存储数据,一旦断电,数据就会消失;Disk
则作为数据的存储中心,具有即使断电数据能够保存的特点。然而CPU
与Disk
的读写速度相差非常大,甚至高达1000多倍,由于木桶效应
,当出现IO
读写需求时,Disk
的低速读写将极大地降低系统性能,所以引入中间件DDR
,DDR
与CPU
类似,同样会断电丢失数据,其读写速度与CPU
仅有10倍左右的差距,将数据缓存到DDR
可以有效改善Disk
速度差的问题。
然而,DDR
由于价格的原因,其存储容量相比Disk
有较大的差异,主流的DDR
仅有8G左右的容量,而Disk
高达1T,甚至更高,同时,CPU
具有访问整个Disk
的需求,在容量较小的DDR
下如何更加有效地访问Disk
就是整个内存管理要处理的问题。
1. 内存共享模型
由于CPU
架构的不断发展,现如今已经发展出来了多核心以及多内存条集成到一个计算机系统的组成架构;那就会衍生出一个问题,每个核心使用同一个主存的优先级以及速度是否相同,为此出现了两种内存共享模型
- UMA(Uniform Memory Access)均匀存储器存取模型(手机、PC):不同核心访问主存的优先级以及速度无差异
- NUMA(Non-Uniform Memory Access)非均匀存储器存取模型(服务器):不同核心访问主存的优先级以及速度有差异
2. Linux物理内存的层次
为了更好地描述物理内存,Linux把物理内存划分为三个层次来管理:
名称 | 说明 |
---|---|
存储节点(Node) | CPU被划分为多个节点(node), 内存则被分簇, 每个CPU对应一个本地物理内存, 即一个CPU-node对应一个内存簇bank,即每个内存簇被认为是一个节点 |
管理区(Zone) | 每个物理节点Node根据地址范围被划分为多个内存管理区,管理区决定了内核使用的映射方式 |
页面(Page) | Page是内存分配的最小单位 |
2.1 存储节点Node
为了支持NUMA
模型,Linux系统把物理内存划分为多个Node
,内核中通过pg_data_t
来描述一个Node
,每个Node
关联到一个处理器,对于PC、手机这种采用UMA
模型的机器来说,系统只有一个全局的Node
:contig_page_data
//kernel/mm/memblock.c
/*
 * Defines one struct pglist_data object named contig_page_data. The
 * definition is compiled in only when CONFIG_NEED_MULTIPLE_NODES is
 * not defined.
 */
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data;
/* Pass the object to the kernel's EXPORT_SYMBOL() macro. */
EXPORT_SYMBOL(contig_page_data);
#endif
//kernel/include/linux/mmzone.h
typedef struct pglist_data {
2.2 管理区Zone
为了应对NUMA
模型,系统把内存划分为多个Node
已经可以解决不同cpu访问不同Node
之间的速度差异问题,那为什么还将内存再次划分呢?原因是计算机组成的硬件中也存在访问内存的诸多限制,为了统一的处理这些限制问题,将内存通过地址大小所在的范围划分为Zone
- ISA总线的直接内存存取(DMA)处理器有一个严格的限制 : 它们只能对RAM的前16MB进行寻址
- 在具有大容量RAM的现代32位计算机中(最大4G), CPU不能直接访问所有的物理地址, 因为线性地址空间太小, 内核不可能直接映射所有物理内存到线性地址空间
Zone
类型定义:
//kernel/include/linux/mmzone.h
/*
 * Identifiers of the memory zones. ZONE_DMA, ZONE_DMA32, ZONE_HIGHMEM and
 * ZONE_DEVICE exist only when the matching CONFIG_* option is defined;
 * __MAX_NR_ZONES is always the last enumerator.
 */
enum zone_type {
#ifdef CONFIG_ZONE_DMA
	/*
	 * ZONE_DMA exists for devices that cannot DMA to all of
	 * addressable memory (ZONE_NORMAL). The portion of memory those
	 * devices require is carved out into this zone. Its range is
	 * architecture-specific.
	 *
	 * Examples:
	 *
	 *   Architecture                   Limit
	 *   --------------------------------------------------
	 *   parisc, ia64, sparc            <4G
	 *   s390, powerpc                  <2G
	 *   arm                            Various
	 *   alpha                          Unlimited or 0-16MB
	 *   i386, x86_64 and multiple
	 *   other arches                   <16M
	 */
	ZONE_DMA,
#endif

#ifdef CONFIG_ZONE_DMA32
	/*
	 * x86_64 requires a second DMA zone: besides devices limited to
	 * the lower 16M, it also supports 32-bit devices that can only
	 * DMA to areas below 4G.
	 */
	ZONE_DMA32,
#endif

	/*
	 * ZONE_NORMAL holds normally addressable memory. Pages in this
	 * zone can be used for DMA as long as the DMA device supports
	 * transfers to all addressable memory.
	 */
	ZONE_NORMAL,

#ifdef CONFIG_HIGHMEM
	/*
	 * Memory the kernel can address only by mapping portions of it
	 * into its own address space. i386, for example, uses this to let
	 * the kernel reach memory beyond 900MB. For every page it needs to
	 * access, the kernel sets up a special mapping (page table entries
	 * on i386).
	 */
	ZONE_HIGHMEM,
#endif

	ZONE_MOVABLE,

#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif

	__MAX_NR_ZONES
};
名称 | 说明 |
---|---|
ZONE_DMA/ZONE_DMA32 | 标记DMA的内存域 |
ZONE_NORMAL | 标记可直接映射到内核段的普通内存域 |
ZONE_HIGHMEM | 标记了超出内核虚拟地址空间的物理内存段,该段内存不能被内核直接映射 |
ZONE_MOVABLE | 引入ZONE_MOVABLE主要是为了优化内存迁移的场景,通过划分Movable以及Non-Movable的内存管理区,仅允许可迁移的页面在Movable的区域申请内存,从而保证当需要申请一块连续的大内存块可以通过迁移页面的方式实现 |
ZONE_DEVICE | 为支持热插拔而分配的Non-Volatile-Memory非易失性内存 |
可以通过如下命令获得系统Zone
信息
cat /proc/zoneinfo
2.3 页面page
page frame
是内存管理的最小单位,以页框为单位进行管理可以提高内存分配以及回收的效率,内核中使用struct page
结构体代表页框,通过联合体的方式尽量减小结构体大小:
//kernel/include/linux/mm_types.h
/*
 * Descriptor of a single page. The first union overlays the field sets
 * used by the different kinds of pages (page cache/anonymous, page_pool,
 * slab allocators, compound tail pages, page table pages, ZONE_DEVICE
 * pages), followed by the mapcount/type union and the reference count.
 */
struct page {
	/* Atomic flags; some of them may be updated asynchronously. */
	unsigned long flags;

	/*
	 * This union provides five words (20/40 bytes).
	 * WARNING: PageTail() uses bit 0 of the first word, so every other
	 * user of the union MUST NOT use that bit; otherwise it collides
	 * and PageTail() reports false positives.
	 */
	union {
		struct {	/* Page cache and anonymous pages */
			/**
			 * @lru: Pageout list, e.g. active_list protected
			 * by pgdat->lru_lock. The page owner sometimes
			 * uses it as a generic list.
			 */
			struct list_head lru;

			/* PAGE_MAPPING_FLAGS: see page-flags.h */
			struct address_space *mapping;

			/* Offset of this page within mapping. */
			pgoff_t index;

			/**
			 * @private: Opaque data private to the mapping.
			 * Usually buffer_heads if PagePrivate.
			 * A swp_entry_t if PageSwapCache.
			 * The order in the buddy system if PageBuddy.
			 */
			unsigned long private;
		};

		struct {	/* page_pool used by netstack */
			/**
			 * @dma_addr: may need a 64-bit value even on
			 * 32-bit architectures.
			 */
			dma_addr_t dma_addr;
		};

		struct {	/* slab, slob and slub */
			union {
				struct list_head slab_list;
				struct {	/* Partial pages */
					struct page *next;
#ifdef CONFIG_64BIT
					int pages;	/* Number of pages left */
					int pobjects;	/* Approximate count */
#else
					short int pages;
					short int pobjects;
#endif
				};
			};

			struct kmem_cache *slab_cache;	/* not slob */

			/* Double-word boundary */
			void *freelist;		/* First free object */

			union {
				void *s_mem;		/* slab: first object */
				unsigned long counters;	/* SLUB */
				struct {		/* SLUB */
					unsigned inuse:16;
					unsigned objects:15;
					unsigned frozen:1;
				};
			};
		};

		struct {	/* Tail pages of compound page */
			unsigned long compound_head;	/* Bit zero is set */

			/* The fields below apply to the first tail page only */
			unsigned char compound_dtor;
			unsigned char compound_order;
			atomic_t compound_mapcount;
		};

		struct {	/* Second tail page of compound page */
			unsigned long _compound_pad_1;	/* compound_head */
			unsigned long _compound_pad_2;

			/* Used for both global and memcg */
			struct list_head deferred_list;
		};

		struct {	/* Page table pages */
			unsigned long _pt_pad_1;	/* compound_head */
			pgtable_t pmd_huge_pte;		/* protected by page->ptl */
			unsigned long _pt_pad_2;	/* mapping */

			union {
				struct mm_struct *pt_mm;	/* x86 pgds only */
				atomic_t pt_frag_refcount;	/* powerpc */
			};
#if ALLOC_SPLIT_PTLOCKS
			spinlock_t *ptl;
#else
			spinlock_t ptl;
#endif
		};

		struct {	/* ZONE_DEVICE pages */
			/** @pgmap: Points to the hosting device page map. */
			struct dev_pagemap *pgmap;
			void *zone_device_data;
			/*
			 * ZONE_DEVICE private pages count as mapped, so the
			 * following 3 words carry the mapping, index and
			 * private fields of the source anonymous or page
			 * cache page while that page is migrated to device
			 * private memory.
			 * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages likewise
			 * use the mapping, index and private fields when
			 * pmem backed DAX files are mapped.
			 */
		};

		/** @rcu_head: Can be used to free a page by RCU. */
		struct rcu_head rcu_head;
	};

	union {		/* This union is 4 bytes in size. */
		/*
		 * For a page that can be mapped to userspace: encodes how
		 * many times a page table references this page.
		 */
		atomic_t _mapcount;

		/*
		 * For a page that is neither PageSlab nor mappable to
		 * userspace: the value here may help determine what the
		 * page is used for. page-flags.h lists the page types
		 * currently stored here.
		 */
		unsigned int page_type;

		unsigned int active;	/* SLAB */
		int units;		/* SLOB */
	};

	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
	atomic_t _refcount;

#ifdef CONFIG_MEMCG
	struct mem_cgroup *mem_cgroup;
#endif

	/*
	 * When all RAM is mapped into the kernel address space, the
	 * virtual address can simply be calculated. With highmem, some
	 * memory is mapped into kernel virtual memory dynamically, so the
	 * address has to be stored somewhere.
	 * Note that this field could be 16 bits on x86 ... ;)
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#ifdef WANT_PAGE_VIRTUAL
	/* Kernel virtual address (NULL if not kmapped, i.e. highmem) */
	void *virtual;
#endif /* WANT_PAGE_VIRTUAL */

#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
	int _last_cpupid;
#endif
} _struct_page_alignment;
其中比较重要的结构体成员:flags
代表当前页的状态,系统通过枚举类型定义一系列状态位
//kernel/include/linux/page-flags.h
/*
 * Bit numbers of the page state flags. The enumerators before
 * __NR_PAGEFLAGS are numbered consecutively from 0; the enumerators after
 * it are aliases that reuse one of those bit numbers.
 */
enum pageflags {
	PG_locked,		/* Page is locked. Don't touch. */
	PG_referenced,
	PG_uptodate,
	PG_dirty,
	PG_lru,
	PG_active,
	PG_workingset,
	/*
	 * Page has waiters, check its waitqueue.
	 * Must be bit #7 and in the same byte as "PG_locked".
	 */
	PG_waiters,
	PG_error,
	PG_slab,
	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use */
	PG_arch_1,
	PG_reserved,
	PG_private,		/* If pagecache, has fs-private data */
	PG_private_2,		/* If pagecache, has fs aux data */
	PG_writeback,		/* Page is under writeback */
	PG_head,		/* A head page */
	PG_mappedtodisk,	/* Has blocks allocated on-disk */
	PG_reclaim,		/* To be reclaimed asap */
	PG_swapbacked,		/* Page is backed by RAM/swap */
	PG_unevictable,		/* Page is "unevictable" */
#ifdef CONFIG_MMU
	PG_mlocked,		/* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
	PG_uncached,		/* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
	PG_hwpoison,		/* Hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
	PG_young,
	PG_idle,
#endif

	__NR_PAGEFLAGS,

	/* Filesystems */
	PG_checked = PG_owner_priv_1,

	/* SwapBacked; swap page: swp_entry_t in private */
	PG_swapcache = PG_owner_priv_1,

	/*
	 * FS-Cache conscripts two page bits to maintain local caching
	 * state. They are set on pages belonging to the netfs's inodes
	 * while those inodes are being cached locally.
	 */
	PG_fscache = PG_private_2,	/* Page backed by cache */

	/* XEN */
	/* Pinned in Xen as a read-only pagetable page. */
	PG_pinned = PG_owner_priv_1,
	/* Pinned as part of domain save (see xen_mm_pin_all()). */
	PG_savepinned = PG_dirty,
	/* Has a grant mapping of another (foreign) domain's page. */
	PG_foreign = PG_owner_priv_1,
	/* Remapped by swiotlb-xen. */
	PG_xen_remapped = PG_owner_priv_1,

	/* SLOB */
	PG_slob_free = PG_private,

	/* Compound pages. Stored in first tail page's flags */
	PG_double_map = PG_private_2,

	/* Non-lru isolated movable page */
	PG_isolated = PG_reclaim,
};