linux kernel的virtual kernel memory layout介绍(aarch64)

相关文件:
memory.h
pgtable.h
fixmap.h
page.h

1、重要的配置
我们就以VA_BITS=48,PAGE_SIZE=4k来介绍
(1)、(VA_BITS)
(arch/arm64/Kconfig)

config ARM64_VA_BITS_36
	bool "36-bit" if EXPERT
	depends on ARM64_16K_PAGES

config ARM64_VA_BITS_39
	bool "39-bit"
	depends on ARM64_4K_PAGES

config ARM64_VA_BITS_42
	bool "42-bit"
	depends on ARM64_64K_PAGES

config ARM64_VA_BITS_47
	bool "47-bit"
	depends on ARM64_16K_PAGES

config ARM64_VA_BITS_48
	bool "48-bit"
CONFIG_ARM64_VA_BITS_48=y
CONFIG_ARM64_VA_BITS=48

(2)、(PAGE_SIZE、PAGE_SHIFT)
如果选择了ARM64_4K_PAGES,那么PAGE_SIZE = 4K,PAGE_SHIFT=12

/*
 * Page geometry: PAGE_SHIFT is 16 with 64K pages, 14 with 16K pages and
 * 12 otherwise (the 4K-page case used throughout this article).
 */
#ifdef CONFIG_ARM64_64K_PAGES
#define PAGE_SHIFT		16
#define CONT_SHIFT		5
#elif defined(CONFIG_ARM64_16K_PAGES)
#define PAGE_SHIFT		14
#define CONT_SHIFT		7
#else
#define PAGE_SHIFT		12
#define CONT_SHIFT		4
#endif
/* PAGE_SIZE is 1 << PAGE_SHIFT; PAGE_MASK clears the low PAGE_SHIFT bits. */
#define PAGE_SIZE		(_AC(1, UL) << PAGE_SHIFT)
#define PAGE_MASK		(~(PAGE_SIZE-1))

2、kernel 4.4代码 : 计算各个区域的地址
(memory.h):

/* Number of virtual address bits, taken from the kernel configuration. */
#define VA_BITS			(CONFIG_ARM64_VA_BITS)
/* VA_START = 2^64 - 2^VA_BITS: base of the top 2^VA_BITS bytes of the address space. */
#define VA_START		(UL(0xffffffffffffffff) - \
	(UL(1) << VA_BITS) + 1)
/* PAGE_OFFSET = 2^64 - 2^(VA_BITS - 1): base of the upper half of that range. */
#define PAGE_OFFSET		(UL(0xffffffffffffffff) - \
	(UL(1) << (VA_BITS - 1)) + 1)
/* The kernel image address equals the end of the module region. */
#define KIMAGE_VADDR		(MODULES_END)
#define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)
/* The module region begins KASAN_SHADOW_SIZE bytes above VA_START. */
#define MODULES_VADDR		(VA_START + KASAN_SHADOW_SIZE)
#define MODULES_VSIZE		(SZ_128M)
/*
 * The PCI I/O window ends 2MB below PAGE_OFFSET and is PCI_IO_SIZE long;
 * FIXADDR_TOP sits a further 2MB below the start of that window.
 */
#define PCI_IO_END		(PAGE_OFFSET - SZ_2M)
#define PCI_IO_START		(PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP		(PCI_IO_START - SZ_2M)
#define TASK_SIZE_64		(UL(1) << VA_BITS)
/* With KASAN the shadow size is 2^(VA_BITS - 3) bytes; without it, zero. */
#ifdef CONFIG_KASAN
#define KASAN_SHADOW_SIZE	(UL(1) << (VA_BITS - 3))
#else
#define KASAN_SHADOW_SIZE	(0)
#endif
/* The definitions below repeat or expand values already used above. */
#define MODULES_VSIZE		(SZ_128M)
#define SZ_128M				0x08000000
#define PCI_IO_SIZE		SZ_16M  //(0x01000000)
#define PCI_IO_END		(PAGE_OFFSET - SZ_2M)
#define PCI_IO_START		(PCI_IO_END - PCI_IO_SIZE)

(fixmap.h)

/*
 * FIXADDR_SIZE is __end_of_permanent_fixed_addresses pages; the region
 * spans from FIXADDR_START up to FIXADDR_TOP.
 */
#define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)

(pgtable.h)

/* One struct page for each of the 2^(VA_BITS - PAGE_SHIFT) pages, aligned to PUD_SIZE. */
#define VMEMMAP_SIZE		ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE)
/* vmalloc begins at the end of the module region. */
#define VMALLOC_START		(MODULES_END)
/* vmalloc ends PUD_SIZE + VMEMMAP_SIZE + 64K below PAGE_OFFSET. */
#define VMALLOC_END		(PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K)
/* vmemmap begins 64K above the end of vmalloc. */
#define VMEMMAP_START		(VMALLOC_END + SZ_64K)

KASAN_SHADOW_SIZE = 0x2000_0000_0000
MODULES_VSIZE = 0x0800_0000

VA_BITS = 48
VA_START = 0xffff_0000_0000_0000
PAGE_OFFSET = 0xffff_8000_0000_0000

MODULES_VADDR=(VA_START + KASAN_SHADOW_SIZE) = 0xffff_2000_0000_0000
KIMAGE_VADDR = (MODULES_END) = (MODULES_VADDR + MODULES_VSIZE) = 0xffff_2000_0800_0000
KIMAGE_VADDR = 0xffff_2000_0800_0000

PCI_IO_END = (PAGE_OFFSET - SZ_2M) = 0xffff_8000_0000_0000 - 0x0020_0000 = 0xffff_7fff_ffe0_0000
PCI_IO_START = (PCI_IO_END - PCI_IO_SIZE) = 0xffff_7fff_ffe0_0000 - 0x0100_0000 = 0xffff_7fff_fee0_0000
FIXADDR_TOP = (PCI_IO_START - SZ_2M) = 0xffff_7fff_fee0_0000 - 0x0020_0000 = 0xffff_7fff_fec0_0000 //(end addr)
PCI_IO_START = 0xffff_7fff_fee0_0000

可见VMALLOC_START和KIMAGE_VADDR都等于MODULES_END,两者的起始地址是相同的,也就是说kernel image被搬移到了VMALLOC区域

再看VMEMMAP_SIZE :
(例如VA_BITS=48, PAGE_SHIFT=12的情况下)
(1UL << (VA_BITS - PAGE_SHIFT)) 表示48位的有效虚拟地址,一共可以表示多少个page页
再乘以sizeof(struct page), 表示需要多少内存来存储struct page
也就是说VMEMMAP是用来存储所有页面的struct page结构体的

结合以上地址,我们画了张图,更直观
(图:linux kernel的virtual kernel memory layout介绍(aarch64),kernel 4.4)

3、kernel 4.14代码 : 计算各个区域的地址
(memory.h):

/* Number of virtual address bits, taken from the kernel configuration. */
#define VA_BITS			(CONFIG_ARM64_VA_BITS)
/* VA_START = 2^64 - 2^VA_BITS: base of the top 2^VA_BITS bytes of the address space. */
#define VA_START		(UL(0xffffffffffffffff) - \
	(UL(1) << VA_BITS) + 1)
/* PAGE_OFFSET = 2^64 - 2^(VA_BITS - 1): base of the upper half of that range. */
#define PAGE_OFFSET		(UL(0xffffffffffffffff) - \
	(UL(1) << (VA_BITS - 1)) + 1)
/* The kernel image address equals the end of the module region. */
#define KIMAGE_VADDR		(MODULES_END)
#define MODULES_END		(MODULES_VADDR + MODULES_VSIZE)
/* The module region begins KASAN_SHADOW_SIZE bytes above VA_START. */
#define MODULES_VADDR		(VA_START + KASAN_SHADOW_SIZE)
#define MODULES_VSIZE		(SZ_128M)
/* In this listing vmemmap ends exactly at PAGE_OFFSET. */
#define VMEMMAP_START		(PAGE_OFFSET - VMEMMAP_SIZE)
/*
 * The PCI I/O window ends 2MB below vmemmap, and FIXADDR_TOP sits a
 * further 2MB below the start of that window.
 */
#define PCI_IO_END		(VMEMMAP_START - SZ_2M)
#define PCI_IO_START		(PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP		(PCI_IO_START - SZ_2M)

同样,我们也画了一张图:
(图:linux kernel的virtual kernel memory layout介绍(aarch64),kernel 4.14)

4、kernel image搬移到vmalloc区域后,virt_to_phys的变化
virt_to_phys的作用是将内核虚拟地址转换成物理地址(针对线性映射区域)。
在kernel image还在线性映射区域的时候,virt_to_phys宏可以将kernel代码中的一个地址转换成物理地址,因为线性映射区域,物理地址和虚拟地址只有一个偏移。因此两者很容易转换。
那么现在kernel image和线性映射区域分开了,virt_to_phys宏又该如何实现呢?

在kernel中(VA_BITS=48时)PAGE_OFFSET = 0xffff_8000_0000_0000

/* PAGE_OFFSET = 2^64 - 2^(VA_BITS - 1); bit (VA_BITS - 1) and every bit above it are set. */
#define PAGE_OFFSET		(UL(0xffffffffffffffff) - \
	(UL(1) << (VA_BITS - 1)) + 1)

当virt_to_phys调用时候,先判断bit47(最高有效位),如果为1,则表示是(memory)DRAM的地址。那么直接使用X[46:0]和PHYS_OFFSET相加即可

/*
 * Translate a kernel virtual address to a physical address.
 *
 * If bit (VA_BITS - 1) of the address is set, the bits covered by
 * PAGE_OFFSET are cleared and PHYS_OFFSET is added. Otherwise
 * kimage_voffset is subtracted from the whole address.
 */
#define __virt_to_phys(x) ({						\
	phys_addr_t __x = (phys_addr_t)(x);				\
	__x & BIT(VA_BITS - 1) ? (__x & ~PAGE_OFFSET) + PHYS_OFFSET :	\
				 (__x - kimage_voffset); })

PHYS_OFFSET是DRAM的起始物理地址,它的值就是memstart_addr,而memstart_addr是这样计算出来的:

memstart_addr = round_down(memblock_start_of_DRAM(),
				   ARM64_MEMSTART_ALIGN);

当virt_to_phys调用时候,先判断bit47(最高有效位),如果为0,则表示是kernel image的地址, 那么直接用这个虚拟地址减去kimage_voffset即可(对应代码中的 __x - kimage_voffset)

kimage_voffset来自汇编中的__mmap_switched函数

str_l	x21, __fdt_pointer, x5		// Save FDT pointer

ldr_l	x4, kimage_vaddr		// Save the offset between
sub	x4, x4, x24			// the kernel virtual and
str_l	x4, kimage_voffset, x5		// physical mappings

综上所述,virt_to_phys()当前能够转换的依然还是:线性区域、kimg区域(kernel image区域)

5、PCI/IO区域
如X86处理器为外设专门实现了一个单独的地址空间,称为"I/O地址空间"或者"I/O端口空间",CPU通过专门的I/O指令(如X86的IN和OUT指令)来访问这一空间中的地址单元
而arm64没有独立的I/O地址空间,也没有专门的I/O指令;内核把PCI的I/O空间映射到一段固定的虚拟地址区域(也就是上面的PCI I/O区域),再用普通的内存读写来实现inb/outb、insb/outsb这类端口访问接口
具体在kernel/include/asm-generic/io.h中:
其中PCI_IOBASE对应的就是PCI/IO的基地址(PCI_IO_START)。

	 #ifndef insb
#define insb insb
/*
 * Fallback used only when insb is not already defined. The port number
 * 'addr' is added to PCI_IOBASE and the access is forwarded to readsb()
 * together with 'buffer' and 'count'.
 */
static inline void insb(unsigned long addr, void *buffer, unsigned int count)
{
	readsb(PCI_IOBASE + addr, buffer, count);
}
#endif

#ifndef insw
#define insw insw
/*
 * Fallback used only when insw is not already defined. The port number
 * 'addr' is added to PCI_IOBASE and the access is forwarded to readsw()
 * together with 'buffer' and 'count'.
 */
static inline void insw(unsigned long addr, void *buffer, unsigned int count)
{
	readsw(PCI_IOBASE + addr, buffer, count);
}
#endif

#ifndef insl
#define insl insl
/*
 * Fallback used only when insl is not already defined. The port number
 * 'addr' is added to PCI_IOBASE and the access is forwarded to readsl()
 * together with 'buffer' and 'count'.
 */
static inline void insl(unsigned long addr, void *buffer, unsigned int count)
{
	readsl(PCI_IOBASE + addr, buffer, count);
}
#endif

6、ioremap
那么ioremap映射到了哪个区域呢?

(arch/arm64/include/asm/io.h)

/*
 * All ioremap variants expand to __ioremap() and differ only in the page
 * protection passed: ioremap, ioremap_nocache and ioremap_wt use
 * PROT_DEVICE_nGnRE, while ioremap_wc uses PROT_NORMAL_NC.
 */
#define ioremap(addr, size)		__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
#define ioremap_nocache(addr, size)	__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
#define ioremap_wc(addr, size)		__ioremap((addr), (size), __pgprot(PROT_NORMAL_NC))
#define ioremap_wt(addr, size)		__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
#define iounmap				__iounmap

(ioremap.c)

/*
 * Map the physical range [phys_addr, phys_addr + size) with protection
 * 'prot'. Thin wrapper: forwards to __ioremap_caller(), passing this
 * function's own return address as the 'caller' argument.
 */
void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot)
{
	return __ioremap_caller(phys_addr, size, prot,
				__builtin_return_address(0));
}
EXPORT_SYMBOL(__ioremap);
/*
 * Map a physical range into a freshly obtained VM_IOREMAP area.
 *
 * @phys_addr: physical start address; it need not be page aligned
 * @size:      length of the range in bytes
 * @prot:      page protection passed to ioremap_page_range()
 * @caller:    passed through to get_vm_area_caller()
 *
 * Returns the mapped address with the original in-page offset added back,
 * or NULL if the range is empty, wraps around, lies outside PHYS_MASK,
 * refers to a valid pfn (RAM), or if the area or mapping cannot be set up.
 */
static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
				      pgprot_t prot, void *caller)
{
	unsigned long last_addr;
	/* Offset of phys_addr within its page; added back to the result. */
	unsigned long offset = phys_addr & ~PAGE_MASK;
	int err;
	unsigned long addr;
	struct vm_struct *area;

	/*
	 * Page align the mapping address and size, taking account of any
	 * offset.
	 */
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(size + offset);

	/*
	 * Don't allow wraparound, zero size or outside PHYS_MASK.
	 */
	last_addr = phys_addr + size - 1;
	if (!size || last_addr < phys_addr || (last_addr & ~PHYS_MASK))
		return NULL;

	/*
	 * Don't allow RAM to be mapped.
	 */
	if (WARN_ON(pfn_valid(__phys_to_pfn(phys_addr))))
		return NULL;

	/* Obtain a VM_IOREMAP area of the requested size. */
	area = get_vm_area_caller(size, VM_IOREMAP, caller);
	if (!area)
		return NULL;
	addr = (unsigned long)area->addr;
	area->phys_addr = phys_addr;

	/* Map the physical range at the area's address; undo the area on failure. */
	err = ioremap_page_range(addr, addr + size, phys_addr, prot);
	if (err) {
		vunmap((void *)addr);
		return NULL;
	}

	return (void __iomem *)(offset + addr);
}

调用了get_vm_area_caller(size, VM_IOREMAP, caller)
(vmalloc.c)

/*
 * Obtain a vm_struct of 'size' bytes within the caller-supplied
 * [start, end) range. Forwards to __get_vm_area_node() with an alignment
 * of 1, NUMA_NO_NODE and GFP_KERNEL.
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
				       unsigned long start, unsigned long end,
				       const void *caller)
{
	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
				  GFP_KERNEL, caller);
}
/*
 * Allocate a vm_struct descriptor and a matching vmap_area.
 *
 * 'start', 'end', 'align', 'node' and 'gfp_mask' are passed on to
 * alloc_vmap_area(); 'flags' and 'caller' are passed to setup_vmalloc_vm().
 * Must not be called from interrupt context (BUG_ON).
 *
 * Returns the new vm_struct, or NULL if the page-aligned size is zero or
 * either allocation fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	BUG_ON(in_interrupt());
	/*
	 * For ioremap areas the caller's alignment is replaced by a power of
	 * two whose exponent is fls_long(size) clamped to the range
	 * [PAGE_SHIFT, IOREMAP_MAX_ORDER].
	 */
	if (flags & VM_IOREMAP)
		align = 1ul << clamp_t(int, fls_long(size),
				       PAGE_SHIFT, IOREMAP_MAX_ORDER);

	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;

	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	/* Reserve one extra page unless VM_NO_GUARD is set. */
	if (!(flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		/* No address range obtained: release the descriptor again. */
		kfree(area);
		return NULL;
	}

	setup_vmalloc_vm(area, va, flags, caller);

	return area;
}

具体代码不再深究。由于get_vm_area_caller()是以VMALLOC_START和VMALLOC_END作为分配范围去调用__get_vm_area_node()的,所以可以知道ioremap是在vmalloc区域分配的

上一篇:一次华为昇腾服务器OS部署过程


下一篇:android 编译 openfst openblas arm64