mempolicy相关的系统调用主要有set_mempolicy、get_mempolicy和mbind,分别用于配置/查询task(process) policy以及配置vma policy,如下图:
set_mempolicy
set_mempolicy()系统调用的主要作用是修改当前进程的NUMA mem policy策略,之后该进程创建的子进程也会继承该策略,API为:
#include <numaif.h>
long set_mempolicy(int mode, const unsigned long * nodemask,unsigned long maxnode);
使用该系统调用要链接numa库:-lnuma。
参数:
- int mode: mem policy配置策略模式,该字段包含两个部分:一部分是用户设置的mempolicy mode,另一部分为mem policy的外部flag(extern flag),如下图:
- mode 部分主要包括MPOL_DEFAULT、MPOL_PREFERRED、MPOL_BIND、MPOL_INTERLEAVE、MPOL_LOCAL等字段,
- extern flag主要包含MPOL_F_STATIC_NODES和MPOL_F_RELATIVE_NODES字段,并在5.12版本加入MPOL_F_NUMA_BALANCING字段。
- 上述字段具体描述《linux内核那些事之mempolicy(1)》
- const unsigned long * nodemask: 该进程NUMA mem policy所要使用的numa节点集合,以bit位图表示,每个bit对应一个numa节点。
- unsigned long maxnode:nodemask位图中有效的bit位数,即nodemask最多可以描述的节点个数。
kernel_set_mempolicy
kernel_set_mempolicy为set_mempolicy系统调用 kernel层处理函数:
/*
 * kernel_set_mempolicy - in-kernel handler behind set_mempolicy().
 * @mode:    user value carrying both the policy mode and the optional
 *           mode flags selected by MPOL_MODE_FLAGS
 * @nmask:   user-space node bitmap
 * @maxnode: size argument forwarded to get_nodes() together with @nmask
 *
 * Splits @mode into flags and bare mode, validates both, converts the
 * user bitmap with get_nodes() and hands the result to
 * do_set_mempolicy().
 *
 * Returns -EINVAL when the bare mode is >= MPOL_MAX or when both
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are requested, the
 * error reported by get_nodes() if it fails, and otherwise whatever
 * do_set_mempolicy() returns.
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
	nodemask_t node_set;
	int rc;

	/* Drop the flag bits so that only the bare policy mode is left. */
	mode &= ~MPOL_MODE_FLAGS;

	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;

	/* Static and relative node flags may not be combined. */
	if (mode_flags & MPOL_F_STATIC_NODES) {
		if (mode_flags & MPOL_F_RELATIVE_NODES)
			return -EINVAL;
	}

	rc = get_nodes(&node_set, nmask, maxnode);
	if (rc != 0)
		return rc;

	return do_set_mempolicy(mode, mode_flags, &node_set);
}
- flags = mode & MPOL_MODE_FLAGS: 从mode中取出 extern flags状态标志位。
- mode &= ~MPOL_MODE_FLAGS:从mode参数中取出mempolicy mode。
- 对mode 以及flags进行参数检查。
- get_nodes:将nmask node节点转换成nodemask_t 结构。
- 调用do_set_mempolicy 修改当前进程current->mempolicy mempolicy。
get_mempolicy
get_mempolicy 用于获取当前进程或者具体内存地址mempolicy策略,API为:
#include <numaif.h>
long get_mempolicy(int *mode, unsigned long *nodemask, unsigned long maxnode, void *addr, unsigned long flags);
参数:
- unsigned long flags: 如果flags为0,表明获取的是当前线程的mempolicy,此时addr参数必须为NULL。如果flags为MPOL_F_MEMS_ALLOWED,将会忽略mode参数,nodemask将会返回当前线程允许使用的numa节点集合。如果flags为MPOL_F_ADDR,结果将会返回指定的addr地址对应的mempolicy及其numa节点。
- int *mode:输出参数,如果该指针不为NULL,则通过它返回mempolicy mode(当flags带有MPOL_F_NODE时,返回的是node id)。
- unsigned long *nodemask:为unsigned long类型数组,用作节点位图,至少要能容纳maxnode个bit,即数组元素个数为maxnode/(8*sizeof(unsigned long))向上取整。
- unsigned long maxnode:nodemask位图中的bit位数,即最多可以返回的numa节点个数。
kernel_get_mempolicy
kernel_get_mempolicy为对应get_mempolicy kernel处理函数:
/*
 * kernel_get_mempolicy - in-kernel handler behind get_mempolicy().
 * @policy:  optional user pointer that receives the value produced by
 *           do_get_mempolicy()
 * @nmask:   optional user buffer that receives the node mask
 * @maxnode: size of @nmask as supplied by the caller
 * @addr:    address argument, passed through untagged_addr() first
 * @flags:   forwarded unchanged to do_get_mempolicy()
 *
 * Returns -EINVAL when @nmask is given but @maxnode is below
 * nr_node_ids, the error from do_get_mempolicy() if it fails, -EFAULT
 * when put_user() on @policy fails, and otherwise 0 or the result of
 * copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	nodemask_t node_set;
	int uninitialized_var(mode_val);
	int rc;

	addr = untagged_addr(addr);

	/* Refuse a user mask described as smaller than nr_node_ids. */
	if (nmask != NULL) {
		if (maxnode < nr_node_ids)
			return -EINVAL;
	}

	rc = do_get_mempolicy(&mode_val, &node_set, addr, flags);
	if (rc != 0)
		return rc;

	/* Both outputs are optional; write back only what was asked for. */
	if (policy != NULL) {
		if (put_user(mode_val, policy))
			return -EFAULT;
	}

	if (nmask == NULL)
		return 0;

	return copy_nodes_to_user(nmask, maxnode, &node_set);
}
调用do_get_mempolicy 获取mempolicy结果。
do_get_mempolicy
do_get_mempolicy为get_mempolicy核心处理函数:
/*
 * do_get_mempolicy - retrieve NUMA policy information for the caller.
 * @policy: output; receives the policy mode OR-ed with the flag bits in
 *          MPOL_MODE_FLAGS, or a node id when MPOL_F_NODE is set
 * @nmask:  output node mask; written unconditionally on the
 *          MPOL_F_MEMS_ALLOWED path, otherwise only when non-NULL
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set
 * @flags:  combination of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
 *
 * Returns 0 on success, -EINVAL for an unsupported flag/addr
 * combination, -EFAULT when no vma intersects @addr, or the negative
 * value returned by lookup_node().
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
/*
 * Start from the task policy. pol_refcount records a policy on which an
 * extra reference was taken so it can be dropped at "out".
 */
struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
/* Any flag bit outside the three supported ones is an error. */
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
/*
 * MPOL_F_MEMS_ALLOWED is exclusive with the other two flags: it only
 * copies cpuset_current_mems_allowed into *nmask under task_lock().
 */
if (flags & MPOL_F_MEMS_ALLOWED) {
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
if (flags & MPOL_F_ADDR) {
/*
 * Do NOT fall back to task policy if the
 * vma/shared policy at addr is NULL. We
 * want to return MPOL_DEFAULT in this case.
 */
mmap_read_lock(mm);
/* Look for a vma overlapping the single byte at addr. */
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
mmap_read_unlock(mm);
return -EFAULT;
}
/* Prefer the vma's own get_policy hook when it provides one. */
if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr);
else
pol = vma->vm_policy;
} else if (addr)
/* A non-zero addr is only meaningful together with MPOL_F_ADDR. */
return -EINVAL;
if (!pol)
pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
/* With MPOL_F_NODE, *policy carries a node id instead of a mode. */
if (flags & MPOL_F_ADDR) {
/*
 * Take a refcount on the mpol, lookup_node()
 * will drop the mmap_lock, so after calling
 * lookup_node() only "pol" remains valid, "vma"
 * is stale.
 */
pol_refcount = pol;
/* Clearing vma also keeps "out" from unlocking mmap_lock again. */
vma = NULL;
mpol_get(pol);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
/*
 * Task-level interleave policy: report the node that follows
 * current->il_prev within the policy's node set.
 */
*policy = next_node_in(current->il_prev, pol->v.nodes);
} else {
err = -EINVAL;
goto out;
}
} else {
/* The default_policy placeholder is reported as MPOL_DEFAULT. */
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
/*
 * Internal mempolicy flags must be masked off before exposing
 * the policy to userspace.
 */
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
err = 0;
if (nmask) {
/*
 * Return the stored user nodemask when mpol_store_user_nodemask()
 * says so; otherwise derive the mask from the policy under
 * task_lock().
 */
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
}
out:
/*
 * Common exit: conditional put of pol, release mmap_lock if it is still
 * held (vma is non-NULL only in that case), then drop the extra
 * reference taken before lookup_node().
 */
mpol_cond_put(pol);
if (vma)
mmap_read_unlock(mm);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
}
根据传入的参数不同场景获取numa 节点,不再详细分析。
mbind
mbind系统调用为设置vma mempolicy级别,可以只设置某一个内存区域NUMA节点策略:
#include <numaif.h>
long mbind(void *addr, unsigned long len, int mode, const unsigned long *nodemask, unsigned long maxnode, unsigned int flags);
参数
- unsigned int flags:flag支持MPOL_MF_STRICT、MPOL_MF_MOVE、MPOL_MF_MOVE_ALL.
当flags为MPOL_MF_STRICT且mode 不是MPOL_DEFAULT时,意思是要严格遵循所设置的numa策略,如果已经申请的物理页和 numa 配置节点策略不一致则直接返回EIO。
flags为MPOL_MF_MOVE:内核会尝试将已经分配的内存 不符合设置numa节点要求给进行迁移,迁移到符合配置numa节点上去。
flags为MPOL_MF_MOVE_ALL:内核会尝试将已经分配的、不符合所设置numa节点要求的内存进行迁移,迁移该物理页时不管其他进程是否也在使用都进行迁移。
- 其他参数与上面API意思相同。
do_mbind
do_mbind函数为mbind系统调用核心处理函数:
/*
 * do_mbind - apply a memory policy to the address range [start, start+len).
 * @start:      start address; must have no bits set outside PAGE_MASK
 * @len:        length in bytes; rounded up to a multiple of PAGE_SIZE
 * @mode:       policy mode passed to mpol_new()
 * @mode_flags: policy mode flags passed to mpol_new()
 * @nmask:      node mask for the new policy
 * @flags:      MPOL_MF_* behaviour flags
 *
 * Returns 0 on success or for an empty range, -EINVAL for invalid
 * flags, a misaligned start or a wrapping range, -EPERM when
 * MPOL_MF_MOVE_ALL is requested without CAP_SYS_NICE, -ENOMEM when the
 * scratch nodemask is unavailable, -EIO when queue_pages_range()
 * returns a positive value or migration fails under MPOL_MF_STRICT, or
 * the error reported by mpol_new(), migrate_prep(),
 * mpol_set_nodemask(), queue_pages_range() or mbind_range().
 */
static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
int err;
int ret;
/* Pages collected by queue_pages_range() for later migration. */
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
/* MPOL_MF_MOVE_ALL is only allowed with CAP_SYS_NICE. */
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
return -EINVAL;
/* MPOL_MF_STRICT is ignored for MPOL_DEFAULT. */
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
/* Round the length up to whole pages. */
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
end = start + len;
/* end < start means the range wrapped around the address space. */
if (end < start)
return -EINVAL;
if (end == start)
return 0;
new = mpol_new(mode, mode_flags, nmask);
if (IS_ERR(new))
return PTR_ERR(new);
/*
 * NOTE(review): "new" is dereferenced here, yet the test just below
 * treats "new" as possibly NULL. Whether MPOL_MF_LAZY can get past the
 * MPOL_MF_VALID check above is not visible in this block - confirm.
 */
if (flags & MPOL_MF_LAZY)
new->flags |= MPOL_F_MOF;
/*
 * If we are using the default policy then operation
 * on discontinuous address spaces is okay after all
 */
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
start, start + len, mode, mode_flags,
nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
/* Migration needs migrate_prep() to be called first. */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
err = migrate_prep();
if (err)
goto mpol_out;
}
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
/*
 * mmap_lock is taken for writing here and stays held on success.
 * On failure it is released right away, because the mpol_out path
 * below does not unlock it.
 */
mmap_write_lock(mm);
task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
task_unlock(current);
if (err)
mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
}
if (err)
goto mpol_out;
/*
 * Scan the range with MPOL_MF_INVERT added to the flags and collect
 * pages on pagelist. A negative result is an error; a positive result
 * is turned into -EIO further down.
 */
ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
if (ret < 0) {
err = ret;
goto up_out;
}
/* Install the new policy on the range. */
err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_page, NULL,
start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
/* Put back whatever could not be migrated. */
if (nr_failed)
putback_movable_pages(&pagelist);
}
if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
} else {
/*
 * Shared failure cleanup: also entered by goto when
 * queue_pages_range() fails. Queued pages are put back.
 */
up_out:
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
}
mmap_write_unlock(mm);
mpol_out:
/* Drop this function's reference on the new policy. */
mpol_put(new);
return err;
}
- mbind_range:将指定的内存区域进行内存策略绑定,如果绑定成功则需要判断是否已经分配物理页。
- 如果已经分配物理内存,则会根据设置的flags要求,决定是否做页迁移动作,如果需要进行页迁移则调用migrate_pages进行页迁移。