11.1 do_page_fault()缺页中断核心函数

缺页中断处理的核心函数是do_page_fault(),该函数的实现和具体的体系结构相关。下面以ARM体系结构(arch/arm/mm/fault.c)的实现为例进行分析。

[arch/arm/mm/fault.c]

/*
 * do_page_fault() - top-level page-fault handler (the ARM version, quoted
 * from arch/arm/mm/fault.c).
 * @addr: faulting virtual address
 * @fsr:  fault status value; FSR_WRITE is tested to detect a write access
 * @regs: register state at the time of the fault
 *
 * Outline of what the code below does:
 *   1. Give notify_page_fault() a chance to consume the fault.
 *   2. Refuse to handle the fault in atomic context or without an mm
 *      (goto no_context).
 *   3. Take mm->mmap_sem for reading.
 *   4. Call __do_page_fault(); retry at most once on VM_FAULT_RETRY.
 *   5. On error: kernel mode goes to no_context, user mode gets either
 *      the OOM path or a signal via __do_user_fault().
 *
 * Returns 0 on every path visible here.
 */
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
    struct task_struct *tsk;
    struct mm_struct *mm;
    int fault, sig, code;
    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

    if (notify_page_fault(regs, fsr))
        return 0;

    tsk = current;
    mm  = tsk->mm;

    /* Enable interrupts if they were enabled in the parent context. */
    if (interrupts_enabled(regs))
        local_irq_enable();

    /*
     * If we're in an interrupt or have no user
     * context, we must not take the fault..
     */
    /*
     * in_atomic() covers the "in an interrupt" part of the comment above
     * (NOTE(review): presumably it also reports preemption-disabled
     * sections - confirm).  A NULL mm means the current task has no user
     * address space (presumably a kernel thread).  Both cases are routed
     * to no_context, i.e. __do_kernel_fault().
     *
     * Note that faults taken in kernel mode are NOT excluded here: they
     * continue below and are told apart with user_mode(regs).
     */
    if (in_atomic() || !mm)
        goto no_context; /* handled by __do_kernel_fault(): fixup or Oops */

    /* Record in flags whether the fault came from user mode ... */
    if (user_mode(regs))
        flags |= FAULT_FLAG_USER;
    /* ... and whether it was caused by a write access. */
    if (fsr & FSR_WRITE)
        flags |= FAULT_FLAG_WRITE;

    /*
     * As per x86, we may deadlock here.  However, since the kernel only
     * validly references user space from well defined areas of the code,
     * we can bug out early if this is from code which shouldn't.
     */
    /*
     * down_read_trylock() attempts to take mm->mmap_sem for reading
     * without sleeping; a zero return is treated as "not acquired".
     * When the lock is contended there are two cases:
     *   - fault from user mode: sleep in down_read() until the lock is
     *     available;
     *   - fault from kernel mode: only sleep if the faulting PC is found
     *     by search_exception_tables(); otherwise go to no_context.
     */
    if (!down_read_trylock(&mm->mmap_sem)) {
        /*
         * search_exception_tables() looks up the faulting PC.
         * NOTE(review): presumably a hit means the instruction is one of
         * the "well defined areas" that may legitimately fault on a user
         * address and has fixup code - confirm against the kernel docs.
         */
        if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
            goto no_context;
retry:
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case, we'll have missed the might_sleep() from
         * down_read()
         */
        might_sleep();
#ifdef CONFIG_DEBUG_VM
        if (!user_mode(regs) &&
            !search_exception_tables(regs->ARM_pc))
            goto no_context;
#endif
    }
    /*
     * __do_page_fault() returns a set of VM_FAULT_* bits.  It
     *   1. looks up the vma covering addr,
     *   2. checks the access permissions,
     *   3. calls handle_mm_fault().
     */
    fault = __do_page_fault(mm, addr, fsr, flags, tsk);

    /* If we need to retry but a fatal signal is pending, handle the
     * signal first. We do not need to release the mmap_sem because
     * it would already be released in __lock_page_or_retry in
     * mm/filemap.c. */
    if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
        return 0;

    /*
     * Major/minor page fault accounting is only done on the
     * initial attempt. If we go through a retry, it is extremely
     * likely that the page will be found in page cache at that point.
     */

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
                    regs, addr);
        } else {
            tsk->min_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
                    regs, addr);
        }
        if (fault & VM_FAULT_RETRY) {
            /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
            * of starvation. */
            flags &= ~FAULT_FLAG_ALLOW_RETRY;
            flags |= FAULT_FLAG_TRIED;
            goto retry;
        }
    }

    up_read(&mm->mmap_sem);

    /*
     * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR
     */
    /*
     * None of VM_FAULT_ERROR, VM_FAULT_BADMAP or VM_FAULT_BADACCESS is
     * set: the fault has been handled successfully.
     */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
        return 0;

    /*
     * If we are in kernel mode at this point, we
     * have no context to handle this fault with.
     */
    /*
     * __do_page_fault() reported an error and the fault was taken in
     * kernel mode: let __do_kernel_fault() deal with it.
     */
    if (!user_mode(regs))
        goto no_context;

    /*
     * VM_FAULT_OOM: memory could not be obtained, so
     * pagefault_out_of_memory() is called to start OOM handling.
     */
    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we
         * got oom-killed)
         */
        pagefault_out_of_memory();
        return 0;
    }

    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to
         * successfully fix up this page fault.
         */
        sig = SIGBUS;
        code = BUS_ADRERR;
    } else {
        /*
         * Something tried to access memory that
         * isn't in our memory map..
         */
        sig = SIGSEGV;
        code = fault == VM_FAULT_BADACCESS ?
            SEGV_ACCERR : SEGV_MAPERR;
    }
    /*
     * The kernel cannot repair this fault: deliver the chosen signal
     * (SIGBUS or SIGSEGV) to the user task via __do_user_fault().
     */
    __do_user_fault(tsk, addr, fsr, sig, code, regs);
    return 0;

no_context:
    /*
     * Fault that cannot be handled as a normal user fault.
     * __do_kernel_fault() first tries an exception fixup and, if there is
     * none, reports an Oops.
     */
    __do_kernel_fault(mm, addr, fsr, regs);
    return 0;
}

__do_page_fault()函数: 

[arch/arm/mm/fault.c]

[do_page_fault()->__do_page_fault()]

/*
 * __do_page_fault() - validate the faulting address against the task's
 * address space and hand a legitimate fault to handle_mm_fault().
 * @mm:    address space to search
 * @addr:  faulting virtual address
 * @fsr:   fault status value, passed to access_error()
 * @flags: FAULT_FLAG_* bits, forwarded to handle_mm_fault()
 * @tsk:   faulting task (not used in this function)
 *
 * Returns VM_FAULT_BADMAP when no usable vma covers @addr,
 * VM_FAULT_BADACCESS when access_error() rejects the access, and
 * otherwise whatever handle_mm_fault() returns.
 */
static int __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
        unsigned int flags, struct task_struct *tsk)
{
    struct vm_area_struct *area = find_vma(mm, addr);

    /* No vma was found for this address. */
    if (unlikely(!area))
        return VM_FAULT_BADMAP;

    if (unlikely(area->vm_start > addr)) {
        /*
         * The address lies below the vma that was found.  This is only
         * acceptable when the vma grows downwards, the address is not
         * below FIRST_USER_ADDRESS, and expand_stack() succeeds
         * (returns 0).
         */
        if (!(area->vm_flags & VM_GROWSDOWN) ||
            addr < FIRST_USER_ADDRESS || expand_stack(area, addr))
            return VM_FAULT_BADMAP;
    }

    /*
     * A vma now covers the address; reject the fault if access_error()
     * says the access type is not permitted for this vma.
     */
    if (access_error(fsr, area))
        return VM_FAULT_BADACCESS;

    /* Valid area and permitted access: do the real fault handling. */
    return handle_mm_fault(mm, area, addr & PAGE_MASK, flags);
}
回到do_page_fault()函数

VM_FAULT类型(缺页处理函数的返回值)

[include/linux/mm.h]

/*
 * Different kinds of faults, as returned by handle_mm_fault().
 * Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * Except for VM_FAULT_MINOR (which is 0) and the
 * VM_FAULT_HWPOISON_LARGE_MASK field, every value below is a distinct
 * bit, so one return value can carry several of them and callers test
 * them with '&'.
 */

#define VM_FAULT_MINOR  0 /* For backwards compat. Remove me quickly. */

#define VM_FAULT_OOM    0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR  0x0004
#define VM_FAULT_WRITE  0x0008  /* Special case for get_user_pages */
#define VM_FAULT_HWPOISON 0x0010    /* Hit poisoned small page */
#define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
#define VM_FAULT_SIGSEGV 0x0040

#define VM_FAULT_NOPAGE 0x0100  /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200  /* ->fault locked the returned page */
#define VM_FAULT_RETRY  0x0400  /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800    /* huge page fault failed, fall back to small */

#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */

/*
 * Mask of all bits that are treated as an error.  VM_FAULT_MAJOR,
 * VM_FAULT_WRITE, VM_FAULT_NOPAGE, VM_FAULT_LOCKED and VM_FAULT_RETRY
 * are deliberately not part of it.
 */
#define VM_FAULT_ERROR  (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
             VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
             VM_FAULT_FALLBACK)

__do_user_fault()函数:向用户进程发送信号(例如段错误SIGSEGV)后返回

[do_page_fault()->__do_user_fault()]

/*
 * Something tried to access memory that isn't in our memory map..
 * User mode accesses just cause a SIGSEGV
 */
static void
__do_user_fault(struct task_struct *tsk, unsigned long addr,
        unsigned int fsr, unsigned int sig, int code,
        struct pt_regs *regs)
{
    struct siginfo si;

#ifdef CONFIG_DEBUG_USER
    if (((user_debug & UDBG_SEGV) && (sig == SIGSEGV)) ||
        ((user_debug & UDBG_BUS)  && (sig == SIGBUS))) {
        printk(KERN_DEBUG "%s: unhandled page fault (%d) at 0x%08lx, code 0x%03x\n",
               tsk->comm, sig, addr, fsr);
        show_pte(tsk->mm, addr);
        show_regs(regs);
    }
#endif

    tsk->thread.address = addr;
    tsk->thread.error_code = fsr;
    tsk->thread.trap_no = 14;
    si.si_signo = sig;
    si.si_errno = 0;
    si.si_code = code;
    si.si_addr = (void __user *)addr;
    force_sig_info(sig, &si, tsk);
}
回到do_page_fault函数

__do_kernel_fault()函数

/*
 * __do_kernel_fault() - handle a fault that cannot be resolved as a
 * normal user page fault.
 * @mm:   address space used for the page-table dump
 * @addr: faulting virtual address
 * @fsr:  fault status value, passed on to die()
 * @regs: register state at the time of the fault
 *
 * If fixup_exception() handles the fault, simply return.  Otherwise
 * print an alert, dump the page-table entry, raise an Oops through die()
 * and terminate the current task with do_exit(SIGKILL).
 */
static void
__do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
          struct pt_regs *regs)
{
    const char *what;

    /* A registered fixup handles this fault: nothing more to do. */
    if (fixup_exception(regs))
        return;

    /* Addresses inside the first page are reported as NULL dereferences. */
    if (addr < PAGE_SIZE)
        what = "NULL pointer dereference";
    else
        what = "paging request";

    /* No handler: report the failure and terminate the task. */
    bust_spinlocks(1);
    pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
         what, addr);

    show_pte(mm, addr);
    die("Oops", regs, fsr);
    bust_spinlocks(0);
    do_exit(SIGKILL);
}
回到do_page_fault函数

__handle_mm_fault()函数:

handle_mm_fault()函数的核心函数是__handle_mm_fault(),它的实现在mm/memory.c

[do_page_fault()->__do_page_fault()->handle_mm_fault()->__handle_mm_fault()]

/*
 * __handle_mm_fault() - walk (and where needed set up) the page-table
 * levels for @address, then hand the PTE to handle_pte_fault().
 * @mm:      address space the fault belongs to
 * @vma:     vma covering @address
 * @address: faulting virtual address
 * @flags:   FAULT_FLAG_* bits describing the access
 *
 * hugetlb vmas and transparent-huge-page pmds are dispatched to their own
 * handlers before the PTE level is reached.
 *
 * Returns a set of VM_FAULT_* bits; VM_FAULT_OOM when a page-table level
 * could not be obtained, 0 where the comments below say the fault is
 * simply to be retried.
 *
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long address, unsigned int flags)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    /* pgd_offset() yields the PGD entry for address in this mm's page table. */
    pgd = pgd_offset(mm, address);
    /*
     * pud_alloc() yields the PUD entry for address (NOTE(review):
     * presumably allocating the table when it is missing - confirm).
     * A NULL result is reported as VM_FAULT_OOM.
     */
    pud = pud_alloc(mm, pgd, address);
    if (!pud)
        return VM_FAULT_OOM;
    /* Same procedure for the PMD level. */
    pmd = pmd_alloc(mm, pud, address);
    if (!pmd)
        return VM_FAULT_OOM;
    if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
        /*
         * Empty pmd and THP enabled: try a huge anonymous page for vmas
         * without vm_ops.  Unless the result asks for a fallback to small
         * pages, it is final.
         */
        int ret = VM_FAULT_FALLBACK;
        if (!vma->vm_ops)
            ret = do_huge_pmd_anonymous_page(mm, vma, address,
                    pmd, flags);
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
    } else {
        /* Work on a local snapshot of the pmd for the tests below. */
        pmd_t orig_pmd = *pmd;
        int ret;

        barrier();
        if (pmd_trans_huge(orig_pmd)) {
            unsigned int dirty = flags & FAULT_FLAG_WRITE;

            /*
             * If the pmd is splitting, return and retry the
             * the fault.  Alternative: wait until the split
             * is done, and goto retry.
             */
            if (pmd_trans_splitting(orig_pmd))
                return 0;

            if (pmd_protnone(orig_pmd))
                return do_huge_pmd_numa_page(mm, vma, address,
                                 orig_pmd, pmd);

            /* Write access to a huge pmd that is not writable. */
            if (dirty && !pmd_write(orig_pmd)) {
                ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                              orig_pmd);
                if (!(ret & VM_FAULT_FALLBACK))
                    return ret;
            } else {
                huge_pmd_set_accessed(mm, vma, address, pmd,
                              orig_pmd, dirty);
                return 0;
            }
        }
    }

    /*
     * Use __pte_alloc instead of pte_alloc_map, because we can't
     * run pte_offset_map on the pmd, if an huge pmd could
     * materialize from under us from a different thread.
     */
    /*
     * If the pmd entry is still empty, __pte_alloc() is called to set up
     * the PTE level for address; a non-zero return is reported as
     * VM_FAULT_OOM.
     */
    if (unlikely(pmd_none(*pmd)) &&
        unlikely(__pte_alloc(mm, vma, pmd, address)))
        return VM_FAULT_OOM;
    /* if an huge pmd materialized from under us just retry later */
    if (unlikely(pmd_trans_huge(*pmd)))
        return 0;
    /*
     * A regular pmd is established and it can't morph into a huge pmd
     * from under us anymore at this point because we hold the mmap_sem
     * read mode and khugepaged takes it in write mode. So now it's
     * safe to run pte_offset_map().
     */
    /* Get the PTE entry that corresponds to address. */
    pte = pte_offset_map(pmd, address);
    
    /* The PTE-level work is done in handle_pte_fault(). */
    return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
/*回到handle_mm_fault()函数*/

handle_pte_fault()函数:

[do_page_fault()->__do_page_fault()->handle_mm_fault()->__handle_mm_fault()->handle_pte_fault()]

/*
 * handle_pte_fault() - resolve a fault at the PTE level.
 * @mm:      address space the fault belongs to
 * @vma:     vma covering @address
 * @address: faulting virtual address
 * @pte:     mapped (but not yet locked) PTE for @address
 * @pmd:     PMD entry that the PTE belongs to
 * @flags:   FAULT_FLAG_* bits describing the access
 *
 * Dispatches on the state of the PTE: not present (do_fault(),
 * do_anonymous_page() or do_swap_page()), NUMA-protected
 * (do_numa_page()), write to a non-writable entry (do_wp_page()), or
 * otherwise just updates the accessed/dirty state of the entry.
 *
 * Returns the result of the helper that was called, or 0.
 *
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
             struct vm_area_struct *vma, unsigned long address,
             pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;

    /*
     * some architectures can have larger ptes than wordsize,
     * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
     * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
     * The code below just needs a consistent view for the ifs and
     * we later double check anyway with the ptl lock held. So here
     * a barrier will do.
     */
    /*
     * In other words: a PTE can be wider than the machine word, so the
     * snapshot below is not guaranteed to be read atomically.  That is
     * acceptable because the value is compared against *pte again
     * (pte_same()) once the ptl lock is held.
     */
    entry = *pte;
    barrier();
    
    /*
     * pte_present() is false: no physical page is currently mapped by
     * this PTE - a genuine "page not present" fault.
     */
    if (!pte_present(entry)) {
        /*
         * (1) The PTE is completely empty (pte_none()):
         *       - the vma has vm_ops with a fault() callback (typically
         *         a file mapping): call do_fault();
         *       - otherwise (anonymous page): call do_anonymous_page().
         */
        if (pte_none(entry)) {
            if (vma->vm_ops) {
                if (likely(vma->vm_ops->fault))
                    return do_fault(mm, vma, address, pte,
                            pmd, flags, entry);
            }
            return do_anonymous_page(mm, vma, address,
                         pte, pmd, flags);
        }
        /*
         * (2) The PTE is not empty but not present either: the page is
         *     presumably swapped out, so do_swap_page() is called.
         */
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }

    if (pte_protnone(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);
    
    /*
     * From here on the PTE maps a page.  Take the page-table lock and
     * make sure the entry has not changed since the unlocked snapshot
     * above; if it has, there is nothing left to do here.
     */
    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    /*
     * Write fault on an entry that is not writable: this is the
     * copy-on-write case (e.g. memory shared between parent and child
     * after fork), handled by do_wp_page().
     */
    if (flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry)) /* ptl is passed on; NOTE(review): presumably do_wp_page() drops it - confirm */
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry);
        entry = pte_mkdirty(entry);
    }
    /*
     * pte_mkyoung() marks the entry as accessed.  NOTE(review): on x86
     * this is understood to set _PAGE_ACCESSED, on ARM the L_PTE_YOUNG
     * bit of the Linux view of the PTE - confirm against the arch code.
     */
    entry = pte_mkyoung(entry);
    /*
     * Write the updated entry back.  A non-zero return from
     * ptep_set_access_flags() (presumably: the PTE really changed) is
     * followed by update_mmu_cache().
     */
    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, pte);
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}

11.1 do_page_fault()缺页中断核心函数

上一篇:Linux内存管理 (19)总结内存管理数据结构和API【转】


下一篇:内存相关内核知识点