内核代码阅读(13) - sys_mmap

sys_mmap

/*
 * Common helper behind the mmap system call: look up the file for fd
 * (unless the mapping is anonymous) and perform the mapping while holding
 * the current process's mmap_sem.
 *
 * Returns whatever do_mmap_pgoff() returns, or -EBADF when a file-backed
 * mapping was requested and fget(fd) yields no file.
 */
static inline long do_mmap2(
        unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags,
        unsigned long fd, unsigned long pgoff)
{
        struct file * file = NULL;
        int error;

        /* Strip MAP_EXECUTABLE and MAP_DENYWRITE from the caller's flags. */
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

        /* Only non-anonymous mappings need a file. */
        if (!(flags & MAP_ANONYMOUS)) {
                file = fget(fd);
                if (!file)
                        return -EBADF;
        }

        down(&current->mm->mmap_sem);
        error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        up(&current->mm->mmap_sem);

        /* Release the file obtained from fget() above. */
        if (file)
                fput(file);

        return error;
}
1) MAP_ANONYMOUS
   这个flag表示没有文件映射,只是用来在指定的地址上分配内存。
2) file = fget(fd);
   获取进程中的file结构。
3) do_mmap_pgoff

do_mmap_pgoff 映射文件

/*
 * Create a new mapping of len bytes in the current process (current->mm),
 * backed by file, or not backed by any file when file is NULL.
 * do_mmap2() calls this with mm->mmap_sem held.
 *
 * file:  backing file, or NULL
 * addr:  start address; used as given with MAP_FIXED, otherwise passed to
 *        get_unmapped_area() as the place to start searching
 * len:   length of the mapping in bytes
 * prot:  protection bits, combined with flags into vm_flags by vm_flags()
 * flags: MAP_* flags
 * pgoff: file offset, stored unchanged in vma->vm_pgoff
 *
 * Returns the start address of the new mapping on success, or a negative
 * errno value (converted to unsigned long) on failure.
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
        unsigned long prot, unsigned long flags, unsigned long pgoff)
{
        struct mm_struct * mm = current->mm;
        struct vm_area_struct * vma;
        /* Set once deny_write_access() succeeded, so it can be balanced later. */
        int correct_wcount = 0;
        int error;
        if (flags & MAP_FIXED) {
                /* A fixed address must be page aligned. */
                if (addr & ~PAGE_MASK)
                        return -EINVAL;
        } else {
                /* No fixed address: search for a free range starting at addr. */
                addr = get_unmapped_area(addr, len);
                if (!addr)
                        return -ENOMEM;
        }
        /* Allocate and fill in the descriptor for the new area. */
        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
        if (!vma)
                return -ENOMEM;
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
        /* The low four bits of vm_flags index the protection table. */
        vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
        vma->vm_ops = NULL;
        vma->vm_pgoff = pgoff;
        vma->vm_file = NULL;
        vma->vm_private_data = NULL;
        /* Every failure up to the file handling below reports -ENOMEM. */
        error = -ENOMEM;
        /* Remove whatever is currently mapped in [addr, addr + len). */
        if (do_munmap(mm, addr, len))
                goto free_vma;
        /* Address-space limit: current total (pages -> bytes) plus len vs RLIMIT_AS. */
        if ((mm->total_vm << PAGE_SHIFT) + len
            > current->rlim[RLIMIT_AS].rlim_cur)
                goto free_vma;
        /*
         * Writable mapping that is not shared: unless MAP_NORESERVE was
         * passed, vm_enough_memory() must accept the number of pages.
         */
        if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
            !(flags & MAP_NORESERVE)                                 &&
            !vm_enough_memory(len >> PAGE_SHIFT))
                goto free_vma;
        if (file) {
                if (vma->vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                        correct_wcount = 1;
                }
                /* Attach the file to the area and let the file's mmap method set it up. */
                vma->vm_file = file;
                get_file(file);
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
        } else if (flags & MAP_SHARED) {
                /* Shared mapping without a file. */
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        }
        /*
         * Re-read flags and start address from the vma.
         * NOTE(review): presumably because the mmap method may have changed
         * them -- confirm.
         */
        flags = vma->vm_flags;
        addr = vma->vm_start;
        insert_vm_struct(mm, vma);
        /*
         * Balance the deny_write_access() call above.
         * NOTE(review): assumes deny_write_access() decremented i_writecount -- confirm.
         */
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
        
        /* Account the new pages; for VM_LOCKED areas also fault them in right away. */
        mm->total_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED) {
                mm->locked_vm += len >> PAGE_SHIFT;
                make_pages_present(addr, addr + len);
        }
        return addr;
    /* The file's mmap method failed: undo the file setup, then fall through. */
    unmap_and_free_vma:
        if (correct_wcount)
                atomic_inc(&file->f_dentry->d_inode->i_writecount);
        vma->vm_file = NULL;
        /* Drop the reference taken with get_file() above. */
        fput(file);
        /*
         * Clear the page range of the area.
         * NOTE(review): presumably to undo a partial mapping made by the
         * failed mmap method -- confirm.
         */
        flush_cache_range(mm, vma->vm_start, vma->vm_end);
        zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
        flush_tlb_range(mm, vma->vm_start, vma->vm_end);
    /* Release the descriptor; it was never inserted into mm on this path. */
    free_vma:
        kmem_cache_free(vm_area_cachep, vma);
        return error;
    }
1) if (flags & MAP_FIXED)
   MAP_FIXED: 表示映射文件到进程空间的起始地址必须是addr,如果满足不了则返回错误。
2) if (addr & ~PAGE_MASK)
   addr必须要page对齐。
3) addr = get_unmapped_area(addr, len);
   如果MAP_FIXED没有设置,则从进程的地址空间中分配一个addr。
4) vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
   分配一个vma,每次mmap都会增加一个vma。而brk则不同:如果新增的区间和已有的vma相邻而且属性相同,就会把两者合并,而不是新增一个vma。
5) vma->vm_pgoff = pgoff;
   记录文件的偏移量到vma中,以供缺页中断时读取文件。
6) if (do_munmap(mm, addr, len))
   解除[addr, addr+len)上已有的映射。这种情况发生在指定了MAP_FIXED,并且addr和之前的映射区间重叠的时候;没有指定MAP_FIXED时,get_unmapped_area返回的本来就是一段空闲区间。
7) goto free_vma;
   TODO
   内核中有很多都是先分配资源,然后进一步检查条件,如果检查失败,则释放资源。
   之所以采用这种看似浪费的做法,是因为分配资源可能导致进程切换。如果先检查条件、检查通过后再分配资源,一旦在分配资源的过程中发生了进程切换,先前检查通过的条件就可能已经不成立了。
8) vma->vm_file = file;
   设置file
9) error = file->f_op->mmap(file, vma);
   file->f_op->mmap 和具体的文件系统相关,ext2中对应的是 generic_file_mmap。
10) insert_vm_struct(mm, vma);
    把新的vma插入到当前进程的mm中。
11) if (flags & VM_LOCKED)
    如果设置了VM_LOCKED标记,表示把文件的内容锁在内存中,此时调用 make_pages_present,把文件读进内存。

get_unmapped_area 用户进程中分配虚拟地址区间

/*
 * Search the current process's address space for a free range of len bytes,
 * starting at addr, or at TASK_UNMAPPED_BASE when addr is 0.
 *
 * Returns the start of the free range, or 0 when len exceeds TASK_SIZE or
 * no range of that size fits below TASK_SIZE.
 */
unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
{
        struct vm_area_struct * vmm;

        if (len > TASK_SIZE) {
                return 0;
        }

        if (!addr) {
                addr = TASK_UNMAPPED_BASE;
        }
        addr = PAGE_ALIGN(addr);

        /* Walk the areas from the one find_vma() returns for addr. */
        for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) {
                /* The candidate range would run past TASK_SIZE. */
                if (addr > TASK_SIZE - len) {
                        return 0;
                }

                /* No further area, or the gap before the next one is large enough. */
                if (!vmm || addr + len <= vmm->vm_start) {
                        return addr;
                }

                /* Otherwise retry right after this area. */
                addr = vmm->vm_end;
        }
}
1) addr = TASK_UNMAPPED_BASE;
   如果addr为0,则从TASK_SIZE/3=1G的位置开始往上找。也就是说,mmap是从1G开始的。
2) for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next)
   find_vma 返回第一个 vm_end 大于 addr 的vma(它可能包含addr,也可能整个位于addr之后),从这个vma开始往后找。
3) if (!vmm || addr + len <= vmm->vm_start)
   如果后面已经没有vma(vmm为NULL),或者 addr+len <= vmm->vm_start,说明在addr处找到了一个足够大的空洞。

generic_file_mmap ext2的mmap

mmap的定义
/* File operation table for ext2 files; mmap is handled by generic_file_mmap. */
struct file_operations ext2_file_operations = {
        .llseek  = ext2_file_lseek,
        .read    = generic_file_read,
        .write   = generic_file_write,
        .ioctl   = ext2_ioctl,
        .mmap    = generic_file_mmap,
        .open    = ext2_open_file,
        .release = ext2_release_file,
        .fsync   = ext2_sync_file,
};
generic_file_mmap
/*
 * mmap method shared by files that use the generic page-cache code:
 * validate the inode and install the vm_operations for the new area.
 *
 * Returns 0 on success; -EINVAL for a VM_SHARED + VM_MAYWRITE area whose
 * address space has no writepage method; -EACCES when the inode has no
 * superblock or is not a regular file; -ENOEXEC when there is no readpage
 * method.
 */
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
        struct inode *inode = file->f_dentry->d_inode;
        struct vm_operations_struct * ops;

        /* Default operations; replaced below for shared writable areas. */
        ops = &file_private_mmap;

        if ((vma->vm_flags & VM_SHARED) != 0 &&
            (vma->vm_flags & VM_MAYWRITE) != 0) {
                /* Shared writable areas require a writepage method. */
                if (!inode->i_mapping->a_ops->writepage)
                        return -EINVAL;
                ops = &file_shared_mmap;
        }

        if (!inode->i_sb)
                return -EACCES;
        if (!S_ISREG(inode->i_mode))
                return -EACCES;

        /* Pages are brought in through readpage, so it must exist. */
        if (!inode->i_mapping->a_ops->readpage)
                return -ENOEXEC;

        UPDATE_ATIME(inode);
        vma->vm_ops = ops;
        return 0;
}
1) vma->vm_ops = ops;
   设置vm_ops,这是缺页的回调函数。
2) ops = &file_private_mmap;
   /* Default vm_operations chosen by generic_file_mmap: only a nopage handler. */
   static struct vm_operations_struct file_private_mmap = {
           .nopage = filemap_nopage,
   };
3) if (!inode->i_mapping->a_ops->writepage)
   检查 address_space_operations。
4) address_space
   TODO
   /*
    * Page bookkeeping for one owner object (see host): its pages, kept on
    * separate clean / dirty / locked lists, the methods used to operate on
    * them (a_ops), and the vm areas that map the object.
    */
   struct address_space {
    struct list_head        clean_pages;        /* list of clean pages */
    struct list_head        dirty_pages;        /* list of dirty pages */
    struct list_head        locked_pages;        /* list of locked pages */
    unsigned long                nrpages;        /* number of total pages */
    struct address_space_operations *a_ops;        /* methods */
    struct inode                *host;                /* owner: inode, block_device */
    struct vm_area_struct        *i_mmap;        /* list of private mappings */
    struct vm_area_struct        *i_mmap_shared; /* list of shared mappings */
    spinlock_t                i_shared_lock;  /* and spinlock protecting it */
  };
5) address_space_operations
   /* Address-space methods of ext2; readpage/writepage are the ones generic_file_mmap checks for. */
   struct address_space_operations ext2_aops = {
           .readpage      = ext2_readpage,
           .writepage     = ext2_writepage,
           .sync_page     = block_sync_page,
           .prepare_write = ext2_prepare_write,
           .commit_write  = generic_commit_write,
           .bmap          = ext2_bmap,
   };

make_pages_present 主动触发缺页

/*
 * Fault in the range [addr, end) of the current process by calling
 * handle_mm_fault() once for every PAGE_SIZE step.
 *
 * addr: first address to fault in
 * end:  end of the range (exclusive); must be greater than addr, otherwise
 *       BUG() is triggered
 *
 * Returns 0 on success, -1 when no vm area is found for addr or when
 * handle_mm_fault() reports a failure (negative result).
 */
int make_pages_present(unsigned long addr, unsigned long end)
{
        int write;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct * vma;

        vma = find_vma(mm, addr);
        /*
         * find_vma() returns NULL when no area ends above addr; report
         * failure instead of dereferencing a NULL pointer below.
         */
        if (!vma)
                return -1;

        /* Fault for write only when the area is writable. */
        write = (vma->vm_flags & VM_WRITE) != 0;
        if (addr >= end)
                BUG();

        do {
                if (handle_mm_fault(mm, vma, addr, write) < 0)
                        return -1;
                addr += PAGE_SIZE;
        } while (addr < end);

        return 0;
}
1) 每隔一个 PAGE_SIZE 调用一次 handle_mm_fault。
handle_mm_fault -> handle_pte_fault -> do_no_page 主动触发缺页
/*
 * Handle a fault on an address that has no page mapped yet: obtain the page
 * from the area's nopage method and install a pte for it. Areas without a
 * nopage method are handed to do_anonymous_page().
 *
 * mm:           address space the fault occurred in
 * vma:          area containing the faulting address
 * address:      faulting address
 * write_access: non-zero when the access was a write
 * page_table:   pte slot to fill in
 *
 * Returns 2 after installing the page, 0 when nopage returned NULL,
 * -1 when nopage returned NOPAGE_OOM, or the result of do_anonymous_page().
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
        unsigned long address, int write_access, pte_t *page_table)
    {
        struct page * new_page;
        pte_t entry;
        if (!vma->vm_ops || !vma->vm_ops->nopage)
                return do_anonymous_page(mm, vma, page_table, write_access, address);
        /*
         * The address is passed page aligned. The last argument is 0 for
         * shared areas and write_access otherwise (filemap_nopage names this
         * parameter no_share).
         */
        new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
        if (new_page == NULL)        /* no page was available -- SIGBUS */
                return 0;
        if (new_page == NOPAGE_OOM)
                return -1;
        /* One more resident page for this address space. */
        ++mm->rss;
        flush_page_to_ram(new_page);
        flush_icache_page(vma, new_page);
        /* Build the pte from the area's protection bits. */
        entry = mk_pte(new_page, vma->vm_page_prot);
        if (write_access) {
                /* Write fault: make the pte writable and dirty immediately. */
                entry = pte_mkwrite(pte_mkdirty(entry));
        } else if (page_count(new_page) > 1 &&
                   !(vma->vm_flags & VM_SHARED))
                /* Read fault on a non-shared area whose page has other users: write-protect. */
                entry = pte_wrprotect(entry);
        set_pte(page_table, entry);
        update_mmu_cache(vma, address, entry);
        return 2;        /* Major fault */
    }
1) new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);
   调用mmap的缺页回调。
vma->vm_ops->nopage
vm_ops在 generic_file_mmap中已经设置好了。
   ext2文件系统的设置:
ops = &file_private_mmap;
       
       /* Only a nopage handler is provided: faults on the area go to filemap_nopage. */
       static struct vm_operations_struct file_private_mmap = {
               .nopage = filemap_nopage,
       };
所以 nopage最终会进入 filemap_nopage。
# filemap_nopage 缺页处理
/*
 * nopage handler for file mappings: return the page that backs `address`
 * in `area`, looking it up in the page hash and reading it from the file
 * when it is missing or not up to date.
 *
 * area:     the area that faulted; area->vm_file is the backing file
 * address:  faulting address
 * no_share: when non-zero, return a newly allocated copy of the page
 *           instead of the cached page itself
 *
 * Returns the page on success; NULL when the offset lies at or beyond the
 * end of the file (for a fault in the current process's own mm) or when the
 * page could not be read; NOPAGE_OOM when a page allocation failed.
 */
struct page * filemap_nopage(struct vm_area_struct * area,
        unsigned long address, int no_share)
    {
        int error;
        struct file *file = area->vm_file;
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page *page, **hash, *old_page;
        unsigned long size, pgoff;
        /* Page index within the file: offset inside the area plus the area's file offset. */
        pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
    retry_all:
        /* File size in pages, rounded up; recomputed on every full retry. */
        size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        /* At or past end of file, and the fault is in the current process's own mm. */
        if ((pgoff >= size) && (area->vm_mm == current->mm))
                return NULL;
        hash = page_hash(mapping, pgoff);
    retry_find:
        /*
         * Look the page up in the hash.
         * NOTE(review): presumably returned with a reference held, which the
         * page_cache_release() calls below drop -- confirm.
         */
        page = __find_get_page(mapping, pgoff, hash);
        if (!page)
                goto no_cached_page;
        if (!Page_Uptodate(page))
                goto page_not_uptodate;
    success:
        /* The page is present and up to date. */
        if (VM_SequentialReadHint(area))
                nopage_sequential_readahead(area, pgoff, size);
        old_page = page;
        if (no_share) {
                /* Hand back a copy; the cached page itself is released. */
                struct page *new_page = page_cache_alloc();
                if (new_page) {
                        copy_user_highpage(new_page, old_page, address);
                        flush_page_to_ram(new_page);
                } else
                        new_page = NOPAGE_OOM;
                page_cache_release(page);
                return new_page;
        }
        flush_page_to_ram(old_page);
        return old_page;
    no_cached_page:
        /*
         * Not in the hash: start reading it in, then look it up again.
         * A cluster read is used when the offset is inside the file and the
         * area carries no random-read hint; otherwise a single page is read.
         */
        if ((pgoff < size) && !VM_RandomReadHint(area))
                error = read_cluster_nonblocking(file, pgoff, size);
        else
                error = page_cache_read(file, pgoff);
        if (error >= 0)
                goto retry_find;
                
        if (error == -ENOMEM)
                return NOPAGE_OOM;
        return NULL;
    page_not_uptodate:
        lock_page(page);
        /* The page no longer belongs to a mapping: drop it and start over. */
        if (!page->mapping) {
                UnlockPage(page);
                page_cache_release(page);
                goto retry_all;
        }
        /* It became up to date while we waited for the lock. */
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        /* First read attempt; readpage returns 0 when the read was started. */
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }
        /* Still not up to date: repeat the checks, clear the error flag and try once more. */
        lock_page(page);
        if (!page->mapping) {
                UnlockPage(page);
                page_cache_release(page);
                goto retry_all;
        }
        if (Page_Uptodate(page)) {
                UnlockPage(page);
                goto success;
        }
        ClearPageError(page);
        if (!mapping->a_ops->readpage(file, page)) {
                wait_on_page(page);
                if (Page_Uptodate(page))
                        goto success;
        }
        /* Both attempts failed: give up on this page. */
        page_cache_release(page);
        return NULL;
    }
1) hash = page_hash(mapping, pgoff);
   page = __find_get_page(mapping, pgoff, hash);
   首先在全局的page_hash_table里尝试搜索pgoff的页面。
2) if (!Page_Uptodate(page))
   如果找到了,检查页面的内容是否是最新的。
3) error = read_cluster_nonblocking(file, pgoff, size);
   如果页面不在hash里面,则分配新的物理页,并从设备上读入。把物理页加入相应的队列中。
   这个函数会向前预读一些页面。
## page_cache_read 从文件读内容到一个页面
/*
 * Make sure the page at page index `offset` of `file` is in the page cache,
 * starting a read through the mapping's readpage method when this call adds it.
 *
 * Returns 0 when the page is already cached (found up front, or added by
 * someone else in the meantime), -ENOMEM when no page could be allocated,
 * otherwise the result of readpage.
 */
static inline int page_cache_read(struct file * file, unsigned long offset) 
    {
        struct inode *inode = file->f_dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        struct page **hash = page_hash(mapping, offset);
        struct page *page; 
        /* Check under pagecache_lock whether the page is already hashed. */
        spin_lock(&pagecache_lock);
        page = __find_page_nolock(mapping, offset, *hash); 
        spin_unlock(&pagecache_lock);
        if (page)
                return 0;
        page = page_cache_alloc();
        if (!page)
                return -ENOMEM;
        /* add_to_page_cache_unique() returns 0 when our page was inserted. */
        if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
                int error = mapping->a_ops->readpage(file, page);
                /* Drop our reference; __add_to_page_cache() took its own via page_cache_get(). */
                page_cache_release(page);
                return error;
        }
        /* Another page for this offset was inserted first: ours is not needed. */
        page_cache_free(page);
        return 0;
    }
1) page = __find_page_nolock(mapping, offset, *hash);
   再次到hash表中看看是否已经有别人把这个页面读进来了。
2) if (!add_to_page_cache_unique(page, mapping, offset, hash))
   添加页面到3个链表中。
3) int error = mapping->a_ops->readpage(file, page);
   调用相应文件系统的接口读入内容。
### add_to_page_cache_unique
/*
 * Insert `page` into the page cache at (mapping, offset) unless a page for
 * that position is already hashed. Lookup and insertion both happen under
 * pagecache_lock.
 *
 * Returns 0 when the page was inserted, 1 when an existing page was found.
 */
static int add_to_page_cache_unique(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
{
        int already_cached = 1;

        spin_lock(&pagecache_lock);
        if (!__find_page_nolock(mapping, offset, *hash)) {
                __add_to_page_cache(page,mapping,offset,hash);
                already_cached = 0;
        }
        spin_unlock(&pagecache_lock);

        return already_cached;
}
    
    /*
     * Link `page` into the page cache at (mapping, offset). The page must not
     * be locked on entry (BUG otherwise) and is left locked on return.
     * add_to_page_cache_unique() calls this with pagecache_lock held.
     */
    static inline void __add_to_page_cache(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
    {
        unsigned long flags;
        if (PageLocked(page))
                BUG();
        /* Clear the uptodate, error, dirty, referenced and arch_1 bits, then set the locked bit. */
        flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
        page->flags = flags | (1 << PG_locked);
        /* Take the page cache's own reference on the page. */
        page_cache_get(page);
        page->index = offset;
        /* Put the page on the mapping's page list, the hash chain and the LRU. */
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
        lru_cache_add(page);
    }
1) add_page_to_inode_queue(mapping, page);
   添加物理页到 i_mapping的clean_pages中。
2) add_page_to_hash_queue(page, hash);
   添加物理页到hash表中。
3) lru_cache_add(page);
   添加物理页到 active_list中
上一篇:《hotwheel 网络模型和进程模型》


下一篇:简单总结如何启动一个Erlang程序