1. inode cache
struct inode {
    /* RCU path lookup touches following: */
    umode_t i_mode;
    uid_t i_uid;
    gid_t i_gid;
    const struct inode_operations *i_op;
    struct super_block *i_sb;

    spinlock_t i_lock;    /* i_blocks, i_bytes, maybe i_size */
    unsigned int i_flags;
    unsigned long i_state;
#ifdef CONFIG_SECURITY
    void *i_security;
#endif
    struct mutex i_mutex;

    unsigned long dirtied_when;    /* jiffies of first dirtying */

    struct hlist_node i_hash;
    struct list_head i_wb_list;    /* backing dev IO list */
    struct list_head i_lru;        /* inode LRU list */
    struct list_head i_sb_list;
    union {
        struct list_head i_dentry;
        struct rcu_head i_rcu;
    };
    unsigned long i_ino;
    atomic_t i_count;
    unsigned int i_nlink;
    dev_t i_rdev;
    unsigned int i_blkbits;
    u64 i_version;
    loff_t i_size;
#ifdef __NEED_I_SIZE_ORDERED
    seqcount_t i_size_seqcount;
#endif
    struct timespec i_atime;
    struct timespec i_mtime;
    struct timespec i_ctime;
    blkcnt_t i_blocks;
    unsigned short i_bytes;
    struct rw_semaphore i_alloc_sem;
    const struct file_operations *i_fop;    /* former ->i_op->default_file_ops */
    struct file_lock *i_flock;
    struct address_space *i_mapping;
    struct address_space i_data;
#ifdef CONFIG_QUOTA
    struct dquot *i_dquot[MAXQUOTAS];
#endif
    struct list_head i_devices;
    union {
        struct pipe_inode_info *i_pipe;
        struct block_device *i_bdev;
        struct cdev *i_cdev;
    };

    __u32 i_generation;

#ifdef CONFIG_FSNOTIFY
    __u32 i_fsnotify_mask;    /* all events this inode cares about */
    struct hlist_head i_fsnotify_marks;
#endif

#ifdef CONFIG_IMA
    atomic_t i_readcount;    /* struct files open RO */
#endif
    atomic_t i_writecount;
#ifdef CONFIG_FS_POSIX_ACL
    struct posix_acl *i_acl;
    struct posix_acl *i_default_acl;
#endif
    void *i_private;    /* fs or device private pointer */
};
An inode can be in one of three states:
1) unused: it holds no valid content and can be reused for a new purpose;
2) in use: it is actively being used, so its i_count and i_nlink members are both greater than 0; the inode is associated with a file on the filesystem (i.e. on the device), and its content has not changed since the last synchronization with the device, so it is not dirty;
3) dirty: the inode's content no longer matches the file's content in the filesystem, i.e. it is dirty and must be synchronized back to the file.
Inodes in the first two states each sit on a global list, while inodes in the third state sit on a list inside the super_block structure.
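As a minimal sketch (this helper does not exist in the kernel; it is only meant to show which struct inode fields encode the states just described):

/*
 * Illustrative sketch only, not a kernel function: roughly how the three
 * states map onto the struct inode fields shown above.
 */
static const char *inode_state_name(struct inode *inode)
{
    if (inode->i_state & I_DIRTY)    /* content differs from what is on disk */
        return "dirty";
    if (atomic_read(&inode->i_count) && inode->i_nlink)
        return "in use";
    return "unused";                 /* candidate for reuse */
}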
Let's first look at one member of struct inode:
struct list_head i_lru; /* inode LRU list */
It corresponds to a global list:
static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);
/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour. If it tells
 * us to evict inode, do so. Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
    struct super_block *sb = inode->i_sb;
    const struct super_operations *op = inode->i_sb->s_op;
    int drop;

    WARN_ON(inode->i_state & I_NEW);

    if (op && op->drop_inode)
        drop = op->drop_inode(inode);
    else
        drop = generic_drop_inode(inode);

    if (!drop && (sb->s_flags & MS_ACTIVE)) {
        inode->i_state |= I_REFERENCED;
        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
            inode_lru_list_add(inode);
        spin_unlock(&inode->i_lock);
        return;
    }

    if (!drop) {
        inode->i_state |= I_WILL_FREE;
        spin_unlock(&inode->i_lock);
        write_inode_now(inode, 1);
        spin_lock(&inode->i_lock);
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state &= ~I_WILL_FREE;
    }

    inode->i_state |= I_FREEING;
    inode_lru_list_del(inode);
    spin_unlock(&inode->i_lock);

    evict(inode);
}
The function iput_final decides what happens to an inode once nothing references it any more, i.e. once it has become unused: the inode is either kept in the cache or reclaimed.
    if (op && op->drop_inode)
        drop = op->drop_inode(inode);
    else
        drop = generic_drop_inode(inode);
drop is non-zero when i_nlink is 0 or the inode is not hashed into inode_hashtable, meaning the inode should be evicted; drop == 0 means the inode still has hard links and is hashed, so it is a candidate for caching.
/*
 * Normal UNIX filesystem behaviour: delete the
 * inode when the usage count drops to zero, and
 * i_nlink is zero.
 */
int generic_drop_inode(struct inode *inode)
{
    return !inode->i_nlink || inode_unhashed(inode);
}
EXPORT_SYMBOL_GPL(generic_drop_inode);
    if (!drop && (sb->s_flags & MS_ACTIVE)) {
        inode->i_state |= I_REFERENCED;
        if (!(inode->i_state & (I_DIRTY|I_SYNC)))
            inode_lru_list_add(inode);
        spin_unlock(&inode->i_lock);
        return;
    }
If drop is 0 and the superblock is still active in the system, the inode is marked I_REFERENCED and, as long as it is neither dirty nor being synced, inode_lru_list_add puts it on the unused list; in other words, the inode is cached.
Otherwise, if drop is 0 but the filesystem is shutting down, write_inode_now first writes the inode back to disk; then in either eviction case inode_lru_list_del removes the inode from the cache list, and finally evict() destroys the inode completely.
static void inode_lru_list_add(struct inode *inode)
{
    spin_lock(&inode_lru_lock);
    if (list_empty(&inode->i_lru)) {
        list_add(&inode->i_lru, &inode_lru);
        inodes_stat.nr_unused++;
    }
    spin_unlock(&inode_lru_lock);
}
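The eviction path in iput_final also calls inode_lru_list_del; its counterpart in this kernel version looks roughly like the following (quoted from memory, so treat it as a sketch rather than an exact copy):

static void inode_lru_list_del(struct inode *inode)
{
    spin_lock(&inode_lru_lock);
    if (!list_empty(&inode->i_lru)) {
        list_del_init(&inode->i_lru);    /* take the inode off the unused list */
        inodes_stat.nr_unused--;
    }
    spin_unlock(&inode_lru_lock);
}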
So inode_lru is the global list of unused inodes, kept in roughly least-recently-used order.
Another function that operates on inode_lru is prune_icache:
/*
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
 * temporary list and then are freed outside inode_lru_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed. If the inode has metadata buffers attached to
 * mapping->private_list then try to remove them.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 */
static void prune_icache(int nr_to_scan)
{
    LIST_HEAD(freeable);
    int nr_scanned;
    unsigned long reap = 0;

    down_read(&iprune_sem);
    spin_lock(&inode_lru_lock);
    for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
        struct inode *inode;

        if (list_empty(&inode_lru))
            break;

        inode = list_entry(inode_lru.prev, struct inode, i_lru);

        /*
         * we are inverting the inode_lru_lock/inode->i_lock here,
         * so use a trylock. If we fail to get the lock, just move the
         * inode to the back of the list so we don't spin on it.
         */
        if (!spin_trylock(&inode->i_lock)) {
            list_move(&inode->i_lru, &inode_lru);
            continue;
        }

        /*
         * Referenced or dirty inodes are still in use. Give them
         * another pass through the LRU as we cannot reclaim them now.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED)) {
            list_del_init(&inode->i_lru);
            spin_unlock(&inode->i_lock);
            inodes_stat.nr_unused--;
            continue;
        }

        /* recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
            inode->i_state &= ~I_REFERENCED;
            list_move(&inode->i_lru, &inode_lru);
            spin_unlock(&inode->i_lock);
            continue;
        }
        if (inode_has_buffers(inode) || inode->i_data.nrpages) {
            __iget(inode);
            spin_unlock(&inode->i_lock);
            spin_unlock(&inode_lru_lock);
            if (remove_inode_buffers(inode))
                reap += invalidate_mapping_pages(&inode->i_data,
                                                 0, -1);
            iput(inode);
            spin_lock(&inode_lru_lock);

            if (inode != list_entry(inode_lru.next,
                                    struct inode, i_lru))
                continue;    /* wrong inode or list_empty */
            /* avoid lock inversions with trylock */
            if (!spin_trylock(&inode->i_lock))
                continue;
            if (!can_unuse(inode)) {
                spin_unlock(&inode->i_lock);
                continue;
            }
        }
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        spin_unlock(&inode->i_lock);

        list_move(&inode->i_lru, &freeable);
        inodes_stat.nr_unused--;
    }
    if (current_is_kswapd())
        __count_vm_events(KSWAPD_INODESTEAL, reap);
    else
        __count_vm_events(PGINODESTEAL, reap);
    spin_unlock(&inode_lru_lock);

    dispose_list(&freeable);
    up_read(&iprune_sem);
}
The purpose of this function is to free memory when memory pressure is high, by shrinking inode_lru, the list of cached inodes.
It takes inodes one by one from the old end of inode_lru and performs some quick checks; if there is any reason an inode should stay in the cache, the inode is moved to the other end of the list (or taken off the LRU), and the scan continues with the next inode.
Reasons for keeping an inode include: its i_lock cannot be acquired with a trylock; its content is dirty; its usage count is non-zero; it has just been referenced (I_REFERENCED); and so on.
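These are essentially the same conditions that the can_unuse() helper re-checks after prune_icache has temporarily dropped the locks; in this kernel version its logic is roughly as follows (reproduced from memory, so treat the exact form as approximate):

static int can_unuse(struct inode *inode)
{
    if (inode->i_state & ~I_REFERENCED)    /* dirty, under sync, being freed, ... */
        return 0;
    if (inode_has_buffers(inode))          /* metadata buffers still attached */
        return 0;
    if (atomic_read(&inode->i_count))      /* somebody grabbed a reference */
        return 0;
    if (inode->i_data.nrpages)             /* page cache pages still attached */
        return 0;
    return 1;
}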
There is also a practical point worth noting. iput_final is only reached when the last reference to an inode is dropped, and prune_icache likewise refuses to reclaim any inode whose i_count is non-zero.
So if the file on disk has already been deleted (i_nlink has dropped to 0) but some process still has it open, the inode is not destroyed right away, and that process can keep operating on the pages already cached for it.
Only when no process uses the inode any more can it be removed; at that point iput_final sees a non-zero drop and evicts it instead of caching it.
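A minimal userspace example of this behaviour (the file name tmpfile is arbitrary, used only for illustration):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[6] = {0};
    int fd = open("tmpfile", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0)
        return 1;
    write(fd, "hello", 5);
    unlink("tmpfile");       /* i_nlink drops to 0, the name is gone */
    lseek(fd, 0, SEEK_SET);
    read(fd, buf, 5);        /* the open fd keeps i_count > 0, data still readable */
    printf("%s\n", buf);     /* prints "hello" */
    close(fd);               /* last reference dropped, inode can now be evicted */
    return 0;
}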
struct inode also contains two list_head members, i_sb_list and i_wb_list: i_sb_list links the inode into the super_block->s_inodes list, while i_wb_list links it into the backing device's writeback list of inodes.
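For example, code that wants to visit every inode of a filesystem walks s_inodes through the i_sb_list linkage. The helper below is hypothetical and for illustration only; the lock protecting s_inodes differs between kernel versions and is assumed to be held by the caller:

/* Hypothetical helper: count the dirty inodes of a superblock by walking
 * sb->s_inodes via i_sb_list. The lock protecting s_inodes must be held. */
static int count_dirty_inodes(struct super_block *sb)
{
    struct inode *inode;
    int nr = 0;

    list_for_each_entry(inode, &sb->s_inodes, i_sb_list)
        if (inode->i_state & I_DIRTY)
            nr++;
    return nr;
}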
2. dentry cache
The purpose of the dentry cache is to reduce accesses to the slow disk: whenever the VFS looks something up in the underlying filesystem, the result of the lookup is cached as a dentry object.
The way dentry objects are organized and managed is very similar to the inode cache: there is likewise a hash table and an LRU list.
And when memory pressure is high, prune_dcache is likewise called to try to free the lower-priority dentries from the LRU.
static struct hlist_bl_head *dentry_hashtable __read_mostly;
And in struct super_block:
/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
struct list_head s_dentry_lru;    /* unused dentry lru */
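The dentry side mirrors inode_lru_list_add: when a dentry becomes unused it is put on its superblock's s_dentry_lru list. In this kernel version the code looks roughly like the following (reproduced from memory; the lock name and counters should be treated as approximate):

static void dentry_lru_add(struct dentry *dentry)
{
    if (list_empty(&dentry->d_lru)) {
        spin_lock(&dcache_lru_lock);
        list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
        dentry->d_sb->s_nr_dentry_unused++;    /* per-superblock unused count */
        dentry_stat.nr_unused++;               /* global unused count */
        spin_unlock(&dcache_lru_lock);
    }
}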