写在前面
内核源码版本,3.10。小菜鸟,下面是个人笔记,还有很多不明白的,如有错误请多多指正。
跟踪过程
目标路径是通过 NFS 方式挂载的。
通过查看 lookup 调用时的堆栈信息,可以观察到调用过程包括:
system_call_fastpath
->sys_newlstat
->trace_do_page_fault
->sysc_newlstat
->vfs_fstatat
->user_path_at
->user_path_at_empty
->filename_lookup
->path_lookupat
->lookup_slow
->__lookup_hash
->lookup_real
->d_alloc
->nfs_lookup
以 vfs_fstatat 函数为切入点,跟踪 ll 命令的调用流程。
vfs_fstatat
vfs_fstatat 函数定义如下:
// struct kstat: the "stat" out-parameter of vfs_fstatat(); it bundles a
// file's basic attribute information.
// NOTE(review): the per-field notes below follow the usual stat(2) naming
// and are not established by this excerpt — confirm against the kernel headers.
struct kstat {
u64 ino; // presumably the inode number
dev_t dev; // presumably the device holding the file
umode_t mode; // presumably file type and permission bits
unsigned int nlink; // presumably the hard-link count
kuid_t uid; // presumably the owner
kgid_t gid; // presumably the owning group
dev_t rdev; // presumably the device number for device special files
loff_t size; // presumably the size in bytes
struct timespec atime; // timestamps: access,
struct timespec mtime; // modification,
struct timespec ctime; // and change time (by name)
unsigned long blksize;
unsigned long long blocks;
};
// struct path: intermediate value used during lookup; it pairs a mount with
// a dentry.
struct path {
struct vfsmount *mnt; // the mount (original note: carries the root's superblock and dentry)
struct dentry *dentry; // dentry of the entry being looked up
};
// vfs_fstatat: resolve a user-supplied file name and fetch its attributes.
// Parameters:
//   dfd      - directory file descriptor, forwarded to user_path_at() (per the
//              original note: the base directory the file name is resolved against)
//   filename - user-space pointer to the file name
//   stat     - output; filled in by vfs_getattr()
//   flag     - not used in the visible code; presumably consumed by the
//              checks elided from this excerpt
// Returns the status of user_path_at() or vfs_getattr(); any non-zero value
// from user_path_at() is treated as failure.
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
int flag)
{
struct path path;
int error = -EINVAL;
unsigned int lookup_flags = 0;
... // safety/validity checks omitted from this excerpt
retry:
error = user_path_at(dfd, filename, lookup_flags, &path); // resolve filename into a path (mount + dentry)
if (error)
goto out;
error = vfs_getattr(&path, stat); // fetch the file's attributes for that path
path_put(&path); // presumably drops the references held by path; the original author deferred studying this call
if (retry_estale(error, lookup_flags)) {
// retry_estale() requested another attempt: redo the lookup with LOOKUP_REVAL set
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
out:
return error;
}
user_path_at
user_path_at 函数实际调用了 user_path_at_empty。
/*
 * user_path_at - resolve a user-space path name into a struct path.
 *
 * Convenience form of user_path_at_empty() for callers that do not need the
 * "empty" out-parameter: NULL is passed in its place and every other
 * argument is forwarded unchanged.  The return value is whatever
 * user_path_at_empty() returns.
 */
int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
	int *no_empty_report = NULL;

	return user_path_at_empty(dfd, name, flags, path, no_empty_report);
}
// struct nameidata: scratch state carried through a path lookup; it stores
// temporary data that assists the walk.
struct nameidata {
struct path path; // current position of the walk; copied out to the caller on success
struct qstr last; // most recently parsed path component (link_path_walk() stores each component here)
struct path root; // root of the walk; path_init() starts from it for LOOKUP_ROOT and for absolute names
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags; // lookup flags (LOOKUP_*)
unsigned seq; // d_seq sample taken by path_init() in LOOKUP_RCU mode
int last_type; // kind of `last`: LAST_NORM, LAST_DOT, LAST_DOTDOT or LAST_ROOT
unsigned depth; // reset to 0 by path_init(); NOTE(review): presumably link nesting depth — confirm
char *saved_names[MAX_NESTED_LINKS + 1];
};
// struct filename: a path name as handled inside the kernel. In this file it
// is produced by getname_flags() and released with putname() (see
// user_path_at_empty()).
struct filename {
const char *name; /* pointer to actual string */
const __user char *uptr; /* original userland pointer */
struct audit_names *aname;
bool separate; /* should "name" be freed? */
};
/*
 * user_path_at_empty - resolve a user-space path name into a struct path.
 * @dfd:   directory file descriptor forwarded to filename_lookup()
 * @name:  user-space pointer to the path string
 * @flags: lookup flags; LOOKUP_PARENT is not allowed here (BUG_ON)
 * @path:  output, written only when the lookup succeeds
 * @empty: forwarded to getname_flags()
 *
 * Returns 0 on success; otherwise the error encoded in the pointer returned
 * by getname_flags(), or the error returned by filename_lookup().
 */
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
		       struct path *path, int *empty)
{
	struct nameidata lookup_state;	/* scratch data for the walk */
	struct filename *kname;
	int rc;

	kname = getname_flags(name, flags, empty);
	if (IS_ERR(kname))
		return PTR_ERR(kname);

	BUG_ON(flags & LOOKUP_PARENT);

	/* kname carries the name; lookup_state collects the walk's results. */
	rc = filename_lookup(dfd, kname, flags, &lookup_state);
	putname(kname);
	if (rc)
		return rc;

	*path = lookup_state.path;
	return 0;
}
filename_lookup
/*
 * filename_lookup - look up a file name, retrying in progressively stricter
 * modes.
 *
 * Attempt order:
 *   1. with LOOKUP_RCU added to @flags;
 *   2. if that yields -ECHILD, again with @flags unchanged;
 *   3. if the latest result is -ESTALE, once more with LOOKUP_REVAL added.
 *
 * On success the result is reported to audit_inode().  Returns the status
 * of the last path_lookupat() call.
 */
static int filename_lookup(int dfd, struct filename *name,
			   unsigned int flags, struct nameidata *nd)
{
	const char *pathname = name->name;
	int rc;

	rc = path_lookupat(dfd, pathname, flags | LOOKUP_RCU, nd);

	if (unlikely(rc == -ECHILD))
		rc = path_lookupat(dfd, pathname, flags, nd);

	if (unlikely(rc == -ESTALE))
		rc = path_lookupat(dfd, pathname, flags | LOOKUP_REVAL, nd);

	if (unlikely(rc))
		return rc;

	audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
	return rc;
}
path_lookupat
// path_lookupat: perform one lookup attempt of `name`, leaving the result in nd.
// Parameters:
//   dfd   - directory fd handed to path_init() to choose the starting point
//   name  - path string to resolve
//   flags - LOOKUP_* flags for this attempt
//   nd    - lookup state, initialised here by path_init()
// Returns 0 on success or a negative error; -ENOTDIR is produced here when
// LOOKUP_DIRECTORY is set in nd->flags and can_lookup(nd->inode) fails.
static int path_lookupat(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
struct file *base = NULL;
struct path path; // not referenced in the visible code; presumably used by the elided block below
int err;
/*
* Path walking is largely split up into 2 different synchronisation
* schemes, rcu-walk and ref-walk (explained in
* Documentation/filesystems/path-lookup.txt). These share much of the
* path walk code, but some things particularly setup, cleanup, and
* following mounts are sufficiently divergent that functions are
* duplicated. Typically there is a function foo(), and its RCU
* analogue, foo_rcu().
*
* -ECHILD is the error number of choice (just to avoid clashes) that
* is returned if some aspect of an rcu-walk fails. Such an error must
* be handled by restarting a traditional ref-walk (which will always
* be able to complete).
*/
err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); // initialise nd and obtain the dentry the walk starts from
if (unlikely(err))
return err;
current->total_link_count = 0;
err = link_path_walk(name, nd); // walk the path, resolving the dentry of each level in turn
if (!err && !(flags & LOOKUP_PARENT)) {
... // body elided in this excerpt
}
if (!err)
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
// the caller required a directory: reject an inode that fails can_lookup()
if (!can_lookup(nd->inode)) {
path_put(&nd->path);
err = -ENOTDIR;
}
}
// path_init() may hand back a struct file through &base; release it here
if (base)
fput(base);
// release nd->root unless the walk ran with LOOKUP_ROOT
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
path_put(&nd->root);
nd->root.mnt = NULL;
}
return err;
}
// path_init: reset nd and choose the starting point of a path walk.
// (The original author set this function aside as not yet understood.)
// The starting point is selected as follows:
//   - LOOKUP_ROOT set in flags: start at nd->root
//   - name begins with '/':     start at the root installed by set_root()/set_root_rcu()
//   - dfd == AT_FDCWD:          start at current->fs->pwd
//   - otherwise:                start at the path of the file that dfd refers to
// With LOOKUP_RCU the function calls lock_rcu_walk(); without it, it takes
// the path via path_get()/get_fs_pwd().
// Parameters:
//   fp - output; in the dfd branch under LOOKUP_RCU it receives f.file when
//        f.need_put is set, so the caller can fput() it later
// Returns 0 on success, -ENOTDIR when the starting inode fails can_lookup(),
// -EBADF when dfd has no file, or the inode_permission() error.
static int path_init(int dfd, const char *name, unsigned int flags,
struct nameidata *nd, struct file **fp)
{
int retval = 0;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED;
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
// Original author's question: is nd->root already populated when nd is passed in?
// NOTE(review): nd->root is dereferenced before this function assigns it, so
// LOOKUP_ROOT callers presumably fill in nd->root beforehand — confirm against callers.
struct inode *inode = nd->root.dentry->d_inode;
if (*name) {
// a non-empty name needs a start inode that passes can_lookup() and grants MAY_EXEC
if (!can_lookup(inode))
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
if (retval)
return retval;
}
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
lock_rcu_walk();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
}
return 0;
}
nd->root.mnt = NULL;
if (*name=='/') {
// absolute path: install the root and start from it
if (flags & LOOKUP_RCU) {
lock_rcu_walk();
set_root_rcu(nd);
} else {
set_root(nd);
path_get(&nd->root);
}
nd->path = nd->root;
} else if (dfd == AT_FDCWD) {
// relative path with AT_FDCWD: start from the current working directory
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
lock_rcu_walk();
// re-read pwd until fs->seq shows no change during the read
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
}
} else {
/* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(dfd);
struct dentry *dentry;
if (!f.file)
return -EBADF;
dentry = f.file->f_path.dentry;
if (*name) {
if (!can_lookup(dentry->d_inode)) {
fdput(f);
return -ENOTDIR;
}
}
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
// keep the file instead of fdput(): hand it to the caller through *fp
if (f.need_put)
*fp = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
lock_rcu_walk();
} else {
path_get(&nd->path);
fdput(f);
}
}
nd->inode = nd->path.dentry->d_inode;
return 0;
}
link_path_walk
// link_path_walk: walk `name` one component at a time, updating nd.
// Every component except the final one is resolved with walk_component().
// The final component is only parsed: it is left in nd->last / nd->last_type
// and the function returns 0 without walking it.
// Returns 0 on success or a non-zero error. Errors that leave the loop via
// `break` go through terminate_walk(nd); errors from walk_component() and
// nested_symlink() are returned directly.
static int link_path_walk(const char *name, struct nameidata *nd)
{
struct path next;
int err;
// Step 1: strip the leading slashes from the name. There may be several
// '/' characters, hence the while loop.
while (*name=='/')
name++;
if (!*name)
return 0;
/* At this point we know we have a real path component. */
// Each iteration handles one level of the name, i.e. one '/'-separated component.
for(;;) {
struct qstr this;
long len;
int type;
err = may_lookup(nd);
if (err)
break;
// Hash the name: hash_name() stores the hash in this.hash and returns the
// length of the first component of the remaining path.
len = hash_name(name, &this.hash);
this.name = name;
this.len = len;
type = LAST_NORM;
// classify "." and ".." by length; any other name stays LAST_NORM
if (name[0] == '.') switch (len) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
nd->flags |= LOOKUP_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
// the parent supplies its own d_hash operation: let it process the name
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
err = parent->d_op->d_hash(parent, nd->inode,
&this);
if (err < 0)
break;
}
}
nd->last = this;
nd->last_type = type;
// end of string: this was the final component, leave it to the caller
if (!name[len])
return 0;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
len++;
} while (unlikely(name[len] == '/'));
// only trailing slashes followed: also treated as the final component
if (!name[len])
return 0;
name += len;
err = walk_component(nd, &next, LOOKUP_FOLLOW); // look up the next level of the path (the component stored in nd->last)
if (err < 0)
return err;
if (err) {
// positive result: hand the path in `next` to nested_symlink()
err = nested_symlink(&next, nd);
if (err)
return err;
}
// more components follow, so the current inode must pass can_lookup()
if (!can_lookup(nd->inode)) {
err = -ENOTDIR;
break;
}
}
terminate_walk(nd);
return err;
}
walk_component
// walk_component: resolve the component held in nd->last and step nd onto it.
// Parameters:
//   nd     - lookup state; nd->path / nd->inode are advanced on success
//   path   - output; receives the looked-up mount + dentry
//   follow - passed to should_follow_link()
// Returns:
//   0   - nd now points at the component
//   1   - should_follow_link() is true; `path` holds the entry and nd has not
//         been advanced
//   < 0 - error (-ENOENT when the entry has no inode, -ECHILD when
//         unlazy_walk() fails); terminate_walk(nd) has been called
// For "." and ".." the result of handle_dots() is returned directly.
static inline int walk_component(struct nameidata *nd, struct path *path,
int follow)
{
struct inode *inode;
int err;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(nd->last_type != LAST_NORM))
return handle_dots(nd, nd->last_type);
err = lookup_fast(nd, path, &inode); // try the fast path: check whether the cache hits
if (unlikely(err)) { // the fast lookup did not succeed
if (err < 0)
goto out_err;
err = lookup_slow(nd, path); // positive result: continue with the slow lookup
if (err < 0)
goto out_err;
inode = path->dentry->d_inode;
}
err = -ENOENT;
// an entry without an inode is reported as -ENOENT
if (!inode)
goto out_path_put;
if (should_follow_link(inode, follow)) {
if (nd->flags & LOOKUP_RCU) {
// leave rcu-walk before returning; failure is reported as -ECHILD
if (unlikely(unlazy_walk(nd, path->dentry))) {
err = -ECHILD;
goto out_err;
}
}
BUG_ON(inode != path->dentry->d_inode);
return 1;
}
path_to_nameidata(path, nd);
nd->inode = inode;
return 0;
out_path_put:
path_to_nameidata(path, nd);
out_err:
terminate_walk(nd);
return err;
}
lookup_slow
/*
 * lookup_slow - fallback used when the fast lookup did not resolve nd->last.
 *
 * Looks the component up under the parent directory's i_mutex through
 * __lookup_hash(), stores the resulting mount/dentry pair in @path and then
 * runs follow_managed() on it.
 *
 * Returns 0 on success (LOOKUP_JUMPED is set in nd->flags when
 * follow_managed() returns a positive value), or a negative error taken
 * from __lookup_hash() or follow_managed().
 */
static int lookup_slow(struct nameidata *nd, struct path *path)
{
	struct dentry *dir = nd->path.dentry;	/* directory being searched */
	struct dentry *child;
	int rc;

	BUG_ON(nd->inode != dir->d_inode);

	/* Arguments: the component name, its parent dentry, the lookup flags. */
	mutex_lock(&dir->d_inode->i_mutex);
	child = __lookup_hash(&nd->last, dir, nd->flags);
	mutex_unlock(&dir->d_inode->i_mutex);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/* Build the result: same mount as the parent, freshly found dentry. */
	path->mnt = nd->path.mnt;
	path->dentry = child;

	rc = follow_managed(path, nd->flags);
	if (unlikely(rc < 0)) {
		path_put_conditional(path, nd);
		return rc;
	}
	if (rc > 0)
		nd->flags |= LOOKUP_JUMPED;

	return 0;
}
__lookup_hash
/*
 * __lookup_hash - find the dentry for @name under @base.
 *
 * When called from lookup_slow() the arguments are nd->last (the component
 * being resolved), nd->path.dentry (its parent) and nd->flags.
 *
 * lookup_dcache() is consulted first; when nothing is cached it allocates a
 * new dentry with d_alloc() and asks for a real lookup, which is then
 * delegated to lookup_real() on the parent's inode.  Otherwise the value
 * from lookup_dcache() (possibly an ERR_PTR) is returned unchanged.
 */
static struct dentry *__lookup_hash(struct qstr *name,
		struct dentry *base, unsigned int flags)
{
	bool miss;
	struct dentry *found = lookup_dcache(name, base, flags, &miss);

	return miss ? lookup_real(base->d_inode, found, flags) : found;
}
// lookup_dcache: look `name` up under `dir` with d_lookup(); on a miss,
// allocate a new dentry for it.
// Parameters:
//   name        - the component to find
//   dir         - the parent dentry
//   flags       - not used in the visible code; presumably consumed by the elided block
//   need_lookup - output; set to true only when a new dentry was allocated here
//                 and the caller still has to perform the real lookup
// Returns the dentry, or ERR_PTR(-ENOMEM) when d_alloc() fails.
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
unsigned int flags, bool *need_lookup)
{
struct dentry *dentry;
int error; // not referenced in the visible code; presumably used by the elided block
*need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
... // handling of a found dentry is elided in this excerpt
}
if (!dentry) {
dentry = d_alloc(dir, name); // allocate a dentry: name is the component being looked up, dir is its parent
if (unlikely(!dentry))
return ERR_PTR(-ENOMEM);
*need_lookup = true;
}
return dentry;
}
lookup_real
/*
 * lookup_real - ask the filesystem to look up @dentry in directory @dir.
 * @dir:    inode of the parent directory
 * @dentry: dentry to be looked up
 * @flags:  lookup flags, passed through to ->lookup()
 *
 * Returns @dentry when ->lookup() returns NULL.  When ->lookup() returns a
 * non-NULL pointer, @dentry is released with dput() and that pointer is
 * returned instead.  A dead directory yields ERR_PTR(-ENOENT), again after
 * releasing @dentry.
 */
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct dentry *replacement;

	/* Don't create child dentry for a dead directory. */
	if (unlikely(IS_DEADDIR(dir))) {
		dput(dentry);
		return ERR_PTR(-ENOENT);
	}

	/* Arguments: the parent directory's inode, the dentry, the flags. */
	replacement = dir->i_op->lookup(dir, dentry, flags);
	if (likely(!replacement))
		return dentry;

	dput(dentry);
	return replacement;
}
dir->i_op->lookup 最终调用底层文件系统的 lookup 函数,以 nfs 为例:
// nfs_lookup: the NFS implementation of the directory lookup operation.
// Parameters:
//   dir    - inode of the parent directory
//   dentry - the dentry to look up (its d_name is the name sent to the server)
//   flags  - lookup flags; examined by nfs_is_exclusive_create()
// Returns:
//   NULL          - `dentry` itself was used
//   other dentry  - the non-NULL dentry returned by d_materialise_unique()
//   ERR_PTR(...)  - -ENAMETOOLONG, -ENOMEM, the protocol lookup error, or an
//                   error from nfs_fhget() / d_materialise_unique()
// fhandle and fattr are released on every exit path.
struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
struct dentry *res;
struct dentry *parent;
struct inode *inode = NULL;
struct nfs_fh *fhandle = NULL;
struct nfs_fattr *fattr = NULL;
int error;
dfprintk(VFS, "NFS: lookup(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
nfs_inc_stats(dir, NFSIOS_VFSLOOKUP);
// reject names longer than the server's limit
res = ERR_PTR(-ENAMETOOLONG);
if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
goto out;
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
*/
if (nfs_is_exclusive_create(dir, flags)) {
d_instantiate(dentry, NULL);
res = NULL;
goto out;
}
res = ERR_PTR(-ENOMEM);
fhandle = nfs_alloc_fhandle();
fattr = nfs_alloc_fattr();
if (fhandle == NULL || fattr == NULL)
goto out;
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
// protocol-specific lookup; fills fhandle and fattr
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr);
// -ENOENT is not treated as a failure: continue at no_entry with inode == NULL
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
goto out_unblock_sillyrename;
}
// obtain the inode for the returned file handle and attributes
inode = nfs_fhget(dentry->d_sb, fhandle, fattr);
res = ERR_CAST(inode);
if (IS_ERR(res))
goto out_unblock_sillyrename;
/* Success: notify readdir to use READDIRPLUS */
nfs_advise_use_readdirplus(dir);
no_entry:
// bind dentry to inode (NULL after -ENOENT); a non-NULL result replaces dentry
res = d_materialise_unique(dentry, inode);
if (res != NULL) {
if (IS_ERR(res))
goto out_unblock_sillyrename;
dentry = res;
}
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
out:
nfs_free_fattr(fattr);
nfs_free_fhandle(fhandle);
return res;
}
EXPORT_SYMBOL_GPL(nfs_lookup);