1. 闲言少叙,继续分析__link_path_walk函数:
/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
/*
 * Handles three situations:
 * (1) resolving an ordinary pathname, component by component
 * (2) resolving only up to the parent directory (LOOKUP_PARENT)
 * (3) resolving symbolic links (first fetch the target path stored in
 *     the link, then resolve that target path)
 */
static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	/* lookup flags for this walk */
	unsigned int lookup_flags = nd->flags;

	/* skip leading '/' characters; "//" is treated like "/" */
	while (*name=='/')
		name++;
	/* the path was nothing but slashes: just revalidate and return */
	if (!*name)
		goto return_reval;

	/*
	 * inode we are walking from: the starting directory on the first
	 * iteration, the previously resolved component afterwards
	 */
	inode = nd->dentry->d_inode;
	/* inside a symlink: force LOOKUP_FOLLOW for the nested walk */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		/* qstr carries the component name, its length and hash */
		struct qstr this;
		unsigned int c;

		/* more components follow (cleared for the last one below) */
		nd->flags |= LOOKUP_CONTINUE;
		/*
		 * permission check: traversing a directory requires execute
		 * permission on it
		 */
		err = exec_permission_lite(inode, nd);
		if (err == -EAGAIN)
			err = vfs_permission(nd, MAY_EXEC);
		if (err)
			break;

		/* name points at the first character of this component */
		this.name = name;
		/* first character, e.g. 'p' for "/proc" */
		c = *(const unsigned char *)name;

		/* hash the component's characters up to the next '/' or NUL */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		/* component length */
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		/* NUL: this was the last component */
		if (!c)
			goto last_component;
		while (*++name == '/');
		/* trailing '/' as in "/proc/": last component, must be a dir */
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		if (this.name[0] == '.') switch (this.len) {
			default:
				/* e.g. ".hidden" - an ordinary name */
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				/* ".." - climb back to the parent directory */
				follow_dotdot(nd);
				inode = nd->dentry->d_inode;
				/* fallthrough */
			case 1:
				/* "." (or ".." handled above): nothing to look up */
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 */
		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		/* next receives the component's dentry and vfsmount */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		err = -ENOENT;
		/* inode of the component just resolved */
		inode = next.dentry->d_inode;
		if (!inode)
			goto out_dput;
		err = -ENOTDIR;
		if (!inode->i_op)
			goto out_dput;

		/* symbolic link in the middle of the path */
		if (inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->dentry->d_inode;
			if (!inode)
				break;
			err = -ENOTDIR;
			if (!inode->i_op)
				break;
		} else
			/* commit next's dentry and vfsmount into nd */
			path_to_nameidata(&next, nd);
		err = -ENOTDIR;
		/* no lookup method: not a directory, cannot walk further */
		if (!inode->i_op->lookup)
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		/*
		 * some callers only need the parent, e.g. creating /foo/bar
		 * when bar does not exist yet: stop at foo's dentry
		 */
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		/* let the filesystem supply its own hash if it wants to */
		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
			if (err < 0)
				break;
		}
		/* look up the final component */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		/* inode of the final component */
		inode = next.dentry->d_inode;
		/* follow a trailing symlink only when LOOKUP_FOLLOW is set */
		if ((lookup_flags & LOOKUP_FOLLOW)
		    && inode && inode->i_op && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->dentry->d_inode;
		} else
			/* commit mnt and dentry into nd */
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		/* negative dentry: the file does not exist */
		if (!inode)
			break;
		/* a trailing '/' (or LOOKUP_DIRECTORY) demands a directory */
		if (lookup_flags & LOOKUP_DIRECTORY) {
			err = -ENOTDIR;
			if (!inode->i_op || !inode->i_op->lookup)
				break;
		}
		/* success: nd holds the final dentry and its vfsmount */
		goto return_base;
lookup_parent:
		/* record the final component name for the caller */
		nd->last = this;
		/* and its type */
		nd->last_type = LAST_NORM;
		/* an ordinary name */
		if (this.name[0] != '.')
			goto return_base;
		/* "." - the current directory */
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		/* ".." - the parent directory */
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 */
		if (nd->dentry && nd->dentry->d_sb &&
		    (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		dput_path(&next, nd);
		break;
	}
	path_release(nd);
return_err:
	return err;
}
这个函数主要做三件事:
(1)解析已经存在的文件路径,即打开标志
(2)解析不存在的文件路径,即创建文件标志,这样,需要得到父目录项对象和安装点对象
(3)解析符号链接,第一次找到符号链接的文件路径,第二次解析路径名
第23-26行,只有/,跳至return_reval. 这里多个根当作一个根处理,如//
第31-32行,设置符号链接标志。
第39行,定义qstr结构,这个结构包括hash值,分量长度和分量名。
第43-46行,进行权限检查,遍历目录,必须具有执行权限。
第55-60行,计算每个分量的hash值。
第68行,如果解析到最后一个分量,跳至last_component.
第72行,如果遇到类似/proc/的目录,跳至last_with_slashes.
第80行,如果分量的第一个字符是.,但第二个字符不是.,则正常解析。
第88行,当第二个字符也是. ,说明是父目录,调用follow_dotdot进行回溯。
我们分析一下这个函数:
/*
 * Resolve "..": move nd up one directory, crossing mount boundaries as
 * needed, but never climbing above the process's root directory.
 */
static __always_inline void follow_dotdot(struct nameidata *nd)
{
	/* the calling process's fs context (root, cwd, ...) */
	struct fs_struct *fs = current->fs;

	while(1) {
		struct vfsmount *parent;
		/* dentry we are climbing away from */
		struct dentry *old = nd->dentry;

		read_lock(&fs->lock);
		/* never climb above the process's root directory */
		if (nd->dentry == fs->root &&
		    nd->mnt == fs->rootmnt) {
			read_unlock(&fs->lock);
			break;
		}
		read_unlock(&fs->lock);
		spin_lock(&dcache_lock);
		/* not at the root of this mount: simply take d_parent */
		if (nd->dentry != nd->mnt->mnt_root) {
			nd->dentry = dget(nd->dentry->d_parent);
			spin_unlock(&dcache_lock);
			dput(old);
			break;
		}
		spin_unlock(&dcache_lock);
		spin_lock(&vfsmount_lock);
		parent = nd->mnt->mnt_parent;
		/* already at the topmost mount: stop */
		if (parent == nd->mnt) {
			spin_unlock(&vfsmount_lock);
			break;
		}
		mntget(parent);
		/* step onto the mountpoint dentry in the parent mount */
		nd->dentry = dget(nd->mnt->mnt_mountpoint);
		spin_unlock(&vfsmount_lock);
		dput(old);
		mntput(nd->mnt);
		nd->mnt = parent;
	}
	/* if the result is itself a mountpoint, descend to the top mount */
	follow_mount(&nd->mnt, &nd->dentry);
}
第11-16行,如果回溯的是进程的根目录,则不允许,调用follow_mount函数。
第19-23行,如果目录项对象不是根目录,则通过nd->dentry=dget(nd->dentry->d_parent)返回上一级目录项对象。
不管怎么样,最终会调用follow_mount函数。有时,人的好奇心是很强的,同样,对于Linux内核源码,也需要好奇心。哈哈,看一下follow_mount函数:
/*
 * Descend through stacked mounts: while *dentry is a mountpoint, swap
 * (*mnt, *dentry) for the filesystem mounted on top of it, repeating
 * until the topmost (most recently mounted) filesystem is reached.
 */
static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
	for (;;) {
		struct vfsmount *child;

		if (!d_mountpoint(*dentry))
			break;
		/* find the mount covering this dentry, if any */
		child = lookup_mnt(*mnt, *dentry);
		if (!child)
			break;
		/* drop the references to the covered pair... */
		dput(*dentry);
		mntput(*mnt);
		/* ...and take references to the covering mount's root */
		*mnt = child;
		*dentry = dget(child->mnt_root);
	}
}
这个函数首先判断一下dentry目录项是不是挂载点,如果是,调用lookup_mnt函数返回子挂载点。在第11行,将mnt赋值为mounted,接着,继续寻找子挂载点。最终,找到一个没有其它文件系统安装在其之上的文件系统挂载点。这里,需要解释一下,如果/dev/sda1和/dev/sda2先后挂载在/usr目录下,那么/dev/sda1的相关目录将会被隐藏,而/dev/sda2的父挂载点是/dev/sda1. 而上面的过程是通过父挂载点找子挂载点,直到找到一个没有挂载其它文件系统的挂载点为止。这个文件系统暂且称为最顶层(最后挂载)的文件系统。总之,follow_dotdot回溯到了上一级目录。
接着__link_path_walk解释,
第97行,如果底层文件系统具有计算hash值的函数,则调用。
第106行,查找分量的目录项对象函数do_lookup,这个函数一会分析。
第119行,判断是否是符号链接,调用do_follow_link处理符号链接,稍后分析。
第142行,处理最后一个分量。
第167行,调用do_lookup函数,找到一个最后分量的目录项对象和挂载点对象。
第172行,如果最后一个分量是符号链接,调用do_follow_link进一步处理。
第190行,当只是建立文件时,跳至lookup_parent.
第192-205行,最后一个分量名和分量类型,此时,nd保存了上一个分量的目录项对象和挂载点对象。
如果正确解析,返回0.
下面,分析一下do_lookup函数:
/*
 * Look up one path component below nd->dentry and fill *path with the
 * resulting (vfsmount, dentry) pair. The dentry cache is consulted
 * first; only on a miss (or when revalidation drops the cached entry)
 * do we fall back to the filesystem via real_lookup(), which creates
 * the dentry, binds it to its inode and inserts both into their caches.
 *
 * It's more convoluted than I'd like it to be, but... it's still fairly
 * small and for now I'd prefer to have fast path as straight as possible.
 * It _is_ time-critical.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
		     struct path *path)
{
	struct vfsmount *mnt = nd->mnt;
	/* fast path: lock-free dcache lookup */
	struct dentry *dentry = __d_lookup(nd->dentry, name);

	/* a cached dentry may need revalidation by the filesystem */
	if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
		dentry = do_revalidate(dentry, nd);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);
	}
	/* cache miss (or revalidation dropped it): ask the filesystem */
	if (!dentry) {
		dentry = real_lookup(nd->dentry, name, nd);
		if (IS_ERR(dentry))
			return PTR_ERR(dentry);
	}
	path->mnt = mnt;		/* vfsmount of the component */
	path->dentry = dentry;		/* dentry of the component */
	/* if something is mounted on this dentry, descend to the top mount */
	__follow_mount(path);
	return 0;
}
这个函数的主要功能是查询目录项对象,并将挂载点和目录项对象保存在nameidata结构。具体如下:
第10行,nd保存了上一个目录项对象和挂载点对象。
第12行,首先在目录项缓存dentry cache查找,如果缓存不存在,跳转到need_lookup,调用real_lookup在内存分配一个dentry,并将dentry和索引节点关联。
第17行,如果存在,需要验证目录项对象是否有效,跳至34行,如果有效,将mnt和dentry赋值给path. 在__link_path_walk会将path值赋给nd.
继续跟踪__d_lookup函数:
/*
 * Lock-free (RCU) dentry-cache lookup: find the child of @parent whose
 * name matches @name. Returns the dentry with an elevated reference
 * count, or NULL on a cache miss.
 */
struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
{
	unsigned int len = name->len;		/* component length */
	unsigned int hash = name->hash;		/* precomputed name hash */
	const unsigned char *str = name->name;	/* component name */
	struct hlist_head *head = d_hash(parent,hash);	/* hash bucket */
	struct dentry *found = NULL;
	struct hlist_node *node;
	struct dentry *dentry;

	rcu_read_lock();

	/* walk the dcache hash chain for this bucket */
	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
		struct qstr *qstr;

		/* cheap rejections first: the hash encodes the name */
		if (dentry->d_name.hash != hash)
			continue;
		/* must be a child of the same parent */
		if (dentry->d_parent != parent)
			continue;

		spin_lock(&dentry->d_lock);

		/*
		 * Recheck the dentry after taking the lock - d_move may have
		 * changed things. Don't bother checking the hash because we're
		 * about to compare the whole name anyway.
		 */
		if (dentry->d_parent != parent)
			goto next;

		/*
		 * It is safe to compare names since d_move() cannot
		 * change the qstr (protected by d_lock).
		 */
		/* dentry->d_name holds the cached component name and length */
		qstr = &dentry->d_name;
		/*
		 * filesystems may supply their own comparison, e.g.
		 * case-insensitive matching for MS-DOS
		 */
		if (parent->d_op && parent->d_op->d_compare) {
			if (parent->d_op->d_compare(parent, qstr, name))
				goto next;
		} else {
			if (qstr->len != len)
				goto next;
			if (memcmp(qstr->name, str, len))
				goto next;
		}

		/* take a reference unless the dentry is being unhashed */
		if (!d_unhashed(dentry)) {
			atomic_inc(&dentry->d_count);
			found = dentry;
		}
		spin_unlock(&dentry->d_lock);
		break;
next:
		spin_unlock(&dentry->d_lock);
	}
	rcu_read_unlock();

	return found;
}
第4-7行,赋值len,hash和name,并取得head指针,为下面比较做准备。
第14行,判断hash值是否相同。
第20行,判断父目录项parent是否相同。
第39行,匹配分量名。
如果找到,返回目录项对象。
从这个查找过程,可以看出,是用目录名或是文件名计算hash值,然后返回对应的目录项对象。这也是为什么目录名或文件名不放在索引节点而放在目录项对象的原因。
如果目录项缓存没有,继续跟踪real_lookup函数:
/*
 * This is called when everything else fails, and we actually have
 * to go to the low-level filesystem to find out what we should do..
 *
 * We get the directory semaphore, and after getting that we also
 * make sure that nobody added the entry to the dcache in the meantime..
 * SMP-safe
 *
 * Returns the dentry for @name under @parent (positive or negative),
 * or an ERR_PTR on failure.
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
{
	struct dentry * result;
	/* inode of the parent directory */
	struct inode *dir = parent->d_inode;

	mutex_lock(&dir->i_mutex);
	/*
	 * First re-do the cached lookup just in case it was created
	 * while we waited for the directory semaphore..
	 *
	 * FIXME! This could use version numbering or similar to
	 * avoid unnecessary cache lookups.
	 *
	 * The "dcache_lock" is purely to protect the RCU list walker
	 * from concurrent renames at this point (we mustn't get false
	 * negatives from the RCU list walk here, unlike the optimistic
	 * fast walk).
	 *
	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
	 */
	result = d_lookup(parent, name);
	if (!result) {
		/*
		 * still not cached: allocate and initialise a dentry whose
		 * parent is the dentry resolved in the previous step
		 * (i.e. nd->dentry)
		 */
		struct dentry * dentry = d_alloc(parent, name);
		result = ERR_PTR(-ENOMEM);
		if (dentry) {
			/*
			 * filesystem-specific lookup: reads the on-disk inode
			 * and binds it to the dentry; iget() inserts the inode
			 * into the inode cache, and instantiation inserts the
			 * dentry into the dentry cache
			 */
			result = dir->i_op->lookup(dir, dentry, nd);
			if (result)
				dput(dentry);
			else
				result = dentry;
		}
		mutex_unlock(&dir->i_mutex);
		return result;
	}

	/*
	 * Uhhuh! Nasty case: the cache was re-populated while
	 * we waited on the semaphore. Need to revalidate.
	 */
	mutex_unlock(&dir->i_mutex);
	if (result->d_op && result->d_op->d_revalidate) {
		result = do_revalidate(result, nd);
		if (!result)
			result = ERR_PTR(-ENOENT);
	}
	return result;
}
在第33行,重新搜索一下目录项缓存,由于进程在查找过程中可能阻塞,在这期间,目录项可能已经加入了dentry cache,所以需要重新查找一下。
第34行,如果没有找到,调用d_alloc分配一个目录项对象。
第35行,具体的文件系统索引节点查找函数,读取磁盘索引节点信息,并将索引节点和目录项对象关联。在iget索引节点时,将索引节点加入了inode cache. 在关联inode节点时,将目录项对象加入了dentry cache.
在第53行,验证目录项对象是否有效,最终返回目录项对象。
可以看到,此时返回的目录项对象已经加入到了dentry cache,并关联了索引节点。即dentry->d_inode=inode.
我们继续跟踪上面的两个函数,首先跟踪d_alloc函数:
/**
 * d_alloc - allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
	struct dentry *dentry;
	char *dname;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
	if (!dentry)
		return NULL;

	/* long names need a separate allocation; short ones fit inline */
	if (name->len > DNAME_INLINE_LEN-1) {
		dname = kmalloc(name->len + 1, GFP_KERNEL);
		if (!dname) {
			kmem_cache_free(dentry_cache, dentry);
			return NULL;
		}
	} else {
		dname = dentry->d_iname;
	}
	/* copy name, length and hash into the new dentry */
	dentry->d_name.name = dname;
	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	memcpy(dname, name->name, name->len);
	dname[name->len] = 0;

	atomic_set(&dentry->d_count, 1);
	dentry->d_flags = DCACHE_UNHASHED;
	spin_lock_init(&dentry->d_lock);
	dentry->d_inode = NULL;
	dentry->d_parent = NULL;
	dentry->d_sb = NULL;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	dentry->d_mounted = 0;
#ifdef CONFIG_PROFILING
	dentry->d_cookie = NULL;
#endif
	/* initialise the hash, LRU, child and alias list links */
	INIT_HLIST_NODE(&dentry->d_hash);
	INIT_LIST_HEAD(&dentry->d_lru);
	INIT_LIST_HEAD(&dentry->d_subdirs);
	INIT_LIST_HEAD(&dentry->d_alias);

	if (parent) {
		/* hook the new dentry under its parent... */
		dentry->d_parent = dget(parent);
		/* ...and inherit the parent's superblock */
		dentry->d_sb = parent->d_sb;
	} else {
		INIT_LIST_HEAD(&dentry->d_u.d_child);
	}

	spin_lock(&dcache_lock);
	if (parent)
		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
	dentry_stat.nr_dentry++;
	spin_unlock(&dcache_lock);

	return dentry;
}
第16行,为目录项对象分配内存。
第29-32行,设置名称,长度和hash值。
第48-51行,初始化相关链表。
第53行,如果父目录项对象存在,就设置父目录项对象和超级块对象。这样,就建立了一个子目录项对象。
接着跟踪lookup函数,以ext3为例,ext3_lookup:
/*
 * Look up @dentry's name in directory @dir and return the dentry bound
 * to the matching inode (or leave it negative if the name is absent).
 */
static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
	struct inode * inode;
	struct ext3_dir_entry_2 * de;
	struct buffer_head * bh;

	if (dentry->d_name.len > EXT3_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	/*
	 * find the on-disk directory entry; it carries the inode number,
	 * from which the in-core inode is built via the superblock's
	 * read_inode method
	 */
	bh = ext3_find_entry(dentry, &de);
	inode = NULL;
	if (bh) {
		unsigned long ino = le32_to_cpu(de->inode);
		brelse (bh);
		if (!ext3_valid_inum(dir->i_sb, ino)) {
			ext3_error(dir->i_sb, "ext3_lookup",
				   "bad inode number: %lu", ino);
			inode = NULL;
		} else
			/*
			 * build the in-core inode, fill its fields (including
			 * i_fop) and insert it into the inode cache
			 */
			inode = iget(dir->i_sb, ino);

		if (!inode)
			return ERR_PTR(-EACCES);
	}
	/* bind the dentry to the inode (and hash the dentry) */
	return d_splice_alias(inode, dentry);
}
第11行,得到ext3_dir_entry_2对象,该对象包含了索引节点号。
第13-16行,判断索引节点号是否合法。
第21行,创建内存索引节点,并填充相关信息,将索引节点加入inode cache.
第28行,将目录项对象和索引节点关联。
首先,跟踪iget函数:
static inline struct inode *iget(struct super_block *sb, unsigned long ino)
{
/*在内存分配一个新的索引节点*/
struct inode *inode = iget_locked(sb, ino);
/*如果是一个新的索引节点,读取磁盘上的索引节点并填充内存索引节点的相关信息*/
if (inode && (inode->i_state & I_NEW)) {
sb->s_op->read_inode(inode);
unlock_new_inode(inode);
}
return inode;
}
首先调用iget_locked分配内存索引节点。如果是新分配的,需要调用read_inode调用磁盘上的索引节点填充相关信息。
继续跟踪iget_locked函数:
/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb: super block of file system
 * @ino: inode number to get
 *
 * This is iget() without the read_inode() portion of get_new_inode_fast().
 *
 * Search the inode cache (via ifind_fast()) for the inode identified by
 * @ino on @sb; if present it is returned with an increased reference
 * count. This is for file systems where the inode number is sufficient
 * for unique identification of an inode.
 *
 * On a miss, get_new_inode_fast() allocates a new inode, which is
 * returned locked, hashed, and with the I_NEW flag set. The file system
 * gets to fill it in before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	/* hash bucket for this (sb, ino) pair */
	struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
	struct inode *found;

	/* fast path: the inode is already in the inode cache */
	found = ifind_fast(sb, bucket, ino);
	if (found)
		return found;
	/*
	 * get_new_inode_fast() will do the right thing, re-trying the
	 * search in case it had to block at any point.
	 */
	return get_new_inode_fast(sb, bucket, ino);
}
第28行,在inode cache查找,如果没有,调用get_new_inode_fast分配一个索引节点并插入inode cache.
ifind_fast留给读者自行分析吧!
分析一下,get_new_inode_fast函数:
/*
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
{
	struct inode * inode;

	/* allocate a fresh in-core inode */
	inode = alloc_inode(sb);
	if (inode) {
		struct inode * old;

		spin_lock(&inode_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino);
		if (!old) {
			/* set the inode number */
			inode->i_ino = ino;
			inodes_stat.nr_inodes++;
			/* add to the in-use list (inode_in_use) */
			list_add(&inode->i_list, &inode_in_use);
			/* add to the per-superblock inode list */
			list_add(&inode->i_sb_list, &sb->s_inodes);
			/* insert into inode_hashtable, i.e. the inode cache */
			hlist_add_head(&inode->i_hash, head);
			/* mark locked and new */
			inode->i_state = I_LOCK|I_NEW;
			spin_unlock(&inode_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		__iget(old);
		spin_unlock(&inode_lock);
		destroy_inode(inode);
		inode = old;
		wait_on_inode(inode);
	}
	return inode;
}
第9行,分配索引节点。
第17-28行,索引节点的初始化。包括:
(1)设置索引节点号
(2)加入inode_in_use链表
(3)加入inode_hashtable,即加入inode cache
(4)设置状态为I_NEW
返回索引节点。
接下来,继续分析iget函数中的第二个函数read_inode.
/*
 * Read the on-disk ext3 inode for @inode->i_ino and populate the
 * in-core inode, including the i_op/i_fop method tables chosen by file
 * type. On any consistency error the inode is marked bad.
 */
void ext3_read_inode(struct inode * inode)
{
	/* describes where the inode lives on disk */
	struct ext3_iloc iloc;
	struct ext3_inode *raw_inode;
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh;
	int block;

#ifdef CONFIG_EXT3_FS_POSIX_ACL
	ei->i_acl = EXT3_ACL_NOT_CACHED;
	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
	ei->i_block_alloc_info = NULL;

	if (__ext3_get_inode_loc(inode, &iloc, 0))
		goto bad_inode;
	bh = iloc.bh;
	/* the raw on-disk inode; its fields fill the new in-core inode */
	raw_inode = ext3_raw_inode(&iloc);
	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
	if(!(test_opt (inode->i_sb, NO_UID32))) {
		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
	}
	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
	inode->i_size = le32_to_cpu(raw_inode->i_size);
	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;

	ei->i_state = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
	/* We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes
	 * the test is that same one that e2fsck uses
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
		if (inode->i_mode == 0 ||
		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
			/* this inode is deleted */
			brelse (bh);
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
		 * valid i_mode and are being read by the orphan
		 * recovery code: that's fine, we're about to complete
		 * the process of deleting those. */
	}
	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
	ei->i_frag_no = raw_inode->i_frag;
	ei->i_frag_size = raw_inode->i_fsize;
#endif
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
	} else {
		inode->i_size |=
			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
	}
	ei->i_disksize = inode->i_size;
	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
	ei->i_block_group = iloc.block_group;
	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (block = 0; block < EXT3_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
		/*
		 * When mke2fs creates big inodes it does not zero out
		 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
		 * so ignore those first few inodes.
		 */
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
		    EXT3_INODE_SIZE(inode->i_sb))
			goto bad_inode;
		if (ei->i_extra_isize == 0) {
			/* The extra space is currently unused. Use it. */
			ei->i_extra_isize = sizeof(struct ext3_inode) -
					    EXT3_GOOD_OLD_INODE_SIZE;
		} else {
			__le32 *magic = (void *)raw_inode +
					EXT3_GOOD_OLD_INODE_SIZE +
					ei->i_extra_isize;
			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				ei->i_state |= EXT3_STATE_XATTR;
		}
	} else
		ei->i_extra_isize = 0;

	if (S_ISREG(inode->i_mode)) {
		/*
		 * regular file: install the inode and file method tables;
		 * i_fop is crucial because it is later copied into
		 * file->f_op when the file is opened
		 */
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		/* directory method tables */
		inode->i_op = &ext3_dir_inode_operations;
		inode->i_fop = &ext3_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		/* symlink method tables */
		if (ext3_inode_is_fast_symlink(inode))
			inode->i_op = &ext3_fast_symlink_inode_operations;
		else {
			inode->i_op = &ext3_symlink_inode_operations;
			ext3_set_aops(inode);
		}
	} else {
		/* special files: devices, fifos, sockets */
		inode->i_op = &ext3_special_inode_operations;
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
				old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else
			init_special_inode(inode, inode->i_mode,
				new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	}
	brelse (iloc.bh);
	ext3_set_inode_flags(inode);
	return;

bad_inode:
	make_bad_inode(inode);
	return;
}
简单说一下功能:
第19行,读取磁盘上原始索引节点,用来填充新分配的索引节点。
第20-32行,inode相关域设置。
第104行,如果是文件,则将文件相关操作的指针赋给inode->i_fop,这非常重要,因为,最后将i_fop赋给了文件对象file->f_op. 表示了文件的相关操作。
第109-111行,目录相关操作。
第112-118行,符号链接相关操作。
第119-128行,设备相关操作。具体就不分析了。
到此为止,我们已经得到了一个inode节点,并且填充了相关域。
iget函数返回,ext3_lookup继续往下走,调用d_splice_alias函数:
/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode: the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
 * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
 * and return it, else simply d_add the inode to the dentry and return NULL.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned. Otherwise NULL
 * is returned. This matches the expected return value of ->lookup.
 *
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
	struct dentry *new = NULL;

	if (inode && S_ISDIR(inode->i_mode)) {
		spin_lock(&dcache_lock);
		new = __d_find_alias(inode, 1);
		if (new) {
			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
			fsnotify_d_instantiate(new, inode);
			spin_unlock(&dcache_lock);
			security_d_instantiate(new, inode);
			d_rehash(dentry);
			d_move(new, dentry);
			iput(inode);
		} else {
			/* d_instantiate takes dcache_lock, so we do it by hand */
			/* add to the inode's alias list (head at i_dentry) */
			list_add(&dentry->d_alias, &inode->i_dentry);
			/* bind the dentry to the inode */
			dentry->d_inode = inode;
			fsnotify_d_instantiate(dentry, inode);
			spin_unlock(&dcache_lock);
			security_d_instantiate(dentry, inode);
			/* insert into dentry_hashtable, i.e. the dentry cache */
			d_rehash(dentry);
		}
	} else
		d_add(dentry, inode);
	return new;
}
第37行,将目录项对象和索引节点相关联。
第42行,将目录项对象加入到目录项缓存。
最后,返回dentry.
如果,你现在仍然很清醒,那么恭喜你,你已经基本了解了整个过程。
lookup函数返回,在__link_path_walk函数调用path_to_nameidata将path->mnt和path->dentry赋给nd->mnt和nd->dentry.表示找到的目录项对象和挂载点对象。
接下来,处理符号链接,调用do_follow_link函数:
/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups.
 */
static inline int do_follow_link(struct path *path, struct nameidata *nd)
{
	int err = -ELOOP;
	/*
	 * bound the nesting depth: a symlink that keeps pointing at other
	 * symlinks could otherwise overflow the kernel stack
	 */
	if (current->link_count >= MAX_NESTED_LINKS)
		goto loop;
	/* total number of symlinks followed during this lookup */
	if (current->total_link_count >= 40)
		goto loop;
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
	cond_resched();
	err = security_inode_follow_link(path->dentry, nd);
	if (err)
		goto loop;
	/* bump the nesting counters around the recursive follow */
	current->link_count++;
	current->total_link_count++;
	/* increase symlink depth in nd */
	nd->depth++;
	err = __do_follow_link(path, nd);
	current->link_count--;
	nd->depth--;
	return err;
loop:
	dput_path(path, nd);
	path_release(nd);
	return err;
}
这个函数首先检查符号链接的嵌套深度,不能超过MAX_NESTED_LINKS.
最终调用__do_follow_link进行处理。
/*
 * Follow one symbolic link: ask the filesystem for the target path
 * stored in the link's inode and, if one was stashed in nd, resolve it.
 */
static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
{
	int error;
	void *cookie;
	struct dentry *dentry = path->dentry;

	/* update the link inode's access time */
	touch_atime(path->mnt, dentry);
	/* clear the saved-link slot (nd->saved_names) for this depth */
	nd_set_link(nd, NULL);

	if (path->mnt != nd->mnt) {
		path_to_nameidata(path, nd);
		dget(dentry);
	}
	mntget(path->mnt);
	/*
	 * extract the path stored in the symlink; filesystems typically
	 * stash it into nd->saved_names via nd_set_link()
	 */
	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(cookie);
	if (!IS_ERR(cookie)) {
		/* the target pathname, if the filesystem stashed one */
		char *s = nd_get_link(nd);
		error = 0;
		if (s)
			/* resolve the target pathname */
			error = __vfs_follow_link(nd, s);
		if (dentry->d_inode->i_op->put_link)
			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
	}
	dput(dentry);
	mntput(path->mnt);
	return error;
}
第15行,取出符号链接的路径,放到nd->saved_names可以看出,符号链接有自己的inode节点,并且inode节点保存的内容是真正的文件路径。所以,符号链接可以跨文件系统。
第22行,调用__vfs_follow_link解析路径名。
/*
 * Resolve the pathname @link stored in a symbolic link by handing it to
 * link_path_walk(), restarting from the root if the target is absolute.
 */
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int res = 0;
	char *name;

	if (IS_ERR(link))
		goto fail;

	/* absolute target: restart the walk from the root directory */
	if (*link == '/') {
		path_release(nd);
		if (!walk_init_root(link, nd))
			/* weird __emul_prefix() stuff did it */
			goto out;
	}
	res = link_path_walk(link, nd);
out:
	if (nd->depth || res || nd->last_type!=LAST_NORM)
		return res;
	/*
	 * If it is an iterative symlinks resolution in open_namei() we
	 * have to copy the last component. And all that crap because of
	 * bloody create() on broken symlinks. Furrfu...
	 */
	name = __getname();
	if (unlikely(!name)) {
		path_release(nd);
		return -ENOMEM;
	}
	strcpy(name, nd->last.name);
	nd->last.name = name;
	return 0;
fail:
	path_release(nd);
	return PTR_ERR(link);
}
第15行,调用link_path_walk. 看到这个函数,松了一口气,因为前面已经分析过了。
当__link_path_walk返回时,link_path_walk也跟着返回,之后do_path_lookup也返回了,最终回到open_namei函数。
如果是打开文件,返回即可。
如果是创建文件,还需调用open_namei_create函数:
/*
 * Create the final path component during open(O_CREAT): apply the
 * process umask, create the inode via vfs_create(), repoint nd at the
 * newly created dentry and perform the final open-permission check.
 */
static int open_namei_create(struct nameidata *nd, struct path *path,
		int flag, int mode)
{
	struct dentry *dir = nd->dentry;
	int error;

	/* honour the process umask unless POSIX ACLs are in force */
	if (!IS_POSIXACL(dir->d_inode))
		mode &= ~current->fs->umask;
	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
	mutex_unlock(&dir->d_inode->i_mutex);
	/* nd now refers to the newly created file, not its parent */
	dput(nd->dentry);
	nd->dentry = path->dentry;
	if (error)
		return error;
	/* Don't check for write permission, don't truncate */
	return may_open(nd, 0, flag & ~O_TRUNC);
}
封装了vfs_create函数:
/*
 * Create a regular-file inode in directory @dir for the (so far
 * negative) @dentry, delegating the real work to the filesystem's
 * i_op->create() method after permission and security checks.
 */
int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	/* may the caller create an entry in this directory? */
	int error = may_create(dir, dentry, nd);

	if (error)
		return error;

	if (!dir->i_op || !dir->i_op->create)
		return -EACCES;	/* shouldn't it be ENOSYS? */
	/* keep only permission bits and force the regular-file type */
	mode &= S_IALLUGO;
	mode |= S_IFREG;
	error = security_inode_create(dir, dentry, mode);
	if (error)
		return error;
	DQUOT_INIT(dir);
	/* filesystem-specific create, e.g. ext3_create() */
	error = dir->i_op->create(dir, dentry, mode, nd);
	if (!error)
		fsnotify_create(dir, dentry);
	return error;
}
调用inode的create方法创建索引节点。以ext3为例,调用ext3_create函数:
/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
 */
static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
		struct nameidata *nd)
{
	handle_t *handle;
	struct inode * inode;
	int err, retries = 0;

retry:
	/* start a journal transaction large enough for the create */
	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		handle->h_sync = 1;

	/* allocate a fresh on-disk inode */
	inode = ext3_new_inode (handle, dir, mode);
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		/*
		 * install the regular-file method tables; i_fop is later
		 * copied into file->f_op when the file is opened
		 */
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
		/* link the name into the directory and instantiate */
		err = ext3_add_nondir(handle, dentry, inode);
	}
	ext3_journal_stop(handle);
	/* retry once allocation pressure may have been relieved */
	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}
第26行,创建索引节点。
第29-33行,inode->i_op和inode->i_fop赋值。
之后,还会将索引节点标识为脏,需要回写到磁盘上,具体实现就不分析了。
当open_namei函数返回时,open系统调用也就分析完了。
总结:
(1)建立一个struct file结构体,将nameidata相关域填充到这个结构体,最重要的两个域mnt, dentry. 从dentry可得到inode,从而将i_fop赋给文件对象。
(2)在路径查找时,通过父目录项建立子目录项,然后将子目录项关联inode节点。
(3)打开文件和建立文件不同。打开文件,只需要找到目录项对象,然后关联索引节点即可,因为索引节点存在。而建立文件时,由于文件不存在,首先找到目录的目录项对象,然后建立子目录项对象和索引节点对象,最后索引节点对象需要同步到磁盘上。
(4)有两个缓存,dentry cache和inode cache,分别用来缓存目录项对象和索引节点对象。
(5)将文件对象和进程的files_struct相关联。
(6)主要处理三种情形:打开文件,建立文件和符号链接
参考文献: <深入理解Linux内核第3版>