2. 首先调用sys_mount,代码在<fs/namespace.c>文件,如下:
/**
 * sys_mount - mount() system call entry point
 *
 * dev_name: user-space name of a device file containing a filesystem,
 *           e.g. /dev/sda (may be NULL for some filesystem types)
 * dir_name: user-space path of the mount point directory
 * type:     name of a registered filesystem type
 * flags:    mount flags (MS_*)
 * data:     filesystem-specific option block, may be NULL
 *
 * Copies each user-space argument into a kernel page, then calls
 * do_mount() under the big kernel lock.  Note the fall-through goto
 * ladder at the end: cleanup labels release resources in the reverse
 * order of their acquisition, and the success path falls through them.
 */
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
char __user * type, unsigned long flags,
void __user * data)
{
int retval;
unsigned long data_page;
unsigned long type_page;
unsigned long dev_page;
char *dir_page;
retval = copy_mount_options(type, &type_page);/* copy the type string into a fresh page, zero-padded to PAGE_SIZE */
if (retval < 0)
return retval;
dir_page = getname(dir_name);/* copy the mount-point path from user space into a kernel page */
retval = PTR_ERR(dir_page);
if (IS_ERR(dir_page))
goto out1;
retval = copy_mount_options(dev_name, &dev_page);/* copy the device name from user space */
if (retval < 0)
goto out2;
retval = copy_mount_options(data, &data_page);/* copy the fs-private data block (may be NULL) */
if (retval < 0)
goto out3;
/* take the big kernel lock */
lock_kernel();
/* perform the actual mount */
retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
flags, (void *)data_page);
unlock_kernel();
free_page(data_page);
out3:
free_page(dev_page);
out2:
putname(dir_page);
out1:
free_page(type_page);
return retval;
}
sys_mount函数首先将用户空间传入的设备路径名dev_name, 文件系统类型type和data分别复制到内核页dev_page, type_page和data_page中,然后锁定内核,调用do_mount函数进行处理。
/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(char *dev_name, char *dir_name, char *type_page,
unsigned long flags, void *data_page)
{
struct nameidata nd;
int retval = 0;
int mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
flags &= ~MS_MGC_MSK;
/* Basic sanity checks */
/* the directory name must be non-empty and NUL-terminated within a page;
   the device name, if given, must be NUL-terminated within a page */
if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
return -EINVAL;
if (dev_name && !memchr(dev_name, 0, PAGE_SIZE))
return -EINVAL;
if (data_page)
((char *)data_page)[PAGE_SIZE - 1] = 0;
/* translate MS_* mount flags into per-mountpoint MNT_* flags */
/* Separate the per-mountpoint flags */
if (flags & MS_NOSUID)
mnt_flags |= MNT_NOSUID;/* ignore set-user-ID and set-group-ID bits */
if (flags & MS_NODEV)
mnt_flags |= MNT_NODEV;/* disallow access to device special files */
if (flags & MS_NOEXEC)
mnt_flags |= MNT_NOEXEC;/* disallow program execution */
if (flags & MS_NOATIME)
mnt_flags |= MNT_NOATIME;/* do not update file access times */
if (flags & MS_NODIRATIME)
mnt_flags |= MNT_NODIRATIME;/* do not update directory access times */
if (flags & MS_RELATIME)
mnt_flags |= MNT_RELATIME;
/* strip the per-mountpoint bits from the superblock flags */
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME);
/* ... and get the mountpoint: look up the path; nd receives the
   mount-point dentry and the vfsmount it currently lives on */
retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
if (retval)
return retval;
retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
if (retval)
goto dput_out;
/* remount: typically changes mount flags in place (e.g. read-only to
   read-write) without changing the mount point */
if (flags & MS_REMOUNT)
retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
/* bind mount: make a subtree of an existing filesystem visible at a
   second location, so it can be reached from both places */
else if (flags & MS_BIND)
retval = do_loopback(&nd, dev_name, flags & MS_REC);
/* change the propagation type of an existing mount point */
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&nd, flags);
/* move an already mounted filesystem to a new mount point */
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
else/* the common case: establish a brand new mount */
retval = do_new_mount(&nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
path_release(&nd);
return retval;
}
do_mount首先进行基本检查,包括目录名是否为空,设备名是否为空等,接下来,根据flags设置挂载标志mnt_flags. 其中, path_lookup是路径查找函数,根据目录名找到目录项对象,并将目录项对象和安装点对象存储在nameidata结构体。根据标志作一些判断,包括是否需要重新挂载文件系统(do_remount), 我们关心的是do_new_mount这个函数,即挂载一个新的文件系统,继续跟踪这个函数:
/*
 * do_new_mount - create a new mount for userspace and request it to be
 * added into the namespace's mount tree.
 *
 * nd:        lookup result holding the mount-point dentry and vfsmount
 * type:      registered filesystem type name (e.g. "ext3")
 * flags:     superblock mount flags
 * mnt_flags: per-mountpoint MNT_* flags
 * name:      device name (e.g. /dev/sda)
 * data:      filesystem-private option block
 */
static int do_new_mount(struct nameidata *nd, char *type, int flags,
int mnt_flags, char *name, void *data)
{
struct vfsmount *new_mnt;

/* The type string must exist and be NUL-terminated within one page. */
if (!type || !memchr(type, 0, PAGE_SIZE))
return -EINVAL;

/* we need capabilities... only an administrator may mount */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;

/* Build the vfsmount (this also creates/finds the superblock). */
new_mnt = do_kern_mount(type, flags, name, data);
if (IS_ERR(new_mnt))
return PTR_ERR(new_mnt);

/* Graft it into the mount tree: hash table, namespace list and the
 * parent mount's child list. */
return do_add_mount(new_mnt, nd, mnt_flags, NULL);
}
这个函数主要完成两大功能,第一,建立一个新的安装点对象和超级块对象,并将安装点对象和超级块对象相关联。第二,将安装点对象加入到mount tree。我们分别看一下这两个函数,首先跟踪do_kern_mount函数:
/**
 * do_kern_mount - mount a filesystem by type name
 * @fstype: name of the filesystem type to mount, e.g. "ext3"
 * @flags:  mount flags
 * @name:   block device path name, e.g. /dev/sda
 * @data:   additional fs-private data, eventually handed to the
 *          superblock fill routine
 *
 * Returns the new vfsmount, or an ERR_PTR on failure.
 */
struct vfsmount *
do_kern_mount(const char *fstype, int flags, const char *name, void *data)
{
struct file_system_type *fs;
struct vfsmount *result;

/* Look the name up in the list of registered filesystem types. */
fs = get_fs_type(fstype);
if (!fs)
return ERR_PTR(-ENODEV);

/* Build the mount (and its superblock) for this type. */
result = vfs_kern_mount(fs, flags, name, data);

/* Drop the module/type reference taken by get_fs_type(). */
put_filesystem(fs);
return result;
}
首先在file_system链表查找已经注册的文件系统类型,在文件系统类型一节提到。定义一个mnt指针,调用vfs_kern_mount:
/**
 * vfs_kern_mount - create a vfsmount bound to a new or existing superblock
 * @type:  filesystem type (already looked up)
 * @flags: mount flags, e.g. MS_BIND
 * @name:  device path name
 * @data:  fs-private additional data, passed to the type's get_sb routine
 *
 * Returns a vfsmount already associated with its superblock, or an
 * ERR_PTR.  Error unwinding runs backwards through the goto labels.
 */
struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
struct vfsmount *mnt;
char *secdata = NULL;
int error;
if (!type)
return ERR_PTR(-ENODEV);
error = -ENOMEM;
/* allocate and initialise the vfsmount object */
mnt = alloc_vfsmnt(name);
if (!mnt)
goto out;
if (data) {
secdata = alloc_secdata();
if (!secdata)
goto out_mnt;
error = security_sb_copy_data(type, data, secdata);
if (error)
goto out_free_secdata;
}
/* filesystem-specific step: allocate and fill the superblock and link
   it with this vfsmount (for ext3 this is ext3_get_sb) */
error = type->get_sb(type, flags, name, data, mnt);
if (error < 0)
goto out_free_secdata;
error = security_sb_kern_mount(mnt->mnt_sb, secdata);
if (error)
goto out_sb;
/* Provisionally point the mount at its own root and at itself as
   parent; graft_tree()/do_add_mount() will later replace these with
   the real mount point and parent when attaching to the tree. */
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
up_write(&mnt->mnt_sb->s_umount);
free_secdata(secdata);
return mnt;
out_sb:
dput(mnt->mnt_root);
up_write(&mnt->mnt_sb->s_umount);
deactivate_super(mnt->mnt_sb);
out_free_secdata:
free_secdata(secdata);
out_mnt:
free_vfsmnt(mnt);
out:
return ERR_PTR(error);
}
这个函数比较复杂,包括几个关键的部分,首先调用alloc_vfsmnt分配并初始化安装点对象。接下来,调用type->get_sb分配并初始化超级块信息,并将超级块信息和mnt相关联,type涉及到具体的文件系统,一会分析。最后设置安装点目录项对象和父安装点。先看一下alloc_vfsmnt函数:
/*分配并初始化安装点对象vfsmount*/
struct vfsmount *alloc_vfsmnt(const char *name)
{ /*在内存分配一个struct vfsmount*/
struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
if (mnt) {
memset(mnt, 0, sizeof(struct vfsmount));
atomic_set(&mnt->mnt_count, 1);
/*hash链表指针*/
INIT_LIST_HEAD(&mnt->mnt_hash);
/*子安装点的下一个对象指针*/
INIT_LIST_HEAD(&mnt->mnt_child);
/*子安装点链表的头指针*/
INIT_LIST_HEAD(&mnt->mnt_mounts);
/*指向命名空间的下一个安装点对象*/
INIT_LIST_HEAD(&mnt->mnt_list);
/*文件系统的过期链表*/
INIT_LIST_HEAD(&mnt->mnt_expire);
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
if (name) {
int size = strlen(name) + 1;
/*分配设备名内存*/
char *newname = kmalloc(size, GFP_KERNEL);
if (newname) {
memcpy(newname, name, size);
/*将安装点对象关联设备名称*/
mnt->mnt_devname = newname;
}
}
}
return mnt;
}
看到这个函数,应该欣喜,比较简单,在内存分配一个vfsmount,并初始化相应的链表信息。最后将挂载点对象关联设备名称。
下面看一下type->get_sb函数,由于在注册文件系统类型时就注册了get_sb函数,所以这个函数与具体的文件系统类型相关,以ext3为例,其对应函数为: ext3_get_sb:
/*
 * ext3_get_sb - ext3's implementation of file_system_type->get_sb.
 * Delegates to the generic block-device helper get_sb_bdev(), passing
 * ext3_fill_super as the callback that reads and validates the on-disk
 * superblock once the device has been opened.
 */
static int ext3_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt);
}
调用了get_sb_bdev,设置回调函数ext3_fill_super,进行超级块的填充,这个函数会在以后调用。
/**
 * get_sb_bdev - generic "mount a block device" helper
 *
 * A filesystem type can own several superblocks, one per mounted
 * filesystem instance: e.g. the ext3 type can have one superblock for
 * /dev/sda and another for /dev/sdb.  This helper opens the device,
 * finds or creates the matching superblock via sget(), fills it on
 * first mount, and finally associates the superblock with the
 * vfsmount, completing the vfsmount <-> super_block link.
 */
int get_sb_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int),
struct vfsmount *mnt)
{
struct block_device *bdev;
struct super_block *s;
int error = 0;
/* open the block device exclusively, read-only or read-write per flags */
bdev = open_bdev_excl(dev_name, flags, fs_type);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
/*
 * once the super is inserted into the list by sget, s_umount
 * will protect the lockfs code from trying to start a snapshot
 * while we are mounting
 */
down(&bdev->bd_mount_sem);
/* find (keyed by bdev) or create the superblock */
s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
up(&bdev->bd_mount_sem);
if (IS_ERR(s))
goto error_s;
/* s_root set => this device is already mounted; reuse its superblock */
if (s->s_root) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
up_write(&s->s_umount);
deactivate_super(s);
error = -EBUSY;
goto error_bdev;
}
/* drop our extra open of the block device */
close_bdev_excl(bdev);
} else {
char b[BDEVNAME_SIZE];
/* first mount: record the mount flags */
s->s_flags = flags;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
/* pick a block size (between the device sector size and PAGE_SIZE) */
sb_set_blocksize(s, block_size(bdev));
/* fs-specific fill: reads the on-disk superblock, sets up the root
   dentry, the super_operations, etc. (ext3_fill_super for ext3) */
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
if (error) {
up_write(&s->s_umount);
deactivate_super(s);
goto error;
}
s->s_flags |= MS_ACTIVE;
bdev_uevent(bdev, KOBJ_MOUNT);
}
/* bind the vfsmount to the superblock; returns 0 on success */
return simple_set_mnt(mnt, s);
error_s:
error = PTR_ERR(s);
error_bdev:
close_bdev_excl(bdev);
error:
return error;
}
首先,以互斥的方式打开设备open_bdev_excl, 接下来sget函数得到一个超级块对象,fill_super填充超级块的相关信息,最后simple_set_mnt函数将超级块对象和挂载点对象相关联。那么sget函数是怎么得到一个超级块对象呢?
/** Search type->fs_supers for a superblock of the same filesystem type;
 * if one matches, return it, otherwise allocate a new superblock and
 * insert it into type->fs_supers.
 *
 * sget - find or create a superblock
 * @type: filesystem type superblock should belong to
 * @test: comparison callback
 * @set: setup callback
 * @data: argument to each of them
 */
struct super_block *sget(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
void *data)
{
struct super_block *s = NULL;
struct list_head *p;
int err;
retry:
spin_lock(&sb_lock);
/* Walk the list of superblocks belonging to this type (head in
   type->fs_supers, linked via s_instances).  For block devices the
   @test callback compares s->s_bdev against @data, which @set stored
   when the superblock was first created — a match means the device is
   already mounted. */
if (test) list_for_each(p, &type->fs_supers) {
struct super_block *old;
old = list_entry(p, struct super_block, s_instances);
if (!test(old, data))/* not the superblock we are looking for */
continue;
if (!grab_super(old))
goto retry;
if (s)
destroy_super(s);
/* found an existing one — return it */
return old;
}
/* not found: allocate one (dropping sb_lock first) and retry the
   search, since someone may have raced us while we slept */
if (!s) {
spin_unlock(&sb_lock);
/* allocate a fresh superblock object */
s = alloc_super(type);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
}
/* let the @set callback bind s to @data (e.g. s->s_bdev = bdev) */
err = set(s, data);
if (err) {
spin_unlock(&sb_lock);
destroy_super(s);
return ERR_PTR(err);
}
/* record the owning filesystem type */
s->s_type = type;
/* seed s_id with the type name (get_sb_bdev overwrites it with the
   device name later) */
strlcpy(s->s_id, type->name, sizeof(s->s_id));
/* add to the global list of all superblocks (head: super_blocks) */
list_add_tail(&s->s_list, &super_blocks);
/* add to the per-type list of superblocks */
list_add(&s->s_instances, &type->fs_supers);
spin_unlock(&sb_lock);
/* pin the filesystem type while this superblock exists */
get_filesystem(type);
return s;
}
这个函数,首先在超级块链表fs_supers查找超级块对象,如果找到,则返回。否则,创建一个超级块对象。alloc_super在内存分配超级块对象,然后设置所属的文件系统类型,将超级块对象加入到所有超级块链表(super_blocks)和将超级块加入到属于同种文件系统类型的链表(type->fs_supers). 最后返回超级块对象s. 在得到超级块对象之后,在get_sb_bdev函数的第30行,判断一下原超级块是否存在,如果原超级块存在,说明文件系统已存在,并且超级块已经填充过,此时调用close_bdev_excl关闭块设备。如果原超级块不存在,接下来,对超级块进行填充,以ext3文件系统为例,调用ext3_fill_super函数。
/* Read the ext3 superblock off disk, validate it, and fill in the VFS
 * super_block: mount options, geometry, group descriptors, journal,
 * and finally the root inode/dentry. */
static int ext3_fill_super (struct super_block *sb, void *data, int silent)
{
struct buffer_head * bh;
/* on-disk superblock layout */
struct ext3_super_block *es = NULL;
/* in-memory ext3-private superblock info */
struct ext3_sb_info *sbi;
ext3_fsblk_t block;
/* logical block number of the superblock (option "sb=" may override) */
ext3_fsblk_t sb_block = get_sb_block(&data);
ext3_fsblk_t logic_sb_block;
unsigned long offset = 0;
unsigned int journal_inum = 0;
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
/* root inode */
struct inode *root;
int blocksize;
int hblock;
int db_count;
int i;
int needs_recovery;
__le32 features;
/* allocate the zeroed in-memory fs-private info structure */
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
/* hang it off the VFS superblock */
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT3_DEF_RESUID;
sbi->s_resgid = EXT3_DEF_RESGID;
unlock_kernel();
/* pick an initial block size (at least EXT3_MIN_BLOCK_SIZE) */
blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE);
if (!blocksize) {
printk(KERN_ERR "EXT3-fs: unable to set blocksize\n");
goto out_fail;
}
/*
 * The ext3 superblock will not be buffer aligned for other than 1kB
 * block sizes. We need to calculate the offset from buffer start.
 */
if (blocksize != EXT3_MIN_BLOCK_SIZE) {
logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
} else {
logic_sb_block = sb_block;
}
/* read the block containing the superblock */
if (!(bh = sb_bread(sb, logic_sb_block))) {
printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
goto out_fail;
}
/*
 * Note: s_es must be initialized as soon as possible because
 * some ext3 macro-instructions depend on its value
 */
/* locate the on-disk struct ext3_super_block within the buffer */
es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
/* s_es points into the buffer cache copy */
sbi->s_es = es;
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT3_SUPER_MAGIC)
goto cantfind_ext3;
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
if (def_mount_opts & EXT3_DEFM_DEBUG)
set_opt(sbi->s_mount_opt, DEBUG);
if (def_mount_opts & EXT3_DEFM_BSDGROUPS)
set_opt(sbi->s_mount_opt, GRPID);
if (def_mount_opts & EXT3_DEFM_UID16)
set_opt(sbi->s_mount_opt, NO_UID32);
if (def_mount_opts & EXT3_DEFM_XATTR_USER)
set_opt(sbi->s_mount_opt, XATTR_USER);
if (def_mount_opts & EXT3_DEFM_ACL)
set_opt(sbi->s_mount_opt, POSIX_ACL);
if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_DATA)
sbi->s_mount_opt |= EXT3_MOUNT_JOURNAL_DATA;
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_ORDERED)
sbi->s_mount_opt |= EXT3_MOUNT_ORDERED_DATA;
else if ((def_mount_opts & EXT3_DEFM_JMODE) == EXT3_DEFM_JMODE_WBACK)
sbi->s_mount_opt |= EXT3_MOUNT_WRITEBACK_DATA;
if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_PANIC)
set_opt(sbi->s_mount_opt, ERRORS_PANIC);
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
set_opt(sbi->s_mount_opt, ERRORS_RO);
else
set_opt(sbi->s_mount_opt, ERRORS_CONT);
sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
set_opt(sbi->s_mount_opt, RESERVATION);
/* parse user-supplied mount options; they override the defaults */
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
(EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
printk(KERN_WARNING
"EXT3-fs warning: feature flags set on rev 0 fs, "
"running e2fsck is recommended\n");
/*
 * Check feature flags regardless of the revision level, since we
 * previously didn't change the revision level when setting the flags,
 * so there is a chance incompat flags are set on a rev 0 filesystem.
 */
features = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP);
if (features) {
printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
"unsupported optional features (%x).\n",
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
features = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP);
if (!(sb->s_flags & MS_RDONLY) && features) {
printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
"unsupported optional features (%x).\n",
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
/* the real filesystem block size, from the on-disk superblock */
blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
if (blocksize < EXT3_MIN_BLOCK_SIZE ||
blocksize > EXT3_MAX_BLOCK_SIZE) {
printk(KERN_ERR
"EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
blocksize, sb->s_id);
goto failed_mount;
}
hblock = bdev_hardsect_size(sb->s_bdev);
if (sb->s_blocksize != blocksize) {
/*
 * Make sure the blocksize for the filesystem is larger
 * than the hardware sectorsize for the machine.
 */
if (blocksize < hblock) {
printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
"device blocksize %d.\n", blocksize, hblock);
goto failed_mount;
}
/* re-read the superblock with the correct block size */
brelse (bh);
sb_set_blocksize(sb, blocksize);
logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
bh = sb_bread(sb, logic_sb_block);
if (!bh) {
printk(KERN_ERR
"EXT3-fs: Can't read superblock on 2nd try.\n");
goto failed_mount;
}
es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
printk (KERN_ERR
"EXT3-fs: Magic mismatch, very weird !\n");
goto failed_mount;
}
}
sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
} else {
sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
if ((sbi->s_inode_size < EXT3_GOOD_OLD_INODE_SIZE) ||
(sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
(sbi->s_inode_size > blocksize)) {
printk (KERN_ERR
"EXT3-fs: unsupported inode size: %d\n",
sbi->s_inode_size);
goto failed_mount;
}
}
sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
le32_to_cpu(es->s_log_frag_size);
if (blocksize != sbi->s_frag_size) {
printk(KERN_ERR
"EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
sbi->s_frag_size, blocksize);
goto failed_mount;
}
/* cache per-group geometry derived from the on-disk superblock */
sbi->s_frags_per_block = 1;
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
if (EXT3_INODE_SIZE(sb) == 0)
goto cantfind_ext3;
sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
if (sbi->s_inodes_per_block == 0)
goto cantfind_ext3;
sbi->s_itb_per_group = sbi->s_inodes_per_group /
sbi->s_inodes_per_block;
sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
sbi->s_sbh = bh;
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb));
for (i=0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
/* each bitmap lives in one block, so per-group counts are capped at
   blocksize * 8 bits */
if (sbi->s_blocks_per_group > blocksize * 8) {
printk (KERN_ERR
"EXT3-fs: #blocks per group too big: %lu\n",
sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_frags_per_group > blocksize * 8) {
printk (KERN_ERR
"EXT3-fs: #fragments per group too big: %lu\n",
sbi->s_frags_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > blocksize * 8) {
printk (KERN_ERR
"EXT3-fs: #inodes per group too big: %lu\n",
sbi->s_inodes_per_group);
goto failed_mount;
}
if (le32_to_cpu(es->s_blocks_count) >
(sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
printk(KERN_ERR "EXT3-fs: filesystem on %s:"
" too large to mount safely\n", sb->s_id);
if (sizeof(sector_t) < 8)
printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
"enabled\n");
goto failed_mount;
}
if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext3;
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT3_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
EXT3_DESC_PER_BLOCK(sb);
/* read all group descriptor blocks and keep their buffer heads */
sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
printk (KERN_ERR "EXT3-fs: not enough memory\n");
goto failed_mount;
}
bgl_lock_init(&sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
printk (KERN_ERR "EXT3-fs: "
"can't read group descriptor %d\n", i);
db_count = i;
goto failed_mount2;
}
}
if (!ext3_check_descriptors (sb)) {
printk(KERN_ERR "EXT3-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
/* per-cpu counters for free blocks/inodes and directory count */
percpu_counter_init(&sbi->s_freeblocks_counter,
ext3_count_free_blocks(sb));
percpu_counter_init(&sbi->s_freeinodes_counter,
ext3_count_free_inodes(sb));
percpu_counter_init(&sbi->s_dirs_counter,
ext3_count_dirs(sb));
/* per fileystem reservation list head & lock */
spin_lock_init(&sbi->s_rsv_window_lock);
sbi->s_rsv_window_root = RB_ROOT;
/* Add a single, static dummy reservation to the start of the
 * reservation window list --- it gives us a placeholder for
 * append-at-start-of-list which makes the allocation logic
 * _much_ simpler. */
sbi->s_rsv_window_head.rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
sbi->s_rsv_window_head.rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
sbi->s_rsv_window_head.rsv_alloc_hit = 0;
sbi->s_rsv_window_head.rsv_goal_size = 0;
ext3_rsv_window_add(sb, &sbi->s_rsv_window_head);
/*
 * set up enough so that it can read an inode
 */
sb->s_op = &ext3_sops;/* superblock operations (read_inode etc.) */
sb->s_export_op = &ext3_export_ops;
sb->s_xattr = ext3_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->s_qcop = &ext3_qctl_operations;
sb->dq_op = &ext3_quota_operations;
#endif
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
/* no root dentry yet; set below once the root inode is read */
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
EXT3_HAS_INCOMPAT_FEATURE(sb,
EXT3_FEATURE_INCOMPAT_RECOVER));
/*
 * The first inode we look at is the journal inode. Don't try
 * root first: it may be modified in the journal!
 */
if (!test_opt(sb, NOLOAD) &&
EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext3_load_journal(sb, es, journal_devnum))
goto failed_mount3;
} else if (journal_inum) {
if (ext3_create_journal(sb, es, journal_inum))
goto failed_mount3;
} else {
if (!silent)
printk (KERN_ERR
"ext3: No journal on filesystem on %s\n",
sb->s_id);
goto failed_mount3;
}
/* We have now updated the journal if required, so we can
 * validate the data journaling mode. */
switch (test_opt(sb, DATA_FLAGS)) {
case 0:
/* No mode set, assume a default based on the journal
   capabilities: ORDERED_DATA if the journal can
   cope, else JOURNAL_DATA */
if (journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
set_opt(sbi->s_mount_opt, ORDERED_DATA);
else
set_opt(sbi->s_mount_opt, JOURNAL_DATA);
break;
case EXT3_MOUNT_ORDERED_DATA:
case EXT3_MOUNT_WRITEBACK_DATA:
if (!journal_check_available_features
(sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
printk(KERN_ERR "EXT3-fs: Journal does not support "
"requested data journaling mode\n");
goto failed_mount4;
}
/* fallthrough */
default:
break;
}
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
printk(KERN_WARNING "EXT3-fs: Ignoring nobh option - "
"its supported only with writeback mode\n");
clear_opt(sbi->s_mount_opt, NOBH);
}
}
/*
 * The journal_load will have done any necessary log recovery,
 * so we can safely mount the rest of the filesystem now.
 */
/* read the root inode (checking the inode cache first)
 * NOTE(review): iget()'s result is not validated here before use;
 * later kernels check for a bad root inode at this point — confirm
 * against the exact tree this snippet was taken from. */
root = iget(sb, EXT3_ROOT_INO);
/* allocate the root dentry and bind it to the root inode */
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
printk(KERN_ERR "EXT3-fs: get root inode failed\n");
iput(root);
goto failed_mount4;
}
/* reject a root inode that is not a sane directory */
if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
dput(sb->s_root);
sb->s_root = NULL;
printk(KERN_ERR "EXT3-fs: corrupt root inode, run e2fsck\n");
goto failed_mount4;
}
/* finalise the superblock state and write it back to disk */
ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
/*
 * akpm: core read_super() calls in here with the superblock locked.
 * That deadlocks, because orphan cleanup needs to lock the superblock
 * in numerous places. Here we just pop the lock - it's relatively
 * harmless, because we are now ready to accept write_super() requests,
 * and aviro says that's the only reason for hanging onto the
 * superblock lock.
 */
EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
ext3_orphan_cleanup(sb, es);
EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
if (needs_recovery)
printk (KERN_INFO "EXT3-fs: recovery complete.\n");
ext3_mark_recovery_complete(sb, es);
printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
lock_kernel();
return 0;
cantfind_ext3:
if (!silent)
printk(KERN_ERR "VFS: Can't find ext3 filesystem on dev %s.\n",
sb->s_id);
goto failed_mount;
failed_mount4:
journal_destroy(sbi->s_journal);
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
failed_mount:
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
#endif
ext3_blkdev_remove(sbi);
brelse(bh);
out_fail:
sb->s_fs_info = NULL;
kfree(sbi);
lock_kernel();
return -EINVAL;
}
这个函数非常长,主要对内存超级块对象进行填充。在函数的结尾部分,通过iget得到根目录的索引节点,并将sb->s_root指向根目录的目录项对象,然后调用ext3_setup_super将超级块写到磁盘上。
在填充超级块之后,在get_sb_bdev函数的第58行,调用simple_set_mnt将挂载点对象和超级块对象相关联,具体如下:
/* Associate a vfsmount with its superblock: point mnt_sb at the
 * superblock and make mnt_root a new reference to the superblock's
 * root dentry.  Always returns 0. */
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
/* the superblock this mount presents */
mnt->mnt_sb = sb;
/* the mount's root is the superblock's root dentry (refcounted) */
mnt->mnt_root = dget(sb->s_root);
return 0;
}
主要进行赋值操作,将安装点对象的mnt_sb指向超级块,安装点的根目录项对象mnt_root指向sb->s_root. 至此,挂载点已经完成了,超级块对象的创建,挂载点对象的创建,并且把超级块对象和挂载点对象关联起来了。回到vfs_kern_mount函数,第42行和第43行,设置挂载点的目录项对象和父挂载点。在以后的do_add_mount函数会重新设置。当vfs_kern_mount函数返回时,do_kern_mount也就返回了。接下来,在do_new_mount函数中继续调用do_add_mount函数,将挂载点对象加入到mount tree,具体实现如下:
/* Insert a vfsmount into the namespace's mount tree.
 * add a mount into a namespace's mount tree
 * - provide the option of adding the new mount to an expiration list
 */
/**
 * newmnt:    the freshly created mount (from do_kern_mount)
 * nd:        lookup result holding the mount-point dentry and the
 *            vfsmount it currently resides on
 * mnt_flags: per-mountpoint MNT_* flags
 * fslist:    optional expiration list to add the mount to
 *
 * On error the reference on @newmnt is dropped via mntput().
 */
int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
int mnt_flags, struct list_head *fslist)
{
int err;
/* serialise modifications of the mount tree */
down_write(&namespace_sem);
/* Something was mounted here while we slept */
while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
if (!check_mnt(nd->mnt))
goto unlock;
/* Refuse the same filesystem on the same mount point: it is already
   mounted there, so doing it again is pointless */
err = -EBUSY;
/* same superblock AND the mount point is that filesystem's own root
   => same filesystem onto the same directory */
if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
nd->mnt->mnt_root == nd->dentry)
goto unlock;
err = -EINVAL;
/* refuse if the new mount's root is a symbolic link */
if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
goto unlock;
/* record the per-mountpoint flags */
newmnt->mnt_flags = mnt_flags;
/* splice the new mount into the namespace list, the mount hash table
   and the parent mount's child list */
if ((err = graft_tree(newmnt, nd)))
goto unlock;
/* optionally put the mount on an expiration list */
if (fslist) {
/* add to the specified expiration list */
spin_lock(&vfsmount_lock);
list_add_tail(&newmnt->mnt_expire, fslist);
spin_unlock(&vfsmount_lock);
}
up_write(&namespace_sem);
return 0;
unlock:
up_write(&namespace_sem);
mntput(newmnt);
return err;
}
在检查EBUSY的地方,nd->mnt->mnt_sb==newmnt->mnt_sb表示超级块相同,代表同一个文件系统。nd->mnt->mnt_root==nd->dentry表示安装在同一目录。即将同一个文件系统两次安装在同一个目录,则返回,没有什么实际意义。随后设置挂载点标志,调用graft_tree函数。
/*
 * graft_tree - attach @mnt at the location described by @nd.
 * Validates that the mount is user-visible, that directory is mounted
 * on directory (or non-directory on non-directory), and that the
 * target dentry is still alive, then delegates the actual splicing to
 * attach_recursive_mnt() under the mount point's i_mutex.
 */
static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
{
int err;
if (mnt->mnt_sb->s_flags & MS_NOUSER)
return -EINVAL;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
S_ISDIR(mnt->mnt_root->d_inode->i_mode))
return -ENOTDIR;
err = -ENOENT;
mutex_lock(&nd->dentry->d_inode->i_mutex);
if (IS_DEADDIR(nd->dentry->d_inode))
goto out_unlock;
err = security_sb_check_sb(mnt, nd);
if (err)
goto out_unlock;
err = -ENOENT;
/* attach_recursive_mnt does the real insertion into the mount tree */
if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
err = attach_recursive_mnt(mnt, nd, NULL);
out_unlock:
mutex_unlock(&nd->dentry->d_inode->i_mutex);
if (!err)
security_sb_post_addmount(mnt, nd);
return err;
}
这个函数调用attach_recursive_mnt将安装点加入到全局mount tree中。传入参数分别是挂载点对象,nameidata和原父挂载点。具体原型如下所示:
/*
 * @source_mnt : mount tree to be attached
 * @nd : place the mount tree @source_mnt is attached
 * @parent_nd : if non-null, detach the source_mnt from its parent and
 * store the parent mount and mountpoint dentry.
 * (done when source_mnt is moved)
 *
 * NOTE: in the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * | BIND MOUNT OPERATION |
 * |**************************************************************************
 * | source-->| shared | private | slave | unbindable |
 * | dest | | | | |
 * | | | | | | |
 * | v | | | | |
 * |**************************************************************************
 * | shared | shared (++) | shared (+) | shared(+++)| invalid |
 * | | | | | |
 * |non-shared| shared (+) | private | slave (*) | invalid |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++) the cloned mount is propagated to all the mounts in the propagation
 * tree of the destination mount and the cloned mount is added to
 * the peer group of the source mount.
 * (+) the cloned mount is created under the destination mount and is marked
 * as shared. The cloned mount is added to the peer group of the source
 * mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 * of the destination mount and the cloned mount is made slave
 * of the same master as that of the source mount. The cloned mount
 * is marked as 'shared and slave'.
 * (*) the cloned mount is made a slave of the same master as that of the
 * source mount.
 *
 * ---------------------------------------------------------------------------
 * | MOVE MOUNT OPERATION |
 * |**************************************************************************
 * | source-->| shared | private | slave | unbindable |
 * | dest | | | | |
 * | | | | | | |
 * | v | | | | |
 * |**************************************************************************
 * | shared | shared (+) | shared (+) | shared(+++) | invalid |
 * | | | | | |
 * |non-shared| shared (+*) | private | slave (*) | unbindable |
 * ***************************************************************************
 *
 * (+) the mount is moved to the destination. And is then propagated to
 * all the mounts in the propagation tree of the destination mount.
 * (+*) the mount is moved to the destination.
 * (+++) the mount is moved to the destination and is then propagated to
 * all the mounts belonging to the destination mount's propagation tree.
 * the mount is marked as 'shared and slave'.
 * (*) the mount continues to be a slave at the new location.
 *
 * if the source mount is a tree, the operations explained above is
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 */
/**
 * Step 1: record the parent mount (nd->mnt) and the mount-point dentry
 *         (nd->dentry) in the source mount.
 * Step 2: splice the mount into the global mount tree, i.e. add it to
 *         three lists:
 *         (1) the global mount hash table
 *         (2) the namespace's mnt_list
 *         (3) the parent mount's child list
 */
static int attach_recursive_mnt(struct vfsmount *source_mnt,
struct nameidata *nd, struct nameidata *parent_nd)
{
LIST_HEAD(tree_list);
/* nd->mnt is the mount the mount point lives on (the new parent) */
struct vfsmount *dest_mnt = nd->mnt;
/* the mount-point dentry */
struct dentry *dest_dentry = nd->dentry;
struct vfsmount *child, *p;
if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
return -EINVAL;
if (IS_MNT_SHARED(dest_mnt)) {
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
}
spin_lock(&vfsmount_lock);
if (parent_nd) {/* move: detach from the old parent first, then attach
to the new one */
detach_mnt(source_mnt, parent_nd);
attach_mnt(source_mnt, nd);/* link under the new parent mount */
touch_mnt_namespace(current->nsproxy->mnt_ns);
} else {
/* fresh mount: set the parent mount, the mount-point dentry and
   bump d_mounted */
mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
/* add to the hash table, namespace list and parent's child list */
commit_tree(source_mnt);
}
list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
list_del_init(&child->mnt_hash);
commit_tree(child);
}
spin_unlock(&vfsmount_lock);
return 0;
}
第91行,首先判断父挂载点是否存在,如果存在,先与父挂载点断后,再链接到新的父挂载点。如果父挂载点不存在,则调用mnt_set_mountpoint设置父挂载点,挂载点目录项对象和d_mounted,然后将挂载点加入到全局hash表,命名空间链表和父挂载点的子链表。注意,nd->mnt表示的是新的父挂载点。mnt_set_mountpoint和commit_tree函数如下所示,分别对关键部分进行了注释:
/* Record a child mount's parent mount and mount-point dentry, taking a
 * reference on each. */
void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
struct vfsmount *child_mnt)
{
/* point the child at its parent mount (with a reference) */
child_mnt->mnt_parent = mntget(mnt);
/* the dentry the child is mounted on (with a reference) */
child_mnt->mnt_mountpoint = dget(dentry);
/* count mounts on this dentry: the same dentry can carry several
   stacked mounts */
dentry->d_mounted++;
}
/*
 * the caller must hold vfsmount_lock
 *
 * Publish an already-parented mount: set its namespace, then link it
 * into the namespace list, the mount hash table and the parent's
 * child list.
 */
static void commit_tree(struct vfsmount *mnt)
{
struct vfsmount *parent = mnt->mnt_parent;/* parent mount */
struct vfsmount *m;
LIST_HEAD(head);
struct mnt_namespace *n = parent->mnt_ns;
BUG_ON(parent == mnt);
/* move mnt (and any mounts chained on its mnt_list) onto a local
   list, stamp them with the namespace, then splice them into the
   namespace's mount list */
list_add_tail(&head, &mnt->mnt_list);
list_for_each_entry(m, &head, mnt_list)
m->mnt_ns = n;
list_splice(&head, n->list.prev);
/* hash by (parent, mountpoint dentry) into mount_hashtable */
list_add_tail(&mnt->mnt_hash, mount_hashtable +
hash(parent, mnt->mnt_mountpoint));
/* link onto the parent's list of children */
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
touch_mnt_namespace(n);
}
至此,mount系统调用就完成了,呵呵,条理还算清晰。
3.总结
mount系统调用可以总结如下:
(1)得到一个挂载点对象(vfsmount)->do_kern_mount
(2)将挂载点对象加入到mount tree->do_add_mount
其中(1)又分为:
构建vfsmount对象,构建超级块对象super_block,将超级块对象和挂载点对象相关联。
(2)可分为:
设置vfsmount的父挂载点,安装点目录项,加入到全局mount_hashtable, 命名空间链表list和父挂载点的子链表mnt_mounts.
对于mount系统调用就写到这了,在接下来,我们将一步一步分析Linux内核,包括文件系统,块设备层,I/O调度层, SCSI设备驱动。有机会的话,还将分析一下Linux内核对SSD的支持,包括trim命令。