背景
由于内核文件系统引入page cache机制,通常的写操作被延迟写入磁盘,当内存中的page cache数据被用户写了但是还没有刷入到磁盘设备,则page cache被标识为脏页dirty,脏页会在下面的几种情况下刷入磁盘:
- 脏页时间超过了某个阈值
- 脏页比例超过了某个阈值
- 内存紧张申请得不到满足
- 用户系统调用sync之类
在内核2.6.1x版本使用的是pdflush机制,由于它统一管理所有的磁盘设备,存在严重的IO性能瓶颈,所以从2.6.3x开始脏页回写改由bdi_writeback机制负责。bdi_writeback为每个磁盘创建一个bdi和对应的线程,专门负责该磁盘的刷入工作,从而提高IO性能。
pdflush
pdflush是2.6.3x版本之前(例如2.6.1x)采用的机制,由于没有看过代码所以暂不分析。
BDI
BDI是backing device info的缩写,它用于描述后端存储(如磁盘)设备相关的信息。相对于内存来说,后端存储的I/O比较慢,因此写盘操作需要通过page cache进行缓存延迟写入。
bdi-default
最初的BDI子系统里,内核版本2.6.3x,模块启动的时候创建bdi-default进程,然后为每个注册的设备创建flush-x:y(x,y为主次设备号)的进程,用于脏数据的回写。由于没有看过代码所以暂不分析。
workqueue
在Linux 3.10.0版本之后,BDI子系统使用workqueue机制代替原来的线程创建,需要回写时,将flush任务提交给workqueue,最终由通用的[kworker]进程负责处理。
BDI子系统初始化的代码如下:
/*
 * Initialisation routine of the BDI subsystem, hooked in through
 * subsys_initcall().
 *
 * Allocates the shared "writeback" workqueue (bdi_wq), then initialises
 * default_backing_dev_info and noop_backing_dev_info. The default bdi is
 * registered under the name "default" only when its bdi_init() succeeded.
 *
 * Returns -ENOMEM if the workqueue cannot be allocated; otherwise returns
 * the result of bdi_init(&noop_backing_dev_info). The result of initialising
 * default_backing_dev_info only gates the registration and is not propagated
 * to the caller.
 */
static int __init default_bdi_init(void)
{
	int default_err;

	bdi_wq = alloc_workqueue("writeback",
				 WQ_MEM_RECLAIM | WQ_FREEZABLE |
				 WQ_UNBOUND | WQ_SYSFS,
				 0);
	if (bdi_wq == NULL)
		return -ENOMEM;

	default_err = bdi_init(&default_backing_dev_info);
	if (default_err == 0)
		bdi_register(&default_backing_dev_info, NULL, "default");

	return bdi_init(&noop_backing_dev_info);
}
subsys_initcall(default_bdi_init);
设备注册
mount ext4文件系统时,初始化设置默认的default_backing_dev_info,但是在哪儿注册的呢?
/*
 * Mount entry point for ext4.
 *
 * Forwards every argument unchanged to mount_bdev(), supplying
 * ext4_fill_super as the fill_super callback, and hands back whatever
 * mount_bdev() returns.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
				 const char *dev_name, void *data)
{
	struct dentry *result;

	result = mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
	return result;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
struct block_device *bdev;
struct super_block *s;
s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
bdev);
s = alloc_super(type, flags);
s->s_bdi = &default_backing_dev_info;
数据回写
bdi_queue_work
BDI子系统使用workqueue机制进行数据回写,其回写接口为bdi_queue_work():它将针对某个bdi的回写请求(wb_writeback_work)挂到该bdi的work_list链表上,并通过bdi_wq立即调度该bdi的回写work。
/*
 * Queue writeback request @work on @bdi.
 *
 * Under bdi->wb_lock (taken with spin_lock_bh):
 *  - if @bdi has BDI_REGISTERED set, @work is appended to bdi->work_list and
 *    the bdi's delayed work is (re)scheduled on bdi_wq with zero delay;
 *  - otherwise the request is not queued, and a waiter (if work->done is
 *    set) is completed right away so that it does not block.
 */
static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	if (test_bit(BDI_REGISTERED, &bdi->state)) {
		list_add_tail(&work->list, &bdi->work_list);
		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
	} else if (work->done) {
		/* Not registered: nothing will process @work, so wake the waiter. */
		complete(work->done);
	}
	spin_unlock_bh(&bdi->wb_lock);
}
提交请求之后,调用者(例如sync_inodes_sb同步函数)会调用wait_for_completion(&done);阻塞等待,直到该请求被回写线程处理完成并调用complete()为止。
/*
 * Write back the inodes of super block @sb and wait for the writeback.
 *
 * Builds an on-stack WB_SYNC_ALL request covering up to LONG_MAX pages,
 * queues it on the super block's bdi, blocks until the request signals
 * completion, and finally calls wait_sb_inodes(). Nothing is done when the
 * super block uses noop_backing_dev_info.
 *
 * A warning is emitted if sb->s_umount is not locked at the time of the call.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.reason		= WB_REASON_SYNC,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
	};

	/* Only super blocks backed by a real bdi have anything to sync. */
	if (sb->s_bdi != &noop_backing_dev_info) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));

		bdi_queue_work(sb->s_bdi, &work);
		wait_for_completion(&done);

		wait_sb_inodes(sb);
	}
}
EXPORT_SYMBOL(sync_inodes_sb);
bdi_writeback_workfn
bdi_queue_work()将work提交到bdi_wq之后,由对应bdi的处理函数进行处理,默认的处理函数为bdi_writeback_workfn。
/*
 * Work function that performs writeback for the bdi_writeback embedding
 * @work (as its delayed work item "dwork").
 *
 * PF_SWAPWRITE is set on the current task for the duration of the call and
 * cleared again before returning.
 */
void bdi_writeback_workfn(struct work_struct *work)
{
	struct delayed_work *dw = to_delayed_work(work);
	struct bdi_writeback *wb = container_of(dw, struct bdi_writeback, dwork);
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	set_worker_desc("flush-%s", dev_name(bdi->dev));
	current->flags |= PF_SWAPWRITE;

	if (unlikely(current_is_workqueue_rescuer() &&
		     test_bit(BDI_REGISTERED, &bdi->state))) {
		/*
		 * Emergency path: bdi_wq could not get enough workers, so we
		 * are running on the rescuer while @bdi is still registered.
		 * Do not monopolise the rescuer; write a bounded batch of
		 * 1024 pages only.
		 */
		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
						    WB_REASON_FORKER_THREAD);
		trace_writeback_pages_written(pages_written);
	} else {
		/*
		 * Regular path: keep processing until @bdi's work_list is
		 * drained. It is also taken on the rescuer when @bdi is no
		 * longer registered, because the list must still be emptied.
		 */
		do {
			pages_written = wb_do_writeback(wb, 0);
			trace_writeback_pages_written(pages_written);
		} while (!list_empty(&bdi->work_list));
	}

	/*
	 * Re-arm the delayed work if requests are still pending, or if there
	 * is dirty IO left and periodic writeback is enabled (non-zero
	 * dirty_writeback_interval).
	 */
	if (!list_empty(&bdi->work_list) ||
	    (wb_has_dirty_io(wb) && dirty_writeback_interval)) {
		unsigned long delay =
			msecs_to_jiffies(dirty_writeback_interval * 10);

		queue_delayed_work(bdi_wq, &wb->dwork, delay);
	}

	current->flags &= ~PF_SWAPWRITE;
}
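首先判断当前是否运行在workqueue的rescuer(应急worker)上,即bdi_wq是否无法获得足够的worker:正常情况下(不在rescuer上,或者该bdi已经不处于注册状态),循环调用wb_do_writeback函数处理回写,直到bdi的work_list为空;否则只调用writeback_inodes_wb回写最多1024个pages,避免长时间占用rescuer。处理结束后,如果work_list仍不为空,或者还有脏数据且开启了周期回写,则再次把延迟work提交到bdi_wq。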
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
struct backing_dev_info *bdi = wb->bdi;
struct wb_writeback_work *work;
long wrote = 0;
set_bit(BDI_writeback_running, &wb->bdi->state);
while ((work = get_next_work_item(bdi)) != NULL) {
/*
* Override sync mode, in case we must wait for completion
* because this thread is exiting now.
*/
if (force_wait)
work->sync_mode = WB_SYNC_ALL;
trace_writeback_exec(bdi, work);
wrote += wb_writeback(wb, work);
/*
* Notify the caller of completion if this is a synchronous
* work item, otherwise just free it.
*/
if (work->done)
complete(work->done);
else
kfree(work);
}
/*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
wrote += wb_check_background_flush(wb);
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
for (;;) {
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
__writeback_single_inode(inode, &wbc);
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
struct address_space *mapping = inode->i_mapping;
ret = do_writepages(mapping, wbc);
if (wbc->sync_mode == WB_SYNC_ALL) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
if (wbc->nr_to_write <= 0)
return 0;
if (mapping->a_ops->writepages)
ret = mapping->a_ops->writepages(mapping, wbc);
最终调用a_ops->writepages将pages刷入磁盘设备。