背景

由于内核文件系统引入了page cache机制，通常的写操作不会立即落盘，而是被延迟写入磁盘。当page cache中的数据已被用户修改但尚未刷入磁盘设备时，对应的页会被标记为脏页（dirty）。脏页会在下面的几种情况下刷入磁盘：

脏页时间超过了某个阈值
脏页比例超过了某个阈值
内存紧张申请得不到满足
用户系统调用sync之类

在内核2.6.1x版本使用的是pdflush机制，因为它管理了所有的磁盘设备，所以存在严重的IO性能瓶颈。因此从2.6.3x开始，脏页回写由bdi_writeback机制负责：bdi_writeback为每个磁盘创建一个bdi和对应的线程，专门负责该磁盘的刷入工作，以提高IO性能。

pdflush

pdflush是2.6.1x版本之前采用的机制，由于没有看过代码所以暂不分析。

BDI

BDI是backing device info的缩写，它用于描述后端存储（如磁盘）设备相关的信息。相对于内存来说，后端存储的I/O比较慢，因此写盘操作需要通过page cache进行缓存延迟写入。

bdi-default

最初的BDI子系统里，内核版本2.6.3x，模块启动的时候创建bdi-default进程，然后为每个注册的设备创建flush-x:y（x,y为主次设备号）的进程，用于脏数据的回写。由于没有看过代码所以暂不分析。

workqueue

在Linux 3.10.0版本之后，BDI子系统使用workqueue机制代替原来的线程创建，需要回写时，将flush任务提交给workqueue，最终由通用的[kworker]进程负责处理。

BDI子系统初始化的代码如下：

/*
 * Initialise the BDI subsystem; hooked up through subsys_initcall() below.
 *
 * Allocates the "writeback" workqueue and stores it in bdi_wq, then
 * initialises default_backing_dev_info (registering it under the name
 * "default" when initialisation succeeds) and noop_backing_dev_info.
 *
 * Returns -ENOMEM if the workqueue cannot be allocated; otherwise returns
 * the result of bdi_init(&noop_backing_dev_info).
 *
 * NOTE(review): the result of the first bdi_init() call is overwritten by
 * the second one, and the return value of bdi_register() is not checked, so
 * a failure on default_backing_dev_info is not reflected in the return
 * value -- confirm this is intended.
 */
static int __init default_bdi_init(void)
{
	int err;

	/* Workqueue that all bdi writeback work is submitted to. */
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
					      WQ_UNBOUND | WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;

	/* Register the default bdi only if it was initialised successfully. */
	err = bdi_init(&default_backing_dev_info);
	if (!err)
		bdi_register(&default_backing_dev_info, NULL, "default");
	/* err is reassigned here; only this result reaches the caller. */
	err = bdi_init(&noop_backing_dev_info);

	return err;
}
subsys_initcall(default_bdi_init);

设备注册

mount ext4文件系统时，初始化设置默认的default_backing_dev_info，但是在哪儿注册的呢？

/*
 * Mount entry point for ext4: forwards all arguments to mount_bdev(),
 * supplying ext4_fill_super as the fill_super callback, and returns
 * whatever mount_bdev() returns.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}

struct dentry *mount_bdev(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	struct block_device *bdev;
	struct super_block *s;
	s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC,
		 bdev);
		s = alloc_super(type, flags);
			s->s_bdi = &default_backing_dev_info;

数据回写

bdi_queue_work

BDI子系统使用workqueue机制进行数据回写，其回写接口为bdi_queue_work()，它将针对某个具体bdi的回写请求（wb_writeback_work）挂到该bdi的work_list上，并通过bdi_wq调度执行。

/*
 * Queue a writeback request on a bdi and schedule its processing.
 *
 * @bdi:  backing device the request is for
 * @work: writeback request to append to bdi->work_list
 *
 * With bdi->wb_lock held, checks that BDI_REGISTERED is set in bdi->state.
 * If it is, @work is appended to bdi->work_list and bdi->wb.dwork is
 * (re)scheduled on bdi_wq with a delay of 0.  If it is not, @work is not
 * queued; when the request carries a completion (work->done) it is
 * completed right away so that a waiter is not left blocked.
 *
 * NOTE(review): on the unregistered path a request without ->done is
 * neither queued nor freed here -- confirm who owns it in that case.
 */
static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	trace_writeback_queue(bdi, work);

	spin_lock_bh(&bdi->wb_lock);
	if (!test_bit(BDI_REGISTERED, &bdi->state)) {
		/* bdi is not registered: wake any waiter instead of queueing. */
		if (work->done)
			complete(work->done);
		goto out_unlock;
	}
	list_add_tail(&work->list, &bdi->work_list);
	/* Delay 0: ask for the bdi's delayed work to run without waiting. */
	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
out_unlock:
	spin_unlock_bh(&bdi->wb_lock);
}

提交请求之后，调用方通过wait_for_completion(&done);阻塞等待，直到该请求被回写线程处理完成（wb_do_writeback在wb_writeback返回后调用complete(work->done)）。sync_inodes_sb同步函数会调用到这里。

/*
 * Write back the inodes of a super block and wait for the request to finish.
 *
 * @sb: super block to sync
 *
 * Builds an on-stack wb_writeback_work with sync_mode WB_SYNC_ALL,
 * nr_pages LONG_MAX and reason WB_REASON_SYNC, queues it on sb->s_bdi via
 * bdi_queue_work(), blocks on the on-stack completion until it is
 * signalled, and finally calls wait_sb_inodes(sb).
 *
 * Returns immediately, doing nothing, when sb->s_bdi is
 * noop_backing_dev_info.  Warns if sb->s_umount is not locked on entry.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
		.reason		= WB_REASON_SYNC,
	};

	/* Nothing to do? */
	if (sb->s_bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	/*
	 * work and done live on this stack frame, so we must not return
	 * before the completion has been signalled.
	 */
	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);

	wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

bdi_writeback_workfn

bdi_queue_work()把work挂到bdi的work_list上，并在bdi_wq上调度该bdi的delayed work，随后由对应的bdi处理函数进行处理，默认的处理函数为bdi_writeback_workfn。

/*
 * Workqueue handler that performs writeback for one bdi.
 *
 * @work: the work_struct embedded (through a delayed_work) in the
 *        bdi_writeback's dwork member; used to recover the bdi_writeback
 *        and, from it, the owning backing_dev_info.
 *
 * Sets the worker description to "flush-<device name>" and sets
 * PF_SWAPWRITE on the current task for the duration of the call.
 *
 * Two paths:
 *  - Not running on the workqueue rescuer, or the bdi no longer has
 *    BDI_REGISTERED set: call wb_do_writeback() repeatedly until
 *    bdi->work_list is empty.
 *  - Otherwise (running on the rescuer with a registered bdi): do a single
 *    bounded writeback_inodes_wb() call of at most 1024 pages.
 *
 * Before returning, the delayed work is queued again if work_list is
 * non-empty, or if the wb still has dirty IO and dirty_writeback_interval
 * is non-zero.
 */
void bdi_writeback_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, dwork);
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	set_worker_desc("flush-%s", dev_name(bdi->dev));
	current->flags |= PF_SWAPWRITE;

	if (likely(!current_is_workqueue_rescuer() ||
		!test_bit(BDI_REGISTERED, &bdi->state))) {
		/*
		 * The normal path.  Keep writing back @bdi until its
		 * work_list is empty.  Note that this path is also taken
		 * if @bdi is shutting down even when we're running off the
		 * rescuer as work_list needs to be drained.
		 */
		do {
			pages_written = wb_do_writeback(wb, 0);
			trace_writeback_pages_written(pages_written);
		} while (!list_empty(&bdi->work_list));
	} else {
		/*
		 * bdi_wq can't get enough workers and we're running off
		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
		 * enough for efficient IO.
		 */
		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
						    WB_REASON_FORKER_THREAD);
		trace_writeback_pages_written(pages_written);
	}

	/*
	 * Re-arm the delayed work when requests are still pending or dirty
	 * IO remains and periodic writeback is enabled.
	 * NOTE(review): the "* 10" presumably converts
	 * dirty_writeback_interval from centiseconds to milliseconds --
	 * confirm against its definition.
	 */
	if (!list_empty(&bdi->work_list) ||
	    (wb_has_dirty_io(wb) && dirty_writeback_interval))
		queue_delayed_work(bdi_wq, &wb->dwork,
			msecs_to_jiffies(dirty_writeback_interval * 10));

	current->flags &= ~PF_SWAPWRITE;
}

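首先判断当前是否运行在workqueue的rescuer（应急）worker上：如果不是（或者该bdi已不处于BDI_REGISTERED状态，需要把work_list清空），则循环调用wb_do_writeback，直到bdi的work_list为空；否则说明bdi_wq拿不到足够的worker，为避免长时间占用rescuer，只调用一次writeback_inodes_wb并限制最多写入1024个pages。函数返回前，如果work_list非空，或者仍有脏数据且dirty_writeback_interval不为0，则重新排队一个延迟work。正常情况下通过调用wb_do_writeback函数处理回写。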

/*
 * Process every queued writeback request of a bdi, then run the periodic
 * and background flush checks.
 *
 * @wb:         writeback context; its ->bdi supplies the request queue
 * @force_wait: when non-zero, each request's sync_mode is overridden to
 *              WB_SYNC_ALL before it is executed
 *
 * BDI_writeback_running is set in the bdi state for the duration of the
 * call and cleared before returning.
 *
 * Each request fetched with get_next_work_item() is handed to
 * wb_writeback().  Afterwards a request carrying a completion (->done) is
 * completed; a request without one is freed with kfree().
 *
 * Returns the sum of the values returned by wb_writeback() for each
 * request plus those returned by wb_check_old_data_flush() and
 * wb_check_background_flush().
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);
	/* Drain the request queue one item at a time. */
	while ((work = get_next_work_item(bdi)) != NULL) {
		/*
		 * Override sync mode, in case we must wait for completion
		 * because this thread is exiting now.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * Notify the caller of completion if this is a synchronous
		 * work item, otherwise just free it.
		 */
		if (work->done)
			complete(work->done);
		else
			kfree(work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}

static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	for (;;) {
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);

static long writeback_sb_inodes(struct super_block *sb,
				struct bdi_writeback *wb,
				struct wb_writeback_work *work)
{
	while (!list_empty(&wb->b_io)) {
		struct inode *inode = wb_inode(wb->b_io.prev);

		__writeback_single_inode(inode, &wbc);

static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	ret = do_writepages(mapping, wbc);

	if (wbc->sync_mode == WB_SYNC_ALL) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);

BDI-2

背景