块设备驱动程序

框架分析：

app: open,read,write "1.txt"
--------------------------------------------- 文件的读写
文件系统: vfat, ext2, ext3, yaffs2, jffs2 (这部分不用操心，内核已经做好文件的读写转换为扇区的读写)
-----------------ll_rw_block做的事是--------- 扇区的读写
1. 把"读写"放入队列
2. 调用队列的处理函数(优化/调顺序/合并)
块设备驱动程序 -------这个是需要我们写的，最主要的是我们要构造一个队列供ll_rw_block使用
---------------------------------------------
硬件: 硬盘,flash

内核是怎么工作的
从这个函数开始分析ll_rw_block

void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int i;

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (!trylock_buffer(bh))
			continue;
		if (rw == WRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);
	}
}

for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
submit_bh(rw, bh);提交bh

int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *bio;
	int ret = 0;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;

	bio_get(bio);
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}

struct bio *bio; // 使用bh来构造bio (block input/output)
submit_bio(rw, bio);提交bio

void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);

	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		}

		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	generic_make_request(bio);
}

// 通用的构造请求: 使用bio来构造请求(request)
generic_make_request(bio);


void generic_make_request(struct bio *bio)
{
	if (current->bio_tail) {
		/* make_request is active */
		*(current->bio_tail) = bio;
		bio->bi_next = NULL;
		current->bio_tail = &bio->bi_next;
		return;
	}
	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to the next (which is NULL) and bio_tail
	 * to &bio_list, thus initialising the bio_list of new bios to be
	 * added.  __generic_make_request may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so fixup bio_list and
	 * bio_tail or bi_next, and call into __generic_make_request again.
	 *
	 * The loop was structured like this to make only one call to
	 * __generic_make_request (which is important as it is large and
	 * inlined) and to keep the structure simple.
	 */
	BUG_ON(bio->bi_next);
	do {
		current->bio_list = bio->bi_next;
		if (bio->bi_next == NULL)
			current->bio_tail = &current->bio_list;
		else
			bio->bi_next = NULL;
		__generic_make_request(bio);
		bio = current->bio_list;
	} while (bio);
	current->bio_tail = NULL; /* deactivate */
}

调用__generic_make_request(bio);

static inline void __generic_make_request(struct bio *bio)
{
	request_queue_t *q;
	sector_t maxsector;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);
	dev_t old_dev;

	might_sleep();
	/* Test device or partition size, when known. */
	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
	if (maxsector) {
		sector_t sector = bio->bi_sector;

		if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
			/*
			 * This may well happen - the kernel calls bread()
			 * without checking the size of the device, e.g., when
			 * mounting a device.
			 */
			handle_bad_sector(bio);
			goto end_io;
		}
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	old_sector = -1;
	old_dev = 0;
	do {
		char b[BDEVNAME_SIZE];

		q = bdev_get_queue(bio->bi_bdev);
		if (!q) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
end_io:
			bio_endio(bio, bio->bi_size, -EIO);
			break;
		}

		if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
			printk("bio too big device %s (%u > %u)\n", 
				bdevname(bio->bi_bdev, b),
				bio_sectors(bio),
				q->max_hw_sectors);
			goto end_io;
		}

		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
			goto end_io;

		if (should_fail_request(bio))
			goto end_io;

		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		blk_partition_remap(bio);

		if (old_sector != -1)
			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
					    old_sector);

		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);

		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;

		maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
		if (maxsector) {
			sector_t sector = bio->bi_sector;

			if (maxsector < nr_sectors ||
					maxsector - nr_sectors < sector) {
				/*
				 * This may well happen - partitions are not
				 * checked to make sure they are within the size
				 * of the whole device.
				 */
				handle_bad_sector(bio);
				goto end_io;
			}
		}

		ret = q->make_request_fn(q, bio);
	} while (ret);
}

request_queue_t *q = bdev_get_queue(bio->bi_bdev); // 找到队列


static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
{
	return bdev->bd_disk->queue;
}

// 调用队列的"构造请求函数"
ret = q->make_request_fn(q, bio);

struct request_queue
{
	/*
	 * Together with queue_head for cacheline sharing
	 */
	struct list_head	queue_head;
	struct request		*last_merge;
	elevator_t		*elevator;

	/*
	 * the queue request freelist, one for reads and one for writes
	 */
	struct request_list	rq;

	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;
	prep_rq_fn		*prep_rq_fn;
	unplug_fn		*unplug_fn;
	merge_bvec_fn		*merge_bvec_fn;
	issue_flush_fn		*issue_flush_fn;
	prepare_flush_fn	*prepare_flush_fn;
	softirq_done_fn		*softirq_done_fn;

	/*
	 * Dispatch queue sorting
	 */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/*
	 * Auto-unplugging state
	 */
	struct timer_list	unplug_timer;
	int			unplug_thresh;	/* After this many requests */
	unsigned long		unplug_delay;	/* After this many jiffies */
	struct work_struct	unplug_work;

	struct backing_dev_info	backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	unsigned long		bounce_pfn;
	gfp_t			bounce_gfp;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;

	/*
	 * protects queue structures from reentrancy. ->__queue_lock should
	 * _never_ be used directly, it is queue private. always use
	 * ->queue_lock.
	 */
	spinlock_t		__queue_lock;
	spinlock_t		*queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject kobj;

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	unsigned int		max_sectors;
	unsigned int		max_hw_sectors;
	unsigned short		max_phys_segments;
	unsigned short		max_hw_segments;
	unsigned short		hardsect_size;
	unsigned int		max_segment_size;

	unsigned long		seg_boundary_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;

	unsigned int		nr_sorted;
	unsigned int		in_flight;

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * reserved for flush operations
	 */
	unsigned int		ordered, next_ordered, ordseq;
	int			orderr, ordcolor;
	struct request		pre_flush_rq, bar_rq, post_flush_rq;
	struct request		*orig_bar_rq;
	unsigned int		bi_size;

	struct mutex		sysfs_lock;
};

// 默认的函数是__make_request
__make_request

static void init_request_from_bio(struct request *req, struct bio *bio)
{
	req->cmd_type = REQ_TYPE_FS;

	/*
	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
	 */
	if (bio_rw_ahead(bio) || bio_failfast(bio))
		req->cmd_flags |= REQ_FAILFAST;

	/*
	 * REQ_BARRIER implies no merging, but lets make it explicit
	 */
	if (unlikely(bio_barrier(bio)))
		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);

	if (bio_sync(bio))
		req->cmd_flags |= REQ_RW_SYNC;
	if (bio_rw_meta(bio))
		req->cmd_flags |= REQ_RW_META;

	req->errors = 0;
	req->hard_sector = req->sector = bio->bi_sector;
	req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
	req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
	req->nr_phys_segments = bio_phys_segments(req->q, bio);
	req->nr_hw_segments = bio_hw_segments(req->q, bio);
	req->buffer = bio_data(bio);	/* see ->buffer comment above */
	req->bio = req->biotail = bio;
	req->ioprio = bio_prio(bio);
	req->rq_disk = bio->bi_bdev->bd_disk;
	req->start_time = jiffies;
}

static int __make_request(request_queue_t *q, struct bio *bio)
{
	struct request *req;
	int el_ret, nr_sectors, barrier, err;
	const unsigned short prio = bio_prio(bio);
	const int sync = bio_sync(bio);
	int rw_flags;

	nr_sectors = bio_sectors(bio);

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	barrier = bio_barrier(bio);
	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
		err = -EOPNOTSUPP;
		goto end_io;
	}

	spin_lock_irq(q->queue_lock);

	if (unlikely(barrier) || elv_queue_empty(q))
		goto get_rq;

	el_ret = elv_merge(q, &req, bio);
	switch (el_ret) {
		case ELEVATOR_BACK_MERGE:
			BUG_ON(!rq_mergeable(req));

			if (!ll_back_merge_fn(q, req, bio))
				break;

			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);

			req->biotail->bi_next = bio;
			req->biotail = bio;
			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
			req->ioprio = ioprio_best(req->ioprio, prio);
			drive_stat_acct(req, nr_sectors, 0);
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out;

		case ELEVATOR_FRONT_MERGE:
			BUG_ON(!rq_mergeable(req));

			if (!ll_front_merge_fn(q, req, bio))
				break;

			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);

			bio->bi_next = req->bio;
			req->bio = bio;

			/*
			 * may not be valid. if the low level driver said
			 * it didn't need a bounce buffer then it better
			 * not touch req->buffer either...
			 */
			req->buffer = bio_data(bio);
			req->current_nr_sectors = bio_cur_sectors(bio);
			req->hard_cur_sectors = req->current_nr_sectors;
			req->sector = req->hard_sector = bio->bi_sector;
			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
			req->ioprio = ioprio_best(req->ioprio, prio);
			drive_stat_acct(req, nr_sectors, 0);
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out;

		/* ELV_NO_MERGE: elevator says don't/can't merge. */
		default:
			;
	}

get_rq:
	/*init_request_from_bio
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_RW_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request_wait(q, rw_flags, bio);

	/* add_request
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);

	spin_lock_irq(q->queue_lock);
	if (elv_queue_empty(q))
		blk_plug_device(q);
	add_request(q, req);
out:
	if (sync)
		__generic_unplug_device(q);

	spin_unlock_irq(q->queue_lock);
	return 0;

end_io:
	bio_endio(bio, nr_sectors << 9, err);
	return 0;
}

// 先尝试合并
elv_merge(q, &req, bio);
// 如果合并不成，使用bio构造请求
init_request_from_bio(req, bio);
// 把请求放入队列
add_request(q, req);

// 执行队列
__generic_unplug_device(q);

void __generic_unplug_device(request_queue_t *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	if (!blk_remove_plug(q))
		return;

	q->request_fn(q);
}

// 调用队列的"处理函数"
q->request_fn(q);




队列的读和写并没有很快执行，而是等了好一会，如果没有相同的请求再执行
总是先读完，或者写完，因为电梯调度算法

怎么写块设备驱动程序呢？
1. 分配gendisk: alloc_disk
2. 设置
2.1 分配/设置队列: request_queue_t // 它提供读写能力，只有队列还不行，还要其他的属性
怎么分配队列呢？blk_init_queue
2.2 设置gendisk其他信息 // 它提供属性: 比如容量
3. 注册: add_disk

参考：
drivers\block\xd.c
drivers\block\z2ram.c

程序：


/* 参考:
 * drivers\block\xd.c
 * drivers\block\z2ram.c
 */

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/delay.h>
#include <linux/io.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/dma.h>

static struct gendisk *ramblock_disk;
static request_queue_t *ramblock_queue;

static int major;

static DEFINE_SPINLOCK(ramblock_lock);

#define RAMBLOCK_SIZE (1024*1024)	//分配一个1M的内存
static unsigned char *ramblock_buf;

//为了适应老工具Fdisk，假装自己有磁头
static int ramblock_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	/* 容量=heads*cylinders*sectors*512 */
	geo->heads     = 2;//假装自己有几面
	geo->cylinders = 32;//假装自己有几柱面或者环
	geo->sectors   = RAMBLOCK_SIZE/2/32/512;
	return 0;
}


static struct block_device_operations ramblock_fops = {
	.owner	= THIS_MODULE,
	.getgeo	= ramblock_getgeo,
};


static void do_ramblock_request(request_queue_t * q)
{
	static int r_cnt = 0;
	static int w_cnt = 0;
	struct request *req;
	
	//printk("do_ramblock_request %d\n", ++cnt);
    //从请求队列中取出请求
	while ((req = elv_next_request(q)) != NULL) {
	
	    /* 数据传输三要素: 源,目的,长度 */
		/* 源/目的:	offset偏移 */
		unsigned long offset = req->sector * 512;//偏移值 = 扇区*512

		/* 目的/源: */
		// req->buffer

		/* 长度: */		
		unsigned long len = req->current_nr_sectors * 512;//扇区的个数*512

		if (rq_data_dir(req) == READ)
		{
			//printk("do_ramblock_request read %d\n", ++r_cnt);
			memcpy(req->buffer, ramblock_buf+offset, len);
		}
		else
		{
			//printk("do_ramblock_request write %d\n", ++w_cnt);
			memcpy(ramblock_buf+offset, req->buffer, len);
		}		
		
		end_request(req, 1);
	}
}

static int ramblock_init(void)
{
	/* 1. 分配一个gendisk结构体 */
	
ramblock_disk = alloc_disk(16); /* 次设备号个数: 分区个数+1 ，如果是16，最多创建16-1个*/

	/* 2. 设置 */
	/* 2.1 分配/设置队列: 提供读写能力 */
	ramblock_queue = blk_init_queue(do_ramblock_request, &ramblock_lock);
	ramblock_disk->queue = ramblock_queue;
	
	/* 2.2 设置其他属性: 比如容量 */
	major = register_blkdev(0, "ramblock");       /*其实已经退化了，只是你在 cat /proc/devices 给你返回一些信息*/	
	ramblock_disk->major       = major;
	ramblock_disk->first_minor = 0;            //从0-15都对应这个设备
	sprintf(ramblock_disk->disk_name, "ramblock");
	ramblock_disk->fops        = &ramblock_fops; //必须提供，不然会报错
	set_capacity(ramblock_disk, RAMBLOCK_SIZE / 512);	/*设置容量，以扇区为单位，就算只                    
                                                          写一个字节，
														*都要把一个扇区给擦除，然后一个扇
                                                          区写上去，
														*认为扇区永远是512字节，所以要除
                                                          以512
														*/

	/* 3. 硬件相关操作 分配一块内存*/
	ramblock_buf = kzalloc(RAMBLOCK_SIZE, GFP_KERNEL);

	/* 4. 注册 */
	add_disk(ramblock_disk);

	return 0;
}

static void ramblock_exit(void)
{
	unregister_blkdev(major, "ramblock");
	del_gendisk(ramblock_disk);
	put_disk(ramblock_disk);
	blk_cleanup_queue(ramblock_queue);

	kfree(ramblock_buf);
}

module_init(ramblock_init);
module_exit(ramblock_exit);

MODULE_LICENSE("GPL");

测试：

测试3th,4th:
在开发板上:
1. insmod ramblock.ko
2. 格式化: mkdosfs /dev/ramblock
3. 挂接: mount /dev/ramblock /tmp/
4. 读写文件: cd /tmp, 在里面vi文件
5. cd /; umount /tmp/
6. cat /dev/ramblock > /mnt/ramblock.bin
7. 在PC上查看ramblock.bin
sudo mount -o loop ramblock.bin /mnt

测试5th:
1. insmod ramblock.ko
2. ls /dev/ramblock*
3. fdisk /dev/ramblock
分区是根据柱面来分，比如1-5分为一个主分区，其它的再分
分好之后就独立了，每个分区可以分别操作

总结：只需要写好处理队列的函数就可以了

猜你喜欢