框架分析:
app: open,read,write "1.txt"
--------------------------------------------- 文件的读写
文件系统: vfat, ext2, ext3, yaffs2, jffs2 (这部分不用操心,内核已经把文件的读写转换成了扇区的读写)
-----------------ll_rw_block做的事是--------- 扇区的读写
1. 把"读写"放入队列
2. 调用队列的处理函数(优化/调顺序/合并)
块设备驱动程序 -------这个是需要我们写的,最主要的是我们要构造一个队列供ll_rw_block使用
---------------------------------------------
硬件: 硬盘,flash
内核是怎么工作的
从 ll_rw_block 这个函数开始分析:
/*
 * Start I/O on an array of buffer heads.
 *
 * rw:  WRITE to write buffers out, anything else is treated as a read.
 * nr:  number of entries in bhs.
 * bhs: the buffers to process.
 *
 * A buffer is skipped when trylock_buffer() fails.  A locked buffer is
 * submitted only if it needs the transfer: for a write it must have been
 * dirty (test_clear_buffer_dirty), for a read it must not be up to date.
 * Buffers that need no transfer are unlocked again right away.
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int idx;

	for (idx = 0; idx < nr; idx++) {
		struct buffer_head *buf = bhs[idx];
		int needs_io;

		if (!trylock_buffer(buf))
			continue;

		if (rw == WRITE)
			needs_io = test_clear_buffer_dirty(buf);
		else
			needs_io = !buffer_uptodate(buf);

		if (!needs_io) {
			/* Nothing to transfer: release the lock taken above. */
			unlock_buffer(buf);
			continue;
		}

		if (rw == WRITE)
			buf->b_end_io = end_buffer_write_sync;
		else
			buf->b_end_io = end_buffer_read_sync;

		/* Take a reference before handing the buffer to submit_bh(). */
		get_bh(buf);
		submit_bh(rw, buf);
	}
}
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
submit_bh(rw, bh); // 提交bh
/*
 * Wrap one buffer head in a single-segment bio and pass it to submit_bio().
 *
 * rw: read/write flags, forwarded unchanged to submit_bio().
 * bh: the buffer to transfer.  It must be locked and mapped, must have
 *     b_end_io set, and must be neither "delay" nor "unwritten"; each of
 *     these is enforced with BUG_ON.
 *
 * Returns 0, or -EOPNOTSUPP if the bio carries BIO_EOPNOTSUPP once
 * submit_bio() has returned.
 */
int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *io;
	int status;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	io = bio_alloc(GFP_NOIO, 1);

	/* Target device, start sector (block number scaled by size >> 9) and size. */
	io->bi_bdev = bh->b_bdev;
	io->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	io->bi_size = bh->b_size;

	/* The single segment describing the buffer's memory. */
	io->bi_io_vec[0].bv_page = bh->b_page;
	io->bi_io_vec[0].bv_offset = bh_offset(bh);
	io->bi_io_vec[0].bv_len = bh->b_size;
	io->bi_vcnt = 1;
	io->bi_idx = 0;

	/* Completion callback; bi_private carries the buffer head to it. */
	io->bi_private = bh;
	io->bi_end_io = end_bio_bh_io_sync;

	/* Hold our own reference so the flag can be inspected after submission. */
	bio_get(io);
	submit_bio(rw, io);

	status = bio_flagged(io, BIO_EOPNOTSUPP) ? -EOPNOTSUPP : 0;

	bio_put(io);
	return status;
}
struct bio *bio; // 使用bh来构造bio (block input/output)
submit_bio(rw, bio); // 提交bio
/*
 * Record the transfer direction in the bio, do the I/O accounting, and
 * pass the bio to generic_make_request().
 *
 * rw:  read/write flags; OR-ed into bio->bi_rw.
 * bio: the I/O to submit.
 *
 * Accounting (and the optional block_dump log line) is performed only for
 * bios that carry data and are not REQ_DISCARD.
 */
void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);

	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (!bio_has_data(bio) || (rw & REQ_DISCARD))
		goto submit;

	if (rw & WRITE) {
		count_vm_events(PGPGOUT, count);
	} else {
		task_io_account_read(bio->bi_size);
		count_vm_events(PGPGIN, count);
	}

	if (unlikely(block_dump)) {
		char b[BDEVNAME_SIZE];

		printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
			(rw & WRITE) ? "WRITE" : "READ",
			(unsigned long long)bio->bi_sector,
			bdevname(bio->bi_bdev, b),
			count);
	}

submit:
	generic_make_request(bio);
}
// 通用的构造请求: 使用bio来构造请求(request)
generic_make_request(bio);
/*
 * Hand a bio to __generic_make_request(), turning recursive submissions
 * into iteration.
 *
 * bio: the I/O to submit; bio->bi_next must be NULL (checked with BUG_ON).
 *
 * If current->bio_tail is already set, a submission is in progress further
 * up the call chain of this task: the bio is appended to the task's pending
 * list and the function returns immediately.  Otherwise this call becomes
 * the active submitter and keeps calling __generic_make_request() until
 * the pending list is empty.
 */
void generic_make_request(struct bio *bio)
{
	if (current->bio_tail) {
		/* make_request is active */
		*(current->bio_tail) = bio;
		bio->bi_next = NULL;
		current->bio_tail = &bio->bi_next;
		return;
	}
	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to the next (which is NULL) and bio_tail
	 * to &bio_list, thus initialising the bio_list of new bios to be
	 * added.  __generic_make_request may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so fixup bio_list and
	 * bio_tail or bi_next, and call into __generic_make_request again.
	 *
	 * The loop was structured like this to make only one call to
	 * __generic_make_request (which is important as it is large and
	 * inlined) and to keep the structure simple.
	 */
	BUG_ON(bio->bi_next);
	do {
		current->bio_list = bio->bi_next;
		if (bio->bi_next == NULL)
			/* List is now empty: the tail points back at the list head. */
			current->bio_tail = &current->bio_list;
		else
			bio->bi_next = NULL;
		__generic_make_request(bio);
		bio = current->bio_list;
	} while (bio);
	current->bio_tail = NULL; /* deactivate */
}
调用__generic_make_request(bio);
/*
 * Deliver one bio to the request queue of its block device.
 *
 * The bio is first range-checked against the size of bio->bi_bdev.  Then,
 * in a loop: the queue is looked up with bdev_get_queue(), the bio is
 * rejected if there is no queue, if it is larger than q->max_hw_sectors,
 * if the queue is flagged QUEUE_FLAG_DEAD or if should_fail_request() says
 * so; blk_partition_remap() is applied, the range is checked again, and
 * finally q->make_request_fn(q, bio) is called.  The loop repeats for as
 * long as make_request_fn returns non-zero.
 *
 * Every rejection path ends at the end_io label, which completes the bio
 * with -EIO through bio_endio().
 */
static inline void __generic_make_request(struct bio *bio)
{
request_queue_t *q;
sector_t maxsector;
sector_t old_sector;
int ret, nr_sectors = bio_sectors(bio);
dev_t old_dev;
might_sleep();
/* Test device or partition size, when known. */
/* i_size >> 9 divides by 512; a result of 0 skips the check. */
maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
if (maxsector) {
sector_t sector = bio->bi_sector;
if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
/*
* This may well happen - the kernel calls bread()
* without checking the size of the device, e.g., when
* mounting a device.
*/
handle_bad_sector(bio);
goto end_io;
}
}
/*
* Resolve the mapping until finished. (drivers are
* still free to implement/resolve their own stacking
* by explicitly returning 0)
*
* NOTE: we don't repeat the blk_size check for each new device.
* Stacking drivers are expected to know what they are doing.
*/
/* -1 marks "no previous pass"; it suppresses the remap trace below. */
old_sector = -1;
old_dev = 0;
do {
char b[BDEVNAME_SIZE];
q = bdev_get_queue(bio->bi_bdev);
if (!q) {
printk(KERN_ERR
"generic_make_request: Trying to access "
"nonexistent block-device %s (%Lu)\n",
bdevname(bio->bi_bdev, b),
(long long) bio->bi_sector);
/*
* Shared failure exit.  It is reached by falling through from the
* printk above and by goto, including the goto issued before the
* loop was entered.  The break leaves the do/while.
*/
end_io:
bio_endio(bio, bio->bi_size, -EIO);
break;
}
if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
printk("bio too big device %s (%u > %u)\n",
bdevname(bio->bi_bdev, b),
bio_sectors(bio),
q->max_hw_sectors);
goto end_io;
}
if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
goto end_io;
if (should_fail_request(bio))
goto end_io;
/*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
*/
blk_partition_remap(bio);
if (old_sector != -1)
blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
old_sector);
blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
/* Remember sector and device of this pass for the next remap trace. */
old_sector = bio->bi_sector;
old_dev = bio->bi_bdev->bd_dev;
/* Same range check as before the loop, repeated after the remap. */
maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
if (maxsector) {
sector_t sector = bio->bi_sector;
if (maxsector < nr_sectors ||
maxsector - nr_sectors < sector) {
/*
* This may well happen - partitions are not
* checked to make sure they are within the size
* of the whole device.
*/
handle_bad_sector(bio);
goto end_io;
}
}
/* Hand the bio to the queue; a non-zero return repeats the loop. */
ret = q->make_request_fn(q, bio);
} while (ret);
}
request_queue_t *q = bdev_get_queue(bio->bi_bdev); // 找到队列
/*
 * Return the request queue of the disk that a block device belongs to
 * (bdev->bd_disk->queue).
 */
static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
{
	request_queue_t *queue = bdev->bd_disk->queue;

	return queue;
}
// 调用队列的"构造请求函数"
ret = q->make_request_fn(q, bio);
/*
 * Request queue of a block device.
 *
 * Groups the pending-request list, the elevator, the driver-supplied
 * callbacks, the lock and the queue limits.  Uses visible elsewhere in
 * this file:
 *   - make_request_fn is called by __generic_make_request() to queue a bio;
 *   - request_fn is called by __generic_unplug_device() to run the queue;
 *   - queue_lock is taken with spin_lock_irq() in __make_request();
 *   - max_hw_sectors and queue_flags are checked in __generic_make_request();
 *   - next_ordered is compared with QUEUE_ORDERED_NONE in __make_request().
 */
struct request_queue
{
/*
* Together with queue_head for cacheline sharing
*/
struct list_head queue_head;
struct request *last_merge;
elevator_t *elevator;
/*
* the queue request freelist, one for reads and one for writes
*/
struct request_list rq;
/* Callback hooks; request_fn and make_request_fn are the two called in this file. */
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
prep_rq_fn *prep_rq_fn;
unplug_fn *unplug_fn;
merge_bvec_fn *merge_bvec_fn;
issue_flush_fn *issue_flush_fn;
prepare_flush_fn *prepare_flush_fn;
softirq_done_fn *softirq_done_fn;
/*
* Dispatch queue sorting
*/
sector_t end_sector;
struct request *boundary_rq;
/*
* Auto-unplugging state
*/
struct timer_list unplug_timer;
int unplug_thresh; /* After this many requests */
unsigned long unplug_delay; /* After this many jiffies */
struct work_struct unplug_work;
struct backing_dev_info backing_dev_info;
/*
* The queue owner gets to use this for whatever they like.
* ll_rw_blk doesn't touch it.
*/
void *queuedata;
/*
* queue needs bounce pages for pages above this limit
*/
unsigned long bounce_pfn;
gfp_t bounce_gfp;
/*
* various queue flags, see QUEUE_* below
*/
unsigned long queue_flags;
/*
* protects queue structures from reentrancy. ->__queue_lock should
* _never_ be used directly, it is queue private. always use
* ->queue_lock.
*/
spinlock_t __queue_lock;
spinlock_t *queue_lock;
/*
* queue kobject
*/
struct kobject kobj;
/*
* queue settings
*/
unsigned long nr_requests; /* Max # of requests */
unsigned int nr_congestion_on;
unsigned int nr_congestion_off;
unsigned int nr_batching;
unsigned int max_sectors;
/* Upper bound on bio_sectors(bio), enforced in __generic_make_request(). */
unsigned int max_hw_sectors;
unsigned short max_phys_segments;
unsigned short max_hw_segments;
unsigned short hardsect_size;
unsigned int max_segment_size;
unsigned long seg_boundary_mask;
unsigned int dma_alignment;
struct blk_queue_tag *queue_tags;
unsigned int nr_sorted;
unsigned int in_flight;
/*
* sg stuff
*/
unsigned int sg_timeout;
unsigned int sg_reserved_size;
int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace *blk_trace;
#endif
/*
* reserved for flush operations
*/
unsigned int ordered, next_ordered, ordseq;
int orderr, ordcolor;
struct request pre_flush_rq, bar_rq, post_flush_rq;
struct request *orig_bar_rq;
unsigned int bi_size;
struct mutex sysfs_lock;
};
// 默认的函数是__make_request
__make_request
/*
 * Fill in a freshly obtained request from the bio that starts it.
 *
 * req: the request to initialise; req->q is read to count segments.
 * bio: the bio that becomes both head and tail of the request's bio list.
 *
 * Sets the command type to REQ_TYPE_FS, copies the relevant bio flags into
 * cmd_flags, and initialises position, length, segment counts, data
 * buffer, priority, disk and start time.
 */
static void init_request_from_bio(struct request *req, struct bio *bio)
{
	req->cmd_type = REQ_TYPE_FS;

	/*
	 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
	 */
	if (bio_rw_ahead(bio) || bio_failfast(bio))
		req->cmd_flags |= REQ_FAILFAST;

	/*
	 * REQ_BARRIER implies no merging, but lets make it explicit
	 */
	if (unlikely(bio_barrier(bio)))
		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);

	if (bio_sync(bio))
		req->cmd_flags |= REQ_RW_SYNC;

	if (bio_rw_meta(bio))
		req->cmd_flags |= REQ_RW_META;

	req->errors = 0;

	/* Start sector: the working copy and its "hard" twin start out equal. */
	req->sector = bio->bi_sector;
	req->hard_sector = req->sector;

	/* Total number of sectors in the request. */
	req->nr_sectors = bio_sectors(bio);
	req->hard_nr_sectors = req->nr_sectors;

	/* Number of sectors in the current segment. */
	req->hard_cur_sectors = bio_cur_sectors(bio);
	req->current_nr_sectors = req->hard_cur_sectors;

	req->nr_phys_segments = bio_phys_segments(req->q, bio);
	req->nr_hw_segments = bio_hw_segments(req->q, bio);
	req->buffer = bio_data(bio);	/* see ->buffer comment above */

	/* A one-element bio list: head and tail are the same bio. */
	req->biotail = bio;
	req->bio = bio;

	req->ioprio = bio_prio(bio);
	req->rq_disk = bio->bi_bdev->bd_disk;
	req->start_time = jiffies;
}
/*
 * Default make_request_fn: turn a bio into (part of) a request on queue q.
 *
 * q:   the request queue the bio is destined for.
 * bio: the I/O to queue.
 *
 * Under q->queue_lock the elevator is first asked, via elv_merge(), whether
 * the bio can be appended to (back merge) or prepended to (front merge) an
 * existing request.  If no merge takes place, or the bio is a barrier, or
 * the queue is empty, a new request is obtained with get_request_wait(),
 * initialised with init_request_from_bio() and queued with add_request().
 * For a sync bio the queue is run immediately through
 * __generic_unplug_device().
 *
 * Always returns 0.  A barrier bio on a queue whose next_ordered is
 * QUEUE_ORDERED_NONE is completed with -EOPNOTSUPP via bio_endio().
 */
static int __make_request(request_queue_t *q, struct bio *bio)
{
struct request *req;
int el_ret, nr_sectors, barrier, err;
const unsigned short prio = bio_prio(bio);
const int sync = bio_sync(bio);
int rw_flags;
nr_sectors = bio_sectors(bio);
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
* ISA dma in theory)
*/
blk_queue_bounce(q, &bio);
barrier = bio_barrier(bio);
if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
err = -EOPNOTSUPP;
goto end_io;
}
spin_lock_irq(q->queue_lock);
/* Barriers are never merged; an empty queue has nothing to merge with. */
if (unlikely(barrier) || elv_queue_empty(q))
goto get_rq;
el_ret = elv_merge(q, &req, bio);
switch (el_ret) {
case ELEVATOR_BACK_MERGE:
BUG_ON(!rq_mergeable(req));
/* A refused merge breaks out of the switch and falls into get_rq. */
if (!ll_back_merge_fn(q, req, bio))
break;
blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
/* Append the bio to the tail of the request's bio list. */
req->biotail->bi_next = bio;
req->biotail = bio;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
drive_stat_acct(req, nr_sectors, 0);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
case ELEVATOR_FRONT_MERGE:
BUG_ON(!rq_mergeable(req));
/* A refused merge breaks out of the switch and falls into get_rq. */
if (!ll_front_merge_fn(q, req, bio))
break;
blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
/* Prepend the bio: it becomes the new head of the request's bio list. */
bio->bi_next = req->bio;
req->bio = bio;
/*
* may not be valid. if the low level driver said
* it didn't need a bounce buffer then it better
* not touch req->buffer either...
*/
req->buffer = bio_data(bio);
req->current_nr_sectors = bio_cur_sectors(bio);
req->hard_cur_sectors = req->current_nr_sectors;
/* The request now starts at the new bio's sector. */
req->sector = req->hard_sector = bio->bi_sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
drive_stat_acct(req, nr_sectors, 0);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
/* ELV_NO_MERGE: elevator says don't/can't merge. */
default:
;
}
get_rq:
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
* rq allocator and io schedulers.
*/
rw_flags = bio_data_dir(bio);
if (sync)
rw_flags |= REQ_RW_SYNC;
/*
* Grab a free request. This might sleep but can not fail.
* Returns with the queue unlocked.
*/
req = get_request_wait(q, rw_flags, bio);
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
* We don't worry about that case for efficiency. It won't happen
* often, and the elevators are able to handle it.
*/
init_request_from_bio(req, bio);
/* Re-take the lock that get_request_wait() released. */
spin_lock_irq(q->queue_lock);
if (elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
out:
/* A sync bio runs the queue right away. */
if (sync)
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
return 0;
end_io:
/* Reached before the lock is taken, so there is nothing to unlock. */
bio_endio(bio, nr_sectors << 9, err);
return 0;
}
// 先尝试合并
elv_merge(q, &req, bio);
// 如果合并不成,使用bio构造请求
init_request_from_bio(req, bio);
// 把请求放入队列
add_request(q, req);
// 执行队列
__generic_unplug_device(q);
/*
 * Run the queue's request_fn, provided the queue is not stopped and
 * blk_remove_plug() returned non-zero.
 *
 * q: the request queue to run.
 */
void __generic_unplug_device(request_queue_t *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	/* blk_remove_plug() returned 0: do not run the queue. */
	if (blk_remove_plug(q))
		q->request_fn(q);
}
// 调用队列的"处理函数"
q->request_fn(q);
请求放入队列后并不会立即执行,而是先等待一小段时间,看后续有没有可以合并的相邻请求,没有的话再执行。
由于电梯调度算法会对请求进行排序和合并,实际表现为总是先集中读完,或者先集中写完。
怎么写块设备驱动程序呢?
1. 分配gendisk: alloc_disk
2. 设置
2.1 分配/设置队列: request_queue_t // 它提供读写能力,只有队列还不行,还要其他的属性
怎么分配队列呢?blk_init_queue
2.2 设置gendisk其他信息 // 它提供属性: 比如容量
3. 注册: add_disk
参考:
drivers\block\xd.c
drivers\block\z2ram.c
程序:
/* 参考:
* drivers\block\xd.c
* drivers\block\z2ram.c
*/
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/dma.h>
/* Disk object obtained from alloc_disk() and registered with add_disk(). */
static struct gendisk *ramblock_disk;
/* Request queue created by blk_init_queue(); serviced by do_ramblock_request(). */
static request_queue_t *ramblock_queue;
/* Major number returned by register_blkdev(). */
static int major;
/* Spinlock handed to blk_init_queue() together with the request function. */
static DEFINE_SPINLOCK(ramblock_lock);
#define RAMBLOCK_SIZE (1024*1024) // size of the RAM disk in bytes (1 MiB)
/* Backing memory of the disk, RAMBLOCK_SIZE bytes, allocated in ramblock_init(). */
static unsigned char *ramblock_buf;
/*
 * Report a made-up disk geometry so that legacy tools such as fdisk, which
 * expect heads/cylinders/sectors, can work with the RAM disk.
 *
 * bdev: the block device being queried (unused).
 * geo:  geometry structure to fill in.
 *
 * The values are chosen so that
 *     heads * cylinders * sectors * 512 == RAMBLOCK_SIZE.
 * Always returns 0.
 */
static int ramblock_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	const int fake_heads = 2;	/* pretend the disk has two sides */
	const int fake_cylinders = 32;	/* pretend each side has 32 cylinders */

	geo->heads = fake_heads;
	geo->cylinders = fake_cylinders;
	/* Whatever capacity remains determines the sectors per track. */
	geo->sectors = RAMBLOCK_SIZE / fake_heads / fake_cylinders / 512;

	return 0;
}
/*
 * Block device operations installed as ramblock_disk->fops.  Only getgeo is
 * provided; it returns the fake geometry from ramblock_getgeo().
 */
static struct block_device_operations ramblock_fops = {
.owner = THIS_MODULE,
.getgeo = ramblock_getgeo,
};
/*
 * Request function of the RAM disk: service every request on the queue by
 * copying between the request's buffer and ramblock_buf.
 *
 * q: the request queue to drain.
 *
 * A transfer needs three things: source, destination and length.  The
 * position inside the RAM disk comes from req->sector, the other end is
 * req->buffer, and the length is req->current_nr_sectors; sectors are
 * 512 bytes.
 *
 * A request that would reach beyond RAMBLOCK_SIZE is failed with
 * end_request(req, 0) instead of being copied.
 */
static void do_ramblock_request(request_queue_t * q)
{
	struct request *req;

	/* Take requests off the queue until none are left. */
	while ((req = elv_next_request(q)) != NULL) {
		/* Byte offset into the RAM disk = sector number * 512. */
		unsigned long offset = req->sector * 512;
		/* Byte count = number of sectors * 512. */
		unsigned long len = req->current_nr_sectors * 512;

		/*
		 * Never copy outside the backing buffer.  Written as two
		 * comparisons so that offset + len cannot wrap around.
		 */
		if (offset > RAMBLOCK_SIZE || len > RAMBLOCK_SIZE - offset) {
			end_request(req, 0);
			continue;
		}

		if (rq_data_dir(req) == READ)
			memcpy(req->buffer, ramblock_buf + offset, len);
		else
			memcpy(ramblock_buf + offset, req->buffer, len);

		end_request(req, 1);
	}
}
/*
 * Module entry point: create a RAMBLOCK_SIZE-byte RAM disk and register it
 * with the block layer.
 *
 * Returns 0 on success or a negative errno.  On failure every resource
 * acquired up to that point is released again, in reverse order.
 */
static int ramblock_init(void)
{
	int err;

	/*
	 * 1. Allocate the gendisk.  The argument is the number of minor
	 *    numbers: with 16, the whole disk plus at most 15 partitions.
	 */
	ramblock_disk = alloc_disk(16);
	if (!ramblock_disk)
		return -ENOMEM;

	/* 2.1 Create the request queue, which provides the read/write path. */
	ramblock_queue = blk_init_queue(do_ramblock_request, &ramblock_lock);
	if (!ramblock_queue) {
		err = -ENOMEM;
		goto err_put_disk;
	}

	/*
	 * Obtain a major number.  register_blkdev() has little left to do
	 * these days; mostly it makes the name show up in /proc/devices.
	 */
	err = register_blkdev(0, "ramblock");
	if (err < 0)
		goto err_cleanup_queue;
	major = err;

	/* 3. "Hardware" setup: allocate the memory that backs the disk. */
	ramblock_buf = kzalloc(RAMBLOCK_SIZE, GFP_KERNEL);
	if (!ramblock_buf) {
		err = -ENOMEM;
		goto err_unregister;
	}

	/* 2.2 Fill in the remaining gendisk attributes. */
	ramblock_disk->queue = ramblock_queue;
	ramblock_disk->major = major;
	ramblock_disk->first_minor = 0;	/* minors 0-15 all belong to this disk */
	sprintf(ramblock_disk->disk_name, "ramblock");
	ramblock_disk->fops = &ramblock_fops;	/* mandatory, registration fails without it */

	/*
	 * Capacity is given in sectors, and a sector is always taken to be
	 * 512 bytes, hence the division.  Even a one-byte write is carried
	 * out as a whole-sector operation.
	 */
	set_capacity(ramblock_disk, RAMBLOCK_SIZE / 512);

	/* 4. Register the disk; requests may arrive from here on. */
	add_disk(ramblock_disk);
	return 0;

err_unregister:
	unregister_blkdev(major, "ramblock");
err_cleanup_queue:
	blk_cleanup_queue(ramblock_queue);
err_put_disk:
	put_disk(ramblock_disk);
	return err;
}
/*
 * Module exit: undo ramblock_init() in reverse order.
 *
 * The disk is removed first so that it is no longer registered while the
 * queue, the major number and the backing memory are being released.
 */
static void ramblock_exit(void)
{
	del_gendisk(ramblock_disk);
	put_disk(ramblock_disk);
	blk_cleanup_queue(ramblock_queue);
	unregister_blkdev(major, "ramblock");
	kfree(ramblock_buf);
}
module_init(ramblock_init);
module_exit(ramblock_exit);
MODULE_LICENSE("GPL");
测试:
测试3th,4th:
在开发板上:
1. insmod ramblock.ko
2. 格式化: mkdosfs /dev/ramblock
3. 挂接: mount /dev/ramblock /tmp/
4. 读写文件: cd /tmp, 在里面vi文件
5. cd /; umount /tmp/
6. cat /dev/ramblock > /mnt/ramblock.bin
7. 在PC上查看ramblock.bin
sudo mount -o loop ramblock.bin /mnt
测试5th:
1. insmod ramblock.ko
2. ls /dev/ramblock*
3. fdisk /dev/ramblock
分区是根据柱面来分,比如1-5分为一个主分区,其它的再分
分好之后就独立了,每个分区可以分别操作
总结:只需要写好处理队列的函数就可以了