对于NVMe SSD,我们有时会用到ioctl系统调用,该调用的流程是怎样的呢?
首先,在注册nvme设备的时候,会为该设备注册file operations:
/*
 * File operations installed for the nvme controller device node.  The same
 * handler, nvme_dev_ioctl(), serves both the native and the compat ioctl
 * entry points.
 */
static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,

	/* open/close of the device node */
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,

	/* ioctl entry points */
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};
在nvme_dev_ioctl里,存在switch语句,列举ioctl的几种cmd,其中我们主要关注的是:NVME_IOCTL_ADMIN_CMD和NVME_IOCTL_IO_CMD。
static long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct nvme_ctrl *ctrl = file->private_data; void __user *argp = (void __user *)arg; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: dev_warn(ctrl->device, "resetting controller\n"); return ctrl->ops->reset_ctrl(ctrl); case NVME_IOCTL_SUBSYS_RESET: return nvme_reset_subsystem(ctrl); case NVME_IOCTL_RESCAN: nvme_queue_scan(ctrl); return 0; default: return -ENOTTY; } }
对于ssd的读写命令,显然是要走NVME_IOCTL_IO_CMD这一分支,该分支调用的是nvme_dev_user_cmd,它最终同样会调用到下面的nvme_user_cmd函数(只是传入的ns不再是NULL),该函数主要做的事情是填充nvme_command c命令:
/*
 * Execute a user-supplied passthrough command.
 *
 * @ctrl: controller the command is addressed to
 * @ns:   namespace whose queue is used; when NULL the controller's admin
 *        queue is used instead
 * @ucmd: user-space struct nvme_passthru_cmd describing the command; its
 *        result field is written back when the submission did not fail with
 *        a negative errno
 *
 * Returns -EACCES without CAP_SYS_ADMIN, -EFAULT if the user structure cannot
 * be read or the result cannot be written, -EINVAL if the flags field is
 * non-zero, otherwise the value returned by nvme_submit_user_cmd().
 */
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd ucopy;
	struct nvme_command nvme_cmd;
	struct request_queue *queue;
	unsigned timeout_jiffies = 0;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&ucopy, ucmd, sizeof(ucopy)))
		return -EFAULT;
	if (ucopy.flags)
		return -EINVAL;

	/* Translate the user layout into the little-endian wire command. */
	memset(&nvme_cmd, 0, sizeof(nvme_cmd));
	nvme_cmd.common.opcode = ucopy.opcode;
	nvme_cmd.common.flags = ucopy.flags;
	nvme_cmd.common.nsid = cpu_to_le32(ucopy.nsid);
	nvme_cmd.common.cdw2[0] = cpu_to_le32(ucopy.cdw2);
	nvme_cmd.common.cdw2[1] = cpu_to_le32(ucopy.cdw3);
	nvme_cmd.common.cdw10[0] = cpu_to_le32(ucopy.cdw10);
	nvme_cmd.common.cdw10[1] = cpu_to_le32(ucopy.cdw11);
	nvme_cmd.common.cdw10[2] = cpu_to_le32(ucopy.cdw12);
	nvme_cmd.common.cdw10[3] = cpu_to_le32(ucopy.cdw13);
	nvme_cmd.common.cdw10[4] = cpu_to_le32(ucopy.cdw14);
	nvme_cmd.common.cdw10[5] = cpu_to_le32(ucopy.cdw15);

	/* A zero timeout_ms leaves the timeout at 0. */
	if (ucopy.timeout_ms)
		timeout_jiffies = msecs_to_jiffies(ucopy.timeout_ms);

	queue = ns ? ns->queue : ctrl->admin_q;
	ret = nvme_submit_user_cmd(queue, &nvme_cmd,
			(void __user *)(uintptr_t)ucopy.addr, ucopy.data_len,
			&ucopy.result, timeout_jiffies);
	if (ret < 0)
		return ret;

	if (put_user(ucopy.result, &ucmd->result))
		return -EFAULT;

	return ret;
}
/*
 * Wrapper around __nvme_submit_user_cmd() for callers that have no metadata
 * buffer: meta_buffer, meta_len and meta_seed are passed as NULL, 0 and 0.
 */
int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen, u32 *result,
		unsigned timeout)
{
	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
			result, timeout);
}

/*
 * Allocate a request for @cmd on @q, attach the user data buffer (and, when
 * given, a kernel copy of the user metadata buffer), run it through
 * blk_execute_rq() and report the outcome.
 *
 * @q:           request queue to submit on; q->queuedata, when set, is the
 *               namespace whose disk is attached to the bio
 * @cmd:         command to send
 * @ubuffer:     user data buffer, mapped only when both it and @bufflen are
 *               non-zero
 * @bufflen:     length of @ubuffer
 * @meta_buffer: user metadata buffer; only considered when a data buffer is
 *               mapped and the queue has a disk
 * @meta_len:    length of @meta_buffer
 * @meta_seed:   value stored in the integrity payload's bi_sector
 * @result:      if non-NULL, receives the 32-bit result of the request
 * @timeout:     request timeout; 0 selects ADMIN_TIMEOUT
 *
 * Returns the PTR_ERR() of a failed request allocation, the error from
 * mapping the user buffer, -ENODEV / -ENOMEM / -EFAULT from the setup steps,
 * -EINTR when the request was flagged NVME_REQ_CANCELLED, or otherwise the
 * status recorded in nvme_req(req).
 */
int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u32 *result, unsigned timeout)
{
	bool write = nvme_is_write(cmd);
	struct nvme_ns *ns = q->queuedata;
	struct gendisk *disk = ns ? ns->disk : NULL;
	struct request *req;
	struct bio *bio = NULL;
	void *meta = NULL;
	int ret;

	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
				GFP_KERNEL);
		if (ret)
			goto out;
		bio = req->bio;

		/* Without a disk there is no bdev or metadata to set up. */
		if (!disk)
			goto submit;
		bio->bi_bdev = bdget_disk(disk, 0);
		if (!bio->bi_bdev) {
			ret = -ENODEV;
			goto out_unmap;
		}

		if (meta_buffer && meta_len) {
			struct bio_integrity_payload *bip;

			/* Kernel-side buffer standing in for the user metadata. */
			meta = kmalloc(meta_len, GFP_KERNEL);
			if (!meta) {
				ret = -ENOMEM;
				goto out_unmap;
			}

			/* Only write commands need the user's metadata copied in. */
			if (write) {
				if (copy_from_user(meta, meta_buffer,
						meta_len)) {
					ret = -EFAULT;
					goto out_free_meta;
				}
			}

			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
			if (IS_ERR(bip)) {
				ret = PTR_ERR(bip);
				goto out_free_meta;
			}

			bip->bip_iter.bi_size = meta_len;
			bip->bip_iter.bi_sector = meta_seed;

			/* Anything other than meta_len is reported as -ENOMEM. */
			ret = bio_integrity_add_page(bio, virt_to_page(meta),
					meta_len, offset_in_page(meta));
			if (ret != meta_len) {
				ret = -ENOMEM;
				goto out_free_meta;
			}
		}
	}
 submit:
	blk_execute_rq(req->q, disk, req, 0);
	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;
	else
		ret = nvme_req(req)->status;
	if (result)
		*result = le32_to_cpu(nvme_req(req)->result.u32);
	/* Copy metadata back to the user only for a successful non-write. */
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
	}
	/* Unwind in reverse order of setup; the success path falls through. */
 out_free_meta:
	kfree(meta);
 out_unmap:
	if (bio) {
		if (disk && bio->bi_bdev)
			bdput(bio->bi_bdev);
		blk_rq_unmap_user(bio);
	}
 out:
	blk_mq_free_request(req);
	return ret;
}
__nvme_submit_user_cmd做的主要事情是,通过调用nvme_alloc_request函数分配一个request,对于读写命令,还要对request的bio进行初始化。最后就是提交,调用的函数是blk_execute_rq。
调用的函数也从驱动层到了block层。
在blk-exec.c文件中,找到了blk_execute_rq函数:
void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); unsigned long hang_check; rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); /* Prevent hang_check timer from firing at us during very long I/O */ hang_check = sysctl_hung_task_timeout_secs; if (hang_check) while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); }
该函数主要包括两个部分,第一部分,调用blk_execute_rq_nowait将请求发送下去,第二部分,则是调用wait_for_completion_io函数来等待请求完成(下面也会提到)。
对于blk_execute_rq_nowait函数,其作用是将request insert到software queue中,然后返回。由于NVMe现在使用的是多队列,因此在if(q->mq_ops)中,就会调用blk_mq_sched_insert_request函数。
/*
 * Queue @rq for execution without waiting for it to finish.
 *
 * @q:       queue to insert into
 * @bd_disk: disk stored in rq->rq_disk
 * @rq:      request to insert; a warning fires if it is not a passthrough
 *           request
 * @at_head: non-zero to insert at the front rather than the back
 * @done:    callback stored in rq->end_io
 */
void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
			   struct request *rq, int at_head,
			   rq_end_io_fn *done)
{
	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));

	rq->rq_disk = bd_disk;
	rq->end_io = done;

	/*
	 * don't check dying flag for MQ because the request won't
	 * be reused after dying flag is set
	 */
	if (q->mq_ops) {
		blk_mq_sched_insert_request(rq, at_head, true, false, false);
		return;
	}

	spin_lock_irq(q->queue_lock);
	if (unlikely(blk_queue_dying(q))) {
		/* Queue is dying: end the request with -ENXIO, quietly. */
		rq->rq_flags |= RQF_QUIET;
		__blk_end_request_all(rq, -ENXIO);
	} else {
		__elv_add_request(q, rq, at_head ? ELEVATOR_INSERT_FRONT :
						   ELEVATOR_INSERT_BACK);
		__blk_run_queue(q);
	}
	spin_unlock_irq(q->queue_lock);
}
然后就调用到了blk_mq_sched_insert_request函数,在这个函数中,由于不是flush,而且也不是不插入队列,而blk-mq走的是mq-deadline调度器,因此会进入if(e && e->type->ops.mq.insert_requests)里面:
/*
 * Insert @rq through the I/O scheduler if one provides an insert_requests
 * hook, otherwise into the software queue, then optionally run the hw queue.
 *
 * @rq:        request to insert
 * @at_head:   insert at the head instead of the tail
 * @run_queue: run the hardware queue after inserting
 * @async:     passed through to blk_mq_run_hw_queue()
 * @can_block: passed through to blk_mq_sched_insert_flush()
 */
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	bool bypassed;

	/* Flush requests without a tag take their own path and never run. */
	if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
		blk_mq_sched_insert_flush(hctx, rq, can_block);
		return;
	}

	bypassed = e && blk_mq_sched_bypass_insert(hctx, rq);
	if (!bypassed) {
		if (e && e->type->ops.mq.insert_requests) {
			LIST_HEAD(single);

			/* The scheduler hook takes a list; wrap the request. */
			list_add(&rq->queuelist, &single);
			e->type->ops.mq.insert_requests(hctx, &single, at_head);
		} else {
			spin_lock(&ctx->lock);
			__blk_mq_insert_request(hctx, rq, at_head);
			spin_unlock(&ctx->lock);
		}
	}

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}
针对mq设计的deadline调度器在文件mq-deadline.c里,我们可以看到这里定义了一些操作,其中就包括insert_requests。
static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_queue, .exit_sched = dd_exit_queue, }, .uses_mq = true, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_owner = THIS_MODULE, };
因此调用的是dd_insert_requests函数,在这个函数中,会将请求加入到调度器(deadline_data)自己的队列中,而且前后需要用dd->lock对调度器的队列加锁。
/*
 * Drain @list into the deadline scheduler, one request at a time, while
 * holding the scheduler lock.
 *
 * @hctx:    hardware context whose queue owns the scheduler data
 * @list:    requests to insert; empty on return
 * @at_head: forwarded to dd_insert_request() for every request
 */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
			       struct list_head *list, bool at_head)
{
	struct deadline_data *sched = hctx->queue->elevator->elevator_data;

	spin_lock(&sched->lock);
	while (!list_empty(list)) {
		struct request *next = list_first_entry(list, struct request,
							queuelist);

		list_del_init(&next->queuelist);
		dd_insert_request(hctx, next, at_head);
	}
	spin_unlock(&sched->lock);
}
然后对list中的每一个request,依次插入到队列中,在本次调用过程中,只存在一个request,调用的函数也非常简单,由于该request是passthrough的,而且会将request插入到队尾,因此调用的函数是list_add_tail。
/*
 * Add a single request to the deadline scheduler.
 *
 * @hctx:    hardware context the request is being inserted through
 * @rq:      request to add
 * @at_head: put the request at the front of the dispatch list
 *
 * A request that can be merged into an existing one is consumed by the merge.
 * Head insertions and passthrough requests go straight onto the dispatch
 * list; everything else is added to the sort tree and the per-direction FIFO
 * with an expiry time.
 */
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			      bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *sched = q->elevator->elevator_data;
	const int dir = rq_data_dir(rq);

	if (blk_mq_sched_try_insert_merge(q, rq))
		return;

	blk_mq_sched_request_inserted(rq);

	if (at_head) {
		list_add(&rq->queuelist, &sched->dispatch);
		return;
	}

	if (blk_rq_is_passthrough(rq)) {
		list_add_tail(&rq->queuelist, &sched->dispatch);
		return;
	}

	deadline_add_rq_rb(sched, rq);

	if (rq_mergeable(rq)) {
		elv_rqhash_add(q, rq);
		if (!q->last_merge)
			q->last_merge = rq;
	}

	/*
	 * set expire time and add to fifo list
	 */
	rq->fifo_time = jiffies + sched->fifo_expire[dir];
	list_add_tail(&rq->queuelist, &sched->fifo_list[dir]);
}
将request插入到队列之后,就会回到blk_mq_sched_insert_request函数中,调用blk_mq_run_hw_queue函数,在这个函数以及接下来调用的函数中,最终会调用到dispatch函数,即blk_mq_sched_dispatch_requests函数:
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { __blk_mq_delay_run_hw_queue(hctx, async, 0); } static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx) || !blk_mq_hw_queue_mapped(hctx))) return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); put_cpu(); return; } put_cpu(); } kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { int srcu_idx; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && cpu_online(hctx->next_cpu)); if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { might_sleep(); srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } }
在blk_mq_sched_dispatch_requests函数中,首先会检查一下是否存在以前的entries,然后调用mq-deadline调度器的dispatch函数用来dispatch请求
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; bool did_work = false; LIST_HEAD(rq_list); if (unlikely(blk_mq_hctx_stopped(hctx))) return; hctx->run++; /* * If we have previous entries on our dispatch list, grab them first for * more fair dispatch. */ if (!list_empty_careful(&hctx->dispatch)) { spin_lock(&hctx->lock); if (!list_empty(&hctx->dispatch)) list_splice_init(&hctx->dispatch, &rq_list); spin_unlock(&hctx->lock); } /* * Only ask the scheduler for requests, if we didn't have residual * requests from the dispatch list. This is to avoid the case where * we only ever dispatch a fraction of the requests available because * of low device queue depth. Once we pull requests out of the IO * scheduler, we can no longer merge or sort them. So it's best to * leave them there for as long as we can. Mark the hw queue as * needing a restart in that case. */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); did_work = blk_mq_dispatch_rq_list(q, &rq_list); } else if (!has_sched_dispatch) { blk_mq_flush_busy_ctxs(hctx, &rq_list); blk_mq_dispatch_rq_list(q, &rq_list); } /* * We want to dispatch from the scheduler if we had no work left * on the dispatch list, OR if we did have work but weren't able * to make progress. */ if (!did_work && has_sched_dispatch) { do { struct request *rq; rq = e->type->ops.mq.dispatch_request(hctx); if (!rq) break; list_add(&rq->queuelist, &rq_list); } while (blk_mq_dispatch_rq_list(q, &rq_list)); } }
在mq-deadline里定义的dispatch函数是dd_dispatch_request函数,该函数会根据deadline的调度器规则,从中选出一个请求返回给上层
/*
 * Pick the next request to dispatch according to the deadline policy.  The
 * only caller visible here, dd_dispatch_request(), holds dd->lock around
 * this call.
 *
 * Selection order as implemented below:
 *   1. anything on dd->dispatch goes first;
 *   2. continue the current batch while a next request exists and fewer than
 *      fifo_batch requests have been batched;
 *   3. otherwise choose a direction: reads, unless writes are pending and the
 *      starved counter has reached writes_starved;
 *   4. within that direction take the head of the FIFO if
 *      deadline_check_fifo() returns non-zero or there is no next request,
 *      else continue from dd->next_rq.
 *
 * Returns the chosen request with RQF_STARTED set, or NULL when neither FIFO
 * holds a request.
 */
static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	struct request *rq;
	bool reads, writes;
	int data_dir;

	/* The dispatch list bypasses sorting and batching entirely. */
	if (!list_empty(&dd->dispatch)) {
		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
		list_del_init(&rq->queuelist);
		goto done;
	}

	reads = !list_empty(&dd->fifo_list[READ]);
	writes = !list_empty(&dd->fifo_list[WRITE]);

	/*
	 * batches are currently reads XOR writes
	 */
	if (dd->next_rq[WRITE])
		rq = dd->next_rq[WRITE];
	else
		rq = dd->next_rq[READ];

	if (rq && dd->batching < dd->fifo_batch)
		/* we have a next request are still entitled to batch */
		goto dispatch_request;

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */

	if (reads) {
		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));

		/* starved counts reads chosen while writes were waiting. */
		if (writes && (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;

		data_dir = READ;

		goto dispatch_find_request;
	}

	/*
	 * there are either no reads or writes have been starved
	 */

	if (writes) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));

		dd->starved = 0;

		data_dir = WRITE;

		goto dispatch_find_request;
	}

	return NULL;

dispatch_find_request:
	/*
	 * we are not running a batch, find best request for selected data_dir
	 */
	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
		/*
		 * A deadline has expired, the last request was in the other
		 * direction, or we have run out of higher-sectored requests.
		 * Start again from the request with the earliest expiry time.
		 */
		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
	} else {
		/*
		 * The last req was the same dir and we have a next request in
		 * sort order. No expired requests so continue on from here.
		 */
		rq = dd->next_rq[data_dir];
	}

	/* A new batch starts here; the increment below counts this request. */
	dd->batching = 0;

dispatch_request:
	/*
	 * rq is the selected appropriate request.
	 */
	dd->batching++;
	deadline_move_request(dd, rq);
done:
	rq->rq_flags |= RQF_STARTED;
	return rq;
}

/*
 * dispatch_request hook: runs __dd_dispatch_request() under dd->lock and
 * returns its result.
 */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock(&dd->lock);
	rq = __dd_dispatch_request(hctx);
	spin_unlock(&dd->lock);

	return rq;
}
接着会调用块层函数blk_mq_dispatch_rq_list函数,该函数也是再次和驱动交互的一个函数,在该函数中,会调用queue_rq函数,这也是在nvme driver里定义的operations。
/*
 * Hand the requests on @list to the driver through q->mq_ops->queue_rq().
 *
 * @q:    queue whose driver hook is called
 * @list: requests to dispatch; whatever could not be issued is spliced onto
 *        hctx->dispatch before returning, leaving @list empty
 *
 * Returns false for an empty @list; otherwise true when at least one request
 * was either queued or ended with an error.
 *
 * NOTE(review): hctx is only assigned through the out-parameter of
 * blk_mq_get_driver_tag(); presumably it is always written, even on failure,
 * since it is used after a failed call - confirm against that helper.
 */
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;

	if (list_empty(list))
		return false;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);
		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
			/* Nothing queued yet: try again after reordering. */
			if (!queued && reorder_tags_to_front(list))
				continue;

			/*
			 * The initial allocation attempt failed, so we need to
			 * rerun the hardware queue when a tag is freed.
			 */
			if (!blk_mq_dispatch_wait_add(hctx))
				break;

			/*
			 * It's possible that a tag was freed in the window
			 * between the allocation failure and adding the
			 * hardware queue to the wait queue.
			 */
			if (!blk_mq_get_driver_tag(rq, &hctx, false))
				break;
		}

		list_del_init(&rq->queuelist);

		bd.rq = rq;

		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
		if (list_empty(list))
			bd.last = true;
		else {
			struct request *nxt;

			nxt = list_first_entry(list, struct request, queuelist);
			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
		}

		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			break;
		case BLK_MQ_RQ_QUEUE_BUSY:
			/* Give the tag back and put rq at the front again. */
			blk_mq_put_driver_tag_hctx(hctx, rq);
			list_add(&rq->queuelist, list);
			__blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
			/* fall through: unknown codes are handled as errors */
		case BLK_MQ_RQ_QUEUE_ERROR:
			errors++;
			blk_mq_end_request(rq, -EIO);
			break;
		}

		/* A busy driver ends the dispatch loop. */
		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	} while (!list_empty(list));

	hctx->dispatched[queued_to_index(queued)]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(list)) {
		/*
		 * If an I/O scheduler has been configured and we got a driver
		 * tag for the next request already, free it again.
		 */
		rq = list_first_entry(list, struct request, queuelist);
		blk_mq_put_driver_tag(rq);

		spin_lock(&hctx->lock);
		list_splice_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If TAG_WAITING is set that means that an I/O scheduler has
		 * been configured and another thread is waiting for a driver
		 * tag. To guarantee fairness, do not rerun this hardware queue
		 * but let the other thread grab the driver tag.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation. Notes:
		 * - blk_mq_run_hw_queue() checks whether or not a queue has
		 *   been stopped before rerunning a queue.
		 * - Some but not all block drivers stop a queue before
		 *   returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
		 *   and dm-rq.
		 */
		if (!blk_mq_sched_needs_restart(hctx) &&
		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
			blk_mq_run_hw_queue(hctx, true);
	}

	return (queued + errors) != 0;
}
nvme driver里的queue_rq就是nvme_queue_rq函数,在这个函数中做的事是:setup cmd,init iod,然后发送命令,最后会调用nvme_process_cq函数;在命令完成、ssd controller发出中断之后,也会调用到这个函数:
/*
 * queue_rq hook of the nvme driver: build the command for bd->rq, set up its
 * per-request data, copy the command into the submission queue and process
 * any completions already present.
 *
 * @hctx: hardware context; driver_data is the struct nvme_queue to submit on
 * @bd:   dispatch data carrying the request
 *
 * Returns BLK_MQ_RQ_QUEUE_OK when the command was submitted (and also when
 * the request was ended here with -EFAULT for missing metadata),
 * BLK_MQ_RQ_QUEUE_ERROR when the queue's cq_vector is negative, or the
 * non-OK value reported by one of the setup steps.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formated with metadata, require the block layer provide a buffer
	 * unless this namespace is formated such that the metadata can be
	 * stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
		    !blk_rq_is_passthrough(req)) {
			/* The request is completed here, so report OK. */
			blk_mq_end_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret != BLK_MQ_RQ_QUEUE_OK)
		return ret;

	ret = nvme_init_iod(req, dev);
	if (ret != BLK_MQ_RQ_QUEUE_OK)
		goto out_free_cmd;

	/* Data is mapped only when the request has physical segments. */
	if (blk_rq_nr_phys_segments(req))
		ret = nvme_map_data(dev, req, &cmnd);

	if (ret != BLK_MQ_RQ_QUEUE_OK)
		goto out_cleanup_iod;

	blk_mq_start_request(req);

	spin_lock_irq(&nvmeq->q_lock);
	/* A negative cq_vector is treated as a queue that cannot take commands. */
	if (unlikely(nvmeq->cq_vector < 0)) {
		ret = BLK_MQ_RQ_QUEUE_ERROR;
		spin_unlock_irq(&nvmeq->q_lock);
		goto out_cleanup_iod;
	}
	__nvme_submit_cmd(nvmeq, &cmnd);
	/* Process completions that are already posted, under the same lock. */
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out_cleanup_iod:
	nvme_free_iod(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}
接着我们看一下__nvme_submit_cmd函数,该函数将cmd copy到SQ中,然后向SQ的doorbell里写入tail信息,告诉SSD controller来了新命令。
/*
 * Copy @cmd into the next submission queue slot and advance the tail.
 *
 * @nvmeq: queue to submit on
 * @cmd:   command to copy into the slot at the current tail
 *
 * The doorbell register is written only when
 * nvme_dbbuf_update_and_check_event() returns non-zero.
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
						struct nvme_command *cmd)
{
	const u16 slot = nvmeq->sq_tail;
	u16 next;

	/* sq_cmds_io, when set, is written with the I/O-memory copy helper. */
	if (nvmeq->sq_cmds_io) {
		memcpy_toio(&nvmeq->sq_cmds_io[slot], cmd, sizeof(*cmd));
	} else {
		memcpy(&nvmeq->sq_cmds[slot], cmd, sizeof(*cmd));
	}

	/* Advance the tail, wrapping at the queue depth. */
	next = slot + 1;
	if (next == nvmeq->q_depth)
		next = 0;

	if (nvme_dbbuf_update_and_check_event(next, nvmeq->dbbuf_sq_db,
					      nvmeq->dbbuf_sq_ei))
		writel(next, nvmeq->q_db);

	nvmeq->sq_tail = next;
}
至此,从ioctl到写入SQ已经结束了,接下来的事情就是需要交给SSD controller取命令,执行命令了。
ssd controller执行完命令之后,会向CQ里写入完成信息,并向host端发出中断,中断调用的函数即nvme_process_cq函数
在这个函数中,执行的主要流程是,从CQ的head检查CQ,对于有效的CQ信息(通过CQ entry里的P位判断,这个可以参考NVMe specification),进行处理,并调用nvme_end_request函数,来进行一个I/O request结束处理
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) { u16 head, phase; head = nvmeq->cq_head; phase = nvmeq->cq_phase; while (nvme_cqe_valid(nvmeq, head, phase)) { struct nvme_completion cqe = nvmeq->cqes[head]; struct request *req; if (++head == nvmeq->q_depth) { head = 0; phase = !phase; } if (tag && *tag == cqe.command_id) *tag = -1; if (unlikely(cqe.command_id >= nvmeq->q_depth)) { dev_warn(nvmeq->dev->ctrl.device, "invalid id %d completed on queue %d\n", cqe.command_id, le16_to_cpu(cqe.sq_id)); continue; } /* * AEN requests are special as they don't time out and can * survive any kind of queue freeze and often don't respond to * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. */ if (unlikely(nvmeq->qid == 0 && cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe.status, &cqe.result); continue; } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); nvme_end_request(req, cqe.status, cqe.result); } if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return; if (likely(nvmeq->cq_vector >= 0)) if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, nvmeq->dbbuf_cq_ei)) writel(head, nvmeq->q_db + nvmeq->dev->db_stride); nvmeq->cq_head = head; nvmeq->cq_phase = phase; nvmeq->cqe_seen = 1; } static void nvme_process_cq(struct nvme_queue *nvmeq) { __nvme_process_cq(nvmeq, NULL); }
接着我们看一下nvme_end_request函数,在这个函数中,依次调用了blk_mq_complete_request、 __blk_mq_complete_request,最后调用到nvme driver定义的softirq_done_fn函数
static void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; bool shared = false; int cpu; if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq); } if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { rq->q->softirq_done_fn(rq); return; } cpu = get_cpu(); if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); } else { rq->q->softirq_done_fn(rq); } put_cpu(); }
该函数(定义的过程省略,这一部分是在对nvme 设备初始化定义的)是nvme_pci_complete_rq函数
/*
 * Completion callback of the nvme PCI driver: undo the data mapping recorded
 * in the request's iod, then run the common nvme completion.
 */
static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dev *dev = iod->nvmeq->dev;

	nvme_unmap_data(dev, req);
	nvme_complete_rq(req);
}
void nvme_complete_rq(struct request *req) { if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { nvme_req(req)->retries++; blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q)); return; } blk_mq_end_request(req, nvme_error_status(req)); }
void blk_mq_end_request(struct request *rq, int error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); }
inline void __blk_mq_end_request(struct request *rq, int error) { blk_account_io_done(rq); if (rq->end_io) { wbt_done(rq->q->rq_wb, &rq->issue_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) blk_mq_free_request(rq->next_rq); blk_mq_free_request(rq); } }
可以看出,上面这个__blk_mq_end_request函数会调用request的end_io,这个是在哪里定义的呢,可以看一下blk_execute_rq和blk_execute_rq_nowait函数,request的end_io就是blk_end_sync_rq函数
这个函数的作用就是唤醒上面blk_execute_rq里的wait_for_completion_io(&wait)
static void blk_end_sync_rq(struct request *rq, int error) { struct completion *waiting = rq->end_io_data; rq->end_io_data = NULL; /* * complete last, if this is a stack request the process (and thus * the rq pointer) could be invalid right after this complete() */ complete(waiting); }
至此,整个ioctl的过程几乎完全结束了,剩下的就是将result向上返回了