From f2b07a9ecac88be480eaf04fb477cb72c062a914 Mon Sep 17 00:00:00 2001
From: Munehisa Kamata <kamatam@amazon.com>
Date: Fri, 10 Mar 2017 23:04:53 +0000
Subject: drivers/amazon: xen-blkfront: resurrect request-based mode
This change resurrects the request-based mode, which was completely dropped
in commit 907c3eb18e0b ("xen-blkfront: convert to blk-mq APIs").
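A new Kconfig option, AMAZON_XEN_BLKDEV_FRONTEND_MQ_DEFAULT, and the module
parameter xen_blkfront.use_blk_mq select between the blk-mq path and the
request-based path. In request-based mode the driver is limited to a single
ring and reuses that ring's lock as the request queue lock.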
Signed-off-by: Munehisa Kamata <kamatam@amazon.com>
Reviewed-by: Guru Anbalagane <guruanb@amazon.com>
CR: https://cr.amazon.com/r/6834653/
---
drivers/amazon/Kconfig | 12 +++
drivers/amazon/block/xen-blkfront.c | 206 ++++++++++++++++++++++++++++--------
2 files changed, 176 insertions(+), 42 deletions(-)
diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig
index aeecd56..986ed17 100644
--- a/drivers/amazon/Kconfig
+++ b/drivers/amazon/Kconfig
@@ -55,4 +55,16 @@ config AMAZON_XEN_BLKDEV_FRONTEND
This driver is separately maintained from drivers/block/xen-blkfront.c
and will include our changes which may not be available upstream.
+config AMAZON_XEN_BLKDEV_FRONTEND_MQ_DEFAULT
+ bool "blkfront: use blk-mq I/O path by default"
+ depends on AMAZON_XEN_BLKDEV_FRONTEND
+ default n
+ help
+ This option enables the new blk-mq based I/O path for blkfront
+ devices by default. If the option is not set, blkfront defaults
+ to the request-based mode.
+
+ The boot time parameter xen_blkfront.use_blk_mq can be used to
+ change the setting regardless of this option.
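+ For example, booting with xen_blkfront.use_blk_mq=0 selects the
+ request-based path even when this option is enabled.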
+
endif # AMAZON_DRIVER_UPDATES
diff --git a/drivers/amazon/block/xen-blkfront.c b/drivers/amazon/block/xen-blkfront.c
index 0f19206..9c13857 100644
--- a/drivers/amazon/block/xen-blkfront.c
+++ b/drivers/amazon/block/xen-blkfront.c
@@ -152,6 +152,19 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
#define BLK_MAX_RING_SIZE \
__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
+#ifdef CONFIG_AMAZON_XEN_BLKDEV_FRONTEND_MQ_DEFAULT
+static unsigned int blkfront_use_blk_mq = 1;
+#else
+static unsigned int blkfront_use_blk_mq = 0;
+#endif
+module_param_named(use_blk_mq, blkfront_use_blk_mq, uint, S_IRUGO);
+MODULE_PARM_DESC(use_blk_mq, "Override the default I/O path (0=request-based, 1=blk-mq)");
+
+/*
+ * Index to the first available ring.
+ */
+#define FIRST_RING_ID (0)
+
/*
* ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
* characters are enough. Define to 20 to keep consistent with backend.
@@ -876,6 +889,61 @@ static inline bool blkif_request_flush_invalid(struct request *req,
!info->feature_fua));
}
+static inline void blkif_complete_request(struct request *req, int error)
+{
+ if (blkfront_use_blk_mq)
+ blk_mq_complete_request(req, error);
+ else
+ __blk_end_request_all(req, error);
+}
+
+/* do_blkif_request
+ * read a block; request is in a request queue
+ */
+static void do_blkif_request(struct request_queue *rq)
+{
+ struct blkfront_info *info = NULL;
+ struct request *req;
+ int queued;
+
+ pr_debug("Entered do_blkif_request\n");
+
+ queued = 0;
+
+ while ((req = blk_peek_request(rq)) != NULL) {
+ info = req->rq_disk->private_data;
+
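+ /*
+ * The queue is stopped below whenever the ring is full or a request
+ * cannot be queued; kick_pending_request_queues_locked() restarts it
+ * once ring slots become available again.
+ */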
+ if (RING_FULL(&info->rinfo[FIRST_RING_ID].ring))
+ goto wait;
+
+ blk_start_request(req);
+
+ if (blkif_request_flush_invalid(req, info)) {
+ __blk_end_request_all(req, -EOPNOTSUPP);
+ continue;
+ }
+
+ pr_debug("do_blk req %p: cmd %p, sec %lx, "
+ "(%u/%u) [%s]\n",
+ req, req->cmd, (unsigned long)blk_rq_pos(req),
+ blk_rq_cur_sectors(req), blk_rq_sectors(req),
+ rq_data_dir(req) ? "write" : "read");
+
+ if (blkif_queue_request(req, &info->rinfo[FIRST_RING_ID])) {
+ blk_requeue_request(rq, req);
+wait:
+ /* Avoid pointless unplugs. */
+ blk_stop_queue(rq);
+ break;
+ }
+
+ queued++;
+ }
+
+ if (queued != 0)
+ flush_requests(&info->rinfo[FIRST_RING_ID]);
+}
+
static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *qd)
{
@@ -958,30 +1026,41 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
struct request_queue *rq;
struct blkfront_info *info = gd->private_data;
- memset(&info->tag_set, 0, sizeof(info->tag_set));
- info->tag_set.ops = &blkfront_mq_ops;
- info->tag_set.nr_hw_queues = info->nr_rings;
- if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+ if (blkfront_use_blk_mq) {
+ memset(&info->tag_set, 0, sizeof(info->tag_set));
+ info->tag_set.ops = &blkfront_mq_ops;
+ info->tag_set.nr_hw_queues = info->nr_rings;
+ if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+ /*
+ * When indirect descriptors are not supported, the I/O request
+ * will be split into multiple requests on the ring.
+ * To avoid problems when sending the requests, divide the
+ * depth of the queue by 2.
+ */
+ info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
+ } else
+ info->tag_set.queue_depth = BLK_RING_SIZE(info);
+ info->tag_set.numa_node = NUMA_NO_NODE;
+ info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ info->tag_set.cmd_size = 0;
+ info->tag_set.driver_data = info;
+
+ if (blk_mq_alloc_tag_set(&info->tag_set))
+ return -EINVAL;
+ rq = blk_mq_init_queue(&info->tag_set);
+ if (IS_ERR(rq)) {
+ blk_mq_free_tag_set(&info->tag_set);
+ return PTR_ERR(rq);
+ }
+ } else {
/*
- * When indirect descriptior is not supported, the I/O request
- * will be split between multiple request in the ring.
- * To avoid problems when sending the request, divide by
- * 2 the depth of the queue.
+ * Per-device lock no longer exists. Use the spin lock in the first
+ * available ring as the queue lock.
*/
- info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
- } else
- info->tag_set.queue_depth = BLK_RING_SIZE(info);
- info->tag_set.numa_node = NUMA_NO_NODE;
- info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
- info->tag_set.cmd_size = 0;
- info->tag_set.driver_data = info;
-
- if (blk_mq_alloc_tag_set(&info->tag_set))
- return -EINVAL;
- rq = blk_mq_init_queue(&info->tag_set);
- if (IS_ERR(rq)) {
- blk_mq_free_tag_set(&info->tag_set);
- return PTR_ERR(rq);
+ rq = blk_init_queue(do_blkif_request,
+ &info->rinfo[FIRST_RING_ID].ring_lock);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
}
rq->queuedata = info;
@@ -1179,21 +1258,29 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
static void xlvbd_release_gendisk(struct blkfront_info *info)
{
unsigned int minor, nr_minors, i;
+ unsigned long flags;
if (info->rq == NULL)
return;
/* No more blkif_request(). */
- blk_mq_stop_hw_queues(info->rq);
+ if (blkfront_use_blk_mq) {
+ blk_mq_stop_hw_queues(info->rq);
- for (i = 0; i < info->nr_rings; i++) {
- struct blkfront_ring_info *rinfo = &info->rinfo[i];
+ for (i = 0; i < info->nr_rings; i++) {
+ struct blkfront_ring_info *rinfo = &info->rinfo[i];
- /* No more gnttab callback work. */
- gnttab_cancel_free_callback(&rinfo->callback);
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&rinfo->callback);
- /* Flush gnttab callback work. Must be done with no locks held. */
- flush_work(&rinfo->work);
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_work(&rinfo->work);
+ }
+ } else {
+ spin_lock_irqsave(&info->rinfo[FIRST_RING_ID].ring_lock, flags);
+ blk_stop_queue(info->rq);
+ gnttab_cancel_free_callback(&info->rinfo[FIRST_RING_ID].callback);
+ spin_unlock_irqrestore(&info->rinfo[FIRST_RING_ID].ring_lock, flags);
}
del_gendisk(info->gd);
@@ -1203,7 +1290,8 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
xlbd_release_minors(minor, nr_minors);
blk_cleanup_queue(info->rq);
- blk_mq_free_tag_set(&info->tag_set);
+ if (blkfront_use_blk_mq)
+ blk_mq_free_tag_set(&info->tag_set);
info->rq = NULL;
put_disk(info->gd);
@@ -1213,8 +1301,17 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
/* Already hold rinfo->ring_lock. */
static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
{
- if (!RING_FULL(&rinfo->ring))
+ if (RING_FULL(&rinfo->ring))
+ return;
+
+ if (blkfront_use_blk_mq) {
blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
+ } else {
+ /* Re-enable calldowns */
+ blk_start_queue(rinfo->dev_info->rq);
+ /* Kick things off immediately */
+ do_blkif_request(rinfo->dev_info->rq);
+ }
}
static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
@@ -1342,8 +1439,12 @@ static void blkif_free(struct blkfront_info *info, int suspend)
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
- if (info->rq)
- blk_mq_stop_hw_queues(info->rq);
+ if (info->rq) {
+ if (blkfront_use_blk_mq)
+ blk_mq_stop_hw_queues(info->rq);
+ else
+ blk_stop_queue(info->rq);
+ }
for (i = 0; i < info->nr_rings; i++)
blkif_free_ring(&info->rinfo[i]);
@@ -1604,7 +1705,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
- blk_mq_complete_request(req, error);
+ blkif_complete_request(req, error);
break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
@@ -1633,7 +1734,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status);
- blk_mq_complete_request(req, error);
+ blkif_complete_request(req, error);
break;
default:
BUG();
@@ -1897,8 +1998,11 @@ static int negotiate_mq(struct blkfront_info *info)
backend_max_queues = 1;
info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
- /* We need at least one ring. */
- if (!info->nr_rings)
+ /*
+ * We need at least one ring. Also, do not allow multiple rings
+ * if blk-mq is not used.
+ */
+ if (!info->nr_rings || !blkfront_use_blk_mq)
info->nr_rings = 1;
info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
@@ -2035,6 +2139,10 @@ static int blkif_recover(struct blkfront_info *info)
}
xenbus_switch_state(info->xbdev, XenbusStateConnected);
+ /* blk_requeue_request() below must be called with the queue lock held */
+ if (!blkfront_use_blk_mq)
+ spin_lock_irq(&info->rinfo[FIRST_RING_ID].ring_lock);
+
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
@@ -2043,16 +2151,27 @@ static int blkif_recover(struct blkfront_info *info)
rinfo = &info->rinfo[r_index];
/* Kick any other new requests queued since we resumed */
- kick_pending_request_queues(rinfo);
+ if (blkfront_use_blk_mq)
+ kick_pending_request_queues(rinfo);
+ else
+ kick_pending_request_queues_locked(rinfo);
}
list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist);
BUG_ON(req->nr_phys_segments > segs);
- blk_mq_requeue_request(req);
+ if (blkfront_use_blk_mq)
+ blk_mq_requeue_request(req);
+ else
+ blk_requeue_request(info->rq, req);
}
- blk_mq_kick_requeue_list(info->rq);
+
+ if (!blkfront_use_blk_mq)
+ spin_unlock_irq(&info->rinfo[FIRST_RING_ID].ring_lock);
+
+ if (blkfront_use_blk_mq)
+ blk_mq_kick_requeue_list(info->rq);
while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
/* Traverse the list of pending bios and re-queue them */
@@ -2137,7 +2256,10 @@ static int blkfront_resume(struct xenbus_device *dev)
merge_bio.tail = shadow[j].request->biotail;
bio_list_merge(&info->bio_list, &merge_bio);
shadow[j].request->bio = NULL;
- blk_mq_end_request(shadow[j].request, 0);
+ if (blkfront_use_blk_mq)
+ blk_mq_end_request(shadow[j].request, 0);
+ else
+ blk_end_request_all(shadow[j].request, 0);
}
}
@@ -2148,7 +2270,7 @@ static int blkfront_resume(struct xenbus_device *dev)
return err;
err = talk_to_blkback(dev, info);
- if (!err)
+ if (!err && blkfront_use_blk_mq)
blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
/*
--
2.7.5