From f2b07a9ecac88be480eaf04fb477cb72c062a914 Mon Sep 17 00:00:00 2001
From: Munehisa Kamata <kamatam@amazon.com>
Date: Fri, 10 Mar 2017 23:04:53 +0000
Subject: drivers/amazon: xen-blkfront: resurrect request-based mode
This change resurrects the request-based mode, which was completely dropped
in commit 907c3eb18e0b ("xen-blkfront: convert to blk-mq APIs").
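A new Kconfig option, AMAZON_XEN_BLKDEV_FRONTEND_MQ_DEFAULT, and the module
parameter xen_blkfront.use_blk_mq select between the blk-mq path and the
request-based path. In request-based mode the driver is limited to a single
ring and reuses that ring's lock as the request queue lock.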
Signed-off-by: Munehisa Kamata <kamatam@amazon.com>
Reviewed-by: Guru Anbalagane <guruanb@amazon.com>
CR: https://cr.amazon.com/r/6834653/
---
drivers/amazon/Kconfig | 12 +++
drivers/amazon/block/xen-blkfront.c | 206 ++++++++++++++++++++++++++++--------
2 files changed, 176 insertions(+), 42 deletions(-)
diff --git a/drivers/amazon/Kconfig b/drivers/amazon/Kconfig
index aeecd56..986ed17 100644
--- a/drivers/amazon/Kconfig
+++ b/drivers/amazon/Kconfig
@@ -55,4 +55,16 @@ config AMAZON_XEN_BLKDEV_FRONTEND
This driver is separately maintained from drivers/block/xen-blkfront.c
and will include our changes which may not be available upstream.
+config AMAZON_XEN_BLKDEV_FRONTEND_MQ_DEFAULT
+ bool "blkfront: use blk-mq I/O path by default"
+ depends on AMAZON_XEN_BLKDEV_FRONTEND
+ default n
+ help
+ This option enables the new blk-mq based I/O path for blkfront
+ devices by default. If the option is not set, blkfront defaults
+ to the request-based mode.
+
+ The boot time parameter xen_blkfront.use_blk_mq can be used to
+ change the setting regardless of this option.
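+ For example, booting with xen_blkfront.use_blk_mq=0 selects the
+ request-based path even when this option is enabled.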
+
endif # AMAZON_DRIVER_UPDATES
diff --git a/drivers/amazon/block/xen-blkfront.c b/drivers/amazon/block/xen-blkfront.c
index 0f19206..9c13857 100644
--- a/drivers/amazon/block/xen-blkfront.c
+++ b/drivers/amazon/block/xen-blkfront.c
@@ -152,6 +152,19 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
#define BLK_MAX_RING_SIZE \
__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
+#ifdef CONFIG_AMAZON_XEN_BLKDEV_FRONTEND_MQ_DEFAULT
+static unsigned int blkfront_use_blk_mq = 1;
+#else
+static unsigned int blkfront_use_blk_mq = 0;
+#endif
+module_param_named(use_blk_mq, blkfront_use_blk_mq, uint, S_IRUGO);
+MODULE_PARM_DESC(use_blk_mq, "Override the default I/O path (0=request-based, 1=blk-mq)");
+
+/*
+ * Index to the first available ring.
+ */
+#define FIRST_RING_ID (0)
+
/*
* ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
* characters are enough. Define to 20 to keep consistent with backend.
@@ -876,6 +889,61 @@ static inline bool blkif_request_flush_invalid(struct request *req,
!info->feature_fua));
}
+static inline void blkif_complete_request(struct request *req, int error)
+{
+ if (blkfront_use_blk_mq)
+ blk_mq_complete_request(req, error);
+ else
+ __blk_end_request_all(req, error);
+}
+
+/* do_blkif_request
+ * read a block; request is in a request queue
+ */
+static void do_blkif_request(struct request_queue *rq)
+{
+ struct blkfront_info *info = NULL;
+ struct request *req;
+ int queued;
+
+ pr_debug("Entered do_blkif_request\n");
+
+ queued = 0;
+
+ while ((req = blk_peek_request(rq)) != NULL) {
+ info = req->rq_disk->private_data;
+
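+ /*
+ * The queue is stopped below whenever the ring is full or a request
+ * cannot be queued; kick_pending_request_queues_locked() restarts it
+ * once ring slots become available again.
+ */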
+ if (RING_FULL(&info->rinfo[FIRST_RING_ID].ring))
+ goto wait;
+
+ blk_start_request(req);
+
+ if (blkif_request_flush_invalid(req, info)) {
+ __blk_end_request_all(req, -EOPNOTSUPP);
+ continue;
+ }
+
+ pr_debug("do_blk req %p: cmd %p, sec %lx, "
+ "(%u/%u) [%s]\n",
+ req, req->cmd, (unsigned long)blk_rq_pos(req),
+ blk_rq_cur_sectors(req), blk_rq_sectors(req),
+ rq_data_dir(req) ? "write" : "read");
+
+ if (blkif_queue_request(req, &info->rinfo[FIRST_RING_ID])) {
+ blk_requeue_request(rq, req);
+wait:
+ /* Avoid pointless unplugs. */
+ blk_stop_queue(rq);
+ break;
+ }
+
+ queued++;
+ }
+
+ if (queued != 0)
+ flush_requests(&info->rinfo[FIRST_RING_ID]);
+}
+
static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *qd)
{
@@ -958,30 +1026,41 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
struct request_queue *rq;
struct blkfront_info *info = gd->private_data;
- memset(&info->tag_set, 0, sizeof(info->tag_set));
- info->tag_set.ops = &blkfront_mq_ops;
- info->tag_set.nr_hw_queues = info->nr_rings;
- if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+ if (blkfront_use_blk_mq) {
+ memset(&info->tag_set, 0, sizeof(info->tag_set));
+ info->tag_set.ops = &blkfront_mq_ops;
+ info->tag_set.nr_hw_queues = info->nr_rings;
+ if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+ /*
+ * When indirect descriptors are not supported, the I/O request
+ * will be split into multiple requests on the ring.
+ * To avoid problems when sending the requests, divide the
+ * depth of the queue by 2.
+ */
+ info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
+ } else
+ info->tag_set.queue_depth = BLK_RING_SIZE(info);
+ info->tag_set.numa_node = NUMA_NO_NODE;
+ info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ info->tag_set.cmd_size = 0;
+ info->tag_set.driver_data = info;
+
+ if (blk_mq_alloc_tag_set(&info->tag_set))
+ return -EINVAL;
+ rq = blk_mq_init_queue(&info->tag_set);
+ if (IS_ERR(rq)) {
+ blk_mq_free_tag_set(&info->tag_set);
+ return PTR_ERR(rq);
+ }
+ } else {
/*
- * When indirect descriptior is not supported, the I/O request
- * will be split between multiple request in the ring.
- * To avoid problems when sending the request, divide by
- * 2 the depth of the queue.
+ * Per-device lock no longer exists. Use the spin lock in the first
+ * available ring as the queue lock.
*/
- info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
- } else
- info->tag_set.queue_depth = BLK_RING_SIZE(info);
- info->tag_set.numa_node = NUMA_NO_NODE;
- info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
- info->tag_set.cmd_size = 0;
- info->tag_set.driver_data = info;
-
- if (blk_mq_alloc_tag_set(&info->tag_set))
- return -EINVAL;
- rq = blk_mq_init_queue(&info->tag_set);
- if (IS_ERR(rq)) {
- blk_mq_free_tag_set(&info->tag_set);
- return PTR_ERR(rq);
+ rq = blk_init_queue(do_blkif_request,
+ &info->rinfo[FIRST_RING_ID].ring_lock);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
}
rq->queuedata = info;
@@ -1179,21 +1258,29 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
static void xlvbd_release_gendisk(struct blkfront_info *info)
{
unsigned int minor, nr_minors, i;
+ unsigned long flags;
if (info->rq == NULL)
return;
/* No more blkif_request(). */
- blk_mq_stop_hw_queues(info->rq);
+ if (blkfront_use_blk_mq) {
+ blk_mq_stop_hw_queues(info->rq);
- for (i = 0; i < info->nr_rings; i++) {
- struct blkfront_ring_info *rinfo = &info->rinfo[i];
+ for (i = 0; i < info->nr_rings; i++) {
+ struct blkfront_ring_info *rinfo = &info->rinfo[i];
- /* No more gnttab callback work. */
- gnttab_cancel_free_callback(&rinfo->callback);
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&rinfo->callback);
- /* Flush gnttab callback work. Must be done with no locks held. */
- flush_work(&rinfo->work);
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_work(&rinfo->work);
+ }
+ } else {
+ spin_lock_irqsave(&info->rinfo[FIRST_RING_ID].ring_lock, flags);
+ blk_stop_queue(info->rq);
+ gnttab_cancel_free_callback(&info->rinfo[FIRST_RING_ID].callback);
+ spin_unlock_irqrestore(&info->rinfo[FIRST_RING_ID].ring_lock, flags);
}
del_gendisk(info->gd);
@@ -1203,7 +1290,8 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
xlbd_release_minors(minor, nr_minors);
blk_cleanup_queue(info->rq);
- blk_mq_free_tag_set(&info->tag_set);
+ if (blkfront_use_blk_mq)
+ blk_mq_free_tag_set(&info->tag_set);
info->rq = NULL;
put_disk(info->gd);
@@ -1213,8 +1301,17 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
/* Already hold rinfo->ring_lock. */
static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
{
- if (!RING_FULL(&rinfo->ring))
+ if (RING_FULL(&rinfo->ring))
+ return;
+
+ if (blkfront_use_blk_mq) {
blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
+ } else {
+ /* Re-enable calldowns */
+ blk_start_queue(rinfo->dev_info->rq);
+ /* Kick things off immediately */
+ do_blkif_request(rinfo->dev_info->rq);
+ }
}
static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
@@ -1342,8 +1439,12 @@ static void blkif_free(struct blkfront_info *info, int suspend)
info->connected = suspend ?
BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
/* No more blkif_request(). */
- if (info->rq)
- blk_mq_stop_hw_queues(info->rq);
+ if (info->rq) {
+ if (blkfront_use_blk_mq)
+ blk_mq_stop_hw_queues(info->rq);
+ else
+ blk_stop_queue(info->rq);
+ }
for (i = 0; i < info->nr_rings; i++)
blkif_free_ring(&info->rinfo[i]);
@@ -1604,7 +1705,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
- blk_mq_complete_request(req, error);
+ blkif_complete_request(req, error);
break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
@@ -1633,7 +1734,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status);
- blk_mq_complete_request(req, error);
+ blkif_complete_request(req, error);
break;
default:
BUG();
@@ -1897,8 +1998,11 @@ static int negotiate_mq(struct blkfront_info *info)
backend_max_queues = 1;
info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
- /* We need at least one ring. */
- if (!info->nr_rings)
+ /*
+ * We need at least one ring. Also, do not allow multiple rings
+ * if blk-mq is not used.
+ */
+ if (!info->nr_rings || !blkfront_use_blk_mq)
info->nr_rings = 1;
info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
@@ -2035,6 +2139,10 @@ static int blkif_recover(struct blkfront_info *info)
}
xenbus_switch_state(info->xbdev, XenbusStateConnected);
+ /* blk_requeue_request() below must be called with the queue lock held */
+ if (!blkfront_use_blk_mq)
+ spin_lock_irq(&info->rinfo[FIRST_RING_ID].ring_lock);
+
/* Now safe for us to use the shared ring */
info->connected = BLKIF_STATE_CONNECTED;
@@ -2043,16 +2151,27 @@ static int blkif_recover(struct blkfront_info *info)
rinfo = &info->rinfo[r_index];
/* Kick any other new requests queued since we resumed */
- kick_pending_request_queues(rinfo);
+ if (blkfront_use_blk_mq)
+ kick_pending_request_queues(rinfo);
+ else
+ kick_pending_request_queues_locked(rinfo);
}
list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist);
BUG_ON(req->nr_phys_segments > segs);
- blk_mq_requeue_request(req);
+ if (blkfront_use_blk_mq)
+ blk_mq_requeue_request(req);
+ else
+ blk_requeue_request(info->rq, req);
}
- blk_mq_kick_requeue_list(info->rq);
+
+ if (!blkfront_use_blk_mq)
+ spin_unlock_irq(&info->rinfo[FIRST_RING_ID].ring_lock);
+
+ if (blkfront_use_blk_mq)
+ blk_mq_kick_requeue_list(info->rq);
while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
/* Traverse the list of pending bios and re-queue them */
@@ -2137,7 +2256,10 @@ static int blkfront_resume(struct xenbus_device *dev)
merge_bio.tail = shadow[j].request->biotail;
bio_list_merge(&info->bio_list, &merge_bio);
shadow[j].request->bio = NULL;
- blk_mq_end_request(shadow[j].request, 0);
+ if (blkfront_use_blk_mq)
+ blk_mq_end_request(shadow[j].request, 0);
+ else
+ blk_end_request_all(shadow[j].request, 0);
}
}
@@ -2148,7 +2270,7 @@ static int blkfront_resume(struct xenbus_device *dev)
return err;
err = talk_to_blkback(dev, info);
- if (!err)
+ if (!err && blkfront_use_blk_mq)
blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
/*
--
2.7.5