Commit 86839c56 authored by Bob Liu, committed by Konrad Rzeszutek Wilk

xen/block: add multi-page ring support

Extend xen/block to support multi-page rings, so that more requests can be
issued by using more than one page as the request ring between blkfront
and the backend. As a result, performance can improve significantly.

We saw impressive improvements on our high-end iSCSI storage cluster
backend. Using 64 pages as the ring, IOPS increased about 15 times in the
throughput test and more than doubled in the latency test.

The reason is that a one-page ring caps outstanding requests at 32; in our
case the iSCSI LUN was spread across about 100 physical drives, and 32
in-flight requests were not nearly enough to keep them all busy.
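
For intuition on where that cap comes from: the slot count of a blkif ring
is derived with the generic Xen ring macros and scales linearly with the
number of ring pages. A minimal sketch of the arithmetic, for illustration
only (the BLK_RING_SIZE helper below is not part of this patch):

	/*
	 * Slots in a blkif ring backed by (1 << order) 4K pages, using the
	 * generic size macro from xen/interface/io/ring.h; the result is
	 * rounded down to a power of two.  One page holds 32 requests,
	 * so order 6 (64 pages) holds 32 * 64 = 2048.
	 */
	#include <xen/interface/io/ring.h>
	#include <xen/interface/io/blkif.h>

	#define BLK_RING_SIZE(order) \
		__CONST_RING_SIZE(blkif, PAGE_SIZE << (order))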

Changes in v2:
 - Rebased to 4.0-rc6.
 - Documented how the multi-page ring feature works in linux io/blkif.h.

Changes in v3:
 - Removed the changes to linux io/blkif.h; follow the protocol defined
   in io/blkif.h of the Xen tree instead.
 - Rebased to 4.1-rc3.

Changes in v4:
 - Switched to using 'ring-page-order' and 'max-ring-page-order'.
 - Addressed a few comments from Roger.

Changes in v5:
 - Clarified in the comment that 4K page granularity is used.
 - Addressed more comments from Roger.
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
parent 8ab0144a
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -83,6 +83,13 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
 MODULE_PARM_DESC(max_persistent_grants,
                  "Maximum number of grants to map persistently");
 
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+
 /*
  * The LRU mechanism to clean the lists of persistent grants needs to
  * be executed periodically. The time interval between consecutive executions
@@ -1451,6 +1458,12 @@ static int __init xen_blkif_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+		xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+	}
+
 	rc = xen_blkif_interface_init();
 	if (rc)
 		goto failed_init;
...
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -44,6 +44,7 @@
 #include <xen/interface/io/blkif.h>
 #include <xen/interface/io/protocols.h>
 
+extern unsigned int xen_blkif_max_ring_order;
 /*
  * This is the maximum number of segments that would be allowed in indirect
  * requests. This value will also be passed to the frontend.
@@ -320,6 +321,7 @@ struct xen_blkif {
 	struct work_struct	free_work;
 	/* Thread shutdown wait queue. */
 	wait_queue_head_t	shutdown_wq;
+	unsigned int		nr_ring_pages;
 };
 
 struct seg_buf {
...
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -25,6 +25,7 @@
 
 /* Enlarge the array size in order to fully show blkback name. */
 #define BLKBACK_NAME_LEN (20)
+#define RINGREF_NAME_LEN (20)
 
 struct backend_info {
 	struct xenbus_device *dev;
@@ -156,8 +157,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	return blkif;
 }
 
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
-			 unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+			 unsigned int nr_grefs, unsigned int evtchn)
 {
 	int err;
 
@@ -165,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
 	if (blkif->irq)
 		return 0;
 
-	err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
+	err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
 				     &blkif->blk_ring);
 	if (err < 0)
 		return err;
 
@@ -175,21 +176,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
 	{
 		struct blkif_sring *sring;
 		sring = (struct blkif_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_32:
 	{
 		struct blkif_x86_32_sring *sring_x86_32;
 		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_64:
 	{
 		struct blkif_x86_64_sring *sring_x86_64;
 		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
 		break;
 	}
 	default:
@@ -270,7 +271,7 @@ static void xen_blkif_free(struct xen_blkif *blkif)
 		i++;
 	}
-	WARN_ON(i != XEN_BLKIF_REQS_PER_PAGE);
+	WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
 
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }
@@ -555,6 +556,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
 	if (err)
 		goto fail;
 
+	err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u",
+			    xen_blkif_max_ring_order);
+	if (err)
+		pr_warn("%s write out 'max-ring-page-order' failed\n", __func__);
+
 	err = xenbus_switch_state(dev, XenbusStateInitWait);
 	if (err)
 		goto fail;
@@ -818,8 +824,8 @@ static void connect(struct backend_info *be)
 static int connect_ring(struct backend_info *be)
 {
 	struct xenbus_device *dev = be->dev;
-	unsigned long ring_ref;
-	unsigned int evtchn;
+	unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
+	unsigned int evtchn, nr_grefs, ring_page_order;
 	unsigned int pers_grants;
 	char protocol[64] = "";
 	struct pending_req *req, *n;
@@ -827,14 +833,57 @@ static int connect_ring(struct backend_info *be)
 
 	pr_debug("%s %s\n", __func__, dev->otherend);
 
-	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
-			    &ring_ref, "event-channel", "%u", &evtchn, NULL);
-	if (err) {
-		xenbus_dev_fatal(dev, err,
-				 "reading %s/ring-ref and event-channel",
-				 dev->otherend);
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+			   &evtchn);
+	if (err != 1) {
+		err = -EINVAL;
+		xenbus_dev_fatal(dev, err, "reading %s/event-channel",
+				 dev->otherend);
 		return err;
 	}
+	pr_info("event-channel %u\n", evtchn);
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
+			   &ring_page_order);
+	if (err != 1) {
+		err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+				   "%u", &ring_ref[0]);
+		if (err != 1) {
+			err = -EINVAL;
+			xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+					 dev->otherend);
+			return err;
+		}
+		nr_grefs = 1;
+		pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
+			ring_ref[0]);
+	} else {
+		unsigned int i;
+
+		if (ring_page_order > xen_blkif_max_ring_order) {
+			err = -EINVAL;
+			xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
+					 dev->otherend, ring_page_order,
+					 xen_blkif_max_ring_order);
+			return err;
+		}
+
+		nr_grefs = 1 << ring_page_order;
+		for (i = 0; i < nr_grefs; i++) {
+			char ring_ref_name[RINGREF_NAME_LEN];
+
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+			err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name,
+					   "%u", &ring_ref[i]);
+			if (err != 1) {
+				err = -EINVAL;
+				xenbus_dev_fatal(dev, err, "reading %s/%s",
						 dev->otherend, ring_ref_name);
+				return err;
+			}
+			pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
+		}
+	}
 
 	be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
 	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
@@ -859,12 +908,13 @@ static int connect_ring(struct backend_info *be)
 
 	be->blkif->vbd.feature_gnt_persistent = pers_grants;
 	be->blkif->vbd.overflow_max_grants = 0;
+	be->blkif->nr_ring_pages = nr_grefs;
 
-	pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
-		ring_ref, evtchn, be->blkif->blk_protocol, protocol,
+	pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
+		nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
 		pers_grants ? "persistent grants" : "");
 
-	for (i = 0; i < XEN_BLKIF_REQS_PER_PAGE; i++) {
+	for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
 		if (!req)
 			goto fail;
@@ -883,10 +933,9 @@ static int connect_ring(struct backend_info *be)
 	}
 
 	/* Map the shared frame, irq etc. */
-	err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+	err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
 	if (err) {
-		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
-				 ring_ref, evtchn);
+		xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
 		return err;
 	}
...
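
For reference, after this patch the negotiation ends up in xenstore roughly
as follows (domain IDs, device numbers, and grant/port values below are
illustrative): the backend advertises its limit from xen_blkbk_probe(), and
connect_ring() reads back what the frontend chose.

	/local/domain/0/backend/vbd/1/51712/max-ring-page-order = "4"
	/local/domain/1/device/vbd/51712/ring-page-order = "2"
	/local/domain/1/device/vbd/51712/ring-ref0 = "8"
	/local/domain/1/device/vbd/51712/ring-ref1 = "9"
	/local/domain/1/device/vbd/51712/ring-ref2 = "10"
	/local/domain/1/device/vbd/51712/ring-ref3 = "11"
	/local/domain/1/device/vbd/51712/event-channel = "5"

An old frontend simply writes the single "ring-ref" key and no
"ring-page-order", which connect_ring() handles via the fallback path. The
advertised limit can be lowered at module load time, e.g.
"modprobe xen-blkback max_ring_page_order=2"; values above
XENBUS_MAX_RING_PAGE_ORDER are clamped in xen_blkif_init().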