Commit 4a4037f2 authored by Rob Nelson's avatar Rob Nelson Committed by Luis Henriques

UBUNTU: SAUCE: nvme: improve performance for virtual NVMe devices

BugLink: http://bugs.launchpad.net/bugs/1637565

This change provides a mechanism to reduce the number of MMIO doorbell
writes for the NVMe driver. When running in a virtualized environment
like QEMU, the cost of an MMIO is quite hefty. The main idea of
the patch is to provide the device two memory locations:
 1) to store the doorbell values so they can be looked up without the
    doorbell MMIO write
 2) to store an event index.
I believe the doorbell value is obvious, the event index not so much.
Similar to the virtio specification, the virtual device can tell the
driver (guest OS) not to write MMIO unless you are writing past this
value.

FYI: doorbell values are written by the nvme driver (guest OS) and the
event index is written by the virtual device (host OS).

The patch implements a new admin command that will communicate where
these two memory locations reside. If the command fails, the nvme
driver will work as before without any optimizations.

Contributions:
  Eric Northup <digitaleric@google.com>
  Frank Swiderski <fes@google.com>
  Ted Tso <tytso@mit.edu>
  Keith Busch <keith.busch@intel.com>

Just to give an idea on the performance boost with the vendor
extension: Running fio [1], a stock NVMe driver I get about 200K read
IOPs with my vendor patch I get about 1000K read IOPs. This was
running with a null device i.e. the backing device simply returned
success on every read IO request.

[1] Running on a 4 core machine:
  fio --time_based --name=benchmark --runtime=30
  --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
  --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
  --rw=randread --blocksize=4k --randrepeat=false
Signed-off-by: default avatarRob Nelson <rlnelson@google.com>
[mlin: port for upstream]
Signed-off-by: default avatarMing Lin <mlin@kernel.org>
[koike: updated for current APIs]
Signed-off-by: default avatarHelen Mae Koike Fornazier <helen.koike@collabora.co.uk>
Signed-off-by: default avatarTim Gardner <tim.gardner@canonical.com>
Acked-by: default avatarStefan Bader <stefan.bader@canonical.com>
Acked-by: default avatarSeth Forshee <seth.forshee@canonical.com>
Signed-off-by: default avatarLuis Henriques <luis.henriques@canonical.com>
parent 45ed9217
......@@ -17,5 +17,14 @@ config BLK_DEV_NVME_SCSI
and block devices nodes, as well a a translation for a small
number of selected SCSI commands to NVMe commands to the NVMe
driver. If you don't know what this means you probably want
to say N here, and if you know what it means you probably
want to say N as well.
to say N here, unless you run a distro that abuses the SCSI
emulation to provide stable device names for mount by id, like
some OpenSuSE and SLES versions.
config NVME_VENDOR_EXT_GOOGLE
bool "NVMe Vendor Extension for Improved Virtualization"
depends on BLK_DEV_NVME
---help---
Google extension to reduce the number of MMIO doorbell
writes for the NVMe driver
......@@ -50,6 +50,9 @@
#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
/* Google Vendor ID is not in include/linux/pci_ids.h */
#define PCI_VENDOR_ID_GOOGLE 0x1AE0
/*
* We handle AEN commands ourselves and don't even let the
* block layer know about them.
......@@ -122,6 +125,12 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
struct completion ioq_wait;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
u32 *db_mem;
dma_addr_t doorbell;
u32 *ei_mem;
dma_addr_t eventidx;
#endif
};
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
......@@ -153,6 +162,12 @@ struct nvme_queue {
u16 qid;
u8 cq_phase;
u8 cqe_seen;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
u32 *sq_doorbell_addr;
u32 *sq_eventidx_addr;
u32 *cq_doorbell_addr;
u32 *cq_eventidx_addr;
#endif
};
/*
......@@ -190,6 +205,9 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
BUILD_BUG_ON(sizeof(struct nvme_doorbell_memory) != 64);
#endif
}
/*
......@@ -319,6 +337,51 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
}
}
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
/*
 * Size in bytes of the shadow doorbell (or event-index) buffer:
 * one admin queue plus one I/O queue per possible CPU, each queue
 * pair taking an SQ and a CQ slot of db_stride dwords (4 bytes each)
 * — hence the factor of 2 * 4 = 8 per queue.
 */
static int nvme_vendor_memory_size(struct nvme_dev *dev)
{
	int nr_queues = num_possible_cpus() + 1;

	return nr_queues * 2 * 4 * dev->db_stride;
}
static int nvme_set_doorbell_memory(struct nvme_dev *dev)
{
struct nvme_command c;
memset(&c, 0, sizeof(c));
c.doorbell_memory.opcode = nvme_admin_doorbell_memory;
c.doorbell_memory.prp1 = cpu_to_le64(dev->doorbell);
c.doorbell_memory.prp2 = cpu_to_le64(dev->eventidx);
return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
/*
 * Borrowed from vring_need_event: decide whether the device asked for
 * an MMIO doorbell ring.  True iff event_idx lies in the half-open
 * wrap-around window (old, new_idx] of values just written.
 */
static inline int nvme_ext_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	/* u16 arithmetic wraps mod 2^16, which is exactly what we want. */
	u16 dist_past_event = new_idx - event_idx - 1;
	u16 dist_written = new_idx - old;

	return dist_past_event < dist_written;
}
/*
 * Update a queue doorbell, suppressing the MMIO write when possible.
 *
 * @value:     new doorbell (head or tail) value
 * @q_db:      MMIO doorbell register
 * @db_addr:   shadow doorbell slot in host memory, or NULL if the
 *             vendor extension is not active
 * @event_idx: event-index slot written by the device
 *
 * With no shadow buffer this degenerates to a plain writel().  Otherwise
 * the shadow value is updated first and the MMIO write is issued only if
 * the device's event index indicates it wants to be notified.
 */
static void nvme_ext_write_doorbell(u16 value, u32 __iomem* q_db,
			u32* db_addr, volatile u32* event_idx)
{
	u16 prev;

	if (!db_addr) {
		writel(value, q_db);
		return;
	}

	prev = *db_addr;
	*db_addr = value;

	/* Order the shadow-doorbell store before reading event_idx. */
	rmb();

	if (nvme_ext_need_event(*event_idx, value, prev))
		writel(value, q_db);
}
#endif
/**
* __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
......@@ -336,9 +399,19 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
else
memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
if (nvmeq->sq_doorbell_addr)
wmb();
#endif
if (++tail == nvmeq->q_depth)
tail = 0;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
nvme_ext_write_doorbell(tail, nvmeq->q_db,
nvmeq->sq_doorbell_addr, nvmeq->sq_eventidx_addr);
#else
writel(tail, nvmeq->q_db);
#endif
nvmeq->sq_tail = tail;
}
......@@ -752,6 +825,12 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
if ((status & 1) != phase)
break;
nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
if (to_pci_dev(nvmeq->dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE)
rmb();
#endif
if (++head == nvmeq->q_depth) {
head = 0;
phase = !phase;
......@@ -798,7 +877,14 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
return;
if (likely(nvmeq->cq_vector >= 0))
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
nvme_ext_write_doorbell(head,
nvmeq->q_db + nvmeq->dev->db_stride,
nvmeq->cq_doorbell_addr,
nvmeq->cq_eventidx_addr);
#else
writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
#endif
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
......@@ -1184,6 +1270,17 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
dev->queues[qid] = nvmeq;
dev->queue_count++;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
if (dev->db_mem && dev->ei_mem && qid != 0) {
nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
nvmeq->cq_doorbell_addr =
&dev->db_mem[(qid * 2 + 1) * dev->db_stride];
nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
nvmeq->cq_eventidx_addr =
&dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
}
#endif
return nvmeq;
free_cqdma:
......@@ -1214,6 +1311,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE && qid != 0) {
nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
nvmeq->cq_doorbell_addr =
&dev->db_mem[(qid * 2 + 1) * dev->db_stride];
nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
nvmeq->cq_eventidx_addr =
&dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
}
#endif
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
dev->online_queues++;
spin_unlock_irq(&nvmeq->q_lock);
......@@ -1712,6 +1819,19 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (blk_mq_alloc_tag_set(&dev->tagset))
return 0;
dev->ctrl.tagset = &dev->tagset;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE) {
int res = nvme_set_doorbell_memory(dev);
if (res) {
// Free memory and continue on.
dma_free_coherent(dev->dev, 8192, dev->db_mem, dev->doorbell);
dma_free_coherent(dev->dev, 8192, dev->ei_mem, dev->doorbell);
dev->db_mem = 0;
dev->ei_mem = 0;
}
}
#endif
}
nvme_queue_scan(dev);
return 0;
......@@ -1769,8 +1889,31 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
if (pdev->vendor == PCI_VENDOR_ID_GOOGLE) {
int mem_size = nvme_vendor_memory_size(dev);
dev->db_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->doorbell, GFP_KERNEL);
if (!dev->db_mem) {
result = -ENOMEM;
goto disable;
}
dev->ei_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->eventidx, GFP_KERNEL);
if (!dev->ei_mem) {
result = -ENOMEM;
goto dma_free;
}
}
#endif
return 0;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
dma_free:
dma_free_coherent(&pdev->dev, nvme_vendor_memory_size(dev), dev->db_mem, dev->doorbell);
dev->db_mem = 0;
#endif
disable:
pci_disable_device(pdev);
......@@ -1787,6 +1930,14 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
static void nvme_pci_disable(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
int mem_size = nvme_vendor_memory_size(dev);
if (dev->db_mem)
dma_free_coherent(&pdev->dev, mem_size, dev->db_mem, dev->doorbell);
if (dev->ei_mem)
dma_free_coherent(&pdev->dev, mem_size, dev->ei_mem, dev->eventidx);
#endif
if (pdev->msi_enabled)
pci_disable_msi(pdev);
......
......@@ -546,6 +546,9 @@ enum nvme_admin_opcode {
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
nvme_admin_doorbell_memory = 0xC0,
#endif
};
enum {
......@@ -810,6 +813,18 @@ struct nvmf_property_get_command {
__u8 resv4[16];
};
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
/*
 * Google vendor-extension admin command payload telling the device where
 * the shadow doorbell and event-index buffers reside.  Must be exactly
 * 64 bytes (enforced by a BUILD_BUG_ON in _nvme_check_size).
 */
struct nvme_doorbell_memory {
__u8 opcode; /* nvme_admin_doorbell_memory (0xC0) */
__u8 flags;
__u16 command_id;
__u32 rsvd1[5];
__le64 prp1; /* DMA address of the shadow doorbell buffer */
__le64 prp2; /* DMA address of the event-index buffer */
__u32 rsvd12[6];
};
#endif
struct nvme_command {
union {
struct nvme_common_command common;
......@@ -823,6 +838,9 @@ struct nvme_command {
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
struct nvme_abort_cmd abort;
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
struct nvme_doorbell_memory doorbell_memory;
#endif
struct nvme_get_log_page_command get_log_page;
struct nvmf_common_command fabrics;
struct nvmf_connect_command connect;
......@@ -901,6 +919,9 @@ enum {
NVME_SC_BAD_ATTRIBUTES = 0x180,
NVME_SC_INVALID_PI = 0x181,
NVME_SC_READ_ONLY = 0x182,
#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
NVME_SC_DOORBELL_MEMORY_INVALID = 0x1C0,
#endif
/*
* I/O Command Set Specific - Fabrics commands:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment