Commit 64145482 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull virtio updates from Michael Tsirkin:

 - vdpa sim refactoring

 - virtio mem: Big Block Mode support

 - misc cleanups, fixes

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (61 commits)
  vdpa: Use simpler version of ida allocation
  vdpa: Add missing comment for virtqueue count
  uapi: virtio_ids: add missing device type IDs from OASIS spec
  uapi: virtio_ids.h: consistent indentions
  vhost scsi: fix error return code in vhost_scsi_set_endpoint()
  virtio_ring: Fix two use after free bugs
  virtio_net: Fix error code in probe()
  virtio_ring: Cut and paste bugs in vring_create_virtqueue_packed()
  tools/virtio: add barrier for aarch64
  tools/virtio: add krealloc_array
  tools/virtio: include asm/bug.h
  vdpa/mlx5: Use write memory barrier after updating CQ index
  vdpa: split vdpasim to core and net modules
  vdpa_sim: split vdpasim_virtqueue's iov field in out_iov and in_iov
  vdpa_sim: make vdpasim->buffer size configurable
  vdpa_sim: use kvmalloc to allocate vdpasim->buffer
  vdpa_sim: set vringh notify callback
  vdpa_sim: add set_config callback in vdpasim_dev_attr
  vdpa_sim: add get_config callback in vdpasim_dev_attr
  vdpa_sim: make 'config' generic and usable for any device type
  ...
parents 58cf05f5 418eddef
@@ -3072,6 +3072,7 @@ static int virtnet_probe(struct virtio_device *vdev)
		dev_err(&vdev->dev,
			"device MTU appears to have changed it is now %d < %d",
			mtu, dev->min_mtu);
+		err = -EINVAL;
		goto free;
	}
...
@@ -9,21 +9,24 @@ menuconfig VDPA
if VDPA

config VDPA_SIM
-	tristate "vDPA device simulator"
+	tristate "vDPA device simulator core"
	depends on RUNTIME_TESTING_MENU && HAS_DMA
	select DMA_OPS
	select VHOST_RING
+	help
+	  Enable this module to support vDPA device simulators. These devices
+	  are used for testing, prototyping and development of vDPA.
+
+config VDPA_SIM_NET
+	tristate "vDPA simulator for networking device"
+	depends on VDPA_SIM
	select GENERIC_NET_UTILS
	default n
	help
-	  vDPA networking device simulator which loop TX traffic back
-	  to RX. This device is used for testing, prototyping and
-	  development of vDPA.
+	  vDPA networking device simulator which loops TX traffic back to RX.

config IFCVF
	tristate "Intel IFC VF vDPA driver"
	depends on PCI_MSI
-	default n
	help
	  This kernel module can drive Intel IFC VF NIC to offload
	  virtio dataplane traffic to hardware.
@@ -42,7 +45,6 @@ config MLX5_VDPA_NET
	tristate "vDPA driver for ConnectX devices"
	select MLX5_VDPA
	depends on MLX5_CORE
-	default n
	help
	  VDPA network driver for ConnectX6 and newer. Provides offloading
	  of virtio net datapath such that descriptors put on the ring will
...
@@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
		return ret;
	}

-	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
	if (ret) {
-		IFCVF_ERR(pdev, "No usable DMA confiugration\n");
-		return ret;
-	}
-
-	ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (ret) {
-		IFCVF_ERR(pdev,
-			  "No usable coherent DMA confiugration\n");
+		IFCVF_ERR(pdev, "No usable DMA configuration\n");
		return ret;
	}
...
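
The IFC VF probe change above collapses the two legacy PCI DMA-mask calls into a single dma_set_mask_and_coherent(). A minimal, driver-agnostic sketch of that probe-time pattern (the helper name and the 32-bit fallback policy are assumptions for illustration, not taken from the ifcvf driver):

#include <linux/dma-mapping.h>
#include <linux/pci.h>

/* Illustrative helper: request a 64-bit mask for both streaming and
 * coherent DMA in one call, falling back to 32 bits if that fails. */
static int example_setup_dma(struct pci_dev *pdev)
{
	int ret;

	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
	if (ret)
		ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
	if (ret)
		dev_err(&pdev->dev, "No usable DMA configuration\n");

	return ret;
}
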
@@ -479,6 +479,11 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
{
	mlx5_cq_set_ci(&mvq->cq.mcq);
+	/* make sure CQ consumer update is visible to the hardware before updating
+	 * RX doorbell record.
+	 */
+	dma_wmb();
	rx_post(&mvq->vqqp, num);
	if (mvq->event_cb.callback)
		mvq->event_cb.callback(mvq->event_cb.private);
...
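
The mlx5 fix above inserts dma_wmb() so the CQ consumer-index update reaches the device before the RX doorbell write. The general descriptor-then-doorbell ordering pattern, sketched with made-up structures (example_ring and example_desc are illustrative, not mlx5 types):

#include <linux/types.h>
#include <linux/io.h>
#include <asm/barrier.h>

struct example_desc {
	__le64 addr;
	__le32 len;
	__le32 flags;
};

struct example_ring {
	struct example_desc *desc;	/* DMA-coherent descriptor array */
	void __iomem *doorbell;		/* device doorbell register */
	u32 head;
};

static void example_post_buffer(struct example_ring *ring, u64 addr, u32 len)
{
	ring->desc[ring->head].addr = cpu_to_le64(addr);
	ring->desc[ring->head].len = cpu_to_le32(len);

	/* Order the descriptor writes (coherent memory) before the doorbell
	 * write that tells the device to go look at them. */
	dma_wmb();

	writel(++ring->head, ring->doorbell);
}
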
@@ -89,7 +89,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
	if (!vdev)
		goto err;

-	err = ida_simple_get(&vdpa_index_ida, 0, 0, GFP_KERNEL);
+	err = ida_alloc(&vdpa_index_ida, GFP_KERNEL);
	if (err < 0)
		goto err_ida;
...
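
The __vdpa_alloc_device() change swaps ida_simple_get() for plain ida_alloc(), which has the same contract: the smallest free ID (>= 0) on success, a negative errno on failure, and release via ida_free(). A standalone sketch of that pattern (the ida and function names are illustrative):

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDA(example_ida);

static int example_use_index(void)
{
	int id = ida_alloc(&example_ida, GFP_KERNEL);

	if (id < 0)
		return id;	/* negative errno, e.g. -ENOMEM */

	/* ... use id as a device index ... */

	ida_free(&example_ida, id);
	return 0;
}
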
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o
+obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o
// SPDX-License-Identifier: GPL-2.0-only
/*
- * VDPA networking device simulator.
+ * VDPA device simulator core.
 *
 * Copyright (c) 2020, Red Hat Inc. All rights reserved.
 * Author: Jason Wang <jasowang@redhat.com>
@@ -11,97 +11,32 @@
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/wait.h>
-#include <linux/uuid.h>
-#include <linux/iommu.h>
#include <linux/dma-map-ops.h>
-#include <linux/sysfs.h>
-#include <linux/file.h>
-#include <linux/etherdevice.h>
#include <linux/vringh.h>
#include <linux/vdpa.h>
-#include <linux/virtio_byteorder.h>
#include <linux/vhost_iotlb.h>
-#include <uapi/linux/virtio_config.h>
-#include <uapi/linux/virtio_net.h>
+
+#include "vdpa_sim.h"

#define DRV_VERSION "0.1"
#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
-#define DRV_DESC "vDPA Device Simulator"
+#define DRV_DESC "vDPA Device Simulator core"
#define DRV_LICENSE "GPL v2"

static int batch_mapping = 1;
module_param(batch_mapping, int, 0444);
MODULE_PARM_DESC(batch_mapping, "Batched mapping 1 -Enable; 0 - Disable");

-static char *macaddr;
-module_param(macaddr, charp, 0);
-MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
+static int max_iotlb_entries = 2048;
+module_param(max_iotlb_entries, int, 0444);
+MODULE_PARM_DESC(max_iotlb_entries,
+		 "Maximum number of iotlb entries. 0 means unlimited. (default: 2048)");
struct vdpasim_virtqueue {
struct vringh vring;
struct vringh_kiov iov;
unsigned short head;
bool ready;
u64 desc_addr;
u64 device_addr;
u64 driver_addr;
u32 num;
void *private;
irqreturn_t (*cb)(void *data);
};
#define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_ALIGN PAGE_SIZE
#define VDPASIM_QUEUE_MAX 256 #define VDPASIM_QUEUE_MAX 256
#define VDPASIM_DEVICE_ID 0x1
#define VDPASIM_VENDOR_ID 0 #define VDPASIM_VENDOR_ID 0
#define VDPASIM_VQ_NUM 0x2
#define VDPASIM_NAME "vdpasim-netdev"
static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) |
(1ULL << VIRTIO_F_VERSION_1) |
(1ULL << VIRTIO_F_ACCESS_PLATFORM) |
(1ULL << VIRTIO_NET_F_MAC);
/* State of each vdpasim device */
struct vdpasim {
struct vdpa_device vdpa;
struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM];
struct work_struct work;
/* spinlock to synchronize virtqueue state */
spinlock_t lock;
struct virtio_net_config config;
struct vhost_iotlb *iommu;
void *buffer;
u32 status;
u32 generation;
u64 features;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
};
/* TODO: cross-endian support */
static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
{
return virtio_legacy_is_little_endian() ||
(vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
}
static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
{
return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
{
return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
}
static struct vdpasim *vdpasim_dev;
static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
{
@@ -115,20 +50,34 @@ static struct vdpasim *dev_to_sim(struct device *dev)
	return vdpa_to_sim(vdpa);
}

+static void vdpasim_vq_notify(struct vringh *vring)
+{
+	struct vdpasim_virtqueue *vq =
+		container_of(vring, struct vdpasim_virtqueue, vring);
+
+	if (!vq->cb)
+		return;
+
+	vq->cb(vq->private);
+}
+
static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx)
{
	struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];

-	vringh_init_iotlb(&vq->vring, vdpasim_features,
+	vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
			  VDPASIM_QUEUE_MAX, false,
			  (struct vring_desc *)(uintptr_t)vq->desc_addr,
			  (struct vring_avail *)
			  (uintptr_t)vq->driver_addr,
			  (struct vring_used *)
			  (uintptr_t)vq->device_addr);
+
+	vq->vring.notify = vdpasim_vq_notify;
}

-static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
+static void vdpasim_vq_reset(struct vdpasim *vdpasim,
+			     struct vdpasim_virtqueue *vq)
{
	vq->ready = false;
	vq->desc_addr = 0;
@@ -136,16 +85,18 @@ static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq)
	vq->device_addr = 0;
	vq->cb = NULL;
	vq->private = NULL;
-	vringh_init_iotlb(&vq->vring, vdpasim_features, VDPASIM_QUEUE_MAX,
-			  false, NULL, NULL, NULL);
+	vringh_init_iotlb(&vq->vring, vdpasim->dev_attr.supported_features,
+			  VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL);
+
+	vq->vring.notify = NULL;
}

static void vdpasim_reset(struct vdpasim *vdpasim)
{
	int i;

-	for (i = 0; i < VDPASIM_VQ_NUM; i++)
-		vdpasim_vq_reset(&vdpasim->vqs[i]);
+	for (i = 0; i < vdpasim->dev_attr.nvqs; i++)
+		vdpasim_vq_reset(vdpasim, &vdpasim->vqs[i]);

	spin_lock(&vdpasim->iommu_lock);
	vhost_iotlb_reset(vdpasim->iommu);
@@ -156,80 +107,6 @@ static void vdpasim_reset(struct vdpasim *vdpasim)
	++vdpasim->generation;
}
static void vdpasim_work(struct work_struct *work)
{
struct vdpasim *vdpasim = container_of(work, struct
vdpasim, work);
struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
ssize_t read, write;
size_t total_write;
int pkts = 0;
int err;
spin_lock(&vdpasim->lock);
if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
goto out;
if (!txq->ready || !rxq->ready)
goto out;
while (true) {
total_write = 0;
err = vringh_getdesc_iotlb(&txq->vring, &txq->iov, NULL,
&txq->head, GFP_ATOMIC);
if (err <= 0)
break;
err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->iov,
&rxq->head, GFP_ATOMIC);
if (err <= 0) {
vringh_complete_iotlb(&txq->vring, txq->head, 0);
break;
}
while (true) {
read = vringh_iov_pull_iotlb(&txq->vring, &txq->iov,
vdpasim->buffer,
PAGE_SIZE);
if (read <= 0)
break;
write = vringh_iov_push_iotlb(&rxq->vring, &rxq->iov,
vdpasim->buffer, read);
if (write <= 0)
break;
total_write += write;
}
/* Make sure data is wrote before advancing index */
smp_wmb();
vringh_complete_iotlb(&txq->vring, txq->head, 0);
vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
/* Make sure used is visible before rasing the interrupt. */
smp_wmb();
local_bh_disable();
if (txq->cb)
txq->cb(txq->private);
if (rxq->cb)
rxq->cb(rxq->private);
local_bh_enable();
if (++pkts > 4) {
schedule_work(&vdpasim->work);
goto out;
}
}
out:
spin_unlock(&vdpasim->lock);
}
static int dir_to_perm(enum dma_data_direction dir)
{
	int perm = -EFAULT;
@@ -342,26 +219,28 @@ static const struct dma_map_ops vdpasim_dma_ops = {
	.free = vdpasim_free_coherent,
};

-static const struct vdpa_config_ops vdpasim_net_config_ops;
-static const struct vdpa_config_ops vdpasim_net_batch_config_ops;
+static const struct vdpa_config_ops vdpasim_config_ops;
+static const struct vdpa_config_ops vdpasim_batch_config_ops;

-static struct vdpasim *vdpasim_create(void)
+struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr)
{
	const struct vdpa_config_ops *ops;
	struct vdpasim *vdpasim;
	struct device *dev;
-	int ret = -ENOMEM;
+	int i, ret = -ENOMEM;

	if (batch_mapping)
-		ops = &vdpasim_net_batch_config_ops;
+		ops = &vdpasim_batch_config_ops;
	else
-		ops = &vdpasim_net_config_ops;
+		ops = &vdpasim_config_ops;

-	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM);
+	vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops,
+				    dev_attr->nvqs);
	if (!vdpasim)
		goto err_alloc;

-	INIT_WORK(&vdpasim->work, vdpasim_work);
+	vdpasim->dev_attr = *dev_attr;
+	INIT_WORK(&vdpasim->work, dev_attr->work_fn);
	spin_lock_init(&vdpasim->lock);
	spin_lock_init(&vdpasim->iommu_lock);
@@ -371,31 +250,27 @@ static struct vdpasim *vdpasim_create(void)
		goto err_iommu;
	set_dma_ops(dev, &vdpasim_dma_ops);

-	vdpasim->iommu = vhost_iotlb_alloc(2048, 0);
+	vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL);
+	if (!vdpasim->config)
+		goto err_iommu;
+
+	vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue),
+			       GFP_KERNEL);
+	if (!vdpasim->vqs)
+		goto err_iommu;
+
+	vdpasim->iommu = vhost_iotlb_alloc(max_iotlb_entries, 0);
	if (!vdpasim->iommu)
		goto err_iommu;

-	vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	vdpasim->buffer = kvmalloc(dev_attr->buffer_size, GFP_KERNEL);
	if (!vdpasim->buffer)
		goto err_iommu;

-	if (macaddr) {
-		mac_pton(macaddr, vdpasim->config.mac);
-		if (!is_valid_ether_addr(vdpasim->config.mac)) {
-			ret = -EADDRNOTAVAIL;
-			goto err_iommu;
-		}
-	} else {
-		eth_random_addr(vdpasim->config.mac);
-	}
-
-	vringh_set_iotlb(&vdpasim->vqs[0].vring, vdpasim->iommu);
-	vringh_set_iotlb(&vdpasim->vqs[1].vring, vdpasim->iommu);
+	for (i = 0; i < dev_attr->nvqs; i++)
+		vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu);

	vdpasim->vdpa.dma_dev = dev;
-	ret = vdpa_register_device(&vdpasim->vdpa);
-	if (ret)
-		goto err_iommu;

	return vdpasim;
@@ -404,6 +279,7 @@ static struct vdpasim *vdpasim_create(void)
err_alloc:
	return ERR_PTR(ret);
}
+EXPORT_SYMBOL_GPL(vdpasim_create);
static int vdpasim_set_vq_address(struct vdpa_device *vdpa, u16 idx,
				  u64 desc_area, u64 driver_area,
@@ -498,28 +374,21 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa)

static u64 vdpasim_get_features(struct vdpa_device *vdpa)
{
-	return vdpasim_features;
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	return vdpasim->dev_attr.supported_features;
}

static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features)
{
	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
-	struct virtio_net_config *config = &vdpasim->config;

	/* DMA mapping must be done by driver */
	if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM)))
		return -EINVAL;

-	vdpasim->features = features & vdpasim_features;
-
-	/* We generally only know whether guest is using the legacy interface
-	 * here, so generally that's the earliest we can set config fields.
-	 * Note: We actually require VIRTIO_F_ACCESS_PLATFORM above which
-	 * implies VIRTIO_F_VERSION_1, but let's not try to be clever here.
-	 */
-	config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
-	config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
+	vdpasim->features = features & vdpasim->dev_attr.supported_features;

	return 0;
}
@@ -536,7 +405,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa)

static u32 vdpasim_get_device_id(struct vdpa_device *vdpa)
{
-	return VDPASIM_DEVICE_ID;
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	return vdpasim->dev_attr.id;
}

static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa)
@@ -572,14 +443,27 @@ static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
{
	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);

-	if (offset + len < sizeof(struct virtio_net_config))
-		memcpy(buf, (u8 *)&vdpasim->config + offset, len);
+	if (offset + len > vdpasim->dev_attr.config_size)
+		return;
+
+	if (vdpasim->dev_attr.get_config)
+		vdpasim->dev_attr.get_config(vdpasim, vdpasim->config);
+
+	memcpy(buf, vdpasim->config + offset, len);
}

static void vdpasim_set_config(struct vdpa_device *vdpa, unsigned int offset,
			       const void *buf, unsigned int len)
{
-	/* No writable config supportted by vdpasim */
+	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+	if (offset + len > vdpasim->dev_attr.config_size)
+		return;
+
+	memcpy(vdpasim->config + offset, buf, len);
+
+	if (vdpasim->dev_attr.set_config)
+		vdpasim->dev_attr.set_config(vdpasim, vdpasim->config);
}

static u32 vdpasim_get_generation(struct vdpa_device *vdpa)
@@ -656,12 +540,14 @@ static void vdpasim_free(struct vdpa_device *vdpa)
	struct vdpasim *vdpasim = vdpa_to_sim(vdpa);

	cancel_work_sync(&vdpasim->work);
-	kfree(vdpasim->buffer);
+	kvfree(vdpasim->buffer);
	if (vdpasim->iommu)
		vhost_iotlb_free(vdpasim->iommu);
+	kfree(vdpasim->vqs);
+	kfree(vdpasim->config);
}

-static const struct vdpa_config_ops vdpasim_net_config_ops = {
+static const struct vdpa_config_ops vdpasim_config_ops = {
	.set_vq_address = vdpasim_set_vq_address,
	.set_vq_num = vdpasim_set_vq_num,
	.kick_vq = vdpasim_kick_vq,
@@ -688,7 +574,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = {
	.free = vdpasim_free,
};

-static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
+static const struct vdpa_config_ops vdpasim_batch_config_ops = {
	.set_vq_address = vdpasim_set_vq_address,
	.set_vq_num = vdpasim_set_vq_num,
	.kick_vq = vdpasim_kick_vq,
@@ -714,26 +600,6 @@ static const struct vdpa_config_ops vdpasim_net_batch_config_ops = {
	.free = vdpasim_free,
};
static int __init vdpasim_dev_init(void)
{
vdpasim_dev = vdpasim_create();
if (!IS_ERR(vdpasim_dev))
return 0;
return PTR_ERR(vdpasim_dev);
}
static void __exit vdpasim_dev_exit(void)
{
struct vdpa_device *vdpa = &vdpasim_dev->vdpa;
vdpa_unregister_device(vdpa);
}
module_init(vdpasim_dev_init)
module_exit(vdpasim_dev_exit)
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
...
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2020, Red Hat Inc. All rights reserved.
*/
#ifndef _VDPA_SIM_H
#define _VDPA_SIM_H
#include <linux/vringh.h>
#include <linux/vdpa.h>
#include <linux/virtio_byteorder.h>
#include <linux/vhost_iotlb.h>
#include <uapi/linux/virtio_config.h>
#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \
(1ULL << VIRTIO_F_VERSION_1) | \
(1ULL << VIRTIO_F_ACCESS_PLATFORM))
struct vdpasim;
struct vdpasim_virtqueue {
struct vringh vring;
struct vringh_kiov in_iov;
struct vringh_kiov out_iov;
unsigned short head;
bool ready;
u64 desc_addr;
u64 device_addr;
u64 driver_addr;
u32 num;
void *private;
irqreturn_t (*cb)(void *data);
};
struct vdpasim_dev_attr {
u64 supported_features;
size_t config_size;
size_t buffer_size;
int nvqs;
u32 id;
work_func_t work_fn;
void (*get_config)(struct vdpasim *vdpasim, void *config);
void (*set_config)(struct vdpasim *vdpasim, const void *config);
};
/* State of each vdpasim device */
struct vdpasim {
struct vdpa_device vdpa;
struct vdpasim_virtqueue *vqs;
struct work_struct work;
struct vdpasim_dev_attr dev_attr;
/* spinlock to synchronize virtqueue state */
spinlock_t lock;
/* virtio config according to device type */
void *config;
struct vhost_iotlb *iommu;
void *buffer;
u32 status;
u32 generation;
u64 features;
/* spinlock to synchronize iommu table */
spinlock_t iommu_lock;
};
struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr);
/* TODO: cross-endian support */
static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim)
{
return virtio_legacy_is_little_endian() ||
(vdpasim->features & (1ULL << VIRTIO_F_VERSION_1));
}
static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val)
{
return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val)
{
return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val);
}
static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val)
{
return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val)
{
return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val);
}
static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val)
{
return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val);
}
static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val)
{
return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val);
}
#endif
// SPDX-License-Identifier: GPL-2.0-only
/*
* VDPA simulator for networking device.
*
* Copyright (c) 2020, Red Hat Inc. All rights reserved.
* Author: Jason Wang <jasowang@redhat.com>
*
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/etherdevice.h>
#include <linux/vringh.h>
#include <linux/vdpa.h>
#include <uapi/linux/virtio_net.h>
#include "vdpa_sim.h"
#define DRV_VERSION "0.1"
#define DRV_AUTHOR "Jason Wang <jasowang@redhat.com>"
#define DRV_DESC "vDPA Device Simulator for networking device"
#define DRV_LICENSE "GPL v2"
#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \
(1ULL << VIRTIO_NET_F_MAC))
#define VDPASIM_NET_VQ_NUM 2
static char *macaddr;
module_param(macaddr, charp, 0);
MODULE_PARM_DESC(macaddr, "Ethernet MAC address");
u8 macaddr_buf[ETH_ALEN];
static struct vdpasim *vdpasim_net_dev;
static void vdpasim_net_work(struct work_struct *work)
{
struct vdpasim *vdpasim = container_of(work, struct vdpasim, work);
struct vdpasim_virtqueue *txq = &vdpasim->vqs[1];
struct vdpasim_virtqueue *rxq = &vdpasim->vqs[0];
ssize_t read, write;
size_t total_write;
int pkts = 0;
int err;
spin_lock(&vdpasim->lock);
if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
goto out;
if (!txq->ready || !rxq->ready)
goto out;
while (true) {
total_write = 0;
err = vringh_getdesc_iotlb(&txq->vring, &txq->out_iov, NULL,
&txq->head, GFP_ATOMIC);
if (err <= 0)
break;
err = vringh_getdesc_iotlb(&rxq->vring, NULL, &rxq->in_iov,
&rxq->head, GFP_ATOMIC);
if (err <= 0) {
vringh_complete_iotlb(&txq->vring, txq->head, 0);
break;
}
while (true) {
read = vringh_iov_pull_iotlb(&txq->vring, &txq->out_iov,
vdpasim->buffer,
PAGE_SIZE);
if (read <= 0)
break;
write = vringh_iov_push_iotlb(&rxq->vring, &rxq->in_iov,
vdpasim->buffer, read);
if (write <= 0)
break;
total_write += write;
}
/* Make sure data is written before advancing index */
smp_wmb();
vringh_complete_iotlb(&txq->vring, txq->head, 0);
vringh_complete_iotlb(&rxq->vring, rxq->head, total_write);
/* Make sure used is visible before raising the interrupt. */
smp_wmb();
local_bh_disable();
if (vringh_need_notify_iotlb(&txq->vring) > 0)
vringh_notify(&txq->vring);
if (vringh_need_notify_iotlb(&rxq->vring) > 0)
vringh_notify(&rxq->vring);
local_bh_enable();
if (++pkts > 4) {
schedule_work(&vdpasim->work);
goto out;
}
}
out:
spin_unlock(&vdpasim->lock);
}
static void vdpasim_net_get_config(struct vdpasim *vdpasim, void *config)
{
struct virtio_net_config *net_config =
(struct virtio_net_config *)config;
net_config->mtu = cpu_to_vdpasim16(vdpasim, 1500);
net_config->status = cpu_to_vdpasim16(vdpasim, VIRTIO_NET_S_LINK_UP);
memcpy(net_config->mac, macaddr_buf, ETH_ALEN);
}
static int __init vdpasim_net_init(void)
{
struct vdpasim_dev_attr dev_attr = {};
int ret;
if (macaddr) {
mac_pton(macaddr, macaddr_buf);
if (!is_valid_ether_addr(macaddr_buf)) {
ret = -EADDRNOTAVAIL;
goto out;
}
} else {
eth_random_addr(macaddr_buf);
}
dev_attr.id = VIRTIO_ID_NET;
dev_attr.supported_features = VDPASIM_NET_FEATURES;
dev_attr.nvqs = VDPASIM_NET_VQ_NUM;
dev_attr.config_size = sizeof(struct virtio_net_config);
dev_attr.get_config = vdpasim_net_get_config;
dev_attr.work_fn = vdpasim_net_work;
dev_attr.buffer_size = PAGE_SIZE;
vdpasim_net_dev = vdpasim_create(&dev_attr);
if (IS_ERR(vdpasim_net_dev)) {
ret = PTR_ERR(vdpasim_net_dev);
goto out;
}
ret = vdpa_register_device(&vdpasim_net_dev->vdpa);
if (ret)
goto put_dev;
return 0;
put_dev:
put_device(&vdpasim_net_dev->vdpa.dev);
out:
return ret;
}
static void __exit vdpasim_net_exit(void)
{
struct vdpa_device *vdpa = &vdpasim_net_dev->vdpa;
vdpa_unregister_device(vdpa);
}
module_init(vdpasim_net_init);
module_exit(vdpasim_net_exit);
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE(DRV_LICENSE);
MODULE_AUTHOR(DRV_AUTHOR);
MODULE_DESCRIPTION(DRV_DESC);
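
vdpa_sim_net above is the first user of the split-out core: everything device-specific is passed through vdpasim_dev_attr. A hypothetical second simulator built on the same API would only fill in its own attributes, roughly as sketched below (the block-device values and the no-op work function are invented for illustration and are not part of this series):

#include <linux/virtio_blk.h>
#include "vdpa_sim.h"

static void example_blk_work(struct work_struct *work)
{
	/* a real simulator would service its virtqueues here */
}

static int __init example_blk_sim_init(void)
{
	struct vdpasim_dev_attr dev_attr = {};
	struct vdpasim *simdev;

	dev_attr.id = VIRTIO_ID_BLOCK;
	dev_attr.supported_features = VDPASIM_FEATURES;
	dev_attr.nvqs = 1;
	dev_attr.config_size = sizeof(struct virtio_blk_config);
	dev_attr.work_fn = example_blk_work;
	dev_attr.buffer_size = PAGE_SIZE;

	simdev = vdpasim_create(&dev_attr);
	if (IS_ERR(simdev))
		return PTR_ERR(simdev);

	return vdpa_register_device(&simdev->vdpa);
}
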
@@ -1643,7 +1643,8 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
			if (!vhost_vq_is_setup(vq))
				continue;

-			if (vhost_scsi_setup_vq_cmds(vq, vq->num))
+			ret = vhost_scsi_setup_vq_cmds(vq, vq->num);
+			if (ret)
				goto destroy_vq_cmds;
		}
...
@@ -245,14 +245,10 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v,
		return -EFAULT;
	if (vhost_vdpa_config_validate(v, &config))
		return -EINVAL;
-	buf = kvzalloc(config.len, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	if (copy_from_user(buf, c->buf, config.len)) {
-		kvfree(buf);
-		return -EFAULT;
-	}
+
+	buf = vmemdup_user(c->buf, config.len);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);

	ops->set_config(vdpa, config.off, buf, config.len);
...
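
The vhost-vdpa change replaces an open-coded kvzalloc() + copy_from_user() pair with vmemdup_user(), which returns either a kernel copy of the user buffer or an ERR_PTR() encoding the error. A minimal sketch of that pattern (function and variable names are illustrative):

#include <linux/err.h>
#include <linux/string.h>	/* vmemdup_user() */
#include <linux/mm.h>		/* kvfree() */

static int example_copy_from_userspace(const void __user *ubuf, size_t len)
{
	void *buf;

	buf = vmemdup_user(ubuf, len);	/* kvmalloc + copy_from_user in one step */
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* ... consume buf ... */

	kvfree(buf);	/* vmemdup_user() memory is freed with kvfree() */
	return 0;
}
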
...@@ -27,20 +27,74 @@ static bool unplug_online = true; ...@@ -27,20 +27,74 @@ static bool unplug_online = true;
module_param(unplug_online, bool, 0644); module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory"); MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
enum virtio_mem_mb_state { static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
"Force Big Block Mode. Default is 0 (auto-selection)");
static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
"Big Block size in bytes. Default is 0 (auto-detection).");
static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
"Use a safe unplug mechanism in BBM, avoiding long/endless loops");
/*
* virtio-mem currently supports the following modes of operation:
*
* * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
* size of a Sub Block (SB) is determined based on the device block size, the
* pageblock size, and the maximum allocation granularity of the buddy.
* Subblocks within a Linux memory block might either be plugged or unplugged.
* Memory is added/removed to Linux MM in Linux memory block granularity.
*
* * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
* Memory is added/removed to Linux MM in Big Block granularity.
*
* The mode is determined automatically based on the Linux memory block size
* and the device block size.
*
* User space / core MM (auto onlining) is responsible for onlining added
* Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
* always onlined separately, and all memory within a Linux memory block is
* onlined to the same zone - virtio-mem relies on this behavior.
*/
/*
* State of a Linux memory block in SBM.
*/
enum virtio_mem_sbm_mb_state {
/* Unplugged, not added to Linux. Can be reused later. */ /* Unplugged, not added to Linux. Can be reused later. */
VIRTIO_MEM_MB_STATE_UNUSED = 0, VIRTIO_MEM_SBM_MB_UNUSED = 0,
/* (Partially) plugged, not added to Linux. Error on add_memory(). */ /* (Partially) plugged, not added to Linux. Error on add_memory(). */
VIRTIO_MEM_MB_STATE_PLUGGED, VIRTIO_MEM_SBM_MB_PLUGGED,
/* Fully plugged, fully added to Linux, offline. */ /* Fully plugged, fully added to Linux, offline. */
VIRTIO_MEM_MB_STATE_OFFLINE, VIRTIO_MEM_SBM_MB_OFFLINE,
/* Partially plugged, fully added to Linux, offline. */ /* Partially plugged, fully added to Linux, offline. */
VIRTIO_MEM_MB_STATE_OFFLINE_PARTIAL, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
/* Fully plugged, fully added to Linux, online. */ /* Fully plugged, fully added to Linux, online. */
VIRTIO_MEM_MB_STATE_ONLINE, VIRTIO_MEM_SBM_MB_ONLINE,
/* Partially plugged, fully added to Linux, online. */ /* Partially plugged, fully added to Linux, online. */
VIRTIO_MEM_MB_STATE_ONLINE_PARTIAL, VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL,
VIRTIO_MEM_MB_STATE_COUNT VIRTIO_MEM_SBM_MB_COUNT
};
/*
* State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
*/
enum virtio_mem_bbm_bb_state {
/* Unplugged, not added to Linux. Can be reused later. */
VIRTIO_MEM_BBM_BB_UNUSED = 0,
/* Plugged, not added to Linux. Error on add_memory(). */
VIRTIO_MEM_BBM_BB_PLUGGED,
/* Plugged and added to Linux. */
VIRTIO_MEM_BBM_BB_ADDED,
/* All online parts are fake-offline, ready to remove. */
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
VIRTIO_MEM_BBM_BB_COUNT
}; };
struct virtio_mem { struct virtio_mem {
@@ -51,6 +105,7 @@ struct virtio_mem {

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
+	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
@@ -70,27 +125,13 @@ struct virtio_mem {
	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;

-	/* The translated node id. NUMA_NO_NODE in case not specified. */
+	/* The determined node id for all memory of the device. */
	int nid;

	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;
/* The subblock size. */
uint64_t subblock_size;
/* The number of subblocks per memory block. */
uint32_t nb_sb_per_mb;
/* Id of the first memory block of this device. */
unsigned long first_mb_id;
/* Id of the last memory block of this device. */
unsigned long last_mb_id;
/* Id of the last usable memory block of this device. */
unsigned long last_usable_mb_id;
/* Id of the next memory bock to prepare when needed. */
unsigned long next_mb_id;
	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;

	/*
@@ -99,31 +140,79 @@ struct virtio_mem {
	 */
	const char *resource_name;
/* Summary of all memory block states. */
unsigned long nb_mb_state[VIRTIO_MEM_MB_STATE_COUNT];
#define VIRTIO_MEM_NB_OFFLINE_THRESHOLD 10
/*
* One byte state per memory block.
*
* Allocated via vmalloc(). When preparing new blocks, resized
* (alloc+copy+free) when needed (crossing pages with the next mb).
* (when crossing pages).
*
* With 128MB memory blocks, we have states for 512GB of memory in one
* page.
*/
uint8_t *mb_state;
/* /*
* $nb_sb_per_mb bit per memory block. Handled similar to mb_state. * We don't want to add too much memory if it's not getting onlined,
* * to avoid running OOM. Besides this threshold, we allow to have at
* With 4MB subblocks, we manage 128GB of memory in one page. * least two offline blocks at a time (whatever is bigger).
*/ */
unsigned long *sb_bitmap; #define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD (1024 * 1024 * 1024)
atomic64_t offline_size;
uint64_t offline_threshold;
/* If set, the driver is in SBM, otherwise in BBM. */
bool in_sbm;
union {
struct {
/* Id of the first memory block of this device. */
unsigned long first_mb_id;
/* Id of the last usable memory block of this device. */
unsigned long last_usable_mb_id;
/* Id of the next memory block to prepare when needed. */
unsigned long next_mb_id;
/* The subblock size. */
uint64_t sb_size;
/* The number of subblocks per Linux memory block. */
uint32_t sbs_per_mb;
/* Summary of all memory block states. */
unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
/*
* One byte state per memory block. Allocated via
* vmalloc(). Resized (alloc+copy+free) on demand.
*
* With 128 MiB memory blocks, we have states for 512
* GiB of memory in one 4 KiB page.
*/
uint8_t *mb_states;
/*
* Bitmap: one bit per subblock. Allocated similar to
* sbm.mb_states.
*
* A set bit means the corresponding subblock is
* plugged, otherwise it's unplugged.
*
* With 4 MiB subblocks, we manage 128 GiB of memory
* in one 4 KiB page.
*/
unsigned long *sb_states;
} sbm;
struct {
/* Id of the first big block of this device. */
unsigned long first_bb_id;
/* Id of the last usable big block of this device. */
unsigned long last_usable_bb_id;
/* Id of the next device block to prepare when needed. */
unsigned long next_bb_id;
/* Summary of all big block states. */
unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
/* One byte state per big block. See sbm.mb_states. */
uint8_t *bb_states;
/* The block size used for plugging/adding/removing. */
uint64_t bb_size;
} bbm;
};
	/*
-	 * Mutex that protects the nb_mb_state, mb_state, and sb_bitmap.
+	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
+	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
...@@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex); ...@@ -160,6 +249,11 @@ static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices); static LIST_HEAD(virtio_mem_devices);
static void virtio_mem_online_page_cb(struct page *page, unsigned int order); static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);
/* /*
* Register a virtio-mem device so it will be considered for the online_page * Register a virtio-mem device so it will be considered for the online_page
...@@ -212,6 +306,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id) ...@@ -212,6 +306,24 @@ static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
return mb_id * memory_block_size_bytes(); return mb_id * memory_block_size_bytes();
} }
/*
* Calculate the big block id of a given address.
*/
static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
uint64_t addr)
{
return addr / vm->bbm.bb_size;
}
/*
* Calculate the physical start address of a given big block id.
*/
static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
unsigned long bb_id)
{
return bb_id * vm->bbm.bb_size;
}
/*
 * Calculate the subblock id of a given address.
 */
@@ -221,89 +333,164 @@ static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);

-	return (addr - mb_addr) / vm->subblock_size;
+	return (addr - mb_addr) / vm->sbm.sb_size;
}
/*
* Set the state of a big block, taking care of the state counter.
*/
static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
unsigned long bb_id,
enum virtio_mem_bbm_bb_state state)
{
const unsigned long idx = bb_id - vm->bbm.first_bb_id;
enum virtio_mem_bbm_bb_state old_state;
old_state = vm->bbm.bb_states[idx];
vm->bbm.bb_states[idx] = state;
BUG_ON(vm->bbm.bb_count[old_state] == 0);
vm->bbm.bb_count[old_state]--;
vm->bbm.bb_count[state]++;
}
/*
* Get the state of a big block.
*/
static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
unsigned long bb_id)
{
return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
}
/*
* Prepare the big block state array for the next big block.
*/
static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
{
unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
unsigned long new_bytes = old_bytes + 1;
int old_pages = PFN_UP(old_bytes);
int new_pages = PFN_UP(new_bytes);
uint8_t *new_array;
if (vm->bbm.bb_states && old_pages == new_pages)
return 0;
new_array = vzalloc(new_pages * PAGE_SIZE);
if (!new_array)
return -ENOMEM;
mutex_lock(&vm->hotplug_mutex);
if (vm->bbm.bb_states)
memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
vfree(vm->bbm.bb_states);
vm->bbm.bb_states = new_array;
mutex_unlock(&vm->hotplug_mutex);
return 0;
}
#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
for (_bb_id = vm->bbm.first_bb_id; \
_bb_id < vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
_bb_id++) \
if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
for (_bb_id = vm->bbm.next_bb_id - 1; \
_bb_id >= vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
_bb_id--) \
if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
/* /*
* Set the state of a memory block, taking care of the state counter. * Set the state of a memory block, taking care of the state counter.
*/ */
static void virtio_mem_mb_set_state(struct virtio_mem *vm, unsigned long mb_id, static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
enum virtio_mem_mb_state state) unsigned long mb_id, uint8_t state)
{ {
const unsigned long idx = mb_id - vm->first_mb_id; const unsigned long idx = mb_id - vm->sbm.first_mb_id;
enum virtio_mem_mb_state old_state; uint8_t old_state;
old_state = vm->mb_state[idx]; old_state = vm->sbm.mb_states[idx];
vm->mb_state[idx] = state; vm->sbm.mb_states[idx] = state;
BUG_ON(vm->nb_mb_state[old_state] == 0); BUG_ON(vm->sbm.mb_count[old_state] == 0);
vm->nb_mb_state[old_state]--; vm->sbm.mb_count[old_state]--;
vm->nb_mb_state[state]++; vm->sbm.mb_count[state]++;
} }
/* /*
* Get the state of a memory block. * Get the state of a memory block.
*/ */
static enum virtio_mem_mb_state virtio_mem_mb_get_state(struct virtio_mem *vm, static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
unsigned long mb_id) unsigned long mb_id)
{ {
const unsigned long idx = mb_id - vm->first_mb_id; const unsigned long idx = mb_id - vm->sbm.first_mb_id;
return vm->mb_state[idx]; return vm->sbm.mb_states[idx];
} }
/* /*
* Prepare the state array for the next memory block. * Prepare the state array for the next memory block.
*/ */
static int virtio_mem_mb_state_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
{ {
unsigned long old_bytes = vm->next_mb_id - vm->first_mb_id + 1; int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
unsigned long new_bytes = vm->next_mb_id - vm->first_mb_id + 2; int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
int old_pages = PFN_UP(old_bytes); uint8_t *new_array;
int new_pages = PFN_UP(new_bytes);
uint8_t *new_mb_state;
if (vm->mb_state && old_pages == new_pages) if (vm->sbm.mb_states && old_pages == new_pages)
return 0; return 0;
new_mb_state = vzalloc(new_pages * PAGE_SIZE); new_array = vzalloc(new_pages * PAGE_SIZE);
if (!new_mb_state) if (!new_array)
return -ENOMEM; return -ENOMEM;
mutex_lock(&vm->hotplug_mutex); mutex_lock(&vm->hotplug_mutex);
if (vm->mb_state) if (vm->sbm.mb_states)
memcpy(new_mb_state, vm->mb_state, old_pages * PAGE_SIZE); memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
vfree(vm->mb_state); vfree(vm->sbm.mb_states);
vm->mb_state = new_mb_state; vm->sbm.mb_states = new_array;
mutex_unlock(&vm->hotplug_mutex); mutex_unlock(&vm->hotplug_mutex);
return 0; return 0;
} }
#define virtio_mem_for_each_mb_state(_vm, _mb_id, _state) \ #define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
for (_mb_id = _vm->first_mb_id; \ for (_mb_id = _vm->sbm.first_mb_id; \
_mb_id < _vm->next_mb_id && _vm->nb_mb_state[_state]; \ _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
_mb_id++) \ _mb_id++) \
if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
#define virtio_mem_for_each_mb_state_rev(_vm, _mb_id, _state) \ #define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
for (_mb_id = _vm->next_mb_id - 1; \ for (_mb_id = _vm->sbm.next_mb_id - 1; \
_mb_id >= _vm->first_mb_id && _vm->nb_mb_state[_state]; \ _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
_mb_id--) \ _mb_id--) \
if (virtio_mem_mb_get_state(_vm, _mb_id) == _state) if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
/*
* Calculate the bit number in the subblock bitmap for the given subblock
* inside the given memory block.
*/
static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
unsigned long mb_id, int sb_id)
{
return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
}
/* /*
* Mark all selected subblocks plugged. * Mark all selected subblocks plugged.
* *
* Will not modify the state of the memory block. * Will not modify the state of the memory block.
*/ */
static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id, unsigned long mb_id, int sb_id,
int count) int count)
{ {
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
__bitmap_set(vm->sb_bitmap, bit, count); __bitmap_set(vm->sbm.sb_states, bit, count);
} }
/* /*
...@@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm, ...@@ -311,105 +498,114 @@ static void virtio_mem_mb_set_sb_plugged(struct virtio_mem *vm,
* *
* Will not modify the state of the memory block. * Will not modify the state of the memory block.
*/ */
static void virtio_mem_mb_set_sb_unplugged(struct virtio_mem *vm, static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id, unsigned long mb_id, int sb_id,
int count) int count)
{ {
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
__bitmap_clear(vm->sb_bitmap, bit, count); __bitmap_clear(vm->sbm.sb_states, bit, count);
} }
/* /*
* Test if all selected subblocks are plugged. * Test if all selected subblocks are plugged.
*/ */
static bool virtio_mem_mb_test_sb_plugged(struct virtio_mem *vm, static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id, unsigned long mb_id, int sb_id,
int count) int count)
{ {
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
if (count == 1) if (count == 1)
return test_bit(bit, vm->sb_bitmap); return test_bit(bit, vm->sbm.sb_states);
/* TODO: Helper similar to bitmap_set() */ /* TODO: Helper similar to bitmap_set() */
return find_next_zero_bit(vm->sb_bitmap, bit + count, bit) >= return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
bit + count; bit + count;
} }
/* /*
* Test if all selected subblocks are unplugged. * Test if all selected subblocks are unplugged.
*/ */
static bool virtio_mem_mb_test_sb_unplugged(struct virtio_mem *vm, static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
unsigned long mb_id, int sb_id, unsigned long mb_id, int sb_id,
int count) int count)
{ {
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb + sb_id; const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
/* TODO: Helper similar to bitmap_set() */ /* TODO: Helper similar to bitmap_set() */
return find_next_bit(vm->sb_bitmap, bit + count, bit) >= bit + count; return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
bit + count;
} }
/* /*
* Find the first unplugged subblock. Returns vm->nb_sb_per_mb in case there is * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
* none. * none.
*/ */
static int virtio_mem_mb_first_unplugged_sb(struct virtio_mem *vm, static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
unsigned long mb_id) unsigned long mb_id)
{ {
const int bit = (mb_id - vm->first_mb_id) * vm->nb_sb_per_mb; const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
return find_next_zero_bit(vm->sb_bitmap, bit + vm->nb_sb_per_mb, bit) - return find_next_zero_bit(vm->sbm.sb_states,
bit; bit + vm->sbm.sbs_per_mb, bit) - bit;
} }
/* /*
* Prepare the subblock bitmap for the next memory block. * Prepare the subblock bitmap for the next memory block.
*/ */
static int virtio_mem_sb_bitmap_prepare_next_mb(struct virtio_mem *vm) static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
{ {
const unsigned long old_nb_mb = vm->next_mb_id - vm->first_mb_id; const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
const unsigned long old_nb_bits = old_nb_mb * vm->nb_sb_per_mb; const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->nb_sb_per_mb; const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long)); int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long)); int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
unsigned long *new_sb_bitmap, *old_sb_bitmap; unsigned long *new_bitmap, *old_bitmap;
if (vm->sb_bitmap && old_pages == new_pages) if (vm->sbm.sb_states && old_pages == new_pages)
return 0; return 0;
new_sb_bitmap = vzalloc(new_pages * PAGE_SIZE); new_bitmap = vzalloc(new_pages * PAGE_SIZE);
if (!new_sb_bitmap) if (!new_bitmap)
return -ENOMEM; return -ENOMEM;
mutex_lock(&vm->hotplug_mutex); mutex_lock(&vm->hotplug_mutex);
if (new_sb_bitmap) if (new_bitmap)
memcpy(new_sb_bitmap, vm->sb_bitmap, old_pages * PAGE_SIZE); memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
old_sb_bitmap = vm->sb_bitmap; old_bitmap = vm->sbm.sb_states;
vm->sb_bitmap = new_sb_bitmap; vm->sbm.sb_states = new_bitmap;
mutex_unlock(&vm->hotplug_mutex); mutex_unlock(&vm->hotplug_mutex);
vfree(old_sb_bitmap); vfree(old_bitmap);
return 0; return 0;
} }
/* /*
* Try to add a memory block to Linux. This will usually only fail * Test if we could add memory without creating too much offline memory -
* if out of memory. * to avoid running OOM if memory is getting onlined deferred.
*/
static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
{
if (WARN_ON_ONCE(size > vm->offline_threshold))
return false;
return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
}
/*
* Try adding memory to Linux. Will usually only fail if out of memory.
* *
* Must not be called with the vm->hotplug_mutex held (possible deadlock with * Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code). * onlining code).
* *
* Will not modify the state of the memory block. * Will not modify the state of memory blocks in virtio-mem.
*/ */
static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
uint64_t size)
{ {
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); int rc;
int nid = vm->nid;
if (nid == NUMA_NO_NODE)
nid = memory_add_physaddr_to_nid(addr);
/* /*
* When force-unloading the driver and we still have memory added to * When force-unloading the driver and we still have memory added to
...@@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id) ...@@ -422,53 +618,155 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
return -ENOMEM; return -ENOMEM;
} }
dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id); dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
return add_memory_driver_managed(nid, addr, memory_block_size_bytes(), addr + size - 1);
vm->resource_name, /* Memory might get onlined immediately. */
MEMHP_MERGE_RESOURCE); atomic64_add(size, &vm->offline_size);
rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
MEMHP_MERGE_RESOURCE);
if (rc) {
atomic64_sub(size, &vm->offline_size);
dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
/*
* TODO: Linux MM does not properly clean up yet in all cases
* where adding of memory failed - especially on -ENOMEM.
*/
}
return rc;
}
/*
* See virtio_mem_add_memory(): Try adding a single Linux memory block.
*/
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
const uint64_t size = memory_block_size_bytes();
return virtio_mem_add_memory(vm, addr, size);
}
/*
* See virtio_mem_add_memory(): Try adding a big block.
*/
static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
return virtio_mem_add_memory(vm, addr, size);
} }
/* /*
* Try to remove a memory block from Linux. Will only fail if the memory block * Try removing memory from Linux. Will only fail if memory blocks aren't
* is not offline. * offline.
* *
* Must not be called with the vm->hotplug_mutex held (possible deadlock with * Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code). * onlining code).
* *
* Will not modify the state of the memory block. * Will not modify the state of memory blocks in virtio-mem.
*/
static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
uint64_t size)
{
int rc;
dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
rc = remove_memory(vm->nid, addr, size);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
* We might have freed up memory we can now unplug, retry
* immediately instead of waiting.
*/
virtio_mem_retry(vm);
} else {
dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
}
return rc;
}
/*
* See virtio_mem_remove_memory(): Try removing a single Linux memory block.
*/ */
static int virtio_mem_mb_remove(struct virtio_mem *vm, unsigned long mb_id) static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{ {
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id); const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
int nid = vm->nid; const uint64_t size = memory_block_size_bytes();
if (nid == NUMA_NO_NODE) return virtio_mem_remove_memory(vm, addr, size);
nid = memory_add_physaddr_to_nid(addr); }
/*
* See virtio_mem_remove_memory(): Try to remove all Linux memory blocks covered
* by the big block.
*/
static int virtio_mem_bbm_remove_bb(struct virtio_mem *vm, unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
dev_dbg(&vm->vdev->dev, "removing memory block: %lu\n", mb_id); return virtio_mem_remove_memory(vm, addr, size);
return remove_memory(nid, addr, memory_block_size_bytes());
} }
/* /*
* Try to offline and remove a memory block from Linux. * Try offlining and removing memory from Linux.
* *
* Must not be called with the vm->hotplug_mutex held (possible deadlock with * Must not be called with the vm->hotplug_mutex held (possible deadlock with
* onlining code). * onlining code).
* *
* Will not modify the state of the memory block. * Will not modify the state of memory blocks in virtio-mem.
*/ */
static int virtio_mem_mb_offline_and_remove(struct virtio_mem *vm, static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
unsigned long mb_id) uint64_t addr,
uint64_t size)
{
int rc;
dev_dbg(&vm->vdev->dev,
"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);
rc = offline_and_remove_memory(vm->nid, addr, size);
if (!rc) {
atomic64_sub(size, &vm->offline_size);
/*
* We might have freed up memory we can now unplug, retry
* immediately instead of waiting.
*/
virtio_mem_retry(vm);
} else {
dev_dbg(&vm->vdev->dev,
"offlining and removing memory failed: %d\n", rc);
}
return rc;
}
/*
* See virtio_mem_offline_and_remove_memory(): Try offlining and removing
* a single Linux memory block.
*/
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
unsigned long mb_id)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
const uint64_t size = memory_block_size_bytes();

return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
* all Linux memory blocks covered by the big block.
*/
static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
dev_dbg(&vm->vdev->dev, "offlining and removing memory block: %lu\n", return virtio_mem_offline_and_remove_memory(vm, addr, size);
mb_id);
return offline_and_remove_memory(nid, addr, memory_block_size_bytes());
} }
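
For illustration only (not part of the driver): the SBM wrappers above always cover one Linux memory block, while the BBM wrappers cover one big block, a multiple of that size. A minimal userspace sketch of the id/address arithmetic, with assumed sizes and a hypothetical device start address; whether big-block ids are global or relative to the device start is treated here as an assumption.

#include <stdint.h>
#include <stdio.h>

/* Assumed values, for illustration only. */
#define MB_SIZE   (128ULL << 20)        /* Linux memory block size */
#define BB_SIZE   (1024ULL << 20)       /* big block size (multiple of MB_SIZE) */
#define DEV_ADDR  (4ULL << 30)          /* hypothetical device start address */

/* Memory-block ids are global: address / block size. */
static uint64_t phys_to_mb_id(uint64_t addr) { return addr / MB_SIZE; }

/* Big-block ids are assumed relative to the device start here. */
static uint64_t bb_id_to_phys(uint64_t bb_id) { return DEV_ADDR + bb_id * BB_SIZE; }
static uint64_t phys_to_bb_id(uint64_t addr) { return (addr - DEV_ADDR) / BB_SIZE; }

int main(void)
{
    uint64_t addr = DEV_ADDR + 3 * MB_SIZE + 4096;

    printf("addr 0x%llx -> mb_id %llu, bb_id %llu\n",
           (unsigned long long)addr,
           (unsigned long long)phys_to_mb_id(addr),
           (unsigned long long)phys_to_bb_id(addr));
    printf("bb 1 covers 0x%llx - 0x%llx (%llu memory blocks)\n",
           (unsigned long long)bb_id_to_phys(1),
           (unsigned long long)(bb_id_to_phys(1) + BB_SIZE - 1),
           (unsigned long long)(BB_SIZE / MB_SIZE));
    return 0;
}
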
@@ -499,31 +797,28 @@ static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
/*
 * Test if a virtio-mem device overlaps with the given range. Can be called
 * from (notifier) callbacks lockless.
 */
static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
uint64_t size)
{
return start < vm->addr + vm->region_size && vm->addr < start + size;
}

/*
 * Test if a virtio-mem device contains a given range. Can be called from
 * (notifier) callbacks lockless.
 */
static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
uint64_t size)
{
return start >= vm->addr && start + size <= vm->addr + vm->region_size;
}

static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
unsigned long mb_id)
{
switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
case VIRTIO_MEM_SBM_MB_OFFLINE:
return NOTIFY_OK;
default:
break;
}
@@ -533,108 +828,100 @@ static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
return NOTIFY_BAD;
}
static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
unsigned long mb_id)
{
switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
case VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
break;
case VIRTIO_MEM_SBM_MB_ONLINE:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE);
break;
default:
BUG();
break;
}
}

static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
unsigned long mb_id)
{
switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
break;
case VIRTIO_MEM_SBM_MB_OFFLINE:
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE);
break;
default:
BUG();
break;
}
}

static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
unsigned long mb_id)
{
const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
unsigned long pfn;
int sb_id;

for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
continue;
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size);
virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}
}

static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
unsigned long mb_id)
{
const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
unsigned long pfn;
int sb_id;

for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
continue;
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size);
virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}
}
static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
unsigned long bb_id,
unsigned long pfn,
unsigned long nr_pages)
{
/*
* When marked as "fake-offline", all online memory of this device block
* is allocated by us. Otherwise, we don't have any memory allocated.
*/
if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
return;
virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}
static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
unsigned long bb_id,
unsigned long pfn,
unsigned long nr_pages)
{
if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
return;
virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}
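
For illustration only (not part of the driver): the going-offline/cancel-offline pair relies on fake-offline pages carrying one extra reference that the driver owns, dropped when offlining starts and taken back if offlining is cancelled. A toy userspace model of that hand-off, with plain counters standing in for struct page references:

#include <assert.h>
#include <stdio.h>

#define NR_PAGES 8

/* One driver-owned "reference" per fake-offline page. */
static int page_ref[NR_PAGES];

static void fake_offline(void)
{
    for (int i = 0; i < NR_PAGES; i++)
        page_ref[i] = 1;    /* driver holds the reference */
}

/* MEM_GOING_OFFLINE: drop the references so offlining can proceed. */
static void going_offline(void)
{
    for (int i = 0; i < NR_PAGES; i++) {
        page_ref[i]--;
        assert(page_ref[i] == 0);
    }
}

/* MEM_CANCEL_OFFLINE: offlining failed, take the references back. */
static void cancel_offline(void)
{
    for (int i = 0; i < NR_PAGES; i++)
        page_ref[i]++;
}

int main(void)
{
    fake_offline();
    going_offline();
    cancel_offline();    /* state is as before going_offline() */
    printf("page 0 ref after cancel: %d\n", page_ref[0]);
    return 0;
}
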
/*
 * This callback will either be called synchronously from add_memory() or
 * asynchronously (e.g., triggered via user space). We have to be careful
@@ -648,20 +935,33 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
struct memory_notify *mhp = arg;
const unsigned long start = PFN_PHYS(mhp->start_pfn);
const unsigned long size = PFN_PHYS(mhp->nr_pages);
int rc = NOTIFY_OK;
unsigned long id;

if (!virtio_mem_overlaps_range(vm, start, size))
return NOTIFY_DONE;

if (vm->in_sbm) {
id = virtio_mem_phys_to_mb_id(start);
/*
 * In SBM, we add memory in separate memory blocks - we expect
 * it to be onlined/offlined in the same granularity. Bail out
 * if this ever changes.
 */
if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
!IS_ALIGNED(start, memory_block_size_bytes())))
return NOTIFY_BAD;
} else {
id = virtio_mem_phys_to_bb_id(vm, start);
/*
* In BBM, we only care about onlining/offlining happening
* within a single big block, we don't care about the
* actual granularity as we don't track individual Linux
* memory blocks.
*/
if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
return NOTIFY_BAD;
}
/*
 * Avoid circular locking lockdep warnings. We lock the mutex
@@ -680,7 +980,12 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
break;
}
vm->hotplug_active = true;
if (vm->in_sbm)
virtio_mem_sbm_notify_going_offline(vm, id);
else
virtio_mem_bbm_notify_going_offline(vm, id,
mhp->start_pfn,
mhp->nr_pages);
break;
case MEM_GOING_ONLINE:
mutex_lock(&vm->hotplug_mutex);
@@ -690,22 +995,51 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
break;
}
vm->hotplug_active = true;
if (vm->in_sbm)
rc = virtio_mem_sbm_notify_going_online(vm, id);
break;
case MEM_OFFLINE:
if (vm->in_sbm)
virtio_mem_sbm_notify_offline(vm, id);

atomic64_add(size, &vm->offline_size);
/*
 * Trigger the workqueue. Now that we have some offline memory,
 * maybe we can handle pending unplug requests.
 */
if (!unplug_online)
virtio_mem_retry(vm);

vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
case MEM_ONLINE:
if (vm->in_sbm)
virtio_mem_sbm_notify_online(vm, id);
atomic64_sub(size, &vm->offline_size);
/*
* Start adding more memory once we onlined half of our
 * threshold. Don't trigger if it's possibly due to our action
* (e.g., us adding memory which gets onlined immediately from
* the core).
*/
if (!atomic_read(&vm->wq_active) &&
virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
virtio_mem_retry(vm);
vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
case MEM_CANCEL_OFFLINE:
if (!vm->hotplug_active)
break;
if (vm->in_sbm)
virtio_mem_sbm_notify_cancel_offline(vm, id);
else
virtio_mem_bbm_notify_cancel_offline(vm, id,
mhp->start_pfn,
mhp->nr_pages);
vm->hotplug_active = false;
mutex_unlock(&vm->hotplug_mutex);
break;
@@ -729,7 +1063,7 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
unsigned long nr_pages, bool onlined)
{
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -748,7 +1082,7 @@ static void virtio_mem_set_fake_offline(unsigned long pfn,
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
unsigned long nr_pages, bool onlined)
{
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -763,16 +1097,17 @@ static void virtio_mem_clear_fake_offline(unsigned long pfn,
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
 */
static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
{
const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
unsigned long i;

/*
 * We are always called at least with MAX_ORDER_NR_PAGES
 * granularity/alignment (e.g., the way subblocks work). All pages
 * inside such a block are alike.
 */
for (i = 0; i < nr_pages; i += max_nr_pages) {
struct page *page = pfn_to_page(pfn + i);

/*
@@ -782,42 +1117,128 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
 * alike.
 */
if (PageDirty(page)) {
virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
false);
generic_online_page(page, MAX_ORDER - 1);
} else {
virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
true);
free_contig_range(pfn + i, max_nr_pages);
adjust_managed_page_count(page, max_nr_pages);
}
}
}
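
For illustration only (not part of the driver): the fake-online walk above can take one decision per MAX_ORDER_NR_PAGES chunk because subblocks are at least that large and aligned. A standalone sketch of the chunking, with assumed page and chunk sizes rather than the kernel's configuration:

#include <stdio.h>

#define PAGE_SIZE           4096UL    /* assumed */
#define MAX_ORDER_NR_PAGES  1024UL    /* assumed: 4 MiB chunks */

int main(void)
{
    unsigned long range = 8UL << 20;  /* assumed 8 MiB fake-offline range */
    unsigned long nr_pages = range / PAGE_SIZE;
    unsigned long i, chunks = 0;

    /* Same loop shape as the fake-online walk: one decision per chunk. */
    for (i = 0; i < nr_pages; i += MAX_ORDER_NR_PAGES)
        chunks++;

    printf("%lu pages -> %lu chunk(s) of %lu pages\n",
           nr_pages, chunks, MAX_ORDER_NR_PAGES);
    return 0;
}
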
/*
* Try to allocate a range, marking pages fake-offline, effectively
* fake-offlining them.
*/
static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
{
const bool is_movable = zone_idx(page_zone(pfn_to_page(pfn))) ==
ZONE_MOVABLE;
int rc, retry_count;
/*
* TODO: We want an alloc_contig_range() mode that tries to allocate
* harder (e.g., dealing with temporarily pinned pages, PCP), especially
* with ZONE_MOVABLE. So for now, retry a couple of times with
* ZONE_MOVABLE before giving up - because that zone is supposed to give
* some guarantees.
*/
for (retry_count = 0; retry_count < 5; retry_count++) {
rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
GFP_KERNEL);
if (rc == -ENOMEM)
/* whoops, out of memory */
return rc;
else if (rc && !is_movable)
break;
else if (rc)
continue;
virtio_mem_set_fake_offline(pfn, nr_pages, true);
adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
return 0;
}
return -EBUSY;
}
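
For illustration only (not part of the driver): virtio_mem_fake_offline() retries a few times because allocations in ZONE_MOVABLE can fail transiently (e.g. temporarily pinned pages), while a hard -ENOMEM ends the attempt immediately. The retry/bail-out shape, reduced to a standalone sketch; try_alloc() is an invented stand-in for the allocation:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for an allocation that can fail transiently. */
static int try_alloc(void)
{
    return (rand() % 3) ? -EAGAIN : 0;
}

static int fake_offline(bool is_movable)
{
    int rc, retry_count;

    for (retry_count = 0; retry_count < 5; retry_count++) {
        rc = try_alloc();
        if (rc == -ENOMEM)
            return rc;      /* hard failure: give up immediately */
        else if (rc && !is_movable)
            break;          /* no retry guarantee outside MOVABLE */
        else if (rc)
            continue;       /* transient failure: retry */
        return 0;           /* success */
    }
    return -EBUSY;
}

int main(void)
{
    printf("fake_offline(movable) -> %d\n", fake_offline(true));
    return 0;
}
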
/*
* Handle fake-offline pages when memory is going offline - such that the
* pages can be skipped by mm-core when offlining.
*/
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
unsigned long nr_pages)
{
struct page *page;
unsigned long i;
/*
* Drop our reference to the pages so the memory can get offlined
* and add the unplugged pages to the managed page counters (so
* offlining code can correctly subtract them again).
*/
adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
/* Drop our reference to the pages so the memory can get offlined. */
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(pfn + i);
if (WARN_ON(!page_ref_dec_and_test(page)))
dump_page(page, "fake-offline page referenced");
}
}
/*
* Handle fake-offline pages when memory offlining is canceled - to undo
* what we did in virtio_mem_fake_offline_going_offline().
*/
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
unsigned long nr_pages)
{
unsigned long i;
/*
* Get the reference we dropped when going offline and subtract the
* unplugged pages from the managed page counters.
*/
adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
for (i = 0; i < nr_pages; i++)
page_ref_inc(pfn_to_page(pfn + i));
}
static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
const unsigned long addr = page_to_phys(page);
unsigned long id, sb_id;
struct virtio_mem *vm;
bool do_online;

rcu_read_lock();
list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
continue;

if (vm->in_sbm) {
/*
 * We exploit here that subblocks have at least
 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
 * cross subblocks within one call.
 */
id = virtio_mem_phys_to_mb_id(addr);
sb_id = virtio_mem_phys_to_sb_id(vm, addr);
do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
sb_id, 1);
} else {
/*
 * If the whole block is marked fake offline, keep
 * everything that way.
 */
id = virtio_mem_phys_to_bb_id(vm, addr);
do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
}
if (do_online)
generic_online_page(page, order);
else
virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
@@ -870,23 +1291,33 @@ static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
};
int rc = -ENOMEM;

if (atomic_read(&vm->config_changed))
return -EAGAIN;

dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);

switch (virtio_mem_send_request(vm, &req)) {
case VIRTIO_MEM_RESP_ACK:
vm->plugged_size += size;
return 0;
case VIRTIO_MEM_RESP_NACK:
rc = -EAGAIN;
break;
case VIRTIO_MEM_RESP_BUSY:
rc = -ETXTBSY;
break;
case VIRTIO_MEM_RESP_ERROR:
rc = -EINVAL;
break;
default:
break;
}

dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
return rc;
}
static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
@@ -898,21 +1329,30 @@ static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
};
int rc = -ENOMEM;

if (atomic_read(&vm->config_changed))
return -EAGAIN;

dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
addr + size - 1);

switch (virtio_mem_send_request(vm, &req)) {
case VIRTIO_MEM_RESP_ACK:
vm->plugged_size -= size;
return 0;
case VIRTIO_MEM_RESP_BUSY:
rc = -ETXTBSY;
break;
case VIRTIO_MEM_RESP_ERROR:
rc = -EINVAL;
break;
default:
break;
}

dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
return rc;
}
static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
@@ -920,6 +1360,9 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
const struct virtio_mem_req req = {
.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
};
int rc = -ENOMEM;

dev_dbg(&vm->vdev->dev, "unplugging all memory");

switch (virtio_mem_send_request(vm, &req)) {
case VIRTIO_MEM_RESP_ACK:
@@ -929,30 +1372,31 @@ static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
atomic_set(&vm->config_changed, 1);
return 0;
case VIRTIO_MEM_RESP_BUSY:
rc = -ETXTBSY;
break;
default:
break;
}

dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
return rc;
}
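
For illustration only (not part of the driver or the virtio-mem uapi): the three request helpers above all funnel the device response into an errno and log failures once at the end. The mapping, pulled out into a standalone sketch with illustrative stand-in response codes:

#include <errno.h>
#include <stdio.h>

/* Illustrative stand-ins for the device response codes. */
enum resp { RESP_ACK, RESP_NACK, RESP_BUSY, RESP_ERROR, RESP_UNKNOWN };

static int resp_to_errno(enum resp resp)
{
    switch (resp) {
    case RESP_ACK:
        return 0;
    case RESP_NACK:
        return -EAGAIN;    /* retry later with the same request */
    case RESP_BUSY:
        return -ETXTBSY;   /* hypervisor busy, retry later */
    case RESP_ERROR:
        return -EINVAL;    /* request rejected as invalid */
    default:
        return -ENOMEM;    /* unknown response, treated as fatal-ish */
    }
}

int main(void)
{
    printf("BUSY -> %d\n", resp_to_errno(RESP_BUSY));
    return 0;
}
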
/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
int sb_id, int count)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size;
const uint64_t size = count * vm->sbm.sb_size;
int rc;

rc = virtio_mem_send_plug_request(vm, addr, size);
if (!rc)
virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
return rc;
}

@@ -960,23 +1404,46 @@ static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
int sb_id, int count)
{
const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size;
const uint64_t size = count * vm->sbm.sb_size;
int rc;

rc = virtio_mem_send_unplug_request(vm, addr, size);
if (!rc)
virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
return rc;
}
/*
* Request to unplug a big block.
*
* Will not modify the state of the big block.
*/
static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
return virtio_mem_send_unplug_request(vm, addr, size);
}
/*
* Request to plug a big block.
*
* Will not modify the state of the big block.
*/
static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
const uint64_t size = vm->bbm.bb_size;
return virtio_mem_send_plug_request(vm, addr, size);
}
/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
@@ -986,29 +1453,29 @@ static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
unsigned long mb_id, uint64_t *nb_sb)
{
int sb_id, count;
int rc;

sb_id = vm->sbm.sbs_per_mb - 1;
while (*nb_sb) {
/* Find the next candidate subblock */
while (sb_id >= 0 &&
virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
sb_id--;
if (sb_id < 0)
break;
/* Try to unplug multiple subblocks at a time */
count = 1;
while (count < *nb_sb && sb_id > 0 &&
virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
count++;
sb_id--;
}

rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
if (rc)
return rc;
*nb_sb -= count;
@@ -1025,63 +1492,50 @@ static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
{
uint64_t nb_sb = vm->sbm.sbs_per_mb;

return virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
}

/*
 * Prepare tracking data for the next memory block.
 */
static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
unsigned long *mb_id)
{
int rc;

if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
return -ENOSPC;

/* Resize the state array if required. */
rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
if (rc)
return rc;

/* Resize the subblock bitmap if required. */
rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
if (rc)
return rc;

vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
*mb_id = vm->sbm.next_mb_id++;
return 0;
}
/*
 * Try to plug the desired number of subblocks and add the memory block
 * to Linux.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
unsigned long mb_id, uint64_t *nb_sb)
{
const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
int rc;

if (WARN_ON_ONCE(!count))
return -EINVAL;
@@ -1090,7 +1544,7 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
 * Plug the requested number of subblocks before adding it to linux,
 * so that onlining will directly online all plugged subblocks.
 */
rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
if (rc)
return rc;
@@ -1098,29 +1552,21 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
 * Mark the block properly offline before adding it to Linux,
 * so the memory notifiers will find the block in the right state.
 */
if (count == vm->sbm.sbs_per_mb)
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE);
else
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);

/* Add the memory block to linux - if that fails, try to unplug. */
rc = virtio_mem_sbm_add_mb(vm, mb_id);
if (rc) {
int new_state = VIRTIO_MEM_SBM_MB_UNUSED;

if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
return rc;
}
@@ -1136,8 +1582,9 @@ static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
 *
 * Note: Can fail after some subblocks were successfully plugged.
 */
static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
unsigned long mb_id, uint64_t *nb_sb,
bool online)
{
unsigned long pfn, nr_pages;
int sb_id, count;
@@ -1147,17 +1594,16 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
return -EINVAL;

while (*nb_sb) {
sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
if (sb_id >= vm->sbm.sbs_per_mb)
break;
count = 1;
while (count < *nb_sb &&
sb_id + count < vm->sbm.sbs_per_mb &&
!virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
count++;

rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
if (rc)
return rc;
*nb_sb -= count;
@@ -1166,29 +1612,26 @@ static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
/* fake-online the pages if the memory block is online */
pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size);
nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
virtio_mem_fake_online(pfn, nr_pages);
}

if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
if (online)
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE);
else
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE);
}
return 0;
}
static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
uint64_t nb_sb = diff / vm->sbm.sb_size;
unsigned long mb_id;
int rc;
@@ -1199,18 +1642,18 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
mutex_lock(&vm->hotplug_mutex);

/* Try to plug subblocks of partially plugged online blocks. */
virtio_mem_sbm_for_each_mb(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, true);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
}

/* Try to plug subblocks of partially plugged offline blocks. */
virtio_mem_sbm_for_each_mb(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb, false);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
@@ -1223,11 +1666,11 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
mutex_unlock(&vm->hotplug_mutex);

/* Try to plug and add unused blocks */
virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;

rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
return rc;
cond_resched();
@@ -1235,13 +1678,13 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
/* Try to prepare, plug and add new blocks */
while (nb_sb) {
if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
return -ENOSPC;

rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
if (rc)
return rc;
rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
if (rc)
return rc;
cond_resched();
@@ -1253,6 +1696,112 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
return rc;
}
/*
* Plug a big block and add it to Linux.
*
* Will modify the state of the big block.
*/
static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
unsigned long bb_id)
{
int rc;
if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
VIRTIO_MEM_BBM_BB_UNUSED))
return -EINVAL;
rc = virtio_mem_bbm_plug_bb(vm, bb_id);
if (rc)
return rc;
virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
rc = virtio_mem_bbm_add_bb(vm, bb_id);
if (rc) {
if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_UNUSED);
else
/* Retry from the main loop. */
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_PLUGGED);
return rc;
}
return 0;
}
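
For illustration only (not part of the driver): plugging and adding a big block is a two-step transaction, and on failure the code above tries to undo the first step, falling back to a PLUGGED state that a later pass retries. A toy sketch of that rollback idea with invented stand-in functions; it is not a restatement of the driver's exact state transitions.

#include <errno.h>
#include <stdio.h>

enum bb_state { BB_UNUSED, BB_PLUGGED, BB_ADDED };

/* Stand-ins for the hypervisor request and the Linux hotplug step. */
static int plug(void)         { return 0; }
static int unplug(void)       { return 0; }
static int add_to_linux(void) { return -ENOMEM; }   /* simulate failure */

static int plug_and_add(enum bb_state *state)
{
    int rc;

    rc = plug();
    if (rc)
        return rc;
    *state = BB_PLUGGED;

    rc = add_to_linux();
    if (rc) {
        /* Roll back: if unplugging also fails, stay PLUGGED so a later
         * pass can retry; otherwise the block is UNUSED again. */
        *state = unplug() ? BB_PLUGGED : BB_UNUSED;
        return rc;
    }
    *state = BB_ADDED;
    return 0;
}

int main(void)
{
    enum bb_state state = BB_UNUSED;
    int rc = plug_and_add(&state);

    printf("rc=%d state=%d\n", rc, state);
    return 0;
}
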
/*
* Prepare tracking data for the next big block.
*/
static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
unsigned long *bb_id)
{
int rc;
if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
return -ENOSPC;
/* Resize the big block state array if required. */
rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
if (rc)
return rc;
vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
*bb_id = vm->bbm.next_bb_id;
vm->bbm.next_bb_id++;
return 0;
}
static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
{
uint64_t nb_bb = diff / vm->bbm.bb_size;
unsigned long bb_id;
int rc;
if (!nb_bb)
return 0;
/* Try to plug and add unused big blocks */
virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
return -ENOSPC;
rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
if (!rc)
nb_bb--;
if (rc || !nb_bb)
return rc;
cond_resched();
}
/* Try to prepare, plug and add new big blocks */
while (nb_bb) {
if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
return -ENOSPC;
rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
if (rc)
return rc;
rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
if (!rc)
nb_bb--;
if (rc)
return rc;
cond_resched();
}
return 0;
}
/*
* Try to plug the requested amount of memory.
*/
static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
{
if (vm->in_sbm)
return virtio_mem_sbm_plug_request(vm, diff);
return virtio_mem_bbm_plug_request(vm, diff);
}
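
For illustration only (not part of the driver): both plug paths convert the requested size delta into a whole number of their unit (subblocks in SBM, big blocks in BBM), so in BBM a request smaller than one big block plugs nothing. A quick standalone illustration with assumed sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t sb_size = 2ULL << 20;   /* assumed 2 MiB subblock */
    const uint64_t bb_size = 1ULL << 30;   /* assumed 1 GiB big block */
    const uint64_t diff = 768ULL << 20;    /* request: grow by 768 MiB */

    printf("SBM: %llu subblocks\n", (unsigned long long)(diff / sb_size));
    printf("BBM: %llu big blocks (remainder %llu MiB is ignored)\n",
           (unsigned long long)(diff / bb_size),
           (unsigned long long)((diff % bb_size) >> 20));
    return 0;
}
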
/*
 * Unplug the desired number of plugged subblocks of an offline memory block.
 * Will fail if any subblock cannot get unplugged (instead of skipping it).
@@ -1262,33 +1811,33 @@ static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
 *
 * Note: Can fail after some subblocks were successfully unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
unsigned long mb_id,
uint64_t *nb_sb)
{
int rc;

rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, nb_sb);

/* some subblocks might have been unplugged even on failure */
if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
if (rc)
return rc;

if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
/*
 * Remove the block from Linux - this should never fail.
 * Hinder the block from getting onlined by marking it
 * unplugged. Temporarily drop the mutex, so
 * any pending GOING_ONLINE requests can be serviced/rejected.
 */
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_UNUSED);

mutex_unlock(&vm->hotplug_mutex);
rc = virtio_mem_sbm_remove_mb(vm, mb_id);
BUG_ON(rc);
mutex_lock(&vm->hotplug_mutex);
}
@@ -1300,38 +1849,31 @@ static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
unsigned long mb_id, int sb_id,
int count)
{
const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
unsigned long start_pfn;
int rc;

start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
sb_id * vm->sbm.sb_size);

rc = virtio_mem_fake_offline(start_pfn, nr_pages);
if (rc)
return rc;

/* Try to unplug the allocated memory */
rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
if (rc) {
/* Return the memory to the buddy. */
virtio_mem_fake_online(start_pfn, nr_pages);
return rc;
}

virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL);
return 0;
}
@@ -1345,34 +1887,34 @@ static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
 * Note: Can fail after some subblocks were successfully unplugged. Can
 * return 0 even if subblocks were busy and could not get unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
unsigned long mb_id,
uint64_t *nb_sb)
{
int rc, sb_id;

/* If possible, try to unplug the complete block in one shot. */
if (*nb_sb >= vm->sbm.sbs_per_mb &&
virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
vm->sbm.sbs_per_mb);
if (!rc) {
*nb_sb -= vm->sbm.sbs_per_mb;
goto unplugged;
} else if (rc != -EBUSY)
return rc;
}

/* Fallback to single subblocks. */
for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
/* Find the next candidate subblock */
while (sb_id >= 0 &&
!virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
sb_id--;
if (sb_id < 0)
break;

rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
if (rc == -EBUSY)
continue;
else if (rc)
@@ -1386,24 +1928,21 @@ static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
 * remove it. This will usually not fail, as no memory is in use
 * anymore - however some other notifiers might NACK the request.
 */
if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
mutex_unlock(&vm->hotplug_mutex);
rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
mutex_lock(&vm->hotplug_mutex);
if (!rc)
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_UNUSED);
}

return 0;
}
static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
uint64_t nb_sb = diff / vm->sbm.sb_size;
unsigned long mb_id;
int rc;
@@ -1418,20 +1957,17 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
mutex_lock(&vm->hotplug_mutex);

/* Try to unplug subblocks of partially plugged offline blocks. */
virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
}

/* Try to unplug subblocks of plugged offline blocks. */
virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE) {
rc = virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
cond_resched();
@@ -1443,10 +1979,9 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
}

/* Try to unplug subblocks of partially plugged online blocks. */
virtio_mem_sbm_for_each_mb_rev(vm, mb_id,
VIRTIO_MEM_SBM_MB_ONLINE_PARTIAL) {
rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
mutex_unlock(&vm->hotplug_mutex);
@@ -1455,10 +1990,8 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
}

/* Try to unplug subblocks of plugged online blocks. */
virtio_mem_sbm_for_each_mb_rev(vm, mb_id, VIRTIO_MEM_SBM_MB_ONLINE) {
rc = virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, &nb_sb);
if (rc || !nb_sb)
goto out_unlock;
mutex_unlock(&vm->hotplug_mutex);
@@ -1473,20 +2006,212 @@ static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
return rc;
}
/*
* Try to offline and remove a big block from Linux and unplug it. Will fail
* with -EBUSY if some memory is busy and cannot get unplugged.
*
* Will modify the state of the memory block. Might temporarily drop the
* hotplug_mutex.
*/
static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
unsigned long bb_id)
{
const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
struct page *page;
int rc;
if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
VIRTIO_MEM_BBM_BB_ADDED))
return -EINVAL;
if (bbm_safe_unplug) {
/*
* Start by fake-offlining all memory. Once we marked the device
* block as fake-offline, all newly onlined memory will
* automatically be kept fake-offline. Protect from concurrent
* onlining/offlining until we have a consistent state.
*/
mutex_lock(&vm->hotplug_mutex);
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
page = pfn_to_online_page(pfn);
if (!page)
continue;
rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
if (rc) {
end_pfn = pfn;
goto rollback_safe_unplug;
}
}
mutex_unlock(&vm->hotplug_mutex);
}
rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
if (rc) {
if (bbm_safe_unplug) {
mutex_lock(&vm->hotplug_mutex);
goto rollback_safe_unplug;
}
return rc;
}
rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
if (rc)
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_PLUGGED);
else
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_UNUSED);
return rc;
rollback_safe_unplug:
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
page = pfn_to_online_page(pfn);
if (!page)
continue;
virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
}
virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
mutex_unlock(&vm->hotplug_mutex);
return rc;
}
/*
* Try to remove a big block from Linux and unplug it. Will fail with
* -EBUSY if some memory is online.
*
* Will modify the state of the memory block.
*/
static int virtio_mem_bbm_remove_and_unplug_bb(struct virtio_mem *vm,
unsigned long bb_id)
{
int rc;
if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
VIRTIO_MEM_BBM_BB_ADDED))
return -EINVAL;
rc = virtio_mem_bbm_remove_bb(vm, bb_id);
if (rc)
return -EBUSY;
rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
if (rc)
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_PLUGGED);
else
virtio_mem_bbm_set_bb_state(vm, bb_id,
VIRTIO_MEM_BBM_BB_UNUSED);
return rc;
}
/*
* Test if a big block is completely offline.
*/
static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
unsigned long bb_id)
{
const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
unsigned long pfn;
for (pfn = start_pfn; pfn < start_pfn + nr_pages;
pfn += PAGES_PER_SECTION) {
if (pfn_to_online_page(pfn))
return false;
}
return true;
}
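
For illustration only (not part of the driver): the offline check above only probes one pfn per memory section, because a section is either fully online or fully offline. A standalone sketch of that stride-based scan over a boolean map, with assumed sizes and a stand-in for pfn_to_online_page():

#include <stdbool.h>
#include <stdio.h>

#define PAGES_PER_SECTION  32768UL    /* assumed (128 MiB / 4 KiB) */
#define NR_PAGES           (4 * PAGES_PER_SECTION)

static bool section_online[NR_PAGES / PAGES_PER_SECTION] = { false, true, false, false };

/* Stand-in for pfn_to_online_page(): true means "online". */
static bool pfn_is_online(unsigned long pfn)
{
    return section_online[pfn / PAGES_PER_SECTION];
}

static bool range_is_offline(unsigned long start_pfn, unsigned long nr_pages)
{
    unsigned long pfn;

    for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn += PAGES_PER_SECTION)
        if (pfn_is_online(pfn))
            return false;
    return true;
}

int main(void)
{
    printf("offline? %d\n", range_is_offline(0, NR_PAGES));
    return 0;
}
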
static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
uint64_t nb_bb = diff / vm->bbm.bb_size;
uint64_t bb_id;
int rc;
if (!nb_bb)
return 0;
/* Try to unplug completely offline big blocks first. */
virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
cond_resched();
/*
* As we're holding no locks, this check is racy as memory
* can get onlined in the meantime - but we'll fail gracefully.
*/
if (!virtio_mem_bbm_bb_is_offline(vm, bb_id))
continue;
rc = virtio_mem_bbm_remove_and_unplug_bb(vm, bb_id);
if (rc == -EBUSY)
continue;
if (!rc)
nb_bb--;
if (rc || !nb_bb)
return rc;
}
if (!unplug_online)
return 0;
/* Try to unplug any big blocks. */
virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
cond_resched();
rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
if (rc == -EBUSY)
continue;
if (!rc)
nb_bb--;
if (rc || !nb_bb)
return rc;
}
return nb_bb ? -EBUSY : 0;
}
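
For illustration only (not part of the driver): the BBM unplug path above makes two passes, first the cheap one over big blocks that are already fully offline, then (only if unplug_online is set) the expensive offline-and-remove pass. A compact sketch of that ordering over a toy block array, with invented states and helpers:

#include <stdbool.h>
#include <stdio.h>

#define NR_BB 4

static bool bb_plugged[NR_BB] = { true, true, true, true };
static bool bb_offline[NR_BB] = { true, false, true, false };

static int unplug_request(int nb_bb, bool unplug_online)
{
    /* Pass 1: prefer big blocks that are already offline (cheap). */
    for (int bb = NR_BB - 1; bb >= 0 && nb_bb; bb--) {
        if (!bb_plugged[bb] || !bb_offline[bb])
            continue;
        bb_plugged[bb] = false;
        nb_bb--;
    }

    if (!unplug_online)
        return nb_bb;

    /* Pass 2: offline and remove online big blocks (expensive). */
    for (int bb = NR_BB - 1; bb >= 0 && nb_bb; bb--) {
        if (!bb_plugged[bb])
            continue;
        bb_offline[bb] = true;
        bb_plugged[bb] = false;
        nb_bb--;
    }

    return nb_bb;    /* big blocks still left to unplug (0 on success) */
}

int main(void)
{
    printf("remaining after request(3): %d\n", unplug_request(3, true));
    return 0;
}
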
/*
* Try to unplug the requested amount of memory.
*/
static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
{
if (vm->in_sbm)
return virtio_mem_sbm_unplug_request(vm, diff);
return virtio_mem_bbm_unplug_request(vm, diff);
}
/*
 * Try to unplug all blocks that couldn't be unplugged before, for example,
 * because the hypervisor was busy.
 */
static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
{
unsigned long id;
int rc;

if (!vm->in_sbm) {
virtio_mem_bbm_for_each_bb(vm, id,
VIRTIO_MEM_BBM_BB_PLUGGED) {
rc = virtio_mem_bbm_unplug_bb(vm, id);
if (rc)
return rc;
virtio_mem_bbm_set_bb_state(vm, id,
VIRTIO_MEM_BBM_BB_UNUSED);
}
return 0;
}

virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
rc = virtio_mem_sbm_unplug_mb(vm, id);
if (rc)
return rc;
virtio_mem_sbm_set_mb_state(vm, id,
VIRTIO_MEM_SBM_MB_UNUSED);
}

return 0;
@@ -1511,7 +2236,13 @@ static void virtio_mem_refresh_config(struct virtio_mem *vm)
usable_region_size, &usable_region_size);
end_addr = vm->addr + usable_region_size;
end_addr = min(end_addr, phys_limit);
if (vm->in_sbm)
vm->sbm.last_usable_mb_id =
virtio_mem_phys_to_mb_id(end_addr) - 1;
else
vm->bbm.last_usable_bb_id =
virtio_mem_phys_to_bb_id(vm, end_addr) - 1;
/* see if there is a request to change the size */
virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
@@ -1535,6 +2266,7 @@ static void virtio_mem_run_wq(struct work_struct *work)
if (vm->broken)
return;
atomic_set(&vm->wq_active, 1);
retry:
rc = 0;
@@ -1595,6 +2327,8 @@ static void virtio_mem_run_wq(struct work_struct *work)
"unknown error, marking device broken: %d\n", rc);
vm->broken = true;
}
atomic_set(&vm->wq_active, 0);
}
static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
@@ -1631,6 +2365,7 @@ static int virtio_mem_init_vq(struct virtio_mem *vm)
static int virtio_mem_init(struct virtio_mem *vm)
{
const uint64_t phys_limit = 1UL << MAX_PHYSMEM_BITS;
uint64_t sb_size, addr;
uint16_t node_id;
if (!vm->vdev->config->get) {
@@ -1659,15 +2394,9 @@ static int virtio_mem_init(struct virtio_mem *vm)
virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
&vm->region_size);
/* Determine the nid for the device based on the lowest address. */
if (vm->nid == NUMA_NO_NODE)
vm->nid = memory_add_physaddr_to_nid(vm->addr);
/* bad device setup - warn only */
if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
@@ -1681,23 +2410,57 @@ static int virtio_mem_init(struct virtio_mem *vm)
"Some memory is not addressable. This can make some memory unusable.\n");
/*
* We want subblocks to span at least MAX_ORDER_NR_PAGES and
* pageblock_nr_pages pages. This:
* - Simplifies our page onlining code (virtio_mem_online_page_cb)
* and fake page onlining code (virtio_mem_fake_online).
* - Is required for now for alloc_contig_range() to work reliably -
* it doesn't properly handle smaller granularity on ZONE_NORMAL.
*/
sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
pageblock_nr_pages) * PAGE_SIZE;
sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
if (sb_size < memory_block_size_bytes() && !force_bbm) {
/* SBM: At least two subblocks per Linux memory block. */
vm->in_sbm = true;
vm->sbm.sb_size = sb_size;
vm->sbm.sbs_per_mb = memory_block_size_bytes() /
vm->sbm.sb_size;
/* Round up to the next full memory block */
addr = vm->addr + memory_block_size_bytes() - 1;
vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
vm->sbm.next_mb_id = vm->sbm.first_mb_id;
} else {
/* BBM: At least one Linux memory block. */
vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
memory_block_size_bytes());
if (bbm_block_size) {
if (!is_power_of_2(bbm_block_size)) {
dev_warn(&vm->vdev->dev,
"bbm_block_size is not a power of 2");
} else if (bbm_block_size < vm->bbm.bb_size) {
dev_warn(&vm->vdev->dev,
"bbm_block_size is too small");
} else {
vm->bbm.bb_size = bbm_block_size;
}
}
/* Round up to the next aligned big block */
addr = vm->addr + vm->bbm.bb_size - 1;
vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
vm->bbm.next_bb_id = vm->bbm.first_bb_id;
}
/* Prepare the offline threshold - make sure we can add two blocks. */
vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
/* In BBM, we also want at least two big blocks. */
vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
vm->offline_threshold);
dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
@@ -1705,9 +2468,13 @@ static int virtio_mem_init(struct virtio_mem *vm)
(unsigned long long)vm->device_block_size);
dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
memory_block_size_bytes());
if (vm->in_sbm)
dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
(unsigned long long)vm->sbm.sb_size);
else
dev_info(&vm->vdev->dev, "big block size: 0x%llx",
(unsigned long long)vm->bbm.bb_size);
if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
return 0;
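For orientation, here is a worked example of the SBM/BBM selection above. The concrete values are assumptions (typical x86-64 defaults with 4 KiB pages and 128 MiB Linux memory blocks), not values taken from this patch.
#include <stdint.h>
#include <stdio.h>
/* Assumed example values, not taken from the patch. */
#define PAGE_SIZE_EX		4096ULL
#define MAX_ORDER_NR_PAGES_EX	1024ULL		/* MAX_ORDER 11 -> 1 << 10 pages */
#define PAGEBLOCK_NR_PAGES_EX	512ULL		/* 2 MiB pageblocks */
#define MEMORY_BLOCK_BYTES_EX	(128ULL << 20)	/* 128 MiB Linux memory blocks */
#define DEVICE_BLOCK_BYTES_EX	(2ULL << 20)	/* 2 MiB virtio-mem device blocks */
int main(void)
{
	/* sb_size = max(MAX_ORDER_NR_PAGES, pageblock_nr_pages) * PAGE_SIZE */
	uint64_t sb_size = (MAX_ORDER_NR_PAGES_EX > PAGEBLOCK_NR_PAGES_EX ?
			    MAX_ORDER_NR_PAGES_EX : PAGEBLOCK_NR_PAGES_EX) * PAGE_SIZE_EX;
	/* sb_size = max(device_block_size, sb_size) */
	if (DEVICE_BLOCK_BYTES_EX > sb_size)
		sb_size = DEVICE_BLOCK_BYTES_EX;
	if (sb_size < MEMORY_BLOCK_BYTES_EX)	/* && !force_bbm */
		printf("SBM: sb_size=%llu MiB, %llu subblocks per memory block\n",
		       (unsigned long long)(sb_size >> 20),
		       (unsigned long long)(MEMORY_BLOCK_BYTES_EX / sb_size));
	else
		printf("BBM: bb_size=%llu MiB\n",
		       (unsigned long long)((DEVICE_BLOCK_BYTES_EX > MEMORY_BLOCK_BYTES_EX ?
					     DEVICE_BLOCK_BYTES_EX : MEMORY_BLOCK_BYTES_EX) >> 20));
	return 0;
}
With these numbers the subblock size comes out at 4 MiB, well below the 128 MiB memory block, so the device runs in SBM with 32 subblocks per memory block; a device block larger than a memory block would instead force BBM.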
@@ -1753,6 +2520,20 @@ static void virtio_mem_delete_resource(struct virtio_mem *vm)
vm->parent_resource = NULL;
}
static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
{
return 1;
}
static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
{
const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
vm->addr + vm->region_size, NULL,
virtio_mem_range_has_system_ram) == 1;
}
static int virtio_mem_probe(struct virtio_device *vdev)
{
struct virtio_mem *vm;
@@ -1849,21 +2630,24 @@ static void virtio_mem_remove(struct virtio_device *vdev)
cancel_work_sync(&vm->wq);
hrtimer_cancel(&vm->retry_timer);
if (vm->in_sbm) {
/*
* After we unregistered our callbacks, user space can online
* partially plugged offline blocks. Make sure to remove them.
*/
virtio_mem_sbm_for_each_mb(vm, mb_id,
VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
rc = virtio_mem_sbm_remove_mb(vm, mb_id);
BUG_ON(rc);
virtio_mem_sbm_set_mb_state(vm, mb_id,
VIRTIO_MEM_SBM_MB_UNUSED);
}
/*
* After we unregistered our callbacks, user space can no longer
* offline partially plugged online memory blocks. No need to
* worry about them.
*/
}
/* unregister callbacks */
unregister_virtio_mem_device(vm);
@@ -1874,10 +2658,7 @@ static void virtio_mem_remove(struct virtio_device *vdev)
* the system. And there is no way to stop the driver/device from going
* away. Warn at least.
*/
if (virtio_mem_has_memory_added(vm)) {
dev_warn(&vdev->dev, "device still has system memory added\n");
} else {
virtio_mem_delete_resource(vm);
@@ -1885,8 +2666,12 @@ static void virtio_mem_remove(struct virtio_device *vdev)
}
/* remove all tracking data - no locking needed */
if (vm->in_sbm) {
vfree(vm->sbm.mb_states);
vfree(vm->sbm.sb_states);
} else {
vfree(vm->bbm.bb_states);
}
/* reset the device and cleanup the queues */
vdev->config->reset(vdev);
...
@@ -1608,7 +1608,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->num_added = 0;
vq->packed_ring = true;
vq->use_dma_api = vring_use_dma_api(vdev);
#ifdef DEBUG
vq->in_use = false;
vq->last_add_time_valid = false;
@@ -1669,6 +1668,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
cpu_to_le16(vq->packed.event_flags_shadow);
}
list_add_tail(&vq->vq.list, &vdev->vqs);
return &vq->vq;
err_desc_extra:
@@ -1676,9 +1676,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
err_desc_state:
kfree(vq);
err_vq:
vring_free_queue(vdev, event_size_in_bytes, device, device_event_dma_addr);
err_device:
vring_free_queue(vdev, event_size_in_bytes, driver, driver_event_dma_addr);
err_driver:
vring_free_queue(vdev, ring_size_in_bytes, ring, ring_dma_addr);
err_ring:
@@ -2085,7 +2085,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
vq->last_used_idx = 0;
vq->num_added = 0;
vq->use_dma_api = vring_use_dma_api(vdev);
#ifdef DEBUG
vq->in_use = false;
vq->last_add_time_valid = false;
@@ -2127,6 +2126,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
memset(vq->split.desc_state, 0, vring.num *
sizeof(struct vring_desc_state_split));
list_add_tail(&vq->vq.list, &vdev->vqs);
return &vq->vq;
}
EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
...
@@ -42,6 +42,7 @@ struct vdpa_vq_state {
* @config: the configuration ops for this device.
* @index: device index
* @features_valid: were features initialized? for legacy guests
* @nvqs: maximum number of supported virtqueues
*/
struct vdpa_device {
struct device dev;
...
@@ -29,24 +29,30 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE. */
#define VIRTIO_ID_NET			1 /* virtio net */
#define VIRTIO_ID_BLOCK			2 /* virtio block */
#define VIRTIO_ID_CONSOLE		3 /* virtio console */
#define VIRTIO_ID_RNG			4 /* virtio rng */
#define VIRTIO_ID_BALLOON		5 /* virtio balloon */
#define VIRTIO_ID_IOMEM			6 /* virtio ioMemory */
#define VIRTIO_ID_RPMSG			7 /* virtio remote processor messaging */
#define VIRTIO_ID_SCSI			8 /* virtio scsi */
#define VIRTIO_ID_9P			9 /* 9p virtio console */
#define VIRTIO_ID_MAC80211_WLAN		10 /* virtio WLAN MAC */
#define VIRTIO_ID_RPROC_SERIAL		11 /* virtio remoteproc serial link */
#define VIRTIO_ID_CAIF			12 /* Virtio caif */
#define VIRTIO_ID_MEMORY_BALLOON	13 /* virtio memory balloon */
#define VIRTIO_ID_GPU			16 /* virtio GPU */
#define VIRTIO_ID_CLOCK			17 /* virtio clock/timer */
#define VIRTIO_ID_INPUT			18 /* virtio input */
#define VIRTIO_ID_VSOCK			19 /* virtio vsock transport */
#define VIRTIO_ID_CRYPTO		20 /* virtio crypto */
#define VIRTIO_ID_SIGNAL_DIST		21 /* virtio signal distribution device */
#define VIRTIO_ID_PSTORE		22 /* virtio pstore device */
#define VIRTIO_ID_IOMMU			23 /* virtio IOMMU */
#define VIRTIO_ID_MEM			24 /* virtio mem */
#define VIRTIO_ID_FS			26 /* virtio filesystem */
#define VIRTIO_ID_PMEM			27 /* virtio pmem */
#define VIRTIO_ID_MAC80211_HWSIM	29 /* virtio mac80211-hwsim */
#endif /* _LINUX_VIRTIO_IDS_H */
@@ -1784,39 +1784,112 @@ int remove_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(remove_memory);
static int try_offline_memory_block(struct memory_block *mem, void *arg)
{
uint8_t online_type = MMOP_ONLINE_KERNEL;
uint8_t **online_types = arg;
struct page *page;
int rc;
/*
* Sense the online_type via the zone of the memory block. Offlining
* with multiple zones within one memory block will be rejected
* by offlining code ... so we don't care about that.
*/
page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
online_type = MMOP_ONLINE_MOVABLE;
rc = device_offline(&mem->dev);
/*
* Default is MMOP_OFFLINE - change it only if offlining succeeded,
* so try_reonline_memory_block() can do the right thing.
*/
if (!rc)
**online_types = online_type;
(*online_types)++;
/* Ignore if already offline. */
return rc < 0 ? rc : 0;
}
static int try_reonline_memory_block(struct memory_block *mem, void *arg)
{
uint8_t **online_types = arg;
int rc;
if (**online_types != MMOP_OFFLINE) {
mem->online_type = **online_types;
rc = device_online(&mem->dev);
if (rc < 0)
pr_warn("%s: Failed to re-online memory: %d",
__func__, rc);
}
/* Continue processing all remaining memory blocks. */
(*online_types)++;
return 0;
}
/*
* Try to offline and remove memory. Might take a long time to finish in case
* memory is still in use. Primarily useful for memory devices that logically
* unplugged all memory (so it's no longer in use) and want to offline + remove
* that memory.
*/
int offline_and_remove_memory(int nid, u64 start, u64 size)
{
const unsigned long mb_count = size / memory_block_size_bytes();
uint8_t *online_types, *tmp;
int rc;
if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
!IS_ALIGNED(size, memory_block_size_bytes()) || !size)
return -EINVAL;
/*
* We'll remember the old online type of each memory block, so we can
* try to revert whatever we did when offlining one memory block fails
* after offlining some others succeeded.
*/
online_types = kmalloc_array(mb_count, sizeof(*online_types),
GFP_KERNEL);
if (!online_types)
return -ENOMEM;
/*
* Initialize all states to MMOP_OFFLINE, so when we abort processing in
* try_offline_memory_block(), we'll skip all unprocessed blocks in
* try_reonline_memory_block().
*/
memset(online_types, MMOP_OFFLINE, mb_count);
lock_device_hotplug();
tmp = online_types;
rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
/*
* In case we succeeded to offline all memory, remove it.
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
rc = try_remove_memory(nid, start, size);
if (rc)
pr_err("%s: Failed to remove memory: %d", __func__, rc);
}
/*
* Rollback what we did. While memory onlining might theoretically fail
* (nacked by a notifier), it barely ever happens.
*/
if (rc) {
tmp = online_types;
walk_memory_blocks(start, size, &tmp,
try_reonline_memory_block);
}
unlock_device_hotplug();
kfree(online_types);
return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
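The rollback above relies on a per-block state array and a cursor that both callbacks advance. As a compressed, self-contained illustration of that bookkeeping (this is a toy model, not the kernel code; the names and the failing block are made up), consider four blocks where offlining the third one fails:
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* Hypothetical stand-ins for the MMOP_* values, for this illustration only. */
enum { EX_OFFLINE = 0, EX_ONLINE_KERNEL = 1, EX_ONLINE_MOVABLE = 2 };
int main(void)
{
	uint8_t online_types[4];
	uint8_t *tmp = online_types;
	int failed_at = 2, i, rc = 0;
	/* Same trick as above: unprocessed blocks keep the EX_OFFLINE marker. */
	memset(online_types, EX_OFFLINE, sizeof(online_types));
	/* "try_offline" pass: record the old type of each block we offline. */
	for (i = 0; i < 4 && !rc; i++) {
		if (i == failed_at) {
			rc = -16;	/* pretend -EBUSY: abort the walk */
			continue;
		}
		*tmp++ = (i % 2) ? EX_ONLINE_MOVABLE : EX_ONLINE_KERNEL;
	}
	/* "try_reonline" pass: only blocks with a recorded type are restored. */
	tmp = online_types;
	for (i = 0; i < 4; i++, tmp++)
		printf("block %d: %s\n", i,
		       *tmp == EX_OFFLINE ? "left alone" : "re-onlined");
	return 0;
}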
...
@@ -16,6 +16,16 @@
# define mb() abort()
# define dma_rmb() abort()
# define dma_wmb() abort()
#elif defined(__aarch64__)
#define dmb(opt) asm volatile("dmb " #opt : : : "memory")
#define virt_mb() __sync_synchronize()
#define virt_rmb() dmb(ishld)
#define virt_wmb() dmb(ishst)
#define virt_store_mb(var, value) do { WRITE_ONCE(var, value); dmb(ish); } while (0)
/* Weak barriers should be used. If not - it's a bug */
# define mb() abort()
# define dma_rmb() abort()
# define dma_wmb() abort()
#else
#error Please fill in barrier macros
#endif
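The aarch64 block above only wires up the weak virt_*() barriers; the strong mb()/dma_*() variants stay as abort() because the userspace harness must never need them. As a hedged illustration of how those weak barriers are meant to be paired (assuming the tools/virtio headers are included; the ring, index and payload names here are hypothetical):
#define RING_SIZE 16
static int ring_data[RING_SIZE];	/* hypothetical payload slots */
static unsigned short avail_idx;	/* hypothetical published index */
void producer_publish(int value)
{
	ring_data[avail_idx % RING_SIZE] = value;	/* 1. fill the slot */
	virt_wmb();					/* 2. order payload before index */
	WRITE_ONCE(avail_idx, avail_idx + 1);		/* 3. publish the new index */
}
int consumer_poll(unsigned short *last_seen)
{
	if (READ_ONCE(avail_idx) == *last_seen)
		return -1;				/* nothing new */
	virt_rmb();		/* order the index read before the payload read */
	return ring_data[(*last_seen)++ % RING_SIZE];
}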
...
@@ -2,6 +2,8 @@
#ifndef BUG_H
#define BUG_H
#include <asm/bug.h>
#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
#define BUILD_BUG_ON(x)
...
@@ -11,6 +11,7 @@
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/overflow.h>
#include <linux/list.h>
#include <linux/printk.h>
#include <linux/bug.h>
@@ -117,6 +118,16 @@ static inline void free_page(unsigned long addr)
# define unlikely(x) (__builtin_expect(!!(x), 0))
# endif
static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t gfp)
{
size_t bytes;
if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
return NULL;
return krealloc(p, bytes, gfp);
}
#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#ifdef DEBUG
#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
@@ -126,8 +137,6 @@ static inline void free_page(unsigned long addr)
#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
...
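The krealloc_array() stub added to the harness above rejects a size calculation that would overflow instead of silently allocating a short buffer. A minimal sketch of how a test might use it, assuming the tools/virtio kernel.h stubs (krealloc, GFP_KERNEL) are in scope; the struct and function names here are hypothetical:
struct entry {
	unsigned int id;
	void *payload;
};
static struct entry *grow_entries(struct entry *entries, size_t new_count)
{
	/* Multiplication overflow is caught inside krealloc_array(),
	 * which then returns NULL rather than allocating too little. */
	struct entry *tmp = krealloc_array(entries, new_count,
					   sizeof(*entries), GFP_KERNEL);
	if (!tmp)
		return NULL;	/* the old "entries" buffer is still valid */
	return tmp;
}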