Commit a4850b55 authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvm-s390-next-5.20-1' of...

Merge tag 'kvm-s390-next-5.20-1' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD

KVM: s390x: Fixes and features for 5.20

* First part of deferred teardown
* CPU Topology
* interpretive execution for PCI instructions
* PV attestation
* Minor fixes
parents 8031d87a f5ecfee9
......@@ -5955,6 +5955,52 @@ KVM_PV_DUMP_CPU
Provides encrypted dump data like register values.
The length of the returned data is provided by uv_info.guest_cpu_stor_len.
4.137 KVM_S390_ZPCI_OP
----------------------
:Capability: KVM_CAP_S390_ZPCI_OP
:Architectures: s390
:Type: vm ioctl
:Parameters: struct kvm_s390_zpci_op (in)
:Returns: 0 on success, <0 on error
Used to manage hardware-assisted virtualization features for zPCI devices.
Parameters are specified via the following structure::
struct kvm_s390_zpci_op {
/* in */
__u32 fh; /* target device */
__u8 op; /* operation to perform */
__u8 pad[3];
union {
/* for KVM_S390_ZPCIOP_REG_AEN */
struct {
__u64 ibv; /* Guest addr of interrupt bit vector */
__u64 sb; /* Guest addr of summary bit */
__u32 flags;
__u32 noi; /* Number of interrupts */
__u8 isc; /* Guest interrupt subclass */
__u8 sbo; /* Offset of guest summary bit vector */
__u16 pad;
} reg_aen;
__u64 reserved[8];
} u;
};
The type of operation is specified in the "op" field.
KVM_S390_ZPCIOP_REG_AEN is used to register the VM for adapter event
notification interpretation, which will allow firmware delivery of adapter
events directly to the vm, with KVM providing a backup delivery mechanism;
KVM_S390_ZPCIOP_DEREG_AEN is used to subsequently disable interpretation of
adapter event notifications.
The target zPCI function must also be specified via the "fh" field. For the
KVM_S390_ZPCIOP_REG_AEN operation, additional information to establish firmware
delivery must be provided via the "reg_aen" struct.
The "pad" and "reserved" fields may be used for future extensions and should be
set to 0s by userspace.
5. The kvm_run structure
========================
......@@ -8223,6 +8269,31 @@ The capability has no effect if the nx_huge_pages module parameter is not set.
This capability may only be set before any vCPUs are created.
8.39 KVM_CAP_S390_CPU_TOPOLOGY
------------------------------
:Capability: KVM_CAP_S390_CPU_TOPOLOGY
:Architectures: s390
:Type: vm
This capability indicates that KVM will provide the S390 CPU Topology
facility which consist of the interpretation of the PTF instruction for
the function code 2 along with interception and forwarding of both the
PTF instruction with function codes 0 or 1 and the STSI(15,1,x)
instruction to the userland hypervisor.
The stfle facility 11, CPU Topology facility, should not be indicated
to the guest without this capability.
When this capability is present, KVM provides a new attribute group
on vm fd, KVM_S390_VM_CPU_TOPOLOGY.
This new attribute allows to get, set or clear the Modified Change
Topology Report (MTCR) bit of the SCA through the kvm_device_attr
structure.
When getting the Modified Change Topology Report value, the attr->addr
must point to a byte where the value will be stored or retrieved from.
9. Known KVM API problems
=========================
......
......@@ -17453,6 +17453,7 @@ M: Eric Farman <farman@linux.ibm.com>
L: linux-s390@vger.kernel.org
L: kvm@vger.kernel.org
S: Supported
F: arch/s390/kvm/pci*
F: drivers/vfio/pci/vfio_pci_zdev.c
F: include/uapi/linux/vfio_zdev.h
......
......@@ -45,6 +45,8 @@ void uv_query_info(void)
uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf;
uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len;
uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len;
uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver;
uv_info.supp_att_pflags = uvcb.supp_att_pflags;
}
#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST
......
......@@ -12,10 +12,11 @@
#include <linux/bit_spinlock.h>
#include <linux/dma-mapping.h>
#include <asm/tpi.h>
struct airq_struct {
struct hlist_node list; /* Handler queueing. */
void (*handler)(struct airq_struct *airq, bool floating);
void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info);
u8 *lsi_ptr; /* Local-Summary-Indicator pointer */
u8 lsi_mask; /* Local-Summary-Indicator mask */
u8 isc; /* Interrupt-subclass */
......@@ -46,8 +47,10 @@ struct airq_iv {
#define AIRQ_IV_PTR 4 /* Allocate the ptr array */
#define AIRQ_IV_DATA 8 /* Allocate the data array */
#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */
#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */
struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags);
struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags,
unsigned long *vec);
void airq_iv_release(struct airq_iv *iv);
unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num);
void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num);
......
......@@ -147,5 +147,42 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start,
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
int gmap_mark_unmergeable(void);
void s390_reset_acc(struct mm_struct *mm);
void s390_unlist_old_asce(struct gmap *gmap);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool interruptible);
/**
* s390_uv_destroy_range - Destroy a range of pages in the given mm.
* @mm: the mm on which to operate on
* @start: the start of the range
* @end: the end of the range
*
* This function will call cond_sched, so it should not generate stalls, but
* it will otherwise only return when it completed.
*/
static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
(void)__s390_uv_destroy_range(mm, start, end, false);
}
/**
* s390_uv_destroy_range_interruptible - Destroy a range of pages in the
* given mm, but stop when a fatal signal is received.
* @mm: the mm on which to operate on
* @start: the start of the range
* @end: the end of the range
*
* This function will call cond_sched, so it should not generate stalls. If
* a fatal signal is received, it will return with -EINTR immediately,
* without finishing destroying the whole range. Upon successful
* completion, 0 is returned.
*/
static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
return __s390_uv_destroy_range(mm, start, end, true);
}
#endif /* _ASM_S390_GMAP_H */
......@@ -19,6 +19,8 @@
#include <linux/kvm.h>
#include <linux/seqlock.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/mmu_notifier.h>
#include <asm/debug.h>
#include <asm/cpu.h>
#include <asm/fpu/api.h>
......@@ -93,19 +95,30 @@ union ipte_control {
};
};
union sca_utility {
__u16 val;
struct {
__u16 mtcr : 1;
__u16 reserved : 15;
};
};
struct bsca_block {
union ipte_control ipte_control;
__u64 reserved[5];
__u64 mcn;
__u64 reserved2;
union sca_utility utility;
__u8 reserved2[6];
struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS];
};
struct esca_block {
union ipte_control ipte_control;
__u64 reserved1[7];
__u64 reserved1[6];
union sca_utility utility;
__u8 reserved2[6];
__u64 mcn[4];
__u64 reserved2[20];
__u64 reserved3[20];
struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS];
};
......@@ -249,12 +262,16 @@ struct kvm_s390_sie_block {
#define ECB_SPECI 0x08
#define ECB_SRSI 0x04
#define ECB_HOSTPROTINT 0x02
#define ECB_PTF 0x01
__u8 ecb; /* 0x0061 */
#define ECB2_CMMA 0x80
#define ECB2_IEP 0x20
#define ECB2_PFMFI 0x08
#define ECB2_ESCA 0x04
#define ECB2_ZPCI_LSI 0x02
__u8 ecb2; /* 0x0062 */
#define ECB3_AISI 0x20
#define ECB3_AISII 0x10
#define ECB3_DEA 0x08
#define ECB3_AES 0x04
#define ECB3_RI 0x01
......@@ -759,6 +776,7 @@ struct kvm_vm_stat {
u64 inject_pfault_done;
u64 inject_service_signal;
u64 inject_virtio;
u64 aen_forward;
};
struct kvm_arch_memory_slot {
......@@ -924,6 +942,7 @@ struct kvm_s390_pv {
unsigned long stor_base;
void *stor_var;
bool dumping;
struct mmu_notifier mmu_notifier;
};
struct kvm_arch{
......@@ -940,6 +959,7 @@ struct kvm_arch{
int use_cmma;
int use_pfmfi;
int use_skf;
int use_zpci_interp;
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
......@@ -963,6 +983,8 @@ struct kvm_arch{
DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS);
struct kvm_s390_gisa_interrupt gisa_int;
struct kvm_s390_pv pv;
struct list_head kzdev_list;
spinlock_t kzdev_list_lock;
};
#define KVM_HVA_ERR_BAD (-1UL)
......@@ -1013,4 +1035,19 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
#define __KVM_HAVE_ARCH_VM_FREE
void kvm_arch_free_vm(struct kvm *kvm);
#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm);
void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev);
#else
static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev,
struct kvm *kvm)
{
return -EPERM;
}
static inline void kvm_s390_pci_unregister_kvm(struct zpci_dev *dev) {}
#endif
#endif
......@@ -18,7 +18,7 @@ typedef struct {
unsigned long asce_limit;
unsigned long vdso_base;
/* The mmu context belongs to a secure guest. */
atomic_t is_protected;
atomic_t protected_count;
/*
* The following bitfields need a down_write on the mm
* semaphore when they are written to. As they are only
......
......@@ -26,7 +26,7 @@ static inline int init_new_context(struct task_struct *tsk,
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
atomic_set(&mm->context.is_protected, 0);
atomic_set(&mm->context.protected_count, 0);
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
......
......@@ -9,6 +9,7 @@
#include <asm-generic/pci.h>
#include <asm/pci_clp.h>
#include <asm/pci_debug.h>
#include <asm/pci_insn.h>
#include <asm/sclp.h>
#define PCIBIOS_MIN_IO 0x1000
......@@ -97,6 +98,7 @@ struct zpci_bar_struct {
};
struct s390_domain;
struct kvm_zdev;
#define ZPCI_FUNCTIONS_PER_BUS 256
struct zpci_bus {
......@@ -123,11 +125,14 @@ struct zpci_dev {
enum zpci_state state;
u32 fid; /* function ID, used by sclp */
u32 fh; /* function handle, used by insn's */
u32 gisa; /* GISA designation for passthrough */
u16 vfn; /* virtual function number */
u16 pchid; /* physical channel ID */
u16 maxstbl; /* Maximum store block size */
u8 pfgid; /* function group ID */
u8 pft; /* pci function type */
u8 port;
u8 dtsm; /* Supported DT mask */
u8 rid_available : 1;
u8 has_hp_slot : 1;
u8 has_resources : 1;
......@@ -186,7 +191,10 @@ struct zpci_dev {
struct dentry *debugfs_dev;
/* IOMMU and passthrough */
struct s390_domain *s390_domain; /* s390 IOMMU domain data */
struct kvm_zdev *kzdev;
struct mutex kzdev_lock;
};
static inline bool zdev_enabled(struct zpci_dev *zdev)
......@@ -198,6 +206,9 @@ extern const struct attribute_group *zpci_attr_groups[];
extern unsigned int s390_pci_force_floating __initdata;
extern unsigned int s390_pci_no_rid;
extern union zpci_sic_iib *zpci_aipb;
extern struct airq_iv *zpci_aif_sbv;
/* -----------------------------------------------------------------------------
Prototypes
----------------------------------------------------------------------------- */
......
......@@ -153,9 +153,11 @@ struct clp_rsp_query_pci_grp {
u8 : 6;
u8 frame : 1;
u8 refresh : 1; /* TLB refresh mode */
u16 reserved2;
u16 : 3;
u16 maxstbl : 13; /* Maximum store block size */
u16 mui;
u16 : 16;
u8 dtsm; /* Supported DT mask */
u8 reserved3;
u16 maxfaal;
u16 : 4;
u16 dnoi : 12;
......@@ -173,7 +175,8 @@ struct clp_req_set_pci {
u16 reserved2;
u8 oc; /* operation controls */
u8 ndas; /* number of dma spaces */
u64 reserved3;
u32 reserved3;
u32 gisa; /* GISA designation */
} __packed;
/* Set PCI function response */
......
......@@ -98,6 +98,15 @@ struct zpci_fib {
u32 gd;
} __packed __aligned(8);
/* Set Interruption Controls Operation Controls */
#define SIC_IRQ_MODE_ALL 0
#define SIC_IRQ_MODE_SINGLE 1
#define SIC_SET_AENI_CONTROLS 2
#define SIC_IRQ_MODE_DIRECT 4
#define SIC_IRQ_MODE_D_ALL 16
#define SIC_IRQ_MODE_D_SINGLE 17
#define SIC_IRQ_MODE_SET_CPU 18
/* directed interruption information block */
struct zpci_diib {
u32 : 1;
......@@ -119,9 +128,20 @@ struct zpci_cdiib {
u64 : 64;
} __packed __aligned(8);
/* adapter interruption parameters block */
struct zpci_aipb {
u64 faisb;
u64 gait;
u16 : 13;
u16 afi : 3;
u32 : 32;
u16 faal;
} __packed __aligned(8);
union zpci_sic_iib {
struct zpci_diib diib;
struct zpci_cdiib cdiib;
struct zpci_aipb aipb;
};
DECLARE_STATIC_KEY_FALSE(have_mio);
......@@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset);
int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len);
int __zpci_store_block(const u64 *data, u64 req, u64 offset);
void zpci_barrier(void);
int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc)
{
union zpci_sic_iib iib = {{0}};
return __zpci_set_irq_ctrl(ctl, isc, &iib);
}
int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib);
#endif
......@@ -525,7 +525,7 @@ static inline int mm_has_pgste(struct mm_struct *mm)
static inline int mm_is_protected(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
if (unlikely(atomic_read(&mm->context.is_protected)))
if (unlikely(atomic_read(&mm->context.protected_count)))
return 1;
#endif
return 0;
......@@ -1182,9 +1182,22 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
} else {
res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
}
/* At this point the reference through the mapping is still present */
if (mm_is_protected(mm) && pte_present(res))
uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
/* Nothing to do */
if (!mm_is_protected(mm) || !pte_present(res))
return res;
/*
* At this point the reference through the mapping is still present.
* The notifier should have destroyed all protected vCPUs at this
* point, so the destroy should be successful.
*/
if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK))
return res;
/*
* If something went wrong and the page could not be destroyed, or
* if this is not a mm teardown, the slower export is used as
* fallback instead.
*/
uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK);
return res;
}
......
......@@ -88,6 +88,10 @@ struct sclp_info {
unsigned char has_sipl : 1;
unsigned char has_dirq : 1;
unsigned char has_iplcc : 1;
unsigned char has_zpci_lsi : 1;
unsigned char has_aisii : 1;
unsigned char has_aeni : 1;
unsigned char has_aisi : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
......
......@@ -19,6 +19,19 @@ struct tpi_info {
u32 :12;
} __packed __aligned(4);
/* I/O-Interruption Code as stored by TPI for an Adapter I/O */
struct tpi_adapter_info {
u32 aism:8;
u32 :22;
u32 error:1;
u32 forward:1;
u32 reserved;
u32 adapter_IO:1;
u32 directed_irq:1;
u32 isc:3;
u32 :27;
} __packed __aligned(4);
#endif /* __ASSEMBLY__ */
#endif /* _ASM_S390_TPI_H */
......@@ -124,7 +124,10 @@ struct uv_cb_qui {
u64 reservedc0; /* 0x00c0 */
u64 conf_dump_storage_state_len; /* 0x00c8 */
u64 conf_dump_finalize_len; /* 0x00d0 */
u8 reservedd8[256 - 216]; /* 0x00d8 */
u64 reservedd8; /* 0x00d8 */
u64 supp_att_req_hdr_ver; /* 0x00e0 */
u64 supp_att_pflags; /* 0x00e8 */
u8 reservedf0[256 - 240]; /* 0x00f0 */
} __packed __aligned(8);
/* Initialize Ultravisor */
......@@ -350,6 +353,8 @@ struct uv_info {
unsigned long supp_se_hdr_pcf;
unsigned long conf_dump_storage_state_len;
unsigned long conf_dump_finalize_len;
unsigned long supp_att_req_hdr_ver;
unsigned long supp_att_pflags;
};
extern struct uv_info uv_info;
......@@ -421,6 +426,7 @@ static inline int is_prot_virt_host(void)
}
int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
int uv_destroy_owned_page(unsigned long paddr);
int uv_convert_from_secure(unsigned long paddr);
int uv_convert_owned_from_secure(unsigned long paddr);
......
......@@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req {
#define KVM_S390_VM_CRYPTO 2
#define KVM_S390_VM_CPU_MODEL 3
#define KVM_S390_VM_MIGRATION 4
#define KVM_S390_VM_CPU_TOPOLOGY 5
/* kvm attributes for mem_ctrl */
#define KVM_S390_VM_MEM_ENABLE_CMMA 0
......
......@@ -234,6 +234,32 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr,
return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
}
/**
* should_export_before_import - Determine whether an export is needed
* before an import-like operation
* @uvcb: the Ultravisor control block of the UVC to be performed
* @mm: the mm of the process
*
* Returns whether an export is needed before every import-like operation.
* This is needed for shared pages, which don't trigger a secure storage
* exception when accessed from a different guest.
*
* Although considered as one, the Unpin Page UVC is not an actual import,
* so it is not affected.
*
* No export is needed also when there is only one protected VM, because the
* page cannot belong to the wrong VM in that case (there is no "other VM"
* it can belong to).
*
* Return: true if an export is needed before every import, otherwise false.
*/
static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
{
if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
return false;
return atomic_read(&mm->context.protected_count) > 1;
}
/*
* Requests the Ultravisor to make a page accessible to a guest.
* If it's brought in the first time, it will be cleared. If
......@@ -277,6 +303,8 @@ int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
lock_page(page);
ptep = get_locked_pte(gmap->mm, uaddr, &ptelock);
if (should_export_before_import(uvcb, gmap->mm))
uv_convert_from_secure(page_to_phys(page));
rc = make_secure_pte(ptep, uaddr, page, uvcb);
pte_unmap_unlock(ptep, ptelock);
unlock_page(page);
......@@ -334,6 +362,61 @@ int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
}
EXPORT_SYMBOL_GPL(gmap_convert_to_secure);
/**
* gmap_destroy_page - Destroy a guest page.
* @gmap: the gmap of the guest
* @gaddr: the guest address to destroy
*
* An attempt will be made to destroy the given guest page. If the attempt
* fails, an attempt is made to export the page. If both attempts fail, an
* appropriate error is returned.
*/
int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
{
struct vm_area_struct *vma;
unsigned long uaddr;
struct page *page;
int rc;
rc = -EFAULT;
mmap_read_lock(gmap->mm);
uaddr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(uaddr))
goto out;
vma = vma_lookup(gmap->mm, uaddr);
if (!vma)
goto out;
/*
* Huge pages should not be able to become secure
*/
if (is_vm_hugetlb_page(vma))
goto out;
rc = 0;
/* we take an extra reference here */
page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
rc = uv_destroy_owned_page(page_to_phys(page));
/*
* Fault handlers can race; it is possible that two CPUs will fault
* on the same secure page. One CPU can destroy the page, reboot,
* re-enter secure mode and import it, while the second CPU was
* stuck at the beginning of the handler. At some point the second
* CPU will be able to progress, and it will not be able to destroy
* the page. In that case we do not want to terminate the process,
* we instead try to export the page.
*/
if (rc)
rc = uv_convert_owned_from_secure(page_to_phys(page));
put_page(page);
out:
mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_destroy_page);
/*
* To be called with the page locked or with an extra reference! This will
* prevent gmap_make_secure from touching the page concurrently. Having 2
......@@ -479,6 +562,24 @@ static ssize_t uv_query_max_guest_addr(struct kobject *kobj,
static struct kobj_attribute uv_query_max_guest_addr_attr =
__ATTR(max_address, 0444, uv_query_max_guest_addr, NULL);
static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver);
}
static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr =
__ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL);
static ssize_t uv_query_supp_att_pflags(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags);
}
static struct kobj_attribute uv_query_supp_att_pflags_attr =
__ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL);
static struct attribute *uv_query_attrs[] = {
&uv_query_facilities_attr.attr,
&uv_query_feature_indications_attr.attr,
......@@ -490,6 +591,8 @@ static struct attribute *uv_query_attrs[] = {
&uv_query_dump_storage_state_len_attr.attr,
&uv_query_dump_finalize_len_attr.attr,
&uv_query_dump_cpu_len_attr.attr,
&uv_query_supp_att_req_hdr_ver_attr.attr,
&uv_query_supp_att_pflags_attr.attr,
NULL,
};
......
......@@ -34,6 +34,7 @@ config KVM
select SRCU
select KVM_VFIO
select INTERVAL_TREE
select MMU_NOTIFIER
help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
......
......@@ -10,4 +10,5 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
......@@ -262,77 +262,77 @@ struct aste {
/* .. more fields there */
};
int ipte_lock_held(struct kvm_vcpu *vcpu)
int ipte_lock_held(struct kvm *kvm)
{
if (vcpu->arch.sie_block->eca & ECA_SII) {
if (sclp.has_siif) {
int rc;
read_lock(&vcpu->kvm->arch.sca_lock);
rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0;
read_unlock(&vcpu->kvm->arch.sca_lock);
read_lock(&kvm->arch.sca_lock);
rc = kvm_s390_get_ipte_control(kvm)->kh != 0;
read_unlock(&kvm->arch.sca_lock);
return rc;
}
return vcpu->kvm->arch.ipte_lock_count != 0;
return kvm->arch.ipte_lock_count != 0;
}
static void ipte_lock_simple(struct kvm_vcpu *vcpu)
static void ipte_lock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
mutex_lock(&vcpu->kvm->arch.ipte_mutex);
vcpu->kvm->arch.ipte_lock_count++;
if (vcpu->kvm->arch.ipte_lock_count > 1)
mutex_lock(&kvm->arch.ipte_mutex);
kvm->arch.ipte_lock_count++;
if (kvm->arch.ipte_lock_count > 1)
goto out;
retry:
read_lock(&vcpu->kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(vcpu->kvm);
read_lock(&kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.k) {
read_unlock(&vcpu->kvm->arch.sca_lock);
read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
new = old;
new.k = 1;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
read_unlock(&vcpu->kvm->arch.sca_lock);
read_unlock(&kvm->arch.sca_lock);
out:
mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
mutex_unlock(&kvm->arch.ipte_mutex);
}
static void ipte_unlock_simple(struct kvm_vcpu *vcpu)
static void ipte_unlock_simple(struct kvm *kvm)
{
union ipte_control old, new, *ic;
mutex_lock(&vcpu->kvm->arch.ipte_mutex);
vcpu->kvm->arch.ipte_lock_count--;
if (vcpu->kvm->arch.ipte_lock_count)
mutex_lock(&kvm->arch.ipte_mutex);
kvm->arch.ipte_lock_count--;
if (kvm->arch.ipte_lock_count)
goto out;
read_lock(&vcpu->kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(vcpu->kvm);
read_lock(&kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
read_unlock(&vcpu->kvm->arch.sca_lock);
wake_up(&vcpu->kvm->arch.ipte_wq);
read_unlock(&kvm->arch.sca_lock);
wake_up(&kvm->arch.ipte_wq);
out:
mutex_unlock(&vcpu->kvm->arch.ipte_mutex);
mutex_unlock(&kvm->arch.ipte_mutex);
}
static void ipte_lock_siif(struct kvm_vcpu *vcpu)
static void ipte_lock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
retry:
read_lock(&vcpu->kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(vcpu->kvm);
read_lock(&kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
if (old.kg) {
read_unlock(&vcpu->kvm->arch.sca_lock);
read_unlock(&kvm->arch.sca_lock);
cond_resched();
goto retry;
}
......@@ -340,15 +340,15 @@ static void ipte_lock_siif(struct kvm_vcpu *vcpu)
new.k = 1;
new.kh++;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
read_unlock(&vcpu->kvm->arch.sca_lock);
read_unlock(&kvm->arch.sca_lock);
}
static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
static void ipte_unlock_siif(struct kvm *kvm)
{
union ipte_control old, new, *ic;
read_lock(&vcpu->kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(vcpu->kvm);
read_lock(&kvm->arch.sca_lock);
ic = kvm_s390_get_ipte_control(kvm);
do {
old = READ_ONCE(*ic);
new = old;
......@@ -356,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu)
if (!new.kh)
new.k = 0;
} while (cmpxchg(&ic->val, old.val, new.val) != old.val);
read_unlock(&vcpu->kvm->arch.sca_lock);
read_unlock(&kvm->arch.sca_lock);
if (!new.kh)
wake_up(&vcpu->kvm->arch.ipte_wq);
wake_up(&kvm->arch.ipte_wq);
}
void ipte_lock(struct kvm_vcpu *vcpu)
void ipte_lock(struct kvm *kvm)
{
if (vcpu->arch.sie_block->eca & ECA_SII)
ipte_lock_siif(vcpu);
if (sclp.has_siif)
ipte_lock_siif(kvm);
else
ipte_lock_simple(vcpu);
ipte_lock_simple(kvm);
}
void ipte_unlock(struct kvm_vcpu *vcpu)
void ipte_unlock(struct kvm *kvm)
{
if (vcpu->arch.sie_block->eca & ECA_SII)
ipte_unlock_siif(vcpu);
if (sclp.has_siif)
ipte_unlock_siif(kvm);
else
ipte_unlock_simple(vcpu);
ipte_unlock_simple(kvm);
}
static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
......@@ -1086,7 +1086,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
try_storage_prot_override = storage_prot_override_applicable(vcpu);
need_ipte_lock = psw_bits(*psw).dat && !asce.r;
if (need_ipte_lock)
ipte_lock(vcpu);
ipte_lock(vcpu->kvm);
/*
* Since we do the access further down ultimately via a move instruction
* that does key checking and returns an error in case of a protection
......@@ -1127,7 +1127,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
}
out_unlock:
if (need_ipte_lock)
ipte_unlock(vcpu);
ipte_unlock(vcpu->kvm);
if (nr_pages > ARRAY_SIZE(gpa_array))
vfree(gpas);
return rc;
......@@ -1199,10 +1199,10 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
ipte_lock(vcpu);
ipte_lock(vcpu->kvm);
rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode,
access_key);
ipte_unlock(vcpu);
ipte_unlock(vcpu->kvm);
return rc;
}
......@@ -1465,7 +1465,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
* tables/pointers we read stay valid - unshadowing is however
* always possible - only guest_table_lock protects us.
*/
ipte_lock(vcpu);
ipte_lock(vcpu->kvm);
rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
if (rc)
......@@ -1499,7 +1499,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
pte.p |= dat_protection;
if (!rc)
rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
ipte_unlock(vcpu);
ipte_unlock(vcpu->kvm);
mmap_read_unlock(sg->mm);
return rc;
}
......@@ -440,9 +440,9 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
return access_guest_real(vcpu, gra, data, len, 0);
}
void ipte_lock(struct kvm_vcpu *vcpu);
void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
void ipte_lock(struct kvm *kvm);
void ipte_unlock(struct kvm *kvm);
int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
/* MVPG PEI indication bits */
......
......@@ -528,12 +528,27 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu)
static int handle_pv_notification(struct kvm_vcpu *vcpu)
{
int ret;
if (vcpu->arch.sie_block->ipa == 0xb210)
return handle_pv_spx(vcpu);
if (vcpu->arch.sie_block->ipa == 0xb220)
return handle_pv_sclp(vcpu);
if (vcpu->arch.sie_block->ipa == 0xb9a4)
return handle_pv_uvc(vcpu);
if (vcpu->arch.sie_block->ipa >> 8 == 0xae) {
/*
* Besides external call, other SIGP orders also cause a
* 108 (pv notify) intercept. In contrast to external call,
* these orders need to be emulated and hence the appropriate
* place to handle them is in handle_instruction().
* So first try kvm_s390_handle_sigp_pei() and if that isn't
* successful, go on with handle_instruction().
*/
ret = kvm_s390_handle_sigp_pei(vcpu);
if (!ret)
return ret;
}
return handle_instruction(vcpu);
}
......
......@@ -28,9 +28,11 @@
#include <asm/switch_to.h>
#include <asm/nmi.h>
#include <asm/airq.h>
#include <asm/tpi.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "trace-s390.h"
#include "pci.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
......@@ -702,7 +704,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu)
/*
* We indicate floating repressible conditions along with
* other pending conditions. Channel Report Pending and Channel
* Subsystem damage are the only two and and are indicated by
* Subsystem damage are the only two and are indicated by
* bits in mcic and masked in cr14.
*/
if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) {
......@@ -3311,10 +3313,87 @@ int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc)
}
EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister);
static void gib_alert_irq_handler(struct airq_struct *airq, bool floating)
static void aen_host_forward(unsigned long si)
{
struct kvm_s390_gisa_interrupt *gi;
struct zpci_gaite *gaite;
struct kvm *kvm;
gaite = (struct zpci_gaite *)aift->gait +
(si * sizeof(struct zpci_gaite));
if (gaite->count == 0)
return;
if (gaite->aisb != 0)
set_bit_inv(gaite->aisbo, (unsigned long *)gaite->aisb);
kvm = kvm_s390_pci_si_to_kvm(aift, si);
if (!kvm)
return;
gi = &kvm->arch.gisa_int;
if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) ||
!(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) {
gisa_set_ipm_gisc(gi->origin, gaite->gisc);
if (hrtimer_active(&gi->timer))
hrtimer_cancel(&gi->timer);
hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL);
kvm->stat.aen_forward++;
}
}
static void aen_process_gait(u8 isc)
{
bool found = false, first = true;
union zpci_sic_iib iib = {{0}};
unsigned long si, flags;
spin_lock_irqsave(&aift->gait_lock, flags);
if (!aift->gait) {
spin_unlock_irqrestore(&aift->gait_lock, flags);
return;
}
for (si = 0;;) {
/* Scan adapter summary indicator bit vector */
si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv));
if (si == -1UL) {
if (first || found) {
/* Re-enable interrupts. */
zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc,
&iib);
first = found = false;
} else {
/* Interrupts on and all bits processed */
break;
}
found = false;
si = 0;
/* Scan again after re-enabling interrupts */
continue;
}
found = true;
aen_host_forward(si);
}
spin_unlock_irqrestore(&aift->gait_lock, flags);
}
static void gib_alert_irq_handler(struct airq_struct *airq,
struct tpi_info *tpi_info)
{
struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info;
inc_irq_stat(IRQIO_GAL);
process_gib_alert_list();
if ((info->forward || info->error) &&
IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
aen_process_gait(info->isc);
if (info->aism != 0)
process_gib_alert_list();
} else {
process_gib_alert_list();
}
}
static struct airq_struct gib_alert_irq = {
......@@ -3326,6 +3405,11 @@ void kvm_s390_gib_destroy(void)
{
if (!gib)
return;
if (kvm_s390_pci_interp_allowed() && aift) {
mutex_lock(&aift->aift_lock);
kvm_s390_pci_aen_exit();
mutex_unlock(&aift->aift_lock);
}
chsc_sgib(0);
unregister_adapter_interrupt(&gib_alert_irq);
free_page((unsigned long)gib);
......@@ -3363,6 +3447,14 @@ int kvm_s390_gib_init(u8 nisc)
goto out_unreg_gal;
}
if (kvm_s390_pci_interp_allowed()) {
if (kvm_s390_pci_aen_init(nisc)) {
pr_err("Initializing AEN for PCI failed\n");
rc = -EIO;
goto out_unreg_gal;
}
}
KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc);
goto out;
......
This diff is collapsed.
......@@ -379,6 +379,7 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
......@@ -512,6 +513,16 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
*/
void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm);
/**
* kvm_s390_vcpu_pci_enable_interp
*
* Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store
* interpretation as well as adapter interruption forwarding.
*
* @kvm: the KVM guest
*/
void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm);
/**
* diag9c_forwarding_hz
*
......
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
/*
* s390 kvm PCI passthrough support
*
* Copyright IBM Corp. 2022
*
* Author(s): Matthew Rosato <mjrosato@linux.ibm.com>
*/
#ifndef __KVM_S390_PCI_H
#define __KVM_S390_PCI_H
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <asm/airq.h>
#include <asm/cpu.h>
struct kvm_zdev {
struct zpci_dev *zdev;
struct kvm *kvm;
struct zpci_fib fib;
struct list_head entry;
};
struct zpci_gaite {
u32 gisa;
u8 gisc;
u8 count;
u8 reserved;
u8 aisbo;
u64 aisb;
};
struct zpci_aift {
struct zpci_gaite *gait;
struct airq_iv *sbv;
struct kvm_zdev **kzdev;
spinlock_t gait_lock; /* Protects the gait, used during AEN forward */
struct mutex aift_lock; /* Protects the other structures in aift */
};
extern struct zpci_aift *aift;
static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift,
unsigned long si)
{
if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || aift->kzdev == 0 ||
aift->kzdev[si] == 0)
return 0;
return aift->kzdev[si]->kvm;
};
int kvm_s390_pci_aen_init(u8 nisc);
void kvm_s390_pci_aen_exit(void);
void kvm_s390_pci_init_list(struct kvm *kvm);
void kvm_s390_pci_clear_list(struct kvm *kvm);
int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
int kvm_s390_pci_init(void);
void kvm_s390_pci_exit(void);
static inline bool kvm_s390_pci_interp_allowed(void)
{
struct cpuid cpu_id;
get_cpu_id(&cpu_id);
switch (cpu_id.machine) {
case 0x2817:
case 0x2818:
case 0x2827:
case 0x2828:
case 0x2964:
case 0x2965:
/* No SHM on certain machines */
return false;
default:
return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) &&
sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi &&
sclp.has_aisii);
}
}
#endif /* __KVM_S390_PCI_H */
......@@ -442,7 +442,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
vcpu->stat.instruction_ipte_interlock++;
if (psw_bits(vcpu->arch.sie_block->gpsw).pstate)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm));
kvm_s390_retry_instr(vcpu);
VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
return 0;
......@@ -873,10 +873,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
if (fc > 3) {
kvm_s390_set_psw_cc(vcpu, 3);
return 0;
}
/* Bailout forbidden function codes */
if (fc > 3 && fc != 15)
goto out_no_data;
/*
* fc 15 is provided only with
* - PTF/CPU topology support through facility 15
* - KVM_CAP_S390_USER_STSI
*/
if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) ||
!vcpu->kvm->arch.user_stsi))
goto out_no_data;
if (vcpu->run->s.regs.gprs[0] & 0x0fffff00
|| vcpu->run->s.regs.gprs[1] & 0xffff0000)
......@@ -910,6 +918,10 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
goto out_no_data;
handle_stsi_3_2_2(vcpu, (void *) mem);
break;
case 15: /* fc 15 is fully handled in userspace */
insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2);
trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
return -EREMOTE;
}
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem,
......@@ -1471,7 +1483,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
access_key = (operand2 & 0xf0) >> 4;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
ipte_lock(vcpu);
ipte_lock(vcpu->kvm);
ret = guest_translate_address_with_key(vcpu, address, ar, &gpa,
GACC_STORE, access_key);
......@@ -1508,7 +1520,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
}
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
ipte_unlock(vcpu);
ipte_unlock(vcpu->kvm);
return ret;
}
......
......@@ -13,8 +13,19 @@
#include <asm/gmap.h>
#include <asm/uv.h>
#include <asm/mman.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
static void kvm_s390_clear_pv_state(struct kvm *kvm)
{
kvm->arch.pv.handle = 0;
kvm->arch.pv.guest_len = 0;
kvm->arch.pv.stor_base = 0;
kvm->arch.pv.stor_var = NULL;
}
int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
{
int cc;
......@@ -109,7 +120,7 @@ static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
vfree(kvm->arch.pv.stor_var);
free_pages(kvm->arch.pv.stor_base,
get_order(uv_info.guest_base_stor_len));
memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
kvm_s390_clear_pv_state(kvm);
}
static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
......@@ -153,21 +164,51 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
{
int cc;
/* make all pages accessible before destroying the guest */
s390_reset_acc(kvm->mm);
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
atomic_set(&kvm->mm->context.is_protected, 0);
/*
* if the mm still has a mapping, make all its pages accessible
* before destroying the guest
*/
if (mmget_not_zero(kvm->mm)) {
s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
mmput(kvm->mm);
}
if (!cc) {
atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
} else {
/* Intended memory leak on "impossible" error */
s390_replace_asce(kvm->arch.gmap);
}
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
/* Inteded memory leak on "impossible" error */
if (!cc)
kvm_s390_pv_dealloc_vm(kvm);
return cc ? -EIO : 0;
}
static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
struct mm_struct *mm)
{
struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier);
u16 dummy;
/*
* No locking is needed since this is the last thread of the last user of this
* struct mm.
* When the struct kvm gets deinitialized, this notifier is also
* unregistered. This means that if this notifier runs, then the
* struct kvm is still valid.
*/
kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
}
static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
.release = kvm_s390_pv_mmu_notifier_release,
};
int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
{
struct uv_cb_cgc uvcb = {
......@@ -198,14 +239,22 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
/* Outputs */
kvm->arch.pv.handle = uvcb.guest_handle;
atomic_inc(&kvm->mm->context.protected_count);
if (cc) {
if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
else
} else {
atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
}
return -EIO;
}
kvm->arch.gmap->guest_handle = uvcb.guest_handle;
/* Add the notifier only once. No races because we hold kvm->lock */
if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) {
kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops;
mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm);
}
return 0;
}
......@@ -225,8 +274,6 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
*rrc = uvcb.header.rrc;
KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x",
*rc, *rrc);
if (!cc)
atomic_set(&kvm->mm->context.is_protected, 1);
return cc ? -EINVAL : 0;
}
......
......@@ -480,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu)
struct kvm_vcpu *dest_vcpu;
u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL);
trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
if (order_code == SIGP_EXTERNAL_CALL) {
trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr);
dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr);
BUG_ON(dest_vcpu == NULL);
......
......@@ -503,6 +503,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* Host-protection-interruption introduced with ESOP */
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT;
/*
* CPU Topology
* This facility only uses the utility field of the SCA and none of
* the cpu entries that are problematic with the other interpretation
* facilities so we can pass it through
*/
if (test_kvm_facility(vcpu->kvm, 11))
scb_s->ecb |= scb_o->ecb & ECB_PTF;
/* transactional execution */
if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) {
/* remap the prefix is tx is toggled on */
......
......@@ -754,6 +754,7 @@ void do_secure_storage_access(struct pt_regs *regs)
struct vm_area_struct *vma;
struct mm_struct *mm;
struct page *page;
struct gmap *gmap;
int rc;
/*
......@@ -783,6 +784,17 @@ void do_secure_storage_access(struct pt_regs *regs)
}
switch (get_fault_type(regs)) {
case GMAP_FAULT:
mm = current->mm;
gmap = (struct gmap *)S390_lowcore.gmap;
mmap_read_lock(mm);
addr = __gmap_translate(gmap, addr);
mmap_read_unlock(mm);
if (IS_ERR_VALUE(addr)) {
do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP);
break;
}
fallthrough;
case USER_FAULT:
mm = current->mm;
mmap_read_lock(mm);
......@@ -811,7 +823,6 @@ void do_secure_storage_access(struct pt_regs *regs)
if (rc)
BUG();
break;
case GMAP_FAULT:
default:
do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
WARN_ON_ONCE(1);
......@@ -837,6 +848,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access);
void do_secure_storage_violation(struct pt_regs *regs)
{
unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
/*
* If the VM has been rebooted, its address space might still contain
* secure pages from the previous boot.
* Clear the page so it can be reused.
*/
if (!gmap_destroy_page(gmap, gaddr))
return;
/*
* Either KVM messed up the secure guest mapping or the same
* page is mapped into multiple secure guests.
......
......@@ -2697,41 +2697,168 @@ void s390_reset_cmma(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
#define GATHER_GET_PAGES 32
struct reset_walk_state {
unsigned long next;
unsigned long count;
unsigned long pfns[GATHER_GET_PAGES];
};
static int s390_gather_pages(pte_t *ptep, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct reset_walk_state *p = walk->private;
pte_t pte = READ_ONCE(*ptep);
if (pte_present(pte)) {
/* we have a reference from the mapping, take an extra one */
get_page(phys_to_page(pte_val(pte)));
p->pfns[p->count] = phys_to_pfn(pte_val(pte));
p->next = next;
p->count++;
}
return p->count >= GATHER_GET_PAGES;
}
static const struct mm_walk_ops gather_pages_ops = {
.pte_entry = s390_gather_pages,
};
/*
* make inaccessible pages accessible again
* Call the Destroy secure page UVC on each page in the given array of PFNs.
* Each page needs to have an extra reference, which will be released here.
*/
static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
unsigned long next, struct mm_walk *walk)
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
pte_t pte = READ_ONCE(*ptep);
unsigned long i;
/* There is a reference through the mapping */
if (pte_present(pte))
WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK));
for (i = 0; i < count; i++) {
/* we always have an extra reference */
uv_destroy_owned_page(pfn_to_phys(pfns[i]));
/* get rid of the extra reference */
put_page(pfn_to_page(pfns[i]));
cond_resched();
}
}
EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
/**
* __s390_uv_destroy_range - Call the destroy secure page UVC on each page
* in the given range of the given address space.
* @mm: the mm to operate on
* @start: the start of the range
* @end: the end of the range
* @interruptible: if not 0, stop when a fatal signal is received
*
* Walk the given range of the given address space and call the destroy
* secure page UVC on each page. Optionally exit early if a fatal signal is
* pending.
*
* Return: 0 on success, -EINTR if the function stopped before completing
*/
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool interruptible)
{
struct reset_walk_state state = { .next = start };
int r = 1;
while (r > 0) {
state.count = 0;
mmap_read_lock(mm);
r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
mmap_read_unlock(mm);
cond_resched();
s390_uv_destroy_pfns(state.count, state.pfns);
if (interruptible && fatal_signal_pending(current))
return -EINTR;
}
return 0;
}
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
static const struct mm_walk_ops reset_acc_walk_ops = {
.pte_entry = __s390_reset_acc,
};
/**
* s390_unlist_old_asce - Remove the topmost level of page tables from the
* list of page tables of the gmap.
* @gmap: the gmap whose table is to be removed
*
* On s390x, KVM keeps a list of all pages containing the page tables of the
* gmap (the CRST list). This list is used at tear down time to free all
* pages that are now not needed anymore.
*
* This function removes the topmost page of the tree (the one pointed to by
* the ASCE) from the CRST list.
*
* This means that it will not be freed when the VM is torn down, and needs
* to be handled separately by the caller, unless a leak is actually
* intended. Notice that this function will only remove the page from the
* list, the page will still be used as a top level page table (and ASCE).
*/
void s390_unlist_old_asce(struct gmap *gmap)
{
struct page *old;
#include <linux/sched/mm.h>
void s390_reset_acc(struct mm_struct *mm)
old = virt_to_page(gmap->table);
spin_lock(&gmap->guest_table_lock);
list_del(&old->lru);
/*
* Sometimes the topmost page might need to be "removed" multiple
* times, for example if the VM is rebooted into secure mode several
* times concurrently, or if s390_replace_asce fails after calling
* s390_remove_old_asce and is attempted again later. In that case
* the old asce has been removed from the list, and therefore it
* will not be freed when the VM terminates, but the ASCE is still
* in use and still pointed to.
* A subsequent call to replace_asce will follow the pointer and try
* to remove the same page from the list again.
* Therefore it's necessary that the page of the ASCE has valid
* pointers, so list_del can work (and do nothing) without
* dereferencing stale or invalid pointers.
*/
INIT_LIST_HEAD(&old->lru);
spin_unlock(&gmap->guest_table_lock);
}
EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
/**
* s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
* @gmap: the gmap whose ASCE needs to be replaced
*
* If the allocation of the new top level page table fails, the ASCE is not
* replaced.
* In any case, the old ASCE is always removed from the gmap CRST list.
* Therefore the caller has to make sure to save a pointer to it
* beforehand, unless a leak is actually intended.
*/
int s390_replace_asce(struct gmap *gmap)
{
if (!mm_is_protected(mm))
return;
unsigned long asce;
struct page *page;
void *table;
s390_unlist_old_asce(gmap);
page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
table = page_to_virt(page);
memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
/*
* we might be called during
* reset: we walk the pages and clear
* close of all kvm file descriptors: we walk the pages and clear
* exit of process on fd closure: vma already gone, do nothing
* The caller has to deal with the old ASCE, but here we make sure
* the new one is properly added to the CRST list, so that
* it will be freed when the VM is torn down.
*/
if (!mmget_not_zero(mm))
return;
mmap_read_lock(mm);
walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
mmap_read_unlock(mm);
mmput(mm);
spin_lock(&gmap->guest_table_lock);
list_add(&page->lru, &gmap->crst_list);
spin_unlock(&gmap->guest_table_lock);
/* Set new table origin while preserving existing ASCE control bits */
asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
WRITE_ONCE(gmap->asce, asce);
WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
WRITE_ONCE(gmap->table, table);
return 0;
}
EXPORT_SYMBOL_GPL(s390_reset_acc);
EXPORT_SYMBOL_GPL(s390_replace_asce);
......@@ -61,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio);
static struct kmem_cache *zdev_fmb_cache;
/* AEN structures that must be preserved over KVM module re-insertion */
union zpci_sic_iib *zpci_aipb;
EXPORT_SYMBOL_GPL(zpci_aipb);
struct airq_iv *zpci_aif_sbv;
EXPORT_SYMBOL_GPL(zpci_aif_sbv);
struct zpci_dev *get_zdev_by_fid(u32 fid)
{
struct zpci_dev *tmp, *zdev = NULL;
......@@ -120,11 +126,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas,
fib.pba = base;
fib.pal = limit;
fib.iota = iota | ZPCI_IOTA_RTTO_FLAG;
fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc)
zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
return cc;
}
EXPORT_SYMBOL_GPL(zpci_register_ioat);
/* Modify PCI: Unregister I/O address translation parameters */
int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
......@@ -133,6 +141,8 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas)
struct zpci_fib fib = {0};
u8 cc, status;
fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc)
zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status);
......@@ -160,6 +170,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev)
atomic64_set(&zdev->unmapped_pages, 0);
fib.fmb_addr = virt_to_phys(zdev->fmb);
fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc) {
kmem_cache_free(zdev_fmb_cache, zdev->fmb);
......@@ -178,6 +189,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev)
if (!zdev->fmb)
return -EINVAL;
fib.gd = zdev->gisa;
/* Function measurement is disabled if fmb address is zero */
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3) /* Function already gone. */
......@@ -700,6 +713,7 @@ int zpci_enable_device(struct zpci_dev *zdev)
zpci_update_fh(zdev, fh);
return rc;
}
EXPORT_SYMBOL_GPL(zpci_enable_device);
int zpci_disable_device(struct zpci_dev *zdev)
{
......@@ -723,6 +737,7 @@ int zpci_disable_device(struct zpci_dev *zdev)
}
return rc;
}
EXPORT_SYMBOL_GPL(zpci_disable_device);
/**
* zpci_hot_reset_device - perform a reset of the given zPCI function
......@@ -816,6 +831,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
kref_init(&zdev->kref);
mutex_init(&zdev->lock);
mutex_init(&zdev->kzdev_lock);
rc = zpci_init_iommu(zdev);
if (rc)
......
......@@ -106,6 +106,8 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev,
zdev->max_msi = response->noi;
zdev->fmb_update = response->mui;
zdev->version = response->version;
zdev->maxstbl = response->maxstbl;
zdev->dtsm = response->dtsm;
switch (response->version) {
case 1:
......@@ -229,12 +231,16 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma
{
struct clp_req_rsp_set_pci *rrb;
int rc, retries = 100;
u32 gisa = 0;
*fh = 0;
rrb = clp_alloc_block(GFP_KERNEL);
if (!rrb)
return -ENOMEM;
if (command != CLP_SET_DISABLE_PCI_FN)
gisa = zdev->gisa;
do {
memset(rrb, 0, sizeof(*rrb));
rrb->request.hdr.len = sizeof(rrb->request);
......@@ -243,6 +249,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma
rrb->request.fh = zdev->fh;
rrb->request.oc = command;
rrb->request.ndas = nr_dma_as;
rrb->request.gisa = gisa;
rc = clp_req(rrb, CLP_LPS_PCI);
if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) {
......
......@@ -92,6 +92,7 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
return cc;
}
EXPORT_SYMBOL_GPL(zpci_mod_fc);
/* Refresh PCI Translations */
static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
......@@ -138,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
}
/* Set Interruption Controls */
int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
{
if (!test_facility(72))
return -EIO;
......@@ -149,6 +150,7 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib)
return 0;
}
EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl);
/* PCI Load */
static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
......
......@@ -11,16 +11,10 @@
#include <asm/isc.h>
#include <asm/airq.h>
#include <asm/tpi.h>
static enum {FLOATING, DIRECTED} irq_delivery;
#define SIC_IRQ_MODE_ALL 0
#define SIC_IRQ_MODE_SINGLE 1
#define SIC_IRQ_MODE_DIRECT 4
#define SIC_IRQ_MODE_D_ALL 16
#define SIC_IRQ_MODE_D_SINGLE 17
#define SIC_IRQ_MODE_SET_CPU 18
/*
* summary bit vector
* FLOATING - summary bit per function
......@@ -49,6 +43,7 @@ static int zpci_set_airq(struct zpci_dev *zdev)
fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */
fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8;
fib.fmt0.aisbo = zdev->aisb & 63;
fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
......@@ -60,6 +55,8 @@ static int zpci_clear_airq(struct zpci_dev *zdev)
struct zpci_fib fib = {0};
u8 cc, status;
fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
......@@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev)
fib.fmt = 1;
fib.fmt1.noi = zdev->msi_nr_irqs;
fib.fmt1.dibvo = zdev->msi_first_bit;
fib.gd = zdev->gisa;
return zpci_mod_fc(req, &fib, &status) ? -EIO : 0;
}
......@@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev)
u8 cc, status;
fib.fmt = 1;
fib.gd = zdev->gisa;
cc = zpci_mod_fc(req, &fib, &status);
if (cc == 3 || (cc == 1 && status == 24))
/* Function already gone or IRQs already deregistered. */
......@@ -153,6 +152,7 @@ static struct irq_chip zpci_irq_chip = {
static void zpci_handle_cpu_local_irq(bool rescan)
{
struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
union zpci_sic_iib iib = {{0}};
unsigned long bit;
int irqs_on = 0;
......@@ -164,7 +164,7 @@ static void zpci_handle_cpu_local_irq(bool rescan)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC))
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib))
break;
bit = 0;
continue;
......@@ -192,6 +192,7 @@ static void zpci_handle_remote_irq(void *data)
static void zpci_handle_fallback_irq(void)
{
struct cpu_irq_data *cpu_data;
union zpci_sic_iib iib = {{0}};
unsigned long cpu;
int irqs_on = 0;
......@@ -202,7 +203,7 @@ static void zpci_handle_fallback_irq(void)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
cpu = 0;
continue;
......@@ -216,8 +217,11 @@ static void zpci_handle_fallback_irq(void)
}
}
static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
static void zpci_directed_irq_handler(struct airq_struct *airq,
struct tpi_info *tpi_info)
{
bool floating = !tpi_info->directed_irq;
if (floating) {
inc_irq_stat(IRQIO_PCF);
zpci_handle_fallback_irq();
......@@ -227,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating)
}
}
static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
static void zpci_floating_irq_handler(struct airq_struct *airq,
struct tpi_info *tpi_info)
{
union zpci_sic_iib iib = {{0}};
unsigned long si, ai;
struct airq_iv *aibv;
int irqs_on = 0;
......@@ -242,7 +248,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating)
/* End of second scan with interrupts on. */
break;
/* First scan complete, reenable interrupts. */
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC))
if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib))
break;
si = 0;
continue;
......@@ -291,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
zdev->aisb = bit;
/* Create adapter interrupt vector */
zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK);
zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL);
if (!zdev->aibv)
return -ENOMEM;
......@@ -402,11 +408,12 @@ static struct airq_struct zpci_airq = {
static void __init cpu_enable_directed_irq(void *unused)
{
union zpci_sic_iib iib = {{0}};
union zpci_sic_iib ziib = {{0}};
iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector;
__zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC);
zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib);
zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib);
}
static int __init zpci_directed_irq_init(void)
......@@ -414,14 +421,14 @@ static int __init zpci_directed_irq_init(void)
union zpci_sic_iib iib = {{0}};
unsigned int cpu;
zpci_sbv = airq_iv_create(num_possible_cpus(), 0);
zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL);
if (!zpci_sbv)
return -ENOMEM;
iib.diib.isc = PCI_ISC;
iib.diib.nr_cpus = num_possible_cpus();
iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector);
__zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib);
zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv),
GFP_KERNEL);
......@@ -436,7 +443,7 @@ static int __init zpci_directed_irq_init(void)
zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE,
AIRQ_IV_DATA |
AIRQ_IV_CACHELINE |
(!cpu ? AIRQ_IV_ALLOC : 0));
(!cpu ? AIRQ_IV_ALLOC : 0), NULL);
if (!zpci_ibv[cpu])
return -ENOMEM;
}
......@@ -453,7 +460,7 @@ static int __init zpci_floating_irq_init(void)
if (!zpci_ibv)
return -ENOMEM;
zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC);
zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL);
if (!zpci_sbv)
goto out_free;
......@@ -466,6 +473,7 @@ static int __init zpci_floating_irq_init(void)
int __init zpci_irq_init(void)
{
union zpci_sic_iib iib = {{0}};
int rc;
irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING;
......@@ -497,7 +505,7 @@ int __init zpci_irq_init(void)
* Enable floating IRQs (with suppression after one IRQ). When using
* directed IRQs this enables the fallback path.
*/
zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC);
zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib);
return 0;
out_airq:
......
......@@ -111,6 +111,7 @@ static struct facility_def facility_defs[] = {
193, /* bear enhancement facility */
194, /* rdp enhancement facility */
196, /* processor activity instrumentation facility */
197, /* processor activity instrumentation extension 1 */
-1 /* END */
}
},
......
......@@ -45,6 +45,10 @@ static void __init sclp_early_facilities_detect(void)
sclp.has_gisaf = !!(sccb->fac118 & 0x08);
sclp.has_hvs = !!(sccb->fac119 & 0x80);
sclp.has_kss = !!(sccb->fac98 & 0x01);
sclp.has_aisii = !!(sccb->fac118 & 0x40);
sclp.has_aeni = !!(sccb->fac118 & 0x20);
sclp.has_aisi = !!(sccb->fac118 & 0x10);
sclp.has_zpci_lsi = !!(sccb->fac118 & 0x01);
if (sccb->fac85 & 0x02)
S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;
if (sccb->fac91 & 0x40)
......
......@@ -99,7 +99,7 @@ static irqreturn_t do_airq_interrupt(int irq, void *dummy)
rcu_read_lock();
hlist_for_each_entry_rcu(airq, head, list)
if ((*airq->lsi_ptr & airq->lsi_mask) != 0)
airq->handler(airq, !tpi_info->directed_irq);
airq->handler(airq, tpi_info);
rcu_read_unlock();
return IRQ_HANDLED;
......@@ -122,10 +122,12 @@ static inline unsigned long iv_size(unsigned long bits)
* airq_iv_create - create an interrupt vector
* @bits: number of bits in the interrupt vector
* @flags: allocation flags
* @vec: pointer to pinned guest memory if AIRQ_IV_GUESTVEC
*
* Returns a pointer to an interrupt vector structure
*/
struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags)
struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags,
unsigned long *vec)
{
struct airq_iv *iv;
unsigned long size;
......@@ -146,6 +148,8 @@ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags)
&iv->vector_dma);
if (!iv->vector)
goto out_free;
} else if (flags & AIRQ_IV_GUESTVEC) {
iv->vector = vec;
} else {
iv->vector = cio_dma_zalloc(size);
if (!iv->vector)
......@@ -185,7 +189,7 @@ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags)
kfree(iv->avail);
if (iv->flags & AIRQ_IV_CACHELINE && iv->vector)
dma_pool_free(airq_iv_cache, iv->vector, iv->vector_dma);
else
else if (!(iv->flags & AIRQ_IV_GUESTVEC))
cio_dma_free(iv->vector, size);
kfree(iv);
out:
......@@ -204,7 +208,7 @@ void airq_iv_release(struct airq_iv *iv)
kfree(iv->bitlock);
if (iv->flags & AIRQ_IV_CACHELINE)
dma_pool_free(airq_iv_cache, iv->vector, iv->vector_dma);
else
else if (!(iv->flags & AIRQ_IV_GUESTVEC))
cio_dma_free(iv->vector, iv_size(iv->bits));
kfree(iv->avail);
kfree(iv);
......
......@@ -15,6 +15,7 @@
#include <asm/qdio.h>
#include <asm/airq.h>
#include <asm/isc.h>
#include <asm/tpi.h>
#include "cio.h"
#include "ioasm.h"
......@@ -93,9 +94,10 @@ static inline u32 clear_shared_ind(void)
/**
* tiqdio_thinint_handler - thin interrupt handler for qdio
* @airq: pointer to adapter interrupt descriptor
* @floating: flag to recognize floating vs. directed interrupts (unused)
* @tpi_info: interrupt information (e.g. floating vs directed -- unused)
*/
static void tiqdio_thinint_handler(struct airq_struct *airq, bool floating)
static void tiqdio_thinint_handler(struct airq_struct *airq,
struct tpi_info *tpi_info)
{
u64 irq_time = S390_lowcore.int_clock;
u32 si_used = clear_shared_ind();
......
......@@ -27,6 +27,7 @@
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <asm/airq.h>
#include <asm/tpi.h>
#include <linux/atomic.h>
#include <asm/isc.h>
#include <linux/hrtimer.h>
......@@ -131,7 +132,8 @@ static int ap_max_adapter_id = 63;
static struct bus_type ap_bus_type;
/* Adapter interrupt definitions */
static void ap_interrupt_handler(struct airq_struct *airq, bool floating);
static void ap_interrupt_handler(struct airq_struct *airq,
struct tpi_info *tpi_info);
static bool ap_irq_flag;
......@@ -452,9 +454,10 @@ static enum hrtimer_restart ap_poll_timeout(struct hrtimer *unused)
/**
* ap_interrupt_handler() - Schedule ap_tasklet on interrupt
* @airq: pointer to adapter interrupt descriptor
* @floating: ignored
* @tpi_info: ignored
*/
static void ap_interrupt_handler(struct airq_struct *airq, bool floating)
static void ap_interrupt_handler(struct airq_struct *airq,
struct tpi_info *tpi_info)
{
inc_irq_stat(IRQIO_APB);
tasklet_schedule(&ap_tasklet);
......
......@@ -33,6 +33,7 @@
#include <asm/virtio-ccw.h>
#include <asm/isc.h>
#include <asm/airq.h>
#include <asm/tpi.h>
/*
* virtio related functions
......@@ -204,7 +205,8 @@ static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info)
write_unlock_irqrestore(&info->lock, flags);
}
static void virtio_airq_handler(struct airq_struct *airq, bool floating)
static void virtio_airq_handler(struct airq_struct *airq,
struct tpi_info *tpi_info)
{
struct airq_info *info = container_of(airq, struct airq_info, airq);
unsigned long ai;
......@@ -240,7 +242,7 @@ static struct airq_info *new_airq_info(int index)
return NULL;
rwlock_init(&info->lock);
info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR
| AIRQ_IV_CACHELINE);
| AIRQ_IV_CACHELINE, NULL);
if (!info->aiv) {
kfree(info);
return NULL;
......
......@@ -44,6 +44,17 @@ config VFIO_PCI_IGD
To enable Intel IGD assignment through vfio-pci, say Y.
endif
config VFIO_PCI_ZDEV_KVM
bool "VFIO PCI extensions for s390x KVM passthrough"
depends on S390 && KVM
default y
help
Support s390x-specific extensions to enable support for enhancements
to KVM passthrough capabilities, such as interpretive execution of
zPCI instructions.
To enable s390x KVM vfio-pci extensions, say Y.
source "drivers/vfio/pci/mlx5/Kconfig"
source "drivers/vfio/pci/hisilicon/Kconfig"
......
# SPDX-License-Identifier: GPL-2.0-only
vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o
vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o
vfio-pci-y := vfio_pci.o
......
......@@ -316,10 +316,14 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
pci_write_config_word(pdev, PCI_COMMAND, cmd);
}
ret = vfio_config_init(vdev);
ret = vfio_pci_zdev_open_device(vdev);
if (ret)
goto out_free_state;
ret = vfio_config_init(vdev);
if (ret)
goto out_free_zdev;
msix_pos = pdev->msix_cap;
if (msix_pos) {
u16 flags;
......@@ -340,6 +344,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
return 0;
out_free_zdev:
vfio_pci_zdev_close_device(vdev);
out_free_state:
kfree(vdev->pci_saved_state);
vdev->pci_saved_state = NULL;
......@@ -418,6 +424,8 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
vdev->needs_reset = true;
vfio_pci_zdev_close_device(vdev);
/*
* If we have saved state, restore it. If we can reset the device,
* even better. Resetting with current state seems better than
......
......@@ -11,6 +11,7 @@
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vfio_zdev.h>
#include <linux/kvm_host.h>
#include <asm/pci_clp.h>
#include <asm/pci_io.h>
......@@ -23,14 +24,15 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
{
struct vfio_device_info_cap_zpci_base cap = {
.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
.header.version = 1,
.header.version = 2,
.start_dma = zdev->start_dma,
.end_dma = zdev->end_dma,
.pchid = zdev->pchid,
.vfn = zdev->vfn,
.fmb_length = zdev->fmb_length,
.pft = zdev->pft,
.gid = zdev->pfgid
.gid = zdev->pfgid,
.fh = zdev->fh
};
return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
......@@ -43,14 +45,16 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
{
struct vfio_device_info_cap_zpci_group cap = {
.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
.header.version = 1,
.header.version = 2,
.dasm = zdev->dma_mask,
.msi_addr = zdev->msi_addr,
.flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH,
.mui = zdev->fmb_update,
.noi = zdev->max_msi,
.maxstbl = ZPCI_MAX_WRITE_SIZE,
.version = zdev->version
.version = zdev->version,
.reserved = 0,
.imaxstbl = zdev->maxstbl
};
return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
......@@ -136,3 +140,26 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
return ret;
}
int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
{
struct zpci_dev *zdev = to_zpci(vdev->pdev);
if (!zdev)
return -ENODEV;
if (!vdev->vdev.kvm)
return 0;
return kvm_s390_pci_register_kvm(zdev, vdev->vdev.kvm);
}
void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
{
struct zpci_dev *zdev = to_zpci(vdev->pdev);
if (!zdev || !vdev->vdev.kvm)
return;
kvm_s390_pci_unregister_kvm(zdev);
}
......@@ -24,7 +24,8 @@ struct user_struct {
kuid_t uid;
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
defined(CONFIG_NET) || defined(CONFIG_IO_URING)
defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \
defined(CONFIG_VFIO_PCI_ZDEV_KVM)
atomic_long_t locked_vm;
#endif
#ifdef CONFIG_WATCH_QUEUE
......
......@@ -206,15 +206,25 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
}
#endif
#ifdef CONFIG_S390
#ifdef CONFIG_VFIO_PCI_ZDEV_KVM
extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
struct vfio_info_cap *caps);
int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev);
void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev);
#else
static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
struct vfio_info_cap *caps)
{
return -ENODEV;
}
static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
{
return 0;
}
static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
{}
#endif
/* Will be exported for vfio pci drivers usage */
......
......@@ -1167,6 +1167,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218
#define KVM_CAP_X86_NOTIFY_VMEXIT 219
#define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220
#define KVM_CAP_S390_ZPCI_OP 221
#define KVM_CAP_S390_CPU_TOPOLOGY 222
#ifdef KVM_CAP_IRQ_ROUTING
......@@ -2186,4 +2188,34 @@ struct kvm_stats_desc {
#define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0)
#define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1)
/* Available with KVM_CAP_S390_ZPCI_OP */
#define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op)
struct kvm_s390_zpci_op {
/* in */
__u32 fh; /* target device */
__u8 op; /* operation to perform */
__u8 pad[3];
union {
/* for KVM_S390_ZPCIOP_REG_AEN */
struct {
__u64 ibv; /* Guest addr of interrupt bit vector */
__u64 sb; /* Guest addr of summary bit */
__u32 flags;
__u32 noi; /* Number of interrupts */
__u8 isc; /* Guest interrupt subclass */
__u8 sbo; /* Offset of guest summary bit vector */
__u16 pad;
} reg_aen;
__u64 reserved[8];
} u;
};
/* types for kvm_s390_zpci_op->op */
#define KVM_S390_ZPCIOP_REG_AEN 0
#define KVM_S390_ZPCIOP_DEREG_AEN 1
/* flags for kvm_s390_zpci_op->u.reg_aen.flags */
#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0)
#endif /* __LINUX_KVM_H */
......@@ -29,6 +29,9 @@ struct vfio_device_info_cap_zpci_base {
__u16 fmb_length; /* Measurement Block Length (in bytes) */
__u8 pft; /* PCI Function Type */
__u8 gid; /* PCI function group ID */
/* End of version 1 */
__u32 fh; /* PCI function handle */
/* End of version 2 */
};
/**
......@@ -47,6 +50,10 @@ struct vfio_device_info_cap_zpci_group {
__u16 noi; /* Maximum number of MSIs */
__u16 maxstbl; /* Maximum Store Block Length */
__u8 version; /* Supported PCI Version */
/* End of version 1 */
__u8 reserved;
__u16 imaxstbl; /* Maximum Interpreted Store Block Length */
/* End of version 2 */
};
/**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment