Commit e5992f2e authored by Martin Schwidefsky's avatar Martin Schwidefsky

[S390] kvm guest address space mapping

Add code that allows KVM to control the virtual memory layout that
is seen by a guest. The guest address space uses a second page table
that shares the last level pte-tables with the process page table.
If a page is unmapped from the process page table it is automatically
unmapped from the guest page table as well.

The guest address space mapping starts out empty, KVM can map any
individual 1MB segments from the process virtual memory to any 1MB
aligned location in the guest virtual memory. If a target segment in
the process virtual memory does not exist or is unmapped while a
guest mapping exists the desired target address is stored as an
invalid segment table entry in the guest page table.
The population of the guest page table is fault driven.
Signed-off-by: default avatarMartin Schwidefsky <schwidefsky@de.ibm.com>
parent 144d634a
...@@ -268,7 +268,7 @@ struct _lowcore { ...@@ -268,7 +268,7 @@ struct _lowcore {
__u64 vdso_per_cpu_data; /* 0x0358 */ __u64 vdso_per_cpu_data; /* 0x0358 */
__u64 machine_flags; /* 0x0360 */ __u64 machine_flags; /* 0x0360 */
__u64 ftrace_func; /* 0x0368 */ __u64 ftrace_func; /* 0x0368 */
__u64 sie_hook; /* 0x0370 */ __u64 gmap; /* 0x0370 */
__u64 cmf_hpp; /* 0x0378 */ __u64 cmf_hpp; /* 0x0378 */
/* Interrupt response block. */ /* Interrupt response block. */
......
...@@ -6,6 +6,7 @@ typedef struct { ...@@ -6,6 +6,7 @@ typedef struct {
unsigned int flush_mm; unsigned int flush_mm;
spinlock_t list_lock; spinlock_t list_lock;
struct list_head pgtable_list; struct list_head pgtable_list;
struct list_head gmap_list;
unsigned long asce_bits; unsigned long asce_bits;
unsigned long asce_limit; unsigned long asce_limit;
unsigned long vdso_base; unsigned long vdso_base;
...@@ -17,6 +18,7 @@ typedef struct { ...@@ -17,6 +18,7 @@ typedef struct {
#define INIT_MM_CONTEXT(name) \ #define INIT_MM_CONTEXT(name) \
.context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \ .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
#endif #endif
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
unsigned long *crst_table_alloc(struct mm_struct *); unsigned long *crst_table_alloc(struct mm_struct *);
void crst_table_free(struct mm_struct *, unsigned long *); void crst_table_free(struct mm_struct *, unsigned long *);
unsigned long *page_table_alloc(struct mm_struct *); unsigned long *page_table_alloc(struct mm_struct *, unsigned long);
void page_table_free(struct mm_struct *, unsigned long *); void page_table_free(struct mm_struct *, unsigned long *);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE #ifdef CONFIG_HAVE_RCU_TABLE_FREE
void page_table_free_rcu(struct mmu_gather *, unsigned long *); void page_table_free_rcu(struct mmu_gather *, unsigned long *);
...@@ -115,6 +115,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) ...@@ -115,6 +115,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{ {
spin_lock_init(&mm->context.list_lock); spin_lock_init(&mm->context.list_lock);
INIT_LIST_HEAD(&mm->context.pgtable_list); INIT_LIST_HEAD(&mm->context.pgtable_list);
INIT_LIST_HEAD(&mm->context.gmap_list);
return (pgd_t *) crst_table_alloc(mm); return (pgd_t *) crst_table_alloc(mm);
} }
#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd) #define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd)
...@@ -133,8 +134,8 @@ static inline void pmd_populate(struct mm_struct *mm, ...@@ -133,8 +134,8 @@ static inline void pmd_populate(struct mm_struct *mm,
/* /*
* page table entry allocation/free routines. * page table entry allocation/free routines.
*/ */
#define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) #define pte_alloc_one_kernel(mm, vmaddr) ((pte_t *) page_table_alloc(mm, vmaddr))
#define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm)) #define pte_alloc_one(mm, vmaddr) ((pte_t *) page_table_alloc(mm, vmaddr))
#define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte)
#define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte)
......
...@@ -654,6 +654,48 @@ static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste) ...@@ -654,6 +654,48 @@ static inline void pgste_set_pte(pte_t *ptep, pgste_t pgste)
#endif #endif
} }
/**
* struct gmap_struct - guest address space
* @mm: pointer to the parent mm_struct
* @table: pointer to the page directory
* @crst_list: list of all crst tables used in the guest address space
*/
struct gmap {
struct list_head list;
struct mm_struct *mm;
unsigned long *table;
struct list_head crst_list;
};
/**
* struct gmap_rmap - reverse mapping for segment table entries
* @next: pointer to the next gmap_rmap structure in the list
* @entry: pointer to a segment table entry
*/
struct gmap_rmap {
struct list_head list;
unsigned long *entry;
};
/**
* struct gmap_pgtable - gmap information attached to a page table
* @vmaddr: address of the 1MB segment in the process virtual memory
* @mapper: list of segment table entries maping a page table
*/
struct gmap_pgtable {
unsigned long vmaddr;
struct list_head mapper;
};
struct gmap *gmap_alloc(struct mm_struct *mm);
void gmap_free(struct gmap *gmap);
void gmap_enable(struct gmap *gmap);
void gmap_disable(struct gmap *gmap);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long length);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
unsigned long gmap_fault(unsigned long address, struct gmap *);
/* /*
* Certain architectures need to do special things when PTEs * Certain architectures need to do special things when PTEs
* within a page table are directly modified. Thus, the following * within a page table are directly modified. Thus, the following
......
...@@ -80,6 +80,7 @@ struct thread_struct { ...@@ -80,6 +80,7 @@ struct thread_struct {
mm_segment_t mm_segment; mm_segment_t mm_segment;
unsigned long prot_addr; /* address of protection-excep. */ unsigned long prot_addr; /* address of protection-excep. */
unsigned int trap_no; unsigned int trap_no;
unsigned long gmap_addr; /* address of last gmap fault. */
struct per_regs per_user; /* User specified PER registers */ struct per_regs per_user; /* User specified PER registers */
struct per_event per_event; /* Cause of the last PER trap */ struct per_event per_event; /* Cause of the last PER trap */
/* pfault_wait is used to block the process on a pfault event */ /* pfault_wait is used to block the process on a pfault event */
......
...@@ -80,7 +80,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm) ...@@ -80,7 +80,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm)
* on all cpus instead of doing a local flush if the mm * on all cpus instead of doing a local flush if the mm
* only ran on the local cpu. * only ran on the local cpu.
*/ */
if (MACHINE_HAS_IDTE) if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list))
__tlb_flush_idte((unsigned long) mm->pgd | __tlb_flush_idte((unsigned long) mm->pgd |
mm->context.asce_bits); mm->context.asce_bits);
else else
......
...@@ -151,7 +151,7 @@ int main(void) ...@@ -151,7 +151,7 @@ int main(void)
DEFINE(__LC_FP_CREG_SAVE_AREA, offsetof(struct _lowcore, fpt_creg_save_area)); DEFINE(__LC_FP_CREG_SAVE_AREA, offsetof(struct _lowcore, fpt_creg_save_area));
DEFINE(__LC_LAST_BREAK, offsetof(struct _lowcore, breaking_event_addr)); DEFINE(__LC_LAST_BREAK, offsetof(struct _lowcore, breaking_event_addr));
DEFINE(__LC_VDSO_PER_CPU, offsetof(struct _lowcore, vdso_per_cpu_data)); DEFINE(__LC_VDSO_PER_CPU, offsetof(struct _lowcore, vdso_per_cpu_data));
DEFINE(__LC_SIE_HOOK, offsetof(struct _lowcore, sie_hook)); DEFINE(__LC_GMAP, offsetof(struct _lowcore, gmap));
DEFINE(__LC_CMF_HPP, offsetof(struct _lowcore, cmf_hpp)); DEFINE(__LC_CMF_HPP, offsetof(struct _lowcore, cmf_hpp));
#endif /* CONFIG_32BIT */ #endif /* CONFIG_32BIT */
return 0; return 0;
......
...@@ -303,9 +303,24 @@ static inline int do_exception(struct pt_regs *regs, int access, ...@@ -303,9 +303,24 @@ static inline int do_exception(struct pt_regs *regs, int access,
flags = FAULT_FLAG_ALLOW_RETRY; flags = FAULT_FLAG_ALLOW_RETRY;
if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
flags |= FAULT_FLAG_WRITE; flags |= FAULT_FLAG_WRITE;
retry:
down_read(&mm->mmap_sem); down_read(&mm->mmap_sem);
#ifdef CONFIG_PGSTE
if (test_tsk_thread_flag(current, TIF_SIE) && S390_lowcore.gmap) {
address = gmap_fault(address,
(struct gmap *) S390_lowcore.gmap);
if (address == -EFAULT) {
fault = VM_FAULT_BADMAP;
goto out_up;
}
if (address == -ENOMEM) {
fault = VM_FAULT_OOM;
goto out_up;
}
}
#endif
retry:
fault = VM_FAULT_BADMAP; fault = VM_FAULT_BADMAP;
vma = find_vma(mm, address); vma = find_vma(mm, address);
if (!vma) if (!vma)
...@@ -356,6 +371,7 @@ static inline int do_exception(struct pt_regs *regs, int access, ...@@ -356,6 +371,7 @@ static inline int do_exception(struct pt_regs *regs, int access,
/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
* of starvation. */ * of starvation. */
flags &= ~FAULT_FLAG_ALLOW_RETRY; flags &= ~FAULT_FLAG_ALLOW_RETRY;
down_read(&mm->mmap_sem);
goto retry; goto retry;
} }
} }
......
...@@ -35,7 +35,7 @@ int arch_prepare_hugepage(struct page *page) ...@@ -35,7 +35,7 @@ int arch_prepare_hugepage(struct page *page)
if (MACHINE_HAS_HPAGE) if (MACHINE_HAS_HPAGE)
return 0; return 0;
ptep = (pte_t *) pte_alloc_one(&init_mm, address); ptep = (pte_t *) pte_alloc_one(&init_mm, addr);
if (!ptep) if (!ptep)
return -ENOMEM; return -ENOMEM;
......
This diff is collapsed.
...@@ -61,12 +61,12 @@ static inline pmd_t *vmem_pmd_alloc(void) ...@@ -61,12 +61,12 @@ static inline pmd_t *vmem_pmd_alloc(void)
return pmd; return pmd;
} }
static pte_t __ref *vmem_pte_alloc(void) static pte_t __ref *vmem_pte_alloc(unsigned long address)
{ {
pte_t *pte; pte_t *pte;
if (slab_is_available()) if (slab_is_available())
pte = (pte_t *) page_table_alloc(&init_mm); pte = (pte_t *) page_table_alloc(&init_mm, address);
else else
pte = alloc_bootmem(PTRS_PER_PTE * sizeof(pte_t)); pte = alloc_bootmem(PTRS_PER_PTE * sizeof(pte_t));
if (!pte) if (!pte)
...@@ -120,7 +120,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro) ...@@ -120,7 +120,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
} }
#endif #endif
if (pmd_none(*pm_dir)) { if (pmd_none(*pm_dir)) {
pt_dir = vmem_pte_alloc(); pt_dir = vmem_pte_alloc(address);
if (!pt_dir) if (!pt_dir)
goto out; goto out;
pmd_populate(&init_mm, pm_dir, pt_dir); pmd_populate(&init_mm, pm_dir, pt_dir);
...@@ -205,7 +205,7 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node) ...@@ -205,7 +205,7 @@ int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
pm_dir = pmd_offset(pu_dir, address); pm_dir = pmd_offset(pu_dir, address);
if (pmd_none(*pm_dir)) { if (pmd_none(*pm_dir)) {
pt_dir = vmem_pte_alloc(); pt_dir = vmem_pte_alloc(address);
if (!pt_dir) if (!pt_dir)
goto out; goto out;
pmd_populate(&init_mm, pm_dir, pt_dir); pmd_populate(&init_mm, pm_dir, pt_dir);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment