Commit 1d86b5cc authored by Avi Kivity

Merge branch 'queue' into next

* queue:
  KVM: MMU: Eliminate pointless temporary 'ac'
  KVM: MMU: Avoid access/dirty update loop if all is well
  KVM: MMU: Eliminate eperm temporary
  KVM: MMU: Optimize is_last_gpte()
  KVM: MMU: Simplify walk_addr_generic() loop
  KVM: MMU: Optimize pte permission checks
  KVM: MMU: Update accessed and dirty bits after guest pagetable walk
  KVM: MMU: Move gpte_access() out of paging_tmpl.h
  KVM: MMU: Optimize gpte_access() slightly
  KVM: MMU: Push clean gpte write protection out of gpte_access()
  KVM: clarify kvmclock documentation
  KVM: make processes waiting on vcpu mutex killable
  KVM: SVM: Make use of asm.h
  KVM: VMX: Make use of asm.h
  KVM: VMX: Make lto-friendly
Signed-off-by: Avi Kivity <avi@redhat.com>
parents ecba9a52 c5421519
......@@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00
time information and check that they are both equal and even.
An odd version indicates an in-progress update.
sec: number of seconds for wallclock.
sec: number of seconds for wallclock at time of boot.
nsec: number of nanoseconds for wallclock.
nsec: number of nanoseconds for wallclock at time of boot.
In order to get the current wallclock time, the system_time from
MSR_KVM_SYSTEM_TIME_NEW needs to be added.
Note that although MSRs are per-CPU entities, the effect of this
particular MSR is global.
......@@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01
time at the time this structure was last updated. Unit is
nanoseconds.
tsc_to_system_mul: a function of the tsc frequency. One has
to multiply any tsc-related quantity by this value to get
a value in nanoseconds, besides dividing by 2^tsc_shift
tsc_to_system_mul: multiplier to be used when converting
tsc-related quantity to nanoseconds
tsc_shift: cycle to nanosecond divider, as a power of two, to
allow for shift rights. One has to shift right any tsc-related
quantity by this value to get a value in nanoseconds, besides
multiplying by tsc_to_system_mul.
tsc_shift: shift to be used when converting tsc-related
quantity to nanoseconds. This shift will ensure that
multiplication with tsc_to_system_mul does not overflow.
A positive value denotes a left shift, a negative value
a right shift.
With this information, guests can derive per-CPU time by
doing:
The conversion from tsc to nanoseconds involves an additional
right shift by 32 bits. With this information, guests can
derive per-CPU time by doing:
time = (current_tsc - tsc_timestamp)
time = (time * tsc_to_system_mul) >> tsc_shift
if (tsc_shift >= 0)
time <<= tsc_shift;
else
time >>= -tsc_shift;
time = (time * tsc_to_system_mul) >> 32
time = time + system_time
flags: bits in this field indicate extended capabilities
......
......@@ -287,10 +287,24 @@ struct kvm_mmu {
union kvm_mmu_page_role base_role;
bool direct_map;
/*
* Bitmap; bit set = permission fault
* Byte index: page fault error code [4:1]
* Bit index: pte permissions in ACC_* format
*/
u8 permissions[16];
u64 *pae_root;
u64 *lm_root;
u64 rsvd_bits_mask[2][4];
/*
* Bitmap: bit set = last pte in walk
* index[0:1]: level (zero-based)
* index[2]: pte.ps
*/
u8 last_pte_bitmap;
bool nx;
u64 pdptrs[4]; /* pae */
......
......@@ -3408,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}
/*
 * Drop write permission from *access unless the guest pte is already dirty.
 *
 * The gpte's dirty bit is shifted down onto the writable-bit position so that
 * it can serve directly as the "keep ACC_WRITE_MASK" bit of the mask.  This
 * only works because PT_WRITABLE_MASK and ACC_WRITE_MASK are the same bit,
 * which is enforced at build time below.
 *
 * @access: in/out permission set in ACC_* format
 * @gpte:   guest pte (only the low bits, including the dirty bit, are used)
 */
static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
{
	unsigned dirty_as_write;

	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);

	/* Dirty bit relocated to the writable-bit position. */
	dirty_as_write = (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK;

	/* Every bit survives except write, which survives only for dirty gptes. */
	*access &= (unsigned)~ACC_WRITE_MASK | dirty_as_write;
}
static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
int *nr_present)
{
......@@ -3425,6 +3437,25 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
return false;
}
/*
 * Translate a guest pte into a permission set in ACC_* format.
 *
 * The writable and user bits are copied straight from the gpte and execute
 * permission is granted by default.  The gpte's bit PT64_NX_SHIFT is then
 * shifted down to bit 0 and inverted, so a set NX bit clears bit 0 of the
 * result (where ACC_EXEC_MASK is expected to live -- confirm against its
 * definition) while leaving every other bit untouched.
 *
 * @vcpu is not used by this implementation.
 */
static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte)
{
	unsigned perms = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;

	return perms & ~(gpte >> PT64_NX_SHIFT);
}
/*
 * Does @gpte, found at paging @level, terminate the guest page table walk?
 *
 * The answer is a single lookup in mmu->last_pte_bitmap.  The bit number is
 * built from the zero-based level in bits [1:0] and the gpte's page-size bit
 * moved down to bit 2.
 */
static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
{
	/* pte.ps relocated from PT_PAGE_SIZE_SHIFT to bit 2 of the index. */
	unsigned ps_bit = (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
	unsigned slot = (level - 1) | ps_bit;

	return mmu->last_pte_bitmap & (1 << slot);
}
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
......@@ -3494,6 +3525,56 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
}
/*
 * Rebuild mmu->permissions[], the table that permission_fault() indexes.
 *
 * Layout (see struct kvm_mmu):
 *   - byte index: page fault error code bits [4:1]
 *   - bit index:  pte permissions in ACC_* format
 *   - bit set:    that access with those permissions is a permission fault
 *
 * All control-register and mmu state the table depends on (CR4.SMEP, CR0.WP
 * and mmu->nx) is sampled once up front; none of it can change while the
 * table is being filled in, so there is no need to re-read it for each of
 * the 16 * 8 entries.
 */
static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
	unsigned bit, byte, pfec;
	u8 map;
	bool fault, x, w, u, wf, uf, ff;
	const bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
	const bool cr0_wp = is_write_protection(vcpu);
	const bool nx = mmu->nx;

	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
		/* The byte index drops pfec bit 0, so shift it back in. */
		pfec = byte << 1;
		map = 0;
		wf = pfec & PFERR_WRITE_MASK;
		uf = pfec & PFERR_USER_MASK;
		ff = pfec & PFERR_FETCH_MASK;
		for (bit = 0; bit < 8; ++bit) {
			/* 'bit' doubles as the ACC_* permission combination. */
			x = bit & ACC_EXEC_MASK;
			w = bit & ACC_WRITE_MASK;
			u = bit & ACC_USER_MASK;

			/* Not really needed: !nx will cause pte.nx to fault */
			x |= !nx;
			/* Allow supervisor writes if !cr0.wp */
			w |= !cr0_wp && !uf;
			/* Disallow supervisor fetches of user code if cr4.smep */
			x &= !(smep && u && !uf);

			fault = (ff && !x) || (uf && !u) || (wf && !w);
			map |= fault << bit;
		}
		mmu->permissions[byte] = map;
	}
}
/*
 * Rebuild mmu->last_pte_bitmap, the table that is_last_gpte() indexes.
 *
 * Bit number layout (see struct kvm_mmu): bits [1:0] hold the zero-based
 * paging level and bit 2 holds pte.ps.  A set bit means a gpte with that
 * level/ps combination ends the walk.
 */
static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
	const unsigned ps_flag = 1 << 2;	/* bit 2 of index: ps */
	unsigned top = mmu->root_level;
	unsigned lvl;
	u8 bits;

	/*
	 * For a PT32E root the scan stops one level short of the root.
	 * NOTE(review): presumably this mirrors the walker starting below
	 * the root in that mode -- confirm.
	 */
	if (top == PT32E_ROOT_LEVEL)
		--top;

	/* PT_PAGE_TABLE_LEVEL always terminates, whatever pte.ps says */
	bits = 1 | (1 << ps_flag);

	for (lvl = PT_DIRECTORY_LEVEL; lvl <= top; ++lvl) {
		/* Levels above PT_PDPE_LEVEL never terminate the walk here. */
		if (lvl > PT_PDPE_LEVEL)
			continue;
		/* Below a PT32E root, pte.ps only counts when is_pse() holds. */
		if (mmu->root_level < PT32E_ROOT_LEVEL && !is_pse(vcpu))
			continue;
		bits |= 1 << (ps_flag | (lvl - 1));
	}

	mmu->last_pte_bitmap = bits;
}
static int paging64_init_context_common(struct kvm_vcpu *vcpu,
struct kvm_mmu *context,
int level)
......@@ -3502,6 +3583,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->root_level = level;
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context);
update_last_pte_bitmap(vcpu, context);
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
......@@ -3530,6 +3613,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->root_level = PT32_ROOT_LEVEL;
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context);
update_last_pte_bitmap(vcpu, context);
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
......@@ -3590,6 +3675,9 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
}
update_permission_bitmask(vcpu, context);
update_last_pte_bitmap(vcpu, context);
return 0;
}
......@@ -3665,6 +3753,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
}
update_permission_bitmask(vcpu, g_context);
update_last_pte_bitmap(vcpu, g_context);
return 0;
}
......
......@@ -18,8 +18,10 @@
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_DIRTY_SHIFT 6
#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT)
#define PT_PAGE_SIZE_SHIFT 7
#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_SHIFT 63
......@@ -88,17 +90,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
}
static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
bool write_fault, bool user_fault,
unsigned long pte)
/*
* Will a fault with a given page-fault error code (pfec) cause a permission
* fault with the given access (in ACC_* format)?
*/
static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access,
unsigned pfec)
{
if (unlikely(write_fault && !is_writable_pte(pte)
&& (user_fault || is_write_protection(vcpu))))
return false;
if (unlikely(user_fault && !(pte & PT_USER_MASK)))
return false;
return true;
return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}
#endif
......@@ -63,10 +63,12 @@
*/
struct guest_walker {
int level;
unsigned max_level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
......@@ -101,38 +103,41 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
}
static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
bool last)
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
struct guest_walker *walker,
int write_fault)
{
unsigned access;
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
if (last && !is_dirty_gpte(gpte))
access &= ~ACC_WRITE_MASK;
#if PTTYPE == 64
if (vcpu->arch.mmu.nx)
access &= ~(gpte >> PT64_NX_SHIFT);
#endif
return access;
}
static bool FNAME(is_last_gpte)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t gpte)
{
if (walker->level == PT_PAGE_TABLE_LEVEL)
return true;
if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
(PTTYPE == 64 || is_pse(vcpu)))
return true;
unsigned level, index;
pt_element_t pte, orig_pte;
pt_element_t __user *ptep_user;
gfn_t table_gfn;
int ret;
for (level = walker->max_level; level >= walker->level; --level) {
pte = orig_pte = walker->ptes[level - 1];
table_gfn = walker->table_gfn[level - 1];
ptep_user = walker->ptep_user[level - 1];
index = offset_in_page(ptep_user) / sizeof(pt_element_t);
if (!(pte & PT_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
pte |= PT_ACCESSED_MASK;
}
if (level == walker->level && write_fault && !is_dirty_gpte(pte)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
pte |= PT_DIRTY_MASK;
}
if (pte == orig_pte)
continue;
if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
(mmu->root_level == PT64_ROOT_LEVEL))
return true;
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
if (ret)
return ret;
return false;
mark_page_dirty(vcpu->kvm, table_gfn);
walker->ptes[level] = pte;
}
return 0;
}
/*
......@@ -142,21 +147,22 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gva_t addr, u32 access)
{
int ret;
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
unsigned index, pt_access, uninitialized_var(pte_access);
unsigned index, pt_access, pte_access, accessed_dirty, shift;
gpa_t pte_gpa;
bool eperm, last_gpte;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;
gpa_t real_gpa;
gfn_t gfn;
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
eperm = false;
walker->level = mmu->root_level;
pte = mmu->get_cr3(vcpu);
......@@ -169,15 +175,21 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
--walker->level;
}
#endif
walker->max_level = walker->level;
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
(mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
pt_access = ACC_ALL;
accessed_dirty = PT_ACCESSED_MASK;
pt_access = pte_access = ACC_ALL;
++walker->level;
for (;;) {
do {
gfn_t real_gfn;
unsigned long host_addr;
pt_access &= pte_access;
--walker->level;
index = PT_INDEX(addr, walker->level);
table_gfn = gpte_to_gfn(pte);
......@@ -199,6 +211,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
goto error;
walker->ptep_user[walker->level - 1] = ptep_user;
trace_kvm_mmu_paging_element(pte, walker->level);
......@@ -211,92 +224,48 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
goto error;
}
if (!check_write_user_access(vcpu, write_fault, user_fault,
pte))
eperm = true;
#if PTTYPE == 64
if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
eperm = true;
#endif
last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
if (last_gpte) {
pte_access = pt_access &
FNAME(gpte_access)(vcpu, pte, true);
/* check if the kernel is fetching from user page */
if (unlikely(pte_access & PT_USER_MASK) &&
kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
if (fetch_fault && !user_fault)
eperm = true;
}
if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
int ret;
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
pte, pte|PT_ACCESSED_MASK);
if (unlikely(ret < 0))
goto error;
else if (ret)
goto retry_walk;
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
accessed_dirty &= pte;
pte_access = pt_access & gpte_access(vcpu, pte);
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
if (last_gpte) {
int lvl = walker->level;
gpa_t real_gpa;
gfn_t gfn;
u32 ac;
gfn = gpte_to_gfn_lvl(pte, lvl);
gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
if (PTTYPE == 32 &&
walker->level == PT_DIRECTORY_LEVEL &&
is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
ac = write_fault | fetch_fault | user_fault;
if (unlikely(permission_fault(mmu, pte_access, access))) {
errcode |= PFERR_PRESENT_MASK;
goto error;
}
real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
ac);
if (real_gpa == UNMAPPED_GVA)
return 0;
gfn = gpte_to_gfn_lvl(pte, walker->level);
gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
walker->gfn = real_gpa >> PAGE_SHIFT;
if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
break;
}
real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
if (real_gpa == UNMAPPED_GVA)
return 0;
pt_access &= FNAME(gpte_access)(vcpu, pte, false);
--walker->level;
}
walker->gfn = real_gpa >> PAGE_SHIFT;
if (unlikely(eperm)) {
errcode |= PFERR_PRESENT_MASK;
goto error;
}
if (!write_fault)
protect_clean_gpte(&pte_access, pte);
if (write_fault && unlikely(!is_dirty_gpte(pte))) {
int ret;
/*
* On a write fault, fold the dirty bit into accessed_dirty by shifting it one
* place right.
*
* On a read fault, do nothing.
*/
shift = write_fault >> ilog2(PFERR_WRITE_MASK);
shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT;
accessed_dirty &= pte >> shift;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
pte, pte|PT_DIRTY_MASK);
if (unlikely(!accessed_dirty)) {
ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
if (unlikely(ret < 0))
goto error;
else if (ret)
goto retry_walk;
mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
walker->pt_access = pt_access;
......@@ -368,7 +337,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pte_access = sp->role.access & gpte_access(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
if (mmu_invalid_pfn(pfn))
return;
......@@ -441,8 +411,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
true);
pte_access = sp->role.access & gpte_access(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
......@@ -794,7 +764,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access;
pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
pte_access &= gpte_access(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
continue;
......
......@@ -3782,12 +3782,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(svm);
}
#ifdef CONFIG_X86_64
#define R "r"
#else
#define R "e"
#endif
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
......@@ -3814,13 +3808,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
local_irq_enable();
asm volatile (
"push %%"R"bp; \n\t"
"mov %c[rbx](%[svm]), %%"R"bx \n\t"
"mov %c[rcx](%[svm]), %%"R"cx \n\t"
"mov %c[rdx](%[svm]), %%"R"dx \n\t"
"mov %c[rsi](%[svm]), %%"R"si \n\t"
"mov %c[rdi](%[svm]), %%"R"di \n\t"
"mov %c[rbp](%[svm]), %%"R"bp \n\t"
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
"mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
"mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
"mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
"mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
"mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%[svm]), %%r8 \n\t"
"mov %c[r9](%[svm]), %%r9 \n\t"
......@@ -3833,20 +3827,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
/* Enter guest mode */
"push %%"R"ax \n\t"
"mov %c[vmcb](%[svm]), %%"R"ax \n\t"
"push %%" _ASM_AX " \n\t"
"mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
__ex(SVM_VMLOAD) "\n\t"
__ex(SVM_VMRUN) "\n\t"
__ex(SVM_VMSAVE) "\n\t"
"pop %%"R"ax \n\t"
"pop %%" _ASM_AX " \n\t"
/* Save guest registers, load host registers */
"mov %%"R"bx, %c[rbx](%[svm]) \n\t"
"mov %%"R"cx, %c[rcx](%[svm]) \n\t"
"mov %%"R"dx, %c[rdx](%[svm]) \n\t"
"mov %%"R"si, %c[rsi](%[svm]) \n\t"
"mov %%"R"di, %c[rdi](%[svm]) \n\t"
"mov %%"R"bp, %c[rbp](%[svm]) \n\t"
"mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
"mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
"mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
"mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%[svm]) \n\t"
"mov %%r9, %c[r9](%[svm]) \n\t"
......@@ -3857,7 +3851,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
#endif
"pop %%"R"bp"
"pop %%" _ASM_BP
:
: [svm]"a"(svm),
[vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
......@@ -3878,9 +3872,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
[r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
#endif
: "cc", "memory"
, R"bx", R"cx", R"dx", R"si", R"di"
#ifdef CONFIG_X86_64
, "rbx", "rcx", "rdx", "rsi", "rdi"
, "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
#else
, "ebx", "ecx", "edx", "esi", "edi"
#endif
);
......@@ -3940,8 +3936,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
mark_all_clean(svm->vmcb);
}
#undef R
static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
......
......@@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1
......@@ -3724,8 +3726,7 @@ static void vmx_set_constant_host_state(void)
native_store_idt(&dt);
vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */
asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
......@@ -6183,14 +6184,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
#else
#define R "e"
#define Q "l"
#endif
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
......@@ -6239,30 +6232,30 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
"push %%"R"cx \n\t" /* placeholder for guest rcx */
"push %%"R"cx \n\t"
"cmp %%"R"sp, %c[host_rsp](%0) \n\t"
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
"push %%" _ASM_CX " \n\t"
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%"R"sp, %c[host_rsp](%0) \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%"R"ax \n\t"
"mov %%cr2, %%"R"dx \n\t"
"cmp %%"R"ax, %%"R"dx \n\t"
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
"je 2f \n\t"
"mov %%"R"ax, %%cr2 \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
"2: \n\t"
/* Check if vmlaunch or vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
"mov %c[rax](%0), %%"R"ax \n\t"
"mov %c[rbx](%0), %%"R"bx \n\t"
"mov %c[rdx](%0), %%"R"dx \n\t"
"mov %c[rsi](%0), %%"R"si \n\t"
"mov %c[rdi](%0), %%"R"di \n\t"
"mov %c[rbp](%0), %%"R"bp \n\t"
"mov %c[rax](%0), %%" _ASM_AX " \n\t"
"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
......@@ -6273,24 +6266,24 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
#endif
"mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
/* Enter guest mode */
"jne .Llaunched \n\t"
"jne 1f \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
"jmp .Lkvm_vmx_return \n\t"
".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
".Lkvm_vmx_return: "
"jmp 2f \n\t"
"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
"2: "
/* Save guest registers, load host registers, keep flags */
"mov %0, %c[wordsize](%%"R"sp) \n\t"
"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
"pop %0 \n\t"
"mov %%"R"ax, %c[rax](%0) \n\t"
"mov %%"R"bx, %c[rbx](%0) \n\t"
"pop"Q" %c[rcx](%0) \n\t"
"mov %%"R"dx, %c[rdx](%0) \n\t"
"mov %%"R"si, %c[rsi](%0) \n\t"
"mov %%"R"di, %c[rdi](%0) \n\t"
"mov %%"R"bp, %c[rbp](%0) \n\t"
"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
......@@ -6301,11 +6294,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
"mov %%cr2, %%"R"ax \n\t"
"mov %%"R"ax, %c[cr2](%0) \n\t"
"mov %%cr2, %%" _ASM_AX " \n\t"
"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
"pop %%"R"bp; pop %%"R"dx \n\t"
"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
".pushsection .rodata \n\t"
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
......@@ -6330,9 +6327,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
, R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64
, "rax", "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
, "eax", "ebx", "edi", "esi"
#endif
);
......@@ -6384,9 +6383,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_complete_interrupts(vmx);
}
#undef R
#undef Q
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
......
......@@ -3672,20 +3672,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
if (vcpu_match_mmio_gva(vcpu, gva) &&
check_write_user_access(vcpu, write, access,
vcpu->arch.access)) {
if (vcpu_match_mmio_gva(vcpu, gva)
&& !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
if (write)
access |= PFERR_WRITE_MASK;
*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
if (*gpa == UNMAPPED_GVA)
......@@ -6016,7 +6013,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
int r;
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = vcpu_load(vcpu);
if (r)
return r;
r = kvm_arch_vcpu_reset(vcpu);
if (r == 0)
r = kvm_mmu_setup(vcpu);
......@@ -6027,9 +6026,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int r;
vcpu->arch.apf.msr_val = 0;
vcpu_load(vcpu);
r = vcpu_load(vcpu);
BUG_ON(r);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
......@@ -6275,7 +6276,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
int r;
r = vcpu_load(vcpu);
BUG_ON(r);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
}
......
......@@ -408,7 +408,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
void vcpu_load(struct kvm_vcpu *vcpu);
int __must_check vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
......
......@@ -131,11 +131,12 @@ bool kvm_is_mmio_pfn(pfn_t pfn)
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
void vcpu_load(struct kvm_vcpu *vcpu)
int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
mutex_lock(&vcpu->mutex);
if (mutex_lock_killable(&vcpu->mutex))
return -EINTR;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
......@@ -148,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
return 0;
}
void vcpu_put(struct kvm_vcpu *vcpu)
......@@ -1891,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
#endif
vcpu_load(vcpu);
r = vcpu_load(vcpu);
if (r)
return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment