Commit 936a5fe6 authored by Andrea Arcangeli's avatar Andrea Arcangeli Committed by Linus Torvalds

thp: kvm mmu transparent hugepage support

This should work for both hugetlbfs and transparent hugepages.

[akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability]
Signed-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 47ad8475
......@@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
int host_level, level, max_level;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
return PT_PAGE_TABLE_LEVEL;
return true;
return false;
}
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
int host_level, level, max_level;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
......@@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
gfn_t *gfnp, pfn_t *pfnp, int *levelp)
{
pfn_t pfn = *pfnp;
gfn_t gfn = *gfnp;
int level = *levelp;
/*
* Check if it's a transparent hugepage. If this would be an
* hugetlbfs page, level wouldn't be set to
* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
* here.
*/
if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
!has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
* mmu_notifier_retry was successful and we hold the
* mmu_lock here, so the pmd can't become splitting
* from under us, and in turn
* __split_huge_page_refcount() can't run from under
* us and we can safely transfer the refcount from
* PG_tail to PG_head as we switch the pfn to tail to
* head.
*/
*levelp = level = PT_DIRECTORY_LEVEL;
mask = KVM_PAGES_PER_HPAGE(level) - 1;
VM_BUG_ON((gfn & mask) != (pfn & mask));
if (pfn & mask) {
gfn &= ~mask;
*gfnp = gfn;
kvm_release_pfn_clean(pfn);
pfn &= ~mask;
if (!get_page_unless_zero(pfn_to_page(pfn)))
BUG();
*pfnp = pfn;
}
}
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
......@@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
{
int r;
int level;
int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
bool map_writable;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
/*
* This path builds a PAE pagetable - so we can map 2mb pages at
* maximum. Therefore check if the level is larger than that.
* This path builds a PAE pagetable - so we can map
* 2mb pages at maximum. Therefore check if the level
* is larger than that.
*/
if (level > PT_DIRECTORY_LEVEL)
level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
......@@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
......@@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
pfn_t pfn;
int r;
int level;
int force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
int write = error_code & PFERR_WRITE_MASK;
......@@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
......@@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
r = __direct_map(vcpu, gpa, write, map_writable,
level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
......
......@@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
int force_pt_level;
unsigned long mmu_seq;
bool map_writable;
......@@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
}
if (walker.level >= PT_DIRECTORY_LEVEL) {
if (walker.level >= PT_DIRECTORY_LEVEL)
force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
else
force_pt_level = 1;
if (!force_pt_level) {
level = min(walker.level, mapping_level(vcpu, walker.gfn));
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
......@@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
......
......@@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
#endif /* !PAGEFLAGS_EXTENDED */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int PageTransCompound(struct page *page)
{
return PageCompound(page);
}
#else
static inline int PageTransCompound(struct page *page)
{
return 0;
}
#endif
#ifdef CONFIG_MMU
#define __PG_MLOCKED (1 << PG_mlocked)
#else
......
......@@ -104,8 +104,36 @@ static pfn_t fault_pfn;
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
if (pfn_valid(pfn)) {
struct page *page = compound_head(pfn_to_page(pfn));
return PageReserved(page);
struct page *head;
struct page *tail = pfn_to_page(pfn);
head = compound_head(tail);
if (head != tail) {
smp_rmb();
/*
* head may be a dangling pointer.
* __split_huge_page_refcount clears PageTail
* before overwriting first_page, so if
* PageTail is still there it means the head
* pointer isn't dangling.
*/
if (PageTail(tail)) {
/*
* the "head" is not a dangling
* pointer but the hugepage may have
* been splitted from under us (and we
* may not hold a reference count on
* the head page so it can be reused
* before we run PageReferenced), so
* we've to recheck PageTail before
* returning what we just read.
*/
int reserved = PageReserved(head);
smp_rmb();
if (PageTail(tail))
return reserved;
}
}
return PageReserved(tail);
}
return true;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment