Commit 1c134b19 authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - A PAT series from Davidlohr Bueso, which simplifies the memtype
     rbtree by using the interval tree helpers. (There's more cleanups
     in this area queued up, but they didn't make the merge window.)

   - Also flip over CONFIG_X86_5LEVEL to default-y. This might draw in a
     few more testers, as all the major distros are going to have
     5-level paging enabled by default in their next iterations.

   - Misc cleanups"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/pat: Rename pat_rbtree.c to pat_interval.c
  x86/mm/pat: Drop the rbt_ prefix from external memtype calls
  x86/mm/pat: Do not pass 'rb_root' down the memtype tree helper functions
  x86/mm/pat: Convert the PAT tree to a generic interval tree
  x86/mm: Clean up the pmd_read_atomic() comments
  x86/mm: Fix function name typo in pmd_read_atomic() comment
  x86/cpu: Clean up intel_tlb_table[]
  x86/mm: Enable 5-level paging support by default
parents 24ee25a6 7f264dab
......@@ -1462,6 +1462,7 @@ config X86_PAE
config X86_5LEVEL
bool "Enable 5-level page tables support"
default y
select DYNAMIC_MEMORY_LAYOUT
select SPARSEMEM_VMEMMAP
depends on X86_64
......
......@@ -36,39 +36,41 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
#define pmd_read_atomic pmd_read_atomic
/*
* pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
* a "*pmdp" dereference done by gcc. Problem is, in certain places
* where pte_offset_map_lock is called, concurrent page faults are
* pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
* a "*pmdp" dereference done by GCC. Problem is, in certain places
* where pte_offset_map_lock() is called, concurrent page faults are
* allowed, if the mmap_sem is held for reading. An example is mincore
* vs page faults vs MADV_DONTNEED. On the page fault side
* pmd_populate rightfully does a set_64bit, but if we're reading the
* pmd_populate() rightfully does a set_64bit(), but if we're reading the
* pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
* because gcc will not read the 64bit of the pmd atomically. To fix
* this all places running pmd_offset_map_lock() while holding the
* because GCC will not read the 64-bit value of the pmd atomically.
*
* To fix this all places running pte_offset_map_lock() while holding the
* mmap_sem in read mode, shall read the pmdp pointer using this
* function to know if the pmd is null nor not, and in turn to know if
* they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
* function to know if the pmd is null or not, and in turn to know if
* they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
* operations.
*
* Without THP if the mmap_sem is hold for reading, the pmd can only
* transition from null to not null while pmd_read_atomic runs. So
* Without THP if the mmap_sem is held for reading, the pmd can only
* transition from null to not null while pmd_read_atomic() runs. So
* we can always return atomic pmd values with this function.
*
* With THP if the mmap_sem is hold for reading, the pmd can become
* With THP if the mmap_sem is held for reading, the pmd can become
* trans_huge or none or point to a pte (and in turn become "stable")
* at any time under pmd_read_atomic. We could read it really
* atomically here with a atomic64_read for the THP enabled case (and
* at any time under pmd_read_atomic(). We could read it truly
* atomically here with an atomic64_read() for the THP enabled case (and
* it would be a whole lot simpler), but to avoid using cmpxchg8b we
* only return an atomic pmdval if the low part of the pmdval is later
* found stable (i.e. pointing to a pte). And we're returning a none
* pmdval if the low part of the pmd is none. In some cases the high
* and low part of the pmdval returned may not be consistent if THP is
* enabled (the low part may point to previously mapped hugepage,
* while the high part may point to a more recently mapped hugepage),
* but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
* of the pmd to be read atomically to decide if the pmd is unstable
* or not, with the only exception of when the low part of the pmd is
* zero in which case we return a none pmd.
* found to be stable (i.e. pointing to a pte). We are also returning a
* 'none' (zero) pmdval if the low part of the pmd is zero.
*
* In some cases the high and low part of the pmdval returned may not be
* consistent if THP is enabled (the low part may point to previously
* mapped hugepage, while the high part may point to a more recently
* mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
* needs the low part of the pmd to be read atomically to decide if the
* pmd is unstable or not, with the only exception when the low part
* of the pmd is zero, in which case we return a 'none' pmd.
*/
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
......
......@@ -819,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
{ 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
{ 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" },
{ 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
{ 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
......@@ -847,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
{ 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
......@@ -859,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc)
return;
/* look up this descriptor in the table */
for (k = 0; intel_tlb_table[k].descriptor != desc && \
intel_tlb_table[k].descriptor != 0; k++)
for (k = 0; intel_tlb_table[k].descriptor != desc &&
intel_tlb_table[k].descriptor != 0; k++)
;
if (intel_tlb_table[k].tlb_type == 0)
......
......@@ -23,7 +23,7 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp)
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_X86_PAT) += pat_interval.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
......
......@@ -603,7 +603,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
spin_lock(&memtype_lock);
err = rbt_memtype_check_insert(new, new_type);
err = memtype_check_insert(new, new_type);
if (err) {
pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
start, end - 1,
......@@ -650,7 +650,7 @@ int free_memtype(u64 start, u64 end)
}
spin_lock(&memtype_lock);
entry = rbt_memtype_erase(start, end);
entry = memtype_erase(start, end);
spin_unlock(&memtype_lock);
if (IS_ERR(entry)) {
......@@ -693,7 +693,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
spin_lock(&memtype_lock);
entry = rbt_memtype_lookup(paddr);
entry = memtype_lookup(paddr);
if (entry != NULL)
rettype = entry->type;
else
......@@ -1109,7 +1109,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
return NULL;
spin_lock(&memtype_lock);
ret = rbt_memtype_copy_nth_element(print_entry, pos);
ret = memtype_copy_nth_element(print_entry, pos);
spin_unlock(&memtype_lock);
if (!ret) {
......
......@@ -29,20 +29,20 @@ static inline char *cattr_name(enum page_cache_mode pcm)
}
#ifdef CONFIG_X86_PAT
extern int rbt_memtype_check_insert(struct memtype *new,
enum page_cache_mode *new_type);
extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
extern struct memtype *rbt_memtype_lookup(u64 addr);
extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
extern int memtype_check_insert(struct memtype *new,
enum page_cache_mode *new_type);
extern struct memtype *memtype_erase(u64 start, u64 end);
extern struct memtype *memtype_lookup(u64 addr);
extern int memtype_copy_nth_element(struct memtype *out, loff_t pos);
#else
static inline int rbt_memtype_check_insert(struct memtype *new,
enum page_cache_mode *new_type)
static inline int memtype_check_insert(struct memtype *new,
enum page_cache_mode *new_type)
{ return 0; }
static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
static inline struct memtype *memtype_erase(u64 start, u64 end)
{ return NULL; }
static inline struct memtype *rbt_memtype_lookup(u64 addr)
static inline struct memtype *memtype_lookup(u64 addr)
{ return NULL; }
static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos)
{ return 0; }
#endif
......
......@@ -5,14 +5,13 @@
* Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* Suresh B Siddha <suresh.b.siddha@intel.com>
*
* Interval tree (augmented rbtree) used to store the PAT memory type
* reservations.
* Interval tree used to store the PAT memory type reservations.
*/
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/rbtree_augmented.h>
#include <linux/interval_tree_generic.h>
#include <linux/sched.h>
#include <linux/gfp.h>
......@@ -33,72 +32,32 @@
*
* memtype_lock protects the rbtree.
*/
static struct rb_root memtype_rbroot = RB_ROOT;
static int is_node_overlap(struct memtype *node, u64 start, u64 end)
static inline u64 memtype_interval_start(struct memtype *memtype)
{
if (node->start >= end || node->end <= start)
return 0;
return 1;
return memtype->start;
}
static u64 get_subtree_max_end(struct rb_node *node)
static inline u64 memtype_interval_end(struct memtype *memtype)
{
u64 ret = 0;
if (node) {
struct memtype *data = rb_entry(node, struct memtype, rb);
ret = data->subtree_max_end;
}
return ret;
return memtype->end - 1;
}
INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
memtype_interval_start, memtype_interval_end,
static, memtype_interval)
#define NODE_END(node) ((node)->end)
RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb,
struct memtype, rb, u64, subtree_max_end, NODE_END)
/* Find the first (lowest start addr) overlapping range from rb tree */
static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
u64 start, u64 end)
{
struct rb_node *node = root->rb_node;
struct memtype *last_lower = NULL;
while (node) {
struct memtype *data = rb_entry(node, struct memtype, rb);
if (get_subtree_max_end(node->rb_left) > start) {
/* Lowest overlap if any must be on left side */
node = node->rb_left;
} else if (is_node_overlap(data, start, end)) {
last_lower = data;
break;
} else if (start >= data->start) {
/* Lowest overlap if any must be on right side */
node = node->rb_right;
} else {
break;
}
}
return last_lower; /* Returns NULL if there is no overlap */
}
static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
enum {
MEMTYPE_EXACT_MATCH = 0,
MEMTYPE_END_MATCH = 1
};
static struct memtype *memtype_rb_match(struct rb_root *root,
u64 start, u64 end, int match_type)
static struct memtype *memtype_match(u64 start, u64 end, int match_type)
{
struct memtype *match;
match = memtype_rb_lowest_match(root, start, end);
match = memtype_interval_iter_first(&memtype_rbroot, start, end);
while (match != NULL && match->start < end) {
struct rb_node *node;
if ((match_type == MEMTYPE_EXACT_MATCH) &&
(match->start == start) && (match->end == end))
return match;
......@@ -107,26 +66,20 @@ static struct memtype *memtype_rb_match(struct rb_root *root,
(match->start < start) && (match->end == end))
return match;
node = rb_next(&match->rb);
if (node)
match = rb_entry(node, struct memtype, rb);
else
match = NULL;
match = memtype_interval_iter_next(match, start, end);
}
return NULL; /* Returns NULL if there is no match */
}
static int memtype_rb_check_conflict(struct rb_root *root,
u64 start, u64 end,
enum page_cache_mode reqtype,
enum page_cache_mode *newtype)
static int memtype_check_conflict(u64 start, u64 end,
enum page_cache_mode reqtype,
enum page_cache_mode *newtype)
{
struct rb_node *node;
struct memtype *match;
enum page_cache_mode found_type = reqtype;
match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
match = memtype_interval_iter_first(&memtype_rbroot, start, end);
if (match == NULL)
goto success;
......@@ -136,19 +89,12 @@ static int memtype_rb_check_conflict(struct rb_root *root,
dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
found_type = match->type;
node = rb_next(&match->rb);
while (node) {
match = rb_entry(node, struct memtype, rb);
if (match->start >= end) /* Checked all possible matches */
goto success;
if (is_node_overlap(match, start, end) &&
match->type != found_type) {
match = memtype_interval_iter_next(match, start, end);
while (match) {
if (match->type != found_type)
goto failure;
}
node = rb_next(&match->rb);
match = memtype_interval_iter_next(match, start, end);
}
success:
if (newtype)
......@@ -163,103 +109,74 @@ static int memtype_rb_check_conflict(struct rb_root *root,
return -EBUSY;
}
static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
{
struct rb_node **node = &(root->rb_node);
struct rb_node *parent = NULL;
while (*node) {
struct memtype *data = rb_entry(*node, struct memtype, rb);
parent = *node;
if (data->subtree_max_end < newdata->end)
data->subtree_max_end = newdata->end;
if (newdata->start <= data->start)
node = &((*node)->rb_left);
else if (newdata->start > data->start)
node = &((*node)->rb_right);
}
newdata->subtree_max_end = newdata->end;
rb_link_node(&newdata->rb, parent, node);
rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
}
int rbt_memtype_check_insert(struct memtype *new,
enum page_cache_mode *ret_type)
int memtype_check_insert(struct memtype *new,
enum page_cache_mode *ret_type)
{
int err = 0;
err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
new->type, ret_type);
err = memtype_check_conflict(new->start, new->end, new->type, ret_type);
if (err)
return err;
if (!err) {
if (ret_type)
new->type = *ret_type;
if (ret_type)
new->type = *ret_type;
new->subtree_max_end = new->end;
memtype_rb_insert(&memtype_rbroot, new);
}
return err;
memtype_interval_insert(new, &memtype_rbroot);
return 0;
}
struct memtype *rbt_memtype_erase(u64 start, u64 end)
struct memtype *memtype_erase(u64 start, u64 end)
{
struct memtype *data;
/*
* Since the memtype_rbroot tree allows overlapping ranges,
* rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free
* memtype_erase() checks with EXACT_MATCH first, i.e. free
* a whole node for the munmap case. If no such entry is found,
* it then checks with END_MATCH, i.e. shrink the size of a node
* from the end for the mremap case.
*/
data = memtype_rb_match(&memtype_rbroot, start, end,
MEMTYPE_EXACT_MATCH);
data = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
if (!data) {
data = memtype_rb_match(&memtype_rbroot, start, end,
MEMTYPE_END_MATCH);
data = memtype_match(start, end, MEMTYPE_END_MATCH);
if (!data)
return ERR_PTR(-EINVAL);
}
if (data->start == start) {
/* munmap: erase this node */
rb_erase_augmented(&data->rb, &memtype_rbroot,
&memtype_rb_augment_cb);
memtype_interval_remove(data, &memtype_rbroot);
} else {
/* mremap: update the end value of this node */
rb_erase_augmented(&data->rb, &memtype_rbroot,
&memtype_rb_augment_cb);
memtype_interval_remove(data, &memtype_rbroot);
data->end = start;
data->subtree_max_end = data->end;
memtype_rb_insert(&memtype_rbroot, data);
memtype_interval_insert(data, &memtype_rbroot);
return NULL;
}
return data;
}
struct memtype *rbt_memtype_lookup(u64 addr)
struct memtype *memtype_lookup(u64 addr)
{
return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
return memtype_interval_iter_first(&memtype_rbroot, addr,
addr + PAGE_SIZE);
}
#if defined(CONFIG_DEBUG_FS)
int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
int memtype_copy_nth_element(struct memtype *out, loff_t pos)
{
struct rb_node *node;
struct memtype *match;
int i = 1;
node = rb_first(&memtype_rbroot);
while (node && pos != i) {
node = rb_next(node);
match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX);
while (match && pos != i) {
match = memtype_interval_iter_next(match, 0, ULONG_MAX);
i++;
}
if (node) { /* pos == i */
struct memtype *this = rb_entry(node, struct memtype, rb);
*out = *this;
if (match) { /* pos == i */
*out = *match;
return 0;
} else {
return 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment