x86/mm/tlb: Revert the recent lazy TLB patches

Revert commits:

  95b0e635 x86/mm/tlb: Always use lazy TLB mode
  64482aaf x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  ac031589 x86/mm/tlb: Make lazy TLB mode lazier
  61d0beb5 x86/mm/tlb: Restructure switch_mm_irqs_off()
  2ff6ddf1 x86/mm/tlb: Leave lazy TLB mode at page table free time

In order to simplify the TLB invalidate fixes for x86 and unify the
parts that need backporting.  We'll try again later.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

x86/mm/tlb: Revert the recent lazy TLB patches
Revert commits:

  95b0e635 x86/mm/tlb: Always use lazy TLB mode
  64482aaf x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  ac031589 x86/mm/tlb: Make lazy TLB mode lazier
  61d0beb5 x86/mm/tlb: Restructure switch_mm_irqs_off()
  2ff6ddf1 x86/mm/tlb: Leave lazy TLB mode at page table free time

In order to simplify the TLB invalidate fixes for x86 and unify the
parts that need backporting.  We'll try again later.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
52a288c7 · Peter Zijlstra · Linus Torvalds · 815f0ddb · 52a288c7 · 52a288c7
Commit 52a288c7 authored Aug 22, 2018 by Peter Zijlstra Committed by Linus Torvalds Aug 22, 2018
Showing with 77 additions and 181 deletions

arch/x86/include/asm/tlbflush.h arch/x86/include/asm/tlbflush.h +16 -5

arch/x86/mm/tlb.c arch/x86/mm/tlb.c +53 -152

include/asm-generic/tlb.h include/asm-generic/tlb.h +0 -10

mm/memory.c mm/memory.c +8 -14

No files found.
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,6 +148,22 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+	/*
+	 * If we have PCID, then switching to init_mm is reasonably
+	 * fast.  If we don't have PCID, then switching to init_mm is
+	 * quite slow, so we try to defer it in the hopes that we can
+	 * avoid it entirely.  The latter approach runs the risk of
+	 * receiving otherwise unnecessary IPIs.
+	 *
+	 * This choice is just a heuristic.  The tlb code can handle this
+	 * function returning true or false regardless of whether we have
+	 * PCID.
+	 */
+	return !static_cpu_has(X86_FEATURE_PCID);
+}
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -538,9 +554,4 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 	native_flush_tlb_others(mask, info)
 #endif
-extern void tlb_flush_remove_tables(struct mm_struct *mm);
-extern void tlb_flush_remove_tables_local(void *arg);
-#define HAVE_TLB_FLUSH_REMOVE_TABLES
 #endif /* _ASM_X86_TLBFLUSH_H */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/gfp.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
-	bool need_flush;
-	u16 new_asid;
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 		/*
-		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * We don't currently support having a real mm loaded without
-		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * our cpu set in mm_cpumask().  We have all the bookkeeping
-		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
-		/*
-		 * If the CPU is not in lazy TLB mode, we are just switching
-		 * from one thread in a process to another thread in the same
-		 * process. No TLB flush required.
-		 */
-		if (!was_lazy)
-			return;
-		/*
-		 * Read the tlb_gen to check whether a flush is needed.
-		 * If the TLB is up to date, just use it.
-		 * The barrier synchronizes with the tlb_gen increment in
-		 * the TLB shootdown code.
-		 */
-		smp_mb();
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
-				next_tlb_gen)
 		return;
-		/*
-		 * TLB contents went out of date while we were in lazy
-		 * mode. Fall through to the TLB switching code below.
-		 */
-		new_asid = prev_asid;
-		need_flush = true;
 	} else {
+		u16 new_asid;
+		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 		/*
@@ -329,7 +304,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-	}
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
@@ -363,6 +337,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+	}
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
@@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
+	if (tlb_defer_switch_to_init_mm()) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here.  We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy.  We do, however, need to maintain coherence of
+		 * paging-structure caches.  We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
 		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
 }
 /*
@@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
-		 *
-		 * This should be rare, with native_flush_tlb_others skipping
-		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
-	cpumask_var_t lazymask;
-	unsigned int cpu;
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
+		unsigned int cpu;
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
@@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	/*
-	 * A temporary cpumask is used in order to skip sending IPIs
-	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
-	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
-	 */
-	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
 	smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
-		return;
-	}
-	cpumask_copy(lazymask, cpumask);
-	for_each_cpu(cpu, lazymask) {
-		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
-			cpumask_clear_cpu(cpu, lazymask);
-	}
-	smp_call_function_many(lazymask, flush_tlb_func_remote,
-			       (void *)info, 1);
-	free_cpumask_var(lazymask);
 }
 /*
@@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }
-void tlb_flush_remove_tables_local(void *arg)
-{
-	struct mm_struct *mm = arg;
-	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
-			this_cpu_read(cpu_tlbstate.is_lazy)) {
-		/*
-		 * We're in lazy mode.  We need to at least flush our
-		 * paging-structure cache to avoid speculatively reading
-		 * garbage into our TLB.  Since switching to init_mm is barely
-		 * slower than a minimal flush, just switch to init_mm.
-		 */
-		switch_mm_irqs_off(NULL, &init_mm, NULL);
-	}
-}
-static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
-				      struct cpumask *lazy_cpus)
-{
-	int cpu;
-	for_each_cpu(cpu, mm_cpumask(mm)) {
-		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
-			cpumask_set_cpu(cpu, lazy_cpus);
-	}
-}
-void tlb_flush_remove_tables(struct mm_struct *mm)
-{
-	int cpu = get_cpu();
-	cpumask_var_t lazy_cpus;
-	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
-		put_cpu();
-		return;
-	}
-	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
-		/*
-		 * If the cpumask allocation fails, do a brute force flush
-		 * on all the CPUs that have this mm loaded.
-		 */
-		smp_call_function_many(mm_cpumask(mm),
-				tlb_flush_remove_tables_local, (void *)mm, 1);
-		put_cpu();
-		return;
-	}
-	/*
-	 * CPUs with !is_lazy either received a TLB flush IPI while the user
-	 * pages in this address range were unmapped, or have context switched
-	 * and reloaded %CR3 since then.
-	 *
-	 * Shootdown IPIs at page table freeing time only need to be sent to
-	 * CPUs that may have out of date TLB contents.
-	 */
-	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
-	smp_call_function_many(lazy_cpus,
-				tlb_flush_remove_tables_local, (void *)mm, 1);
-	free_cpumask_var(lazy_cpus);
-	put_cpu();
-}
 static void do_flush_tlb_all(void *info)
 {

--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -303,14 +303,4 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 #define tlb_migrate_finish(mm) do {} while (0)
-/*
- * Used to flush the TLB when page tables are removed, when lazy
- * TLB mode may cause a CPU to retain intermediate translations
- * pointing to about-to-be-freed page table memory.
- */
-#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
-#define tlb_flush_remove_tables(mm) do {} while (0)
-#define tlb_flush_remove_tables_local(mm) do {} while (0)
-#endif
 #endif /* _ASM_GENERIC__TLB_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -326,20 +326,16 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
-static void tlb_remove_table_smp_sync(void *arg)
+/*
-{
-	struct mm_struct __maybe_unused *mm = arg;
-	/*
-	 * On most architectures this does nothing. Simply delivering the
-	 * interrupt is enough to prevent races with software page table
-	 * walking like that done in get_user_pages_fast.
-	 *
 * See the comment near struct mmu_table_batch.
 */
-	tlb_flush_remove_tables_local(mm);
+static void tlb_remove_table_smp_sync(void *arg)
+{
+	/* Simply deliver the interrupt */
 }
-static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
+static void tlb_remove_table_one(void *table)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -348,7 +344,7 @@ static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 	__tlb_remove_table(table);
 }
@@ -369,8 +365,6 @@ void tlb_table_flush(struct mmu_gather *tlb)
 {
 	struct mmu_table_batch **batch = &tlb->batch;
-	tlb_flush_remove_tables(tlb->mm);
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -393,7 +387,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table, tlb);
+			tlb_remove_table_one(table);
 			return;
 		}
 		(*batch)->nr = 0;