Commit f13b83fd authored by Joao Martins's avatar Joao Martins Committed by Andrew Morton

hugetlb: batch TLB flushes when freeing vmemmap

Now that a list of pages is deduplicated at once, the TLB flush can be
batched for all vmemmap pages that got remapped.

Expand the flags field value to pass whether to skip the TLB flush on
remap of the PTE.

The TLB flush is global as we don't have guarantees from caller that the
set of folios is contiguous, or to add complexity in composing a list of
kVAs to flush.

Modified by Mike Kravetz to perform TLB flush on single folio if an
error is encountered.

Link: https://lkml.kernel.org/r/20231019023113.345257-8-mike.kravetz@oracle.comSigned-off-by: default avatarJoao Martins <joao.m.martins@oracle.com>
Signed-off-by: default avatarMike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: default avatarMuchun Song <songmuchun@bytedance.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Konrad Dybcio <konradybcio@kernel.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Usama Arif <usama.arif@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
parent f4b7e3ef
...@@ -40,6 +40,8 @@ struct vmemmap_remap_walk { ...@@ -40,6 +40,8 @@ struct vmemmap_remap_walk {
/* Skip the TLB flush when we split the PMD */ /* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1)
unsigned long flags; unsigned long flags;
}; };
...@@ -214,7 +216,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, ...@@ -214,7 +216,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
return ret; return ret;
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
if (walk->remap_pte) if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
flush_tlb_kernel_range(start, end); flush_tlb_kernel_range(start, end);
return 0; return 0;
...@@ -355,19 +357,21 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end, ...@@ -355,19 +357,21 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end,
* @reuse: reuse address. * @reuse: reuse address.
* @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
* responsibility to free pages. * responsibility to free pages.
* @flags: modifications to vmemmap_remap_walk flags
* *
* Return: %0 on success, negative error code otherwise. * Return: %0 on success, negative error code otherwise.
*/ */
static int vmemmap_remap_free(unsigned long start, unsigned long end, static int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse, unsigned long reuse,
struct list_head *vmemmap_pages) struct list_head *vmemmap_pages,
unsigned long flags)
{ {
int ret; int ret;
struct vmemmap_remap_walk walk = { struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_remap_pte, .remap_pte = vmemmap_remap_pte,
.reuse_addr = reuse, .reuse_addr = reuse,
.vmemmap_pages = vmemmap_pages, .vmemmap_pages = vmemmap_pages,
.flags = 0, .flags = flags,
}; };
int nid = page_to_nid((struct page *)reuse); int nid = page_to_nid((struct page *)reuse);
gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
...@@ -629,7 +633,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h ...@@ -629,7 +633,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h
static int __hugetlb_vmemmap_optimize(const struct hstate *h, static int __hugetlb_vmemmap_optimize(const struct hstate *h,
struct page *head, struct page *head,
struct list_head *vmemmap_pages) struct list_head *vmemmap_pages,
unsigned long flags)
{ {
int ret = 0; int ret = 0;
unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
...@@ -640,6 +645,18 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, ...@@ -640,6 +645,18 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h,
return ret; return ret;
static_branch_inc(&hugetlb_optimize_vmemmap_key); static_branch_inc(&hugetlb_optimize_vmemmap_key);
/*
* Very Subtle
* If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
* immediately after remapping. As a result, subsequent accesses
* and modifications to struct pages associated with the hugetlb
* page could be to the OLD struct pages. Set the vmemmap optimized
* flag here so that it is copied to the new head page. This keeps
* the old and new struct pages in sync.
* If there is an error during optimization, we will immediately FLUSH
* the TLB and clear the flag below.
*/
SetHPageVmemmapOptimized(head);
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
vmemmap_reuse = vmemmap_start; vmemmap_reuse = vmemmap_start;
...@@ -651,11 +668,12 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, ...@@ -651,11 +668,12 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h,
* mapping the range to vmemmap_pages list so that they can be freed by * mapping the range to vmemmap_pages list so that they can be freed by
* the caller. * the caller.
*/ */
ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages); ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
if (ret) vmemmap_pages, flags);
if (ret) {
static_branch_dec(&hugetlb_optimize_vmemmap_key); static_branch_dec(&hugetlb_optimize_vmemmap_key);
else ClearHPageVmemmapOptimized(head);
SetHPageVmemmapOptimized(head); }
return ret; return ret;
} }
...@@ -674,7 +692,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) ...@@ -674,7 +692,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{ {
LIST_HEAD(vmemmap_pages); LIST_HEAD(vmemmap_pages);
__hugetlb_vmemmap_optimize(h, head, &vmemmap_pages); __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0);
free_vmemmap_page_list(&vmemmap_pages); free_vmemmap_page_list(&vmemmap_pages);
} }
...@@ -719,19 +737,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l ...@@ -719,19 +737,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
list_for_each_entry(folio, folio_list, lru) { list_for_each_entry(folio, folio_list, lru) {
int ret = __hugetlb_vmemmap_optimize(h, &folio->page, int ret = __hugetlb_vmemmap_optimize(h, &folio->page,
&vmemmap_pages); &vmemmap_pages,
VMEMMAP_REMAP_NO_TLB_FLUSH);
/* /*
* Pages to be freed may have been accumulated. If we * Pages to be freed may have been accumulated. If we
* encounter an ENOMEM, free what we have and try again. * encounter an ENOMEM, free what we have and try again.
* This can occur in the case that both spliting fails
* halfway and head page allocation also failed. In this
* case __hugetlb_vmemmap_optimize() would free memory
* allowing more vmemmap remaps to occur.
*/ */
if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages); free_vmemmap_page_list(&vmemmap_pages);
INIT_LIST_HEAD(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages);
__hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); __hugetlb_vmemmap_optimize(h, &folio->page,
&vmemmap_pages,
VMEMMAP_REMAP_NO_TLB_FLUSH);
} }
} }
flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages); free_vmemmap_page_list(&vmemmap_pages);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment