Commit 5d1bc760 authored by Andrew Morton's avatar Andrew Morton

merge mm-hotfixes-stable into mm-nonmm-stable to pick up needed changes

parents 640958fd 52ccdde1
......@@ -446,7 +446,8 @@ Mythri P K <mythripk@ti.com>
Nadav Amit <nadav.amit@gmail.com> <namit@vmware.com>
Nadav Amit <nadav.amit@gmail.com> <namit@cs.technion.ac.il>
Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
Naoya Horiguchi <naoya.horiguchi@nec.com> <n-horiguchi@ah.jp.nec.com>
Naoya Horiguchi <nao.horiguchi@gmail.com> <n-horiguchi@ah.jp.nec.com>
Naoya Horiguchi <nao.horiguchi@gmail.com> <naoya.horiguchi@nec.com>
Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
Neeraj Upadhyay <quic_neeraju@quicinc.com> <neeraju@codeaurora.org>
Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
......
......@@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of
each page. It is already implemented and activated if page owner is
enabled. Other usages are more than welcome.
It can also be used to show all the stacks and their outstanding
allocations, which gives us a quick overview of where the memory is going
without the need to screen through all the pages and match the allocation
and free operation.
It can also be used to show all the stacks and their current number of
allocated base pages, which gives us a quick overview of where the memory
is going without the need to screen through all the pages and match the
allocation and free operation.
page owner is disabled by default. So, if you'd like to use it, you need
to add "page_owner=on" to your boot cmdline. If the kernel is built
......@@ -75,42 +75,45 @@ Usage
cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
cat stacks.txt
prep_new_page+0xa9/0x120
get_page_from_freelist+0x7e6/0x2140
__alloc_pages+0x18a/0x370
new_slab+0xc8/0x580
___slab_alloc+0x1f2/0xaf0
__slab_alloc.isra.86+0x22/0x40
kmem_cache_alloc+0x31b/0x350
__khugepaged_enter+0x39/0x100
dup_mmap+0x1c7/0x5ce
copy_process+0x1afe/0x1c90
kernel_clone+0x9a/0x3c0
__do_sys_clone+0x66/0x90
do_syscall_64+0x7f/0x160
entry_SYSCALL_64_after_hwframe+0x6c/0x74
stack_count: 234
post_alloc_hook+0x177/0x1a0
get_page_from_freelist+0xd01/0xd80
__alloc_pages+0x39e/0x7e0
allocate_slab+0xbc/0x3f0
___slab_alloc+0x528/0x8a0
kmem_cache_alloc+0x224/0x3b0
sk_prot_alloc+0x58/0x1a0
sk_alloc+0x32/0x4f0
inet_create+0x427/0xb50
__sock_create+0x2e4/0x650
inet_ctl_sock_create+0x30/0x180
igmp_net_init+0xc1/0x130
ops_init+0x167/0x410
setup_net+0x304/0xa60
copy_net_ns+0x29b/0x4a0
create_new_namespaces+0x4a1/0x820
nr_base_pages: 16
...
...
echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt
cat stacks_7000.txt
prep_new_page+0xa9/0x120
get_page_from_freelist+0x7e6/0x2140
__alloc_pages+0x18a/0x370
alloc_pages_mpol+0xdf/0x1e0
folio_alloc+0x14/0x50
filemap_alloc_folio+0xb0/0x100
page_cache_ra_unbounded+0x97/0x180
filemap_fault+0x4b4/0x1200
__do_fault+0x2d/0x110
do_pte_missing+0x4b0/0xa30
__handle_mm_fault+0x7fa/0xb70
handle_mm_fault+0x125/0x300
do_user_addr_fault+0x3c9/0x840
exc_page_fault+0x68/0x150
asm_exc_page_fault+0x22/0x30
stack_count: 8248
post_alloc_hook+0x177/0x1a0
get_page_from_freelist+0xd01/0xd80
__alloc_pages+0x39e/0x7e0
alloc_pages_mpol+0x22e/0x490
folio_alloc+0xd5/0x110
filemap_alloc_folio+0x78/0x230
page_cache_ra_order+0x287/0x6f0
filemap_get_pages+0x517/0x1160
filemap_read+0x304/0x9f0
xfs_file_buffered_read+0xe6/0x1d0 [xfs]
xfs_file_read_iter+0x1f0/0x380 [xfs]
__kernel_read+0x3b9/0x730
kernel_read_file+0x309/0x4d0
__do_sys_finit_module+0x381/0x730
do_syscall_64+0x8d/0x150
entry_SYSCALL_64_after_hwframe+0x62/0x6a
nr_base_pages: 20824
...
cat /sys/kernel/debug/page_owner > page_owner_full.txt
......
......@@ -10024,7 +10024,7 @@ F: drivers/media/platform/st/sti/hva
HWPOISON MEMORY FAILURE HANDLING
M: Miaohe Lin <linmiaohe@huawei.com>
R: Naoya Horiguchi <naoya.horiguchi@nec.com>
R: Naoya Horiguchi <nao.horiguchi@gmail.com>
L: linux-mm@kvack.org
S: Maintained
F: mm/hwpoison-inject.c
......
......@@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = {
#define S_SHIFT 12
static unsigned char
nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
[S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
......
......@@ -67,7 +67,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
*/
ppage = pfn_to_online_page(pfn);
if (!ppage || PageSlab(ppage) || page_has_type(ppage))
if (!ppage)
pcount = 0;
else
pcount = page_mapcount(ppage);
......@@ -124,11 +124,8 @@ u64 stable_page_flags(struct page *page)
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*
* Note that page->_mapcount is overloaded in SLAB, so the
* simple test in page_mapped() is not enough.
*/
if (!PageSlab(page) && page_mapped(page))
if (page_mapped(page))
u |= 1 << KPF_MMAP;
if (PageAnon(page))
u |= 1 << KPF_ANON;
......
......@@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
gid_t i_gid;
int err;
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
if (inode->i_ino == 0)
return -EINVAL;
err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
if (err)
return err;
......@@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
i_uid_write(inode, i_uid);
i_gid_write(inode, i_gid);
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
......
......@@ -1223,14 +1223,16 @@ static inline void page_mapcount_reset(struct page *page)
* a large folio, it includes the number of times this page is mapped
* as part of that folio.
*
* The result is undefined for pages which cannot be mapped into userspace.
* For example SLAB or special types of pages. See function page_has_type().
* They use this field in struct page differently.
* Will report 0 for pages which cannot be mapped into userspace, eg
* slab, page tables and similar.
*/
static inline int page_mapcount(struct page *page)
{
int mapcount = atomic_read(&page->_mapcount) + 1;
/* Handle page_has_type() pages */
if (mapcount < 0)
mapcount = 0;
if (unlikely(PageCompound(page)))
mapcount += folio_entire_mapcount(page_folio(page));
......
......@@ -190,7 +190,6 @@ enum pageflags {
/* At least one page in this folio has the hwpoison flag set */
PG_has_hwpoisoned = PG_error,
PG_hugetlb = PG_active,
PG_large_rmappable = PG_workingset, /* anon or file-backed */
};
......@@ -458,30 +457,51 @@ static __always_inline int TestClearPage##uname(struct page *page) \
TESTSETFLAG(uname, lname, policy) \
TESTCLEARFLAG(uname, lname, policy)
#define FOLIO_TEST_FLAG_FALSE(name) \
static inline bool folio_test_##name(const struct folio *folio) \
{ return false; }
#define FOLIO_SET_FLAG_NOOP(name) \
static inline void folio_set_##name(struct folio *folio) { }
#define FOLIO_CLEAR_FLAG_NOOP(name) \
static inline void folio_clear_##name(struct folio *folio) { }
#define __FOLIO_SET_FLAG_NOOP(name) \
static inline void __folio_set_##name(struct folio *folio) { }
#define __FOLIO_CLEAR_FLAG_NOOP(name) \
static inline void __folio_clear_##name(struct folio *folio) { }
#define FOLIO_TEST_SET_FLAG_FALSE(name) \
static inline bool folio_test_set_##name(struct folio *folio) \
{ return false; }
#define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \
static inline bool folio_test_clear_##name(struct folio *folio) \
{ return false; }
#define FOLIO_FLAG_FALSE(name) \
FOLIO_TEST_FLAG_FALSE(name) \
FOLIO_SET_FLAG_NOOP(name) \
FOLIO_CLEAR_FLAG_NOOP(name)
#define TESTPAGEFLAG_FALSE(uname, lname) \
static inline bool folio_test_##lname(const struct folio *folio) { return false; } \
FOLIO_TEST_FLAG_FALSE(lname) \
static inline int Page##uname(const struct page *page) { return 0; }
#define SETPAGEFLAG_NOOP(uname, lname) \
static inline void folio_set_##lname(struct folio *folio) { } \
FOLIO_SET_FLAG_NOOP(lname) \
static inline void SetPage##uname(struct page *page) { }
#define CLEARPAGEFLAG_NOOP(uname, lname) \
static inline void folio_clear_##lname(struct folio *folio) { } \
FOLIO_CLEAR_FLAG_NOOP(lname) \
static inline void ClearPage##uname(struct page *page) { }
#define __CLEARPAGEFLAG_NOOP(uname, lname) \
static inline void __folio_clear_##lname(struct folio *folio) { } \
__FOLIO_CLEAR_FLAG_NOOP(lname) \
static inline void __ClearPage##uname(struct page *page) { }
#define TESTSETFLAG_FALSE(uname, lname) \
static inline bool folio_test_set_##lname(struct folio *folio) \
{ return 0; } \
FOLIO_TEST_SET_FLAG_FALSE(lname) \
static inline int TestSetPage##uname(struct page *page) { return 0; }
#define TESTCLEARFLAG_FALSE(uname, lname) \
static inline bool folio_test_clear_##lname(struct folio *folio) \
{ return 0; } \
FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \
static inline int TestClearPage##uname(struct page *page) { return 0; }
#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \
......@@ -855,29 +875,6 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
#define PG_head_mask ((1UL << PG_head))
#ifdef CONFIG_HUGETLB_PAGE
int PageHuge(const struct page *page);
SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND)
/**
* folio_test_hugetlb - Determine if the folio belongs to hugetlbfs
* @folio: The folio to test.
*
* Context: Any context. Caller should have a reference on the folio to
* prevent it from being turned into a tail page.
* Return: True for hugetlbfs folios, false for anon folios or folios
* belonging to other filesystems.
*/
static inline bool folio_test_hugetlb(const struct folio *folio)
{
return folio_test_large(folio) &&
test_bit(PG_hugetlb, const_folio_flags(folio, 1));
}
#else
TESTPAGEFLAG_FALSE(Huge, hugetlb)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* PageHuge() only returns true for hugetlbfs pages, but not for
......@@ -933,34 +930,23 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
#endif
/*
* Check if a page is currently marked HWPoisoned. Note that this check is
* best effort only and inherently racy: there is no way to synchronize with
* failing hardware.
*/
static inline bool is_page_hwpoison(struct page *page)
{
if (PageHWPoison(page))
return true;
return PageHuge(page) && PageHWPoison(compound_head(page));
}
/*
* For pages that are never mapped to userspace (and aren't PageSlab),
* page_type may be used. Because it is initialised to -1, we invert the
* sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
* __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
* low bits so that an underflow or overflow of page_mapcount() won't be
* low bits so that an underflow or overflow of _mapcount won't be
* mistaken for a page type value.
*/
#define PAGE_TYPE_BASE 0xf0000000
/* Reserve 0x0000007f to catch underflows of page_mapcount */
/* Reserve 0x0000007f to catch underflows of _mapcount */
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
#define PG_offline 0x00000100
#define PG_table 0x00000200
#define PG_guard 0x00000400
#define PG_hugetlb 0x00000800
#define PageType(page, flag) \
((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
......@@ -977,35 +963,38 @@ static inline int page_has_type(const struct page *page)
return page_type_has_type(page->page_type);
}
#define FOLIO_TYPE_OPS(lname, fname) \
static __always_inline bool folio_test_##fname(const struct folio *folio)\
{ \
return folio_test_type(folio, PG_##lname); \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
folio->page.page_type &= ~PG_##lname; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type |= PG_##lname; \
}
#define PAGE_TYPE_OPS(uname, lname, fname) \
FOLIO_TYPE_OPS(lname, fname) \
static __always_inline int Page##uname(const struct page *page) \
{ \
return PageType(page, PG_##lname); \
} \
static __always_inline int folio_test_##fname(const struct folio *folio)\
{ \
return folio_test_type(folio, PG_##lname); \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!PageType(page, 0), page); \
page->page_type &= ~PG_##lname; \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \
folio->page.page_type &= ~PG_##lname; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type |= PG_##lname; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type |= PG_##lname; \
} \
}
/*
* PageBuddy() indicates that the page is free and in the buddy system
......@@ -1052,6 +1041,37 @@ PAGE_TYPE_OPS(Table, table, pgtable)
*/
PAGE_TYPE_OPS(Guard, guard, guard)
#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
#else
FOLIO_TEST_FLAG_FALSE(hugetlb)
#endif
/**
* PageHuge - Determine if the page belongs to hugetlbfs
* @page: The page to test.
*
* Context: Any context.
* Return: True for hugetlbfs pages, false for anon pages or pages
* belonging to other filesystems.
*/
static inline bool PageHuge(const struct page *page)
{
return folio_test_hugetlb(page_folio(page));
}
/*
* Check if a page is currently marked HWPoisoned. Note that this check is
* best effort only and inherently racy: there is no way to synchronize with
* failing hardware.
*/
static inline bool is_page_hwpoison(struct page *page)
{
if (PageHWPoison(page))
return true;
return PageHuge(page) && PageHWPoison(compound_head(page));
}
extern bool is_free_buddy_page(struct page *page);
PAGEFLAG(Isolated, isolated, PF_ANY);
......@@ -1118,7 +1138,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
*/
#define PAGE_FLAGS_SECOND \
(0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \
1UL << PG_hugetlb | 1UL << PG_large_rmappable)
1UL << PG_large_rmappable)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
......
......@@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
struct mm_struct *mm, unsigned long vm_flags);
#else
static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
struct mm_struct *mm, unsigned long vm_flags)
{
return false;
}
#endif
#ifdef CONFIG_SHMEM
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
#else
......
......@@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_MEMORY_FAILURE
/*
* Support for hardware poisoned pages
*/
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
BUG_ON(!PageLocked(page));
return swp_entry(SWP_HWPOISON, page_to_pfn(page));
}
static inline int is_hwpoison_entry(swp_entry_t entry)
{
return swp_type(entry) == SWP_HWPOISON;
}
#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
return swp_entry(0, 0);
}
static inline int is_hwpoison_entry(swp_entry_t swp)
{
return 0;
}
#endif
typedef unsigned long pte_marker;
#define PTE_MARKER_UFFD_WP BIT(0)
......@@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
/*
* A pfn swap entry is a special type of swap entry that always has a pfn stored
* in the swap offset. They are used to represent unaddressable device memory
* and to restrict access to a page undergoing migration.
* in the swap offset. They can either be used to represent unaddressable device
* memory, to restrict access to a page undergoing migration or to represent a
* pfn which has been hwpoisoned and unmapped.
*/
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
......@@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry)
BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
return is_migration_entry(entry) || is_device_private_entry(entry) ||
is_device_exclusive_entry(entry);
is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
}
struct page_vma_mapped_walk;
......@@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
}
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
#ifdef CONFIG_MEMORY_FAILURE
/*
* Support for hardware poisoned pages
*/
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
BUG_ON(!PageLocked(page));
return swp_entry(SWP_HWPOISON, page_to_pfn(page));
}
static inline int is_hwpoison_entry(swp_entry_t entry)
{
return swp_type(entry) == SWP_HWPOISON;
}
#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
return swp_entry(0, 0);
}
static inline int is_hwpoison_entry(swp_entry_t swp)
{
return 0;
}
#endif
static inline int non_swap_entry(swp_entry_t entry)
{
return swp_type(entry) >= MAX_SWAPFILES;
......
......@@ -135,6 +135,7 @@ IF_HAVE_PG_ARCH_X(arch_3)
#define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) }
#define __def_pagetype_names \
DEF_PAGETYPE_NAME(hugetlb), \
DEF_PAGETYPE_NAME(offline), \
DEF_PAGETYPE_NAME(guard), \
DEF_PAGETYPE_NAME(table), \
......
......@@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
vm_flags_clear(tmp, VM_LOCKED_MASK);
/*
* Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
/*
* Link the vma into the MT. After using __mt_dup(), memory
* allocation is not necessary here, so it cannot fail.
*/
vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
......@@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
i_mmap_unlock_write(mapping);
}
/*
* Copy/update hugetlb private vma information.
*/
if (is_vm_hugetlb_page(tmp))
hugetlb_dup_vma_private(tmp);
/*
* Link the vma into the MT. After using __mt_dup(), memory
* allocation is not necessary here, so it cannot fail.
*/
vma_iter_bulk_store(&vmi, tmp);
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
retval = copy_page_range(tmp, mpnt);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
if (retval) {
mpnt = vma_next(&vmi);
goto loop_out;
......
......@@ -205,11 +205,10 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_NUMBER(PG_head_mask);
#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy)
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
#ifdef CONFIG_HUGETLB_PAGE
VMCOREINFO_NUMBER(PG_hugetlb);
#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb)
VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE);
#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline)
VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE);
#endif
#ifdef CONFIG_KALLSYMS
VMCOREINFO_SYMBOL(kallsyms_names);
......
......@@ -627,10 +627,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
/*
* Zero out zone modifiers, as we don't have specific zone
* requirements. Keep the flags related to allocation in atomic
* contexts and I/O.
* contexts, I/O, nolockdep.
*/
alloc_flags &= ~GFP_ZONEMASK;
alloc_flags &= (GFP_ATOMIC | GFP_KERNEL);
alloc_flags &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP);
alloc_flags |= __GFP_NOWARN;
page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
if (page)
......
......@@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm,
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
/*
* MADV_POPULATE_(READ|WRITE) wants to handle VMA
* lookups+error reporting differently.
*/
if (gup_flags & FOLL_MADV_POPULATE) {
vma = vma_lookup(mm, start);
if (!vma) {
ret = -ENOMEM;
goto out;
}
if (check_vma_flags(vma, gup_flags)) {
ret = -EINVAL;
goto out;
}
goto retry;
}
vma = gup_vma_lookup(mm, start);
if (!vma && in_gate_area(mm, start)) {
ret = get_gate_page(mm, start & PAGE_MASK,
......@@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma,
}
/*
* faultin_vma_page_range() - populate (prefault) page tables inside the
* given VMA range readable/writable
* faultin_page_range() - populate (prefault) page tables inside the
* given range readable/writable
*
* This takes care of mlocking the pages, too, if VM_LOCKED is set.
*
* @vma: target vma
* @mm: the mm to populate page tables in
* @start: start address
* @end: end address
* @write: whether to prefault readable or writable
* @locked: whether the mmap_lock is still held
*
* Returns either number of processed pages in the vma, or a negative error
* code on error (see __get_user_pages()).
* Returns either number of processed pages in the MM, or a negative error
* code on error (see __get_user_pages()). Note that this function reports
* errors related to VMAs, such as incompatible mappings, as expected by
* MADV_POPULATE_(READ|WRITE).
*
* vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
* covered by the VMA. If it's released, *@locked will be set to 0.
* The range must be page-aligned.
*
* mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
*/
long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, bool write, int *locked)
long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;
int gup_flags;
long ret;
VM_BUG_ON(!PAGE_ALIGNED(start));
VM_BUG_ON(!PAGE_ALIGNED(end));
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
/*
......@@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
* a poisoned page.
* !FOLL_FORCE: Require proper access permissions.
*/
gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
FOLL_MADV_POPULATE;
if (write)
gup_flags |= FOLL_WRITE;
/*
* We want to report -EINVAL instead of -EFAULT for any permission
* problems or incompatible mappings.
*/
if (check_vma_flags(vma, gup_flags))
return -EINVAL;
ret = __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, locked);
ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
gup_flags);
lru_add_drain();
return ret;
}
......
......@@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
goto unlock_ptls;
}
folio_move_anon_rmap(src_folio, dst_vma);
WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
/* Folio got pinned from under us. Put it back and fail the move. */
if (folio_maybe_dma_pinned(src_folio)) {
......@@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
goto unlock_ptls;
}
folio_move_anon_rmap(src_folio, dst_vma);
WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
......
......@@ -1624,7 +1624,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h,
{
lockdep_assert_held(&hugetlb_lock);
folio_clear_hugetlb(folio);
__folio_clear_hugetlb(folio);
}
/*
......@@ -1711,7 +1711,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
h->surplus_huge_pages_node[nid]++;
}
folio_set_hugetlb(folio);
__folio_set_hugetlb(folio);
folio_change_private(folio, NULL);
/*
* We have to set hugetlb_vmemmap_optimized again as above
......@@ -1781,7 +1781,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
* If vmemmap pages were allocated above, then we need to clear the
* hugetlb destructor under the hugetlb lock.
*/
if (clear_dtor) {
if (folio_test_hugetlb(folio)) {
spin_lock_irq(&hugetlb_lock);
__clear_hugetlb_destructor(h, folio);
spin_unlock_irq(&hugetlb_lock);
......@@ -2049,7 +2049,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
folio_set_hugetlb(folio);
__folio_set_hugetlb(folio);
INIT_LIST_HEAD(&folio->lru);
hugetlb_set_folio_subpool(folio, NULL);
set_hugetlb_cgroup(folio, NULL);
......@@ -2159,22 +2159,6 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
return __prep_compound_gigantic_folio(folio, order, true);
}
/*
* PageHuge() only returns true for hugetlbfs pages, but not for normal or
* transparent huge pages. See the PageTransHuge() documentation for more
* details.
*/
int PageHuge(const struct page *page)
{
const struct folio *folio;
if (!PageCompound(page))
return 0;
folio = page_folio(page);
return folio_test_hugetlb(folio);
}
EXPORT_SYMBOL_GPL(PageHuge);
/*
* Find and lock address space (mapping) in write mode.
*
......@@ -3268,9 +3252,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
if (deferred_reserve)
if (deferred_reserve) {
spin_lock_irq(&hugetlb_lock);
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
spin_unlock_irq(&hugetlb_lock);
}
}
if (!memcg_charge_ret)
......@@ -6274,6 +6261,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
VM_UFFD_MISSING);
}
if (!(vma->vm_flags & VM_MAYSHARE)) {
ret = vmf_anon_prepare(vmf);
if (unlikely(ret))
goto out;
}
folio = alloc_hugetlb_folio(vma, haddr, 0);
if (IS_ERR(folio)) {
/*
......@@ -6310,15 +6303,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
*/
restore_reserve_on_error(h, vma, haddr, folio);
folio_put(folio);
ret = VM_FAULT_SIGBUS;
goto out;
}
new_pagecache_folio = true;
} else {
folio_lock(folio);
ret = vmf_anon_prepare(vmf);
if (unlikely(ret))
goto backout_unlocked;
anon_rmap = 1;
}
} else {
......@@ -7044,9 +7034,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
if (!pte_same(pte, newpte))
set_huge_pte_at(mm, address, ptep, newpte, psize);
} else if (unlikely(is_pte_marker(pte))) {
/* No other markers apply for now. */
WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
if (uffd_wp_resolve)
/*
* Do nothing on a poison marker; page is
* corrupted, permissons do not apply. Here
* pte_marker_uffd_wp()==true implies !poison
* because they're mutual exclusive.
*/
if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
/* Safe to modify directly (non-present->none). */
huge_pte_clear(mm, address, ptep, psize);
} else if (!huge_pte_none(pte)) {
......
......@@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio);
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
unsigned long bytes);
......@@ -1127,10 +1126,13 @@ enum {
FOLL_FAST_ONLY = 1 << 20,
/* allow unlocking the mmap lock */
FOLL_UNLOCKABLE = 1 << 21,
/* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
FOLL_MADV_POPULATE = 1 << 22,
};
#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
FOLL_MADV_POPULATE)
/*
* Indicates for which pages that are write-protected in the page table,
......
......@@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma,
{
const bool write = behavior == MADV_POPULATE_WRITE;
struct mm_struct *mm = vma->vm_mm;
unsigned long tmp_end;
int locked = 1;
long pages;
*prev = vma;
while (start < end) {
/*
* We might have temporarily dropped the lock. For example,
* our VMA might have been split.
*/
if (!vma || start >= vma->vm_end) {
vma = vma_lookup(mm, start);
if (!vma)
return -ENOMEM;
}
tmp_end = min_t(unsigned long, end, vma->vm_end);
/* Populate (prefault) page tables readable/writable. */
pages = faultin_vma_page_range(vma, start, tmp_end, write,
&locked);
pages = faultin_page_range(mm, start, end, write, &locked);
if (!locked) {
mmap_read_lock(mm);
locked = 1;
......@@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma,
pr_warn_once("%s: unhandled return value: %ld\n",
__func__, pages);
fallthrough;
case -ENOMEM:
case -ENOMEM: /* No VMA or out of memory. */
return -ENOMEM;
}
}
......
......@@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page)
{
int ret;
zone_pcp_disable(page_zone(page));
/*
* zone_pcp_disable() can't be used here. It will
* hold pcp_batch_high_lock and dissolve_free_huge_page() might hold
* cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
* optimization is enabled. This will break current lock dependency
* chain and leads to deadlock.
* Disabling pcp before dissolving the page was a deterministic
* approach because we made sure that those pages cannot end up in any
* PCP list. Draining PCP lists expels those pages to the buddy system,
* but nothing guarantees that those pages do not get back to a PCP
* queue if we need to refill those.
*/
ret = dissolve_free_huge_page(page);
if (!ret)
if (!ret) {
drain_all_pages(page_zone(page));
ret = take_page_off_buddy(page);
zone_pcp_enable(page_zone(page));
}
return ret;
}
......
This diff is collapsed.
......@@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
struct mm_struct *mm, unsigned long vm_flags)
{
return false;
}
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shrink_control *sc, unsigned long nr_to_split)
{
......
......@@ -1331,15 +1331,22 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
if (!gfp_has_io_fs(sc->gfp_mask))
return 0;
#ifdef CONFIG_MEMCG_KMEM
mem_cgroup_flush_stats(memcg);
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
#else
/* use pool stats instead of memcg stats */
nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
nr_stored = atomic_read(&zswap_nr_stored);
#endif
/*
* For memcg, use the cgroup-wide ZSWAP stats since we don't
* have them per-node and thus per-lruvec. Careful if memcg is
* runtime-disabled: we can get sc->memcg == NULL, which is ok
* for the lruvec, but not for memcg_page_state().
*
* Without memcg, use the zswap pool-wide metrics.
*/
if (!mem_cgroup_disabled()) {
mem_cgroup_flush_stats(memcg);
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else {
nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
nr_stored = atomic_read(&zswap_nr_stored);
}
if (!nr_stored)
return 0;
......
......@@ -56,7 +56,6 @@
#include <asm/types.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
......@@ -1156,7 +1155,7 @@ void __run_test(struct __fixture_metadata *f,
struct __test_metadata *t)
{
struct __test_xfail *xfail;
char test_name[LINE_MAX];
char *test_name;
const char *diagnostic;
/* reset test struct */
......@@ -1164,8 +1163,12 @@ void __run_test(struct __fixture_metadata *f,
t->trigger = 0;
memset(t->results->reason, 0, sizeof(t->results->reason));
snprintf(test_name, sizeof(test_name), "%s%s%s.%s",
f->name, variant->name[0] ? "." : "", variant->name, t->name);
if (asprintf(&test_name, "%s%s%s.%s", f->name,
variant->name[0] ? "." : "", variant->name, t->name) == -1) {
ksft_print_msg("ERROR ALLOCATING MEMORY\n");
t->exit_code = KSFT_FAIL;
_exit(t->exit_code);
}
ksft_print_msg(" RUN %s ...\n", test_name);
......@@ -1203,6 +1206,7 @@ void __run_test(struct __fixture_metadata *f,
ksft_test_result_code(t->exit_code, test_name,
diagnostic ? "%s" : "", diagnostic);
free(test_name);
}
static int test_harness_run(int argc, char **argv)
......
......@@ -7,6 +7,7 @@
#include <linux/mman.h>
#include <linux/prctl.h>
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/auxv.h>
......
......@@ -54,7 +54,6 @@ int test_nr;
u64 shadow_pkey_reg;
int dprint_in_signal;
char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
char buf[256];
void cat_into_file(char *str, char *file)
{
......@@ -1745,42 +1744,6 @@ void pkey_setup_shadow(void)
shadow_pkey_reg = __read_pkey_reg();
}
pid_t parent_pid;
void restore_settings_atexit(void)
{
if (parent_pid == getpid())
cat_into_file(buf, "/proc/sys/vm/nr_hugepages");
}
void save_settings(void)
{
int fd;
int err;
if (geteuid())
return;
fd = open("/proc/sys/vm/nr_hugepages", O_RDONLY);
if (fd < 0) {
fprintf(stderr, "error opening\n");
perror("error: ");
exit(__LINE__);
}
/* -1 to guarantee leaving the trailing \0 */
err = read(fd, buf, sizeof(buf)-1);
if (err < 0) {
fprintf(stderr, "error reading\n");
perror("error: ");
exit(__LINE__);
}
parent_pid = getpid();
atexit(restore_settings_atexit);
close(fd);
}
int main(void)
{
int nr_iterations = 22;
......@@ -1788,7 +1751,6 @@ int main(void)
srand((unsigned int)time(NULL));
save_settings();
setup_handlers();
printf("has pkeys: %d\n", pkeys_supported);
......
......@@ -385,6 +385,7 @@ CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0
CATEGORY="ksm" run_test ./ksm_functional_tests
# protection_keys tests
nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
if [ -x ./protection_keys_32 ]
then
CATEGORY="pkey" run_test ./protection_keys_32
......@@ -394,6 +395,7 @@ if [ -x ./protection_keys_64 ]
then
CATEGORY="pkey" run_test ./protection_keys_64
fi
echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
if [ -x ./soft-dirty ]
then
......
......@@ -300,7 +300,7 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
char **addr)
{
size_t i;
int dummy;
int __attribute__((unused)) dummy = 0;
srand(time(NULL));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment