Commit 6fda0bb8 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'mm-hotfixes-stable-2023-04-07-16-23' of...

Merge tag 'mm-hotfixes-stable-2023-04-07-16-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull MM fixes from Andrew Morton:
 "28 hotfixes.

  23 are cc:stable and the other five address issues which were
  introduced during this merge cycle.

  20 are for MM and the remainder are for other subsystems"

* tag 'mm-hotfixes-stable-2023-04-07-16-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (28 commits)
  maple_tree: fix a potential concurrency bug in RCU mode
  maple_tree: fix get wrong data_end in mtree_lookup_walk()
  mm/swap: fix swap_info_struct race between swapoff and get_swap_pages()
  nilfs2: fix sysfs interface lifetime
  mm: take a page reference when removing device exclusive entries
  mm: vmalloc: avoid warn_alloc noise caused by fatal signal
  nilfs2: initialize "struct nilfs_binfo_dat"->bi_pad field
  nilfs2: fix potential UAF of struct nilfs_sc_info in nilfs_segctor_thread()
  zsmalloc: document freeable stats
  zsmalloc: document new fullness grouping
  fsdax: force clear dirty mark if CoW
  mm/hugetlb: fix uffd wr-protection for CoW optimization path
  mm: enable maple tree RCU mode by default
  maple_tree: add RCU lock checking to rcu callback functions
  maple_tree: add smp_rmb() to dead node detection
  maple_tree: fix write memory barrier of nodes once dead for RCU mode
  maple_tree: remove extra smp_wmb() from mas_dead_leaves()
  maple_tree: fix freeing of nodes in rcu mode
  maple_tree: detect dead nodes in mas_start()
  maple_tree: be more cautious about dead nodes
  ...
parents aa318c48 c45ea315
......@@ -265,7 +265,9 @@ Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski@samsung.com>
Krzysztof Kozlowski <krzk@kernel.org> <krzysztof.kozlowski@canonical.com>
Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Kuogee Hsieh <quic_khsieh@quicinc.com> <khsieh@codeaurora.org>
Leonard Crestez <leonard.crestez@nxp.com> Leonard Crestez <cdleonard@gmail.com>
Leonardo Bras <leobras.c@gmail.com> <leonardo@linux.ibm.com>
Leonard Göhrs <l.goehrs@pengutronix.de>
Leonid I Ananiev <leonid.i.ananiev@intel.com>
Leon Romanovsky <leon@kernel.org> <leon@leon.nu>
Leon Romanovsky <leon@kernel.org> <leonro@mellanox.com>
......
This diff is collapsed.
......@@ -781,6 +781,33 @@ static int __dax_invalidate_entry(struct address_space *mapping,
return ret;
}
static int __dax_clear_dirty_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
XA_STATE(xas, &mapping->i_pages, start);
unsigned int scanned = 0;
void *entry;
xas_lock_irq(&xas);
xas_for_each(&xas, entry, end) {
entry = get_unlocked_entry(&xas, 0);
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
if (++scanned % XA_CHECK_SCHED)
continue;
xas_pause(&xas);
xas_unlock_irq(&xas);
cond_resched();
xas_lock_irq(&xas);
}
xas_unlock_irq(&xas);
return 0;
}
/*
* Delete DAX entry at @index from @mapping. Wait for it
* to be unlocked before deleting it.
......@@ -1258,15 +1285,20 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
/* don't bother with blocks that are not shared to start with */
if (!(iomap->flags & IOMAP_F_SHARED))
return length;
/* don't bother with holes or unwritten extents */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return length;
id = dax_read_lock();
ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
if (ret < 0)
goto out_unlock;
/* zero the distance if srcmap is HOLE or UNWRITTEN */
if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
memset(daddr, 0, length);
dax_flush(iomap->dax_dev, daddr, length);
ret = length;
goto out_unlock;
}
ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
if (ret < 0)
goto out_unlock;
......@@ -1435,6 +1467,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
* written by write(2) is visible in mmap.
*/
if (iomap->flags & IOMAP_F_NEW || cow) {
/*
* Filesystem allows CoW on non-shared extents. The src extents
* may have been mmapped with dirty mark before. To be able to
* invalidate its dax entries, we need to clear the dirty mark
* in advance.
*/
if (cow)
__dax_clear_dirty_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
invalidate_inode_pages2_range(iomi->inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
......@@ -2022,8 +2064,8 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
(ret = iomap_iter(&dst_iter, ops)) > 0) {
compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
same);
compared = dax_range_compare_iter(&src_iter, &dst_iter,
min(src_iter.len, dst_iter.len), same);
if (compared < 0)
return ret;
src_iter.processed = dst_iter.processed = compared;
......
......@@ -2219,6 +2219,7 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
/* on-disk format */
binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
binfo->bi_dat.bi_level = level;
memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad));
return 0;
}
......
......@@ -314,6 +314,7 @@ static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
binfo->bi_dat.bi_level = 0;
memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad));
return 0;
}
......
......@@ -2609,11 +2609,10 @@ static int nilfs_segctor_thread(void *arg)
goto loop;
end_thread:
spin_unlock(&sci->sc_state_lock);
/* end sync. */
sci->sc_task = NULL;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
spin_unlock(&sci->sc_state_lock);
return 0;
}
......
......@@ -482,6 +482,7 @@ static void nilfs_put_super(struct super_block *sb)
up_write(&nilfs->ns_sem);
}
nilfs_sysfs_delete_device_group(nilfs);
iput(nilfs->ns_sufile);
iput(nilfs->ns_cpfile);
iput(nilfs->ns_dat);
......@@ -1105,6 +1106,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
nilfs_put_root(fsroot);
failed_unload:
nilfs_sysfs_delete_device_group(nilfs);
iput(nilfs->ns_sufile);
iput(nilfs->ns_cpfile);
iput(nilfs->ns_dat);
......
......@@ -87,7 +87,6 @@ void destroy_nilfs(struct the_nilfs *nilfs)
{
might_sleep();
if (nilfs_init(nilfs)) {
nilfs_sysfs_delete_device_group(nilfs);
brelse(nilfs->ns_sbh[0]);
brelse(nilfs->ns_sbh[1]);
}
......@@ -305,6 +304,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto failed;
}
err = nilfs_sysfs_create_device_group(sb);
if (unlikely(err))
goto sysfs_error;
if (valid_fs)
goto skip_recovery;
......@@ -366,6 +369,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto failed;
failed_unload:
nilfs_sysfs_delete_device_group(nilfs);
sysfs_error:
iput(nilfs->ns_cpfile);
iput(nilfs->ns_sufile);
iput(nilfs->ns_dat);
......@@ -697,10 +703,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
err = nilfs_sysfs_create_device_group(sb);
if (err)
goto failed_sbh;
set_nilfs_init(nilfs);
err = 0;
out:
......
......@@ -774,7 +774,8 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;
/* Pointer magic because the dynamic array size confuses some compilers. */
......
......@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
......@@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
if (!retval)
mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
......
......@@ -1143,7 +1143,7 @@ menu "Scheduler Debugging"
config SCHED_DEBUG
bool "Collect scheduler debugging info"
depends on DEBUG_KERNEL && PROC_FS
depends on DEBUG_KERNEL && DEBUG_FS
default y
help
If you say Y here, the /sys/kernel/debug/sched file will be provided
......@@ -1392,7 +1392,7 @@ config LOCKDEP_STACK_TRACE_HASH_BITS
range 10 30
default 14
help
Try increasing this value if you need large MAX_STACK_TRACE_ENTRIES.
Try increasing this value if you need large STACK_TRACE_HASH_SIZE.
config LOCKDEP_CIRCULAR_QUEUE_BITS
int "Bitsize for elements in circular_queue struct"
......
This diff is collapsed.
......@@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
struct folio *pagecache_folio, spinlock_t *ptl)
{
const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte;
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(vma);
struct page *old_page;
struct folio *new_folio;
......@@ -5487,6 +5487,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
/*
* Never handle CoW for uffd-wp protected pages. It should be only
* handled when the uffd-wp protection is removed.
*
* Note that only the CoW optimization path (in hugetlb_no_page())
* can trigger this, because hugetlb_fault() will always resolve
* uffd-wp bit first.
*/
if (!unshare && huge_pte_uffd_wp(pte))
return 0;
/*
* hugetlb does not support FOLL_FORCE-style write faults that keep the
* PTE mapped R/O such as maybe_mkwrite() would do.
......@@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
delayacct_wpcopy_start();
......
......@@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
struct slab *slab = page_slab(&pages[i]);
struct slab *slab = page_slab(nth_page(pages, i));
if (!i || (i % 2))
continue;
/* Verify we do not have a compound head page. */
if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
return addr;
__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
......@@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void)
/* Protect the right redzone. */
if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
return addr;
goto reset_slab;
addr += 2 * PAGE_SIZE;
}
return 0;
reset_slab:
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
struct slab *slab = page_slab(nth_page(pages, i));
if (!i || (i % 2))
continue;
#ifdef CONFIG_MEMCG
slab->memcg_data = 0;
#endif
__folio_clear_slab(slab_folio(slab));
}
return addr;
}
static bool __init kfence_init_pool_early(void)
......@@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
struct slab *slab = virt_to_slab(p);
if (!slab)
continue;
#ifdef CONFIG_MEMCG
slab->memcg_data = 0;
#endif
__folio_clear_slab(slab_folio(slab));
}
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
......
......@@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
/*
* We need a reference to lock the folio because we don't hold
* the PTL so a racing thread can remove the device-exclusive
* entry and unmap it. If the folio is free the entry must
* have been removed already. If it happens to have already
* been re-allocated after being freed all we do is lock and
* unlock it.
*/
if (!folio_try_get(folio))
return 0;
if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
folio_put(folio);
return VM_FAULT_RETRY;
}
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
......@@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
folio_unlock(folio);
folio_put(folio);
mmu_notifier_invalidate_range_end(&range);
return 0;
......
......@@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
int count = 0;
int error = -ENOMEM;
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
/*
......@@ -3037,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
......
......@@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
{
int nid;
assert_spin_locked(&p->lock);
for_each_node(nid)
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
......@@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
del_from_avail_list(p);
spin_lock(&p->lock);
del_from_avail_list(p);
if (p->prio < 0) {
struct swap_info_struct *si = p;
int nid;
......
......@@ -3042,6 +3042,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
/* vm_area_alloc_pages() can also fail due to a fatal signal */
if (!fatal_signal_pending(current))
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
......
......@@ -108,6 +108,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mn->slot[1] != NULL);
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mas.node = MAS_START;
mas_nomem(&mas, GFP_KERNEL);
......@@ -160,6 +161,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != i);
MT_BUG_ON(mt, !mn);
MT_BUG_ON(mt, not_empty(mn));
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
......@@ -192,6 +194,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, not_empty(mn));
MT_BUG_ON(mt, mas_allocated(&mas) != i - 1);
MT_BUG_ON(mt, !mn);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
......@@ -210,6 +213,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
MT_BUG_ON(mt, mas_allocated(&mas) != j - 1);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
......@@ -233,6 +237,7 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != i - j);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1);
}
......@@ -269,6 +274,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mn = mas_pop_node(&mas); /* get the next node. */
MT_BUG_ON(mt, mn == NULL);
MT_BUG_ON(mt, not_empty(mn));
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
......@@ -294,6 +300,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mn = mas_pop_node(&mas2); /* get the next node. */
MT_BUG_ON(mt, mn == NULL);
MT_BUG_ON(mt, not_empty(mn));
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas2) != 0);
......@@ -334,10 +341,12 @@ static noinline void check_new_node(struct maple_tree *mt)
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) {
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, not_empty(mn));
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
}
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
......@@ -375,6 +384,7 @@ static noinline void check_new_node(struct maple_tree *mt)
mas_node_count(&mas, i); /* Request */
mas_nomem(&mas, GFP_KERNEL); /* Fill request */
mn = mas_pop_node(&mas); /* get the next node. */
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mas_destroy(&mas);
......@@ -382,10 +392,13 @@ static noinline void check_new_node(struct maple_tree *mt)
mas_node_count(&mas, i); /* Request */
mas_nomem(&mas, GFP_KERNEL); /* Fill request */
mn = mas_pop_node(&mas); /* get the next node. */
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mn = mas_pop_node(&mas); /* get the next node. */
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mn = mas_pop_node(&mas); /* get the next node. */
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
mas_destroy(&mas);
}
......@@ -35369,6 +35382,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
MT_BUG_ON(mt, allocated != 1 + height * 3);
mn = mas_pop_node(&mas);
MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
mas_destroy(&mas);
......@@ -35386,6 +35400,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
mas_destroy(&mas);
allocated = mas_allocated(&mas);
MT_BUG_ON(mt, allocated != 0);
mn->parent = ma_parent_ptr(mn);
ma_free_rcu(mn);
MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
......@@ -35756,6 +35771,7 @@ void farmer_tests(void)
tree.ma_root = mt_mk_node(node, maple_leaf_64);
mt_dump(&tree);
node->parent = ma_parent_ptr(node);
ma_free_rcu(node);
/* Check things that will make lockdep angry */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment