Commit cc4b2dd9 authored by Gao Xiang

erofs: fix infinite loop due to a race of filling compressed_bvecs

I encountered a race issue after lengthy (~594647 secs) stress tests on
a 64k-page arm64 VM with several 4k-block EROFS images.  The timing
is as follows:

z_erofs_try_inplace_io                  z_erofs_fill_bio_vec
  cmpxchg(&compressed_bvecs[].page,
          NULL, ..)
                                        [access bufvec]
  compressed_bvecs[] = *bvec;

Previously, z_erofs_submit_queue() accessed only bufvec->page, so the
other fields in bufvec didn't matter.  After subpage block support
landed, .offset and .end can be used too, but filling bufvec isn't an
atomic operation, so a reader can observe an inconsistent bufvec.

Let's use a spinlock to keep each bufvec update atomic.  More
specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`,
since it is rarely taken (and is held only briefly when it is) as long
as the pcluster has a reference.

Fixes: 19235161 ("erofs: support I/O submission for sub-page compressed blocks")
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Reviewed-by: Sandeep Dhavale <dhavale@google.com>
Link: https://lore.kernel.org/r/20240125120039.3228103-1-hsiangkao@linux.alibaba.com
parent 97cf5d53
...@@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) ...@@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i; unsigned int i;
if (i_blocksize(fe->inode) != PAGE_SIZE) if (i_blocksize(fe->inode) != PAGE_SIZE ||
return; fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return; return;
for (i = 0; i < pclusterpages; ++i) { for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage; struct page *page, *newpage;
void *t; /* mark pages just found for debugging */ void *t; /* mark pages just found for debugging */
/* the compressed page was loaded before */ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page)) if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue; continue;
page = find_get_page(mc, pcl->obj.index + i); page = find_get_page(mc, pcl->obj.index + i);
if (page) { if (page) {
t = (void *)((unsigned long)page | 1); t = (void *)((unsigned long)page | 1);
newpage = NULL; newpage = NULL;
...@@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) ...@@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1); t = (void *)((unsigned long)newpage | 1);
} }
spin_lock(&pcl->obj.lockref.lock);
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) if (!pcl->compressed_bvecs[i].page) {
pcl->compressed_bvecs[i].page = t;
spin_unlock(&pcl->obj.lockref.lock);
continue; continue;
}
spin_unlock(&pcl->obj.lockref.lock);
if (page) if (page)
put_page(page); put_page(page);
...@@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb) ...@@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
return 0; return 0;
} }
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec)
{
struct z_erofs_pcluster *const pcl = fe->pcl;
while (fe->icur > 0) {
if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
NULL, bvec->page)) {
pcl->compressed_bvecs[fe->icur] = *bvec;
return true;
}
}
return false;
}
/* callers must be with pcluster lock held */ /* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive) struct z_erofs_bvec *bvec, bool exclusive)
{ {
struct z_erofs_pcluster *pcl = fe->pcl;
int ret; int ret;
if (exclusive) { if (exclusive) {
/* give priority for inplaceio to use file pages first */ /* give priority for inplaceio to use file pages first */
if (z_erofs_try_inplace_io(fe, bvec)) spin_lock(&pcl->obj.lockref.lock);
while (fe->icur > 0) {
if (pcl->compressed_bvecs[--fe->icur].page)
continue;
pcl->compressed_bvecs[fe->icur] = *bvec;
spin_unlock(&pcl->obj.lockref.lock);
return 0; return 0;
}
spin_unlock(&pcl->obj.lockref.lock);
/* otherwise, check if it can be used as a bvpage */ /* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage) !fe->candidate_bvpage)
...@@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{ {
gfp_t gfp = mapping_gfp_mask(mc); gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false; bool tocache = false;
struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr; struct z_erofs_bvec zbv;
struct address_space *mapping; struct address_space *mapping;
struct page *page, *oldpage; struct page *page;
int justfound, bs = i_blocksize(f->inode); int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */ /* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0; bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE; bvec->bv_len = PAGE_SIZE;
repeat: repeat:
oldpage = READ_ONCE(zbv->page); spin_lock(&pcl->obj.lockref.lock);
if (!oldpage) zbv = pcl->compressed_bvecs[nr];
page = zbv.page;
justfound = (unsigned long)page & 1UL;
page = (struct page *)((unsigned long)page & ~1UL);
pcl->compressed_bvecs[nr].page = page;
spin_unlock(&pcl->obj.lockref.lock);
if (!page)
goto out_allocpage; goto out_allocpage;
justfound = (unsigned long)oldpage & 1UL;
page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page; bvec->bv_page = page;
DBG_BUGON(z_erofs_is_shortlived_page(page)); DBG_BUGON(z_erofs_is_shortlived_page(page));
/* /*
* Handle preallocated cached pages. We tried to allocate such pages * Handle preallocated cached pages. We tried to allocate such pages
...@@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
*/ */
if (page->private == Z_EROFS_PREALLOCATED_PAGE) { if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0); set_page_private(page, 0);
WRITE_ONCE(zbv->page, page);
tocache = true; tocache = true;
goto out_tocache; goto out_tocache;
} }
...@@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
* therefore it is impossible for `mapping` to be NULL. * therefore it is impossible for `mapping` to be NULL.
*/ */
if (mapping && mapping != mc) { if (mapping && mapping != mc) {
if (zbv->offset < 0) if (zbv.offset < 0)
bvec->bv_offset = round_up(-zbv->offset, bs); bvec->bv_offset = round_up(-zbv.offset, bs);
bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset; bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return; return;
} }
...@@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
/* the cached page is still in managed cache */ /* the cached page is still in managed cache */
if (page->mapping == mc) { if (page->mapping == mc) {
WRITE_ONCE(zbv->page, page);
/* /*
* The cached page is still available but without a valid * The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them. * `->private` pcluster hint. Let's reconnect them.
...@@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
put_page(page); put_page(page);
out_allocpage: out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
if (oldpage != cmpxchg(&zbv->page, oldpage, page)) { spin_lock(&pcl->obj.lockref.lock);
if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page); erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock);
cond_resched(); cond_resched();
goto repeat; goto repeat;
} }
pcl->compressed_bvecs[nr].page = page;
spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page; bvec->bv_page = page;
out_tocache: out_tocache:
if (!tocache || bs != PAGE_SIZE || if (!tocache || bs != PAGE_SIZE ||
...@@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, ...@@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
if (cur + bvec.bv_len > end) if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur; bvec.bv_len = end - cur;
DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset)) bvec.bv_offset))
goto submit_bio_retry; goto submit_bio_retry;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment