Commit 967c28b2 authored by Gao Xiang's avatar Gao Xiang

erofs: kill hooked chains to avoid loops on deduplicated compressed images

After heavily stressing EROFS with several images which include a
hand-crafted image of repeated patterns for more than 46 days, I found
two chains could be linked with each other almost simultaneously and
form a loop so that the entire loop won't be submitted.  As a
consequence, the corresponding file pages will remain locked forever.

It can be _only_ observed on data-deduplicated compressed images.
For example, consider two chains with five pclusters in total:
	Chain 1:  2->3->4->5    -- The tail pcluster is 5;
        Chain 2:  5->1->2       -- The tail pcluster is 2.

Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link
to Chain 2 at the same time with pcluster 2.

Since hooked chains are all linked locklessly now, I have no idea how
to simply avoid the race.  Instead, let's avoid hooked chains completely
until I could work out a proper way to fix this and end users finally
tell us that it's needed to add it back.

Actually, this optimization can be found with multi-threaded workloads
(especially even more often on deduplicated compressed images), yet I'm
not sure about the overall system impacts of not having this compared
with implementation complexity.

Fixes: 267f2492 ("erofs: introduce multi-reference pclusters (fully-referenced)")
Signed-off-by: default avatarGao Xiang <hsiangkao@linux.alibaba.com>
Reviewed-by: default avatarYue Hu <huyue2@coolpad.com>
Link: https://lore.kernel.org/r/20230526201459.128169-4-hsiangkao@linux.alibaba.com
parent 6ab5eed6
...@@ -93,11 +93,8 @@ struct z_erofs_pcluster { ...@@ -93,11 +93,8 @@ struct z_erofs_pcluster {
/* let's avoid the valid 32-bit kernel addresses */ /* let's avoid the valid 32-bit kernel addresses */
/* the chained workgroup has't submitted io (still open) */ /* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) #define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
/* the chained workgroup has already submitted io */
#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
#define Z_EROFS_PCLUSTER_NIL (NULL) #define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue { struct z_erofs_decompressqueue {
...@@ -504,20 +501,6 @@ int __init z_erofs_init_zip_subsystem(void) ...@@ -504,20 +501,6 @@ int __init z_erofs_init_zip_subsystem(void)
enum z_erofs_pclustermode { enum z_erofs_pclustermode {
Z_EROFS_PCLUSTER_INFLIGHT, Z_EROFS_PCLUSTER_INFLIGHT,
/*
* The current pclusters was the tail of an exist chain, in addition
* that the previous processed chained pclusters are all decided to
* be hooked up to it.
* A new chain will be created for the remaining pclusters which are
* not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
* the next pcluster cannot reuse the whole page safely for inplace I/O
* in the following scenario:
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (belongs to the next pcl) | (belongs to the current pcl) |
* |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
*/
Z_EROFS_PCLUSTER_HOOKED,
/* /*
* a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
* could be dispatched into bypass queue later due to uptodated managed * could be dispatched into bypass queue later due to uptodated managed
...@@ -535,8 +518,8 @@ enum z_erofs_pclustermode { ...@@ -535,8 +518,8 @@ enum z_erofs_pclustermode {
* ________________________________________________________________ * ________________________________________________________________
* | tail (partial) page | head (partial) page | * | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) | * | (of the current cl) | (of the previous collection) |
* | PCLUSTER_FOLLOWED or | | * | | |
* |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
* *
* [ (*) the above page can be used as inplace I/O. ] * [ (*) the above page can be used as inplace I/O. ]
*/ */
...@@ -550,7 +533,7 @@ struct z_erofs_decompress_frontend { ...@@ -550,7 +533,7 @@ struct z_erofs_decompress_frontend {
struct page *pagepool; struct page *pagepool;
struct page *candidate_bvpage; struct page *candidate_bvpage;
struct z_erofs_pcluster *pcl, *tailpcl; struct z_erofs_pcluster *pcl;
z_erofs_next_pcluster_t owned_head; z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode; enum z_erofs_pclustermode mode;
...@@ -755,19 +738,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) ...@@ -755,19 +738,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
return; return;
} }
/* /* type 2, it belongs to an ongoing chain */
* type 2, link to the end of an existing open chain, be careful
* that its submission is controlled by the original attached chain.
*/
if (*owned_head != &pcl->next && pcl != f->tailpcl &&
cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
f->mode = Z_EROFS_PCLUSTER_HOOKED;
f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
f->mode = Z_EROFS_PCLUSTER_INFLIGHT; f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
} }
...@@ -828,9 +799,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) ...@@ -828,9 +799,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
goto err_out; goto err_out;
} }
} }
/* used to check tail merging loop due to corrupted images */
if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
fe->tailpcl = pcl;
fe->owned_head = &pcl->next; fe->owned_head = &pcl->next;
fe->pcl = pcl; fe->pcl = pcl;
return 0; return 0;
...@@ -851,7 +819,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) ...@@ -851,7 +819,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (!(map->m_flags & EROFS_MAP_META)) { if (!(map->m_flags & EROFS_MAP_META)) {
grp = erofs_find_workgroup(fe->inode->i_sb, grp = erofs_find_workgroup(fe->inode->i_sb,
...@@ -870,10 +837,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) ...@@ -870,10 +837,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) { if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock); mutex_lock(&fe->pcl->lock);
/* used to check tail merging loop due to corrupted images */
if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
fe->tailpcl = fe->pcl;
z_erofs_try_to_claim_pcluster(fe); z_erofs_try_to_claim_pcluster(fe);
} else if (ret) { } else if (ret) {
return ret; return ret;
...@@ -1028,8 +991,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, ...@@ -1028,8 +991,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
* those chains are handled asynchronously thus the page cannot be used * those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.) * for inplace I/O or bvpage (should be processed in a strict order.)
*/ */
tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end); cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) { if (!(map->m_flags & EROFS_MAP_MAPPED)) {
...@@ -1398,10 +1360,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, ...@@ -1398,10 +1360,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
}; };
z_erofs_next_pcluster_t owned = io->head; z_erofs_next_pcluster_t owned = io->head;
while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { while (owned != Z_EROFS_PCLUSTER_TAIL) {
/* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
/* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
be.pcl = container_of(owned, struct z_erofs_pcluster, next); be.pcl = container_of(owned, struct z_erofs_pcluster, next);
...@@ -1418,7 +1377,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) ...@@ -1418,7 +1377,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
container_of(work, struct z_erofs_decompressqueue, u.work); container_of(work, struct z_erofs_decompressqueue, u.work);
struct page *pagepool = NULL; struct page *pagepool = NULL;
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
z_erofs_decompress_queue(bgq, &pagepool); z_erofs_decompress_queue(bgq, &pagepool);
erofs_release_pages(&pagepool); erofs_release_pages(&pagepool);
kvfree(bgq); kvfree(bgq);
...@@ -1606,7 +1565,7 @@ static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, ...@@ -1606,7 +1565,7 @@ static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
q->sync = true; q->sync = true;
} }
q->sb = sb; q->sb = sb;
q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; q->head = Z_EROFS_PCLUSTER_TAIL;
return q; return q;
} }
...@@ -1624,11 +1583,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, ...@@ -1624,11 +1583,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
if (owned_head == Z_EROFS_PCLUSTER_TAIL)
owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
WRITE_ONCE(*submit_qtail, owned_head); WRITE_ONCE(*submit_qtail, owned_head);
WRITE_ONCE(*bypass_qtail, &pcl->next); WRITE_ONCE(*bypass_qtail, &pcl->next);
...@@ -1698,15 +1653,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, ...@@ -1698,15 +1653,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
unsigned int i = 0; unsigned int i = 0;
bool bypass = true; bool bypass = true;
/* no possible 'owned_head' equals the following */
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
pcl = container_of(owned_head, struct z_erofs_pcluster, next); pcl = container_of(owned_head, struct z_erofs_pcluster, next);
owned_head = READ_ONCE(pcl->next);
/* close the main owned chain at first */
owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (z_erofs_is_inline_pcluster(pcl)) { if (z_erofs_is_inline_pcluster(pcl)) {
move_to_bypass_jobqueue(pcl, qtail, owned_head); move_to_bypass_jobqueue(pcl, qtail, owned_head);
continue; continue;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment