Commit 410bb2ce authored by Christoph Hellwig, committed by Christian Brauner

iomap: submit ioends immediately

Currently the writeback code delays submitting filled ioends until we
reach the end of the folio.  The reason for that is that otherwise
the end I/O handler could clear the writeback bit before we've even
finished submitting all I/O for the folio.

Add a bias to ifs->write_bytes_pending while we are submitting I/O
for a folio so that it never reaches zero until all I/O is completed
to prevent the early writeback bit clearing, and remove the now
superfluous submit_list.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20231207072710.176093-13-hch@lst.de
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
parent f525152a
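
The core idea of the patch is the "bias" on ifs->write_bytes_pending: take one extra count before submitting any block I/O for the folio, so that per-block completions racing with submission can never drive the counter to zero, then drop the extra count once everything has been submitted. The standalone sketch below (editorial illustration, not part of the commit) shows that pattern with C11 atomics rather than the kernel's atomic_t; the names pending, block_io_done() and end_writeback() are hypothetical stand-ins for ifs->write_bytes_pending and folio_end_writeback().

/*
 * Editorial sketch (not from the commit): the "bias" pattern applied to a
 * plain C11 atomic counter.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending;	/* bytes still under I/O for one folio */

static void end_writeback(void)
{
	puts("writeback bit cleared");
}

/* Completion side: runs once per block, possibly while we are still submitting. */
static void block_io_done(int len)
{
	/* atomic_fetch_sub() returns the old value; old == len means we hit zero. */
	if (atomic_fetch_sub(&pending, len) == len)
		end_writeback();
}

int main(void)
{
	/* Bias the counter so racing completions cannot reach zero early. */
	atomic_fetch_add(&pending, 1);

	for (int i = 0; i < 4; i++) {
		atomic_fetch_add(&pending, 512);	/* account one block */
		block_io_done(512);			/* completion may fire immediately */
	}

	/*
	 * All blocks submitted: drop the bias.  Whoever brings the counter to
	 * zero clears writeback; here that is us, because every block already
	 * completed.
	 */
	block_io_done(1);
	return 0;
}

In the patch itself, the bias is taken in iomap_writepage_map() before the per-block loop and dropped via atomic_dec_and_test() after folio_unlock(), as shown in the hunks below.
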
@@ -1610,30 +1610,34 @@ static void iomap_writepage_end_bio(struct bio *bio)
  * Submit the final bio for an ioend.
  *
  * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them.  In this situation, we need to fail the bio instead of
- * submitting it.  This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel the ioend directly in that case, so call the bio end I/O
+ * handler with the error status here to run the normal I/O completion handler
+ * to clear the writeback bit and let the file system process the errors.
  */
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
-		int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 {
+	if (!wpc->ioend)
+		return error;
+
+	/*
+	 * Let the file systems prepare the I/O submission and hook in an I/O
+	 * completion handler.  This also needs to happen after a failure so
+	 * that the file system end I/O handler gets called to clean up.
+	 */
 	if (wpc->ops->prepare_ioend)
-		error = wpc->ops->prepare_ioend(ioend, error);
+		error = wpc->ops->prepare_ioend(wpc->ioend, error);
 	if (error) {
-		/*
-		 * If we're failing the IO now, just mark the ioend with an
-		 * error and finish it.  This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		ioend->io_bio.bi_status = errno_to_blk_status(error);
-		bio_endio(&ioend->io_bio);
-		return error;
+		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+		bio_endio(&wpc->ioend->io_bio);
+	} else {
+		submit_bio(&wpc->ioend->io_bio);
 	}
 
-	submit_bio(&ioend->io_bio);
-	return 0;
+	wpc->ioend = NULL;
+	return error;
 }
 
 static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
@@ -1687,19 +1691,28 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 /*
  * Test to see if we have an existing ioend structure that we could append to
  * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly.  Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
  */
-static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, loff_t pos, struct list_head *iolist)
+		struct inode *inode, loff_t pos)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	unsigned len = i_blocksize(inode);
 	size_t poff = offset_in_folio(folio, pos);
+	int error;
 
 	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
 new_ioend:
-		if (wpc->ioend)
-			list_add(&wpc->ioend->io_list, iolist);
+		error = iomap_submit_ioend(wpc, 0);
+		if (error)
+			return error;
 		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
 	}
@@ -1710,12 +1723,12 @@ static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 	atomic_add(len, &ifs->write_bytes_pending);
 	wpc->ioend->io_size += len;
 	wbc_account_cgroup_owner(wbc, &folio->page, len);
+	return 0;
 }
 
 static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio,
-		struct inode *inode, u64 pos, unsigned *count,
-		struct list_head *submit_list)
+		struct inode *inode, u64 pos, unsigned *count)
 {
 	int error;
 
@@ -1732,8 +1745,9 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 	case IOMAP_HOLE:
 		break;
 	default:
-		iomap_add_to_ioend(wpc, wbc, folio, inode, pos, submit_list);
-		(*count)++;
+		error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos);
+		if (!error)
+			(*count)++;
 	}
 
 fail:
@@ -1809,35 +1823,21 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
 	return true;
 }
 
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
 static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct folio *folio)
 {
 	struct iomap_folio_state *ifs = folio->private;
 	struct inode *inode = folio->mapping->host;
-	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
 	u64 pos = folio_pos(folio);
 	u64 end_pos = pos + folio_size(folio);
 	unsigned count = 0;
 	int error = 0, i;
-	LIST_HEAD(submit_list);
+
+	WARN_ON_ONCE(!folio_test_locked(folio));
+	WARN_ON_ONCE(folio_test_dirty(folio));
+	WARN_ON_ONCE(folio_test_writeback(folio));
 
 	trace_iomap_writepage(inode, pos, folio_size(folio));
 
@@ -1847,12 +1847,27 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	}
 	WARN_ON_ONCE(end_pos <= pos);
 
-	if (!ifs && nblocks > 1) {
-		ifs = ifs_alloc(inode, folio, 0);
-		iomap_set_range_dirty(folio, 0, end_pos - pos);
+	if (nblocks > 1) {
+		if (!ifs) {
+			ifs = ifs_alloc(inode, folio, 0);
+			iomap_set_range_dirty(folio, 0, end_pos - pos);
+		}
+
+		/*
+		 * Keep the I/O completion handler from clearing the writeback
+		 * bit until we have submitted all blocks by adding a bias to
+		 * ifs->write_bytes_pending, which is dropped after submitting
+		 * all blocks.
+		 */
+		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+		atomic_inc(&ifs->write_bytes_pending);
 	}
 
-	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
+	/*
+	 * Set the writeback bit ASAP, as the I/O completion for the single
+	 * block per folio case happens as soon as we submit the bio.
+	 */
+	folio_start_writeback(folio);
 
 	/*
 	 * Walk through the folio to find areas to write back. If we
@@ -1863,18 +1878,13 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
 			continue;
 		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos,
-				&count, &submit_list);
+				&count);
 		if (error)
 			break;
 	}
 	if (count)
 		wpc->nr_folios++;
-
-	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
-	WARN_ON_ONCE(!folio_test_locked(folio));
-	WARN_ON_ONCE(folio_test_writeback(folio));
-	WARN_ON_ONCE(folio_test_dirty(folio));
 
 	/*
 	 * We can have dirty bits set past end of file in page_mkwrite path
 	 * while mapping the last partial folio. Hence it's better to clear
@@ -1883,38 +1893,20 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	iomap_clear_range_dirty(folio, 0, folio_size(folio));
 
 	/*
-	 * If the page hasn't been added to the ioend, it won't be affected by
-	 * I/O completion and we must unlock it now.
+	 * Usually the writeback bit is cleared by the I/O completion handler.
+	 * But we may end up either not actually writing any blocks, or (when
+	 * there are multiple blocks in a folio) all I/O might have finished
+	 * already at this point.  In that case we need to clear the writeback
+	 * bit ourselves right after unlocking the page.
 	 */
-	if (error && !count) {
-		folio_unlock(folio);
-		goto done;
-	}
-
-	folio_start_writeback(folio);
 	folio_unlock(folio);
-
-	/*
-	 * Preserve the original error if there was one; catch
-	 * submission errors here and propagate into subsequent ioend
-	 * submissions.
-	 */
-	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-		int error2;
-
-		list_del_init(&ioend->io_list);
-		error2 = iomap_submit_ioend(wpc, ioend, error);
-		if (error2 && !error)
-			error = error2;
+	if (ifs) {
+		if (atomic_dec_and_test(&ifs->write_bytes_pending))
+			folio_end_writeback(folio);
+	} else {
+		if (!count)
+			folio_end_writeback(folio);
 	}
-
-	/*
-	 * We can end up here with no error and nothing to write only if we race
-	 * with a partial page truncate on a sub-page block sized filesystem.
-	 */
-	if (!count)
-		folio_end_writeback(folio);
-done:
 	mapping_set_error(inode->i_mapping, error);
 	return error;
 }
@@ -1942,9 +1934,7 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 	wpc->ops = ops;
 	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
-	if (!wpc->ioend)
-		return ret;
-	return iomap_submit_ioend(wpc, wpc->ioend, ret);
+	return iomap_submit_ioend(wpc, ret);
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);