Commit a67371b7 authored by Darrick J. Wong

Merge tag 'iomap-per-block-dirty-tracking' of...

Merge tag 'iomap-per-block-dirty-tracking' of https://github.com/riteshharjani/linux into iomap-6.6-merge

iomap: Add per-block dirty state tracking to iomap

iomap today only tracks a per-block uptodate state bitmap; this series
extends that support by adding a per-block dirty state bitmap to the iomap
buffered I/O path. This helps reduce write amplification and improves
write performance for large folio writes and for platforms whose page size
is larger than the filesystem block size.

We have seen ~83% performance improvement with these patches using
database benchmarking tests, with XFS on 64k pagesize.
fio benchmark (as shown in the last patch which adds dirty tracking
support) showed close to 16x performance improvement when tested with
64K pagesize on 4k blocksize XFS using nvme on Power.

* tag 'iomap-per-block-dirty-tracking' of https://github.com/riteshharjani/linux:
  iomap: Add per-block dirty state tracking to improve performance
  iomap: Allocate ifs in ->write_begin() early
  iomap: Refactor iomap_write_delalloc_punch() function out
  iomap: Use iomap_punch_t typedef
  iomap: Fix possible overflow condition in iomap_write_delalloc_scan
  iomap: Add some uptodate state handling helpers for ifs state bitmap
  iomap: Drop ifs argument from iomap_set_range_uptodate()
  iomap: Rename iomap_page to iomap_folio_state and others

[djwong: also yay to less write amplification!]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
parents d42bd17c 4ce02c67
...@@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = { ...@@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages, .writepages = gfs2_writepages,
.read_folio = gfs2_read_folio, .read_folio = gfs2_read_folio,
.readahead = gfs2_readahead, .readahead = gfs2_readahead,
.dirty_folio = filemap_dirty_folio, .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio, .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio, .invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap, .bmap = gfs2_bmap,
......
...@@ -23,65 +23,169 @@ ...@@ -23,65 +23,169 @@
#define IOEND_BATCH_SIZE 4096 #define IOEND_BATCH_SIZE 4096
typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/* /*
* Structure allocated for each folio when block size < folio size * Structure allocated for each folio to track per-block uptodate, dirty state
* to track sub-folio uptodate status and I/O completions. * and I/O completions.
*/ */
struct iomap_page { struct iomap_folio_state {
atomic_t read_bytes_pending; atomic_t read_bytes_pending;
atomic_t write_bytes_pending; atomic_t write_bytes_pending;
spinlock_t uptodate_lock; spinlock_t state_lock;
unsigned long uptodate[];
/*
* Each block has two bits in this bitmap:
* Bits [0..blocks_per_folio) has the uptodate status.
* Bits [b_p_f...(2*b_p_f)) has the dirty status.
*/
unsigned long state[];
}; };
static inline struct iomap_page *to_iomap_page(struct folio *folio) static struct bio_set iomap_ioend_bioset;
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{ {
if (folio_test_private(folio)) struct inode *inode = folio->mapping->host;
return folio_get_private(folio);
return NULL; return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
} }
static struct bio_set iomap_ioend_bioset; static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
unsigned int block)
{
return test_bit(block, ifs->state);
}
/*
 * Mark every block touched by the byte range [off, off + len) of @folio as
 * uptodate in @ifs.  The uptodate bits occupy the first blocks-per-folio bits
 * of ifs->state.  If that leaves all blocks uptodate, the folio itself is
 * marked uptodate as well.  The bitmap update and the "fully uptodate" check
 * are done together under ifs->state_lock with interrupts saved/restored.
 */
static void ifs_set_range_uptodate(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	const unsigned int start = off >> inode->i_blkbits;
	const unsigned int end = (off + len - 1) >> inode->i_blkbits;
	unsigned long irq_flags;

	spin_lock_irqsave(&ifs->state_lock, irq_flags);
	/* 'end' is inclusive, hence the + 1 when computing the bit count */
	bitmap_set(ifs->state, start, end - start + 1);
	if (ifs_is_fully_uptodate(folio, ifs))
		folio_mark_uptodate(folio);
	spin_unlock_irqrestore(&ifs->state_lock, irq_flags);
}
/*
 * Record that the byte range [off, off + len) of @folio is uptodate.
 *
 * When the folio has no per-block state attached (folio->private is NULL)
 * the whole folio is simply marked uptodate; otherwise the range is recorded
 * in the per-block state via ifs_set_range_uptodate().
 */
static void iomap_set_range_uptodate(struct folio *folio, size_t off,
		size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (!ifs) {
		folio_mark_uptodate(folio);
		return;
	}

	ifs_set_range_uptodate(folio, ifs, off, len);
}
static inline bool ifs_block_is_dirty(struct folio *folio,
struct iomap_folio_state *ifs, int block)
{
struct inode *inode = folio->mapping->host;
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
static struct iomap_page * return test_bit(block + blks_per_folio, ifs->state);
iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) }
static void ifs_clear_range_dirty(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{ {
struct iomap_page *iop = to_iomap_page(folio); struct inode *inode = folio->mapping->host;
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
unsigned int first_blk = (off >> inode->i_blkbits);
unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
unsigned int nr_blks = last_blk - first_blk + 1;
unsigned long flags;
spin_lock_irqsave(&ifs->state_lock, flags);
bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
spin_unlock_irqrestore(&ifs->state_lock, flags);
}
/*
 * Clear the per-block dirty state for the byte range [off, off + len) of
 * @folio.  A folio without per-block state attached (folio->private is NULL)
 * is left untouched.
 */
static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (!ifs)
		return;

	ifs_clear_range_dirty(folio, ifs, off, len);
}
/*
 * Mark every block touched by the byte range [off, off + len) of @folio as
 * dirty in @ifs.  The dirty bits live in the second half of ifs->state, i.e.
 * they start at bit index blocks-per-folio, so each block number is offset
 * by that amount.  The bitmap is updated under ifs->state_lock with
 * interrupts saved/restored.
 */
static void ifs_set_range_dirty(struct folio *folio,
		struct iomap_folio_state *ifs, size_t off, size_t len)
{
	struct inode *inode = folio->mapping->host;
	const unsigned int dirty_base = i_blocks_per_folio(inode, folio);
	const unsigned int start = (off >> inode->i_blkbits);
	const unsigned int end = (off + len - 1) >> inode->i_blkbits;
	unsigned long irq_flags;

	spin_lock_irqsave(&ifs->state_lock, irq_flags);
	/* 'end' is inclusive, hence the + 1 when computing the bit count */
	bitmap_set(ifs->state, start + dirty_base, end - start + 1);
	spin_unlock_irqrestore(&ifs->state_lock, irq_flags);
}
/*
 * Set the per-block dirty state for the byte range [off, off + len) of
 * @folio.  A folio without per-block state attached (folio->private is NULL)
 * is left untouched.
 */
static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
{
	struct iomap_folio_state *ifs = folio->private;

	if (!ifs)
		return;

	ifs_set_range_dirty(folio, ifs, off, len);
}
static struct iomap_folio_state *ifs_alloc(struct inode *inode,
struct folio *folio, unsigned int flags)
{
struct iomap_folio_state *ifs = folio->private;
unsigned int nr_blocks = i_blocks_per_folio(inode, folio); unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
gfp_t gfp; gfp_t gfp;
if (iop || nr_blocks <= 1) if (ifs || nr_blocks <= 1)
return iop; return ifs;
if (flags & IOMAP_NOWAIT) if (flags & IOMAP_NOWAIT)
gfp = GFP_NOWAIT; gfp = GFP_NOWAIT;
else else
gfp = GFP_NOFS | __GFP_NOFAIL; gfp = GFP_NOFS | __GFP_NOFAIL;
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), /*
gfp); * ifs->state tracks two sets of state flags when the
if (iop) { * filesystem block size is smaller than the folio size.
spin_lock_init(&iop->uptodate_lock); * The first state tracks per-block uptodate and the
* second tracks per-block dirty state.
*/
ifs = kzalloc(struct_size(ifs, state,
BITS_TO_LONGS(2 * nr_blocks)), gfp);
if (!ifs)
return ifs;
spin_lock_init(&ifs->state_lock);
if (folio_test_uptodate(folio)) if (folio_test_uptodate(folio))
bitmap_fill(iop->uptodate, nr_blocks); bitmap_set(ifs->state, 0, nr_blocks);
folio_attach_private(folio, iop); if (folio_test_dirty(folio))
} bitmap_set(ifs->state, nr_blocks, nr_blocks);
return iop; folio_attach_private(folio, ifs);
return ifs;
} }
static void iomap_page_release(struct folio *folio) static void ifs_free(struct folio *folio)
{ {
struct iomap_page *iop = folio_detach_private(folio); struct iomap_folio_state *ifs = folio_detach_private(folio);
struct inode *inode = folio->mapping->host;
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
if (!iop) if (!ifs)
return; return;
WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending));
WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
folio_test_uptodate(folio)); folio_test_uptodate(folio));
kfree(iop); kfree(ifs);
} }
/* /*
...@@ -90,7 +194,7 @@ static void iomap_page_release(struct folio *folio) ...@@ -90,7 +194,7 @@ static void iomap_page_release(struct folio *folio)
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
loff_t *pos, loff_t length, size_t *offp, size_t *lenp) loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{ {
struct iomap_page *iop = to_iomap_page(folio); struct iomap_folio_state *ifs = folio->private;
loff_t orig_pos = *pos; loff_t orig_pos = *pos;
loff_t isize = i_size_read(inode); loff_t isize = i_size_read(inode);
unsigned block_bits = inode->i_blkbits; unsigned block_bits = inode->i_blkbits;
...@@ -105,12 +209,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, ...@@ -105,12 +209,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* per-block uptodate status and adjust the offset and length if needed * per-block uptodate status and adjust the offset and length if needed
* to avoid reading in already uptodate ranges. * to avoid reading in already uptodate ranges.
*/ */
if (iop) { if (ifs) {
unsigned int i; unsigned int i;
/* move forward for each leading block marked uptodate */ /* move forward for each leading block marked uptodate */
for (i = first; i <= last; i++) { for (i = first; i <= last; i++) {
if (!test_bit(i, iop->uptodate)) if (!ifs_block_is_uptodate(ifs, i))
break; break;
*pos += block_size; *pos += block_size;
poff += block_size; poff += block_size;
...@@ -120,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, ...@@ -120,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
/* truncate len if we find any trailing uptodate block(s) */ /* truncate len if we find any trailing uptodate block(s) */
for ( ; i <= last; i++) { for ( ; i <= last; i++) {
if (test_bit(i, iop->uptodate)) { if (ifs_block_is_uptodate(ifs, i)) {
plen -= (last - i + 1) * block_size; plen -= (last - i + 1) * block_size;
last = i - 1; last = i - 1;
break; break;
...@@ -144,43 +248,19 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, ...@@ -144,43 +248,19 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen; *lenp = plen;
} }
static void iomap_iop_set_range_uptodate(struct folio *folio,
struct iomap_page *iop, size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
unsigned first = off >> inode->i_blkbits;
unsigned last = (off + len - 1) >> inode->i_blkbits;
unsigned long flags;
spin_lock_irqsave(&iop->uptodate_lock, flags);
bitmap_set(iop->uptodate, first, last - first + 1);
if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
folio_mark_uptodate(folio);
spin_unlock_irqrestore(&iop->uptodate_lock, flags);
}
static void iomap_set_range_uptodate(struct folio *folio,
struct iomap_page *iop, size_t off, size_t len)
{
if (iop)
iomap_iop_set_range_uptodate(folio, iop, off, len);
else
folio_mark_uptodate(folio);
}
static void iomap_finish_folio_read(struct folio *folio, size_t offset, static void iomap_finish_folio_read(struct folio *folio, size_t offset,
size_t len, int error) size_t len, int error)
{ {
struct iomap_page *iop = to_iomap_page(folio); struct iomap_folio_state *ifs = folio->private;
if (unlikely(error)) { if (unlikely(error)) {
folio_clear_uptodate(folio); folio_clear_uptodate(folio);
folio_set_error(folio); folio_set_error(folio);
} else { } else {
iomap_set_range_uptodate(folio, iop, offset, len); iomap_set_range_uptodate(folio, offset, len);
} }
if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending))
folio_unlock(folio); folio_unlock(folio);
} }
...@@ -213,7 +293,6 @@ struct iomap_readpage_ctx { ...@@ -213,7 +293,6 @@ struct iomap_readpage_ctx {
static int iomap_read_inline_data(const struct iomap_iter *iter, static int iomap_read_inline_data(const struct iomap_iter *iter,
struct folio *folio) struct folio *folio)
{ {
struct iomap_page *iop;
const struct iomap *iomap = iomap_iter_srcmap(iter); const struct iomap *iomap = iomap_iter_srcmap(iter);
size_t size = i_size_read(iter->inode) - iomap->offset; size_t size = i_size_read(iter->inode) - iomap->offset;
size_t poff = offset_in_page(iomap->offset); size_t poff = offset_in_page(iomap->offset);
...@@ -231,15 +310,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, ...@@ -231,15 +310,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length)) if (WARN_ON_ONCE(size > iomap->length))
return -EIO; return -EIO;
if (offset > 0) if (offset > 0)
iop = iomap_page_create(iter->inode, folio, iter->flags); ifs_alloc(iter->inode, folio, iter->flags);
else
iop = to_iomap_page(folio);
addr = kmap_local_folio(folio, offset); addr = kmap_local_folio(folio, offset);
memcpy(addr, iomap->inline_data, size); memcpy(addr, iomap->inline_data, size);
memset(addr + size, 0, PAGE_SIZE - poff - size); memset(addr + size, 0, PAGE_SIZE - poff - size);
kunmap_local(addr); kunmap_local(addr);
iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff);
return 0; return 0;
} }
...@@ -260,7 +337,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ...@@ -260,7 +337,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
loff_t pos = iter->pos + offset; loff_t pos = iter->pos + offset;
loff_t length = iomap_length(iter) - offset; loff_t length = iomap_length(iter) - offset;
struct folio *folio = ctx->cur_folio; struct folio *folio = ctx->cur_folio;
struct iomap_page *iop; struct iomap_folio_state *ifs;
loff_t orig_pos = pos; loff_t orig_pos = pos;
size_t poff, plen; size_t poff, plen;
sector_t sector; sector_t sector;
...@@ -269,20 +346,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ...@@ -269,20 +346,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio); return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */ /* zero post-eof blocks as the page may be mapped */
iop = iomap_page_create(iter->inode, folio, iter->flags); ifs = ifs_alloc(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0) if (plen == 0)
goto done; goto done;
if (iomap_block_needs_zeroing(iter, pos)) { if (iomap_block_needs_zeroing(iter, pos)) {
folio_zero_range(folio, poff, plen); folio_zero_range(folio, poff, plen);
iomap_set_range_uptodate(folio, iop, poff, plen); iomap_set_range_uptodate(folio, poff, plen);
goto done; goto done;
} }
ctx->cur_folio_in_bio = true; ctx->cur_folio_in_bio = true;
if (iop) if (ifs)
atomic_add(plen, &iop->read_bytes_pending); atomic_add(plen, &ifs->read_bytes_pending);
sector = iomap_sector(iomap, pos); sector = iomap_sector(iomap, pos);
if (!ctx->bio || if (!ctx->bio ||
...@@ -436,11 +513,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead); ...@@ -436,11 +513,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead);
*/ */
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{ {
struct iomap_page *iop = to_iomap_page(folio); struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host; struct inode *inode = folio->mapping->host;
unsigned first, last, i; unsigned first, last, i;
if (!iop) if (!ifs)
return false; return false;
/* Caller's range may extend past the end of this folio */ /* Caller's range may extend past the end of this folio */
...@@ -451,7 +528,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) ...@@ -451,7 +528,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
last = (from + count - 1) >> inode->i_blkbits; last = (from + count - 1) >> inode->i_blkbits;
for (i = first; i <= last; i++) for (i = first; i <= last; i++)
if (!test_bit(i, iop->uptodate)) if (!ifs_block_is_uptodate(ifs, i))
return false; return false;
return true; return true;
} }
...@@ -491,7 +568,7 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) ...@@ -491,7 +568,7 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
*/ */
if (folio_test_dirty(folio)) if (folio_test_dirty(folio))
return false; return false;
iomap_page_release(folio); ifs_free(folio);
return true; return true;
} }
EXPORT_SYMBOL_GPL(iomap_release_folio); EXPORT_SYMBOL_GPL(iomap_release_folio);
...@@ -508,11 +585,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) ...@@ -508,11 +585,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
if (offset == 0 && len == folio_size(folio)) { if (offset == 0 && len == folio_size(folio)) {
WARN_ON_ONCE(folio_test_writeback(folio)); WARN_ON_ONCE(folio_test_writeback(folio));
folio_cancel_dirty(folio); folio_cancel_dirty(folio);
iomap_page_release(folio); ifs_free(folio);
} }
} }
EXPORT_SYMBOL_GPL(iomap_invalidate_folio); EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
/*
 * ->dirty_folio implementation for iomap-based address spaces.
 *
 * Calls ifs_alloc() (with no flags) for the folio, sets the per-block dirty
 * state for the whole folio, and then hands over to filemap_dirty_folio(),
 * whose result is returned unchanged.
 */
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
	size_t folio_bytes = folio_size(folio);

	ifs_alloc(mapping->host, folio, 0);
	iomap_set_range_dirty(folio, 0, folio_bytes);

	return filemap_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL_GPL(iomap_dirty_folio);
static void static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{ {
...@@ -543,7 +631,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -543,7 +631,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio) size_t len, struct folio *folio)
{ {
const struct iomap *srcmap = iomap_iter_srcmap(iter); const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_page *iop; struct iomap_folio_state *ifs;
loff_t block_size = i_blocksize(iter->inode); loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size); loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size); loff_t block_end = round_up(pos + len, block_size);
...@@ -551,14 +639,23 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -551,14 +639,23 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t from = offset_in_folio(folio, pos), to = from + len; size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen; size_t poff, plen;
if (folio_test_uptodate(folio)) /*
* If the write completely overlaps the current folio, then
* entire folio will be dirtied so there is no need for
* per-block state tracking structures to be attached to this folio.
*/
if (pos <= folio_pos(folio) &&
pos + len >= folio_pos(folio) + folio_size(folio))
return 0; return 0;
folio_clear_error(folio);
iop = iomap_page_create(iter->inode, folio, iter->flags); ifs = ifs_alloc(iter->inode, folio, iter->flags);
if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
return -EAGAIN; return -EAGAIN;
if (folio_test_uptodate(folio))
return 0;
folio_clear_error(folio);
do { do {
iomap_adjust_read_range(iter->inode, folio, &block_start, iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen); block_end - block_start, &poff, &plen);
...@@ -585,7 +682,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -585,7 +682,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (status) if (status)
return status; return status;
} }
iomap_set_range_uptodate(folio, iop, poff, plen); iomap_set_range_uptodate(folio, poff, plen);
} while ((block_start += plen) < block_end); } while ((block_start += plen) < block_end);
return 0; return 0;
...@@ -692,7 +789,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, ...@@ -692,7 +789,6 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
size_t copied, struct folio *folio) size_t copied, struct folio *folio)
{ {
struct iomap_page *iop = to_iomap_page(folio);
flush_dcache_folio(folio); flush_dcache_folio(folio);
/* /*
...@@ -708,7 +804,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, ...@@ -708,7 +804,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
*/ */
if (unlikely(copied < len && !folio_test_uptodate(folio))) if (unlikely(copied < len && !folio_test_uptodate(folio)))
return 0; return 0;
iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
filemap_dirty_folio(inode->i_mapping, folio); filemap_dirty_folio(inode->i_mapping, folio);
return copied; return copied;
} }
...@@ -878,6 +975,76 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, ...@@ -878,6 +975,76 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
} }
EXPORT_SYMBOL_GPL(iomap_file_buffered_write); EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
/*
 * Punch out the non-dirty blocks of @folio that fall inside the byte range
 * [start_byte, end_byte).
 *
 * With per-block dirty tracking a folio can contain blocks that are uptodate
 * but not dirty; those blocks must be punched so that no delalloc blocks are
 * leaked.  A folio without per-block state is skipped.
 *
 * @punch is invoked once per non-dirty block, with that block's file offset
 * and a length of one block.  Returns 0 on success, or the first non-zero
 * value returned by @punch (remaining blocks are then not visited).
 */
static int iomap_write_delalloc_ifs_punch(struct inode *inode,
		struct folio *folio, loff_t start_byte, loff_t end_byte,
		iomap_punch_t punch)
{
	struct iomap_folio_state *ifs = folio->private;
	const u8 blkbits = inode->i_blkbits;
	unsigned int blk, blk_first, blk_last;
	loff_t range_last;

	if (!ifs)
		return 0;

	/* Last byte to consider: clamp the range to the end of this folio. */
	range_last = min_t(loff_t, end_byte - 1,
			folio_pos(folio) + folio_size(folio) - 1);
	blk_first = offset_in_folio(folio, start_byte) >> blkbits;
	blk_last = offset_in_folio(folio, range_last) >> blkbits;

	for (blk = blk_first; blk <= blk_last; blk++) {
		int err;

		if (ifs_block_is_dirty(folio, ifs, blk))
			continue;

		err = punch(inode, folio_pos(folio) + (blk << blkbits),
				1 << blkbits);
		if (err)
			return err;
	}

	return 0;
}
/*
 * Handle one folio found while scanning a delalloc range.
 *
 * Nothing is done for a folio that is not dirty.  For a dirty folio:
 *   1. punch the gap [*punch_start_byte, start_byte), if there is one;
 *   2. punch the non-dirty blocks inside the folio;
 *   3. advance *punch_start_byte to the end of this data range, bounded by
 *      the end of the folio.
 *
 * Returns 0 on success or the first non-zero value returned by a punch step;
 * on error *punch_start_byte is left unmodified.
 */
static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
		iomap_punch_t punch)
{
	int err;

	if (!folio_test_dirty(folio))
		return 0;

	/* The folio is dirty: punch everything before it first. */
	if (start_byte > *punch_start_byte) {
		err = punch(inode, *punch_start_byte,
				start_byte - *punch_start_byte);
		if (err)
			return err;
	}

	/* Then the clean blocks within the folio itself. */
	err = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
			end_byte, punch);
	if (err)
		return err;

	/*
	 * The next punch must start at the end of this data range, which
	 * may come before the end of the folio.
	 */
	*punch_start_byte = min_t(loff_t, end_byte,
			folio_pos(folio) + folio_size(folio));

	return 0;
}
/* /*
* Scan the data range passed to us for dirty page cache folios. If we find a * Scan the data range passed to us for dirty page cache folios. If we find a
* dirty folio, punch out the preceding range and update the offset from which * dirty folio, punch out the preceding range and update the offset from which
...@@ -897,10 +1064,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); ...@@ -897,10 +1064,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
*/ */
static int iomap_write_delalloc_scan(struct inode *inode, static int iomap_write_delalloc_scan(struct inode *inode,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
int (*punch)(struct inode *inode, loff_t offset, loff_t length)) iomap_punch_t punch)
{ {
while (start_byte < end_byte) { while (start_byte < end_byte) {
struct folio *folio; struct folio *folio;
int ret;
/* grab locked page */ /* grab locked page */
folio = filemap_lock_folio(inode->i_mapping, folio = filemap_lock_folio(inode->i_mapping,
...@@ -911,26 +1079,12 @@ static int iomap_write_delalloc_scan(struct inode *inode, ...@@ -911,26 +1079,12 @@ static int iomap_write_delalloc_scan(struct inode *inode,
continue; continue;
} }
/* if dirty, punch up to offset */ ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
if (folio_test_dirty(folio)) { start_byte, end_byte, punch);
if (start_byte > *punch_start_byte) { if (ret) {
int error;
error = punch(inode, *punch_start_byte,
start_byte - *punch_start_byte);
if (error) {
folio_unlock(folio); folio_unlock(folio);
folio_put(folio); folio_put(folio);
return error; return ret;
}
}
/*
* Make sure the next punch start is correctly bound to
* the end of this data range, not the end of the folio.
*/
*punch_start_byte = min_t(loff_t, end_byte,
folio_next_index(folio) << PAGE_SHIFT);
} }
/* move offset to start of next folio in range */ /* move offset to start of next folio in range */
...@@ -975,8 +1129,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, ...@@ -975,8 +1129,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
* the code to subtle off-by-one bugs.... * the code to subtle off-by-one bugs....
*/ */
static int iomap_write_delalloc_release(struct inode *inode, static int iomap_write_delalloc_release(struct inode *inode,
loff_t start_byte, loff_t end_byte, loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
int (*punch)(struct inode *inode, loff_t pos, loff_t length))
{ {
loff_t punch_start_byte = start_byte; loff_t punch_start_byte = start_byte;
loff_t scan_end_byte = min(i_size_read(inode), end_byte); loff_t scan_end_byte = min(i_size_read(inode), end_byte);
...@@ -1069,8 +1222,7 @@ static int iomap_write_delalloc_release(struct inode *inode, ...@@ -1069,8 +1222,7 @@ static int iomap_write_delalloc_release(struct inode *inode,
*/ */
int iomap_file_buffered_write_punch_delalloc(struct inode *inode, int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
struct iomap *iomap, loff_t pos, loff_t length, struct iomap *iomap, loff_t pos, loff_t length,
ssize_t written, ssize_t written, iomap_punch_t punch)
int (*punch)(struct inode *inode, loff_t pos, loff_t length))
{ {
loff_t start_byte; loff_t start_byte;
loff_t end_byte; loff_t end_byte;
...@@ -1291,17 +1443,17 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite); ...@@ -1291,17 +1443,17 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len, int error) size_t len, int error)
{ {
struct iomap_page *iop = to_iomap_page(folio); struct iomap_folio_state *ifs = folio->private;
if (error) { if (error) {
folio_set_error(folio); folio_set_error(folio);
mapping_set_error(inode->i_mapping, error); mapping_set_error(inode->i_mapping, error);
} }
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
folio_end_writeback(folio); folio_end_writeback(folio);
} }
...@@ -1568,7 +1720,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, ...@@ -1568,7 +1720,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
*/ */
static void static void
iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
struct iomap_page *iop, struct iomap_writepage_ctx *wpc, struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct list_head *iolist) struct writeback_control *wbc, struct list_head *iolist)
{ {
sector_t sector = iomap_sector(&wpc->iomap, pos); sector_t sector = iomap_sector(&wpc->iomap, pos);
...@@ -1586,8 +1738,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, ...@@ -1586,8 +1738,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
} }
if (iop) if (ifs)
atomic_add(len, &iop->write_bytes_pending); atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len; wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len); wbc_account_cgroup_owner(wbc, &folio->page, len);
} }
...@@ -1613,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, ...@@ -1613,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode, struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos) struct folio *folio, u64 end_pos)
{ {
struct iomap_page *iop = iomap_page_create(inode, folio, 0); struct iomap_folio_state *ifs = folio->private;
struct iomap_ioend *ioend, *next; struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode); unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio); unsigned nblocks = i_blocks_per_folio(inode, folio);
...@@ -1621,7 +1773,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, ...@@ -1621,7 +1773,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
int error = 0, count = 0, i; int error = 0, count = 0, i;
LIST_HEAD(submit_list); LIST_HEAD(submit_list);
WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); WARN_ON_ONCE(end_pos <= pos);
if (!ifs && nblocks > 1) {
ifs = ifs_alloc(inode, folio, 0);
iomap_set_range_dirty(folio, 0, end_pos - pos);
}
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
/* /*
* Walk through the folio to find areas to write back. If we * Walk through the folio to find areas to write back. If we
...@@ -1629,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, ...@@ -1629,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* invalid, grab a new one. * invalid, grab a new one.
*/ */
for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
if (iop && !test_bit(i, iop->uptodate)) if (ifs && !ifs_block_is_dirty(folio, ifs, i))
continue; continue;
error = wpc->ops->map_blocks(wpc, inode, pos); error = wpc->ops->map_blocks(wpc, inode, pos);
...@@ -1640,7 +1799,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, ...@@ -1640,7 +1799,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
continue; continue;
if (wpc->iomap.type == IOMAP_HOLE) if (wpc->iomap.type == IOMAP_HOLE)
continue; continue;
iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
&submit_list); &submit_list);
count++; count++;
} }
...@@ -1673,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, ...@@ -1673,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
} }
} }
/*
* We can have dirty bits set past end of file in page_mkwrite path
* while mapping the last partial folio. Hence it's better to clear
* all the dirty bits in the folio here.
*/
iomap_clear_range_dirty(folio, 0, folio_size(folio));
folio_start_writeback(folio); folio_start_writeback(folio);
folio_unlock(folio); folio_unlock(folio);
......
...@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = { ...@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
.read_folio = xfs_vm_read_folio, .read_folio = xfs_vm_read_folio,
.readahead = xfs_vm_readahead, .readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages, .writepages = xfs_vm_writepages,
.dirty_folio = filemap_dirty_folio, .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio, .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio, .invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap, .bmap = xfs_vm_bmap,
......
...@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = { ...@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
.read_folio = zonefs_read_folio, .read_folio = zonefs_read_folio,
.readahead = zonefs_readahead, .readahead = zonefs_readahead,
.writepages = zonefs_writepages, .writepages = zonefs_writepages,
.dirty_folio = filemap_dirty_folio, .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio, .release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio, .invalidate_folio = iomap_invalidate_folio,
.migrate_folio = filemap_migrate_folio, .migrate_folio = filemap_migrate_folio,
......
...@@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); ...@@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops); const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment