Commit 6432f212 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "The big new feature added this time is supporting online resizing
  using the meta_bg feature.  This allows us to resize file systems
  which are greater than 16TB.  In addition, the speed of online
  resizing has been improved in general.

  We also fix a number of races, some of which could lead to deadlocks,
  in ext4's Asynchronous I/O and online defrag support, thanks to good
  work by Dmitry Monakhov.

  There are also a large number of more minor bug fixes and cleanups
  from a number of other ext4 contributors, quite of few of which have
  submitted fixes for the first time."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (69 commits)
  ext4: fix ext4_flush_completed_IO wait semantics
  ext4: fix mtime update in nodelalloc mode
  ext4: fix ext_remove_space for punch_hole case
  ext4: punch_hole should wait for DIO writers
  ext4: serialize truncate with owerwrite DIO workers
  ext4: endless truncate due to nonlocked dio readers
  ext4: serialize unlocked dio reads with truncate
  ext4: serialize dio nonlocked reads with defrag workers
  ext4: completed_io locking cleanup
  ext4: fix unwritten counter leakage
  ext4: give i_aiodio_unwritten a more appropriate name
  ext4: ext4_inode_info diet
  ext4: convert to use leXX_add_cpu()
  ext4: ext4_bread usage audit
  fs: reserve fallocate flag codepoint
  ext4: remove redundant offset check in mext_check_arguments()
  ext4: don't clear orphan list on ro mount with errors
  jbd2: fix assertion failure in commit code due to lacking transaction credits
  ext4: release donor reference when EXT4_IOC_MOVE_EXT ioctl fails
  ext4: enable FITRIM ioctl on bigalloc file system
  ...
parents 1b033447 c278531d
......@@ -96,3 +96,16 @@ Contact: "Theodore Ts'o" <tytso@mit.edu>
Description:
The maximum number of megabytes the writeback code will
try to write out before move on to another inode.
What: /sys/fs/ext4/<disk>/extent_max_zeroout_kb
Date: August 2012
Contact: "Theodore Ts'o" <tytso@mit.edu>
Description:
The maximum number of kilobytes which will be zeroed
out in preference to creating a new uninitialized
extent when manipulating an inode's extent tree. Note
that using a larger value will increase the
variability of time necessary to complete a random
write operation (since a 4k random write might turn
into a much larger write due to the zeroout
operation).
......@@ -375,6 +375,16 @@ dioread_nolock locking. If the dioread_nolock option is specified
Because of the restrictions this options comprises
it is off by default (e.g. dioread_lock).
max_dir_size_kb=n This limits the size of directories so that any
attempt to expand them beyond the specified
limit in kilobytes will cause an ENOSPC error.
This is useful in memory constrained
environments, where a very large directory can
cause severe performance problems or even
provoke the Out Of Memory killer. (For example,
if there is only 512mb memory available, a 176mb
directory may seriously cramp the system's style.)
i_version Enable 64-bit inode version support. This option is
off by default.
......
......@@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
loff_t size;
int ret;
/*
* Update file times before taking page lock. We may end up failing the
* fault so this update may be superfluous but who really cares...
*/
file_update_time(vma->vm_file);
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
......@@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
sb_start_pagefault(sb);
/*
* Update file times before taking page lock. We may end up failing the
* fault so this update may be superfluous but who really cares...
*/
file_update_time(vma->vm_file);
ret = __block_page_mkwrite(vma, vmf, get_block);
sb_end_pagefault(sb);
return block_page_mkwrite_return(ret);
......
......@@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
......@@ -912,9 +911,7 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
......@@ -1233,6 +1230,7 @@ struct ext4_sb_info {
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
/* tunables */
unsigned long s_stripe;
......@@ -1243,6 +1241,7 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_writeback_mb_bump;
unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
......@@ -1270,8 +1269,12 @@ struct ext4_sb_info {
unsigned long s_sectors_written_start;
u64 s_kbytes_written;
/* the size of zero-out chunk */
unsigned int s_extent_max_zeroout_kb;
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
/* workqueue for dio unwritten */
struct workqueue_struct *dio_unwritten_wq;
......@@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
{
return inode->i_private;
}
static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
{
inode->i_private = io;
}
/*
* Inode dynamic state flags
*/
......@@ -1345,6 +1358,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
......@@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
extern int ext4_flush_completed_IO(struct inode *);
extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
......@@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
......@@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb,
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
......@@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern void ext4_unwritten_wait(struct inode *inode);
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
......@@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
......@@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
/*
* Disable DIO read nolock optimization, so new dioreaders will be forced
* to grab i_mutex
*/
static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
{
ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
smp_mb();
}
static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
{
smp_mb();
ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
}
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
......
This diff is collapsed.
......@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
static void ext4_aiodio_wait(struct inode *inode)
void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
......@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
"performance will be poor.",
inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
ext4_aiodio_wait(inode);
ext4_unwritten_wait(inode);
}
BUG_ON(iocb->ki_pos != pos);
......
......@@ -34,87 +34,6 @@
#include <trace/events/ext4.h>
static void dump_completed_IO(struct inode * inode)
{
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
unsigned long flags;
if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
return;
}
ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
io1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
io, inode->i_ino, io0, io1);
}
spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
#endif
}
/*
* This function is called from ext4_sync_file().
*
* When IO is completed, the work to convert unwritten extents to
* written is queued on workqueue but may not get immediately
* scheduled. When fsync is called, we need to ensure the
* conversion is complete before fsync returns.
* The inode keeps track of a list of pending/completed IO that
* might needs to do the conversion. This function walks through
* the list and convert the related unwritten extents for completed IO
* to written.
* The function return the number of pending IOs on success.
*/
int ext4_flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned long flags;
int ret = 0;
int ret2 = 0;
dump_completed_IO(inode);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
while (!list_empty(&ei->i_completed_io_list)){
io = list_entry(ei->i_completed_io_list.next,
ext4_io_end_t, list);
list_del_init(&io->list);
io->flag |= EXT4_IO_END_IN_FSYNC;
/*
* Calling ext4_end_io_nolock() to convert completed
* IO to written.
*
* When ext4_sync_file() is called, run_queue() may already
* about to flush the work corresponding to this io structure.
* It will be upset if it founds the io structure related
* to the work-to-be schedule is freed.
*
* Thus we need to keep the io structure still valid here after
* conversion finished. The io structure has a flag to
* avoid double converting from both fsync and background work
* queue work.
*/
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
ret = ext4_end_io_nolock(io);
if (ret < 0)
ret2 = ret;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
io->flag &= ~EXT4_IO_END_IN_FSYNC;
}
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
return (ret2 < 0) ? ret2 : 0;
}
/*
* If we're not journaling and this is a just-created file, we have to
* sync our parent directory (if it was freshly created) since
......@@ -203,7 +122,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret;
int ret, err;
tid_t commit_tid;
bool needs_barrier = false;
......@@ -219,7 +138,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (inode->i_sb->s_flags & MS_RDONLY)
goto out;
ret = ext4_flush_completed_IO(inode);
ret = ext4_flush_unwritten_io(inode);
if (ret < 0)
goto out;
......@@ -255,8 +174,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
ret = jbd2_log_wait_commit(journal, commit_tid);
if (needs_barrier)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
ret = err;
}
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_sync_file_exit(inode, ret);
......
......@@ -697,6 +697,15 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
if (!gdp)
goto fail;
/*
* Check free inodes count before loading bitmap.
*/
if (ext4_free_inodes_count(sb, gdp) == 0) {
if (++group == ngroups)
group = 0;
continue;
}
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
......
......@@ -807,16 +807,30 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
retry:
if (rw == READ && ext4_should_dioread_nolock(inode)) {
if (unlikely(!list_empty(&ei->i_completed_io_list))) {
if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
mutex_lock(&inode->i_mutex);
ext4_flush_completed_IO(inode);
ext4_flush_unwritten_io(inode);
mutex_unlock(&inode->i_mutex);
}
/*
* Nolock dioread optimization may be dynamically disabled
* via ext4_inode_block_unlocked_dio(). Check inode's state
* while holding extra i_dio_count ref.
*/
atomic_inc(&inode->i_dio_count);
smp_mb();
if (unlikely(ext4_test_inode_state(inode,
EXT4_STATE_DIOREAD_LOCK))) {
inode_dio_done(inode);
goto locked;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
ret = blockdev_direct_IO(rw, iocb, inode, iov,
offset, nr_segs, ext4_get_block);
......
......@@ -732,11 +732,13 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
err = ext4_map_blocks(handle, inode, &map,
create ? EXT4_GET_BLOCKS_CREATE : 0);
/* ensure we send some value back into *errp */
*errp = 0;
if (err < 0)
*errp = err;
if (err <= 0)
return NULL;
*errp = 0;
bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!bh) {
......@@ -1954,9 +1956,6 @@ static int __ext4_journalled_writepage(struct page *page,
return ret;
}
static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
......@@ -2463,6 +2462,16 @@ static int ext4_nonda_switch(struct super_block *sb)
free_blocks = EXT4_C2B(sbi,
percpu_counter_read_positive(&sbi->s_freeclusters_counter));
dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
/*
* Start pushing delalloc when 1/2 of free blocks are dirty.
*/
if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
!writeback_in_progress(sb->s_bdi) &&
down_read_trylock(&sb->s_umount)) {
writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
up_read(&sb->s_umount);
}
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
/*
......@@ -2471,13 +2480,6 @@ static int ext4_nonda_switch(struct super_block *sb)
*/
return 1;
}
/*
* Even if we don't switch but are nearing capacity,
* start pushing delalloc when 1/2 of free blocks are dirty.
*/
if (free_blocks < 2 * dirty_blocks)
writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
return 0;
}
......@@ -2879,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
ext4_io_end_t *io_end = iocb->private;
struct workqueue_struct *wq;
unsigned long flags;
struct ext4_inode_info *ei;
/* if not async direct IO or dio with 0 bytes write, just return */
if (!io_end || !size)
......@@ -2910,24 +2909,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
io_end->iocb = iocb;
io_end->result = ret;
}
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
/* Add the io_end to per-inode completed aio dio list*/
ei = EXT4_I(io_end->inode);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
ext4_add_complete_io(io_end);
}
static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
{
ext4_io_end_t *io_end = bh->b_private;
struct workqueue_struct *wq;
struct inode *inode;
unsigned long flags;
if (!test_clear_buffer_uninit(bh) || !io_end)
goto out;
......@@ -2946,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
*/
inode = io_end->inode;
ext4_set_io_unwritten_flag(inode, io_end);
/* Add the io_end to per-inode completed io list*/
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
ext4_add_complete_io(io_end);
out:
bh->b_private = NULL;
bh->b_end_io = NULL;
......@@ -3029,6 +3010,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
overwrite = *((int *)iocb->private);
if (overwrite) {
atomic_inc(&inode->i_dio_count);
down_read(&EXT4_I(inode)->i_data_sem);
mutex_unlock(&inode->i_mutex);
}
......@@ -3054,7 +3036,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* hook to the iocb.
*/
iocb->private = NULL;
EXT4_I(inode)->cur_aio_dio = NULL;
ext4_inode_aio_set(inode, NULL);
if (!is_sync_kiocb(iocb)) {
ext4_io_end_t *io_end =
ext4_init_io_end(inode, GFP_NOFS);
......@@ -3071,7 +3053,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* is a unwritten extents needs to be converted
* when IO is completed.
*/
EXT4_I(inode)->cur_aio_dio = iocb->private;
ext4_inode_aio_set(inode, io_end);
}
if (overwrite)
......@@ -3091,7 +3073,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
NULL,
DIO_LOCKING);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
ext4_inode_aio_set(inode, NULL);
/*
* The io_end structure takes a reference to the inode,
* that structure needs to be destroyed and the
......@@ -3126,6 +3108,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
retake_lock:
/* take i_mutex locking again if we do a ovewrite dio */
if (overwrite) {
inode_dio_done(inode);
up_read(&EXT4_I(inode)->i_data_sem);
mutex_lock(&inode->i_mutex);
}
......@@ -4052,6 +4035,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
int err = 0, rc, block;
int need_datasync = 0;
uid_t i_uid;
gid_t i_gid;
......@@ -4102,7 +4086,10 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
if (ei->i_disksize != ext4_isize(raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
......@@ -4155,7 +4142,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
ext4_update_inode_fsync_trans(handle, inode, 0);
ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
......@@ -4298,7 +4285,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
inode_dio_wait(inode);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
......@@ -4347,8 +4333,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
if (attr->ia_size != i_size_read(inode))
if (attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
/* Inode size will be reduced, wait for dio in flight.
* Temporarily disable dioread_nolock to prevent
* livelock. */
if (orphan) {
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
ext4_inode_resume_unlocked_dio(inode);
}
}
ext4_truncate(inode);
}
......@@ -4727,6 +4722,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
jbd2_journal_lock_updates(journal);
/*
......@@ -4746,6 +4745,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
ext4_inode_resume_unlocked_dio(inode);
/* Finally we can mark the inode as dirty. */
......@@ -4780,6 +4780,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
int retries = 0;
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
......
......@@ -366,26 +366,11 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EOPNOTSUPP;
}
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_META_BG)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not (yet) supported with meta_bg");
return -EOPNOTSUPP;
}
if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
sizeof(__u64))) {
return -EFAULT;
}
if (n_blocks_count > MAX_32_NUM &&
!EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR,
"File system only supports 32-bit block numbers");
return -EOPNOTSUPP;
}
err = ext4_resize_begin(sb);
if (err)
return err;
......@@ -420,13 +405,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
ext4_msg(sb, KERN_ERR,
"FITRIM not supported with bigalloc");
return -EOPNOTSUPP;
}
if (copy_from_user(&range, (struct fstrim_range __user *)arg,
sizeof(range)))
return -EFAULT;
......
......@@ -24,6 +24,7 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <linux/slab.h>
#include <trace/events/ext4.h>
......@@ -1338,17 +1339,17 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
int next = block;
int max;
int max, order;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
BUG_ON(ex == NULL);
buddy = mb_find_buddy(e4b, order, &max);
buddy = mb_find_buddy(e4b, 0, &max);
BUG_ON(buddy == NULL);
BUG_ON(block >= max);
if (mb_test_bit(block, buddy)) {
......@@ -1358,12 +1359,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
return 0;
}
/* FIXME dorp order completely ? */
if (likely(order == 0)) {
/* find actual order */
order = mb_find_order_for_block(e4b, block);
block = block >> order;
}
ex->fe_len = 1 << order;
ex->fe_start = block << order;
......@@ -1549,7 +1547,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
/* recheck chunk's availability - we don't know
* when it was found (within this lock-unlock
* period or not) */
max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
if (max >= gex->fe_len) {
ext4_mb_use_best_found(ac, e4b);
return;
......@@ -1641,7 +1639,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
ac->ac_b_ex = ex;
......@@ -1662,17 +1660,20 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
int max;
int err;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
return 0;
if (grp->bb_free == 0)
return 0;
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
return err;
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
......@@ -1788,7 +1789,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break;
}
mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
BUG_ON(ex.fe_len <= 0);
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
......@@ -1840,7 +1841,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
ac->ac_b_ex = ex;
......@@ -1862,6 +1863,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
free = grp->bb_free;
if (free == 0)
return 0;
if (cr <= 2 && free < ac->ac_g_ex.fe_len)
return 0;
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
int ret = ext4_mb_init_group(ac->ac_sb, group);
......@@ -1869,10 +1876,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
return 0;
}
free = grp->bb_free;
fragments = grp->bb_fragments;
if (free == 0)
return 0;
if (fragments == 0)
return 0;
......@@ -2163,6 +2167,39 @@ static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
return cachep;
}
/*
* Allocate the top-level s_group_info array for the specified number
* of groups
*/
int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned size;
struct ext4_group_info ***new_groupinfo;
size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
if (size <= sbi->s_group_info_size)
return 0;
size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
if (!new_groupinfo) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
}
if (sbi->s_group_info) {
memcpy(new_groupinfo, sbi->s_group_info,
sbi->s_group_info_size * sizeof(*sbi->s_group_info));
ext4_kvfree(sbi->s_group_info);
}
sbi->s_group_info = new_groupinfo;
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
}
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
struct ext4_group_desc *desc)
......@@ -2195,12 +2232,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
if (meta_group_info[i] == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
goto exit_group_info;
}
memset(meta_group_info[i], 0, kmem_cache_size(cachep));
set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
&(meta_group_info[i]->bb_state));
......@@ -2252,49 +2288,14 @@ static int ext4_mb_init_backend(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
int num_meta_group_infos;
int num_meta_group_infos_max;
int array_size;
int err;
struct ext4_group_desc *desc;
struct kmem_cache *cachep;
/* This is the number of blocks used by GDT */
num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
/*
* This is the total number of blocks used by GDT including
* the number of reserved blocks for GDT.
* The s_group_info array is allocated with this value
* to allow a clean online resize without a complex
* manipulation of pointer.
* The drawback is the unused memory when no resize
* occurs but it's very low in terms of pages
* (see comments below)
* Need to handle this properly when META_BG resizing is allowed
*/
num_meta_group_infos_max = num_meta_group_infos +
le16_to_cpu(es->s_reserved_gdt_blocks);
err = ext4_mb_alloc_groupinfo(sb, ngroups);
if (err)
return err;
/*
* array_size is the size of s_group_info array. We round it
* to the next power of two because this approximation is done
* internally by kmalloc so we can have some more memory
* for free here (e.g. may be used for META_BG resize).
*/
array_size = 1;
while (array_size < sizeof(*sbi->s_group_info) *
num_meta_group_infos_max)
array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
}
sbi->s_buddy_cache = new_inode(sb);
if (sbi->s_buddy_cache == NULL) {
ext4_msg(sb, KERN_ERR, "can't get new inode");
......@@ -2322,7 +2323,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
i = num_meta_group_infos;
i = sbi->s_group_info_size;
while (i-- > 0)
kfree(sbi->s_group_info[i]);
iput(sbi->s_buddy_cache);
......@@ -4008,7 +4009,6 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
memset(ac, 0, sizeof(struct ext4_allocation_context));
ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
......@@ -4291,7 +4291,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
}
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
......@@ -4657,6 +4657,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
* with group lock held. generate_buddy look at
* them with group lock_held
*/
if (test_opt(sb, DISCARD))
ext4_issue_discard(sb, block_group, bit, count);
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
......@@ -4988,7 +4990,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> sb->s_blocksize_bits;
end = start + (range->len >> sb->s_blocksize_bits) - 1;
minlen = range->minlen >> sb->s_blocksize_bits;
minlen = EXT4_NUM_B2C(EXT4_SB(sb),
range->minlen >> sb->s_blocksize_bits);
if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
unlikely(start >= max_blks))
......@@ -5048,6 +5051,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
out:
range->len = trimmed * sb->s_blocksize;
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
return ret;
}
......@@ -64,11 +64,6 @@ extern u8 mb_enable_debug;
*/
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
* How many groups mballoc will scan looking for the best chunk
*/
#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5
/*
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
......
This diff is collapsed.
......@@ -55,6 +55,13 @@ static struct buffer_head *ext4_append(handle_t *handle,
{
struct buffer_head *bh;
if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
((inode->i_size >> 10) >=
EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
*err = -ENOSPC;
return NULL;
}
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
bh = ext4_bread(handle, inode, *block, 1, err);
......@@ -67,6 +74,12 @@ static struct buffer_head *ext4_append(handle_t *handle,
bh = NULL;
}
}
if (!bh && !(*err)) {
*err = -EIO;
ext4_error(inode->i_sb,
"Directory hole detected on inode %lu\n",
inode->i_ino);
}
return bh;
}
......@@ -594,8 +607,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
u32 hash;
frame->bh = NULL;
if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
if (*err == 0)
*err = ERR_BAD_DX_DIR;
goto fail;
}
root = (struct dx_root *) bh->b_data;
if (root->info.hash_version != DX_HASH_TEA &&
root->info.hash_version != DX_HASH_HALF_MD4 &&
......@@ -696,8 +712,11 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
frame->entries = entries;
frame->at = at;
if (!indirect--) return frame;
if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
if (!(*err))
*err = ERR_BAD_DX_DIR;
goto fail2;
}
at = entries = ((struct dx_node *) bh->b_data)->entries;
if (!buffer_verified(bh) &&
......@@ -807,8 +826,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
*/
while (num_frames--) {
if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
0, &err)))
0, &err))) {
if (!err) {
ext4_error(dir->i_sb,
"Directory hole detected on inode %lu\n",
dir->i_ino);
return -EIO;
}
return err; /* Failure */
}
if (!buffer_verified(bh) &&
!ext4_dx_csum_verify(dir,
......@@ -839,12 +865,19 @@ static int htree_dirblock_to_tree(struct file *dir_file,
{
struct buffer_head *bh;
struct ext4_dir_entry_2 *de, *top;
int err, count = 0;
int err = 0, count = 0;
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
if (!err) {
err = -EIO;
ext4_error(dir->i_sb,
"Directory hole detected on inode %lu\n",
dir->i_ino);
}
return err;
}
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
......@@ -1267,8 +1300,15 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
return NULL;
do {
block = dx_get_block(frame->at);
if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
if (!(*err)) {
*err = -EIO;
ext4_error(dir->i_sb,
"Directory hole detected on inode %lu\n",
dir->i_ino);
}
goto errout;
}
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
......@@ -1801,9 +1841,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
blocks = dir->i_size >> sb->s_blocksize_bits;
for (block = 0; block < blocks; block++) {
bh = ext4_bread(handle, dir, block, 0, &retval);
if(!bh)
if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
if (!retval) {
retval = -EIO;
ext4_error(inode->i_sb,
"Directory hole detected on inode %lu\n",
inode->i_ino);
}
return retval;
}
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data))
......@@ -1860,8 +1906,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
entries = frame->entries;
at = frame->at;
if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
if (!err) {
err = -EIO;
ext4_error(dir->i_sb,
"Directory hole detected on inode %lu\n",
dir->i_ino);
}
goto cleanup;
}
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
......@@ -2149,9 +2202,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
#ifdef CONFIG_EXT4_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
#endif
err = ext4_add_nondir(handle, dentry, inode);
}
ext4_journal_stop(handle);
......@@ -2199,9 +2250,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
dir_block = ext4_bread(handle, inode, 0, 1, &err);
if (!dir_block)
if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
if (!err) {
err = -EIO;
ext4_error(inode->i_sb,
"Directory hole detected on inode %lu\n",
inode->i_ino);
}
goto out_clear_inode;
}
BUFFER_TRACE(dir_block, "get_write_access");
err = ext4_journal_get_write_access(handle, dir_block);
if (err)
......@@ -2318,6 +2375,11 @@ static int empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode,
"error %d reading directory "
"lblock %u", err, lblock);
else
ext4_warning(inode->i_sb,
"bad directory (dir #%lu) - no data block",
inode->i_ino);
offset += sb->s_blocksize;
continue;
}
......@@ -2362,7 +2424,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0, rc;
if (!ext4_handle_valid(handle))
if (!EXT4_SB(sb)->s_journal)
return 0;
mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
......@@ -2436,8 +2498,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
/* ext4_handle_valid() assumes a valid handle_t pointer */
if (handle && !ext4_handle_valid(handle))
if (!EXT4_SB(inode->i_sb)->s_journal)
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
......@@ -2456,7 +2517,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
if (sbi->s_journal && !handle)
if (!handle)
goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
......@@ -2826,9 +2887,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
}
retval = -EIO;
dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
if (!retval) {
retval = -EIO;
ext4_error(old_inode->i_sb,
"Directory hole detected on inode %lu\n",
old_inode->i_ino);
}
goto end_rename;
}
if (!buffer_verified(dir_bh) &&
!ext4_dirent_csum_verify(old_inode,
(struct ext4_dir_entry *)dir_bh->b_data))
......
......@@ -71,6 +71,9 @@ void ext4_free_io_end(ext4_io_end_t *io)
int i;
BUG_ON(!io);
BUG_ON(!list_empty(&io->list));
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
if (io->page)
put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
......@@ -81,13 +84,8 @@ void ext4_free_io_end(ext4_io_end_t *io)
kmem_cache_free(io_end_cachep, io);
}
/*
* check a range of space and convert unwritten extents to written.
*
* Called with inode->i_mutex; we depend on this when we manipulate
* io->flag, since we could otherwise race with ext4_flush_completed_IO()
*/
int ext4_end_io_nolock(ext4_io_end_t *io)
/* check a range of space and convert unwritten extents to written. */
static int ext4_end_io(ext4_io_end_t *io)
{
struct inode *inode = io->inode;
loff_t offset = io->offset;
......@@ -106,63 +104,136 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
if (io->iocb)
aio_complete(io->iocb, io->result, 0);
if (io->flag & EXT4_IO_END_DIRECT)
inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(io->inode));
return ret;
}
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
static void ext4_end_io_work(struct work_struct *work)
static void dump_completed_IO(struct inode *inode)
{
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
struct inode *inode = io->inode;
struct ext4_inode_info *ei = EXT4_I(inode);
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
unsigned long flags;
if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
ext4_debug("inode %lu completed_io list is empty\n",
inode->i_ino);
return;
}
ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino);
list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) {
cur = &io->list;
before = cur->prev;
io0 = container_of(before, ext4_io_end_t, list);
after = cur->next;
io1 = container_of(after, ext4_io_end_t, list);
ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
io, inode->i_ino, io0, io1);
}
#endif
}
/* Add the io_end to per-inode completed end_io list. */
void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
struct workqueue_struct *wq;
unsigned long flags;
BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
if (io->flag & EXT4_IO_END_IN_FSYNC)
goto requeue;
if (list_empty(&io->list)) {
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
goto free;
if (list_empty(&ei->i_completed_io_list)) {
io_end->flag |= EXT4_IO_END_QUEUED;
queue_work(wq, &io_end->work);
}
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
static int ext4_do_flush_completed_IO(struct inode *inode,
ext4_io_end_t *work_io)
{
ext4_io_end_t *io;
struct list_head unwritten, complete, to_free;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, ret = 0;
if (!mutex_trylock(&inode->i_mutex)) {
bool was_queued;
requeue:
was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
io->flag |= EXT4_IO_END_QUEUED;
INIT_LIST_HEAD(&complete);
INIT_LIST_HEAD(&to_free);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
dump_completed_IO(inode);
list_replace_init(&ei->i_completed_io_list, &unwritten);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
/*
* Requeue the work instead of waiting so that the work
* items queued after this can be processed.
*/
queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work);
/*
* To prevent the ext4-dio-unwritten thread from keeping
* requeueing end_io requests and occupying cpu for too long,
* yield the cpu if it sees an end_io request that has already
* been requeued.
*/
if (was_queued)
yield();
return;
while (!list_empty(&unwritten)) {
io = list_entry(unwritten.next, ext4_io_end_t, list);
BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN));
list_del_init(&io->list);
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
list_add_tail(&io->list, &complete);
}
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
while (!list_empty(&complete)) {
io = list_entry(complete.next, ext4_io_end_t, list);
io->flag &= ~EXT4_IO_END_UNWRITTEN;
/* end_io context can not be destroyed now because it still
* used by queued worker. Worker thread will destroy it later */
if (io->flag & EXT4_IO_END_QUEUED)
list_del_init(&io->list);
else
list_move(&io->list, &to_free);
}
/* If we are called from worker context, it is time to clear queued
* flag, and destroy it's end_io if it was converted already */
if (work_io) {
work_io->flag &= ~EXT4_IO_END_QUEUED;
if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
list_add_tail(&work_io->list, &to_free);
}
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
(void) ext4_end_io_nolock(io);
mutex_unlock(&inode->i_mutex);
free:
while (!list_empty(&to_free)) {
io = list_entry(to_free.next, ext4_io_end_t, list);
list_del_init(&io->list);
ext4_free_io_end(io);
}
return ret;
}
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
static void ext4_end_io_work(struct work_struct *work)
{
ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
ext4_do_flush_completed_IO(io->inode, io);
}
int ext4_flush_unwritten_io(struct inode *inode)
{
int ret;
WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
!(inode->i_state & I_FREEING));
ret = ext4_do_flush_completed_IO(inode, NULL);
ext4_unwritten_wait(inode);
return ret;
}
ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
......@@ -195,9 +266,7 @@ static void buffer_io_error(struct buffer_head *bh)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
struct workqueue_struct *wq;
struct inode *inode;
unsigned long flags;
int i;
sector_t bi_sector = bio->bi_sector;
......@@ -255,14 +324,7 @@ static void ext4_end_bio(struct bio *bio, int error)
return;
}
/* Add the io_end to per-inode completed io list*/
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
/* queue the work to convert unwritten extents to written */
queue_work(wq, &io_end->work);
ext4_add_complete_io(io_end);
}
void ext4_io_submit(struct ext4_io_submit *io)
......
This diff is collapsed.
......@@ -420,7 +420,7 @@ static void __save_error_info(struct super_block *sb, const char *func,
*/
if (!es->s_error_count)
mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
le32_add_cpu(&es->s_error_count, 1);
}
static void save_error_info(struct super_block *sb, const char *func,
......@@ -850,7 +850,6 @@ static void ext4_put_super(struct super_block *sb)
flush_workqueue(sbi->dio_unwritten_wq);
destroy_workqueue(sbi->dio_unwritten_wq);
lock_super(sb);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
......@@ -917,7 +916,6 @@ static void ext4_put_super(struct super_block *sb)
* Now that we are completely done shutting down the
* superblock, we need to actually destroy the kobject.
*/
unlock_super(sb);
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
if (sbi->s_chksum_driver)
......@@ -956,11 +954,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->jinode = NULL;
INIT_LIST_HEAD(&ei->i_completed_io_list);
spin_lock_init(&ei->i_completed_io_lock);
ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_aiodio_unwritten, 0);
atomic_set(&ei->i_unwritten, 0);
return &ei->vfs_inode;
}
......@@ -1224,6 +1221,7 @@ enum {
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb,
};
static const match_table_t tokens = {
......@@ -1297,6 +1295,7 @@ static const match_table_t tokens = {
{Opt_init_itable, "init_itable=%u"},
{Opt_init_itable, "init_itable"},
{Opt_noinit_itable, "noinit_itable"},
{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
{Opt_removed, "check=none"}, /* mount option from ext2/3 */
{Opt_removed, "nocheck"}, /* mount option from ext2/3 */
{Opt_removed, "reservation"}, /* mount option from ext2/3 */
......@@ -1477,6 +1476,7 @@ static const struct mount_opts {
{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
{Opt_max_dir_size_kb, 0, MOPT_GTE0},
{Opt_err, 0, 0}
};
......@@ -1592,6 +1592,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
if (!args->from)
arg = EXT4_DEF_LI_WAIT_MULT;
sbi->s_li_wait_mult = arg;
} else if (token == Opt_max_dir_size_kb) {
sbi->s_max_dir_size_kb = arg;
} else if (token == Opt_stripe) {
sbi->s_stripe = arg;
} else if (m->flags & MOPT_DATAJ) {
......@@ -1664,7 +1666,7 @@ static int parse_options(char *options, struct super_block *sb,
* Initialize args struct so we know whether arg was
* found; some options take optional arguments.
*/
args[0].to = args[0].from = 0;
args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
if (handle_mount_opt(sb, p, token, args, journal_devnum,
journal_ioprio, is_remount) < 0)
......@@ -1740,7 +1742,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
static const char *token2str(int token)
{
static const struct match_token *t;
const struct match_token *t;
for (t = tokens; t->token != Opt_err; t++)
if (t->token == token && !strchr(t->pattern, '='))
......@@ -1823,6 +1825,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
(sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
if (nodefs || sbi->s_max_dir_size_kb)
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
ext4_show_quota_options(seq, sb);
return 0;
......@@ -1914,15 +1918,45 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct flex_groups *new_groups;
int size;
if (!sbi->s_log_groups_per_flex)
return 0;
size = ext4_flex_group(sbi, ngroup - 1) + 1;
if (size <= sbi->s_flex_groups_allocated)
return 0;
size = roundup_pow_of_two(size * sizeof(struct flex_groups));
new_groups = ext4_kvzalloc(size, GFP_KERNEL);
if (!new_groups) {
ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
size / (int) sizeof(struct flex_groups));
return -ENOMEM;
}
if (sbi->s_flex_groups) {
memcpy(new_groups, sbi->s_flex_groups,
(sbi->s_flex_groups_allocated *
sizeof(struct flex_groups)));
ext4_kvfree(sbi->s_flex_groups);
}
sbi->s_flex_groups = new_groups;
sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
return 0;
}
static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
ext4_group_t flex_group_count;
ext4_group_t flex_group;
unsigned int groups_per_flex = 0;
size_t size;
int i;
int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
......@@ -1931,17 +1965,9 @@ static int ext4_fill_flex_info(struct super_block *sb)
}
groups_per_flex = 1 << sbi->s_log_groups_per_flex;
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
size = flex_group_count * sizeof(struct flex_groups);
sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
if (sbi->s_flex_groups == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
flex_group_count);
err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
if (err)
goto failed;
}
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
......@@ -2144,10 +2170,12 @@ static void ext4_orphan_cleanup(struct super_block *sb,
}
if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
if (es->s_last_orphan)
/* don't clear list on RO mount w/ errors */
if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
jbd_debug(1, "Errors on filesystem, "
"clearing orphan list.\n");
es->s_last_orphan = 0;
}
jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
return;
}
......@@ -2528,6 +2556,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
static struct attribute *ext4_attrs[] = {
......@@ -2543,6 +2572,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
NULL,
};
......@@ -2550,10 +2580,12 @@ static struct attribute *ext4_attrs[] = {
/* Features this copy of ext4 supports */
EXT4_INFO_ATTR(lazy_itable_init);
EXT4_INFO_ATTR(batched_discard);
EXT4_INFO_ATTR(meta_bg_resize);
static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(lazy_itable_init),
ATTR_LIST(batched_discard),
ATTR_LIST(meta_bg_resize),
NULL,
};
......@@ -3374,7 +3406,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* enable delayed allocation by default
* Use -o nodelalloc to turn it off
*/
if (!IS_EXT3_SB(sb) &&
if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
......@@ -3743,6 +3775,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
/*
* set up enough so that it can read an inode
......@@ -4519,11 +4552,9 @@ static int ext4_unfreeze(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return 0;
lock_super(sb);
/* Reset the needs_recovery flag before the fs is unlocked. */
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
unlock_super(sb);
return 0;
}
......@@ -4559,7 +4590,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
char *orig_data = kstrdup(data, GFP_KERNEL);
/* Store the original options */
lock_super(sb);
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
old_opts.s_mount_opt2 = sbi->s_mount_opt2;
......@@ -4701,7 +4731,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal == NULL)
ext4_commit_super(sb, 1);
unlock_super(sb);
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
......@@ -4714,12 +4743,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_QUOTA)) {
err = ext4_enable_quotas(sb);
if (err) {
lock_super(sb);
if (err)
goto restore_opts;
}
}
}
#endif
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
......@@ -4744,7 +4771,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
unlock_super(sb);
kfree(orig_data);
return err;
}
......@@ -5269,8 +5295,10 @@ static int __init ext4_init_fs(void)
if (err)
goto out6;
ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
if (!ext4_kset)
if (!ext4_kset) {
err = -ENOMEM;
goto out5;
}
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = ext4_init_feat_adverts();
......
......@@ -63,6 +63,7 @@ int writeback_in_progress(struct backing_dev_info *bdi)
{
return test_bit(BDI_writeback_running, &bdi->state);
}
EXPORT_SYMBOL(writeback_in_progress);
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
......
......@@ -1014,17 +1014,35 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* there's no point in keeping a checkpoint record for
* it. */
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
* that buffer in the future after the "add to orphan"
* operation been committed, That's not only a performance
* gain, it also stops aliasing problems if the buffer is
* left behind for writeback and gets reallocated for another
* use in a different page. */
if (buffer_freed(bh) && !jh->b_next_transaction) {
/*
* A buffer which has been freed while still being journaled by
* a previous transaction.
*/
if (buffer_freed(bh)) {
/*
* If the running transaction is the one containing
* "add to orphan" operation (b_next_transaction !=
* NULL), we have to wait for that transaction to
* commit before we can really get rid of the buffer.
* So just clear b_modified to not confuse transaction
* credit accounting and refile the buffer to
* BJ_Forget of the running transaction. If the just
* committed transaction contains "add to orphan"
* operation, we can completely invalidate the buffer
* now. We are rather through in that since the
* buffer may be still accessible when blocksize <
* pagesize and it is attached to the last partial
* page.
*/
jh->b_modified = 0;
if (!jh->b_next_transaction) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_req(bh);
bh->b_bdev = NULL;
}
}
if (buffer_jbddirty(bh)) {
......
......@@ -1354,6 +1354,11 @@ static void jbd2_mark_journal_empty(journal_t *journal)
BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
read_lock(&journal->j_state_lock);
/* Is it already empty? */
if (sb->s_start == 0) {
read_unlock(&journal->j_state_lock);
return;
}
jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
journal->j_tail_sequence);
......
......@@ -289,8 +289,11 @@ int jbd2_journal_recover(journal_t *journal)
if (!err)
err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
if (journal->j_flags & JBD2_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
if (!err)
err = err2;
}
return err;
}
......
......@@ -1841,15 +1841,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
int partial_page)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
int ret;
BUFFER_TRACE(bh, "entry");
retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
......@@ -1878,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* clear the buffer dirty bit at latest at the moment when the
* transaction marking the buffer as freed in the filesystem
* structures is committed because from that moment on the
* buffer can be reallocated and used by a different page.
* block can be reallocated and used by a different page.
* Since the block hasn't been freed yet but the inode has
* already been added to orphan list, it is safe for us to add
* the buffer to BJ_Forget list of the newest transaction.
*
* Also we have to clear buffer_mapped flag of a truncated buffer
* because the buffer_head may be attached to the page straddling
* i_size (can happen only when blocksize < pagesize) and thus the
* buffer_head can be reused when the file is extended again. So we end
* up keeping around invalidated buffers attached to transactions'
* BJ_Forget list just to stop checkpointing code from cleaning up
* the transaction this buffer was modified in.
*/
transaction = jh->b_transaction;
if (transaction == NULL) {
......@@ -1908,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
ret = __dispose_buffer(jh,
may_free = __dispose_buffer(jh,
journal->j_running_transaction);
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
write_unlock(&journal->j_state_lock);
return ret;
goto zap_buffer;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
......@@ -1922,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
ret = __dispose_buffer(jh,
may_free = __dispose_buffer(jh,
journal->j_committing_transaction);
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
write_unlock(&journal->j_state_lock);
return ret;
goto zap_buffer;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
......@@ -1940,10 +1941,24 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
JBUFFER_TRACE(jh, "on committing transaction");
/*
* The buffer is committing, we simply cannot touch
* it. So we just set j_next_transaction to the
* running transaction (if there is one) and mark
* buffer as freed so that commit code knows it should
* clear dirty bits when it is done with the buffer.
* it. If the page is straddling i_size we have to wait
* for commit and try again.
*/
if (partial_page) {
tid_t tid = journal->j_committing_transaction->t_tid;
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
write_unlock(&journal->j_state_lock);
jbd2_log_wait_commit(journal, tid);
goto retry;
}
/*
* OK, buffer won't be reachable after truncate. We just set
* j_next_transaction to the running transaction (if there is
* one) and mark buffer as freed so that commit code knows it
* should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
......@@ -1966,6 +1981,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
zap_buffer:
/*
* This is tricky. Although the buffer is truncated, it may be reused
* if blocksize < pagesize and it is attached to the page straddling
* EOF. Since the buffer might have been added to BJ_Forget list of the
* running transaction, journal_get_write_access() won't clear
* b_modified and credit accounting gets confused. So clear b_modified
* here.
*/
jh->b_modified = 0;
jbd2_journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
......@@ -2017,7 +2041,8 @@ void jbd2_journal_invalidatepage(journal_t *journal,
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
may_free &= journal_unmap_buffer(journal, bh);
may_free &= journal_unmap_buffer(journal, bh,
offset > 0);
unlock_buffer(bh);
}
curr_off = next_off;
......
......@@ -116,6 +116,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (unlikely(ret))
goto out;
file_update_time(vma->vm_file);
ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
......
......@@ -3,6 +3,7 @@
#define FALLOC_FL_KEEP_SIZE 0x01 /* default is extend size */
#define FALLOC_FL_PUNCH_HOLE 0x02 /* de-allocates range */
#define FALLOC_FL_NO_HIDE_STALE 0x04 /* reserved codepoint */
#ifdef __KERNEL__
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment