Commit 9e38d86f authored by Nick Piggin, committed by Al Viro

fs: Implement lazy LRU updates for inodes

Convert the inode LRU to use lazy updates to reduce lock and
cacheline traffic.  We avoid moving inodes around in the LRU list
during iget/iput operations so these frequent operations don't need
to access the LRUs. Instead, we defer the refcount checks to
reclaim-time and use a per-inode state flag, I_REFERENCED, to tell
reclaim that iget has touched the inode in the past. This means that
only reclaim should be touching the LRU with any frequency, hence
significantly reducing lock acquisitions and the amount of contention
on LRU updates.

This also removes the inode_in_use list, which means we now only
have one list for tracking the inode LRU status. This makes it much
simpler to split out the LRU list operations under its own lock.
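
To make the scheme concrete, here is a minimal userspace sketch of the
lazy LRU idea (illustration only: struct obj, obj_get, obj_put, prune and
REFERENCED are invented names for this example, not the kernel API). The
hot-path operations never touch the list; only the reclaim pass walks it,
giving flagged entries a second trip around the LRU before freeing them:

	#include <stdbool.h>
	#include <stdio.h>

	enum { REFERENCED = 1 };	/* plays the role of I_REFERENCED */

	struct obj {
		int refcount;
		int flags;
		struct obj *prev, *next;	/* NULL when off the LRU */
		const char *name;
	};

	static struct obj *lru_head, *lru_tail;	/* newest at the head */

	static bool on_lru(struct obj *o)
	{
		return o->prev || o->next || lru_head == o;
	}

	static void lru_add(struct obj *o)	/* add at head if absent */
	{
		if (on_lru(o))
			return;
		o->next = lru_head;
		if (lru_head)
			lru_head->prev = o;
		lru_head = o;
		if (!lru_tail)
			lru_tail = o;
	}

	static void lru_del(struct obj *o)
	{
		if (!on_lru(o))
			return;
		if (o->prev) o->prev->next = o->next; else lru_head = o->next;
		if (o->next) o->next->prev = o->prev; else lru_tail = o->prev;
		o->prev = o->next = NULL;
	}

	/* Fast path: taking a reference never touches the LRU or its lock. */
	static void obj_get(struct obj *o)
	{
		o->refcount++;
	}

	/* Final put: flag the object and lazily park it on the LRU. */
	static void obj_put(struct obj *o)
	{
		if (--o->refcount == 0) {
			o->flags |= REFERENCED;	/* cf. iput_final() */
			lru_add(o);
		}
	}

	/* Reclaim: the only frequent walker of the list. */
	static void prune(int nr_to_scan)
	{
		while (nr_to_scan-- && lru_tail) {
			struct obj *o = lru_tail;

			if (o->refcount) {	/* re-referenced: just delist */
				lru_del(o);
				continue;
			}
			if (o->flags & REFERENCED) {	/* one more trip around */
				o->flags &= ~REFERENCED;
				lru_del(o);
				lru_add(o);
				continue;
			}
			lru_del(o);
			printf("reclaimed %s\n", o->name);
		}
	}

	int main(void)
	{
		struct obj a = { .name = "a" };

		obj_get(&a);
		obj_put(&a);	/* parks "a" on the LRU, REFERENCED set */
		prune(1);	/* first pass only clears the flag */
		prune(1);	/* second pass reclaims "a" */
		return 0;
	}
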
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
parent cffbc8aa
@@ -408,16 +408,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * completion.
 			 */
 			redirty_tail(inode);
-		} else if (atomic_read(&inode->i_count)) {
-			/*
-			 * The inode is clean, inuse
-			 */
-			list_move(&inode->i_list, &inode_in_use);
 		} else {
 			/*
-			 * The inode is clean, unused
+			 * The inode is clean. At this point we either have
+			 * a reference to the inode or it's on its way out.
+			 * No need to add it back to the LRU.
 			 */
-			list_move(&inode->i_list, &inode_unused);
+			list_del_init(&inode->i_list);
 		}
 	}
 	inode_sync_complete(inode);
@@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly;
  * allowing for low-overhead inode sync() operations.
  */
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_unused);
 
 static struct hlist_head *inode_hashtable __read_mostly;
 
 /*
@@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
+	INIT_LIST_HEAD(&inode->i_list);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -317,12 +317,23 @@ static void init_once(void *foo)
  */
 void __iget(struct inode *inode)
 {
-	if (atomic_inc_return(&inode->i_count) != 1)
-		return;
+	atomic_inc(&inode->i_count);
+}
 
-	if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-		list_move(&inode->i_list, &inode_in_use);
-	percpu_counter_dec(&nr_inodes_unused);
+static void inode_lru_list_add(struct inode *inode)
+{
+	if (list_empty(&inode->i_list)) {
+		list_add(&inode->i_list, &inode_unused);
+		percpu_counter_inc(&nr_inodes_unused);
+	}
+}
+
+static void inode_lru_list_del(struct inode *inode)
+{
+	if (!list_empty(&inode->i_list)) {
+		list_del_init(&inode->i_list);
+		percpu_counter_dec(&nr_inodes_unused);
+	}
 }
 
 void end_writeback(struct inode *inode)
@@ -367,7 +378,7 @@ static void dispose_list(struct list_head *head)
 		struct inode *inode;
 
 		inode = list_first_entry(head, struct inode, i_list);
-		list_del(&inode->i_list);
+		list_del_init(&inode->i_list);
 
 		evict(inode);
@@ -413,7 +424,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 			list_move(&inode->i_list, dispose);
 			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
-			percpu_counter_dec(&nr_inodes_unused);
+			if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+				percpu_counter_dec(&nr_inodes_unused);
 			continue;
 		}
 		busy = 1;
@@ -448,7 +460,7 @@ int invalidate_inodes(struct super_block *sb)
 static int can_unuse(struct inode *inode)
 {
-	if (inode->i_state)
+	if (inode->i_state & ~I_REFERENCED)
 		return 0;
 	if (inode_has_buffers(inode))
 		return 0;
@@ -460,17 +472,20 @@ static int can_unuse(struct inode *inode)
 }
 
 /*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
  *
  * Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. We expect the final iput() on that inode to add it to
- * the front of the inode_unused list. So look for it there and if the
- * inode is still freeable, proceed. The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed. If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
  *
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because we
+ * are doing lazy LRU updates to minimise lock contention, so the LRU does not
+ * have strict ordering. Hence we don't want to reclaim inodes with this flag
+ * set because they are the inodes that are out of order.
  */
 static void prune_icache(int nr_to_scan)
 {
@@ -488,8 +503,21 @@ static void prune_icache(int nr_to_scan)
 
 		inode = list_entry(inode_unused.prev, struct inode, i_list);
 
-		if (inode->i_state || atomic_read(&inode->i_count)) {
+		/*
+		 * Referenced or dirty inodes are still in use. Give them
+		 * another pass through the LRU as we cannot reclaim them now.
+		 */
+		if (atomic_read(&inode->i_count) ||
+		    (inode->i_state & ~I_REFERENCED)) {
+			list_del_init(&inode->i_list);
+			percpu_counter_dec(&nr_inodes_unused);
+			continue;
+		}
+
+		/* recently referenced inodes get one more pass */
+		if (inode->i_state & I_REFERENCED) {
 			list_move(&inode->i_list, &inode_unused);
+			inode->i_state &= ~I_REFERENCED;
 			continue;
 		}
 		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -620,7 +648,6 @@ static inline void
 __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 			struct inode *inode)
 {
-	list_add(&inode->i_list, &inode_in_use);
 	list_add(&inode->i_sb_list, &sb->s_inodes);
 	if (head)
 		hlist_add_head(&inode->i_hash, head);
@@ -1237,10 +1264,11 @@ static void iput_final(struct inode *inode)
 	drop = generic_drop_inode(inode);
 
 	if (!drop) {
-		if (!(inode->i_state & (I_DIRTY|I_SYNC)))
-			list_move(&inode->i_list, &inode_unused);
-		percpu_counter_inc(&nr_inodes_unused);
 		if (sb->s_flags & MS_ACTIVE) {
+			inode->i_state |= I_REFERENCED;
+			if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+				inode_lru_list_add(inode);
+			}
 			spin_unlock(&inode_lock);
 			return;
 		}
@@ -1251,13 +1279,19 @@ static void iput_final(struct inode *inode)
 		spin_lock(&inode_lock);
 		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
-		percpu_counter_dec(&nr_inodes_unused);
 		hlist_del_init(&inode->i_hash);
 	}
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
 	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
+
+	/*
+	 * After we delete the inode from the LRU here, we avoid moving dirty
+	 * inodes back onto the LRU now because I_FREEING is set and hence
+	 * writeback_single_inode() won't move the inode around.
+	 */
+	inode_lru_list_del(inode);
+	list_del_init(&inode->i_sb_list);
+
 	spin_unlock(&inode_lock);
 	evict(inode);
 	spin_lock(&inode_lock);
@@ -1641,16 +1641,17 @@ struct super_operations {
  *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
-#define I_DIRTY_SYNC		1
-#define I_DIRTY_DATASYNC	2
-#define I_DIRTY_PAGES		4
+#define I_DIRTY_SYNC		(1 << 0)
+#define I_DIRTY_DATASYNC	(1 << 1)
+#define I_DIRTY_PAGES		(1 << 2)
 #define __I_NEW			3
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		16
-#define I_FREEING		32
-#define I_CLEAR			64
+#define I_WILL_FREE		(1 << 4)
+#define I_FREEING		(1 << 5)
+#define I_CLEAR			(1 << 6)
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define I_REFERENCED		(1 << 8)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
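The flag rework in the hunk above is worth a second look: rewriting the
constants in (1 << n) form changes no existing value, and the only
functional addition is I_REFERENCED in the previously unused bit 8. A
small standalone C check (not part of the patch, just an editor's
illustration) makes that explicit:

	#define I_DIRTY_SYNC		(1 << 0)
	#define I_DIRTY_DATASYNC	(1 << 1)
	#define I_DIRTY_PAGES		(1 << 2)
	#define __I_NEW			3
	#define I_NEW			(1 << __I_NEW)
	#define I_WILL_FREE		(1 << 4)
	#define I_FREEING		(1 << 5)
	#define I_CLEAR			(1 << 6)
	#define __I_SYNC		7
	#define I_SYNC			(1 << __I_SYNC)
	#define I_REFERENCED		(1 << 8)

	int main(void)
	{
		/* every pre-existing flag keeps its old numeric value */
		_Static_assert(I_DIRTY_SYNC == 1 && I_DIRTY_DATASYNC == 2 &&
			       I_DIRTY_PAGES == 4 && I_NEW == 8 &&
			       I_WILL_FREE == 16 && I_FREEING == 32 &&
			       I_CLEAR == 64 && I_SYNC == 128,
			       "the (1 << n) rewrite changes no values");
		/* the new flag takes the next free bit */
		_Static_assert(I_REFERENCED == (1 << 8), "I_REFERENCED is bit 8");
		return 0;
	}
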
@@ -10,8 +10,6 @@
 struct backing_dev_info;
 
 extern spinlock_t inode_lock;
-extern struct list_head inode_in_use;
-extern struct list_head inode_unused;
 
 /*
  * fs/fs-writeback.c