Commit d6686d54 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] Don't remove inode from hash until filesystem has

From: Neil Brown <neilb@cse.unsw.edu.au>

When an NFS request arrives, it contains a filehandle which needs to be
converted to a dentry.  Many filesystems use find_exported_dentry in
fs/exportfs/expfs.c.  A key part of this on filesystem where a 32bit inode
number uniquely locates a file is export_iget which calls iget(sb, inum).

iget will either:

   1/ find the inode in the inode cache and return it

 or

   2/ create a new inode and call ->read_inode to load it from the
      storage device.

export_iget then verifies the inode is really a good inode (->read_inode
didn't detect any problems) and the right inode (base on generation number
from the file handle).

For this to work reliably, it is important that whenever an inode is *not* in
the cache, the on-device version is up-to-date.  Otherwise, when read_inode
loads the inode it will get bad data.

For a file that has not been deleted, this condition always holds: a dirty
inode is always flushed to disc before the inode is unhashed.

However for a file that is being deleted this condition doesn't (didn't)
hold.  When iput -> iput_final -> generic_drop_inode -> generic_delete_inode
is called we would unhash the inode before calling into the filesytem through
->delete_inode.

So there is a small window between when generic_delete_inode unhashes the
inode, and when ->delete_inode writes something to disc, where a call to
->read_inode (for export_iget) might discover what it thinks is a valid
inode, but is really one that is in the process of being destroyed.

It is this window that I want to close by moving the unhashing to the end of
generic_delete_inode.
parent 2eb4051e
...@@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) ...@@ -90,7 +90,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
* Only add valid (hashed) inodes to the superblock's * Only add valid (hashed) inodes to the superblock's
* dirty list. Add blockdev inodes as well. * dirty list. Add blockdev inodes as well.
*/ */
if (hlist_unhashed(&inode->i_hash) && !S_ISBLK(inode->i_mode)) if ((hlist_unhashed(&inode->i_hash) || (inode->i_state & (I_FREEING|I_CLEAR)))
&& !S_ISBLK(inode->i_mode))
goto out; goto out;
/* /*
......
...@@ -470,6 +470,7 @@ static int shrink_icache_memory(int nr, unsigned int gfp_mask) ...@@ -470,6 +470,7 @@ static int shrink_icache_memory(int nr, unsigned int gfp_mask)
return inodes_stat.nr_unused; return inodes_stat.nr_unused;
} }
static void __wait_on_freeing_inode(struct inode *inode);
/* /*
* Called with the inode lock held. * Called with the inode lock held.
* NOTE: we are not increasing the inode-refcount, you must call __iget() * NOTE: we are not increasing the inode-refcount, you must call __iget()
...@@ -481,6 +482,7 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea ...@@ -481,6 +482,7 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea
struct hlist_node *node; struct hlist_node *node;
struct inode * inode = NULL; struct inode * inode = NULL;
repeat:
hlist_for_each (node, head) { hlist_for_each (node, head) {
prefetch(node->next); prefetch(node->next);
inode = hlist_entry(node, struct inode, i_hash); inode = hlist_entry(node, struct inode, i_hash);
...@@ -488,6 +490,10 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea ...@@ -488,6 +490,10 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea
continue; continue;
if (!test(inode, data)) if (!test(inode, data))
continue; continue;
if (inode->i_state & (I_FREEING|I_CLEAR)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
break; break;
} }
return node ? inode : NULL; return node ? inode : NULL;
...@@ -502,6 +508,7 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head ...@@ -502,6 +508,7 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head
struct hlist_node *node; struct hlist_node *node;
struct inode * inode = NULL; struct inode * inode = NULL;
repeat:
hlist_for_each (node, head) { hlist_for_each (node, head) {
prefetch(node->next); prefetch(node->next);
inode = list_entry(node, struct inode, i_hash); inode = list_entry(node, struct inode, i_hash);
...@@ -509,6 +516,10 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head ...@@ -509,6 +516,10 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head
continue; continue;
if (inode->i_sb != sb) if (inode->i_sb != sb)
continue; continue;
if (inode->i_state & (I_FREEING|I_CLEAR)) {
__wait_on_freeing_inode(inode);
goto repeat;
}
break; break;
} }
return node ? inode : NULL; return node ? inode : NULL;
...@@ -937,11 +948,22 @@ void remove_inode_hash(struct inode *inode) ...@@ -937,11 +948,22 @@ void remove_inode_hash(struct inode *inode)
spin_unlock(&inode_lock); spin_unlock(&inode_lock);
} }
/*
* Tell the filesystem that this inode is no longer of any interest and should
* be completely destroyed.
*
* We leave the inode in the inode hash table until *after* the filesystem's
* ->delete_inode completes. This ensures that an iget (such as nfsd might
* instigate) will always find up-to-date information either in the hash or on
* disk.
*
* I_FREEING is set so that no-one will take a new reference to the inode while
* it is being deleted.
*/
void generic_delete_inode(struct inode *inode) void generic_delete_inode(struct inode *inode)
{ {
struct super_operations *op = inode->i_sb->s_op; struct super_operations *op = inode->i_sb->s_op;
hlist_del_init(&inode->i_hash);
list_del_init(&inode->i_list); list_del_init(&inode->i_list);
inode->i_state|=I_FREEING; inode->i_state|=I_FREEING;
inodes_stat.nr_inodes--; inodes_stat.nr_inodes--;
...@@ -960,6 +982,10 @@ void generic_delete_inode(struct inode *inode) ...@@ -960,6 +982,10 @@ void generic_delete_inode(struct inode *inode)
delete(inode); delete(inode);
} else } else
clear_inode(inode); clear_inode(inode);
spin_lock(&inode_lock);
hlist_del_init(&inode->i_hash);
spin_unlock(&inode_lock);
wake_up_inode(inode);
if (inode->i_state != I_CLEAR) if (inode->i_state != I_CLEAR)
BUG(); BUG();
destroy_inode(inode); destroy_inode(inode);
...@@ -1233,6 +1259,32 @@ void __wait_on_inode(struct inode *inode) ...@@ -1233,6 +1259,32 @@ void __wait_on_inode(struct inode *inode)
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
} }
/*
* If we try to find an inode in the inode hash while it is being deleted, we
* have to wait until the filesystem completes its deletion before reporting
* that it isn't found. This is because iget will immediately call
* ->read_inode, and we want to be sure that evidence of the deletion is found
* by ->read_inode.
*
* This call might return early if an inode which shares the waitq is woken up.
* This is most easily handled by the caller which will loop around again
* looking for the inode.
*
* This is called with inode_lock held.
*/
static void __wait_on_freeing_inode(struct inode *inode)
{
DECLARE_WAITQUEUE(wait, current);
wait_queue_head_t *wq = i_waitq_head(inode);
add_wait_queue(wq, &wait);
set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&inode_lock);
schedule();
remove_wait_queue(wq, &wait);
spin_lock(&inode_lock);
}
void wake_up_inode(struct inode *inode) void wake_up_inode(struct inode *inode)
{ {
wait_queue_head_t *wq = i_waitq_head(inode); wait_queue_head_t *wq = i_waitq_head(inode);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment