afs: Overhaul the callback handling

Overhaul the AFS callback handling by the following means: (1) Don't give up callback promises on vnodes that we are no longer using, rather let them just expire on the server or let the server break them. This is actually more efficient for the server as the callback lookup is expensive if there are lots of extant callbacks. (2) Only give up the callback promises we have from a server when the server record is destroyed. Then we can just give up *all* the callback promises on it in one go. (3) Servers can end up being shared between cells if cells are aliased, so don't add all the vnodes being backed by a particular server into a big FID-indexed tree on that server as there may be duplicates. Instead have each volume instance (~= superblock) register an interest in a server as it starts to make use of it and use this to allow the processor for callbacks from the server to find the superblock and thence the inode corresponding to the FID being broken by means of ilookup_nowait(). (4) Rather than iterating over the entire callback list when a mass-break comes in from the server, maintain a counter of mass-breaks in afs_server (cb_seq) and make afs_validate() check it against the copy in afs_vnode. It would be nice not to have to take a read_lock whilst doing this, but that's tricky without using RCU. (5) Save a ref on the fileserver we're using for a call in the afs_call struct so that we can access its cb_s_break during call decoding. (6) Write-lock around callback and status storage in a vnode and read-lock around getattr so that we don't see the status mid-update. This has the following consequences: (1) Data invalidation isn't seen until someone calls afs_validate() on a vnode. Unfortunately, we need to use a key to query the server, but getting one from a background thread is tricky without caching loads of keys all over the place. (2) Mass invalidation isn't seen until someone calls afs_validate(). (3) Callback breaking is going to hit the inode_hash_lock quite a bit. Could this be replaced with rcu_read_lock() since inodes are destroyed under RCU conditions. Signed-off-by: David Howells <dhowells@redhat.com>

afs: Overhaul the callback handling
Overhaul the AFS callback handling by the following means: (1) Don't give up callback promises on vnodes that we are no longer using, rather let them just expire on the server or let the server break them. This is actually more efficient for the server as the callback lookup is expensive if there are lots of extant callbacks. (2) Only give up the callback promises we have from a server when the server record is destroyed. Then we can just give up *all* the callback promises on it in one go. (3) Servers can end up being shared between cells if cells are aliased, so don't add all the vnodes being backed by a particular server into a big FID-indexed tree on that server as there may be duplicates. Instead have each volume instance (~= superblock) register an interest in a server as it starts to make use of it and use this to allow the processor for callbacks from the server to find the superblock and thence the inode corresponding to the FID being broken by means of ilookup_nowait(). (4) Rather than iterating over the entire callback list when a mass-break comes in from the server, maintain a counter of mass-breaks in afs_server (cb_seq) and make afs_validate() check it against the copy in afs_vnode. It would be nice not to have to take a read_lock whilst doing this, but that's tricky without using RCU. (5) Save a ref on the fileserver we're using for a call in the afs_call struct so that we can access its cb_s_break during call decoding. (6) Write-lock around callback and status storage in a vnode and read-lock around getattr so that we don't see the status mid-update. This has the following consequences: (1) Data invalidation isn't seen until someone calls afs_validate() on a vnode. Unfortunately, we need to use a key to query the server, but getting one from a background thread is tricky without caching loads of keys all over the place. (2) Mass invalidation isn't seen until someone calls afs_validate(). (3) Callback breaking is going to hit the inode_hash_lock quite a bit. Could this be replaced with rcu_read_lock() since inodes are destroyed under RCU conditions. Signed-off-by: David Howells <dhowells@redhat.com>
c435ee34 · David Howells · d0676a16 · c435ee34 · c435ee34 · c435ee34
Commit c435ee34 authored Nov 02, 2017 by David Howells
14 changed files
--- a/fs/afs/afs_fs.h
+++ b/fs/afs/afs_fs.h
@@ -37,6 +37,7 @@ enum AFS_FS_Operations {
 	FSLOOKUP		= 161,	/* AFS lookup file in directory */
 	FSFETCHDATA64		= 65537, /* AFS Fetch file data */
 	FSSTOREDATA64		= 65538, /* AFS Store file data */
+	FSGIVEUPALLCALLBACKS	= 65539, /* AFS Give up all outstanding callbacks on a server */
 };

 enum AFS_FS_Errors {

--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -153,7 +153,7 @@ static void afs_cm_destructor(struct afs_call *call)
 }

 /*
- * allow the fileserver to see if the cache manager is still alive
+ * The server supplied a list of callbacks that it wanted to break.
 */
 static void SRXAFSCB_CallBack(struct work_struct *work)
 {

--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -581,6 +581,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	struct afs_vnode *vnode, *dir;
 	struct afs_fid uninitialized_var(fid);
 	struct dentry *parent;
+	struct inode *inode;
 	struct key *key;
 	void *dir_version;
 	int ret;
@@ -588,30 +589,39 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;

+	if (d_really_is_positive(dentry)) {
 		vnode = AFS_FS_I(d_inode(dentry));
-
-	if (d_really_is_positive(dentry))
 		_enter("{v={%x:%u} n=%pd fl=%lx},",
 		       vnode->fid.vid, vnode->fid.vnode, dentry,
 		       vnode->flags);
-	else
+	} else {
 		_enter("{neg n=%pd}", dentry);
+	}

 	key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
 	if (IS_ERR(key))
 		key = NULL;

+	if (d_really_is_positive(dentry)) {
+		inode = d_inode(dentry);
+		if (inode) {
+			vnode = AFS_FS_I(inode);
+			afs_validate(vnode, key);
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+				goto out_bad;
+		}
+	}
+
 	/* lock down the parent dentry so we can peer at it */
 	parent = dget_parent(dentry);
 	dir = AFS_FS_I(d_inode(parent));

 	/* validate the parent directory */
-	if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
 	afs_validate(dir, key);

 	if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
 		_debug("%pd: parent dir deleted", dentry);
-		goto out_bad;
+		goto out_bad_parent;
 	}

 	dir_version = (void *) (unsigned long) dir->status.data_version;
@@ -626,13 +636,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	case 0:
 		/* the filename maps to something */
 		if (d_really_is_negative(dentry))
-			goto out_bad;
-		if (is_bad_inode(d_inode(dentry))) {
+			goto out_bad_parent;
+		inode = d_inode(dentry);
+		if (is_bad_inode(inode)) {
 			printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
 			       dentry);
-			goto out_bad;
+			goto out_bad_parent;
 		}

+		vnode = AFS_FS_I(inode);
+
 		/* if the vnode ID has changed, then the dirent points to a
 		 * different file */
 		if (fid.vnode != vnode->fid.vnode) {
@@ -649,10 +662,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 			_debug("%pd: file deleted (uq %u -> %u I:%u)",
 			       dentry, fid.unique,
 			       vnode->fid.unique,
-			       d_inode(dentry)->i_generation);
-			spin_lock(&vnode->lock);
+			       vnode->vfs_inode.i_generation);
+			write_seqlock(&vnode->cb_lock);
 			set_bit(AFS_VNODE_DELETED, &vnode->flags);
-			spin_unlock(&vnode->lock);
+			write_sequnlock(&vnode->cb_lock);
 			goto not_found;
 		}
 		goto out_valid;
@@ -667,7 +680,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	default:
 		_debug("failed to iterate dir %pd: %d",
 		       parent, ret);
-		goto out_bad;
+		goto out_bad_parent;
 	}

 out_valid:
@@ -683,9 +696,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
 	spin_unlock(&dentry->d_lock);

-out_bad:
+out_bad_parent:
 	_debug("dropping dentry %pd2", dentry);
 	dput(parent);
+out_bad:
 	key_put(key);

 	_leave(" = 0 [bad]");
@@ -820,7 +834,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 		vnode = AFS_FS_I(d_inode(dentry));
 		clear_nlink(&vnode->vfs_inode);
 		set_bit(AFS_VNODE_DELETED, &vnode->flags);
-		afs_discard_callback_on_delete(vnode);
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 	}

 	key_put(key);
@@ -884,9 +898,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		vnode = AFS_FS_I(d_inode(dentry));
 		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 			_debug("AFS_VNODE_DELETED");
-		if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
-			_debug("AFS_VNODE_CB_BROKEN");
-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 		ret = afs_validate(vnode, key);
 		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
 	}

--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -243,7 +243,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)

 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
-	ret = afs_vnode_fetch_status(vnode, NULL, key);
+	ret = afs_validate(vnode, key);
 	if (ret < 0)
 		goto error;

@@ -383,7 +383,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	/* again, make sure we've got a callback on this file and, again, make
 	 * sure that our view of the data version is up to date (we ignore
 	 * errors incurred here and deal with the consequences elsewhere) */
-	afs_vnode_fetch_status(vnode, NULL, key);
+	afs_vnode_fetch_status(vnode, NULL, key, false);

 error:
 	spin_unlock(&inode->i_lock);
@@ -455,7 +455,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 	posix_test_lock(file, fl);
 	if (fl->fl_type == F_UNLCK) {
 		/* no local locks; consult the server */
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		ret = afs_vnode_fetch_status(vnode, NULL, key, true);
 		if (ret < 0)
 			goto error;
 		lock_count = vnode->status.lock_count;

--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -22,6 +22,11 @@
 */
 static u8 afs_discard_buffer[64];

+static inline void afs_use_fs_server(struct afs_call *call, struct afs_server *server)
+{
+	call->server = afs_get_server(server);
+}
+
 /*
 * decode an AFSFid block
 */
@@ -47,14 +52,17 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 	const __be32 *bp = *_bp;
 	umode_t mode;
 	u64 data_version, size;
-	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+	bool changed = false;
 	kuid_t owner;
 	kgid_t group;

+	write_seqlock(&vnode->cb_lock);
+
 #define EXTRACT(DST)				\
 	do {					\
 		u32 x = ntohl(*bp++);		\
-		changed |= DST - x;		\
+		if (DST != x)			\
+			changed |= true;	\
 		DST = x;			\
 	} while (0)

@@ -127,25 +135,39 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
 			_debug("vnode modified %llx on {%x:%u}",
 			       (unsigned long long) data_version,
 			       vnode->fid.vid, vnode->fid.vnode);
-			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
 			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
 		}
 	} else if (store_version) {
 		status->data_version = data_version;
 	}
+
+	write_sequnlock(&vnode->cb_lock);
 }

 /*
 * decode an AFSCallBack block
 */
-static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+static void xdr_decode_AFSCallBack(struct afs_call *call,
+				   struct afs_vnode *vnode,
+				   const __be32 **_bp)
 {
 	const __be32 *bp = *_bp;
+	u32 cb_expiry;
+
+	write_seqlock(&vnode->cb_lock);

+	if (call->cb_break == (vnode->cb_break + call->server->cb_s_break)) {
 		vnode->cb_version	= ntohl(*bp++);
-	vnode->cb_expiry	= ntohl(*bp++);
+		cb_expiry		= ntohl(*bp++);
 		vnode->cb_type		= ntohl(*bp++);
-	vnode->cb_expires	= vnode->cb_expiry + ktime_get_real_seconds();
+		vnode->cb_expires_at	= cb_expiry + ktime_get_real_seconds();
+		set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+	} else {
+		bp += 3;
+	}
+
+	write_sequnlock(&vnode->cb_lock);
 	*_bp = bp;
 }

@@ -247,16 +269,16 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
 	const __be32 *bp;
 	int ret;

-	_enter("");
-
 	ret = afs_transfer_reply(call);
 	if (ret < 0)
 		return ret;

+	_enter("{%x:%u}", vnode->fid.vid, vnode->fid.vnode);
+
 	/* unmarshall the reply once we've received all of it */
 	bp = call->buffer;
 	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-	xdr_decode_AFSCallBack(&bp, vnode);
+	xdr_decode_AFSCallBack(call, vnode, &bp);
 	if (call->reply[1])
 		xdr_decode_AFSVolSync(&bp, call->reply[1]);

@@ -304,6 +326,8 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);

+	call->cb_break = vnode->cb_break + server->cb_s_break;
+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -429,7 +453,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)

 		bp = call->buffer;
 		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
-		xdr_decode_AFSCallBack(&bp, vnode);
+		xdr_decode_AFSCallBack(call, vnode, &bp);
 		if (call->reply[1])
 			xdr_decode_AFSVolSync(&bp, call->reply[1]);

@@ -513,6 +537,8 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[7] = htonl(lower_32_bits(req->len));

 	atomic_inc(&req->usage);
+	call->cb_break = vnode->cb_break + server->cb_s_break;
+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -556,87 +582,8 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[5] = htonl(lower_32_bits(req->len));

 	atomic_inc(&req->usage);
-	return afs_make_call(&server->addr, call, GFP_NOFS, async);
-}
-
-/*
- * deliver reply data to an FS.GiveUpCallBacks
- */
-static int afs_deliver_fs_give_up_callbacks(struct afs_call *call)
-{
-	_enter("");
-
-	/* shouldn't be any reply data */
-	return afs_extract_data(call, NULL, 0, false);
-}
-
-/*
- * FS.GiveUpCallBacks operation type
- */
-static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
-	.name		= "FS.GiveUpCallBacks",
-	.deliver	= afs_deliver_fs_give_up_callbacks,
-	.destructor	= afs_flat_call_destructor,
-};
-
-/*
- * give up a set of callbacks
- * - the callbacks are held in the server->cb_break ring
- */
-int afs_fs_give_up_callbacks(struct afs_net *net,
-			     struct afs_server *server,
-			     bool async)
-{
-	struct afs_call *call;
-	size_t ncallbacks;
-	__be32 *bp, *tp;
-	int loop;
-
-	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
-			      ARRAY_SIZE(server->cb_break));
-
-	_enter("{%zu},", ncallbacks);
-
-	if (ncallbacks == 0)
-		return 0;
-	if (ncallbacks > AFSCBMAX)
-		ncallbacks = AFSCBMAX;
-
-	_debug("break %zu callbacks", ncallbacks);
-
-	call = afs_alloc_flat_call(net, &afs_RXFSGiveUpCallBacks,
-				   12 + ncallbacks * 6 * 4, 0);
-	if (!call)
-		return -ENOMEM;
-
-
-	/* marshall the parameters */
-	bp = call->request;
-	tp = bp + 2 + ncallbacks * 3;
-	*bp++ = htonl(FSGIVEUPCALLBACKS);
-	*bp++ = htonl(ncallbacks);
-	*tp++ = htonl(ncallbacks);
-
-	atomic_sub(ncallbacks, &server->cb_break_n);
-	for (loop = ncallbacks; loop > 0; loop--) {
-		struct afs_callback *cb =
-			&server->cb_break[server->cb_break_tail];
-
-		*bp++ = htonl(cb->fid.vid);
-		*bp++ = htonl(cb->fid.vnode);
-		*bp++ = htonl(cb->fid.unique);
-		*tp++ = htonl(cb->version);
-		*tp++ = htonl(cb->expiry);
-		*tp++ = htonl(cb->type);
-		smp_mb();
-		server->cb_break_tail =
-			(server->cb_break_tail + 1) &
-			(ARRAY_SIZE(server->cb_break) - 1);
-	}
-
-	ASSERT(ncallbacks > 0);
-	wake_up_nr(&server->cb_break_waitq, ncallbacks);
-
+	call->cb_break = vnode->cb_break + server->cb_s_break;
+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -731,6 +678,7 @@ int afs_fs_create(struct afs_server *server,
 	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
 	*bp++ = 0; /* segment size */

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -809,6 +757,7 @@ int afs_fs_remove(struct afs_server *server,
 		bp = (void *) bp + padsz;
 	}

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -892,6 +841,7 @@ int afs_fs_link(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -994,6 +944,7 @@ int afs_fs_symlink(struct afs_server *server,
 	*bp++ = htonl(S_IRWXUGO); /* unix mode */
 	*bp++ = 0; /* segment size */

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1095,6 +1046,7 @@ int afs_fs_rename(struct afs_server *server,
 		bp = (void *) bp + n_padsz;
 	}

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1196,6 +1148,7 @@ static int afs_fs_store_data64(struct afs_server *server,
 	*bp++ = htonl(i_size >> 32);
 	*bp++ = htonl((u32) i_size);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1269,6 +1222,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	*bp++ = htonl(size);
 	*bp++ = htonl(i_size);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1366,6 +1320,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
 	*bp++ = htonl((u32) attr->ia_size);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1413,6 +1368,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 	*bp++ = 0;				/* size of write */
 	*bp++ = htonl(attr->ia_size);		/* new file length */

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1454,6 +1410,7 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,

 	xdr_encode_AFS_StoreStatus(&bp, attr);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1684,6 +1641,7 @@ int afs_fs_get_volume_status(struct afs_server *server,
 	bp[0] = htonl(FSGETVOLUMESTATUS);
 	bp[1] = htonl(vnode->fid.vid);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1766,6 +1724,7 @@ int afs_fs_set_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.unique);
 	*bp++ = htonl(type);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1797,6 +1756,7 @@ int afs_fs_extend_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);

+	afs_use_fs_server(call, server);
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }

@@ -1828,5 +1788,49 @@ int afs_fs_release_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);

+	afs_use_fs_server(call, server);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
+}
+
+/*
+ * Deliver reply data to an FS.GiveUpAllCallBacks operation.
+ */
+static int afs_deliver_fs_give_up_all_callbacks(struct afs_call *call)
+{
+	return afs_transfer_reply(call);
+}
+
+/*
+ * FS.GiveUpAllCallBacks operation type
+ */
+static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = {
+	.name		= "FS.GiveUpAllCallBacks",
+	.deliver	= afs_deliver_fs_give_up_all_callbacks,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * Flush all the callbacks we have on a server.
+ */
+int afs_fs_give_up_all_callbacks(struct afs_server *server,
+				 struct key *key,
+				 bool async)
+{
+	struct afs_call *call;
+	__be32 *bp;
+
+	_enter("");
+
+	call = afs_alloc_flat_call(server->net, &afs_RXFSGiveUpAllCallBacks, 2 * 4, 0);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSGIVEUPALLCALLBACKS);
+
+	/* Can't take a ref on server */
 	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -23,11 +23,6 @@
 #include <linux/namei.h>
 #include "internal.h"

-struct afs_iget_data {
-	struct afs_fid		fid;
-	struct afs_volume	*volume;	/* volume on which resides */
-};
-
 static const struct inode_operations afs_symlink_inode_operations = {
 	.get_link	= page_get_link,
 	.listxattr	= afs_listxattr,
@@ -39,6 +34,7 @@ static const struct inode_operations afs_symlink_inode_operations = {
 static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 {
 	struct inode *inode = AFS_VNODE_TO_I(vnode);
+	bool changed;

 	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
 	       vnode->status.type,
@@ -47,6 +43,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 	       vnode->status.data_version,
 	       vnode->status.mode);

+	read_seqlock_excl(&vnode->cb_lock);
+
 	switch (vnode->status.type) {
 	case AFS_FTYPE_FILE:
 		inode->i_mode	= S_IFREG | vnode->status.mode;
@@ -63,9 +61,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		if ((vnode->status.mode & 0777) == 0644) {
 			inode->i_flags |= S_AUTOMOUNT;

-			spin_lock(&vnode->lock);
 			set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
-			spin_unlock(&vnode->lock);

 			inode->i_mode	= S_IFDIR | 0555;
 			inode->i_op	= &afs_mntpt_inode_operations;
@@ -78,13 +74,11 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 		break;
 	default:
 		printk("kAFS: AFS vnode with undefined type\n");
+		read_sequnlock_excl(&vnode->cb_lock);
 		return -EBADMSG;
 	}

-#ifdef CONFIG_AFS_FSCACHE
-	if (vnode->status.size != inode->i_size)
-		fscache_attr_changed(vnode->cache);
-#endif
+	changed = (vnode->status.size != inode->i_size);

 	set_nlink(inode, vnode->status.nlink);
 	inode->i_uid		= vnode->status.owner;
@@ -97,13 +91,20 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 	inode->i_generation	= vnode->fid.unique;
 	inode->i_version	= vnode->status.data_version;
 	inode->i_mapping->a_ops	= &afs_fs_aops;
+
+	read_sequnlock_excl(&vnode->cb_lock);
+
+#ifdef CONFIG_AFS_FSCACHE
+	if (changed)
+		fscache_attr_changed(vnode->cache);
+#endif
 	return 0;
 }

 /*
 * iget5() comparator
 */
-static int afs_iget5_test(struct inode *inode, void *opaque)
+int afs_iget5_test(struct inode *inode, void *opaque)
 {
 	struct afs_iget_data *data = opaque;

@@ -237,8 +238,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,

 	if (!status) {
 		/* it's a remotely extant inode */
-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		ret = afs_vnode_fetch_status(vnode, NULL, key, true);
 		if (ret < 0)
 			goto bad_inode;
 	} else {
@@ -249,16 +249,16 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
 			/* it's a symlink we just created (the fileserver
 			 * didn't give us a callback) */
 			vnode->cb_version = 0;
-			vnode->cb_expiry = 0;
 			vnode->cb_type = 0;
-			vnode->cb_expires = ktime_get_real_seconds();
+			vnode->cb_expires_at = 0;
 		} else {
 			vnode->cb_version = cb->version;
-			vnode->cb_expiry = cb->expiry;
 			vnode->cb_type = cb->type;
-			vnode->cb_expires = vnode->cb_expiry +
-				ktime_get_real_seconds();
+			vnode->cb_expires_at = cb->expiry;
+			set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
 		}
+
+		vnode->cb_expires_at += ktime_get_real_seconds();
 	}

 	/* set up caching before mapping the status, as map-status reads the
@@ -320,25 +320,34 @@ void afs_zap_data(struct afs_vnode *vnode)
 */
 int afs_validate(struct afs_vnode *vnode, struct key *key)
 {
+	time64_t now = ktime_get_real_seconds();
+	bool valid = false;
 	int ret;

 	_enter("{v={%x:%u} fl=%lx},%x",
 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
 	       key_serial(key));

-	if (vnode->cb_promised &&
-	    !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
-	    !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
-	    !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
-		if (vnode->cb_expires < ktime_get_real_seconds() + 10) {
-			_debug("callback expired");
-			set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		} else {
-			goto valid;
+	/* Quickly check the callback state.  Ideally, we'd use read_seqbegin
+	 * here, but we have no way to pass the net namespace to the RCU
+	 * cleanup for the server record.
+	 */
+	read_seqlock_excl(&vnode->cb_lock);
+
+	if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+		if (vnode->cb_s_break != vnode->cb_interest->server->cb_s_break) {
+			vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
+		} else if (!test_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags) &&
+			   !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
+			   vnode->cb_expires_at - 10 > now) {
+				valid = true;
 		}
+	} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		valid = true;
 	}

-	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+	read_sequnlock_excl(&vnode->cb_lock);
+	if (valid)
 		goto valid;

 	mutex_lock(&vnode->validate_lock);
@@ -347,12 +356,16 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	 * a new promise - note that if the (parent) directory's metadata was
 	 * changed then the security may be different and we may no longer have
 	 * access */
-	if (!vnode->cb_promised ||
-	    test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+	if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		_debug("not promised");
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
-		if (ret < 0)
+		ret = afs_vnode_fetch_status(vnode, NULL, key, false);
+		if (ret < 0) {
+			if (ret == -ENOENT) {
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
+				ret = -ESTALE;
+			}
 			goto error_unlock;
+		}
 		_debug("new promise [fl=%lx]", vnode->flags);
 	}

@@ -367,7 +380,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
 		afs_zap_data(vnode);

-	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+	clear_bit(AFS_VNODE_DIR_MODIFIED, &vnode->flags);
 	mutex_unlock(&vnode->validate_lock);
 valid:
 	_leave(" = 0");
@@ -386,10 +399,17 @@ int afs_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int query_flags)
 {
 	struct inode *inode = d_inode(path->dentry);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	int seq = 0;

 	_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);

+	do {
+		read_seqbegin_or_lock(&vnode->cb_lock, &seq);
 		generic_fillattr(inode, stat);
+	} while (need_seqretry(&vnode->cb_lock, seq));
+
+	done_seqretry(&vnode->cb_lock, seq);
 	return 0;
 }

@@ -416,13 +436,10 @@ void afs_evict_inode(struct inode *inode)

 	vnode = AFS_FS_I(inode);

-	_enter("{%x:%u.%d} v=%u x=%u t=%u }",
+	_enter("{%x:%u.%d}",
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
-	       vnode->fid.unique,
-	       vnode->cb_version,
-	       vnode->cb_expiry,
-	       vnode->cb_type);
+	       vnode->fid.unique);

 	_debug("CLEAR INODE %p", inode);

@@ -431,18 +448,12 @@ void afs_evict_inode(struct inode *inode)
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);

-	afs_give_up_callback(vnode);
-
-	if (vnode->server) {
-		spin_lock(&vnode->server->fs_lock);
-		rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
-		spin_unlock(&vnode->server->fs_lock);
-		afs_put_server(afs_i2net(inode), vnode->server);
-		vnode->server = NULL;
+	if (vnode->cb_interest) {
+		afs_put_cb_interest(afs_i2net(inode), vnode->cb_interest);
+		vnode->cb_interest = NULL;
 	}

 	ASSERT(list_empty(&vnode->writebacks));
-	ASSERT(!vnode->cb_promised);

 #ifdef CONFIG_AFS_FSCACHE
 	fscache_relinquish_cookie(vnode->cache, 0);

--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -55,6 +55,11 @@ struct afs_mount_params {
 	struct key		*key;		/* key to use for secure mounting */
 };

+struct afs_iget_data {
+	struct afs_fid		fid;
+	struct afs_volume	*volume;	/* volume on which resides */
+};
+
 enum afs_call_state {
 	AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
 	AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
@@ -77,6 +82,7 @@ struct afs_call {
 	struct key		*key;		/* security for this call */
 	struct afs_net		*net;		/* The network namespace */
 	struct afs_server	*cm_server;	/* Server affected by incoming CM call */
+	struct afs_server	*server;	/* Server used by client call */
 	void			*request;	/* request data (first part) */
 	struct address_space	*mapping;	/* page set */
 	struct afs_writeback	*wb;		/* writeback being performed */
@@ -92,6 +98,7 @@ struct afs_call {
 	unsigned		request_size;	/* size of request data */
 	unsigned		reply_max;	/* maximum size of reply */
 	unsigned		first_offset;	/* offset into mapping[first] */
+	unsigned int		cb_break;	/* cb_break + cb_s_break before the call */
 	union {
 		unsigned	last_to;	/* amount of mapping[last] */
 		unsigned	count2;		/* count used in unmarshalling */
@@ -314,26 +321,31 @@ struct afs_server {
 	struct afs_cell		*cell;		/* cell in which server resides */
 	struct list_head	link;		/* link in cell's server list */
 	struct list_head	grave;		/* link in master graveyard list */
+
 	struct rb_node		master_rb;	/* link in master by-addr tree */
 	struct rw_semaphore	sem;		/* access lock */
+	unsigned long		flags;
+#define AFS_SERVER_NEW		0		/* New server, don't inc cb_s_break */

 	/* file service access */
-	struct rb_root		fs_vnodes;	/* vnodes backed by this server (ordered by FID) */
-	unsigned long		fs_act_jif;	/* time at which last activity occurred */
-	unsigned long		fs_dead_jif;	/* time at which no longer to be considered dead */
-	spinlock_t		fs_lock;	/* access lock */
 	int			fs_state;      	/* 0 or reason FS currently marked dead (-errno) */
+	spinlock_t		fs_lock;	/* access lock */

 	/* callback promise management */
-	struct rb_root		cb_promises;	/* vnode expiration list (ordered earliest first) */
-	struct delayed_work	cb_updater;	/* callback updater */
-	struct delayed_work	cb_break_work;	/* collected break dispatcher */
-	wait_queue_head_t	cb_break_waitq;	/* space available in cb_break waitqueue */
-	spinlock_t		cb_lock;	/* access lock */
-	struct afs_callback	cb_break[64];	/* ring of callbacks awaiting breaking */
-	atomic_t		cb_break_n;	/* number of pending breaks */
-	u8			cb_break_head;	/* head of callback breaking ring */
-	u8			cb_break_tail;	/* tail of callback breaking ring */
+	struct list_head	cb_interests;	/* List of superblocks using this server */
+	unsigned		cb_s_break;	/* Break-everything counter. */
+	rwlock_t		cb_break_lock;	/* Volume finding lock */
+};
+
+/*
+ * Interest by a superblock on a server.
+ */
+struct afs_cb_interest {
+	struct list_head	cb_link;	/* Link in server->cb_interests */
+	struct afs_server	*server;	/* Server on which this interest resides */
+	struct super_block	*sb;		/* Superblock on which inodes reside */
+	afs_volid_t		vid;		/* Volume ID to match */
+	refcount_t		usage;
 };

 /*
@@ -352,6 +364,7 @@ struct afs_volume {
 	unsigned short		nservers;	/* number of server slots filled */
 	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
 	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
+	struct afs_cb_interest	*cb_interests[8]; /* Interests on servers for callbacks */
 	struct rw_semaphore	server_sem;	/* lock for accessing current server */
 };

@@ -371,7 +384,6 @@ struct afs_vnode {
 	struct inode		vfs_inode;	/* the VFS's inode record */

 	struct afs_volume	*volume;	/* volume on which vnode resides */
-	struct afs_server	*server;	/* server currently supplying this file */
 	struct afs_fid		fid;		/* the file identifier for this inode */
 	struct afs_file_status	status;		/* AFS status info for this file */
 #ifdef CONFIG_AFS_FSCACHE
@@ -386,9 +398,9 @@ struct afs_vnode {
 	spinlock_t		writeback_lock;	/* lock for writebacks */
 	spinlock_t		lock;		/* waitqueue/flags lock */
 	unsigned long		flags;
-#define AFS_VNODE_CB_BROKEN	0		/* set if vnode's callback was broken */
+#define AFS_VNODE_CB_PROMISED	0		/* Set if vnode has a callback promise */
 #define AFS_VNODE_UNSET		1		/* set if vnode attributes not yet set */
-#define AFS_VNODE_MODIFIED	2		/* set if vnode's data modified */
+#define AFS_VNODE_DIR_MODIFIED	2		/* set if dir vnode's data modified */
 #define AFS_VNODE_ZAP_DATA	3		/* set if vnode's data should be invalidated */
 #define AFS_VNODE_DELETED	4		/* set if vnode deleted on server */
 #define AFS_VNODE_MOUNTPOINT	5		/* set if vnode is a mountpoint symlink */
@@ -408,15 +420,14 @@ struct afs_vnode {
 	struct key		*unlock_key;	/* key to be used in unlocking */

 	/* outstanding callback notification on this file */
-	struct rb_node		server_rb;	/* link in server->fs_vnodes */
-	struct rb_node		cb_promise;	/* link in server->cb_promises */
-	struct work_struct	cb_broken_work;	/* work to be done on callback break */
-	time64_t		cb_expires;	/* time at which callback expires */
-	time64_t		cb_expires_at;	/* time used to order cb_promise */
+	struct afs_cb_interest	*cb_interest;	/* Server on which this resides */
+	unsigned int		cb_s_break;	/* Mass break counter on ->server */
+	unsigned int		cb_break;	/* Break counter on vnode */
+	seqlock_t		cb_lock;	/* Lock for ->cb_interest, ->status, ->cb_*break */
+
+	time64_t		cb_expires_at;	/* time at which callback expires */
 	unsigned		cb_version;	/* callback version */
-	unsigned		cb_expiry;	/* callback expiry time */
 	afs_callback_type_t	cb_type;	/* type of callback */
-	bool			cb_promised;	/* true if promise still holds */
 };

 /*
@@ -463,16 +474,20 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def;
 /*
 * callback.c
 */
-extern struct workqueue_struct *afs_callback_update_worker;
-
 extern void afs_init_callback_state(struct afs_server *);
-extern void afs_broken_callback_work(struct work_struct *);
-extern void afs_break_callbacks(struct afs_server *, size_t,
-				struct afs_callback[]);
-extern void afs_discard_callback_on_delete(struct afs_vnode *);
-extern void afs_give_up_callback(struct afs_vnode *);
-extern void afs_dispatch_give_up_callbacks(struct work_struct *);
-extern void afs_flush_callback_breaks(struct afs_server *);
+extern void afs_break_callback(struct afs_vnode *);
+extern void afs_break_callbacks(struct afs_server *, size_t,struct afs_callback[]);
+
+extern int afs_register_server_cb_interest(struct afs_vnode *, struct afs_cb_interest **,
+					   struct afs_server *);
+extern void afs_put_cb_interest(struct afs_net *, struct afs_cb_interest *);
+extern void afs_clear_callback_interests(struct afs_net *, struct afs_volume *);
+
+static inline struct afs_cb_interest *afs_get_cb_interest(struct afs_cb_interest *cbi)
+{
+	refcount_inc(&cbi->usage);
+	return cbi;
+}

 /*
 * cell.c
@@ -560,10 +575,12 @@ extern int afs_fs_extend_lock(struct afs_server *, struct key *,
 			      struct afs_vnode *, bool);
 extern int afs_fs_release_lock(struct afs_server *, struct key *,
 			       struct afs_vnode *, bool);
+extern int afs_fs_give_up_all_callbacks(struct afs_server *, struct key *, bool);

 /*
 * inode.c
 */
+extern int afs_iget5_test(struct inode *, void *);
 extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
 				       struct key *);
 extern struct inode *afs_iget(struct super_block *, struct key *,
@@ -676,11 +693,11 @@ extern int afs_permission(struct inode *, int);
 */
 extern spinlock_t afs_server_peer_lock;

-#define afs_get_server(S)					\
-do {								\
-	_debug("GET SERVER %d", atomic_read(&(S)->usage));	\
-	atomic_inc(&(S)->usage);				\
-} while(0)
+static inline struct afs_server *afs_get_server(struct afs_server *server)
+{
+	atomic_inc(&server->usage);
+	return server;
+}

 extern void afs_server_timer(struct timer_list *);
 extern struct afs_server *afs_lookup_server(struct afs_cell *,
@@ -741,7 +758,7 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
 extern void afs_vnode_finalise_status_update(struct afs_vnode *,
 					     struct afs_server *);
 extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
-				  struct key *);
+				  struct key *, bool);
 extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
 				struct afs_read *);
 extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,

--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -123,10 +123,6 @@ static int __init afs_init(void)
 		alloc_workqueue("kafs_vlupdated", WQ_MEM_RECLAIM, 0);
 	if (!afs_vlocation_update_worker)
 		goto error_vl_up;
-	afs_callback_update_worker =
-		alloc_ordered_workqueue("kafs_callbackd", WQ_MEM_RECLAIM);
-	if (!afs_callback_update_worker)
-		goto error_callback;
 	afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
 	if (!afs_lock_manager)
 		goto error_lockmgr;
@@ -158,8 +154,6 @@ static int __init afs_init(void)
 #endif
 	destroy_workqueue(afs_lock_manager);
 error_lockmgr:
-	destroy_workqueue(afs_callback_update_worker);
-error_callback:
 	destroy_workqueue(afs_vlocation_update_worker);
 error_vl_up:
 	destroy_workqueue(afs_async_calls);
@@ -189,7 +183,6 @@ static void __exit afs_exit(void)
 	fscache_unregister_netfs(&afs_cache_netfs);
 #endif
 	destroy_workqueue(afs_lock_manager);
-	destroy_workqueue(afs_callback_update_worker);
 	destroy_workqueue(afs_vlocation_update_worker);
 	destroy_workqueue(afs_async_calls);
 	destroy_workqueue(afs_wq);

--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -115,6 +115,7 @@ void afs_clear_permits(struct afs_vnode *vnode)
 	mutex_lock(&vnode->permits_lock);
 	permits = vnode->permits;
 	RCU_INIT_POINTER(vnode->permits, NULL);
+	vnode->cb_break++;
 	mutex_unlock(&vnode->permits_lock);

 	if (permits)
@@ -264,8 +265,7 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
 		 * (the post-processing will cache the result on auth_vnode) */
 		_debug("no valid permit");

-		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-		ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
+		ret = afs_vnode_fetch_status(vnode, auth_vnode, key, true);
 		if (ret < 0) {
 			iput(&auth_vnode->vfs_inode);
 			*_access = 0;
@@ -304,14 +304,9 @@ int afs_permission(struct inode *inode, int mask)
 		return PTR_ERR(key);
 	}

-	/* if the promise has expired, we need to check the server again */
-	if (!vnode->cb_promised) {
-		_debug("not promised");
-		ret = afs_vnode_fetch_status(vnode, NULL, key);
+	ret = afs_validate(vnode, key);
 	if (ret < 0)
 		goto error;
-		_debug("new promise [fl=%lx]", vnode->flags);
-	}

 	/* check the permits to see if we've got one yet */
 	ret = afs_check_permit(vnode, key, &access);

--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -94,12 +94,8 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 		INIT_LIST_HEAD(&server->grave);
 		init_rwsem(&server->sem);
 		spin_lock_init(&server->fs_lock);
-		server->fs_vnodes = RB_ROOT;
-		server->cb_promises = RB_ROOT;
-		spin_lock_init(&server->cb_lock);
-		init_waitqueue_head(&server->cb_break_waitq);
-		INIT_DELAYED_WORK(&server->cb_break_work,
-				  afs_dispatch_give_up_callbacks);
+		INIT_LIST_HEAD(&server->cb_interests);
+		rwlock_init(&server->cb_break_lock);

 		server->addr = *addr;
 		afs_inc_servers_outstanding(cell->net);
@@ -258,8 +254,6 @@ void afs_put_server(struct afs_net *net, struct afs_server *server)
 		return;
 	}

-	afs_flush_callback_breaks(server);
-
 	spin_lock(&net->server_graveyard_lock);
 	if (atomic_read(&server->usage) == 0) {
 		list_move_tail(&server->grave, &net->server_graveyard);
@@ -277,15 +271,8 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
 {
 	_enter("%p", server);

-	ASSERTIF(server->cb_break_head != server->cb_break_tail,
-		 delayed_work_pending(&server->cb_break_work));
-
-	ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
-	ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
-	ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
-	ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
-
-	afs_put_cell(server->net, server->cell);
+	afs_fs_give_up_all_callbacks(server, NULL, false);
+	afs_put_cell(net, server->cell);
 	kfree(server);
 	afs_dec_servers_outstanding(net);
 }

--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -512,8 +512,12 @@ static struct dentry *afs_mount(struct file_system_type *fs_type,

 static void afs_kill_super(struct super_block *sb)
 {
-	struct afs_super_info *as = sb->s_fs_info;
+	struct afs_super_info *as = AFS_FS_S(sb);

+	/* Clear the callback interests (which will do ilookup5) before
+	 * deactivating the superblock.
+	 */
+	afs_clear_callback_interests(as->net, as->volume);
 	kill_anon_super(sb);
 	afs_destroy_sbi(as);
 }
@@ -536,7 +540,7 @@ static void afs_i_init_once(void *_vnode)
 	INIT_LIST_HEAD(&vnode->pending_locks);
 	INIT_LIST_HEAD(&vnode->granted_locks);
 	INIT_DELAYED_WORK(&vnode->lock_work, afs_lock_work);
-	INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
+	seqlock_init(&vnode->cb_lock);
 }

 /*
@@ -558,7 +562,6 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 	vnode->volume		= NULL;
 	vnode->update_cnt	= 0;
 	vnode->flags		= 1 << AFS_VNODE_UNSET;
-	vnode->cb_promised	= false;

 	_leave(" = %p", &vnode->vfs_inode);
 	return &vnode->vfs_inode;
@@ -582,7 +585,7 @@ static void afs_destroy_inode(struct inode *inode)

 	_debug("DESTROY INODE %p", inode);

-	ASSERTCMP(vnode->server, ==, NULL);
+	ASSERTCMP(vnode->cb_interest, ==, NULL);

 	call_rcu(&inode->i_rcu, afs_i_callback);
 	atomic_dec(&afs_count_active_inodes);

--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -16,189 +16,20 @@
 #include <linux/sched.h>
 #include "internal.h"

-#if 0
-static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
-				   int depth, char lr)
-{
-	struct afs_vnode *vnode;
-	bool bad = false;
-
-	if (!node)
-		return false;
-
-	if (node->rb_left)
-		bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
-
-	vnode = rb_entry(node, struct afs_vnode, cb_promise);
-	_debug("%c %*.*s%c%p {%d}",
-	       rb_is_red(node) ? 'R' : 'B',
-	       depth, depth, "", lr,
-	       vnode, vnode->cb_expires_at);
-	if (rb_parent(node) != parent) {
-		printk("BAD: %p != %p\n", rb_parent(node), parent);
-		bad = true;
-	}
-
-	if (node->rb_right)
-		bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
-
-	return bad;
-}
-
-static noinline void dump_tree(const char *name, struct afs_server *server)
-{
-	_enter("%s", name);
-	if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
-		BUG();
-}
-#endif
-
-/*
- * insert a vnode into the backing server's vnode tree
- */
-static void afs_install_vnode(struct afs_vnode *vnode,
-			      struct afs_server *server)
-{
-	struct afs_server *old_server = vnode->server;
-	struct afs_vnode *xvnode;
-	struct rb_node *parent, **p;
-
-	_enter("%p,%p", vnode, server);
-
-	if (old_server) {
-		spin_lock(&old_server->fs_lock);
-		rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
-		spin_unlock(&old_server->fs_lock);
-	}
-
-	afs_get_server(server);
-	vnode->server = server;
-	afs_put_server(afs_v2net(vnode), old_server);
-
-	/* insert into the server's vnode tree in FID order */
-	spin_lock(&server->fs_lock);
-
-	parent = NULL;
-	p = &server->fs_vnodes.rb_node;
-	while (*p) {
-		parent = *p;
-		xvnode = rb_entry(parent, struct afs_vnode, server_rb);
-		if (vnode->fid.vid < xvnode->fid.vid)
-			p = &(*p)->rb_left;
-		else if (vnode->fid.vid > xvnode->fid.vid)
-			p = &(*p)->rb_right;
-		else if (vnode->fid.vnode < xvnode->fid.vnode)
-			p = &(*p)->rb_left;
-		else if (vnode->fid.vnode > xvnode->fid.vnode)
-			p = &(*p)->rb_right;
-		else if (vnode->fid.unique < xvnode->fid.unique)
-			p = &(*p)->rb_left;
-		else if (vnode->fid.unique > xvnode->fid.unique)
-			p = &(*p)->rb_right;
-		else
-			BUG(); /* can't happen unless afs_iget() malfunctions */
-	}
-
-	rb_link_node(&vnode->server_rb, parent, p);
-	rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
-
-	spin_unlock(&server->fs_lock);
-	_leave("");
-}
-
 /*
- * insert a vnode into the promising server's update/expiration tree
- * - caller must hold vnode->lock
- */
-static void afs_vnode_note_promise(struct afs_vnode *vnode,
-				   struct afs_server *server)
-{
-	struct afs_server *old_server;
-	struct afs_vnode *xvnode;
-	struct rb_node *parent, **p;
-
-	_enter("%p,%p", vnode, server);
-
-	ASSERT(server != NULL);
-
-	old_server = vnode->server;
-	if (vnode->cb_promised) {
-		if (server == old_server &&
-		    vnode->cb_expires == vnode->cb_expires_at) {
-			_leave(" [no change]");
-			return;
-		}
-
-		spin_lock(&old_server->cb_lock);
-		if (vnode->cb_promised) {
-			_debug("delete");
-			rb_erase(&vnode->cb_promise, &old_server->cb_promises);
-			vnode->cb_promised = false;
-		}
-		spin_unlock(&old_server->cb_lock);
-	}
-
-	if (vnode->server != server)
-		afs_install_vnode(vnode, server);
-
-	vnode->cb_expires_at = vnode->cb_expires;
-	_debug("PROMISE on %p {%lu}",
-	       vnode, (unsigned long) vnode->cb_expires_at);
-
-	/* abuse an RB-tree to hold the expiration order (we may have multiple
-	 * items with the same expiration time) */
-	spin_lock(&server->cb_lock);
-
-	parent = NULL;
-	p = &server->cb_promises.rb_node;
-	while (*p) {
-		parent = *p;
-		xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
-		if (vnode->cb_expires_at < xvnode->cb_expires_at)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&vnode->cb_promise, parent, p);
-	rb_insert_color(&vnode->cb_promise, &server->cb_promises);
-	vnode->cb_promised = true;
-
-	spin_unlock(&server->cb_lock);
-	_leave("");
-}
-
-/*
- * handle remote file deletion by discarding the callback promise
+ * Handle remote file deletion.
 */
 static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
 {
-	struct afs_server *server;
+	struct afs_cb_interest *cbi = vnode->cb_interest;

-	_enter("{%p}", vnode->server);
+	_enter("{%p}", cbi);

 	set_bit(AFS_VNODE_DELETED, &vnode->flags);

-	server = vnode->server;
-	if (server) {
-		if (vnode->cb_promised) {
-			spin_lock(&server->cb_lock);
-			if (vnode->cb_promised) {
-				rb_erase(&vnode->cb_promise,
-					 &server->cb_promises);
-				vnode->cb_promised = false;
-			}
-			spin_unlock(&server->cb_lock);
-		}
-
-		spin_lock(&server->fs_lock);
-		rb_erase(&vnode->server_rb, &server->fs_vnodes);
-		spin_unlock(&server->fs_lock);
-
-		vnode->server = NULL;
-		afs_put_server(afs_v2net(vnode), server);
-	} else {
-		ASSERT(!vnode->cb_promised);
+	if (cbi) {
+		vnode->cb_interest = NULL;
+		afs_put_cb_interest(afs_v2net(vnode), cbi);
 	}

 	_leave("");
@@ -218,8 +49,6 @@ void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
 	_enter("%p,%p", vnode, server);

 	spin_lock(&vnode->lock);
-	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-	afs_vnode_note_promise(vnode, server);
 	vnode->update_cnt--;
 	ASSERTCMP(vnode->update_cnt, >=, 0);
 	spin_unlock(&vnode->lock);
@@ -238,8 +67,6 @@ static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)

 	spin_lock(&vnode->lock);

-	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
-
 	if (ret == -ENOENT) {
 		/* the file was deleted on the server */
 		_debug("got NOENT from server - marking file deleted");
@@ -261,8 +88,8 @@ static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
 *   - there are any outstanding ops that will fetch the status
 * - TODO implement local caching
 */
-int afs_vnode_fetch_status(struct afs_vnode *vnode,
-			   struct afs_vnode *auth_vnode, struct key *key)
+int afs_vnode_fetch_status(struct afs_vnode *vnode, struct afs_vnode *auth_vnode,
+			   struct key *key, bool force)
 {
 	struct afs_server *server;
 	unsigned long acl_order;
@@ -270,12 +97,13 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,

 	DECLARE_WAITQUEUE(myself, current);

-	_enter("%s,{%x:%u.%u}",
+	_enter("%s,{%x:%u.%u,S=%lx},%u",
 	       vnode->volume->vlocation->vldb.name,
-	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
+	       vnode->flags,
+	       force);

-	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
-	    vnode->cb_promised) {
+	if (!force && test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		_leave(" [unchanged]");
 		return 0;
 	}
@@ -291,8 +119,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,

 	spin_lock(&vnode->lock);

-	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
-	    vnode->cb_promised) {
+	if (!force && test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
 		spin_unlock(&vnode->lock);
 		_leave(" [unchanged]");
 		return 0;
@@ -310,7 +137,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,

 		/* wait for the status to be updated */
 		for (;;) {
-			if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
+			if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
 				break;
 			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 				break;

--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -153,8 +153,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 error_discard:
 	up_write(&params->cell->vl_sem);

-	for (loop = volume->nservers - 1; loop >= 0; loop--)
+	for (loop = volume->nservers - 1; loop >= 0; loop--) {
+		afs_put_cb_interest(params->net, volume->cb_interests[loop]);
 		afs_put_server(params->net, volume->servers[loop]);
+	}

 	kfree(volume);
 	goto error;
@@ -197,8 +199,10 @@ void afs_put_volume(struct afs_cell *cell, struct afs_volume *volume)
 #endif
 	afs_put_vlocation(cell->net, vlocation);

-	for (loop = volume->nservers - 1; loop >= 0; loop--)
+	for (loop = volume->nservers - 1; loop >= 0; loop--) {
+		afs_put_cb_interest(cell->net, volume->cb_interests[loop]);
 		afs_put_server(cell->net, volume->servers[loop]);
+	}

 	kfree(volume);

@@ -218,10 +222,10 @@ struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
 	_enter("%s", volume->vlocation->vldb.name);

 	/* stick with the server we're already using if we can */
-	if (vnode->server && vnode->server->fs_state == 0) {
-		afs_get_server(vnode->server);
-		_leave(" = %p [current]", vnode->server);
-		return vnode->server;
+	if (vnode->cb_interest && vnode->cb_interest->server->fs_state == 0) {
+		afs_get_server(vnode->cb_interest->server);
+		_leave(" = %p [current]", vnode->cb_interest->server);
+		return vnode->cb_interest->server;
 	}

 	down_read(&volume->server_sem);
@@ -244,13 +248,8 @@ struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
 		_debug("consider %d [%d]", loop, state);

 		switch (state) {
-			/* found an apparently healthy server */
 		case 0:
-			afs_get_server(server);
-			up_read(&volume->server_sem);
-			_leave(" = %p (picked %pIS)",
-			       server, &server->addr.transport);
-			return server;
+			goto picked_server;

 		case -ENETUNREACH:
 			if (ret == 0)
@@ -284,9 +283,25 @@ struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
 	/* no available servers
 	 * - TODO: handle the no active servers case better
 	 */
+error:
 	up_read(&volume->server_sem);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
+
+picked_server:
+	/* Found an apparently healthy server.  We need to register an interest
+	 * in receiving callbacks before we talk to it.
+	 */
+	ret = afs_register_server_cb_interest(vnode,
+					      &volume->cb_interests[loop], server);
+	if (ret < 0)
+		goto error;
+	
+	afs_get_server(server);
+	up_read(&volume->server_sem);
+	_leave(" = %p (picked %pIS)",
+	       server, &server->addr.transport);
+	return server;
 }

 /*
@@ -309,14 +324,12 @@ int afs_volume_release_fileserver(struct afs_vnode *vnode,
 	switch (result) {
 		/* success */
 	case 0:
-		server->fs_act_jif = jiffies;
 		server->fs_state = 0;
 		_leave("");
 		return 1;

 		/* the fileserver denied all knowledge of the volume */
 	case -ENOMEDIUM:
-		server->fs_act_jif = jiffies;
 		down_write(&volume->server_sem);

 		/* firstly, find where the server is in the active list (if it
@@ -365,7 +378,6 @@ int afs_volume_release_fileserver(struct afs_vnode *vnode,
 		 */
 		spin_lock(&server->fs_lock);
 		if (!server->fs_state) {
-			server->fs_dead_jif = jiffies + HZ * 10;
 			server->fs_state = result;
 			printk("kAFS: SERVER DEAD state=%d\n", result);
 		}
@@ -374,7 +386,6 @@ int afs_volume_release_fileserver(struct afs_vnode *vnode,

 		/* miscellaneous error */
 	default:
-		server->fs_act_jif = jiffies;
 	case -ENOMEM:
 	case -ENONET:
 		/* tell the caller to accept the result */