Merge tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov: - a patch that removes crush_workspace_mutex (myself). CRUSH computations are no longer serialized and can run in parallel. - a couple new filesystem client metrics for "ceph fs top" command (Xiubo Li) - a fix for a very old messenger bug that affected the filesystem, marked for stable (myself) - assorted fixups and cleanups throughout the codebase from Jeff and others. * tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client: (27 commits) libceph: clear con->out_msg on Policy::stateful_server faults libceph: format ceph_entity_addr nonces as unsigned libceph: fix ENTITY_NAME format suggestion libceph: move a dout in queue_con_delay() ceph: comment cleanups and clarifications ceph: break up send_cap_msg ceph: drop separate mdsc argument from __send_cap ceph: promote to unsigned long long before shifting ceph: don't SetPageError on readpage errors ceph: mark ceph_fmt_xattr() as printf-like for better type checking ceph: fold ceph_update_writeable_page into ceph_write_begin ceph: fold ceph_sync_writepages into writepage_nounlock ceph: fold ceph_sync_readpages into ceph_readpage ceph: don't call ceph_update_writeable_page from page_mkwrite ceph: break out writeback of incompatible snap context to separate function ceph: add a note explaining session reject error string libceph: switch to the new "osd blocklist add" command libceph, rbd, ceph: "blacklist" -> "blocklist" ceph: have ceph_writepages_start call pagevec_lookup_range_tag ceph: use kill_anon_super helper ...

Merge tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: - a patch that removes crush_workspace_mutex (myself). CRUSH computations are no longer serialized and can run in parallel. - a couple new filesystem client metrics for "ceph fs top" command (Xiubo Li) - a fix for a very old messenger bug that affected the filesystem, marked for stable (myself) - assorted fixups and cleanups throughout the codebase from Jeff and others. * tag 'ceph-for-5.10-rc1' of git://github.com/ceph/ceph-client: (27 commits) libceph: clear con->out_msg on Policy::stateful_server faults libceph: format ceph_entity_addr nonces as unsigned libceph: fix ENTITY_NAME format suggestion libceph: move a dout in queue_con_delay() ceph: comment cleanups and clarifications ceph: break up send_cap_msg ceph: drop separate mdsc argument from __send_cap ceph: promote to unsigned long long before shifting ceph: don't SetPageError on readpage errors ceph: mark ceph_fmt_xattr() as printf-like for better type checking ceph: fold ceph_update_writeable_page into ceph_write_begin ceph: fold ceph_sync_writepages into writepage_nounlock ceph: fold ceph_sync_readpages into ceph_readpage ceph: don't call ceph_update_writeable_page from page_mkwrite ceph: break out writeback of incompatible snap context to separate function ceph: add a note explaining session reject error string libceph: switch to the new "osd blocklist add" command libceph, rbd, ceph: "blacklist" -> "blocklist" ceph: have ceph_writepages_start call pagevec_lookup_range_tag ceph: use kill_anon_super helper ...
ed7cfefe · Linus Torvalds · c4d6fe73 · 28e1581c · ed7cfefe · ed7cfefe
Commit ed7cfefe authored Oct 21, 2020 by Linus Torvalds
26 changed files
--- a/Documentation/filesystems/ceph.rst
+++ b/Documentation/filesystems/ceph.rst
@@ -163,14 +163,14 @@ Mount Options
        to the default VFS implementation if this option is used.
  recover_session=<no|clean>
-	Set auto reconnect mode in the case where the client is blacklisted. The
+	Set auto reconnect mode in the case where the client is blocklisted. The
 	available modes are "no" and "clean". The default is "no".
 	* no: never attempt to reconnect when client detects that it has been
-	  blacklisted. Operations will generally fail after being blacklisted.
+	  blocklisted. Operations will generally fail after being blocklisted.
 	* clean: client reconnects to the ceph cluster automatically when it
-	  detects that it has been blacklisted. During reconnect, client drops
+	  detects that it has been blocklisted. During reconnect, client drops
 	  dirty data/metadata, invalidates page caches and writable file handles.
 	  After reconnect, file locks become stale because the MDS loses track
 	  of them. If an inode contains any stale file locks, read/write on the

--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4010,10 +4010,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev)
 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
 			 ENTITY_NAME(lockers[0].id.name));
-		ret = ceph_monc_blacklist_add(&client->monc,
+		ret = ceph_monc_blocklist_add(&client->monc,
 					      &lockers[0].info.addr);
 		if (ret) {
-			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
+			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
 				 ENTITY_NAME(lockers[0].id.name), ret);
 			goto out;
 		}
@@ -4077,7 +4077,7 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
 	ret = rbd_try_lock(rbd_dev);
 	if (ret < 0) {
 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
-		if (ret == -EBLACKLISTED)
+		if (ret == -EBLOCKLISTED)
 			goto out;
 		ret = 1; /* request lock anyway */
@@ -4613,7 +4613,7 @@ static void rbd_reregister_watch(struct work_struct *work)
 	ret = __rbd_register_watch(rbd_dev);
 	if (ret) {
 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
-		if (ret != -EBLACKLISTED && ret != -ENOENT) {
+		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
 			queue_delayed_work(rbd_dev->task_wq,
 					   &rbd_dev->watch_dwork,
 					   RBD_RETRY_DELAY);

--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1222,36 +1222,27 @@ struct cap_msg_args {
 };
 /*
- * Build and send a cap message to the given MDS.
+ * cap struct size + flock buffer size + inline version + inline data size +
- *
+ * osd_epoch_barrier + oldest_flush_tid
- * Caller should be holding s_mutex.
 */
-static int send_cap_msg(struct cap_msg_args *arg)
+#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
+		      4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
+/* Marshal up the cap msg to the MDS */
+static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
 {
 	struct ceph_mds_caps *fc;
-	struct ceph_msg *msg;
 	void *p;
-	size_t extra_len;
 	struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
-	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+	dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n",
-	     " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
+	     __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino,
-	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
+	     ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted),
-	     arg->cid, arg->ino, ceph_cap_string(arg->caps),
+	     ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq,
-	     ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
+	     arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows,
-	     arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
+	     arg->size, arg->max_size, arg->xattr_version,
-	     arg->mseq, arg->follows, arg->size, arg->max_size,
-	     arg->xattr_version,
 	     arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
-	/* flock buffer size + inline version + inline data size +
-	 * osd_epoch_barrier + oldest_flush_tid */
-	extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
-	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
-			   GFP_NOFS, false);
-	if (!msg)
-		return -ENOMEM;
 	msg->hdr.version = cpu_to_le16(10);
 	msg->hdr.tid = cpu_to_le64(arg->flush_tid);
@@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg)
 	/* Advisory flags (version 10) */
 	ceph_encode_32(&p, arg->flags);
-	ceph_con_send(&arg->session->s_con, msg);
-	return 0;
 }
 /*
@@ -1454,25 +1442,25 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
 *
 * Caller should hold snap_rwsem (read), s_mutex.
 */
-static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
+static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
-		       struct ceph_inode_info *ci)
 {
+	struct ceph_msg *msg;
 	struct inode *inode = &ci->vfs_inode;
-	int ret;
-	ret = send_cap_msg(arg);
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
-	if (ret < 0) {
+	if (!msg) {
-		pr_err("error sending cap msg, ino (%llx.%llx) "
+		pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
-		       "flushing %s tid %llu, requeue\n",
 		       ceph_vinop(inode), ceph_cap_string(arg->dirty),
 		       arg->flush_tid);
 		spin_lock(&ci->i_ceph_lock);
-		__cap_delay_requeue(mdsc, ci);
+		__cap_delay_requeue(arg->session->s_mdsc, ci);
 		spin_unlock(&ci->i_ceph_lock);
+		return;
 	}
+	encode_cap_msg(msg, arg);
+	ceph_con_send(&arg->session->s_con, msg);
 	ceph_buffer_put(arg->old_xattr_buf);
 	if (arg->wake)
 		wake_up_all(&ci->i_cap_wq);
 }
@@ -1483,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode,
 				    u32 mseq, u64 oldest_flush_tid)
 {
 	struct cap_msg_args	arg;
+	struct ceph_msg		*msg;
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+	if (!msg)
+		return -ENOMEM;
 	arg.session = session;
 	arg.ino = ceph_vino(inode).ino;
@@ -1521,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode,
 	arg.flags = 0;
 	arg.wake = false;
-	return send_cap_msg(&arg);
+	encode_cap_msg(msg, &arg);
+	ceph_con_send(&arg.session->s_con, msg);
+	return 0;
 }
 /*
@@ -1906,9 +1901,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
 {
-	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct inode *inode = &ci->vfs_inode;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_cap *cap;
 	u64 flush_tid, oldest_flush_tid;
 	int file_wanted, used, cap_used;
@@ -1928,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 retry:
 	spin_lock(&ci->i_ceph_lock);
 retry_locked:
+	/* Caps wanted by virtue of active open files. */
 	file_wanted = __ceph_caps_file_wanted(ci);
+	/* Caps which have active references against them */
 	used = __ceph_caps_used(ci);
+	/*
+	 * "issued" represents the current caps that the MDS wants us to have.
+	 * "implemented" is the set that we have been granted, and includes the
+	 * ones that have not yet been returned to the MDS (the "revoking" set,
+	 * usually because they have outstanding references).
+	 */
 	issued = __ceph_caps_issued(ci, &implemented);
 	revoking = implemented & ~issued;
 	want = file_wanted;
+	/* The ones we currently want to retain (may be adjusted below) */
 	retain = file_wanted | used | CEPH_CAP_PIN;
 	if (!mdsc->stopping && inode->i_nlink > 0) {
 		if (file_wanted) {
@@ -2011,6 +2017,10 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		/* NOTE: no side-effects allowed, until we take s_mutex */
+		/*
+		 * If we have an auth cap, we don't need to consider any
+		 * overlapping caps as used.
+		 */
 		cap_used = used;
 		if (ci->i_auth_cap && cap != ci->i_auth_cap)
 			cap_used &= ~ci->i_auth_cap->issued;
@@ -2148,7 +2158,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			   want, retain, flushing, flush_tid, oldest_flush_tid);
 		spin_unlock(&ci->i_ceph_lock);
-		__send_cap(mdsc, &arg, ci);
+		__send_cap(&arg, ci);
 		goto retry; /* retake i_ceph_lock and restart our cap scan. */
 	}
@@ -2222,7 +2232,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
 			   flushing, flush_tid, oldest_flush_tid);
 		spin_unlock(&ci->i_ceph_lock);
-		__send_cap(mdsc, &arg, ci);
+		__send_cap(&arg, ci);
 	} else {
 		if (!list_empty(&ci->i_cap_flush_list)) {
 			struct ceph_cap_flush *cf =
@@ -2436,7 +2446,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
 					  (cap->issued | cap->implemented),
 					  cf->caps, cf->tid, oldest_flush_tid);
 			spin_unlock(&ci->i_ceph_lock);
-			__send_cap(mdsc, &arg, ci);
+			__send_cap(&arg, ci);
 		} else {
 			struct ceph_cap_snap *capsnap =
 					container_of(cf, struct ceph_cap_snap,
@@ -4284,13 +4294,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
 void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
 {
-	int i;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
 	int bits = (fmode << 1) | 1;
+	bool is_opened = false;
+	int i;
+	if (count == 1)
+		atomic64_inc(&mdsc->metric.opened_files);
 	spin_lock(&ci->i_ceph_lock);
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
 		if (bits & (1 << i))
 			ci->i_nr_by_mode[i] += count;
+		/*
+		 * If any of the mode ref is larger than 1,
+		 * that means it has been already opened by
+		 * others. Just skip checking the PIN ref.
+		 */
+		if (i && ci->i_nr_by_mode[i] > 1)
+			is_opened = true;
 	}
+	if (!is_opened)
+		percpu_counter_inc(&mdsc->metric.opened_inodes);
 	spin_unlock(&ci->i_ceph_lock);
 }
@@ -4301,15 +4328,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
 */
 void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
 {
-	int i;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
 	int bits = (fmode << 1) | 1;
+	bool is_closed = true;
+	int i;
+	if (count == 1)
+		atomic64_dec(&mdsc->metric.opened_files);
 	spin_lock(&ci->i_ceph_lock);
 	for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
 		if (bits & (1 << i)) {
 			BUG_ON(ci->i_nr_by_mode[i] < count);
 			ci->i_nr_by_mode[i] -= count;
 		}
+		/*
+		 * If any of the mode ref is not 0 after
+		 * decreased, that means it is still opened
+		 * by others. Just skip checking the PIN ref.
+		 */
+		if (i && ci->i_nr_by_mode[i])
+			is_closed = false;
 	}
+	if (is_closed)
+		percpu_counter_dec(&mdsc->metric.opened_inodes);
 	spin_unlock(&ci->i_ceph_lock);
 }

--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p)
 	int nr_caps = 0;
 	s64 total, sum, avg, min, max, sq;
+	sum = percpu_counter_sum(&m->total_inodes);
+	seq_printf(s, "item                               total\n");
+	seq_printf(s, "------------------------------------------\n");
+	seq_printf(s, "%-35s%lld / %lld\n", "opened files  / total inodes",
+		   atomic64_read(&m->opened_files), sum);
+	seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes",
+		   atomic64_read(&m->total_caps), sum);
+	seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes",
+		   percpu_counter_sum(&m->opened_inodes), sum);
+	seq_printf(s, "\n");
 	seq_printf(s, "item          total       avg_lat(us)     min_lat(us)     max_lat(us)     stdev(us)\n");
 	seq_printf(s, "-----------------------------------------------------------------------------------\n");
@@ -202,7 +213,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
 {
 	struct seq_file *s = p;
-	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
+	seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode),
+		   cap->session->s_mds,
 		   ceph_cap_string(cap->issued),
 		   ceph_cap_string(cap->implemented));
 	return 0;
@@ -222,8 +234,8 @@ static int caps_show(struct seq_file *s, void *p)
 		   "reserved\t%d\n"
 		   "min\t\t%d\n\n",
 		   total, avail, used, reserved, min);
-	seq_printf(s, "ino                issued           implemented\n");
+	seq_printf(s, "ino              mds  issued           implemented\n");
-	seq_printf(s, "-----------------------------------------------\n");
+	seq_printf(s, "--------------------------------------------------\n");
 	mutex_lock(&mdsc->mutex);
 	for (i = 0; i < mdsc->max_sessions; i++) {

--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry);
 static int ceph_d_init(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di;
-	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
 	if (!di)
@@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
 				  unsigned int flags)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
 	struct ceph_mds_request *req;
 	int op;
 	int mask;
@@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
 static int ceph_mknod(struct inode *dir, struct dentry *dentry,
 		      umode_t mode, dev_t rdev)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_acl_sec_ctx as_ctx = {};
 	int err;
@@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 			    const char *dest)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_acl_sec_ctx as_ctx = {};
 	int err;
@@ -942,8 +939,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
 static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	struct ceph_acl_sec_ctx as_ctx = {};
 	int err = -EROFS;
@@ -1010,8 +1006,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 static int ceph_link(struct dentry *old_dentry, struct inode *dir,
 		     struct dentry *dentry)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int err;
@@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		       struct inode *new_dir, struct dentry *new_dentry,
 		       unsigned int flags)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int op = CEPH_MDS_OP_RENAME;
 	int err;

--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
 static struct ceph_mds_request *
 prepare_open_request(struct super_block *sb, int flags, int create_mode)
 {
-	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
-	struct ceph_mds_client *mdsc = fsc->mdsc;
 	struct ceph_mds_request *req;
 	int want_auth = USE_ANY_MDS;
 	int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -256,8 +255,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 	case S_IFDIR:
 		ret = ceph_init_file_info(inode, file, fmode,
 						S_ISDIR(inode->i_mode));
-		if (ret)
-			return ret;
 		break;
 	case S_IFLNK:
@@ -285,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 */
 int ceph_renew_caps(struct inode *inode, int fmode)
 {
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req;
 	int err, flags, wanted;
@@ -865,6 +862,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 		size_t page_off;
 		u64 i_size;
 		bool more;
+		int idx;
+		size_t left;
 		req = ceph_osdc_new_request(osdc, &ci->i_layout,
 					ci->i_vino, off, &len, 0, 1,
@@ -878,29 +877,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 		more = len < iov_iter_count(to);
-		if (unlikely(iov_iter_is_pipe(to))) {
+		num_pages = calc_pages_for(off, len);
-			ret = iov_iter_get_pages_alloc(to, &pages, len,
+		page_off = off & ~PAGE_MASK;
-						       &page_off);
+		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-			if (ret <= 0) {
+		if (IS_ERR(pages)) {
-				ceph_osdc_put_request(req);
+			ceph_osdc_put_request(req);
-				ret = -ENOMEM;
+			ret = PTR_ERR(pages);
-				break;
+			break;
-			}
-			num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE);
-			if (ret < len) {
-				len = ret;
-				osd_req_op_extent_update(req, 0, len);
-				more = false;
-			}
-		} else {
-			num_pages = calc_pages_for(off, len);
-			page_off = off & ~PAGE_MASK;
-			pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-			if (IS_ERR(pages)) {
-				ceph_osdc_put_request(req);
-				ret = PTR_ERR(pages);
-				break;
-			}
 		}
 		osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
@@ -931,36 +914,27 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
 			ret += zlen;
 		}
-		if (unlikely(iov_iter_is_pipe(to))) {
+		idx = 0;
-			if (ret > 0) {
+		left = ret > 0 ? ret : 0;
-				iov_iter_advance(to, ret);
+		while (left > 0) {
-				off += ret;
+			size_t len, copied;
-			} else {
+			page_off = off & ~PAGE_MASK;
-				iov_iter_advance(to, 0);
+			len = min_t(size_t, left, PAGE_SIZE - page_off);
-			}
+			SetPageUptodate(pages[idx]);
-			ceph_put_page_vector(pages, num_pages, false);
+			copied = copy_page_to_iter(pages[idx++],
-		} else {
+						   page_off, len, to);
-			int idx = 0;
+			off += copied;
-			size_t left = ret > 0 ? ret : 0;
+			left -= copied;
-			while (left > 0) {
+			if (copied < len) {
-				size_t len, copied;
+				ret = -EFAULT;
-				page_off = off & ~PAGE_MASK;
+				break;
-				len = min_t(size_t, left, PAGE_SIZE - page_off);
-				copied = copy_page_to_iter(pages[idx++],
-							   page_off, len, to);
-				off += copied;
-				left -= copied;
-				if (copied < len) {
-					ret = -EFAULT;
-					break;
-				}
 			}
-			ceph_release_page_vector(pages, num_pages);
 		}
+		ceph_release_page_vector(pages, num_pages);
 		if (ret < 0) {
-			if (ret == -EBLACKLISTED)
+			if (ret == -EBLOCKLISTED)
-				fsc->blacklisted = true;
+				fsc->blocklisted = true;
 			break;
 		}
@@ -1052,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
 	struct inode *inode = req->r_inode;
 	struct ceph_aio_request *aio_req = req->r_priv;
 	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
-	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
-	struct ceph_client_metric *metric = &fsc->mdsc->metric;
 	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
 	BUG_ON(!osd_data->num_bvecs);

--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work);
 static int ceph_set_ino_cb(struct inode *inode, void *data)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	ci->i_vino = *(struct ceph_vino *)data;
 	inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
 	inode_set_iversion_raw(inode, 0);
+	percpu_counter_inc(&mdsc->metric.total_inodes);
 	return 0;
 }
@@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode)
 void ceph_evict_inode(struct inode *inode)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_inode_frag *frag;
 	struct rb_node *n;
 	dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
+	percpu_counter_dec(&mdsc->metric.total_inodes);
 	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
@@ -558,8 +564,6 @@ void ceph_evict_inode(struct inode *inode)
 	 * caps in i_snap_caps.
 	 */
 	if (ci->i_snap_realm) {
-		struct ceph_mds_client *mdsc =
-					ceph_inode_to_client(inode)->mdsc;
 		if (ceph_snap(inode) == CEPH_NOSNAP) {
 			struct ceph_snap_realm *realm = ci->i_snap_realm;
 			dout(" dropping residual ref to snap realm %p\n",
@@ -739,7 +743,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
 		    struct ceph_mds_session *session, int cap_fmode,
 		    struct ceph_cap_reservation *caps_reservation)
 {
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_mds_reply_inode *info = iinfo->in;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int issued, new_issued, info_caps;

--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
 			     int cmd, u8 wait, struct file_lock *fl)
 {
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_mds_request *req;
 	int err;
 	u64 length = 0;

--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3303,7 +3303,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
 }
 static int __decode_session_metadata(void **p, void *end,
-				     bool *blacklisted)
+				     bool *blocklisted)
 {
 	/* map<string,string> */
 	u32 n;
@@ -3317,8 +3317,12 @@ static int __decode_session_metadata(void **p, void *end,
 		*p += len;
 		ceph_decode_32_safe(p, end, len, bad);
 		ceph_decode_need(p, end, len, bad);
+		/*
+		 * Match "blocklisted (blacklisted)" from newer MDSes,
+		 * or "blacklisted" from older MDSes.
+		 */
 		if (err_str && strnstr(*p, "blacklisted", len))
-			*blacklisted = true;
+			*blocklisted = true;
 		*p += len;
 	}
 	return 0;
@@ -3341,7 +3345,7 @@ static void handle_session(struct ceph_mds_session *session,
 	u32 op;
 	u64 seq, features = 0;
 	int wake = 0;
-	bool blacklisted = false;
+	bool blocklisted = false;
 	/* decode */
 	ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3354,7 +3358,7 @@ static void handle_session(struct ceph_mds_session *session,
 	if (msg_version >= 3) {
 		u32 len;
 		/* version >= 2, metadata */
-		if (__decode_session_metadata(&p, end, &blacklisted) < 0)
+		if (__decode_session_metadata(&p, end, &blocklisted) < 0)
 			goto bad;
 		/* version >= 3, feature bits */
 		ceph_decode_32_safe(&p, end, len, bad);
@@ -3445,8 +3449,8 @@ static void handle_session(struct ceph_mds_session *session,
 		session->s_state = CEPH_MDS_SESSION_REJECTED;
 		cleanup_session_requests(mdsc, session);
 		remove_session_caps(session);
-		if (blacklisted)
+		if (blocklisted)
-			mdsc->fsc->blacklisted = true;
+			mdsc->fsc->blocklisted = true;
 		wake = 2; /* for good measure */
 		break;
@@ -3612,6 +3616,39 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
 	return err;
 }
+static struct dentry* d_find_primary(struct inode *inode)
+{
+	struct dentry *alias, *dn = NULL;
+	if (hlist_empty(&inode->i_dentry))
+		return NULL;
+	spin_lock(&inode->i_lock);
+	if (hlist_empty(&inode->i_dentry))
+		goto out_unlock;
+	if (S_ISDIR(inode->i_mode)) {
+		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
+		if (!IS_ROOT(alias))
+			dn = dget(alias);
+		goto out_unlock;
+	}
+	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
+		spin_lock(&alias->d_lock);
+		if (!d_unhashed(alias) &&
+		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
+			dn = dget_dlock(alias);
+		}
+		spin_unlock(&alias->d_lock);
+		if (dn)
+			break;
+	}
+out_unlock:
+	spin_unlock(&inode->i_lock);
+	return dn;
+}
 /*
 * Encode information about a cap for a reconnect with the MDS.
 */
@@ -3625,13 +3662,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_reconnect_state *recon_state = arg;
 	struct ceph_pagelist *pagelist = recon_state->pagelist;
-	int err;
+	struct dentry *dentry;
+	char *path;
+	int pathlen, err;
+	u64 pathbase;
 	u64 snap_follows;
 	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
 	     inode, ceph_vinop(inode), cap, cap->cap_id,
 	     ceph_cap_string(cap->issued));
+	dentry = d_find_primary(inode);
+	if (dentry) {
+		/* set pathbase to parent dir when msg_version >= 2 */
+		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
+					    recon_state->msg_version >= 2);
+		dput(dentry);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out_err;
+		}
+	} else {
+		path = NULL;
+		pathlen = 0;
+		pathbase = 0;
+	}
 	spin_lock(&ci->i_ceph_lock);
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
@@ -3652,7 +3708,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
 		rec.v2.issued = cpu_to_le32(cap->issued);
 		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-		rec.v2.pathbase = 0;
+		rec.v2.pathbase = cpu_to_le64(pathbase);
 		rec.v2.flock_len = (__force __le32)
 			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
 	} else {
@@ -3663,7 +3719,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
 		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
 		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-		rec.v1.pathbase = 0;
+		rec.v1.pathbase = cpu_to_le64(pathbase);
 	}
 	if (list_empty(&ci->i_cap_snaps)) {
@@ -3725,7 +3781,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			    sizeof(struct ceph_filelock);
 		rec.v2.flock_len = cpu_to_le32(struct_len);
-		struct_len += sizeof(u32) + sizeof(rec.v2);
+		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
 		if (struct_v >= 2)
 			struct_len += sizeof(u64); /* snap_follows */
@@ -3749,7 +3805,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			ceph_pagelist_encode_8(pagelist, 1);
 			ceph_pagelist_encode_32(pagelist, struct_len);
 		}
-		ceph_pagelist_encode_string(pagelist, NULL, 0);
+		ceph_pagelist_encode_string(pagelist, path, pathlen);
 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
 		ceph_locks_to_pagelist(flocks, pagelist,
 				       num_fcntl_locks, num_flock_locks);
@@ -3758,39 +3814,20 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
 out_freeflocks:
 		kfree(flocks);
 	} else {
-		u64 pathbase = 0;
-		int pathlen = 0;
-		char *path = NULL;
-		struct dentry *dentry;
-		dentry = d_find_alias(inode);
-		if (dentry) {
-			path = ceph_mdsc_build_path(dentry,
-						&pathlen, &pathbase, 0);
-			dput(dentry);
-			if (IS_ERR(path)) {
-				err = PTR_ERR(path);
-				goto out_err;
-			}
-			rec.v1.pathbase = cpu_to_le64(pathbase);
-		}
 		err = ceph_pagelist_reserve(pagelist,
 					    sizeof(u64) + sizeof(u32) +
 					    pathlen + sizeof(rec.v1));
-		if (err) {
+		if (err)
-			goto out_freepath;
+			goto out_err;
-		}
 		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
 		ceph_pagelist_encode_string(pagelist, path, pathlen);
 		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
-out_freepath:
-		ceph_mdsc_free_path(path, pathlen);
 	}
 out_err:
-	if (err >= 0)
+	ceph_mdsc_free_path(path, pathlen);
+	if (!err)
 		recon_state->nr_caps++;
 	return err;
 }
@@ -4334,14 +4371,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
 	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
 		return;
-	if (!READ_ONCE(fsc->blacklisted))
+	if (!READ_ONCE(fsc->blocklisted))
 		return;
 	if (fsc->last_auto_reconnect &&
 	    time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
 		return;
-	pr_info("auto reconnect after blacklisted\n");
+	pr_info("auto reconnect after blocklisted\n");
 	fsc->last_auto_reconnect = jiffies;
 	ceph_force_reconnect(fsc->sb);
 }

--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -393,7 +393,7 @@ struct ceph_mds_client {
 	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
 	atomic_t		num_sessions;
-	int                     max_sessions;  /* len of s_mds_sessions */
+	int                     max_sessions;  /* len of sessions array */
 	int                     stopping;      /* true if shutting down */
 	atomic64_t		quotarealms_count; /* # realms with quota */

--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m)
 	m->total_metadatas = 0;
 	m->metadata_latency_sum = 0;
+	atomic64_set(&m->opened_files, 0);
+	ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL);
+	if (ret)
+		goto err_opened_inodes;
+	ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL);
+	if (ret)
+		goto err_total_inodes;
 	m->session = NULL;
 	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
 	return 0;
+err_total_inodes:
+	percpu_counter_destroy(&m->opened_inodes);
+err_opened_inodes:
+	percpu_counter_destroy(&m->i_caps_mis);
 err_i_caps_mis:
 	percpu_counter_destroy(&m->i_caps_hit);
 err_i_caps_hit:
@@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
 	if (!m)
 		return;
+	percpu_counter_destroy(&m->total_inodes);
+	percpu_counter_destroy(&m->opened_inodes);
 	percpu_counter_destroy(&m->i_caps_mis);
 	percpu_counter_destroy(&m->i_caps_hit);
 	percpu_counter_destroy(&m->d_lease_mis);

--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -115,6 +115,13 @@ struct ceph_client_metric {
 	ktime_t metadata_latency_min;
 	ktime_t metadata_latency_max;
+	/* The total number of directories and files that are opened */
+	atomic64_t opened_files;
+	/* The total number of inodes that have opened files or directories */
+	struct percpu_counter opened_inodes;
+	struct percpu_counter total_inodes;
 	struct ceph_mds_session *session;
 	struct delayed_work delayed_work;  /* delayed work */
 };

--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -12,7 +12,7 @@
 void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
 {
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	if (inc)
 		atomic64_inc(&mdsc->quotarealms_count);
 	else
@@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc)
 static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct super_block *sb = inode->i_sb;
-	struct super_block *sb = mdsc->fsc->sb;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb);
 	struct inode *root = d_inode(sb->s_root);
 	if (atomic64_read(&mdsc->quotarealms_count) > 0)
@@ -266,7 +266,7 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
 static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
 {
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
 	struct ceph_snap_realm *old_realm, *new_realm;
 	bool is_same;
@@ -313,7 +313,7 @@ enum quota_check_op {
 static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op,
 				 loff_t delta)
 {
-	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	struct ceph_inode_info *ci;
 	struct ceph_snap_realm *realm, *next;
 	struct inode *in;

--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 			    struct ceph_cap_snap *capsnap)
 {
 	struct inode *inode = &ci->vfs_inode;
-	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
+	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
 	BUG_ON(capsnap->writing);
 	capsnap->size = inode->i_size;

--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1205,14 +1205,13 @@ static int ceph_init_fs_context(struct fs_context *fc)
 static void ceph_kill_sb(struct super_block *s)
 {
 	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
-	dev_t dev = s->s_dev;
 	dout("kill_sb %p\n", s);
 	ceph_mdsc_pre_umount(fsc->mdsc);
 	flush_fs_workqueues(fsc);
-	generic_shutdown_super(s);
+	kill_anon_super(s);
 	fsc->client->extra_mon_dispatch = NULL;
 	ceph_fs_debugfs_cleanup(fsc);
@@ -1220,7 +1219,6 @@ static void ceph_kill_sb(struct super_block *s)
 	ceph_fscache_unregister_fs(fsc);
 	destroy_fs_client(fsc);
-	free_anon_bdev(dev);
 }
 static struct file_system_type ceph_fs_type = {
@@ -1243,13 +1241,13 @@ int ceph_force_reconnect(struct super_block *sb)
 	 * see remove_session_caps_cb() */
 	flush_workqueue(fsc->inode_wq);
-	/* In case that we were blacklisted. This also reset
+	/* In case that we were blocklisted. This also reset
 	 * all mon/osd connections */
 	ceph_reset_client_addr(fsc->client);
 	ceph_osdc_clear_abort_err(&fsc->client->osdc);
-	fsc->blacklisted = false;
+	fsc->blocklisted = false;
 	fsc->mount_state = CEPH_MOUNT_MOUNTED;
 	if (sb->s_root) {

--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -32,7 +32,7 @@
 #define CEPH_BLOCK_SHIFT   22  /* 4 MB */
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
-#define CEPH_MOUNT_OPT_CLEANRECOVER    (1<<1) /* auto reonnect (clean mode) after blacklisted */
+#define CEPH_MOUNT_OPT_CLEANRECOVER    (1<<1) /* auto reonnect (clean mode) after blocklisted */
 #define CEPH_MOUNT_OPT_DIRSTAT         (1<<4) /* `cat dirname` for stats */
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
@@ -109,7 +109,7 @@ struct ceph_fs_client {
 	unsigned long mount_state;
 	unsigned long last_auto_reconnect;
-	bool blacklisted;
+	bool blocklisted;
 	bool have_copy_from2;
@@ -160,7 +160,8 @@ struct ceph_cap {
 			int issued;       /* latest, from the mds */
 			int implemented;  /* implemented superset of
 					     issued (for revocation) */
-			int mds, mds_wanted;
+			int mds;	  /* mds index for this cap */
+			int mds_wanted;   /* caps wanted from this mds */
 		};
 		/* caps to release */
 		struct {
@@ -451,6 +452,12 @@ ceph_sb_to_client(const struct super_block *sb)
 	return (struct ceph_fs_client *)sb->s_fs_info;
 }
+static inline struct ceph_mds_client *
+ceph_sb_to_mdsc(const struct super_block *sb)
+{
+	return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc;
+}
 static inline struct ceph_vino
 ceph_vino(const struct inode *inode)
 {

--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 * NULL terminates however, so call it on a temporary buffer and then memcpy
 * the result into place.
 */
-static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
+static __printf(3, 4)
+int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...)
 {
 	int ret;
 	va_list args;

--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -54,7 +54,7 @@ struct ceph_connection_operations {
 	int (*check_message_signature) (struct ceph_msg *msg);
 };
-/* use format string %s%d */
+/* use format string %s%lld */
 #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
 struct ceph_messenger {

--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
 int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
 				ceph_monc_callback_t cb, u64 private_data);
-int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
 			    struct ceph_entity_addr *client_addr);
 extern int ceph_monc_open_session(struct ceph_mon_client *monc);

--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -137,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
 		     const char *fmt, ...);
 void ceph_oid_destroy(struct ceph_object_id *oid);
+struct workspace_manager {
+	struct list_head idle_ws;
+	spinlock_t ws_lock;
+	/* Number of free workspaces */
+	int free_ws;
+	/* Total number of allocated workspaces */
+	atomic_t total_ws;
+	/* Waiters for a free workspace */
+	wait_queue_head_t ws_wait;
+};
 struct ceph_pg_mapping {
 	struct rb_node node;
 	struct ceph_pg pgid;
@@ -184,8 +195,7 @@ struct ceph_osdmap {
 	 * the list of osds that store+replicate them. */
 	struct crush_map *crush;
-	struct mutex crush_workspace_mutex;
+	struct workspace_manager crush_wsm;
-	void *crush_workspace;
 };
 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)

--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -424,7 +424,7 @@ enum {
 };
 #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
-#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+#define EBLOCKLISTED ESHUTDOWN /* blocklisted */
 /* xattr comparison */
 enum {

--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -346,6 +346,9 @@ struct crush_work_bucket {
 struct crush_work {
 	struct crush_work_bucket **work; /* Per-bucket working store */
+#ifdef __KERNEL__
+	struct list_head item;
+#endif
 };
 #ifdef __KERNEL__

--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2016,11 +2016,11 @@ static int process_banner(struct ceph_connection *con)
 		   sizeof(con->peer_addr)) != 0 &&
 	    !(addr_is_blank(&con->actual_peer_addr) &&
 	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_warn("wrong peer, want %s/%d, got %s/%d\n",
+		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
 			ceph_pr_addr(&con->peer_addr),
-			(int)le32_to_cpu(con->peer_addr.nonce),
+			le32_to_cpu(con->peer_addr.nonce),
 			ceph_pr_addr(&con->actual_peer_addr),
-			(int)le32_to_cpu(con->actual_peer_addr.nonce));
+			le32_to_cpu(con->actual_peer_addr.nonce));
 		con->error_msg = "wrong peer at address";
 		return -1;
 	}
@@ -2811,13 +2811,13 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
 		return -ENOENT;
 	}
+	dout("%s %p %lu\n", __func__, con, delay);
 	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
 		dout("%s %p - already queued\n", __func__, con);
 		con->ops->put(con);
 		return -EBUSY;
 	}
-	dout("%s %p %lu\n", __func__, con, delay);
 	return 0;
 }
@@ -2998,6 +2998,11 @@ static void con_fault(struct ceph_connection *con)
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
 	}
+	if (con->out_msg) {
+		BUG_ON(con->out_msg->con != con);
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
 	/* Requeue anything that hasn't been acked */
 	list_splice_init(&con->out_sent, &con->out_queue);

--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -896,8 +896,9 @@ static void handle_command_ack(struct ceph_mon_client *monc,
 	ceph_msg_dump(msg);
 }
-int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
+static __printf(2, 0)
-			    struct ceph_entity_addr *client_addr)
+int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
+			 va_list ap)
 {
 	struct ceph_mon_generic_request *req;
 	struct ceph_mon_command *h;
@@ -925,29 +926,65 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
 	h->monhdr.session_mon_tid = 0;
 	h->fsid = monc->monmap->fsid;
 	h->num_strs = cpu_to_le32(1);
-	len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
+	len = vsprintf(h->str, fmt, ap);
-		                 \"blacklistop\": \"add\", \
-				 \"addr\": \"%pISpc/%u\" }",
-		      &client_addr->in_addr, le32_to_cpu(client_addr->nonce));
 	h->str_len = cpu_to_le32(len);
 	send_generic_request(monc, req);
 	mutex_unlock(&monc->mutex);
 	ret = wait_generic_request(req);
-	if (!ret)
-		/*
-		 * Make sure we have the osdmap that includes the blacklist
-		 * entry.  This is needed to ensure that the OSDs pick up the
-		 * new blacklist before processing any future requests from
-		 * this client.
-		 */
-		ret = ceph_wait_for_latest_osdmap(monc->client, 0);
 out:
 	put_generic_request(req);
 	return ret;
 }
-EXPORT_SYMBOL(ceph_monc_blacklist_add);
+static __printf(2, 3)
+int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	va_start(ap, fmt);
+	ret = do_mon_command_vargs(monc, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
+			    struct ceph_entity_addr *client_addr)
+{
+	int ret;
+	ret = do_mon_command(monc,
+			     "{ \"prefix\": \"osd blocklist\", \
+				\"blocklistop\": \"add\", \
+				\"addr\": \"%pISpc/%u\" }",
+			     &client_addr->in_addr,
+			     le32_to_cpu(client_addr->nonce));
+	if (ret == -EINVAL) {
+		/*
+		 * The monitor returns EINVAL on an unrecognized command.
+		 * Try the legacy command -- it is exactly the same except
+		 * for the name.
+		 */
+		ret = do_mon_command(monc,
+				     "{ \"prefix\": \"osd blacklist\", \
+					\"blacklistop\": \"add\", \
+					\"addr\": \"%pISpc/%u\" }",
+				     &client_addr->in_addr,
+				     le32_to_cpu(client_addr->nonce));
+	}
+	if (ret)
+		return ret;
+	/*
+	 * Make sure we have the osdmap that includes the blocklist
+	 * entry.  This is needed to ensure that the OSDs pick up the
+	 * new blocklist before processing any future requests from
+	 * this client.
+	 */
+	return ceph_wait_for_latest_osdmap(monc->client, 0);
+}
+EXPORT_SYMBOL(ceph_monc_blocklist_add);
 /*
 * Resend pending generic requests.

--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -964,6 +964,143 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 	return -EINVAL;
 }
+/*
+ * CRUSH workspaces
+ *
+ * workspace_manager framework borrowed from fs/btrfs/compression.c.
+ * Two simplifications: there is only one type of workspace and there
+ * is always at least one workspace.
+ */
+static struct crush_work *alloc_workspace(const struct crush_map *c)
+{
+	struct crush_work *work;
+	size_t work_size;
+	WARN_ON(!c->working_size);
+	work_size = crush_work_size(c, CEPH_PG_MAX_SIZE);
+	dout("%s work_size %zu bytes\n", __func__, work_size);
+	work = ceph_kvmalloc(work_size, GFP_NOIO);
+	if (!work)
+		return NULL;
+	INIT_LIST_HEAD(&work->item);
+	crush_init_workspace(c, work);
+	return work;
+}
+static void free_workspace(struct crush_work *work)
+{
+	WARN_ON(!list_empty(&work->item));
+	kvfree(work);
+}
+static void init_workspace_manager(struct workspace_manager *wsm)
+{
+	INIT_LIST_HEAD(&wsm->idle_ws);
+	spin_lock_init(&wsm->ws_lock);
+	atomic_set(&wsm->total_ws, 0);
+	wsm->free_ws = 0;
+	init_waitqueue_head(&wsm->ws_wait);
+}
+static void add_initial_workspace(struct workspace_manager *wsm,
+				  struct crush_work *work)
+{
+	WARN_ON(!list_empty(&wsm->idle_ws));
+	list_add(&work->item, &wsm->idle_ws);
+	atomic_set(&wsm->total_ws, 1);
+	wsm->free_ws = 1;
+}
+static void cleanup_workspace_manager(struct workspace_manager *wsm)
+{
+	struct crush_work *work;
+	while (!list_empty(&wsm->idle_ws)) {
+		work = list_first_entry(&wsm->idle_ws, struct crush_work,
+					item);
+		list_del_init(&work->item);
+		free_workspace(work);
+	}
+	atomic_set(&wsm->total_ws, 0);
+	wsm->free_ws = 0;
+}
+/*
+ * Finds an available workspace or allocates a new one.  If it's not
+ * possible to allocate a new one, waits until there is one.
+ */
+static struct crush_work *get_workspace(struct workspace_manager *wsm,
+					const struct crush_map *c)
+{
+	struct crush_work *work;
+	int cpus = num_online_cpus();
+again:
+	spin_lock(&wsm->ws_lock);
+	if (!list_empty(&wsm->idle_ws)) {
+		work = list_first_entry(&wsm->idle_ws, struct crush_work,
+					item);
+		list_del_init(&work->item);
+		wsm->free_ws--;
+		spin_unlock(&wsm->ws_lock);
+		return work;
+	}
+	if (atomic_read(&wsm->total_ws) > cpus) {
+		DEFINE_WAIT(wait);
+		spin_unlock(&wsm->ws_lock);
+		prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
+			schedule();
+		finish_wait(&wsm->ws_wait, &wait);
+		goto again;
+	}
+	atomic_inc(&wsm->total_ws);
+	spin_unlock(&wsm->ws_lock);
+	work = alloc_workspace(c);
+	if (!work) {
+		atomic_dec(&wsm->total_ws);
+		wake_up(&wsm->ws_wait);
+		/*
+		 * Do not return the error but go back to waiting.  We
+		 * have the inital workspace and the CRUSH computation
+		 * time is bounded so we will get it eventually.
+		 */
+		WARN_ON(atomic_read(&wsm->total_ws) < 1);
+		goto again;
+	}
+	return work;
+}
+/*
+ * Puts a workspace back on the list or frees it if we have enough
+ * idle ones sitting around.
+ */
+static void put_workspace(struct workspace_manager *wsm,
+			  struct crush_work *work)
+{
+	spin_lock(&wsm->ws_lock);
+	if (wsm->free_ws <= num_online_cpus()) {
+		list_add(&work->item, &wsm->idle_ws);
+		wsm->free_ws++;
+		spin_unlock(&wsm->ws_lock);
+		goto wake;
+	}
+	spin_unlock(&wsm->ws_lock);
+	free_workspace(work);
+	atomic_dec(&wsm->total_ws);
+wake:
+	if (wq_has_sleeper(&wsm->ws_wait))
+		wake_up(&wsm->ws_wait);
+}
 /*
 * osd map
 */
@@ -981,7 +1118,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
 	map->primary_temp = RB_ROOT;
 	map->pg_upmap = RB_ROOT;
 	map->pg_upmap_items = RB_ROOT;
-	mutex_init(&map->crush_workspace_mutex);
+	init_workspace_manager(&map->crush_wsm);
 	return map;
 }
@@ -989,8 +1127,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
 void ceph_osdmap_destroy(struct ceph_osdmap *map)
 {
 	dout("osdmap_destroy %p\n", map);
 	if (map->crush)
 		crush_destroy(map->crush);
+	cleanup_workspace_manager(&map->crush_wsm);
 	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
 		struct ceph_pg_mapping *pg =
 			rb_entry(rb_first(&map->pg_temp),
@@ -1029,7 +1170,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
 	kvfree(map->osd_weight);
 	kvfree(map->osd_addr);
 	kvfree(map->osd_primary_affinity);
-	kvfree(map->crush_workspace);
 	kfree(map);
 }
@@ -1104,26 +1244,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
 static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
 {
-	void *workspace;
+	struct crush_work *work;
-	size_t work_size;
 	if (IS_ERR(crush))
 		return PTR_ERR(crush);
-	work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
+	work = alloc_workspace(crush);
-	dout("%s work_size %zu bytes\n", __func__, work_size);
+	if (!work) {
-	workspace = ceph_kvmalloc(work_size, GFP_NOIO);
-	if (!workspace) {
 		crush_destroy(crush);
 		return -ENOMEM;
 	}
-	crush_init_workspace(crush, workspace);
 	if (map->crush)
 		crush_destroy(map->crush);
-	kvfree(map->crush_workspace);
+	cleanup_workspace_manager(&map->crush_wsm);
 	map->crush = crush;
-	map->crush_workspace = workspace;
+	add_initial_workspace(&map->crush_wsm, work);
 	return 0;
 }
@@ -2322,6 +2458,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    s64 choose_args_index)
 {
 	struct crush_choose_arg_map *arg_map;
+	struct crush_work *work;
 	int r;
 	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
@@ -2332,12 +2469,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		arg_map = lookup_choose_arg_map(&map->crush->choose_args,
 						CEPH_DEFAULT_CHOOSE_ARGS);
-	mutex_lock(&map->crush_workspace_mutex);
+	work = get_workspace(&map->crush_wsm, map->crush);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_workspace,
+			  weight, weight_max, work,
 			  arg_map ? arg_map->args : NULL);
-	mutex_unlock(&map->crush_workspace_mutex);
+	put_workspace(&map->crush_wsm, work);
 	return r;
 }