Commit de630176 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'iversion-v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux

Pull i_version updates from Jeff Layton:
 "This overhauls how we handle i_version queries from nfsd.

  Instead of having special routines and grabbing the i_version field
  directly out of the inode in some cases, we've moved most of the
  handling into the various filesystems' getattr operations. As a bonus,
  this makes ceph's change attribute usable by knfsd as well.

  This should pave the way for future work to make this value queryable
  by userland, and to make it more resilient against rolling back on a
  crash"

* tag 'iversion-v6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/jlayton/linux:
  nfsd: remove fetch_iversion export operation
  nfsd: use the getattr operation to fetch i_version
  nfsd: move nfsd4_change_attribute to nfsfh.c
  ceph: report the inode version in getattr if requested
  nfs: report the inode version in getattr if requested
  vfs: plumb i_version handling into struct kstat
  fs: clarify when the i_version counter must be updated
  fs: uninline inode_query_iversion
parents 575a7e0f 58a033c9
......@@ -2417,10 +2417,10 @@ static int statx_to_caps(u32 want, umode_t mode)
{
int mask = 0;
if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME|STATX_CHANGE_COOKIE))
mask |= CEPH_CAP_AUTH_SHARED;
if (want & (STATX_NLINK|STATX_CTIME)) {
if (want & (STATX_NLINK|STATX_CTIME|STATX_CHANGE_COOKIE)) {
/*
* The link count for directories depends on inode->i_subdirs,
* and that is only updated when Fs caps are held.
......@@ -2431,11 +2431,10 @@ static int statx_to_caps(u32 want, umode_t mode)
mask |= CEPH_CAP_LINK_SHARED;
}
if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
STATX_BLOCKS))
if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|STATX_BLOCKS|STATX_CHANGE_COOKIE))
mask |= CEPH_CAP_FILE_SHARED;
if (want & (STATX_CTIME))
if (want & (STATX_CTIME|STATX_CHANGE_COOKIE))
mask |= CEPH_CAP_XATTR_SHARED;
return mask;
......@@ -2478,6 +2477,11 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
valid_mask |= STATX_BTIME;
}
if (request_mask & STATX_CHANGE_COOKIE) {
stat->change_cookie = inode_peek_iversion_raw(inode);
valid_mask |= STATX_CHANGE_COOKIE;
}
if (ceph_snap(inode) == CEPH_NOSNAP)
stat->dev = sb->s_dev;
else
......@@ -2519,6 +2523,8 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
stat->nlink = 1 + 1 + ci->i_subdirs;
}
stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
stat->result_mask = request_mask & valid_mask;
return err;
}
......
......@@ -1582,3 +1582,39 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force)
return true;
}
EXPORT_SYMBOL(inode_maybe_inc_iversion);
/**
* inode_query_iversion - read i_version for later use
* @inode: inode from which i_version should be read
*
* Read the inode i_version counter. This should be used by callers that wish
* to store the returned i_version for later comparison. This will guarantee
* that a later query of the i_version will result in a different value if
* anything has changed.
*
* In this implementation, we fetch the current value, set the QUERIED flag and
* then try to swap it into place with a cmpxchg, if it wasn't already set. If
* that fails, we try again with the newly fetched value from the cmpxchg.
*
* Return: the raw i_version value that was observed, shifted right by
* I_VERSION_QUERIED_SHIFT.
*/
u64 inode_query_iversion(struct inode *inode)
{
u64 cur, new;
/* Start from the raw word; the QUERIED flag is tested on it directly. */
cur = inode_peek_iversion_raw(inode);
do {
/* If flag is already set, then no need to swap */
if (cur & I_VERSION_QUERIED) {
/*
* This barrier (and the implicit barrier in the
* cmpxchg below) pairs with the barrier in
* inode_maybe_inc_iversion().
*/
smp_mb();
break;
}
/* Flag not set yet: propose the same value with QUERIED added. */
new = cur | I_VERSION_QUERIED;
/*
* A failed cmpxchg leaves the newly fetched value in cur, so the flag
* test above is repeated against it on the next pass.
*/
} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
/* Callers get the value shifted right by I_VERSION_QUERIED_SHIFT. */
return cur >> I_VERSION_QUERIED_SHIFT;
}
EXPORT_SYMBOL(inode_query_iversion);
......@@ -145,17 +145,10 @@ nfs_get_parent(struct dentry *dentry)
return parent;
}
/*
 * Revalidate the inode with NFS_INO_INVALID_CHANGE, then report the raw
 * i_version word. The result of the revalidation call is not checked; the
 * raw value is returned regardless.
 */
static u64 nfs_fetch_iversion(struct inode *inode)
{
	u64 raw_version;

	nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
	raw_version = inode_peek_iversion_raw(inode);

	return raw_version;
}
const struct export_operations nfs_export_ops = {
.encode_fh = nfs_encode_fh,
.fh_to_dentry = nfs_fh_to_dentry,
.get_parent = nfs_get_parent,
.fetch_iversion = nfs_fetch_iversion,
.flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK|
EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS|
EXPORT_OP_NOATOMIC_ATTR,
......
......@@ -825,6 +825,8 @@ static u32 nfs_get_valid_attrmask(struct inode *inode)
reply_mask |= STATX_UID | STATX_GID;
if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
reply_mask |= STATX_BLOCKS;
if (!(cache_validity & NFS_INO_INVALID_CHANGE))
reply_mask |= STATX_CHANGE_COOKIE;
return reply_mask;
}
......@@ -843,7 +845,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID |
STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME |
STATX_INO | STATX_SIZE | STATX_BLOCKS;
STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME |
STATX_CHANGE_COOKIE;
if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
if (readdirplus_enabled)
......@@ -851,8 +854,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
goto out_no_revalidate;
}
/* Flush out writes to the server in order to update c/mtime. */
if ((request_mask & (STATX_CTIME | STATX_MTIME)) &&
/* Flush out writes to the server in order to update c/mtime/version. */
if ((request_mask & (STATX_CTIME | STATX_MTIME | STATX_CHANGE_COOKIE)) &&
S_ISREG(inode->i_mode))
filemap_write_and_wait(inode->i_mapping);
......@@ -872,7 +875,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
/* Is the user requesting attributes that might need revalidation? */
if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME|
STATX_MTIME|STATX_UID|STATX_GID|
STATX_SIZE|STATX_BLOCKS)))
STATX_SIZE|STATX_BLOCKS|
STATX_CHANGE_COOKIE)))
goto out_no_revalidate;
/* Check whether the cached attributes are stale */
......@@ -910,6 +914,10 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
generic_fillattr(&init_user_ns, inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
stat->change_cookie = inode_peek_iversion_raw(inode);
stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
if (server->change_attr_type != NFS4_CHANGE_TYPE_IS_UNDEFINED)
stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
out:
......
......@@ -2965,7 +2965,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
goto out;
}
err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
err = vfs_getattr(&path, &stat,
STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
if (!(stat.result_mask & STATX_BTIME))
......
......@@ -628,6 +628,10 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
stat.mtime = inode->i_mtime;
stat.ctime = inode->i_ctime;
stat.size = inode->i_size;
if (v4 && IS_I_VERSION(inode)) {
stat.change_cookie = inode_query_iversion(inode);
stat.result_mask |= STATX_CHANGE_COOKIE;
}
}
if (v4)
fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
......@@ -659,6 +663,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
if (err) {
fhp->fh_post_saved = false;
fhp->fh_post_attr.ctime = inode->i_ctime;
if (v4 && IS_I_VERSION(inode)) {
fhp->fh_post_attr.change_cookie = inode_query_iversion(inode);
fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE;
}
} else
fhp->fh_post_saved = true;
if (v4)
......@@ -748,3 +756,37 @@ enum fsid_source fsid_source(const struct svc_fh *fhp)
return FSIDSOURCE_UUID;
return FSIDSOURCE_DEV;
}
/*
 * Build the NFSv4 change attribute from a kstat.
 *
 * i_version on its own would be enough, except that on a regular file it
 * can go backwards after an unclean shutdown. Going backwards is not a
 * problem in itself, but a later increment could then reproduce a value
 * that had already been handed out before the boot, and a client holding
 * both values could wrongly conclude that nothing changed.
 *
 * Mixing the ctime into the counter guarantees that, as long as time does
 * not go backwards, an old value is never reused. A filesystem that
 * advertises STATX_ATTR_CHANGE_MONOTONIC does not need this mitigation.
 *
 * The mitigation is also limited to regular files. For directories we
 * assume the new change attribute is always logged to stable storage in
 * some fashion before the results can be seen.
 *
 * When the kstat carries no change cookie at all, the value is derived
 * from the ctime alone.
 */
u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode)
{
	u64 chattr;

	/* No cookie was reported: fall back to a ctime-based value. */
	if (!(stat->result_mask & STATX_CHANGE_COOKIE))
		return time_to_chattr(&stat->ctime);

	chattr = stat->change_cookie;

	/* Non-regular files and monotonic cookies are used unmodified. */
	if (!S_ISREG(inode->i_mode) ||
	    (stat->attributes & STATX_ATTR_CHANGE_MONOTONIC))
		return chattr;

	/* Fold the ctime in so a rolled-back counter cannot repeat a value. */
	chattr += (u64)stat->ctime.tv_sec << 30;
	chattr += stat->ctime.tv_nsec;

	return chattr;
}
......@@ -293,34 +293,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
fhp->fh_pre_saved = false;
}
/*
 * i_version alone could serve as the change attribute, but it can go
 * backwards after a reboot. That is not necessarily a problem by itself;
 * the trouble starts when it goes backwards and is then incremented again,
 * because it may reuse a value that was already used before the boot. A
 * client that queried both values might then incorrectly assume nothing
 * changed.
 *
 * Combining the ctime with the i_version counter guarantees that an old
 * value is never reused as long as time doesn't go backwards.
 *
 * Order of preference: the export's fetch_iversion operation if it has
 * one, then ctime combined with i_version for IS_I_VERSION inodes, then a
 * value derived from the ctime alone.
 */
static inline u64 nfsd4_change_attribute(struct kstat *stat,
					 struct inode *inode)
{
	u64 chattr;

	if (inode->i_sb->s_export_op->fetch_iversion)
		return inode->i_sb->s_export_op->fetch_iversion(inode);

	if (!IS_I_VERSION(inode))
		return time_to_chattr(&stat->ctime);

	/* Seconds go in the upper bits, then nanoseconds and counter are added. */
	chattr = stat->ctime.tv_sec;
	chattr <<= 30;
	chattr += stat->ctime.tv_nsec;
	chattr += inode_query_iversion(inode);

	return chattr;
}
u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
extern void fh_fill_pre_attrs(struct svc_fh *fhp);
extern void fh_fill_post_attrs(struct svc_fh *fhp);
extern void fh_fill_both_attrs(struct svc_fh *fhp);
......
......@@ -170,9 +170,14 @@ static inline void fh_drop_write(struct svc_fh *fh)
static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
{
u32 request_mask = STATX_BASIC_STATS;
struct path p = {.mnt = fh->fh_export->ex_path.mnt,
.dentry = fh->fh_dentry};
return nfserrno(vfs_getattr(&p, stat, STATX_BASIC_STATS,
if (fh->fh_maxsize == NFS4_FHSIZE)
request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE);
return nfserrno(vfs_getattr(&p, stat, request_mask,
AT_STATX_SYNC_AS_STAT));
}
......
......@@ -18,6 +18,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/compat.h>
#include <linux/iversion.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
......@@ -122,6 +123,11 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
STATX_ATTR_DAX);
if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
stat->result_mask |= STATX_CHANGE_COOKIE;
stat->change_cookie = inode_query_iversion(inode);
}
mnt_userns = mnt_user_ns(path->mnt);
if (inode->i_op->getattr)
return inode->i_op->getattr(mnt_userns, path, stat,
......@@ -602,9 +608,11 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
memset(&tmp, 0, sizeof(tmp));
tmp.stx_mask = stat->result_mask;
/* STATX_CHANGE_COOKIE is kernel-only for now */
tmp.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE;
tmp.stx_blksize = stat->blksize;
tmp.stx_attributes = stat->attributes;
/* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */
tmp.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC;
tmp.stx_nlink = stat->nlink;
tmp.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
tmp.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
......@@ -643,6 +651,11 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
return -EINVAL;
/* STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
* from userland.
*/
mask &= ~STATX_CHANGE_COOKIE;
error = vfs_statx(dfd, filename, flags, &stat, mask);
if (error)
return error;
......
......@@ -213,7 +213,6 @@ struct export_operations {
bool write, u32 *device_generation);
int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
int nr_iomaps, struct iattr *iattr);
u64 (*fetch_iversion)(struct inode *);
#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */
#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */
#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */
......
......@@ -9,8 +9,26 @@
* ---------------------------
* The change attribute (i_version) is mandated by NFSv4 and is mostly for
* knfsd, but is also used for other purposes (e.g. IMA). The i_version must
* appear different to observers if there was a change to the inode's data or
* metadata since it was last queried.
* appear larger to observers if there was an explicit change to the inode's
* data or metadata since it was last queried.
*
* An explicit change is one that would ordinarily result in a change to the
* inode status change time (aka ctime). i_version must appear to change, even
* if the ctime does not (since the whole point is to avoid missing updates due
* to timestamp granularity). If POSIX or other relevant spec mandates that the
* ctime must change due to an operation, then the i_version counter must be
* incremented as well.
*
* Making the i_version update completely atomic with the operation itself would
* be prohibitively expensive. Traditionally the kernel has updated the times on
* directories after an operation that changes its contents. For regular files,
* the ctime is usually updated before the data is copied into the cache for a
* write. This means that there is a window of time when an observer can
* associate a new timestamp with old file contents. Since the purpose of the
* i_version is to allow for better cache coherency, the i_version must always
* be updated after the results of the operation are visible. Updating it before
* and after a change is also permitted. (Note that no filesystems currently do
* this. Fixing that is a work-in-progress).
*
* Observers see the i_version as a 64-bit number that never decreases. If it
* remains the same since it was last checked, then nothing has changed in the
......@@ -234,42 +252,6 @@ inode_peek_iversion(const struct inode *inode)
return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT;
}
/**
* inode_query_iversion - read i_version for later use
* @inode: inode from which i_version should be read
*
* Read the inode i_version counter. This should be used by callers that wish
* to store the returned i_version for later comparison. This will guarantee
* that a later query of the i_version will result in a different value if
* anything has changed.
*
* In this implementation, we fetch the current value, set the QUERIED flag and
* then try to swap it into place with a cmpxchg, if it wasn't already set. If
* that fails, we try again with the newly fetched value from the cmpxchg.
*
* Return: the raw i_version value that was observed, shifted right by
* I_VERSION_QUERIED_SHIFT.
*/
static inline u64
inode_query_iversion(struct inode *inode)
{
u64 cur, new;
/* Start from the raw word; the QUERIED flag is tested on it directly. */
cur = inode_peek_iversion_raw(inode);
do {
/* If flag is already set, then no need to swap */
if (cur & I_VERSION_QUERIED) {
/*
* This barrier (and the implicit barrier in the
* cmpxchg below) pairs with the barrier in
* inode_maybe_inc_iversion().
*/
smp_mb();
break;
}
/* Flag not set yet: propose the same value with QUERIED added. */
new = cur | I_VERSION_QUERIED;
/*
* A failed cmpxchg leaves the newly fetched value in cur, so the flag
* test above is repeated against it on the next pass.
*/
} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
/* Callers get the value shifted right by I_VERSION_QUERIED_SHIFT. */
return cur >> I_VERSION_QUERIED_SHIFT;
}
/*
* For filesystems without any sort of change attribute, the best we can
* do is fake one up from the ctime:
......@@ -283,6 +265,8 @@ static inline u64 time_to_chattr(struct timespec64 *t)
return chattr;
}
u64 inode_query_iversion(struct inode *inode);
/**
* inode_eq_iversion_raw - check whether the raw i_version counter has changed
* @inode: inode to check
......
......@@ -52,6 +52,15 @@ struct kstat {
u64 mnt_id;
u32 dio_mem_align;
u32 dio_offset_align;
u64 change_cookie;
};
/* These definitions are internal to the kernel for now. Mainly used by nfsd. */
/* mask values */
#define STATX_CHANGE_COOKIE 0x40000000U /* Want/got stx_change_attr */
/* file attribute values */
#define STATX_ATTR_CHANGE_MONOTONIC 0x8000000000000000ULL /* version monotonically increases */
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment