Commit 1f2300a7 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'v6.5/vfs.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs file handling updates from Christian Brauner:
 "This contains Amir's work to fix a long-standing problem where an
  unprivileged overlayfs mount can be used to avoid fanotify permission
  events that were requested for an inode or superblock on the
  underlying filesystem.

  Some background about files opened in overlayfs. If a file is opened
  in overlayfs @file->f_path will refer to a "fake" path. What this
  means is that while @file->f_inode will refer to inode of the
  underlying layer, @file->f_path refers to an overlayfs
  {dentry,vfsmount} pair. The reasons for doing this are out of scope
  here but it is the reason why the vfs has been providing the
  open_with_fake_path() helper for overlayfs for very long time now. So
  nothing new here.

  This is for sure not very elegant and everyone including the overlayfs
  maintainers agree. Improving this significantly would involve more
  fragile and potentially rather invasive changes.

  In various codepaths access to the path of the underlying filesystem
  is needed for such hybrid file. The best example is fsnotify where
  this becomes security relevant. Passing the overlayfs
  @file->f_path->dentry will cause fsnotify to skip generating fsnotify
  events registered on the underlying inode or superblock.

  To fix this we extend the vfs provided open_with_fake_path() concept
  for overlayfs to create a backing file container that holds the real
  path and to expose a helper that can be used by relevant callers to
  get access to the path of the underlying filesystem through the new
  file_real_path() helper. This pattern is similar to what we do in
  d_real() and d_real_inode().

  The first beneficiary is fsnotify and fixes the security sensitive
  problem mentioned above.

  There's a couple of nice cleanups included as well.

  Over time, the old open_with_fake_path() helper added specifically for
  overlayfs a long time ago started to get used in other places such as
  cachefiles. Even though cachefiles have nothing to do with hybrid
  files.

  The only reason cachefiles used that concept was that files opened
  with open_with_fake_path() aren't charged against the caller's open
  file limit by raising FMODE_NOACCOUNT. It's just mere coincidence that
  both overlayfs and cachefiles need to ensure to not overcharge the
  caller for their internal open calls.

  So this work disentangles FMODE_NOACCOUNT use cases and backing file
  use-cases by adding the FMODE_BACKING flag which indicates that the
  file can be used to retrieve the backing file of another filesystem.
  (Fyi, Jens will be sending you a really nice cleanup from Christoph
  that gets rid of 3 FMODE_* flags otherwise this would be the last
  fmode_t bit we'd be using.)

  So now overlayfs becomes the sole user of the renamed
  open_with_fake_path() helper which is now named backing_file_open().
  For internal kernel users such as cachefiles that are only interested
  in FMODE_NOACCOUNT but not in FMODE_BACKING we add a new
  kernel_file_open() helper which opens a file without being charged
  against the caller's open file limit. All new helpers are properly
  documented and clearly annotated to mention their special uses.

  We also rename vfs_tmpfile_open() to kernel_tmpfile_open() to clearly
  distinguish it from vfs_tmpfile() and align it the other kernel_*()
  internal helpers"

* tag 'v6.5/vfs.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  ovl: enable fsnotify events on underlying real files
  fs: use backing_file container for internal files with "fake" f_path
  fs: move kmem_cache_zalloc() into alloc_empty_file*() helpers
  fs: use a helper for opening kernel internal files
  fs: rename {vfs,kernel}_tmpfile_open()
parents 2eedfa9e bc2473c9
......@@ -451,7 +451,7 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object)
ret = cachefiles_inject_write_error();
if (ret == 0) {
file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath,
file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath,
S_IFREG | 0600,
O_RDWR | O_LARGEFILE | O_DIRECT,
cache->cache_cred);
......@@ -561,7 +561,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
*/
path.mnt = cache->mnt;
path.dentry = dentry;
file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
d_backing_inode(dentry), cache->cache_cred);
if (IS_ERR(file)) {
trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
......
......@@ -44,18 +44,40 @@ static struct kmem_cache *filp_cachep __read_mostly;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional real path */
struct backing_file {
struct file file;
struct path real_path;
};
static inline struct backing_file *backing_file(struct file *f)
{
return container_of(f, struct backing_file, file);
}
struct path *backing_file_real_path(struct file *f)
{
return &backing_file(f)->real_path;
}
EXPORT_SYMBOL_GPL(backing_file_real_path);
static void file_free_rcu(struct rcu_head *head)
{
struct file *f = container_of(head, struct file, f_rcuhead);
put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING))
kfree(backing_file(f));
else
kmem_cache_free(filp_cachep, f);
}
static inline void file_free(struct file *f)
{
security_file_free(f);
if (!(f->f_mode & FMODE_NOACCOUNT))
if (unlikely(f->f_mode & FMODE_BACKING))
path_put(backing_file_real_path(f));
if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
percpu_counter_dec(&nr_files);
call_rcu(&f->f_rcuhead, file_free_rcu);
}
......@@ -131,20 +153,15 @@ static int __init init_fs_stat_sysctls(void)
fs_initcall(init_fs_stat_sysctls);
#endif
static struct file *__alloc_file(int flags, const struct cred *cred)
static int init_file(struct file *f, int flags, const struct cred *cred)
{
struct file *f;
int error;
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
f->f_cred = get_cred(cred);
error = security_file_alloc(f);
if (unlikely(error)) {
file_free_rcu(&f->f_rcuhead);
return ERR_PTR(error);
return error;
}
atomic_long_set(&f->f_count, 1);
......@@ -155,7 +172,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
return f;
return 0;
}
/* Find an unused file structure and return a pointer to it.
......@@ -172,6 +189,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
{
static long old_max;
struct file *f;
int error;
/*
* Privileged users can go above max_files
......@@ -185,8 +203,14 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
f = __alloc_file(flags, cred);
if (!IS_ERR(f))
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
error = init_file(f, flags, cred);
if (unlikely(error))
return ERR_PTR(error);
percpu_counter_inc(&nr_files);
return f;
......@@ -203,18 +227,51 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
/*
* Variant of alloc_empty_file() that doesn't check and modify nr_files.
*
* Should not be used unless there's a very good reason to do so.
* This is only for kernel internal use, and the allocate file must not be
* installed into file tables or such.
*/
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
struct file *f = __alloc_file(flags, cred);
struct file *f;
int error;
f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
error = init_file(f, flags, cred);
if (unlikely(error))
return ERR_PTR(error);
if (!IS_ERR(f))
f->f_mode |= FMODE_NOACCOUNT;
return f;
}
/*
* Variant of alloc_empty_file() that allocates a backing_file container
* and doesn't check and modify nr_files.
*
* This is only for kernel internal use, and the allocate file must not be
* installed into file tables or such.
*/
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
struct backing_file *ff;
int error;
ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
if (unlikely(!ff))
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
if (unlikely(error))
return ERR_PTR(error);
ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
return &ff->file;
}
/**
* alloc_file - allocate and initialize a 'struct file'
*
......
......@@ -97,8 +97,9 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
*/
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
static inline void put_file_access(struct file *file)
{
......
......@@ -3703,7 +3703,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap,
}
/**
* vfs_tmpfile_open - open a tmpfile for kernel internal use
* kernel_tmpfile_open - open a tmpfile for kernel internal use
* @idmap: idmap of the mount the inode was found from
* @parentpath: path of the base directory
* @mode: mode of the new tmpfile
......@@ -3714,24 +3714,26 @@ static int vfs_tmpfile(struct mnt_idmap *idmap,
* hence this is only for kernel internal use, and must not be installed into
* file tables or such.
*/
struct file *vfs_tmpfile_open(struct mnt_idmap *idmap,
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
const struct path *parentpath,
umode_t mode, int open_flag, const struct cred *cred)
umode_t mode, int open_flag,
const struct cred *cred)
{
struct file *file;
int error;
file = alloc_empty_file_noaccount(open_flag, cred);
if (!IS_ERR(file)) {
if (IS_ERR(file))
return file;
error = vfs_tmpfile(idmap, parentpath, file, mode);
if (error) {
fput(file);
file = ERR_PTR(error);
}
}
return file;
}
EXPORT_SYMBOL(vfs_tmpfile_open);
EXPORT_SYMBOL(kernel_tmpfile_open);
static int do_tmpfile(struct nameidata *nd, unsigned flags,
const struct open_flags *op,
......
......@@ -1108,23 +1108,77 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode,
}
EXPORT_SYMBOL(dentry_create);
struct file *open_with_fake_path(const struct path *path, int flags,
/**
* kernel_file_open - open a file for kernel internal use
* @path: path of the file to open
* @flags: open flags
* @inode: the inode
* @cred: credentials for open
*
* Open a file for use by in-kernel consumers. The file is not accounted
* against nr_files and must not be installed into the file descriptor
* table.
*
* Return: Opened file on success, an error pointer on failure.
*/
struct file *kernel_file_open(const struct path *path, int flags,
struct inode *inode, const struct cred *cred)
{
struct file *f = alloc_empty_file_noaccount(flags, cred);
if (!IS_ERR(f)) {
struct file *f;
int error;
f = alloc_empty_file_noaccount(flags, cred);
if (IS_ERR(f))
return f;
f->f_path = *path;
error = do_dentry_open(f, inode, NULL);
if (error) {
fput(f);
f = ERR_PTR(error);
}
return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
/**
* backing_file_open - open a backing file for kernel internal use
* @path: path of the file to open
* @flags: open flags
* @path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
* @path may be on the stackable filesystem and backing inode on the
* underlying filesystem. In this case, we want to be able to return
* the @real_path of the backing inode. This is done by embedding the
* returned file into a container structure that also stores the path of
* the backing inode on the underlying filesystem, which can be
* retrieved using backing_file_real_path().
*/
struct file *backing_file_open(const struct path *path, int flags,
const struct path *real_path,
const struct cred *cred)
{
struct file *f;
int error;
f = alloc_empty_backing_file(flags, cred);
if (IS_ERR(f))
return f;
f->f_path = *path;
path_get(real_path);
*backing_file_real_path(f) = *real_path;
error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
if (error) {
fput(f);
f = ERR_PTR(error);
}
return f;
}
EXPORT_SYMBOL(open_with_fake_path);
EXPORT_SYMBOL_GPL(backing_file_open);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
......
......@@ -34,8 +34,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
/* No atime modification nor notify on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME | __FMODE_NONOTIFY)
/* No atime modification on underlying */
#define OVL_OPEN_FLAGS (O_NOATIME)
static struct file *ovl_open_realfile(const struct file *file,
const struct path *realpath)
......@@ -61,7 +61,7 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
realfile = open_with_fake_path(&file->f_path, flags, realinode,
realfile = backing_file_open(&file->f_path, flags, realpath,
current_cred());
}
revert_creds(old_cred);
......
......@@ -329,8 +329,9 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs,
struct dentry *dentry, umode_t mode)
{
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry };
struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode,
O_LARGEFILE | O_WRONLY, current_cred());
struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path,
mode, O_LARGEFILE | O_WRONLY,
current_cred());
int err = PTR_ERR_OR_ZERO(file);
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
......
......@@ -171,6 +171,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports non-exclusive O_DIRECT writes from multiple threads */
#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
/* File is embedded in backing_file object */
#define FMODE_BACKING ((__force fmode_t)0x2000000)
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
......@@ -1678,9 +1681,12 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap,
WHITEOUT_DEV);
}
struct file *vfs_tmpfile_open(struct mnt_idmap *idmap,
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
const struct path *parentpath,
umode_t mode, int open_flag, const struct cred *cred);
umode_t mode, int open_flag,
const struct cred *cred);
struct file *kernel_file_open(const struct path *path, int flags,
struct inode *inode, const struct cred *cred);
int vfs_mkobj(struct dentry *, umode_t,
int (*f)(struct dentry *, umode_t, void *),
......@@ -2355,11 +2361,31 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root},
name, flags, mode);
}
extern struct file * dentry_open(const struct path *, int, const struct cred *);
extern struct file *dentry_create(const struct path *path, int flags,
umode_t mode, const struct cred *cred);
extern struct file * open_with_fake_path(const struct path *, int,
struct inode*, const struct cred *);
struct file *dentry_open(const struct path *path, int flags,
const struct cred *creds);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
const struct cred *cred);
struct file *backing_file_open(const struct path *path, int flags,
const struct path *real_path,
const struct cred *cred);
struct path *backing_file_real_path(struct file *f);
/*
* file_real_path - get the path corresponding to f_inode
*
* When opening a backing file for a stackable filesystem (e.g.,
* overlayfs) f_path may be on the stackable filesystem and f_inode on
* the underlying filesystem. When the path associated with f_inode is
* needed, this helper should be used instead of accessing f_path
* directly.
*/
static inline const struct path *file_real_path(struct file *f)
{
if (unlikely(f->f_mode & FMODE_BACKING))
return backing_file_real_path(f);
return &f->f_path;
}
static inline struct file *file_clone_open(struct file *file)
{
return dentry_open(&file->f_path, file->f_flags, file->f_cred);
......
......@@ -91,11 +91,13 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
static inline int fsnotify_file(struct file *file, __u32 mask)
{
const struct path *path = &file->f_path;
const struct path *path;
if (file->f_mode & FMODE_NONOTIFY)
return 0;
/* Overlayfs internal files have fake f_path */
path = file_real_path(file);
return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment