Commit 9caf6961 authored by Imran Khan's avatar Imran Khan Committed by Greg Kroah-Hartman

kernfs: Introduce separate rwsem to protect inode attributes.

Right now a global per-fs rwsem (kernfs_rwsem) synchronizes multiple
kernfs operations. On a large system with few hundred CPUs and few
hundred applications simultaneoulsy trying to access sysfs, this
results in multiple sys_open(s) contending on kernfs_rwsem via
kernfs_iop_permission and kernfs_dop_revalidate.

For example on a system with 384 cores, if I run 200 instances of an
application which is mostly executing the following loop:

  for (int loop = 0; loop <100 ; loop++)
  {
    for (int port_num = 1; port_num < 2; port_num++)
    {
      for (int gid_index = 0; gid_index < 254; gid_index++ )
      {
        char ret_buf[64], ret_buf_lo[64];
        char gid_file_path[1024];

        int      ret_len;
        int      ret_fd;
        ssize_t  ret_rd;

        ub4  i, saved_errno;

        memset(ret_buf, 0, sizeof(ret_buf));
        memset(gid_file_path, 0, sizeof(gid_file_path));

        ret_len = snprintf(gid_file_path, sizeof(gid_file_path),
                           "/sys/class/infiniband/%s/ports/%d/gids/%d",
                           dev_name,
                           port_num,
                           gid_index);

        ret_fd = open(gid_file_path, O_RDONLY | O_CLOEXEC);
        if (ret_fd < 0)
        {
          printf("Failed to open %s\n", gid_file_path);
          continue;
        }

        /* Read the GID */
        ret_rd = read(ret_fd, ret_buf, 40);

        if (ret_rd == -1)
        {
          printf("Failed to read from file %s, errno: %u\n",
                 gid_file_path, saved_errno);

          continue;
        }

        close(ret_fd);
      }
    }

I see contention around kernfs_rwsem as follows:

path_openat
|
|----link_path_walk.part.0.constprop.0
|      |
|      |--49.92%--inode_permission
|      |          |
|      |           --48.69%--kernfs_iop_permission
|      |                     |
|      |                     |--18.16%--down_read
|      |                     |
|      |                     |--15.38%--up_read
|      |                     |
|      |                      --14.58%--_raw_spin_lock
|      |                                |
|      |                                 -----
|      |
|      |--29.08%--walk_component
|      |          |
|      |           --29.02%--lookup_fast
|      |                     |
|      |                     |--24.26%--kernfs_dop_revalidate
|      |                     |          |
|      |                     |          |--14.97%--down_read
|      |                     |          |
|      |                     |           --9.01%--up_read
|      |                     |
|      |                      --4.74%--__d_lookup
|      |                                |
|      |                                 --4.64%--_raw_spin_lock
|      |                                           |
|      |                                            ----

Having a separate per-fs rwsem to protect kernfs inode attributes,
will avoid the above mentioned contention and result in better
performance as can bee seen below:

path_openat
|
|----link_path_walk.part.0.constprop.0
|     |
|     |
|     |--27.06%--inode_permission
|     |          |
|     |           --25.84%--kernfs_iop_permission
|     |                     |
|     |                     |--9.29%--up_read
|     |                     |
|     |                     |--8.19%--down_read
|     |                     |
|     |                      --7.89%--_raw_spin_lock
|     |                                |
|     |                                 ----
|     |
|     |--22.42%--walk_component
|     |          |
|     |           --22.36%--lookup_fast
|     |                     |
|     |                     |--16.07%--__d_lookup
|     |                     |          |
|     |                     |           --16.01%--_raw_spin_lock
|     |                     |                     |
|     |                     |                      ----
|     |                     |
|     |                      --6.28%--kernfs_dop_revalidate
|     |                                |
|     |                                |--3.76%--down_read
|     |                                |
|     |                                 --2.26%--up_read

As can be seen from the above data the overhead due to both
kerfs_iop_permission and kernfs_dop_revalidate have gone down and
this also reduces overall run time of the earlier mentioned loop.
Signed-off-by: default avatarImran Khan <imran.f.khan@oracle.com>
Link: https://lore.kernel.org/r/20230309110932.2889010-2-imran.f.khan@oracle.comSigned-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 02fe26f2
......@@ -770,12 +770,15 @@ int kernfs_add_one(struct kernfs_node *kn)
goto out_unlock;
/* Update timestamps on the parent */
down_write(&root->kernfs_iattr_rwsem);
ps_iattr = parent->iattr;
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
up_write(&root->kernfs_iattr_rwsem);
up_write(&root->kernfs_rwsem);
/*
......@@ -940,6 +943,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
idr_init(&root->ino_idr);
init_rwsem(&root->kernfs_rwsem);
init_rwsem(&root->kernfs_iattr_rwsem);
INIT_LIST_HEAD(&root->supers);
/*
......@@ -1462,11 +1466,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
pos->parent ? pos->parent->iattr : NULL;
/* update timestamps on the parent */
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (ps_iattr) {
ktime_get_real_ts64(&ps_iattr->ia_ctime);
ps_iattr->ia_mtime = ps_iattr->ia_ctime;
}
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
kernfs_put(pos);
}
......
......@@ -101,9 +101,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
int ret;
struct kernfs_root *root = kernfs_root(kn);
down_write(&root->kernfs_rwsem);
down_write(&root->kernfs_iattr_rwsem);
ret = __kernfs_setattr(kn, iattr);
up_write(&root->kernfs_rwsem);
up_write(&root->kernfs_iattr_rwsem);
return ret;
}
......@@ -119,7 +119,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return -EINVAL;
root = kernfs_root(kn);
down_write(&root->kernfs_rwsem);
down_write(&root->kernfs_iattr_rwsem);
error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
if (error)
goto out;
......@@ -132,7 +132,7 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
setattr_copy(&nop_mnt_idmap, inode, iattr);
out:
up_write(&root->kernfs_rwsem);
up_write(&root->kernfs_iattr_rwsem);
return error;
}
......@@ -189,10 +189,10 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
struct kernfs_node *kn = inode->i_private;
struct kernfs_root *root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&nop_mnt_idmap, inode, stat);
up_read(&root->kernfs_rwsem);
up_read(&root->kernfs_iattr_rwsem);
return 0;
}
......@@ -285,10 +285,10 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
kn = inode->i_private;
root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&nop_mnt_idmap, inode, mask);
up_read(&root->kernfs_rwsem);
up_read(&root->kernfs_iattr_rwsem);
return ret;
}
......
......@@ -47,6 +47,7 @@ struct kernfs_root {
wait_queue_head_t deactivate_waitq;
struct rw_semaphore kernfs_rwsem;
struct rw_semaphore kernfs_iattr_rwsem;
};
/* +1 to avoid triggering overflow warning when negating it */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment