Commit f41def39 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - automatic recovery of a blacklisted filesystem session (Zheng Yan).
     This is disabled by default and can be enabled by mounting with the
     new "recover_session=clean" option.

   - serialize buffered reads and O_DIRECT writes (Jeff Layton). Care is
     taken to avoid serializing O_DIRECT reads and writes with each
     other, this is based on the exclusion scheme from NFS.

   - handle large osdmaps better in the face of fragmented memory
     (myself)

   - don't limit what security.* xattrs can be retrieved or set (Jeff Layton).
     We were overly restrictive here, unnecessarily preventing things
     like file capability sets stored in security.capability from
     working.

   - allow copy_file_range() within the same inode and across different
     filesystems within the same cluster (Luis Henriques)"

* tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client: (41 commits)
  ceph: call ceph_mdsc_destroy from destroy_fs_client
  libceph: use ceph_kvmalloc() for osdmap arrays
  libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc()
  ceph: allow object copies across different filesystems in the same cluster
  ceph: include ceph_debug.h in cache.c
  ceph: move static keyword to the front of declarations
  rbd: pull rbd_img_request_create() dout out into the callers
  ceph: reconnect connection if session hang in opening state
  libceph: drop unused con parameter of calc_target()
  ceph: use release_pages() directly
  rbd: fix response length parameter for encoded strings
  ceph: allow arbitrary security.* xattrs
  ceph: only set CEPH_I_SEC_INITED if we got a MAC label
  ceph: turn ceph_security_invalidate_secctx into static inline
  ceph: add buffered/direct exclusionary locking for reads and writes
  libceph: handle OSD op ceph_pagelist_append() errors
  ceph: don't return a value from void function
  ceph: don't freeze during write page faults
  ceph: update the mtime when truncating up
  ceph: fix indentation in __get_snap_name()
  ...
parents 7b1373dd 3ee5a701
...@@ -158,6 +158,20 @@ Mount Options ...@@ -158,6 +158,20 @@ Mount Options
copies. Currently, it's only used in copy_file_range, which will revert copies. Currently, it's only used in copy_file_range, which will revert
to the default VFS implementation if this option is used. to the default VFS implementation if this option is used.
recover_session=<no|clean>
Set auto reconnect mode in the case where the client is blacklisted. The
available modes are "no" and "clean". The default is "no".
* no: never attempt to reconnect when client detects that it has been
blacklisted. Operations will generally fail after being blacklisted.
* clean: client reconnects to the ceph cluster automatically when it
detects that it has been blacklisted. During reconnect, client drops
dirty data/metadata, invalidates page caches and writable file handles.
After reconnect, file locks become stale because the MDS loses track
of them. If an inode contains any stale file locks, read/write on the
inode is not allowed until applications release all stale file locks.
More Information More Information
================ ================
......
...@@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create( ...@@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create(
mutex_init(&img_request->state_mutex); mutex_init(&img_request->state_mutex);
kref_init(&img_request->kref); kref_init(&img_request->kref);
dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
obj_op_name(op_type), img_request);
return img_request; return img_request;
} }
...@@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) ...@@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
__set_bit(IMG_REQ_CHILD, &child_img_req->flags); __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
child_img_req->obj_request = obj_req; child_img_req->obj_request = obj_req;
dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
obj_req);
if (!rbd_img_is_write(img_req)) { if (!rbd_img_is_write(img_req)) {
switch (img_req->data_type) { switch (img_req->data_type) {
case OBJ_REQUEST_BIO: case OBJ_REQUEST_BIO:
...@@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work) ...@@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work)
img_request->rq = rq; img_request->rq = rq;
snapc = NULL; /* img_request consumes a ref */ snapc = NULL; /* img_request consumes a ref */
dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
img_request, obj_op_name(op_type), offset, length);
if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
result = rbd_img_fill_nodata(img_request, offset, length); result = rbd_img_fill_nodata(img_request, offset, length);
else else
...@@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) ...@@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{ {
size_t size;
void *reply_buf; void *reply_buf;
int ret; int ret;
void *p; void *p;
reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); /* Response will be an encoded string, which includes a length */
size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
reply_buf = kzalloc(size, GFP_KERNEL);
if (!reply_buf) if (!reply_buf)
return -ENOMEM; return -ENOMEM;
ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
&rbd_dev->header_oloc, "get_object_prefix", &rbd_dev->header_oloc, "get_object_prefix",
NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); NULL, 0, reply_buf, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret < 0) if (ret < 0)
goto out; goto out;
...@@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ...@@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
dout("rbd id object name is %s\n", oid.name); dout("rbd id object name is %s\n", oid.name);
/* Response will be an encoded string, which includes a length */ /* Response will be an encoded string, which includes a length */
size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
response = kzalloc(size, GFP_NOIO); response = kzalloc(size, GFP_NOIO);
if (!response) { if (!response) {
...@@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ...@@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
"get_id", NULL, 0, "get_id", NULL, 0,
response, RBD_IMAGE_ID_LEN_MAX); response, size);
dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
if (ret == -ENOENT) { if (ret == -ENOENT) {
image_id = kstrdup("", GFP_KERNEL); image_id = kstrdup("", GFP_KERNEL);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
obj-$(CONFIG_CEPH_FS) += ceph.o obj-$(CONFIG_CEPH_FS) += ceph.o
ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o quota.o \ export.o caps.o snap.o xattr.o quota.o io.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o debugfs.o
......
...@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) ...@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
{ {
struct inode *inode = file_inode(filp); struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc = struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
&ceph_inode_to_client(inode)->client->osdc;
int err = 0; int err = 0;
u64 off = page_offset(page); u64 off = page_offset(page);
u64 len = PAGE_SIZE; u64 len = PAGE_SIZE;
...@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) ...@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
dout("readpage inode %p file %p page %p index %lu\n", dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index); inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
off, &len, &ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size, ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0); &page, 1, 0);
if (err == -ENOENT) if (err == -ENOENT)
...@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) ...@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
if (err < 0) { if (err < 0) {
SetPageError(page); SetPageError(page);
ceph_fscache_readpage_cancel(inode, page); ceph_fscache_readpage_cancel(inode, page);
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
goto out; goto out;
} }
if (err < PAGE_SIZE) if (err < PAGE_SIZE)
...@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) ...@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
int i; int i;
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
if (rc == -EBLACKLISTED)
ceph_inode_to_client(inode)->blacklisted = true;
/* unlock all pages, zeroing any data we didn't read */ /* unlock all pages, zeroing any data we didn't read */
osd_data = osd_req_op_extent_osd_data(req, 0); osd_data = osd_req_op_extent_osd_data(req, 0);
...@@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, ...@@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
/* caller of readpages does not hold buffer and read caps /* caller of readpages does not hold buffer and read caps
* (fadvise, madvise and readahead cases) */ * (fadvise, madvise and readahead cases) */
int want = CEPH_CAP_FILE_CACHE; int want = CEPH_CAP_FILE_CACHE;
ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
true, &got);
if (ret < 0) { if (ret < 0) {
dout("start_read %p, error getting cap\n", inode); dout("start_read %p, error getting cap\n", inode);
} else if (!(got & want)) { } else if (!(got & want)) {
...@@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, ...@@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode,
/* /*
* Write a single page, but leave the page locked. * Write a single page, but leave the page locked.
* *
* If we get a write error, set the page error bit, but still adjust the * If we get a write error, mark the mapping for error, but still adjust the
* dirty page accounting (i.e., page is no longer dirty). * dirty page accounting (i.e., page is no longer dirty).
*/ */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc) static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
...@@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page); end_page_writeback(page);
return err; return err;
} }
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
dout("writepage setting page/mapping error %d %p\n", dout("writepage setting page/mapping error %d %p\n",
err, page); err, page);
SetPageError(page);
mapping_set_error(&inode->i_data, err); mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++; wbc->pages_skipped++;
} else { } else {
...@@ -679,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) ...@@ -679,23 +684,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
return err; return err;
} }
/*
* lame release_pages helper. release_pages() isn't exported to
* modules.
*/
static void ceph_release_pages(struct page **pages, int num)
{
struct pagevec pvec;
int i;
pagevec_init(&pvec);
for (i = 0; i < num; i++) {
if (pagevec_add(&pvec, pages[i]) == 0)
pagevec_release(&pvec);
}
pagevec_release(&pvec);
}
/* /*
* async writeback completion handler. * async writeback completion handler.
* *
...@@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req) ...@@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req)
if (rc < 0) { if (rc < 0) {
mapping_set_error(mapping, rc); mapping_set_error(mapping, rc);
ceph_set_error_write(ci); ceph_set_error_write(ci);
if (rc == -EBLACKLISTED)
fsc->blacklisted = true;
} else { } else {
ceph_clear_error_write(ci); ceph_clear_error_write(ci);
} }
...@@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) ...@@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req)
dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
inode, osd_data->length, rc >= 0 ? num_pages : 0); inode, osd_data->length, rc >= 0 ? num_pages : 0);
ceph_release_pages(osd_data->pages, num_pages); release_pages(osd_data->pages, num_pages);
} }
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
...@@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ...@@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
want = CEPH_CAP_FILE_CACHE; want = CEPH_CAP_FILE_CACHE;
got = 0; got = 0;
err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
&got, &pinned_page);
if (err < 0) if (err < 0)
goto out_restore; goto out_restore;
...@@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ...@@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
if (!prealloc_cf) if (!prealloc_cf)
return VM_FAULT_OOM; return VM_FAULT_OOM;
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset); ceph_block_sigs(&oldset);
if (ci->i_inline_version != CEPH_INLINE_NONE) { if (ci->i_inline_version != CEPH_INLINE_NONE) {
...@@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ...@@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
want = CEPH_CAP_FILE_BUFFER; want = CEPH_CAP_FILE_BUFFER;
got = 0; got = 0;
err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
&got, NULL); &got, NULL);
if (err < 0) if (err < 0)
goto out_free; goto out_free;
...@@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ...@@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_cap_refs(ci, got); ceph_put_cap_refs(ci, got);
out_free: out_free:
ceph_restore_sigs(&oldset); ceph_restore_sigs(&oldset);
sb_end_pagefault(inode->i_sb);
ceph_free_cap_flush(prealloc_cf); ceph_free_cap_flush(prealloc_cf);
if (err < 0) if (err < 0)
ret = vmf_error(err); ret = vmf_error(err);
...@@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, ...@@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
if (err >= 0 || err == -ENOENT) if (err >= 0 || err == -ENOENT)
have |= POOL_READ; have |= POOL_READ;
else if (err != -EPERM) else if (err != -EPERM) {
if (err == -EBLACKLISTED)
fsc->blacklisted = true;
goto out_unlock; goto out_unlock;
}
if (err2 == 0 || err2 == -EEXIST) if (err2 == 0 || err2 == -EEXIST)
have |= POOL_WRITE; have |= POOL_WRITE;
else if (err2 != -EPERM) { else if (err2 != -EPERM) {
if (err2 == -EBLACKLISTED)
fsc->blacklisted = true;
err = err2; err = err2;
goto out_unlock; goto out_unlock;
} }
...@@ -1989,10 +1987,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, ...@@ -1989,10 +1987,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
return err; return err;
} }
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) int ceph_pool_perm_check(struct inode *inode, int need)
{ {
s64 pool; struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_string *pool_ns; struct ceph_string *pool_ns;
s64 pool;
int ret, flags; int ret, flags;
if (ci->i_vino.snap != CEPH_NOSNAP) { if (ci->i_vino.snap != CEPH_NOSNAP) {
...@@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) ...@@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
return 0; return 0;
} }
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), if (ceph_test_mount_opt(ceph_inode_to_client(inode),
NOPOOLPERM)) NOPOOLPERM))
return 0; return 0;
......
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
* Written by Milosz Tanski (milosz@adfin.com) * Written by Milosz Tanski (milosz@adfin.com)
*/ */
#include <linux/ceph/ceph_debug.h>
#include "super.h" #include "super.h"
#include "cache.h" #include "cache.h"
......
This diff is collapsed.
...@@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) ...@@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{ {
return 0;
} }
void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
......
...@@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { ...@@ -35,7 +35,7 @@ struct ceph_nfs_snapfh {
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode) struct inode *parent_inode)
{ {
const static int snap_handle_length = static const int snap_handle_length =
sizeof(struct ceph_nfs_snapfh) >> 2; sizeof(struct ceph_nfs_snapfh) >> 2;
struct ceph_nfs_snapfh *sfh = (void *)rawfh; struct ceph_nfs_snapfh *sfh = (void *)rawfh;
u64 snapid = ceph_snap(inode); u64 snapid = ceph_snap(inode);
...@@ -85,9 +85,9 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, ...@@ -85,9 +85,9 @@ static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode) struct inode *parent_inode)
{ {
const static int handle_length = static const int handle_length =
sizeof(struct ceph_nfs_fh) >> 2; sizeof(struct ceph_nfs_fh) >> 2;
const static int connected_handle_length = static const int connected_handle_length =
sizeof(struct ceph_nfs_confh) >> 2; sizeof(struct ceph_nfs_confh) >> 2;
int type; int type;
...@@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, ...@@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name,
if (err < 0) if (err < 0)
goto out; goto out;
rinfo = &req->r_reply_info; rinfo = &req->r_reply_info;
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
rde = rinfo->dir_entries + i; rde = rinfo->dir_entries + i;
BUG_ON(!rde->inode.in); BUG_ON(!rde->inode.in);
if (ceph_snap(inode) == if (ceph_snap(inode) ==
le64_to_cpu(rde->inode.in->snapid)) { le64_to_cpu(rde->inode.in->snapid)) {
memcpy(name, rde->name, rde->name_len); memcpy(name, rde->name, rde->name_len);
name[rde->name_len] = '\0'; name[rde->name_len] = '\0';
err = 0; err = 0;
goto out; goto out;
} }
} }
if (rinfo->dir_end) if (rinfo->dir_end)
break; break;
BUG_ON(rinfo->dir_nr <= 0); BUG_ON(rinfo->dir_nr <= 0);
rde = rinfo->dir_entries + (rinfo->dir_nr - 1); rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
next_offset += rinfo->dir_nr; next_offset += rinfo->dir_nr;
last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
if (!last_name) { if (!last_name) {
err = -ENOMEM; err = -ENOMEM;
goto out; goto out;
} }
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
req = NULL; req = NULL;
} }
err = -ENOENT; err = -ENOENT;
out: out:
......
This diff is collapsed.
...@@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ceph_fscache_inode_init(ci); ceph_fscache_inode_init(ci);
ci->i_meta_err = 0;
return &ci->vfs_inode; return &ci->vfs_inode;
} }
...@@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
/* update inode */ /* update inode */
inode->i_rdev = le32_to_cpu(info->rdev); inode->i_rdev = le32_to_cpu(info->rdev);
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; /* directories have fl_stripe_unit set to zero */
if (le32_to_cpu(info->layout.fl_stripe_unit))
inode->i_blkbits =
fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
else
inode->i_blkbits = CEPH_BLOCK_SHIFT;
__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
...@@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { ...@@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = {
int __ceph_setattr(struct inode *inode, struct iattr *attr) int __ceph_setattr(struct inode *inode, struct iattr *attr)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
const unsigned int ia_valid = attr->ia_valid; unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf; struct ceph_cap_flush *prealloc_cf;
...@@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ...@@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
} }
} }
if (ia_valid & ATTR_SIZE) {
dout("setattr %p size %lld -> %lld\n", inode,
inode->i_size, attr->ia_size);
if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
ia_valid |= ATTR_MTIME;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
attr->ia_size != inode->i_size) {
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
req->r_args.setattr.old_size =
cpu_to_le64(inode->i_size);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
}
}
if (ia_valid & ATTR_MTIME) { if (ia_valid & ATTR_MTIME) {
dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
...@@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ...@@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
} }
} }
if (ia_valid & ATTR_SIZE) {
dout("setattr %p size %lld -> %lld\n", inode,
inode->i_size, attr->ia_size);
if ((issued & CEPH_CAP_FILE_EXCL) &&
attr->ia_size > inode->i_size) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
ci->i_reported_size = attr->ia_size;
dirtied |= CEPH_CAP_FILE_EXCL;
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
attr->ia_size != inode->i_size) {
req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
req->r_args.setattr.old_size =
cpu_to_le64(inode->i_size);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
}
}
/* these do nothing */ /* these do nothing */
if (ia_valid & ATTR_CTIME) { if (ia_valid & ATTR_CTIME) {
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Trond Myklebust
* Copyright (c) 2019 Jeff Layton
*
* I/O and data path helper functionality.
*
* Heavily borrowed from equivalent code in fs/nfs/io.c
*/
#include <linux/ceph/ceph_debug.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/rwsem.h>
#include <linux/fs.h>
#include "super.h"
#include "io.h"
/*
 * Take the inode out of O_DIRECT mode: clear CEPH_I_ODIRECT and then call
 * inode_dio_wait() on the inode.
 *
 * The caller must hold inode->i_rwsem exclusively.
 */
static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
{
	lockdep_assert_held_write(&inode->i_rwsem);

	/* Nothing to do unless the inode is currently flagged for direct I/O. */
	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
		return;

	/* The flag word is modified under i_ceph_lock. */
	spin_lock(&ci->i_ceph_lock);
	ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
	spin_unlock(&ci->i_ceph_lock);

	inode_dio_wait(inode);
}
/**
 * ceph_start_io_read - declare the file is being used for buffered reads
 * @inode: file inode
 *
 * Announce that a buffered read is about to begin and make sure direct I/O
 * is blocked for its duration.
 *
 * On return, CEPH_I_ODIRECT is clear and inode->i_rwsem is held shared, so
 * the flag cannot change underneath the reader. Buffered reads may therefore
 * run concurrently with one another, while a direct I/O operation has to
 * obtain the lock exclusively before it can set CEPH_I_ODIRECT.
 *
 * Buffered writes and truncates take inode->i_rwsem for write and are thus
 * serialised against these reads.
 */
void ceph_start_io_read(struct inode *inode)
{
	struct ceph_inode_info *cinfo = ceph_inode(inode);

	/* Fast path: take the shared lock and hope the flag is already clear. */
	down_read(&inode->i_rwsem);

	if (READ_ONCE(cinfo->i_ceph_flags) & CEPH_I_ODIRECT) {
		/*
		 * Slow path: drop the shared lock, retake it exclusively to
		 * clear the flag, then downgrade so the caller ends up with
		 * the same shared hold as on the fast path.
		 */
		up_read(&inode->i_rwsem);
		down_write(&inode->i_rwsem);
		ceph_block_o_direct(cinfo, inode);
		downgrade_write(&inode->i_rwsem);
	}
}
/**
 * ceph_end_io_read - declare that the buffered read operation is done
 * @inode: file inode
 *
 * Counterpart of ceph_start_io_read(): drops the shared hold on
 * inode->i_rwsem once the buffered read has finished.
 */
void ceph_end_io_read(struct inode *inode)
{
	up_read(&inode->i_rwsem);
}
/**
 * ceph_start_io_write - declare the file is being used for buffered writes
 * @inode: file inode
 *
 * Announce that a buffered write is about to begin and make sure direct I/O
 * is blocked. Returns with inode->i_rwsem held exclusively; release it with
 * ceph_end_io_write().
 */
void ceph_start_io_write(struct inode *inode)
{
	struct ceph_inode_info *cinfo = ceph_inode(inode);

	down_write(&inode->i_rwsem);
	ceph_block_o_direct(cinfo, inode);
}
/**
 * ceph_end_io_write - declare that the buffered write operation is done
 * @inode: file inode
 *
 * Counterpart of ceph_start_io_write(): drops the exclusive hold on
 * inode->i_rwsem once the buffered write has finished.
 */
void ceph_end_io_write(struct inode *inode)
{
	up_write(&inode->i_rwsem);
}
/*
 * Put the inode into O_DIRECT mode: set CEPH_I_ODIRECT and then call
 * filemap_write_and_wait() on the inode's mapping.
 *
 * The caller must hold inode->i_rwsem exclusively.
 */
static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
{
	lockdep_assert_held_write(&inode->i_rwsem);

	/* Already in direct I/O mode: nothing to do. */
	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
		return;

	/* The flag word is modified under i_ceph_lock. */
	spin_lock(&ci->i_ceph_lock);
	ci->i_ceph_flags |= CEPH_I_ODIRECT;
	spin_unlock(&ci->i_ceph_lock);

	/* FIXME: unmap_mapping_range? */
	filemap_write_and_wait(inode->i_mapping);
}
/**
 * ceph_start_io_direct - declare the file is being used for direct i/o
 * @inode: file inode
 *
 * Announce that a direct I/O operation is about to begin and make sure
 * buffered I/O is blocked for its duration.
 *
 * On return, CEPH_I_ODIRECT is set and inode->i_rwsem is held shared, so
 * the flag cannot change underneath the caller. Direct I/O operations may
 * therefore run concurrently with one another, while a buffered I/O
 * operation has to obtain the lock exclusively before it can clear
 * CEPH_I_ODIRECT.
 *
 * Buffered writes and truncates take inode->i_rwsem for write and are thus
 * serialised against O_DIRECT.
 */
void ceph_start_io_direct(struct inode *inode)
{
	struct ceph_inode_info *cinfo = ceph_inode(inode);

	/* Fast path: take the shared lock and hope the flag is already set. */
	down_read(&inode->i_rwsem);

	if (!(READ_ONCE(cinfo->i_ceph_flags) & CEPH_I_ODIRECT)) {
		/*
		 * Slow path: drop the shared lock, retake it exclusively to
		 * set the flag, then downgrade so the caller ends up with the
		 * same shared hold as on the fast path.
		 */
		up_read(&inode->i_rwsem);
		down_write(&inode->i_rwsem);
		ceph_block_buffered(cinfo, inode);
		downgrade_write(&inode->i_rwsem);
	}
}
/**
 * ceph_end_io_direct - declare that the direct i/o operation is done
 * @inode: file inode
 *
 * Counterpart of ceph_start_io_direct(): drops the shared hold on
 * inode->i_rwsem once the direct I/O operation has finished.
 */
void ceph_end_io_direct(struct inode *inode)
{
	up_read(&inode->i_rwsem);
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_IO_H
#define _FS_CEPH_IO_H
void ceph_start_io_read(struct inode *inode);
void ceph_end_io_read(struct inode *inode);
void ceph_start_io_write(struct inode *inode);
void ceph_end_io_write(struct inode *inode);
void ceph_start_io_direct(struct inode *inode);
void ceph_end_io_direct(struct inode *inode);
#endif /* _FS_CEPH_IO_H */
...@@ -32,14 +32,18 @@ void __init ceph_flock_init(void) ...@@ -32,14 +32,18 @@ void __init ceph_flock_init(void)
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{ {
struct inode *inode = file_inode(src->fl_file); struct ceph_file_info *fi = dst->fl_file->private_data;
struct inode *inode = file_inode(dst->fl_file);
atomic_inc(&ceph_inode(inode)->i_filelock_ref); atomic_inc(&ceph_inode(inode)->i_filelock_ref);
atomic_inc(&fi->num_locks);
} }
static void ceph_fl_release_lock(struct file_lock *fl) static void ceph_fl_release_lock(struct file_lock *fl)
{ {
struct ceph_file_info *fi = fl->fl_file->private_data;
struct inode *inode = file_inode(fl->fl_file); struct inode *inode = file_inode(fl->fl_file);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
atomic_dec(&fi->num_locks);
if (atomic_dec_and_test(&ci->i_filelock_ref)) { if (atomic_dec_and_test(&ci->i_filelock_ref)) {
/* clear error when all locks are released */ /* clear error when all locks are released */
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
...@@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, ...@@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
* window. Caller function will decrease the counter. * window. Caller function will decrease the counter.
*/ */
fl->fl_ops = &ceph_fl_lock_ops; fl->fl_ops = &ceph_fl_lock_ops;
atomic_inc(&ceph_inode(inode)->i_filelock_ref); fl->fl_ops->fl_copy_lock(fl, NULL);
} }
if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
......
...@@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ...@@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_renew_seq = 0; s->s_renew_seq = 0;
INIT_LIST_HEAD(&s->s_caps); INIT_LIST_HEAD(&s->s_caps);
s->s_nr_caps = 0; s->s_nr_caps = 0;
s->s_trim_caps = 0;
refcount_set(&s->s_ref, 1); refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe); INIT_LIST_HEAD(&s->s_unsafe);
...@@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, ...@@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
{ {
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct rb_node *p; struct rb_node *p;
struct ceph_inode_info *ci;
dout("cleanup_session_requests mds%d\n", session->s_mds); dout("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
...@@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, ...@@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request, r_unsafe_item); struct ceph_mds_request, r_unsafe_item);
pr_warn_ratelimited(" dropping unsafe request %llu\n", pr_warn_ratelimited(" dropping unsafe request %llu\n",
req->r_tid); req->r_tid);
if (req->r_target_inode) {
/* dropping unsafe change of inode's attributes */
ci = ceph_inode(req->r_target_inode);
errseq_set(&ci->i_meta_err, -EIO);
}
if (req->r_unsafe_dir) {
/* dropping unsafe directory operation */
ci = ceph_inode(req->r_unsafe_dir);
errseq_set(&ci->i_meta_err, -EIO);
}
__unregister_request(mdsc, req); __unregister_request(mdsc, req);
} }
/* zero r_attempts, so kick_requests() will re-send requests */ /* zero r_attempts, so kick_requests() will re-send requests */
...@@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
LIST_HEAD(to_remove); LIST_HEAD(to_remove);
bool drop = false; bool dirty_dropped = false;
bool invalidate = false; bool invalidate = false;
dout("removing cap %p, ci is %p, inode is %p\n", dout("removing cap %p, ci is %p, inode is %p\n",
...@@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_cap_flush *cf; struct ceph_cap_flush *cf;
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
if (ci->i_wrbuffer_ref > 0 && if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) if (inode->i_data.nrpages > 0)
invalidate = true; invalidate = true;
if (ci->i_wrbuffer_ref > 0)
mapping_set_error(&inode->i_data, -EIO);
}
while (!list_empty(&ci->i_cap_flush_list)) { while (!list_empty(&ci->i_cap_flush_list)) {
cf = list_first_entry(&ci->i_cap_flush_list, cf = list_first_entry(&ci->i_cap_flush_list,
...@@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
inode, ceph_ino(inode)); inode, ceph_ino(inode));
ci->i_dirty_caps = 0; ci->i_dirty_caps = 0;
list_del_init(&ci->i_dirty_item); list_del_init(&ci->i_dirty_item);
drop = true; dirty_dropped = true;
} }
if (!list_empty(&ci->i_flushing_item)) { if (!list_empty(&ci->i_flushing_item)) {
pr_warn_ratelimited( pr_warn_ratelimited(
...@@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ci->i_flushing_caps = 0; ci->i_flushing_caps = 0;
list_del_init(&ci->i_flushing_item); list_del_init(&ci->i_flushing_item);
mdsc->num_cap_flushing--; mdsc->num_cap_flushing--;
drop = true; dirty_dropped = true;
} }
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
if (dirty_dropped) {
errseq_set(&ci->i_meta_err, -EIO);
if (ci->i_wrbuffer_ref_head == 0 &&
ci->i_wr_ref == 0 &&
ci->i_dirty_caps == 0 &&
ci->i_flushing_caps == 0) {
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
}
if (atomic_read(&ci->i_filelock_ref) > 0) { if (atomic_read(&ci->i_filelock_ref) > 0) {
/* make further file lock syscall return -EIO */ /* make further file lock syscall return -EIO */
ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
...@@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL; ci->i_prealloc_cap_flush = NULL;
} }
if (drop &&
ci->i_wrbuffer_ref_head == 0 &&
ci->i_wr_ref == 0 &&
ci->i_dirty_caps == 0 &&
ci->i_flushing_caps == 0) {
ceph_put_snap_context(ci->i_head_snapc);
ci->i_head_snapc = NULL;
}
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) { while (!list_empty(&to_remove)) {
...@@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
wake_up_all(&ci->i_cap_wq); wake_up_all(&ci->i_cap_wq);
if (invalidate) if (invalidate)
ceph_queue_invalidate(inode); ceph_queue_invalidate(inode);
if (drop) if (dirty_dropped)
iput(inode); iput(inode);
return 0; return 0;
} }
...@@ -1705,11 +1721,11 @@ static bool drop_negative_children(struct dentry *dentry) ...@@ -1705,11 +1721,11 @@ static bool drop_negative_children(struct dentry *dentry)
*/ */
static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{ {
struct ceph_mds_session *session = arg; int *remaining = arg;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int used, wanted, oissued, mine; int used, wanted, oissued, mine;
if (session->s_trim_caps <= 0) if (*remaining <= 0)
return -1; return -1;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
...@@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) ...@@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if (oissued) { if (oissued) {
/* we aren't the only cap.. just remove us */ /* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true); __ceph_remove_cap(cap, true);
session->s_trim_caps--; (*remaining)--;
} else { } else {
struct dentry *dentry; struct dentry *dentry;
/* try dropping referring dentries */ /* try dropping referring dentries */
...@@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) ...@@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
d_prune_aliases(inode); d_prune_aliases(inode);
count = atomic_read(&inode->i_count); count = atomic_read(&inode->i_count);
if (count == 1) if (count == 1)
session->s_trim_caps--; (*remaining)--;
dout("trim_caps_cb %p cap %p pruned, count now %d\n", dout("trim_caps_cb %p cap %p pruned, count now %d\n",
inode, cap, count); inode, cap, count);
} else { } else {
...@@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, ...@@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
dout("trim_caps mds%d start: %d / %d, trim %d\n", dout("trim_caps mds%d start: %d / %d, trim %d\n",
session->s_mds, session->s_nr_caps, max_caps, trim_caps); session->s_mds, session->s_nr_caps, max_caps, trim_caps);
if (trim_caps > 0) { if (trim_caps > 0) {
session->s_trim_caps = trim_caps; int remaining = trim_caps;
ceph_iterate_session_caps(session, trim_caps_cb, session);
ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
dout("trim_caps mds%d done: %d / %d, trimmed %d\n", dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
session->s_mds, session->s_nr_caps, max_caps, session->s_mds, session->s_nr_caps, max_caps,
trim_caps - session->s_trim_caps); trim_caps - remaining);
session->s_trim_caps = 0;
} }
ceph_flush_cap_releases(mdsc, session); ceph_flush_cap_releases(mdsc, session);
...@@ -3015,18 +3031,23 @@ static void handle_forward(struct ceph_mds_client *mdsc, ...@@ -3015,18 +3031,23 @@ static void handle_forward(struct ceph_mds_client *mdsc,
pr_err("mdsc_handle_forward decode error err=%d\n", err); pr_err("mdsc_handle_forward decode error err=%d\n", err);
} }
static int __decode_and_drop_session_metadata(void **p, void *end) static int __decode_session_metadata(void **p, void *end,
bool *blacklisted)
{ {
/* map<string,string> */ /* map<string,string> */
u32 n; u32 n;
bool err_str;
ceph_decode_32_safe(p, end, n, bad); ceph_decode_32_safe(p, end, n, bad);
while (n-- > 0) { while (n-- > 0) {
u32 len; u32 len;
ceph_decode_32_safe(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad); ceph_decode_need(p, end, len, bad);
err_str = !strncmp(*p, "error_string", len);
*p += len; *p += len;
ceph_decode_32_safe(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad);
ceph_decode_need(p, end, len, bad); ceph_decode_need(p, end, len, bad);
if (err_str && strnstr(*p, "blacklisted", len))
*blacklisted = true;
*p += len; *p += len;
} }
return 0; return 0;
...@@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session, ...@@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session,
u64 seq; u64 seq;
unsigned long features = 0; unsigned long features = 0;
int wake = 0; int wake = 0;
bool blacklisted = false;
/* decode */ /* decode */
ceph_decode_need(&p, end, sizeof(*h), bad); ceph_decode_need(&p, end, sizeof(*h), bad);
...@@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session, ...@@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session,
if (msg_version >= 3) { if (msg_version >= 3) {
u32 len; u32 len;
/* version >= 2, metadata */ /* version >= 2, metadata */
if (__decode_and_drop_session_metadata(&p, end) < 0) if (__decode_session_metadata(&p, end, &blacklisted) < 0)
goto bad; goto bad;
/* version >= 3, feature bits */ /* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad); ceph_decode_32_safe(&p, end, len, bad);
...@@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session, ...@@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session,
session->s_state = CEPH_MDS_SESSION_REJECTED; session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session); cleanup_session_requests(mdsc, session);
remove_session_caps(session); remove_session_caps(session);
if (blacklisted)
mdsc->fsc->blacklisted = true;
wake = 2; /* for good measure */ wake = 2; /* for good measure */
break; break;
...@@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) ...@@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
} }
/*
 * Trigger an automatic reconnect after the client has been blacklisted.
 *
 * Nothing happens unless all of the following hold:
 *   - the CLEANRECOVER mount option is set,
 *   - the mount state is CEPH_MOUNT_MOUNTED,
 *   - fsc->blacklisted is set.
 *
 * Attempts are also rate limited: when last_auto_reconnect is non-zero and
 * less than 30 minutes old, the call returns without reconnecting.
 */
static void maybe_recover_session(struct ceph_mds_client *mdsc)
{
	struct ceph_fs_client *fsc = mdsc->fsc;
	unsigned long next_allowed;

	if (!ceph_test_mount_opt(fsc, CLEANRECOVER) ||
	    READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED ||
	    !READ_ONCE(fsc->blacklisted))
		return;

	/* a zero stamp means no automatic reconnect has been recorded yet */
	if (fsc->last_auto_reconnect) {
		next_allowed = fsc->last_auto_reconnect + HZ * 60 * 30;
		if (time_before(jiffies, next_allowed))
			return;
	}

	pr_info("auto reconnect after blacklisted\n");

	/* record the attempt before starting it */
	fsc->last_auto_reconnect = jiffies;
	ceph_force_reconnect(fsc->sb);
}
/* /*
* delayed work -- periodically trim expired leases, renew caps with mds * delayed work -- periodically trim expired leases, renew caps with mds
...@@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work) ...@@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work)
pr_info("mds%d hung\n", s->s_mds); pr_info("mds%d hung\n", s->s_mds);
} }
} }
if (s->s_state < CEPH_MDS_SESSION_OPEN) { if (s->s_state == CEPH_MDS_SESSION_NEW ||
s->s_state == CEPH_MDS_SESSION_RESTARTING ||
s->s_state == CEPH_MDS_SESSION_REJECTED) {
/* this mds is failed or recovering, just wait */ /* this mds is failed or recovering, just wait */
ceph_put_mds_session(s); ceph_put_mds_session(s);
continue; continue;
...@@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work) ...@@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work)
ceph_trim_snapid_map(mdsc); ceph_trim_snapid_map(mdsc);
maybe_recover_session(mdsc);
schedule_delayed(mdsc); schedule_delayed(mdsc);
} }
...@@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) ...@@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
session = __ceph_lookup_mds_session(mdsc, mds); session = __ceph_lookup_mds_session(mdsc, mds);
if (!session) if (!session)
continue; continue;
if (session->s_state == CEPH_MDS_SESSION_REJECTED)
__unregister_session(mdsc, session);
__wake_requests(mdsc, &session->s_waiting);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex); mutex_lock(&session->s_mutex);
__close_session(mdsc, session); __close_session(mdsc, session);
if (session->s_state == CEPH_MDS_SESSION_CLOSING) { if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
...@@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) ...@@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
} }
mutex_unlock(&session->s_mutex); mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session); ceph_put_mds_session(session);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
kick_requests(mdsc, mds); kick_requests(mdsc, mds);
} }
......
...@@ -148,9 +148,9 @@ enum { ...@@ -148,9 +148,9 @@ enum {
CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPENING = 2,
CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_OPEN = 3,
CEPH_MDS_SESSION_HUNG = 4, CEPH_MDS_SESSION_HUNG = 4,
CEPH_MDS_SESSION_CLOSING = 5, CEPH_MDS_SESSION_RESTARTING = 5,
CEPH_MDS_SESSION_RESTARTING = 6, CEPH_MDS_SESSION_RECONNECTING = 6,
CEPH_MDS_SESSION_RECONNECTING = 7, CEPH_MDS_SESSION_CLOSING = 7,
CEPH_MDS_SESSION_REJECTED = 8, CEPH_MDS_SESSION_REJECTED = 8,
}; };
...@@ -176,7 +176,7 @@ struct ceph_mds_session { ...@@ -176,7 +176,7 @@ struct ceph_mds_session {
spinlock_t s_cap_lock; spinlock_t s_cap_lock;
struct list_head s_caps; /* all caps issued by this session */ struct list_head s_caps; /* all caps issued by this session */
struct ceph_cap *s_cap_iterator; struct ceph_cap *s_cap_iterator;
int s_nr_caps, s_trim_caps; int s_nr_caps;
int s_num_cap_releases; int s_num_cap_releases;
int s_cap_reconnect; int s_cap_reconnect;
int s_readonly; int s_readonly;
......
...@@ -143,6 +143,7 @@ enum { ...@@ -143,6 +143,7 @@ enum {
Opt_snapdirname, Opt_snapdirname,
Opt_mds_namespace, Opt_mds_namespace,
Opt_fscache_uniq, Opt_fscache_uniq,
Opt_recover_session,
Opt_last_string, Opt_last_string,
/* string args above */ /* string args above */
Opt_dirstat, Opt_dirstat,
...@@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { ...@@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = {
/* int args above */ /* int args above */
{Opt_snapdirname, "snapdirname=%s"}, {Opt_snapdirname, "snapdirname=%s"},
{Opt_mds_namespace, "mds_namespace=%s"}, {Opt_mds_namespace, "mds_namespace=%s"},
{Opt_recover_session, "recover_session=%s"},
{Opt_fscache_uniq, "fsc=%s"}, {Opt_fscache_uniq, "fsc=%s"},
/* string args above */ /* string args above */
{Opt_dirstat, "dirstat"}, {Opt_dirstat, "dirstat"},
...@@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) ...@@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->mds_namespace) if (!fsopt->mds_namespace)
return -ENOMEM; return -ENOMEM;
break; break;
case Opt_recover_session:
if (!strncmp(argstr[0].from, "no",
argstr[0].to - argstr[0].from)) {
fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
} else if (!strncmp(argstr[0].from, "clean",
argstr[0].to - argstr[0].from)) {
fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
} else {
return -EINVAL;
}
break;
case Opt_fscache_uniq: case Opt_fscache_uniq:
kfree(fsopt->fscache_uniq); kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from, fsopt->fscache_uniq = kstrndup(argstr[0].from,
...@@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) ...@@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->mds_namespace) if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace); seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%d", fsopt->wsize); seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE) if (fsopt->rsize != CEPH_MAX_READ_SIZE)
...@@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, ...@@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->sb = NULL; fsc->sb = NULL;
fsc->mount_state = CEPH_MOUNT_MOUNTING; fsc->mount_state = CEPH_MOUNT_MOUNTING;
fsc->filp_gen = 1;
atomic_long_set(&fsc->writeback_count, 0); atomic_long_set(&fsc->writeback_count, 0);
...@@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) ...@@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{ {
dout("destroy_fs_client %p\n", fsc); dout("destroy_fs_client %p\n", fsc);
ceph_mdsc_destroy(fsc);
destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->inode_wq);
destroy_workqueue(fsc->cap_wq); destroy_workqueue(fsc->cap_wq);
...@@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb) ...@@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb)
fsc->mount_state = CEPH_MOUNT_SHUTDOWN; fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
ceph_mdsc_force_umount(fsc->mdsc); ceph_mdsc_force_umount(fsc->mdsc);
return; fsc->filp_gen++; // invalidate open files
} }
static int ceph_remount(struct super_block *sb, int *flags, char *data) static int ceph_remount(struct super_block *sb, int *flags, char *data)
...@@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, ...@@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
} }
if (ceph_sb_to_client(sb) != fsc) { if (ceph_sb_to_client(sb) != fsc) {
ceph_mdsc_destroy(fsc);
destroy_fs_client(fsc); destroy_fs_client(fsc);
fsc = ceph_sb_to_client(sb); fsc = ceph_sb_to_client(sb);
dout("get_sb got existing client %p\n", fsc); dout("get_sb got existing client %p\n", fsc);
...@@ -1115,7 +1133,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, ...@@ -1115,7 +1133,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
goto out_final; goto out_final;
out: out:
ceph_mdsc_destroy(fsc);
destroy_fs_client(fsc); destroy_fs_client(fsc);
out_final: out_final:
dout("ceph_mount fail %ld\n", PTR_ERR(res)); dout("ceph_mount fail %ld\n", PTR_ERR(res));
...@@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s) ...@@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc); ceph_fscache_unregister_fs(fsc);
ceph_mdsc_destroy(fsc);
destroy_fs_client(fsc); destroy_fs_client(fsc);
free_anon_bdev(dev); free_anon_bdev(dev);
} }
...@@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = { ...@@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = {
}; };
MODULE_ALIAS_FS("ceph"); MODULE_ALIAS_FS("ceph");
/*
 * Force the client behind @sb to drop its current state and reconnect.
 *
 * Runs ceph_umount_begin(), flushes the inode workqueue, resets the client
 * address, clears the OSD client abort error, then clears the blacklisted
 * flag and puts the mount state back to CEPH_MOUNT_MOUNTED.
 *
 * Returns 0 when the superblock has no root dentry; otherwise returns the
 * result of __ceph_do_getattr() on the root inode.
 */
int ceph_force_reconnect(struct super_block *sb)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);

	ceph_umount_begin(sb);

	/*
	 * Flush queued inode work so that all page caches get invalidated;
	 * see remove_session_caps_cb().
	 */
	flush_workqueue(fsc->inode_wq);

	/*
	 * In case we were blacklisted.  This also resets all mon/osd
	 * connections.
	 */
	ceph_reset_client_addr(fsc->client);

	ceph_osdc_clear_abort_err(&fsc->client->osdc);

	fsc->blacklisted = false;
	fsc->mount_state = CEPH_MOUNT_MOUNTED;

	if (!sb->s_root)
		return 0;

	return __ceph_do_getattr(d_inode(sb->s_root), NULL,
				 CEPH_STAT_CAP_INODE, true);
}
static int __init init_ceph(void) static int __init init_ceph(void)
{ {
int ret = init_caches(); int ret = init_caches();
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/posix_acl.h> #include <linux/posix_acl.h>
#include <linux/refcount.h> #include <linux/refcount.h>
#include <linux/security.h>
#include <linux/ceph/libceph.h> #include <linux/ceph/libceph.h>
...@@ -31,6 +32,7 @@ ...@@ -31,6 +32,7 @@
#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blacklisted */
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
...@@ -101,6 +103,11 @@ struct ceph_fs_client { ...@@ -101,6 +103,11 @@ struct ceph_fs_client {
struct ceph_client *client; struct ceph_client *client;
unsigned long mount_state; unsigned long mount_state;
unsigned long last_auto_reconnect;
bool blacklisted;
u32 filp_gen;
loff_t max_file_size; loff_t max_file_size;
struct ceph_mds_client *mdsc; struct ceph_mds_client *mdsc;
...@@ -395,6 +402,8 @@ struct ceph_inode_info { ...@@ -395,6 +402,8 @@ struct ceph_inode_info {
struct fscache_cookie *fscache; struct fscache_cookie *fscache;
u32 i_fscache_gen; u32 i_fscache_gen;
#endif #endif
errseq_t i_meta_err;
struct inode vfs_inode; /* at end */ struct inode vfs_inode; /* at end */
}; };
...@@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, ...@@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ #define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ #define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ #define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ #define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
/* /*
* Masks of ceph inode work. * Masks of ceph inode work.
...@@ -703,6 +711,10 @@ struct ceph_file_info { ...@@ -703,6 +711,10 @@ struct ceph_file_info {
spinlock_t rw_contexts_lock; spinlock_t rw_contexts_lock;
struct list_head rw_contexts; struct list_head rw_contexts;
errseq_t meta_err;
u32 filp_gen;
atomic_t num_locks;
}; };
struct ceph_dir_file_info { struct ceph_dir_file_info {
...@@ -842,7 +854,8 @@ static inline int default_congestion_kb(void) ...@@ -842,7 +854,8 @@ static inline int default_congestion_kb(void)
} }
/* super.c */
extern int ceph_force_reconnect(struct super_block *sb);
/* snap.c */ /* snap.c */
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino); u64 ino);
...@@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) ...@@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in)
#ifdef CONFIG_CEPH_FS_SECURITY_LABEL #ifdef CONFIG_CEPH_FS_SECURITY_LABEL
extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
struct ceph_acl_sec_ctx *ctx); struct ceph_acl_sec_ctx *ctx);
extern void ceph_security_invalidate_secctx(struct inode *inode); static inline void ceph_security_invalidate_secctx(struct inode *inode)
{
security_inode_invalidate_secctx(inode);
}
#else #else
static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
struct ceph_acl_sec_ctx *ctx) struct ceph_acl_sec_ctx *ctx)
...@@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, ...@@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session); struct ceph_mds_session *session);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds); int mds);
extern int ceph_get_cap_mds(struct inode *inode);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
...@@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, ...@@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
struct inode *dir, struct inode *dir,
int mds, int drop, int unless); int mds, int drop, int unless);
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, extern int ceph_get_caps(struct file *filp, int need, int want,
loff_t endoff, int *got, struct page **pinned_page); loff_t endoff, int *got, struct page **pinned_page);
extern int ceph_try_get_caps(struct ceph_inode_info *ci, extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got); int need, int want, bool nonblock, int *got);
/* for counting open files by mode */ /* for counting open files by mode */
...@@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); ...@@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
extern const struct address_space_operations ceph_aops; extern const struct address_space_operations ceph_aops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
extern int ceph_uninline_data(struct file *filp, struct page *locked_page); extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */ /* file.c */
......
...@@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, ...@@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci,
static bool ceph_is_valid_xattr(const char *name) static bool ceph_is_valid_xattr(const char *name)
{ {
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
} }
...@@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, ...@@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
memcpy(value, xattr->val, xattr->val_len); memcpy(value, xattr->val, xattr->val_len);
if (current->journal_info && if (current->journal_info &&
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN))
ci->i_ceph_flags |= CEPH_I_SEC_INITED; ci->i_ceph_flags |= CEPH_I_SEC_INITED;
out: out:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
...@@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ...@@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{ {
struct inode *inode = d_inode(dentry); struct inode *inode = d_inode(dentry);
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
bool len_only = (size == 0); bool len_only = (size == 0);
u32 namelen; u32 namelen;
int err; int err;
int i;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("listxattr %p ver=%lld index_ver=%lld\n", inode, dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
...@@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ...@@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
names = __copy_xattr_names(ci, names); names = __copy_xattr_names(ci, names);
size -= namelen; size -= namelen;
} }
/* virtual xattr names, too */
if (vxattrs) {
for (i = 0; vxattrs[i].name; i++) {
size_t this_len;
if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN)
continue;
if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))
continue;
this_len = strlen(vxattrs[i].name) + 1;
namelen += this_len;
if (len_only)
continue;
if (this_len > size) {
err = -ERANGE;
goto out;
}
memcpy(names, vxattrs[i].name, this_len);
names += this_len;
size -= this_len;
}
}
err = namelen; err = namelen;
out: out:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
...@@ -1293,42 +1266,8 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, ...@@ -1293,42 +1266,8 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
ceph_pagelist_release(pagelist); ceph_pagelist_release(pagelist);
return err; return err;
} }
#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
void ceph_security_invalidate_secctx(struct inode *inode) #endif /* CONFIG_SECURITY */
{
security_inode_invalidate_secctx(inode);
}
/*
 * .set callback of ceph_security_label_handler.
 *
 * Keys that security_ismaclabel() does not accept are refused with
 * -EOPNOTSUPP.  For accepted keys the full attribute name is rebuilt
 * with xattr_full_name() and the value is passed on to
 * __ceph_setxattr(), whose result is returned unchanged.
 */
static int ceph_xattr_set_security_label(const struct xattr_handler *handler,
					 struct dentry *unused, struct inode *inode,
					 const char *key, const void *buf,
					 size_t buflen, int flags)
{
	if (!security_ismaclabel(key))
		return -EOPNOTSUPP;

	return __ceph_setxattr(inode, xattr_full_name(handler, key), buf,
			       buflen, flags);
}
/*
 * .get callback of ceph_security_label_handler.
 *
 * Keys that security_ismaclabel() does not accept are refused with
 * -EOPNOTSUPP.  For accepted keys the full attribute name is rebuilt
 * with xattr_full_name() and the lookup is delegated to
 * __ceph_getxattr(), whose result is returned unchanged.
 */
static int ceph_xattr_get_security_label(const struct xattr_handler *handler,
					 struct dentry *unused, struct inode *inode,
					 const char *key, void *buf, size_t buflen)
{
	if (!security_ismaclabel(key))
		return -EOPNOTSUPP;

	return __ceph_getxattr(inode, xattr_full_name(handler, key), buf,
			       buflen);
}
/*
 * Handler for names under XATTR_SECURITY_PREFIX; get/set are routed to
 * ceph_xattr_get_security_label() and ceph_xattr_set_security_label().
 */
static const struct xattr_handler ceph_security_label_handler = {
	.set	= ceph_xattr_set_security_label,
	.get	= ceph_xattr_get_security_label,
	.prefix	= XATTR_SECURITY_PREFIX,
};
#endif
#endif
void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
{ {
...@@ -1351,9 +1290,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { ...@@ -1351,9 +1290,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = {
#ifdef CONFIG_CEPH_FS_POSIX_ACL #ifdef CONFIG_CEPH_FS_POSIX_ACL
&posix_acl_access_xattr_handler, &posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler, &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
&ceph_security_label_handler,
#endif #endif
&ceph_other_xattr_handler, &ceph_other_xattr_handler,
NULL, NULL,
......
...@@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); ...@@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
u64 ceph_client_gid(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client);
extern void ceph_destroy_client(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client);
extern void ceph_reset_client_addr(struct ceph_client *client);
extern int __ceph_open_session(struct ceph_client *client, extern int __ceph_open_session(struct ceph_client *client,
unsigned long started); unsigned long started);
extern int ceph_open_session(struct ceph_client *client); extern int ceph_open_session(struct ceph_client *client);
......
...@@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); ...@@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void);
extern void ceph_messenger_init(struct ceph_messenger *msgr, extern void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr); struct ceph_entity_addr *myaddr);
extern void ceph_messenger_fini(struct ceph_messenger *msgr); extern void ceph_messenger_fini(struct ceph_messenger *msgr);
extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr);
extern void ceph_con_init(struct ceph_connection *con, void *private, extern void ceph_con_init(struct ceph_connection *con, void *private,
const struct ceph_connection_operations *ops, const struct ceph_connection_operations *ops,
......
...@@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, ...@@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m,
extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
extern void ceph_monc_stop(struct ceph_mon_client *monc); extern void ceph_monc_stop(struct ceph_mon_client *monc);
extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
enum { enum {
CEPH_SUB_MONMAP = 0, CEPH_SUB_MONMAP = 0,
......
...@@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); ...@@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void);
extern int ceph_osdc_init(struct ceph_osd_client *osdc, extern int ceph_osdc_init(struct ceph_osd_client *osdc,
struct ceph_client *client); struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc); extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
struct ceph_msg *msg); struct ceph_msg *msg);
...@@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, ...@@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg); struct ceph_msg *msg);
void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
#define osd_req_op_data(oreq, whch, typ, fld) \ #define osd_req_op_data(oreq, whch, typ, fld) \
({ \ ({ \
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <linux/nsproxy.h> #include <linux/nsproxy.h>
#include <linux/parser.h> #include <linux/parser.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/statfs.h> #include <linux/statfs.h>
...@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, ...@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt,
} }
EXPORT_SYMBOL(ceph_compare_options); EXPORT_SYMBOL(ceph_compare_options);
/*
* kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
* compatible with (a superset of) GFP_KERNEL. This is because while the
* actual pages are allocated with the specified flags, the page table pages
* are always allocated with GFP_KERNEL. map_vm_area() doesn't even take
* flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
*
* ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
*/
void *ceph_kvmalloc(size_t size, gfp_t flags) void *ceph_kvmalloc(size_t size, gfp_t flags)
{ {
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { void *p;
void *ptr = kmalloc(size, flags | __GFP_NOWARN);
if (ptr) if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
return ptr; p = kvmalloc(size, flags);
} else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
unsigned int nofs_flag = memalloc_nofs_save();
p = kvmalloc(size, GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
} else {
unsigned int noio_flag = memalloc_noio_save();
p = kvmalloc(size, GFP_KERNEL);
memalloc_noio_restore(noio_flag);
} }
return __vmalloc(size, flags, PAGE_KERNEL); return p;
} }
static int parse_fsid(const char *str, struct ceph_fsid *fsid) static int parse_fsid(const char *str, struct ceph_fsid *fsid)
{ {
int i = 0; int i = 0;
...@@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client) ...@@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client)
} }
EXPORT_SYMBOL(ceph_destroy_client); EXPORT_SYMBOL(ceph_destroy_client);
/*
 * Reset the client's address and reopen its sessions.
 *
 * Runs three steps in order on the sub-objects embedded in @client:
 * the messenger nonce is reset, the monitor session is reopened, and
 * the OSD connections are reopened.
 */
void ceph_reset_client_addr(struct ceph_client *client)
{
	struct ceph_messenger *msgr = &client->msgr;
	struct ceph_mon_client *monc = &client->monc;
	struct ceph_osd_client *osdc = &client->osdc;

	ceph_messenger_reset_nonce(msgr);
	ceph_monc_reopen_session(monc);
	ceph_osdc_reopen_osds(osdc);
}
EXPORT_SYMBOL(ceph_reset_client_addr);
/* /*
* true if we have the mon map (and have thus joined the cluster) * true if we have the mon map (and have thus joined the cluster)
*/ */
......
...@@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con) ...@@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con)
} }
/*
 * Advance the messenger's own address nonce by 1000000 and re-encode
 * the address via encode_my_addr().
 *
 * The nonce is stored little-endian, so it is converted to CPU order
 * for the addition and converted back before being written.
 */
void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
{
	u32 bumped;

	bumped = le32_to_cpu(msgr->inst.addr.nonce);
	bumped += 1000000;
	msgr->inst.addr.nonce = cpu_to_le32(bumped);

	encode_my_addr(msgr);
}
/* /*
* initialize a new messenger instance * initialize a new messenger instance
......
...@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) ...@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc)
__open_session(monc); __open_session(monc);
} }
/*
 * Locked wrapper around reopen_session(): acquires monc->mutex, calls
 * reopen_session(monc), and releases the mutex again.
 */
void ceph_monc_reopen_session(struct ceph_mon_client *monc)
{
mutex_lock(&monc->mutex);
reopen_session(monc);
mutex_unlock(&monc->mutex);
}
static void un_backoff(struct ceph_mon_client *monc) static void un_backoff(struct ceph_mon_client *monc)
{ {
monc->hunt_mult /= 2; /* reduce by 50% */ monc->hunt_mult /= 2; /* reduce by 50% */
......
...@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
struct ceph_pagelist *pagelist; struct ceph_pagelist *pagelist;
size_t payload_len = 0; size_t payload_len = 0;
size_t size; size_t size;
int ret;
op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
...@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
size = strlen(class); size = strlen(class);
BUG_ON(size > (size_t) U8_MAX); BUG_ON(size > (size_t) U8_MAX);
op->cls.class_len = size; op->cls.class_len = size;
ceph_pagelist_append(pagelist, class, size); ret = ceph_pagelist_append(pagelist, class, size);
if (ret)
goto err_pagelist_free;
payload_len += size; payload_len += size;
op->cls.method_name = method; op->cls.method_name = method;
size = strlen(method); size = strlen(method);
BUG_ON(size > (size_t) U8_MAX); BUG_ON(size > (size_t) U8_MAX);
op->cls.method_len = size; op->cls.method_len = size;
ceph_pagelist_append(pagelist, method, size); ret = ceph_pagelist_append(pagelist, method, size);
if (ret)
goto err_pagelist_free;
payload_len += size; payload_len += size;
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
op->indata_len = payload_len; op->indata_len = payload_len;
return 0; return 0;
err_pagelist_free:
ceph_pagelist_release(pagelist);
return ret;
} }
EXPORT_SYMBOL(osd_req_op_cls_init); EXPORT_SYMBOL(osd_req_op_cls_init);
...@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
opcode, 0); opcode, 0);
struct ceph_pagelist *pagelist; struct ceph_pagelist *pagelist;
size_t payload_len; size_t payload_len;
int ret;
BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
...@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
payload_len = strlen(name); payload_len = strlen(name);
op->xattr.name_len = payload_len; op->xattr.name_len = payload_len;
ceph_pagelist_append(pagelist, name, payload_len); ret = ceph_pagelist_append(pagelist, name, payload_len);
if (ret)
goto err_pagelist_free;
op->xattr.value_len = size; op->xattr.value_len = size;
ceph_pagelist_append(pagelist, value, size); ret = ceph_pagelist_append(pagelist, value, size);
if (ret)
goto err_pagelist_free;
payload_len += size; payload_len += size;
op->xattr.cmp_op = cmp_op; op->xattr.cmp_op = cmp_op;
...@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
op->indata_len = payload_len; op->indata_len = payload_len;
return 0; return 0;
err_pagelist_free:
ceph_pagelist_release(pagelist);
return ret;
} }
EXPORT_SYMBOL(osd_req_op_xattr_init); EXPORT_SYMBOL(osd_req_op_xattr_init);
...@@ -1488,7 +1505,6 @@ enum calc_target_result { ...@@ -1488,7 +1505,6 @@ enum calc_target_result {
static enum calc_target_result calc_target(struct ceph_osd_client *osdc, static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osd_request_target *t, struct ceph_osd_request_target *t,
struct ceph_connection *con,
bool any_change) bool any_change)
{ {
struct ceph_pg_pool_info *pi; struct ceph_pg_pool_info *pi;
...@@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) ...@@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again: again:
ct_res = calc_target(osdc, &req->r_t, NULL, false); ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
goto promote; goto promote;
...@@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) ...@@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
} }
EXPORT_SYMBOL(ceph_osdc_abort_requests); EXPORT_SYMBOL(ceph_osdc_abort_requests);
/*
 * Reset osdc->abort_err to 0.  The store is done with osdc->lock held
 * for write.
 */
void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
{
down_write(&osdc->lock);
osdc->abort_err = 0;
up_write(&osdc->lock);
}
EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
{ {
if (likely(eb > osdc->epoch_barrier)) { if (likely(eb > osdc->epoch_barrier)) {
...@@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) ...@@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
} }
calc_target(osdc, &lreq->t, NULL, false); calc_target(osdc, &lreq->t, false);
osd = lookup_create_osd(osdc, lreq->t.osd, true); osd = lookup_create_osd(osdc, lreq->t.osd, true);
link_linger(osd, lreq); link_linger(osd, lreq);
...@@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) ...@@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_client *osdc = lreq->osdc;
enum calc_target_result ct_res; enum calc_target_result ct_res;
ct_res = calc_target(osdc, &lreq->t, NULL, true); ct_res = calc_target(osdc, &lreq->t, true);
if (ct_res == CALC_TARGET_NEED_RESEND) { if (ct_res == CALC_TARGET_NEED_RESEND) {
struct ceph_osd *osd; struct ceph_osd *osd;
...@@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, ...@@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd,
n = rb_next(n); /* unlink_request(), check_pool_dne() */ n = rb_next(n); /* unlink_request(), check_pool_dne() */
dout("%s req %p tid %llu\n", __func__, req, req->r_tid); dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, ct_res = calc_target(osdc, &req->r_t, false);
false);
switch (ct_res) { switch (ct_res) {
case CALC_TARGET_NO_ACTION: case CALC_TARGET_NO_ACTION:
force_resend_writes = cleared_full || force_resend_writes = cleared_full ||
...@@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, ...@@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
n = rb_next(n); n = rb_next(n);
if (req->r_t.epoch < osdc->osdmap->epoch) { if (req->r_t.epoch < osdc->osdmap->epoch) {
ct_res = calc_target(osdc, &req->r_t, NULL, false); ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE) { if (ct_res == CALC_TARGET_POOL_DNE) {
erase_request(need_resend, req); erase_request(need_resend, req);
check_pool_dne(req); check_pool_dne(req);
...@@ -5086,6 +5109,24 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, ...@@ -5086,6 +5109,24 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
} }
EXPORT_SYMBOL(ceph_osdc_call); EXPORT_SYMBOL(ceph_osdc_call);
/*
 * Reset all OSD connections.
 *
 * Walks the osdc->osds tree with osdc->lock held for write and calls
 * reopen_osd() on every entry.  Requests are kicked only for OSDs on
 * which reopen_osd() returned 0.
 *
 * The successor node is looked up before reopen_osd() is called, so the
 * walk never touches the current node afterwards.
 * NOTE(review): presumably because reopen_osd() may unlink the OSD from
 * the tree -- confirm against reopen_osd().
 */
void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
{
	struct rb_node *cur, *next;

	down_write(&osdc->lock);
	for (cur = rb_first(&osdc->osds); cur; cur = next) {
		struct ceph_osd *osd = rb_entry(cur, struct ceph_osd, o_node);

		next = rb_next(cur);
		if (reopen_osd(osd) == 0)
			kick_osd_requests(osd);
	}
	up_write(&osdc->lock);
}
/* /*
* init, shutdown * init, shutdown
*/ */
......
...@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) ...@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
struct ceph_pg_pool_info, node); struct ceph_pg_pool_info, node);
__remove_pg_pool(&map->pg_pools, pi); __remove_pg_pool(&map->pg_pools, pi);
} }
kfree(map->osd_state); kvfree(map->osd_state);
kfree(map->osd_weight); kvfree(map->osd_weight);
kfree(map->osd_addr); kvfree(map->osd_addr);
kfree(map->osd_primary_affinity); kvfree(map->osd_primary_affinity);
kfree(map->crush_workspace); kvfree(map->crush_workspace);
kfree(map); kfree(map);
} }
...@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) ...@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
* *
* The new elements are properly initialized. * The new elements are properly initialized.
*/ */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
{ {
u32 *state; u32 *state;
u32 *weight; u32 *weight;
struct ceph_entity_addr *addr; struct ceph_entity_addr *addr;
u32 to_copy;
int i; int i;
state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); dout("%s old %u new %u\n", __func__, map->max_osd, max);
if (!state) if (max == map->max_osd)
return -ENOMEM; return 0;
map->osd_state = state;
weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
if (!weight) weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
if (!state || !weight || !addr) {
kvfree(state);
kvfree(weight);
kvfree(addr);
return -ENOMEM; return -ENOMEM;
map->osd_weight = weight; }
addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); to_copy = min(map->max_osd, max);
if (!addr) if (map->osd_state) {
return -ENOMEM; memcpy(state, map->osd_state, to_copy * sizeof(*state));
map->osd_addr = addr; memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
kvfree(map->osd_state);
kvfree(map->osd_weight);
kvfree(map->osd_addr);
}
map->osd_state = state;
map->osd_weight = weight;
map->osd_addr = addr;
for (i = map->max_osd; i < max; i++) { for (i = map->max_osd; i < max; i++) {
map->osd_state[i] = 0; map->osd_state[i] = 0;
map->osd_weight[i] = CEPH_OSD_OUT; map->osd_weight[i] = CEPH_OSD_OUT;
...@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) ...@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
if (map->osd_primary_affinity) { if (map->osd_primary_affinity) {
u32 *affinity; u32 *affinity;
affinity = krealloc(map->osd_primary_affinity, affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
max*sizeof(*affinity), GFP_NOFS); GFP_NOFS);
if (!affinity) if (!affinity)
return -ENOMEM; return -ENOMEM;
map->osd_primary_affinity = affinity;
memcpy(affinity, map->osd_primary_affinity,
to_copy * sizeof(*affinity));
kvfree(map->osd_primary_affinity);
map->osd_primary_affinity = affinity;
for (i = map->max_osd; i < max; i++) for (i = map->max_osd; i < max; i++)
map->osd_primary_affinity[i] = map->osd_primary_affinity[i] =
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
...@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) ...@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
dout("%s work_size %zu bytes\n", __func__, work_size); dout("%s work_size %zu bytes\n", __func__, work_size);
workspace = kmalloc(work_size, GFP_NOIO); workspace = ceph_kvmalloc(work_size, GFP_NOIO);
if (!workspace) { if (!workspace) {
crush_destroy(crush); crush_destroy(crush);
return -ENOMEM; return -ENOMEM;
...@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) ...@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
if (map->crush) if (map->crush)
crush_destroy(map->crush); crush_destroy(map->crush);
kfree(map->crush_workspace); kvfree(map->crush_workspace);
map->crush = crush; map->crush = crush;
map->crush_workspace = workspace; map->crush_workspace = workspace;
return 0; return 0;
...@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) ...@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
if (!map->osd_primary_affinity) { if (!map->osd_primary_affinity) {
int i; int i;
map->osd_primary_affinity = kmalloc_array(map->max_osd, map->osd_primary_affinity = ceph_kvmalloc(
sizeof(u32), array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
GFP_NOFS); GFP_NOFS);
if (!map->osd_primary_affinity) if (!map->osd_primary_affinity)
return -ENOMEM; return -ENOMEM;
...@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, ...@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end,
ceph_decode_32_safe(p, end, len, e_inval); ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0) { if (len == 0) {
kfree(map->osd_primary_affinity); kvfree(map->osd_primary_affinity);
map->osd_primary_affinity = NULL; map->osd_primary_affinity = NULL;
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment