Commit fcc95f06 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ceph-for-5.7-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The main items are:

   - support for asynchronous create and unlink (Jeff Layton).

     Creates and unlinks are satisfied locally, without waiting for a
     reply from the MDS, provided the client has been granted
     appropriate caps (new in v15.y.z ("Octopus") release). This can be
     a big help for metadata heavy workloads such as tar and rsync.
     Opt-in with the new nowsync mount option.

   - multiple blk-mq queues for rbd (Hannes Reinecke and myself).

     When the driver was converted to blk-mq, we settled on a single
     blk-mq queue because of a global lock in libceph and some other
     technical debt. These have since been addressed, so allocate a
     queue per CPU to enhance parallelism.

   - don't hold onto caps that aren't actually needed (Zheng Yan).

     This has been our long-standing behavior, but it causes issues with
     some active/standby applications (synchronous I/O, stalls if the
     standby goes down, etc).

   - .snap directory timestamps consistent with ceph-fuse (Luis
     Henriques)"

* tag 'ceph-for-5.7-rc1' of git://github.com/ceph/ceph-client: (49 commits)
  ceph: fix snapshot directory timestamps
  ceph: wait for async creating inode before requesting new max size
  ceph: don't skip updating wanted caps when cap is stale
  ceph: request new max size only when there is auth cap
  ceph: cleanup return error of try_get_cap_refs()
  ceph: return ceph_mdsc_do_request() errors from __get_parent()
  ceph: check all mds' caps after page writeback
  ceph: update i_requested_max_size only when sending cap msg to auth mds
  ceph: simplify calling of ceph_get_fmode()
  ceph: remove delay check logic from ceph_check_caps()
  ceph: consider inode's last read/write when calculating wanted caps
  ceph: always renew caps if mds_wanted is insufficient
  ceph: update dentry lease for async create
  ceph: attempt to do async create when possible
  ceph: cache layout in parent dir on first sync create
  ceph: add new MDS req field to hold delegated inode number
  ceph: decode interval_sets for delegated inos
  ceph: make ceph_fill_inode non-static
  ceph: perform asynchronous unlink if we have sufficient caps
  ceph: don't take refs to want mask unless we have all bits
  ...
parents c6b80eb8 ef915725
......@@ -107,17 +107,17 @@ Mount Options
address its connection to the monitor originates from.
wsize=X
Specify the maximum write size in bytes. Default: 16 MB.
Specify the maximum write size in bytes. Default: 64 MB.
rsize=X
Specify the maximum read size in bytes. Default: 16 MB.
Specify the maximum read size in bytes. Default: 64 MB.
rasize=X
Specify the maximum readahead size in bytes. Default: 8 MB.
mount_timeout=X
Specify the timeout value for mount (in seconds), in the case
of a non-responsive Ceph file system. The default is 30
of a non-responsive Ceph file system. The default is 60
seconds.
caps_max=X
......
......@@ -337,10 +337,7 @@ struct rbd_img_request {
u64 snap_id; /* for reads */
struct ceph_snap_context *snapc; /* for writes */
};
union {
struct request *rq; /* block request */
struct rbd_obj_request *obj_request; /* obj req initiator */
};
struct list_head lock_item;
struct list_head object_extents; /* obj_req.ex structs */
......@@ -349,7 +346,6 @@ struct rbd_img_request {
struct pending_result pending;
struct work_struct work;
int work_result;
struct kref kref;
};
#define for_each_obj_request(ireq, oreq) \
......@@ -1320,15 +1316,6 @@ static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
rbd_assert(img_request != NULL);
dout("%s: img %p (was %d)\n", __func__, img_request,
kref_read(&img_request->kref));
kref_put(&img_request->kref, rbd_img_request_destroy);
}
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
struct rbd_obj_request *obj_request)
{
......@@ -1366,18 +1353,10 @@ static void rbd_osd_submit(struct ceph_osd_request *osd_req)
static void img_request_layered_set(struct rbd_img_request *img_request)
{
set_bit(IMG_REQ_LAYERED, &img_request->flags);
smp_mb();
}
static void img_request_layered_clear(struct rbd_img_request *img_request)
{
clear_bit(IMG_REQ_LAYERED, &img_request->flags);
smp_mb();
}
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
smp_mb();
return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
......@@ -1619,10 +1598,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
if (!rbd_dev->parent_spec)
return false;
down_read(&rbd_dev->header_rwsem);
if (rbd_dev->parent_overlap)
counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
up_read(&rbd_dev->header_rwsem);
if (counter < 0)
rbd_warn(rbd_dev, "parent reference overflow");
......@@ -1630,62 +1607,53 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
return counter > 0;
}
/*
* Caller is responsible for filling in the list of object requests
* that comprises the image request, and the Linux request pointer
* (if there is one).
*/
static struct rbd_img_request *rbd_img_request_create(
static void rbd_img_request_init(struct rbd_img_request *img_request,
struct rbd_device *rbd_dev,
enum obj_operation_type op_type,
struct ceph_snap_context *snapc)
enum obj_operation_type op_type)
{
struct rbd_img_request *img_request;
img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
if (!img_request)
return NULL;
memset(img_request, 0, sizeof(*img_request));
img_request->rbd_dev = rbd_dev;
img_request->op_type = op_type;
if (!rbd_img_is_write(img_request))
img_request->snap_id = rbd_dev->spec->snap_id;
else
img_request->snapc = snapc;
if (rbd_dev_parent_get(rbd_dev))
img_request_layered_set(img_request);
INIT_LIST_HEAD(&img_request->lock_item);
INIT_LIST_HEAD(&img_request->object_extents);
mutex_init(&img_request->state_mutex);
kref_init(&img_request->kref);
}
static void rbd_img_capture_header(struct rbd_img_request *img_req)
{
struct rbd_device *rbd_dev = img_req->rbd_dev;
lockdep_assert_held(&rbd_dev->header_rwsem);
return img_request;
if (rbd_img_is_write(img_req))
img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
else
img_req->snap_id = rbd_dev->spec->snap_id;
if (rbd_dev_parent_get(rbd_dev))
img_request_layered_set(img_req);
}
static void rbd_img_request_destroy(struct kref *kref)
static void rbd_img_request_destroy(struct rbd_img_request *img_request)
{
struct rbd_img_request *img_request;
struct rbd_obj_request *obj_request;
struct rbd_obj_request *next_obj_request;
img_request = container_of(kref, struct rbd_img_request, kref);
dout("%s: img %p\n", __func__, img_request);
WARN_ON(!list_empty(&img_request->lock_item));
for_each_obj_request_safe(img_request, obj_request, next_obj_request)
rbd_img_obj_request_del(img_request, obj_request);
if (img_request_layered_test(img_request)) {
img_request_layered_clear(img_request);
if (img_request_layered_test(img_request))
rbd_dev_parent_put(img_request->rbd_dev);
}
if (rbd_img_is_write(img_request))
ceph_put_snap_context(img_request->snapc);
if (test_bit(IMG_REQ_CHILD, &img_request->flags))
kmem_cache_free(rbd_img_request_cache, img_request);
}
......@@ -2849,17 +2817,22 @@ static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
struct rbd_img_request *img_req = obj_req->img_request;
struct rbd_device *parent = img_req->rbd_dev->parent;
struct rbd_img_request *child_img_req;
int ret;
child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
OBJ_OP_READ, NULL);
child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
if (!child_img_req)
return -ENOMEM;
rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
child_img_req->obj_request = obj_req;
down_read(&parent->header_rwsem);
rbd_img_capture_header(child_img_req);
up_read(&parent->header_rwsem);
dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
obj_req);
......@@ -2888,7 +2861,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
obj_req->copyup_bvecs);
}
if (ret) {
rbd_img_request_put(child_img_req);
rbd_img_request_destroy(child_img_req);
return ret;
}
......@@ -3647,15 +3620,15 @@ static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
struct rbd_obj_request *obj_req = img_req->obj_request;
rbd_img_request_put(img_req);
rbd_img_request_destroy(img_req);
if (__rbd_obj_handle_request(obj_req, &result)) {
img_req = obj_req->img_request;
goto again;
}
} else {
struct request *rq = img_req->rq;
struct request *rq = blk_mq_rq_from_pdu(img_req);
rbd_img_request_put(img_req);
rbd_img_request_destroy(img_req);
blk_mq_end_request(rq, errno_to_blk_status(result));
}
}
......@@ -4707,84 +4680,36 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
static void rbd_queue_workfn(struct work_struct *work)
{
struct request *rq = blk_mq_rq_from_pdu(work);
struct rbd_device *rbd_dev = rq->q->queuedata;
struct rbd_img_request *img_request;
struct ceph_snap_context *snapc = NULL;
struct rbd_img_request *img_request =
container_of(work, struct rbd_img_request, work);
struct rbd_device *rbd_dev = img_request->rbd_dev;
enum obj_operation_type op_type = img_request->op_type;
struct request *rq = blk_mq_rq_from_pdu(img_request);
u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
u64 length = blk_rq_bytes(rq);
enum obj_operation_type op_type;
u64 mapping_size;
int result;
switch (req_op(rq)) {
case REQ_OP_DISCARD:
op_type = OBJ_OP_DISCARD;
break;
case REQ_OP_WRITE_ZEROES:
op_type = OBJ_OP_ZEROOUT;
break;
case REQ_OP_WRITE:
op_type = OBJ_OP_WRITE;
break;
case REQ_OP_READ:
op_type = OBJ_OP_READ;
break;
default:
dout("%s: non-fs request type %d\n", __func__, req_op(rq));
result = -EIO;
goto err;
}
/* Ignore/skip any zero-length requests */
if (!length) {
dout("%s: zero-length request\n", __func__);
result = 0;
goto err_rq;
}
if (op_type != OBJ_OP_READ) {
if (rbd_is_ro(rbd_dev)) {
rbd_warn(rbd_dev, "%s on read-only mapping",
obj_op_name(op_type));
result = -EIO;
goto err;
}
rbd_assert(!rbd_is_snap(rbd_dev));
}
if (offset && length > U64_MAX - offset + 1) {
rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
length);
result = -EINVAL;
goto err_rq; /* Shouldn't happen */
goto err_img_request;
}
blk_mq_start_request(rq);
down_read(&rbd_dev->header_rwsem);
mapping_size = rbd_dev->mapping.size;
if (op_type != OBJ_OP_READ) {
snapc = rbd_dev->header.snapc;
ceph_get_snap_context(snapc);
}
rbd_img_capture_header(img_request);
up_read(&rbd_dev->header_rwsem);
if (offset + length > mapping_size) {
rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
length, mapping_size);
result = -EIO;
goto err_rq;
}
img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
if (!img_request) {
result = -ENOMEM;
goto err_rq;
goto err_img_request;
}
img_request->rq = rq;
snapc = NULL; /* img_request consumes a ref */
dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
img_request, obj_op_name(op_type), offset, length);
......@@ -4801,23 +4726,51 @@ static void rbd_queue_workfn(struct work_struct *work)
return;
err_img_request:
rbd_img_request_put(img_request);
err_rq:
rbd_img_request_destroy(img_request);
if (result)
rbd_warn(rbd_dev, "%s %llx at %llx result %d",
obj_op_name(op_type), length, offset, result);
ceph_put_snap_context(snapc);
err:
blk_mq_end_request(rq, errno_to_blk_status(result));
}
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *rq = bd->rq;
struct work_struct *work = blk_mq_rq_to_pdu(rq);
struct rbd_device *rbd_dev = hctx->queue->queuedata;
struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
enum obj_operation_type op_type;
queue_work(rbd_wq, work);
switch (req_op(bd->rq)) {
case REQ_OP_DISCARD:
op_type = OBJ_OP_DISCARD;
break;
case REQ_OP_WRITE_ZEROES:
op_type = OBJ_OP_ZEROOUT;
break;
case REQ_OP_WRITE:
op_type = OBJ_OP_WRITE;
break;
case REQ_OP_READ:
op_type = OBJ_OP_READ;
break;
default:
rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
return BLK_STS_IOERR;
}
rbd_img_request_init(img_req, rbd_dev, op_type);
if (rbd_img_is_write(img_req)) {
if (rbd_is_ro(rbd_dev)) {
rbd_warn(rbd_dev, "%s on read-only mapping",
obj_op_name(img_req->op_type));
return BLK_STS_IOERR;
}
rbd_assert(!rbd_is_snap(rbd_dev));
}
INIT_WORK(&img_req->work, rbd_queue_workfn);
queue_work(rbd_wq, &img_req->work);
return BLK_STS_OK;
}
......@@ -4984,18 +4937,8 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
return ret;
}
static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, unsigned int numa_node)
{
struct work_struct *work = blk_mq_rq_to_pdu(rq);
INIT_WORK(work, rbd_queue_workfn);
return 0;
}
static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
.init_request = rbd_init_request,
};
static int rbd_init_disk(struct rbd_device *rbd_dev)
......@@ -5027,8 +4970,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
if (err)
......
......@@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
if (!PagePrivate(page))
return;
ClearPageChecked(page);
dout("%p invalidatepage %p idx %lu full dirty page\n",
inode, page, page->index);
......@@ -182,6 +180,47 @@ static int ceph_releasepage(struct page *page, gfp_t g)
return !PagePrivate(page);
}
/*
* Read some contiguous pages. If we cross a stripe boundary, shorten
* *plen. Return number of bytes read, or error.
*/
static int ceph_sync_readpages(struct ceph_fs_client *fsc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
u64 off, u64 *plen,
u32 truncate_seq, u64 truncate_size,
struct page **pages, int num_pages,
int page_align)
{
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
int rc = 0;
dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
vino.snap, off, *plen);
req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, truncate_seq, truncate_size,
false);
if (IS_ERR(req))
return PTR_ERR(req);
/* it may be a short read due to an object boundary */
osd_req_op_extent_osd_data_pages(req, 0,
pages, *plen, page_align, false, false);
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
off, *plen, *plen, page_align);
rc = ceph_osdc_start_request(osdc, req, false);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
ceph_osdc_put_request(req);
dout("readpages result %d\n", rc);
return rc;
}
/*
* read a single page, without unlocking it.
*/
......@@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
err = ceph_sync_readpages(fsc, ceph_vino(inode),
&ci->i_layout, off, &len,
ci->i_truncate_seq, ci->i_truncate_size,
&page, 1, 0);
......@@ -570,6 +609,47 @@ static u64 get_writepages_data_length(struct inode *inode,
return end > start ? end - start : 0;
}
/*
* do a synchronous write on N pages
*/
static int ceph_sync_writepages(struct ceph_fs_client *fsc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *snapc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec64 *mtime,
struct page **pages, int num_pages)
{
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
int rc = 0;
int page_align = off & ~PAGE_MASK;
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq, truncate_size,
true);
if (IS_ERR(req))
return PTR_ERR(req);
/* it may be a short write due to an object boundary */
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
false, false);
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
req->r_mtime = *mtime;
rc = ceph_osdc_start_request(osdc, req, true);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
ceph_osdc_put_request(req);
if (rc == 0)
rc = len;
dout("writepages result %d\n", rc);
return rc;
}
/*
* Write a single page, but leave the page locked.
*
......@@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
err = ceph_sync_writepages(fsc, ceph_vino(inode),
&ci->i_layout, snapc, page_off, len,
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size,
......@@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
do {
lock_page(page);
if ((off > size) || (page->mapping != inode->i_mapping)) {
if (page_mkwrite_check_truncate(page, inode) < 0) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
break;
......
......@@ -32,7 +32,7 @@ struct ceph_fscache_entry {
size_t uniq_len;
/* The following members must be last */
struct ceph_fsid fsid;
char uniquifier[0];
char uniquifier[];
};
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
......
......@@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_min = round_jiffies(jiffies +
opt->caps_wanted_delay_min * HZ);
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
ci->i_hold_caps_max - jiffies);
}
/*
......@@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci,
bool set_timeout)
struct ceph_inode_info *ci)
{
dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
......@@ -520,7 +516,6 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
if (set_timeout)
__cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
......@@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_delay_lock);
}
/*
* Common issue checks for add_cap, handle_cap_grant.
*/
/* Common issue checks for add_cap, handle_cap_grant. */
static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
unsigned issued)
{
unsigned had = __ceph_caps_issued(ci, NULL);
lockdep_assert_held(&ci->i_ceph_lock);
/*
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
if (S_ISREG(ci->vfs_inode.i_mode) &&
(issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
}
......@@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
__ceph_dir_clear_complete(ci);
}
}
/* Wipe saved layout if we're losing DIR_CREATE caps */
if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
!(issued & CEPH_CAP_DIR_CREATE)) {
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
}
}
/*
......@@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
*/
void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
int fmode, unsigned issued, unsigned wanted,
unsigned issued, unsigned wanted,
unsigned seq, unsigned mseq, u64 realmino, int flags,
struct ceph_cap **new_cap)
{
......@@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode,
dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
session->s_mds, cap_id, ceph_cap_string(issued), seq);
/*
* If we are opening the file, include file mode wanted bits
* in wanted.
*/
if (fmode >= 0)
wanted |= ceph_caps_for_mode(fmode);
spin_lock(&session->s_gen_ttl_lock);
gen = session->s_cap_gen;
spin_unlock(&session->s_gen_ttl_lock);
......@@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
__cap_delay_requeue(mdsc, ci, true);
__cap_delay_requeue(mdsc, ci);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
......@@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode,
cap->issue_seq = seq;
cap->mseq = mseq;
cap->cap_gen = gen;
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
}
/*
......@@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
(!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
(S_ISREG(ci->vfs_inode.i_mode) &&
ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
if (ci->i_wb_ref || ci->i_wrbuffer_ref)
used |= CEPH_CAP_FILE_BUFFER;
if (ci->i_fx_ref)
used |= CEPH_CAP_FILE_EXCL;
return used;
}
#define FMODE_WAIT_BIAS 1000
/*
* wanted, by virtue of open file modes
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
int i, bits = 0;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (ci->i_nr_by_mode[i])
bits |= 1 << i;
const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
ceph_inode_to_client(&ci->vfs_inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
if (S_ISDIR(ci->vfs_inode.i_mode)) {
int want = 0;
/* use used_cutoff here, to keep dir's wanted caps longer */
if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
time_after(ci->i_last_rd, used_cutoff))
want |= CEPH_CAP_ANY_SHARED;
if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
time_after(ci->i_last_wr, used_cutoff)) {
want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
want |= CEPH_CAP_ANY_DIR_OPS;
}
if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
want |= CEPH_CAP_PIN;
return want;
} else {
int bits = 0;
if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
time_after(ci->i_last_rd, used_cutoff))
bits |= 1 << RD_SHIFT;
} else if (time_after(ci->i_last_rd, idle_cutoff)) {
bits |= 1 << RD_SHIFT;
}
if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
time_after(ci->i_last_wr, used_cutoff))
bits |= 1 << WR_SHIFT;
} else if (time_after(ci->i_last_wr, idle_cutoff)) {
bits |= 1 << WR_SHIFT;
}
/* check lazyio only when read/write is wanted */
if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
ci->i_nr_by_mode[LAZY_SHIFT] > 0)
bits |= 1 << LAZY_SHIFT;
return bits ? ceph_caps_for_mode(bits >> 1) : 0;
}
if (bits == 0)
return 0;
return ceph_caps_for_mode(bits >> 1);
}
/*
* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
*/
int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
if (S_ISDIR(ci->vfs_inode.i_mode)) {
/* we want EXCL if holding caps of dir ops */
if (w & CEPH_CAP_ANY_DIR_OPS)
w |= CEPH_CAP_FILE_EXCL;
} else {
/* we want EXCL if dirty data */
if (w & CEPH_CAP_FILE_BUFFER)
w |= CEPH_CAP_FILE_EXCL;
}
return w;
}
/*
......@@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
return mds_wanted;
}
/*
* called under i_ceph_lock
*/
static int __ceph_is_single_caps(struct ceph_inode_info *ci)
{
return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
}
int ceph_is_any_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
......@@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct cap_msg_args arg;
int held, revoking;
int wake = 0;
int delayed = 0;
int ret;
/* Don't send anything if it's still being created. Return delayed */
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
spin_unlock(&ci->i_ceph_lock);
dout("%s async create in flight for %p\n", __func__, inode);
return 1;
}
held = cap->issued | cap->implemented;
revoking = cap->implemented & ~cap->issued;
retain &= ~revoking;
......@@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ceph_cap_string(revoking));
BUG_ON((retain & CEPH_CAP_PIN) == 0);
arg.session = cap->session;
/* don't release wanted unless we've waited a bit. */
if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
time_before(jiffies, ci->i_hold_caps_min)) {
dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
ceph_cap_string(cap->issued),
ceph_cap_string(cap->issued & retain),
ceph_cap_string(cap->mds_wanted),
ceph_cap_string(want));
want |= cap->mds_wanted;
retain |= cap->issued;
delayed = 1;
}
ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
if (want & ~cap->mds_wanted) {
/* user space may open/close single file frequently.
* This avoids droping mds_wanted immediately after
* requesting new mds_wanted.
*/
__cap_set_timeouts(mdsc, ci);
}
ci->i_ceph_flags &= ~CEPH_I_FLUSH;
cap->issued &= retain; /* drop bits we don't want */
if (cap->implemented & ~cap->issued) {
......@@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
cap->implemented &= cap->issued | used;
cap->mds_wanted = want;
arg.session = cap->session;
arg.ino = ceph_vino(inode).ino;
arg.cid = cap->cap_id;
arg.follows = flushing ? ci->i_head_snapc->seq : 0;
......@@ -1332,6 +1371,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.size = inode->i_size;
ci->i_reported_size = arg.size;
arg.max_size = ci->i_wanted_max_size;
if (cap == ci->i_auth_cap)
ci->i_requested_max_size = arg.max_size;
if (flushing & CEPH_CAP_XATTR_EXCL) {
......@@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(&arg);
if (ret < 0) {
dout("error sending cap msg, must requeue %p\n", inode);
delayed = 1;
pr_err("error sending cap msg, ino (%llx.%llx) "
"flushing %s tid %llu, requeue\n",
ceph_vinop(inode), ceph_cap_string(flushing),
flush_tid);
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci);
spin_unlock(&ci->i_ceph_lock);
}
if (wake)
wake_up_all(&ci->i_cap_wq);
return delayed;
return ret;
}
static inline int __send_flush_snap(struct inode *inode,
......@@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
int was = ci->i_dirty_caps;
int dirty = 0;
lockdep_assert_held(&ci->i_ceph_lock);
if (!ci->i_auth_cap) {
pr_warn("__mark_dirty_caps %p %llx mask %s, "
"but no auth cap (session was closed?)\n",
......@@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
__cap_delay_requeue(mdsc, ci, true);
__cap_delay_requeue(mdsc, ci);
return dirty;
}
......@@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
struct ceph_cap_flush *cf = NULL;
int flushing;
lockdep_assert_held(&ci->i_ceph_lock);
BUG_ON(ci->i_dirty_caps == 0);
BUG_ON(list_empty(&ci->i_dirty_item));
BUG_ON(!ci->i_prealloc_cap_flush);
......@@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* versus held caps. Release, flush, ack revoked caps to mds as
* appropriate.
*
* CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
* cap release further.
* CHECK_CAPS_AUTHONLY - we should only check the auth cap
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
......@@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
int delayed = 0, sent = 0;
bool no_delay = flags & CHECK_CAPS_NODELAY;
bool queue_invalidate = false;
bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
no_delay = true;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
if (!(flags & CHECK_CAPS_AUTHONLY) ||
(ci->i_auth_cap && __ceph_is_single_caps(ci)))
__cap_delay_cancel(mdsc, ci);
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
......@@ -1866,10 +1901,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
* revoking the shared cap on every create/unlink
* operation.
*/
if (IS_RDONLY(inode))
if (IS_RDONLY(inode)) {
want = CEPH_CAP_ANY_SHARED;
else
want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
} else {
want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
}
retain |= want;
} else {
......@@ -1885,14 +1921,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
}
dout("check_caps %p file_want %s used %s dirty %s flushing %s"
" issued %s revoking %s retain %s %s%s%s\n", inode,
" issued %s revoking %s retain %s %s%s\n", inode,
ceph_cap_string(file_wanted),
ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(issued), ceph_cap_string(revoking),
ceph_cap_string(retain),
(flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
(flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
(flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
/*
......@@ -1900,8 +1935,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
* have cached pages, but don't want them, then try to invalidate.
* If we fail, it's because pages are locked.... try again later.
*/
if ((!no_delay || mdsc->stopping) &&
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
S_ISREG(inode->i_mode) &&
!(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
(revoking & (CEPH_CAP_FILE_CACHE|
......@@ -1973,28 +2008,17 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
}
/* want more caps from mds? */
if (want & ~cap->mds_wanted) {
if (want & ~(cap->mds_wanted | cap->issued))
goto ack;
if (!__cap_is_valid(cap))
goto ack;
}
/* things we might delay */
if ((cap->issued & ~retain) == 0)
continue; /* nope, all good */
if (no_delay)
goto ack;
/* delay? */
if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
time_before(jiffies, ci->i_hold_caps_max)) {
dout(" delaying issued %s -> %s, wanted %s -> %s\n",
ceph_cap_string(cap->issued),
ceph_cap_string(cap->issued & retain),
ceph_cap_string(cap->mds_wanted),
ceph_cap_string(want));
delayed++;
continue;
}
ack:
if (session && session != cap->session) {
dout("oops, wrong session %p mutex\n", session);
......@@ -2055,18 +2079,20 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
}
mds = cap->mds; /* remember mds, so we don't repeat */
sent++;
/* __send_cap drops i_ceph_lock */
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0,
cap_used, want, retain, flushing,
flush_tid, oldest_flush_tid);
__send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
retain, flushing, flush_tid, oldest_flush_tid);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
/* Reschedule delayed caps release if we delayed anything */
if (delayed)
__cap_delay_requeue(mdsc, ci, false);
/* periodically re-calculate caps wanted by open files */
if (__ceph_is_any_real_caps(ci) &&
list_empty(&ci->i_cap_delay_list) &&
(file_wanted & ~CEPH_CAP_PIN) &&
!(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
__cap_delay_requeue(mdsc, ci);
}
spin_unlock(&ci->i_ceph_lock);
......@@ -2095,7 +2121,6 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
retry_locked:
if (ci->i_dirty_caps && ci->i_auth_cap) {
struct ceph_cap *cap = ci->i_auth_cap;
int delayed;
if (session != cap->session) {
spin_unlock(&ci->i_ceph_lock);
......@@ -2124,18 +2149,10 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
&oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
CEPH_CLIENT_CAPS_SYNC,
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
__ceph_caps_used(ci), __ceph_caps_wanted(ci),
(cap->issued | cap->implemented),
flushing, flush_tid, oldest_flush_tid);
if (delayed) {
spin_lock(&ci->i_ceph_lock);
__cap_delay_requeue(mdsc, ci, true);
spin_unlock(&ci->i_ceph_lock);
}
} else {
if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
......@@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
if (datasync)
goto out;
ret = ceph_wait_on_async_create(inode);
if (ret)
goto out;
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
......@@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
if (cf->caps) {
dout("kick_flushing_caps %p cap %p tid %llu %s\n",
inode, cap, cf->tid, ceph_cap_string(cf->caps));
ci->i_ceph_flags |= CEPH_I_NODELAY;
ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
__send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
(cf->tid < last_snap_flush ?
CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
__ceph_caps_used(ci),
__ceph_caps_wanted(ci),
(cap->issued | cap->implemented),
cf->caps, cf->tid, oldest_flush_tid);
if (ret) {
pr_err("kick_flushing_caps: error sending "
"cap flush, ino (%llx.%llx) "
"tid %llu flushing %s\n",
ceph_vinop(inode), cf->tid,
ceph_cap_string(cf->caps));
}
} else {
struct ceph_cap_snap *capsnap =
container_of(cf, struct ceph_cap_snap,
......@@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
}
}
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct inode *inode)
__releases(ci->i_ceph_lock)
void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
struct ceph_inode_info *ci)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap *cap = ci->i_auth_cap;
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s\n", inode,
lockdep_assert_held(&ci->i_ceph_lock);
dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
......@@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_unlock(&mdsc->cap_dirty_lock);
__kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
} else {
spin_unlock(&ci->i_ceph_lock);
}
}
......@@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
/*
* Take references to capabilities we hold, so that we don't release
* them to the MDS prematurely.
*
* Protected by i_ceph_lock.
*/
static void __take_cap_refs(struct ceph_inode_info *ci, int got,
void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
bool snap_rwsem_locked)
{
lockdep_assert_held(&ci->i_ceph_lock);
if (got & CEPH_CAP_PIN)
ci->i_pin_ref++;
if (got & CEPH_CAP_FILE_RD)
ci->i_rd_ref++;
if (got & CEPH_CAP_FILE_CACHE)
ci->i_rdcache_ref++;
if (got & CEPH_CAP_FILE_EXCL)
ci->i_fx_ref++;
if (got & CEPH_CAP_FILE_WR) {
if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
BUG_ON(!snap_rwsem_locked);
......@@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
if (ci->i_wb_ref == 0)
ihold(&ci->vfs_inode);
ci->i_wb_ref++;
dout("__take_cap_refs %p wb %d -> %d (?)\n",
dout("%s %p wb %d -> %d (?)\n", __func__,
&ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
......@@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
* Note that caller is responsible for ensuring max_size increases are
* requested from the MDS.
*
* Returns 0 if caps were not able to be acquired (yet), a 1 if they were,
* or a negative error code.
*
* FIXME: how does a 0 return differ from -EAGAIN?
* Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
* or a negative error code. There are 3 speical error codes:
* -EAGAIN: need to sleep but non-blocking is specified
* -EFBIG: ask caller to call check_max_size() and try again.
* -ESTALE: ask caller to call ceph_renew_caps() and try again.
*/
enum {
NON_BLOCKING = 1,
CHECK_FILELOCK = 2,
/* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
NON_BLOCKING = (1 << 8),
CHECK_FILELOCK = (1 << 9),
};
static int try_get_cap_refs(struct inode *inode, int need, int want,
......@@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
int ret = 0;
int have, implemented;
int file_wanted;
bool snap_rwsem_locked = false;
dout("get_cap_refs %p need %s want %s\n", inode,
......@@ -2557,15 +2568,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
goto out_unlock;
}
/* make sure file is actually open */
file_wanted = __ceph_caps_file_wanted(ci);
if ((file_wanted & need) != need) {
dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
ceph_cap_string(need), ceph_cap_string(file_wanted));
ret = -EBADF;
goto out_unlock;
}
/* finish pending truncate */
while (ci->i_truncate_pending) {
spin_unlock(&ci->i_ceph_lock);
......@@ -2584,7 +2586,7 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
inode, endoff, ci->i_max_size);
if (endoff > ci->i_requested_max_size)
ret = -EAGAIN;
ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
goto out_unlock;
}
/*
......@@ -2630,51 +2632,55 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
}
snap_rwsem_locked = true;
}
*got = need | (have & want);
if ((need & CEPH_CAP_FILE_RD) &&
if ((have & want) == want)
*got = need | want;
else
*got = need;
if (S_ISREG(inode->i_mode) &&
(need & CEPH_CAP_FILE_RD) &&
!(*got & CEPH_CAP_FILE_CACHE))
ceph_disable_fscache_readpage(ci);
__take_cap_refs(ci, *got, true);
ceph_take_cap_refs(ci, *got, true);
ret = 1;
}
} else {
int session_readonly = false;
if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
int mds_wanted;
if (ci->i_auth_cap &&
(need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
struct ceph_mds_session *s = ci->i_auth_cap->session;
spin_lock(&s->s_cap_lock);
session_readonly = s->s_readonly;
spin_unlock(&s->s_cap_lock);
}
if (session_readonly) {
dout("get_cap_refs %p needed %s but mds%d readonly\n",
dout("get_cap_refs %p need %s but mds%d readonly\n",
inode, ceph_cap_string(need), ci->i_auth_cap->mds);
ret = -EROFS;
goto out_unlock;
}
if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
int mds_wanted;
if (READ_ONCE(mdsc->fsc->mount_state) ==
CEPH_MOUNT_SHUTDOWN) {
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
dout("get_cap_refs %p forced umount\n", inode);
ret = -EIO;
goto out_unlock;
}
mds_wanted = __ceph_caps_mds_wanted(ci, false);
if (need & ~(mds_wanted & need)) {
dout("get_cap_refs %p caps were dropped"
" (session killed?)\n", inode);
if (need & ~mds_wanted) {
dout("get_cap_refs %p need %s > mds_wanted %s\n",
inode, ceph_cap_string(need),
ceph_cap_string(mds_wanted));
ret = -ESTALE;
goto out_unlock;
}
if (!(file_wanted & ~mds_wanted))
ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
}
dout("get_cap_refs %p have %s needed %s\n", inode,
dout("get_cap_refs %p have %s need %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
__ceph_touch_fmode(ci, mdsc, flags);
spin_unlock(&ci->i_ceph_lock);
if (snap_rwsem_locked)
up_read(&mdsc->snap_rwsem);
......@@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
static inline int get_used_fmode(int caps)
{
int fmode = 0;
if (caps & CEPH_CAP_FILE_RD)
fmode |= CEPH_FILE_MODE_RD;
if (caps & CEPH_CAP_FILE_WR)
fmode |= CEPH_FILE_MODE_WR;
return fmode;
}
int ceph_try_get_caps(struct inode *inode, int need, int want,
bool nonblock, int *got)
{
int ret;
int ret, flags;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_ANY_DIR_OPS));
if (need) {
ret = ceph_pool_perm_check(inode, need);
if (ret < 0)
return ret;
}
ret = try_get_cap_refs(inode, need, want, 0,
(nonblock ? NON_BLOCKING : 0), got);
return ret == -EAGAIN ? 0 : ret;
flags = get_used_fmode(need | want);
if (nonblock)
flags |= NON_BLOCKING;
ret = try_get_cap_refs(inode, need, want, 0, flags, got);
/* three special error codes */
if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN)
ret = 0;
return ret;
}
/*
......@@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want,
fi->filp_gen != READ_ONCE(fsc->filp_gen))
return -EBADF;
while (true) {
if (endoff > 0)
check_max_size(inode, endoff);
flags = get_used_fmode(need | want);
flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
while (true) {
flags &= CEPH_FILE_MODE_MASK;
if (atomic_read(&fi->num_locks))
flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
flags, &_got);
if (ret == -EAGAIN)
continue;
WARN_ON_ONCE(ret == -EAGAIN);
if (!ret) {
struct ceph_mds_client *mdsc = fsc->mdsc;
struct cap_wait cw;
......@@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
list_add(&cw.list, &mdsc->cap_wait_list);
spin_unlock(&mdsc->caps_list_lock);
/* make sure used fmode not timeout */
ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
......@@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
spin_lock(&mdsc->caps_list_lock);
list_del(&cw.list);
......@@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
if (ret < 0) {
if (ret == -EFBIG || ret == -ESTALE) {
int ret2 = ceph_wait_on_async_create(inode);
if (ret2 < 0)
return ret2;
}
if (ret == -EFBIG) {
check_max_size(inode, endoff);
continue;
}
if (ret == -ESTALE) {
/* session was killed, try renew caps */
ret = ceph_renew_caps(inode);
ret = ceph_renew_caps(inode, flags);
if (ret == 0)
continue;
}
return ret;
}
if (ci->i_inline_version != CEPH_INLINE_NONE &&
if (S_ISREG(ci->vfs_inode.i_mode) &&
ci->i_inline_version != CEPH_INLINE_NONE &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
struct page *page =
......@@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want,
break;
}
if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
if (S_ISREG(ci->vfs_inode.i_mode) &&
(_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
ceph_fscache_revalidate_cookie(ci);
*got = _got;
......@@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
{
spin_lock(&ci->i_ceph_lock);
__take_cap_refs(ci, caps, false);
ceph_take_cap_refs(ci, caps, false);
spin_unlock(&ci->i_ceph_lock);
}
......@@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (had & CEPH_CAP_FILE_CACHE)
if (--ci->i_rdcache_ref == 0)
last++;
if (had & CEPH_CAP_FILE_EXCL)
if (--ci->i_fx_ref == 0)
last++;
if (had & CEPH_CAP_FILE_BUFFER) {
if (--ci->i_wb_ref == 0) {
last++;
......@@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
last ? " last" : "", put ? " put" : "");
if (last && !flushsnaps)
if (last)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
......@@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
spin_unlock(&ci->i_ceph_lock);
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
ceph_check_caps(ci, 0, NULL);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
......@@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode,
* try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.)
*/
if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
......@@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode,
ceph_cap_string(cap->issued),
ceph_cap_string(newcaps),
ceph_cap_string(revoking));
if (revoking & used & CEPH_CAP_FILE_BUFFER)
if (S_ISREG(inode->i_mode) &&
(revoking & used & CEPH_CAP_FILE_BUFFER))
writeback = true; /* initiate writeback; will delay ack */
else if (revoking == CEPH_CAP_FILE_CACHE &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
queue_invalidate)
else if (queue_invalidate &&
revoking == CEPH_CAP_FILE_CACHE &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
; /* do nothing yet, invalidation will be queued */
else if (cap == ci->i_auth_cap)
check_caps = 1; /* check auth cap only */
......@@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode,
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
if (newcaps & ~extra_info->issued)
wake = true;
kick_flushing_inode_caps(session->s_mdsc, session, inode);
ceph_kick_flushing_inode_caps(session, ci);
spin_unlock(&ci->i_ceph_lock);
up_read(&session->s_mdsc->snap_rwsem);
} else {
spin_unlock(&ci->i_ceph_lock);
......@@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode,
wake_up_all(&ci->i_cap_wq);
if (check_caps == 1)
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
session);
else if (check_caps == 2)
ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
else
mutex_unlock(&session->s_mutex);
}
......@@ -3619,8 +3664,6 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
goto out_unlock;
if (target < 0) {
if (cap->mds_wanted | cap->issued)
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
goto out_unlock;
}
......@@ -3668,7 +3711,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
/* add placeholder for the export tagert */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
if (!list_empty(&ci->i_cap_flush_list) &&
......@@ -3773,7 +3816,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
......@@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
struct inode *inode;
struct ceph_inode_info *ci;
int flags = CHECK_CAPS_NODELAY;
dout("check_delayed_caps\n");
while (1) {
......@@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
dout("check_delayed_caps on %p\n", inode);
ceph_check_caps(ci, flags, NULL);
ceph_check_caps(ci, 0, NULL);
/* avoid calling iput_final() in tick thread */
ceph_async_iput(inode);
}
......@@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ihold(inode);
dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL);
ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
......@@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
void __ceph_touch_fmode(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc, int fmode)
{
unsigned long now = jiffies;
if (fmode & CEPH_FILE_MODE_RD)
ci->i_last_rd = now;
if (fmode & CEPH_FILE_MODE_WR)
ci->i_last_wr = now;
/* queue periodic check */
if (fmode &&
__ceph_is_any_real_caps(ci) &&
list_empty(&ci->i_cap_delay_list))
__cap_delay_requeue(mdsc, ci);
}
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i;
int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i))
ci->i_nr_by_mode[i]++;
ci->i_nr_by_mode[i] += count;
}
spin_unlock(&ci->i_ceph_lock);
}
/*
......@@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
* we may need to release capabilities to the MDS (or schedule
* their delayed release).
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
int i, last = 0;
int i;
int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
if (bits & (1 << i)) {
BUG_ON(ci->i_nr_by_mode[i] == 0);
if (--ci->i_nr_by_mode[i] == 0)
last++;
BUG_ON(ci->i_nr_by_mode[i] < count);
ci->i_nr_by_mode[i] -= count;
}
}
dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
&ci->vfs_inode, fmode,
ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
ceph_check_caps(ci, 0, NULL);
}
/*
......@@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (inode->i_nlink == 1) {
drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
ci->i_ceph_flags |= CEPH_I_NODELAY;
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_client(inode)->mdsc;
......@@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
if (force || (cap->issued & drop)) {
if (cap->issued & drop) {
int wanted = __ceph_caps_wanted(ci);
if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0)
wanted |= cap->mds_wanted;
dout("encode_inode_release %p cap %p "
"%s -> %s, wanted %s -> %s\n", inode, cap,
ceph_cap_string(cap->issued),
......
......@@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
return 0;
}
CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
CEPH_DEFINE_SHOW_FUNC(mdsc_show)
CEPH_DEFINE_SHOW_FUNC(caps_show)
CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
DEFINE_SHOW_ATTRIBUTE(mdsmap);
DEFINE_SHOW_ATTRIBUTE(mdsc);
DEFINE_SHOW_ATTRIBUTE(caps);
DEFINE_SHOW_ATTRIBUTE(mds_sessions);
/*
......@@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
0400,
fsc->client->debugfs_dir,
fsc,
&mdsmap_show_fops);
&mdsmap_fops);
fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
0400,
fsc->client->debugfs_dir,
fsc,
&mds_sessions_show_fops);
&mds_sessions_fops);
fsc->debugfs_mdsc = debugfs_create_file("mdsc",
0400,
fsc->client->debugfs_dir,
fsc,
&mdsc_show_fops);
&mdsc_fops);
fsc->debugfs_caps = debugfs_create_file("caps",
0400,
fsc->client->debugfs_dir,
fsc,
&caps_show_fops);
&caps_fops);
}
......
......@@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = 2;
}
/* can we use the dcache? */
spin_lock(&ci->i_ceph_lock);
/* request Fx cap. if have Fx, we don't need to release Fs cap
* for later create/unlink. */
__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR);
/* can we use the dcache? */
if (ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
......@@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_dentry_info *di = ceph_dentry(dentry);
spin_lock(&ci->i_ceph_lock);
dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags);
if (strncmp(dentry->d_name.name,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
......@@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
ceph_test_mount_opt(fsc, DCACHE) &&
__ceph_dir_is_complete(ci) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
spin_unlock(&ci->i_ceph_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
......@@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
if (result == -EJUKEBOX)
goto out;
/* If op failed, mark everyone involved for errors */
if (result) {
int pathlen;
u64 base;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
ceph_dir_clear_complete(req->r_parent);
/* drop the dentry -- we don't know its status */
if (!d_unhashed(req->r_dentry))
d_drop(req->r_dentry);
/* mark inode itself for an error (since metadata is bogus) */
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
out:
iput(req->r_old_inode);
ceph_mdsc_release_dir_caps(req);
}
static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
{
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di;
int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
spin_lock(&ci->i_ceph_lock);
if ((__ceph_caps_issued(ci, NULL) & want) == want) {
ceph_take_cap_refs(ci, want, false);
got = want;
}
spin_unlock(&ci->i_ceph_lock);
/* If we didn't get anything, return 0 */
if (!got)
return 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
/*
* - We are holding Fx, which implies Fs caps.
* - Only support async unlink for primary linkage
*/
if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
!(di->flags & CEPH_DENTRY_PRIMARY_LINK))
want = 0;
spin_unlock(&dentry->d_lock);
/* Do we still want what we've got? */
if (want == got)
return got;
ceph_put_cap_refs(ci, got);
return 0;
}
/*
* rmdir and unlink are differ only by the metadata op code
*/
......@@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = d_inode(dentry);
struct ceph_mds_request *req;
bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int err = -EROFS;
int op;
......@@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
} else
goto out;
retry:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
......@@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
if (try_async && op == CEPH_MDS_OP_UNLINK &&
(req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
dentry->d_name.len, dentry->d_name.name,
ceph_cap_string(req->r_dir_caps));
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_callback = ceph_async_unlink_cb;
req->r_old_inode = d_inode(dentry);
ihold(req->r_old_inode);
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
/*
* We have enough caps, so we assume that the unlink
* will succeed. Fix up the target inode and dcache.
*/
drop_nlink(inode);
d_delete(dentry);
} else if (err == -EJUKEBOX) {
try_async = false;
ceph_mdsc_put_request(req);
goto retry;
}
} else {
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
d_delete(dentry);
}
ceph_mdsc_put_request(req);
out:
return err;
......@@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
spin_lock(&dentry->d_lock);
di->time = jiffies;
di->lease_shared_gen = 0;
di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
__dentry_lease_unlist(di);
spin_unlock(&dentry->d_lock);
}
......@@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry)
/*
* Check if directory-wide content lease/cap is valid.
*/
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc)
{
struct ceph_inode_info *ci = ceph_inode(dir);
int valid;
......@@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
spin_lock(&ci->i_ceph_lock);
valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
if (valid) {
__ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD);
shared_gen = atomic_read(&ci->i_shared_gen);
}
spin_unlock(&ci->i_ceph_lock);
if (valid) {
struct ceph_dentry_info *di;
......@@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
int valid = 0;
struct dentry *parent;
struct inode *dir, *inode;
struct ceph_mds_client *mdsc;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
......@@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
dentry, inode, ceph_dentry(dentry)->offset);
mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
......@@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = dentry_lease_is_valid(dentry, flags);
if (valid == -ECHILD)
return valid;
if (valid || dir_lease_is_valid(dir, dentry)) {
if (valid || dir_lease_is_valid(dir, dentry, mdsc)) {
if (inode)
valid = ceph_is_any_caps(inode);
else
......@@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
}
if (!valid) {
struct ceph_mds_client *mdsc =
ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
int op, err;
u32 mask;
......
......@@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb,
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err) {
ceph_mdsc_put_request(req);
return ERR_PTR(err);
}
inode = req->r_target_inode;
if (inode)
ihold(inode);
......
......@@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
if (isdir) {
struct ceph_dir_file_info *dfi =
kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
if (!dfi) {
ceph_put_fmode(ci, fmode); /* clean up */
if (!dfi)
return -ENOMEM;
}
file->private_data = dfi;
fi = &dfi->file_info;
......@@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
dfi->readdir_cache_idx = -1;
} else {
fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
if (!fi) {
ceph_put_fmode(ci, fmode); /* clean up */
if (!fi)
return -ENOMEM;
}
file->private_data = fi;
}
ceph_get_fmode(ci, fmode, 1);
fi->fmode = fmode;
spin_lock_init(&fi->rw_contexts_lock);
INIT_LIST_HEAD(&fi->rw_contexts);
fi->meta_err = errseq_sample(&ci->i_meta_err);
......@@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFLNK:
dout("init_file %p %p 0%o (symlink)\n", inode, file,
inode->i_mode);
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
break;
default:
......@@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
* we need to drop the open ref now, since we don't
* have .release set to ceph_release.
*/
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
BUG_ON(inode->i_fop->release == ceph_release);
/* call the proper open fop */
......@@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
/*
* try renew caps after session gets killed.
*/
int ceph_renew_caps(struct inode *inode)
int ceph_renew_caps(struct inode *inode, int fmode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req;
int err, flags, wanted;
spin_lock(&ci->i_ceph_lock);
__ceph_touch_fmode(ci, mdsc, fmode);
wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
......@@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode)
req->r_inode = inode;
ihold(inode);
req->r_num_caps = 1;
req->r_fmode = -1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
......@@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
/* trivially open snapdir */
if (ceph_snap(inode) == CEPH_SNAPDIR) {
spin_lock(&ci->i_ceph_lock);
__ceph_get_fmode(ci, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
......@@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file)
dout("open %p fmode %d want %s issued %s using existing\n",
inode, fmode, ceph_cap_string(wanted),
ceph_cap_string(issued));
__ceph_get_fmode(ci, fmode);
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
/* adjust wanted? */
......@@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file)
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
(ci->i_snap_caps & wanted) == wanted) {
__ceph_get_fmode(ci, fmode);
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
return ceph_init_file(inode, file, fmode);
}
......@@ -430,6 +423,236 @@ int ceph_open(struct inode *inode, struct file *file)
return err;
}
/* Clone the layout from a synchronous create, if the dir now has Dc caps */
static void
cache_file_layout(struct inode *dst, struct inode *src)
{
struct ceph_inode_info *cdst = ceph_inode(dst);
struct ceph_inode_info *csrc = ceph_inode(src);
spin_lock(&cdst->i_ceph_lock);
if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) &&
!ceph_file_layout_is_valid(&cdst->i_cached_layout)) {
memcpy(&cdst->i_cached_layout, &csrc->i_layout,
sizeof(cdst->i_cached_layout));
rcu_assign_pointer(cdst->i_cached_layout.pool_ns,
ceph_try_get_string(csrc->i_layout.pool_ns));
}
spin_unlock(&cdst->i_ceph_lock);
}
/*
* Try to set up an async create. We need caps, a file layout, and inode number,
* and either a lease on the dentry or complete dir info. If any of those
* criteria are not satisfied, then return false and the caller can go
* synchronous.
*/
static int try_prep_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_file_layout *lo, u64 *pino)
{
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
u64 ino;
spin_lock(&ci->i_ceph_lock);
/* No auth cap means no chance for Dc caps */
if (!ci->i_auth_cap)
goto no_async;
/* Any delegated inos? */
if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos))
goto no_async;
if (!ceph_file_layout_is_valid(&ci->i_cached_layout))
goto no_async;
if ((__ceph_caps_issued(ci, NULL) & want) != want)
goto no_async;
if (d_in_lookup(dentry)) {
if (!__ceph_dir_is_complete(ci))
goto no_async;
spin_lock(&dentry->d_lock);
di->lease_shared_gen = atomic_read(&ci->i_shared_gen);
spin_unlock(&dentry->d_lock);
} else if (atomic_read(&ci->i_shared_gen) !=
READ_ONCE(di->lease_shared_gen)) {
goto no_async;
}
ino = ceph_get_deleg_ino(ci->i_auth_cap->session);
if (!ino)
goto no_async;
*pino = ino;
ceph_take_cap_refs(ci, want, false);
memcpy(lo, &ci->i_cached_layout, sizeof(*lo));
rcu_assign_pointer(lo->pool_ns,
ceph_try_get_string(ci->i_cached_layout.pool_ns));
got = want;
no_async:
spin_unlock(&ci->i_ceph_lock);
return got;
}
static void restore_deleg_ino(struct inode *dir, u64 ino)
{
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_mds_session *s = NULL;
spin_lock(&ci->i_ceph_lock);
if (ci->i_auth_cap)
s = ceph_get_mds_session(ci->i_auth_cap->session);
spin_unlock(&ci->i_ceph_lock);
if (s) {
int err = ceph_restore_deleg_ino(s, ino);
if (err)
pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n",
ino, err);
ceph_put_mds_session(s);
}
}
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
if (result == -EJUKEBOX)
goto out;
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
struct dentry *dentry = req->r_dentry;
int pathlen;
u64 base;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
d_drop(dentry);
/* FIXME: start returning I/O errors on all accesses? */
pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
}
if (req->r_target_inode) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
u64 ino = ceph_vino(req->r_target_inode).ino;
if (req->r_deleg_ino != ino)
pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
__func__, req->r_err, req->r_deleg_ino, ino);
mapping_set_error(req->r_target_inode->i_mapping, result);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
}
ceph_kick_flushing_inode_caps(req->r_session, ci);
spin_unlock(&ci->i_ceph_lock);
} else {
pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
req->r_deleg_ino);
}
out:
ceph_mdsc_release_dir_caps(req);
}
static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct file *file, umode_t mode,
struct ceph_mds_request *req,
struct ceph_acl_sec_ctx *as_ctx,
struct ceph_file_layout *lo)
{
int ret;
char xattr_buf[4];
struct ceph_mds_reply_inode in = { };
struct ceph_mds_reply_info_in iinfo = { .in = &in };
struct ceph_inode_info *ci = ceph_inode(dir);
struct inode *inode;
struct timespec64 now;
struct ceph_vino vino = { .ino = req->r_deleg_ino,
.snap = CEPH_NOSNAP };
ktime_get_real_ts64(&now);
inode = ceph_get_inode(dentry->d_sb, vino);
if (IS_ERR(inode))
return PTR_ERR(inode);
iinfo.inline_version = CEPH_INLINE_NONE;
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
iinfo.xattr_data = xattr_buf;
memset(iinfo.xattr_data, 0, iinfo.xattr_len);
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
in.version = cpu_to_le64(1); // ???
in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
in.cap.cap_id = cpu_to_le64(1);
in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
in.cap.flags = CEPH_CAP_FLAG_AUTH;
in.ctime = in.mtime = in.atime = iinfo.btime;
in.mode = cpu_to_le32((u32)mode);
in.truncate_seq = cpu_to_le32(1);
in.truncate_size = cpu_to_le64(-1ULL);
in.xattr_version = cpu_to_le64(1);
in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ?
dir->i_gid : current_fsgid()));
in.nlink = cpu_to_le32(1);
in.max_size = cpu_to_le64(lo->stripe_unit);
ceph_file_layout_to_legacy(lo, &in.layout);
ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
req->r_fmode, NULL);
if (ret) {
dout("%s failed to fill inode: %d\n", __func__, ret);
ceph_dir_clear_complete(dir);
if (!d_unhashed(dentry))
d_drop(dentry);
if (inode->i_state & I_NEW)
discard_new_inode(inode);
} else {
struct dentry *dn;
dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
vino.ino, dir->i_ino, dentry->d_name.name);
ceph_dir_clear_ordered(dir);
ceph_init_inode_acls(inode, as_ctx);
if (inode->i_state & I_NEW) {
/*
* If it's not I_NEW, then someone created this before
* we got here. Assume the server is aware of it at
* that point and don't worry about setting
* CEPH_I_ASYNC_CREATE.
*/
ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE;
unlock_new_inode(inode);
}
if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
if (!d_unhashed(dentry))
d_drop(dentry);
dn = d_splice_alias(inode, dentry);
WARN_ON_ONCE(dn && dn != dentry);
}
file->f_mode |= FMODE_CREATED;
ret = finish_open(file, dentry, ceph_open);
}
return ret;
}
/*
* Do a lookup + open with a single request. If we get a non-existent
......@@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req;
struct dentry *dn;
struct ceph_acl_sec_ctx as_ctx = {};
bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
int mask;
int err;
......@@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
/* If it's not being looked up, it's negative */
return -ENOENT;
}
retry:
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
......@@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
if (flags & O_CREAT) {
struct ceph_file_layout lo;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
if (as_ctx.pagelist) {
req->r_pagelist = as_ctx.pagelist;
as_ctx.pagelist = NULL;
}
if (try_async &&
(req->r_dir_caps =
try_prep_async_create(dir, dentry, &lo,
&req->r_deleg_ino))) {
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
req->r_callback = ceph_async_create_cb;
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
err = ceph_finish_async_create(dir, dentry,
file, mode, req,
&as_ctx, &lo);
} else if (err == -EJUKEBOX) {
restore_deleg_ino(dir, req->r_deleg_ino);
ceph_mdsc_put_request(req);
try_async = false;
goto retry;
}
goto out_req;
}
}
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
......@@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
ceph_init_inode_acls(d_inode(dentry), &as_ctx);
struct inode *newino = d_inode(dentry);
cache_file_layout(dir, newino);
ceph_init_inode_acls(newino, &as_ctx);
file->f_mode |= FMODE_CREATED;
}
err = finish_open(file, dentry, ceph_open);
}
out_req:
if (!req->r_err && req->r_target_inode)
ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
......@@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p dir file %p\n", inode, file);
WARN_ON(!list_empty(&dfi->file_info.rw_contexts));
ceph_put_fmode(ci, dfi->file_info.fmode);
ceph_put_fmode(ci, dfi->file_info.fmode, 1);
if (dfi->last_readdir)
ceph_mdsc_put_request(dfi->last_readdir);
......@@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file)
dout("release inode %p regular file %p\n", inode, file);
WARN_ON(!list_empty(&fi->rw_contexts));
ceph_put_fmode(ci, fi->fmode);
ceph_put_fmode(ci, fi->fmode, 1);
kmem_cache_free(ceph_file_cachep, fi);
}
......@@ -1567,7 +1815,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL);
ceph_check_caps(ci, 0, NULL);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
......@@ -1944,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0;
}
static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
struct ceph_inode_info *dst_ci, u64 *dst_off,
struct ceph_fs_client *fsc,
size_t len, unsigned int flags)
{
struct ceph_object_locator src_oloc, dst_oloc;
struct ceph_object_id src_oid, dst_oid;
size_t bytes = 0;
u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
u32 src_objlen, dst_objlen;
u32 object_size = src_ci->i_layout.object_size;
int ret;
src_oloc.pool = src_ci->i_layout.pool_id;
src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
dst_oloc.pool = dst_ci->i_layout.pool_id;
dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
while (len >= object_size) {
ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
object_size, &src_objnum,
&src_objoff, &src_objlen);
ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
object_size, &dst_objnum,
&dst_objoff, &dst_objlen);
ceph_oid_init(&src_oid);
ceph_oid_printf(&src_oid, "%llx.%08llx",
src_ci->i_vino.ino, src_objnum);
ceph_oid_init(&dst_oid);
ceph_oid_printf(&dst_oid, "%llx.%08llx",
dst_ci->i_vino.ino, dst_objnum);
/* Do an object remote copy */
ret = ceph_osdc_copy_from(&fsc->client->osdc,
src_ci->i_vino.snap, 0,
&src_oid, &src_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
&dst_oid, &dst_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
dst_ci->i_truncate_seq,
dst_ci->i_truncate_size,
CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
if (ret) {
if (ret == -EOPNOTSUPP) {
fsc->have_copy_from2 = false;
pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
}
dout("ceph_osdc_copy_from returned %d\n", ret);
if (!bytes)
bytes = ret;
goto out;
}
len -= object_size;
bytes += object_size;
*src_off += object_size;
*dst_off += object_size;
}
out:
ceph_oloc_destroy(&src_oloc);
ceph_oloc_destroy(&dst_oloc);
return bytes;
}
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags)
......@@ -1954,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf;
struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
struct ceph_object_locator src_oloc, dst_oloc;
struct ceph_object_id src_oid, dst_oid;
loff_t endoff = 0, size;
ssize_t ret = -EIO;
loff_t size;
ssize_t ret = -EIO, bytes;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
u32 src_objlen, dst_objlen, object_size;
u32 src_objlen, dst_objlen;
int src_got = 0, dst_got = 0, err, dirty;
bool do_final_copy = false;
if (src_inode->i_sb != dst_inode->i_sb) {
struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
......@@ -2039,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ret < 0)
goto out_caps;
size = i_size_read(dst_inode);
endoff = dst_off + len;
/* Drop dst file cached pages */
ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT,
endoff >> PAGE_SHIFT);
(dst_off + len) >> PAGE_SHIFT);
if (ret < 0) {
dout("Failed to invalidate inode pages (%zd)\n", ret);
ret = 0; /* XXX */
}
src_oloc.pool = src_ci->i_layout.pool_id;
src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
dst_oloc.pool = dst_ci->i_layout.pool_id;
dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
src_ci->i_layout.object_size,
&src_objnum, &src_objoff, &src_objlen);
......@@ -2073,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off
*/
if (src_objoff) {
dout("Initial partial copy of %u bytes\n", src_objlen);
/*
* we need to temporarily drop all caps as we'll be calling
* {read,write}_iter, which will get caps again.
......@@ -2080,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
if (ret < 0) {
dout("do_splice_direct returned %d\n", err);
/* Abort on short copies or on error */
if (ret < src_objlen) {
dout("Failed partial copy (%zd)\n", ret);
goto out;
}
len -= ret;
......@@ -2094,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (err < 0)
goto out_caps;
}
object_size = src_ci->i_layout.object_size;
while (len >= object_size) {
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
object_size, &src_objnum,
&src_objoff, &src_objlen);
ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
object_size, &dst_objnum,
&dst_objoff, &dst_objlen);
ceph_oid_init(&src_oid);
ceph_oid_printf(&src_oid, "%llx.%08llx",
src_ci->i_vino.ino, src_objnum);
ceph_oid_init(&dst_oid);
ceph_oid_printf(&dst_oid, "%llx.%08llx",
dst_ci->i_vino.ino, dst_objnum);
/* Do an object remote copy */
err = ceph_osdc_copy_from(
&src_fsc->client->osdc,
src_ci->i_vino.snap, 0,
&src_oid, &src_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
&dst_oid, &dst_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
if (err) {
if (err == -EOPNOTSUPP) {
src_fsc->have_copy_from2 = false;
pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
}
dout("ceph_osdc_copy_from returned %d\n", err);
size = i_size_read(dst_inode);
bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
src_fsc, len, flags);
if (bytes <= 0) {
if (!ret)
ret = err;
ret = bytes;
goto out_caps;
}
len -= object_size;
src_off += object_size;
dst_off += object_size;
ret += object_size;
}
if (len)
/* We still need one final local copy */
do_final_copy = true;
dout("Copied %zu bytes out of %zu\n", bytes, len);
len -= bytes;
ret += bytes;
file_update_time(dst_file);
inode_inc_iversion_raw(dst_inode);
if (endoff > size) {
int caps_flags = 0;
if (dst_off > size) {
/* Let the MDS know about dst file size change */
if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
caps_flags |= CHECK_CAPS_NODELAY;
if (ceph_inode_set_size(dst_inode, endoff))
caps_flags |= CHECK_CAPS_AUTHONLY;
if (caps_flags)
ceph_check_caps(dst_ci, caps_flags, NULL);
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
......@@ -2165,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
out_caps:
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
if (do_final_copy) {
err = do_splice_direct(src_file, &src_off, dst_file,
/*
* Do the final manual copy if we still have some bytes left, unless
* there were errors in remote object copies (len >= object_size).
*/
if (len && (len < src_ci->i_layout.object_size)) {
dout("Final partial copy of %zu bytes\n", len);
bytes = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, len, flags);
if (err < 0) {
dout("do_splice_direct returned %d\n", err);
goto out;
}
len -= err;
ret += err;
if (bytes > 0)
ret += bytes;
else
dout("Failed partial copy (%zd)\n", bytes);
}
out:
......
......@@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
inode->i_mtime = parent->i_mtime;
inode->i_ctime = parent->i_ctime;
inode->i_atime = parent->i_atime;
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
......@@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_max_files = 0;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
......@@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_prealloc_cap_flush = NULL;
INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
......@@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_rdcache_ref = 0;
ci->i_wr_ref = 0;
ci->i_wb_ref = 0;
ci->i_fx_ref = 0;
ci->i_wrbuffer_ref = 0;
ci->i_wrbuffer_ref_head = 0;
atomic_set(&ci->i_filelock_ref, 0);
......@@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
}
static inline blkcnt_t calc_inode_blocks(u64 size)
......@@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if ((issued & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_BUFFER)) ||
mapping_mapped(inode->i_mapping) ||
__ceph_caps_file_wanted(ci)) {
__ceph_is_file_opened(ci)) {
ci->i_truncate_pending++;
queue_trunc = 1;
}
......@@ -727,7 +734,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
*/
static int fill_inode(struct inode *inode, struct page *locked_page,
int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_reply_info_in *iinfo,
struct ceph_mds_reply_dirfrag *dirinfo,
struct ceph_mds_session *session, int cap_fmode,
......@@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
bool new_version = false;
bool fill_inline = false;
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
......@@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (iinfo->xattr_len > 4) {
xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
if (!xattr_blob)
pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
iinfo->xattr_len);
}
......@@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
spin_unlock(&ci->i_ceph_lock);
if (symlen != i_size_read(inode)) {
pr_err("fill_inode %llx.%llx BAD symlink "
"size %lld\n", ceph_vinop(inode),
pr_err("%s %llx.%llx BAD symlink "
"size %lld\n", __func__,
ceph_vinop(inode),
i_size_read(inode));
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
......@@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_dir_fops;
break;
default:
pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
ceph_vinop(inode), inode->i_mode);
}
......@@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (ceph_snap(inode) == CEPH_NOSNAP) {
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
cap_fmode, info_caps,
info_caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
......@@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(info_caps));
ci->i_snap_caps |= info_caps;
if (cap_fmode >= 0)
__ceph_get_fmode(ci, cap_fmode);
}
} else if (cap_fmode >= 0) {
pr_warn("mds issued no caps on %llx.%llx\n",
ceph_vinop(inode));
__ceph_get_fmode(ci, cap_fmode);
}
if (iinfo->inline_version > 0 &&
......@@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
fill_inline = true;
}
if (cap_fmode >= 0) {
if (!info_caps)
pr_warn("mds issued no caps on %llx.%llx\n",
ceph_vinop(inode));
__ceph_touch_fmode(ci, mdsc, cap_fmode);
}
spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
......@@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
struct ceph_mds_session **old_lease_session)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
unsigned mask = le16_to_cpu(lease->mask);
long unsigned duration = le32_to_cpu(lease->duration_ms);
long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
......@@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
if (ceph_snap(dir) != CEPH_NOSNAP)
return;
if (mask & CEPH_LEASE_PRIMARY_LINK)
di->flags |= CEPH_DENTRY_PRIMARY_LINK;
else
di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
if (duration == 0) {
if (!(mask & CEPH_LEASE_VALID)) {
__ceph_dentry_dir_lease_touch(di);
return;
}
......@@ -1239,9 +1254,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
struct inode *dir = req->r_parent;
if (dir) {
err = fill_inode(dir, NULL,
&rinfo->diri, rinfo->dirfrag,
session, -1,
err = ceph_fill_inode(dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
&req->r_caps_reservation);
if (err < 0)
goto done;
......@@ -1307,13 +1321,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
goto done;
}
err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
session,
err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
NULL, session,
(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
rinfo->head->result == 0) ? req->r_fmode : -1,
&req->r_caps_reservation);
if (err < 0) {
pr_err("fill_inode badness %p %llx.%llx\n",
pr_err("ceph_fill_inode badness %p %llx.%llx\n",
in, ceph_vinop(in));
if (in->i_state & I_NEW)
discard_new_inode(in);
......@@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
dout("new_inode badness got %d\n", err);
continue;
}
rc = fill_inode(in, NULL, &rde->inode, NULL, session,
rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (rc < 0) {
pr_err("fill_inode badness on %p got %d\n", in, rc);
pr_err("ceph_fill_inode badness on %p got %d\n",
in, rc);
err = rc;
if (in->i_state & I_NEW) {
ihold(in);
......@@ -1707,10 +1723,10 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
}
ret = fill_inode(in, NULL, &rde->inode, NULL, session,
ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
-1, &req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in);
pr_err("ceph_fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) {
/* avoid calling iput_final() in mds
* dispatch threads */
......@@ -1972,7 +1988,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
ceph_check_caps(ci, 0, NULL);
wake_up_all(&ci->i_cap_wq);
}
......
......@@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file)
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
fi->fmode |= CEPH_FILE_MODE_LAZY;
ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
__ceph_touch_fmode(ci, mdsc, fi->fmode);
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
......
......@@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
return 0;
}
static int try_unlock_file(struct file *file, struct file_lock *fl)
{
int err;
unsigned int orig_flags = fl->fl_flags;
fl->fl_flags |= FL_EXISTS;
err = locks_lock_file_wait(file, fl);
fl->fl_flags = orig_flags;
if (err == -ENOENT) {
if (!(orig_flags & FL_EXISTS))
err = 0;
return err;
}
return 1;
}
/**
* Attempt to set an fcntl lock.
* For now, this just goes away to the server. Later it may be more awesome.
......@@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
err = try_unlock_file(file, fl);
if (err <= 0)
return err;
}
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
if (!err) {
if (op == CEPH_MDS_OP_SETFILELOCK) {
if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
dout("mds locked, locking locally\n");
err = posix_lock_file(file, fl, NULL);
if (err) {
......@@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
if (F_UNLCK == fl->fl_type) {
err = try_unlock_file(file, fl);
if (err <= 0)
return err;
}
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
inode, lock_cmd, wait, fl);
if (!err) {
if (!err && F_UNLCK != fl->fl_type) {
err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
......
......@@ -415,21 +415,121 @@ static int parse_reply_info_filelock(void **p, void *end,
return -EIO;
}
#if BITS_PER_LONG == 64
#define DELEGATED_INO_AVAILABLE xa_mk_value(1)
static int ceph_parse_deleg_inos(void **p, void *end,
struct ceph_mds_session *s)
{
u32 sets;
ceph_decode_32_safe(p, end, sets, bad);
dout("got %u sets of delegated inodes\n", sets);
while (sets--) {
u64 start, len, ino;
ceph_decode_64_safe(p, end, start, bad);
ceph_decode_64_safe(p, end, len, bad);
while (len--) {
int err = xa_insert(&s->s_delegated_inos, ino = start++,
DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
if (!err) {
dout("added delegated inode 0x%llx\n",
start - 1);
} else if (err == -EBUSY) {
pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
start - 1);
} else {
return err;
}
}
}
return 0;
bad:
return -EIO;
}
u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
{
unsigned long ino;
void *val;
xa_for_each(&s->s_delegated_inos, ino, val) {
val = xa_erase(&s->s_delegated_inos, ino);
if (val == DELEGATED_INO_AVAILABLE)
return ino;
}
return 0;
}
int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
{
return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
GFP_KERNEL);
}
#else /* BITS_PER_LONG == 64 */
/*
* FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
* ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
* and bottom words?
*/
static int ceph_parse_deleg_inos(void **p, void *end,
struct ceph_mds_session *s)
{
u32 sets;
ceph_decode_32_safe(p, end, sets, bad);
if (sets)
ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
return 0;
bad:
return -EIO;
}
u64 ceph_get_deleg_ino(struct ceph_mds_session *s)
{
return 0;
}
int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
{
return 0;
}
#endif /* BITS_PER_LONG == 64 */
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
u64 features, struct ceph_mds_session *s)
{
int ret;
if (features == (u64)-1 ||
(features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
/* Malformed reply? */
if (*p == end) {
/* Malformed reply? */
info->has_create_ino = false;
} else {
} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
u8 struct_v, struct_compat;
u32 len;
info->has_create_ino = true;
ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad);
ceph_decode_32_safe(p, end, len, bad);
ceph_decode_64_safe(p, end, info->ino, bad);
ret = ceph_parse_deleg_inos(p, end, s);
if (ret)
return ret;
} else {
/* legacy */
ceph_decode_64_safe(p, end, info->ino, bad);
info->has_create_ino = true;
}
} else {
if (*p != end)
......@@ -448,7 +548,7 @@ static int parse_reply_info_create(void **p, void *end,
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
u64 features, struct ceph_mds_session *s)
{
u32 op = le32_to_cpu(info->head->op);
......@@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end,
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
return parse_reply_info_create(p, end, info, features, s);
else
return -EIO;
}
......@@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end,
/*
* parse entire mds reply
*/
static int parse_reply_info(struct ceph_msg *msg,
static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
......@@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
err = parse_reply_info_extra(&p, p+len, info, features);
err = parse_reply_info_extra(&p, p+len, info, features, s);
if (err < 0)
goto out_bad;
}
......@@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer);
xa_destroy(&s->s_delegated_inos);
kfree(s);
}
}
......@@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
xa_init(&s->s_delegated_inos);
s->s_num_cap_releases = 0;
s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
......@@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
ceph_mdsc_release_dir_caps(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
......@@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref)
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
WARN_ON_ONCE(!list_empty(&req->r_wait));
kfree(req);
kmem_cache_free(ceph_mds_request_cachep, req);
}
DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
......@@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
ihold(dir);
req->r_unsafe_dir = dir;
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
}
......@@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
erase_request(&mdsc->request_tree, req);
if (req->r_unsafe_dir &&
test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
if (req->r_unsafe_dir) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
......@@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
if (cap->mds_wanted | cap->issued)
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
__ceph_remove_cap(cap, false);
if (!ci->i_auth_cap) {
struct ceph_cap_flush *cf;
......@@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
/* mds did not re-issue stale cap */
spin_lock(&ci->i_ceph_lock);
cap->issued = cap->implemented = CEPH_CAP_PIN;
/* make sure mds knows what we want */
if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted)
ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
spin_unlock(&ci->i_ceph_lock);
}
} else if (ev == FORCE_RO) {
......@@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
}
/* The inode has cached pages, but it's no longer used.
* we can safely drop it */
if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
if (S_ISREG(inode->i_mode) &&
wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
!(oissued & CEPH_CAP_FILE_CACHE)) {
used = 0;
oissued = 0;
......@@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
{
struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
struct ceph_mds_request *req;
req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
......@@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->op = cpu_to_le32(req->r_op);
head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
head->ino = 0;
head->ino = cpu_to_le64(req->r_deleg_ino);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
......@@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_inode_drop)
releases += ceph_encode_inode_release(&p,
req->r_inode ? req->r_inode : d_inode(req->r_dentry),
mds, req->r_inode_drop, req->r_inode_unless, 0);
mds, req->r_inode_drop, req->r_inode_unless,
req->r_op == CEPH_MDS_OP_READDIR);
if (req->r_dentry_drop)
releases += ceph_encode_dentry_release(&p, req->r_dentry,
req->r_parent, mds, req->r_dentry_drop,
......@@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
rhead->flags = cpu_to_le32(flags);
rhead->num_fwd = req->r_num_fwd;
rhead->num_retry = req->r_attempts - 1;
rhead->ino = 0;
dout(" r_parent = %p\n", req->r_parent);
return 0;
......@@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
dout("do_request timed out\n");
err = -EIO;
err = -ETIMEDOUT;
goto finish;
}
if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
......@@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req, &random);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
err = -EJUKEBOX;
goto finish;
}
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
......@@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EACCES;
goto out_session;
}
/*
* We cannot queue async requests since the caps and delegated
* inodes are bound to the session. Just return -EJUKEBOX and
* let the caller retry a sync request in that case.
*/
if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
err = -EJUKEBOX;
goto out_session;
}
if (session->s_state == CEPH_MDS_SESSION_NEW ||
session->s_state == CEPH_MDS_SESSION_CLOSING) {
__open_session(mdsc, session);
......@@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
struct ceph_mds_request *req)
{
int err;
int err = 0;
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_parent) {
ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
struct ceph_inode_info *ci = ceph_inode(req->r_parent);
int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
spin_lock(&ci->i_ceph_lock);
ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
__ceph_touch_fmode(ci, mdsc, fmode);
spin_unlock(&ci->i_ceph_lock);
ihold(req->r_parent);
}
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
if (req->r_inode) {
err = ceph_wait_on_async_create(req->r_inode);
if (err) {
dout("%s: wait for async create returned: %d\n",
__func__, err);
return err;
}
}
if (!err && req->r_old_inode) {
err = ceph_wait_on_async_create(req->r_old_inode);
if (err) {
dout("%s: wait for async create returned: %d\n",
__func__, err);
return err;
}
}
dout("submit_request on %p for inode %p\n", req, dir);
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
......@@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
if (timeleft > 0)
err = 0;
else if (!timeleft)
err = -EIO; /* timed out */
err = -ETIMEDOUT; /* timed out */
else
err = timeleft; /* killed */
}
......@@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
if (req->r_unsafe_dir) {
struct ceph_inode_info *ci =
ceph_inode(req->r_unsafe_dir);
spin_lock(&ci->i_unsafe_lock);
list_add_tail(&req->r_unsafe_dir_item,
&ci->i_unsafe_dirops);
spin_unlock(&ci->i_unsafe_lock);
}
}
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
err = parse_reply_info(msg, rinfo, (u64)-1);
err = parse_reply_info(session, msg, rinfo, (u64)-1);
else
err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
......@@ -3249,6 +3384,17 @@ static void handle_session(struct ceph_mds_session *session,
return;
}
void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
{
int dcaps;
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
}
}
/*
* called under session->mutex.
*/
......@@ -3276,8 +3422,13 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
continue;
if (req->r_attempts == 0)
continue; /* only old requests */
if (req->r_session &&
req->r_session->s_mds == session->s_mds)
if (!req->r_session)
continue;
if (req->r_session->s_mds != session->s_mds)
continue;
ceph_mdsc_release_dir_caps(req);
__send_request(mdsc, session, req, true);
}
mutex_unlock(&mdsc->mutex);
......@@ -3362,7 +3513,7 @@ static int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
/*
* Encode information about a cap for a reconnect with the MDS.
*/
static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
void *arg)
{
union {
......@@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
/* These are lost when the session goes away */
if (S_ISDIR(inode->i_mode)) {
if (cap->issued & CEPH_CAP_DIR_CREATE) {
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
}
cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
}
if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
......@@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (!reply)
goto fail_nomsg;
xa_destroy(&session->s_delegated_inos);
mutex_lock(&session->s_mutex);
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
......@@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.msg_version = 2;
}
/* trsaverse this session's caps */
err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state);
err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
session->s_cap_reconnect = 0;
......
......@@ -23,8 +23,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_RECLAIM_CLIENT,
CEPHFS_FEATURE_LAZY_CAP_WANTED,
CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_DELEG_INO,
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
};
/*
......@@ -37,6 +38,7 @@ enum ceph_feature_type {
CEPHFS_FEATURE_REPLY_ENCODING, \
CEPHFS_FEATURE_LAZY_CAP_WANTED, \
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
\
CEPHFS_FEATURE_MAX, \
}
......@@ -201,6 +203,7 @@ struct ceph_mds_session {
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
struct xarray s_delegated_inos;
};
/*
......@@ -255,6 +258,7 @@ struct ceph_mds_request {
#define CEPH_MDS_R_GOT_RESULT (5) /* got a result */
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
#define CEPH_MDS_R_ASYNC (8) /* async request */
unsigned long r_req_flags;
struct mutex r_fill_mutex;
......@@ -263,6 +267,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
int r_request_release_offset;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
......@@ -280,12 +285,16 @@ struct ceph_mds_request {
int r_old_inode_drop, r_old_inode_unless;
struct ceph_msg *r_request; /* original request */
int r_request_release_offset;
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
struct page *r_locked_page;
int r_err;
struct page *r_locked_page;
int r_dir_caps;
int r_num_caps;
u32 r_readdir_offset;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
unsigned long r_request_started; /* start time for mds request only,
......@@ -304,6 +313,7 @@ struct ceph_mds_request {
int r_num_fwd; /* number of forward attempts */
int r_resend_mds; /* mds to resend to next, if any*/
u32 r_sent_on_mseq; /* cap mseq request was sent at*/
u64 r_deleg_ino;
struct list_head r_wait;
struct completion r_completion;
......@@ -315,10 +325,8 @@ struct ceph_mds_request {
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation;
int r_num_caps;
};
struct ceph_pool_perm {
......@@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
......@@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
extern int ceph_trim_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
int max_caps);
static inline int ceph_wait_on_async_create(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT,
TASK_INTERRUPTIBLE);
}
extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session);
extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino);
#endif
......@@ -155,6 +155,7 @@ enum {
Opt_acl,
Opt_quotadf,
Opt_copyfrom,
Opt_wsync,
};
enum ceph_recover_session_mode {
......@@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
fsparam_u32 ("wsize", Opt_wsize),
fsparam_flag_no ("wsync", Opt_wsync),
{}
};
......@@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
fc->sb_flags &= ~SB_POSIXACL;
}
break;
case Opt_wsync:
if (!result.negated)
fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS;
else
fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS;
break;
default:
BUG();
}
......@@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
seq_show_option(m, "recover_session", "clean");
if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
seq_puts(m, ",nowsync");
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
......@@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
struct kmem_cache *ceph_dir_file_cachep;
struct kmem_cache *ceph_mds_request_cachep;
static void ceph_inode_init_once(void *foo)
{
......@@ -769,6 +781,10 @@ static int __init init_caches(void)
if (!ceph_dir_file_cachep)
goto bad_dir_file;
ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
if (!ceph_mds_request_cachep)
goto bad_mds_req;
error = ceph_fscache_register();
if (error)
goto bad_fscache;
......@@ -776,6 +792,8 @@ static int __init init_caches(void)
return 0;
bad_fscache:
kmem_cache_destroy(ceph_mds_request_cachep);
bad_mds_req:
kmem_cache_destroy(ceph_dir_file_cachep);
bad_dir_file:
kmem_cache_destroy(ceph_file_cachep);
......@@ -804,6 +822,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
kmem_cache_destroy(ceph_dir_file_cachep);
kmem_cache_destroy(ceph_mds_request_cachep);
ceph_fscache_unregister();
}
......@@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc)
static int ceph_reconfigure_fc(struct fs_context *fc)
{
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
struct ceph_mount_options *fsopt = pctx->opts;
struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
ceph_set_mount_opt(fsc, ASYNC_DIROPS);
else
ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
sync_filesystem(fc->root->d_sb);
return 0;
}
......
......@@ -43,13 +43,16 @@
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
CEPH_MOUNT_OPT_NOCOPYFROM)
#define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt
#define ceph_clear_mount_opt(fsc, opt) \
(fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
......@@ -170,9 +173,9 @@ struct ceph_cap {
struct list_head caps_item;
};
#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */
#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */
#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */
struct ceph_cap_flush {
u64 tid;
......@@ -284,6 +287,7 @@ struct ceph_dentry_info {
#define CEPH_DENTRY_REFERENCED 1
#define CEPH_DENTRY_LEASE_LIST 2
#define CEPH_DENTRY_SHRINK_LIST 4
#define CEPH_DENTRY_PRIMARY_LINK 8
struct ceph_inode_xattrs_info {
/*
......@@ -315,13 +319,14 @@ struct ceph_inode_info {
u64 i_inline_version;
u32 i_time_warp_seq;
unsigned i_ceph_flags;
unsigned long i_ceph_flags;
atomic64_t i_release_count;
atomic64_t i_ordered_count;
atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
struct ceph_file_layout i_cached_layout; // for async creates
char *i_symlink;
/* for dirs */
......@@ -352,7 +357,6 @@ struct ceph_inode_info {
struct ceph_cap_flush *i_prealloc_cap_flush;
struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
struct ceph_cap_reservation i_cap_migration_resv;
......@@ -361,6 +365,8 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
unsigned long i_last_rd;
unsigned long i_last_wr;
int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
......@@ -375,7 +381,7 @@ struct ceph_inode_info {
/* held references to caps */
int i_pin_ref;
int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref;
int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
atomic_t i_filelock_ref;
atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */
......@@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
* Ceph inode.
*/
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */
#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */
#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */
#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */
#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */
#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
/*
* Masks of ceph inode work.
......@@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
/*
* wanted, by virtue of open file modes AND cap refs (buffered/cached data)
*/
static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
if (w & CEPH_CAP_FILE_BUFFER)
w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
return w;
return ci->i_nr_by_mode[0];
}
extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
extern int __ceph_caps_wanted(struct ceph_inode_info *ci);
/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check);
......@@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
}
/* inode.c */
struct ceph_mds_reply_info_in;
struct ceph_mds_reply_dirfrag;
extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
......@@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
u64 time_warp_seq, struct timespec64 *ctime,
struct timespec64 *mtime,
struct timespec64 *atime);
extern int ceph_fill_inode(struct inode *inode, struct page *locked_page,
struct ceph_mds_reply_info_in *iinfo,
struct ceph_mds_reply_dirfrag *dirinfo,
struct ceph_mds_session *session, int cap_fmode,
struct ceph_cap_reservation *caps_reservation);
extern int ceph_fill_trace(struct super_block *sb,
struct ceph_mds_request *req);
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
......@@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
struct ceph_cap_reservation *ctx);
extern void ceph_add_cap(struct inode *inode,
struct ceph_mds_session *session, u64 cap_id,
int fmode, unsigned issued, unsigned wanted,
unsigned issued, unsigned wanted,
unsigned cap, unsigned seq, u64 realmino, int flags,
struct ceph_cap **new_cap);
extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
......@@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
struct ceph_inode_info *ci);
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
int mds);
extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
bool snap_rwsem_locked);
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
......@@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode,
int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count);
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count);
extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc, int fmode);
/* addr.c */
extern const struct address_space_operations ceph_aops;
......@@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
/* file.c */
extern const struct file_operations ceph_file_fops;
extern int ceph_renew_caps(struct inode *inode);
extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode);
......
......@@ -446,6 +446,7 @@ union ceph_mds_request_args {
#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
#define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */
struct ceph_mds_request_head {
__le64 oldest_client_tid;
......@@ -530,6 +531,9 @@ struct ceph_mds_reply_lease {
__le32 seq;
} __attribute__ ((packed));
#define CEPH_LEASE_VALID (1 | 2) /* old and new bit values */
#define CEPH_LEASE_PRIMARY_LINK 4 /* primary linkage */
struct ceph_mds_reply_dirfrag {
__le32 frag; /* fragment */
__le32 auth; /* auth mds, if this is a delegation point */
......@@ -564,6 +568,7 @@ struct ceph_filelock {
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
#define CEPH_FILE_MODE_BITS 4
#define CEPH_FILE_MODE_MASK ((1 << CEPH_FILE_MODE_BITS) - 1)
int ceph_flags_to_mode(int flags);
......@@ -655,10 +660,19 @@ int ceph_flags_to_mode(int flags);
#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
CEPH_CAP_PIN)
#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
CEPH_LOCK_IXATTR)
/* cap masks async dir operations */
#define CEPH_CAP_DIR_CREATE CEPH_CAP_FILE_CACHE
#define CEPH_CAP_DIR_UNLINK CEPH_CAP_FILE_RD
#define CEPH_CAP_ANY_DIR_OPS (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
int ceph_caps_for_mode(int mode);
enum {
......
......@@ -2,22 +2,8 @@
#ifndef _FS_CEPH_DEBUGFS_H
#define _FS_CEPH_DEBUGFS_H
#include <linux/ceph/ceph_debug.h>
#include <linux/ceph/types.h>
#define CEPH_DEFINE_SHOW_FUNC(name) \
static int name##_open(struct inode *inode, struct file *file) \
{ \
return single_open(file, name, inode->i_private); \
} \
\
static const struct file_operations name##_fops = { \
.open = name##_open, \
.read = seq_read, \
.llseek = seq_lseek, \
.release = single_release, \
};
/* debugfs.c */
extern void ceph_debugfs_init(void);
extern void ceph_debugfs_cleanup(void);
......
......@@ -272,6 +272,7 @@ extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
extern struct kmem_cache *ceph_dir_file_cachep;
extern struct kmem_cache *ceph_mds_request_cachep;
/* ceph_common.c */
extern bool libceph_compatible(void *data);
......
......@@ -509,23 +509,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
struct page *req_page, size_t req_len,
struct page **resp_pages, size_t *resp_len);
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
u64 off, u64 *plen,
u32 truncate_seq, u64 truncate_size,
struct page **pages, int nr_pages,
int page_align);
extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *sc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec64 *mtime,
struct page **pages, int nr_pages);
int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
u64 src_snapid, u64 src_version,
struct ceph_object_id *src_oid,
......
......@@ -383,11 +383,11 @@ static int client_options_show(struct seq_file *s, void *p)
return 0;
}
CEPH_DEFINE_SHOW_FUNC(monmap_show)
CEPH_DEFINE_SHOW_FUNC(osdmap_show)
CEPH_DEFINE_SHOW_FUNC(monc_show)
CEPH_DEFINE_SHOW_FUNC(osdc_show)
CEPH_DEFINE_SHOW_FUNC(client_options_show)
DEFINE_SHOW_ATTRIBUTE(monmap);
DEFINE_SHOW_ATTRIBUTE(osdmap);
DEFINE_SHOW_ATTRIBUTE(monc);
DEFINE_SHOW_ATTRIBUTE(osdc);
DEFINE_SHOW_ATTRIBUTE(client_options);
void __init ceph_debugfs_init(void)
{
......@@ -414,31 +414,31 @@ void ceph_debugfs_client_init(struct ceph_client *client)
0400,
client->debugfs_dir,
client,
&monc_show_fops);
&monc_fops);
client->osdc.debugfs_file = debugfs_create_file("osdc",
0400,
client->debugfs_dir,
client,
&osdc_show_fops);
&osdc_fops);
client->debugfs_monmap = debugfs_create_file("monmap",
0400,
client->debugfs_dir,
client,
&monmap_show_fops);
&monmap_fops);
client->debugfs_osdmap = debugfs_create_file("osdmap",
0400,
client->debugfs_dir,
client,
&osdmap_show_fops);
&osdmap_fops);
client->debugfs_options = debugfs_create_file("client_options",
0400,
client->debugfs_dir,
client,
&client_options_show_fops);
&client_options_fops);
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
......
......@@ -467,7 +467,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
struct ceph_client *client = monc->client;
struct ceph_monmap *monmap = NULL, *old = monc->monmap;
struct ceph_monmap *monmap;
void *p, *end;
mutex_lock(&monc->mutex);
......@@ -484,13 +484,13 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
goto out;
}
if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
if (ceph_check_fsid(client, &monmap->fsid) < 0) {
kfree(monmap);
goto out;
}
client->monc.monmap = monmap;
kfree(old);
kfree(monc->monmap);
monc->monmap = monmap;
__ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
client->have_fsid = true;
......
......@@ -3483,9 +3483,6 @@ static int ceph_redirect_decode(void **p, void *end,
goto e_inval;
}
len = ceph_decode_32(p);
*p += len; /* skip osd_instructions */
/* skip the rest */
*p = struct_end;
out:
......@@ -5228,85 +5225,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}
/*
* Read some contiguous pages. If we cross a stripe boundary, shorten
* *plen. Return number of bytes read, or error.
*/
int ceph_osdc_readpages(struct ceph_osd_client *osdc,
struct ceph_vino vino, struct ceph_file_layout *layout,
u64 off, u64 *plen,
u32 truncate_seq, u64 truncate_size,
struct page **pages, int num_pages, int page_align)
{
struct ceph_osd_request *req;
int rc = 0;
dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
vino.snap, off, *plen);
req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, truncate_seq, truncate_size,
false);
if (IS_ERR(req))
return PTR_ERR(req);
/* it may be a short read due to an object boundary */
osd_req_op_extent_osd_data_pages(req, 0,
pages, *plen, page_align, false, false);
dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
off, *plen, *plen, page_align);
rc = ceph_osdc_start_request(osdc, req, false);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
ceph_osdc_put_request(req);
dout("readpages result %d\n", rc);
return rc;
}
EXPORT_SYMBOL(ceph_osdc_readpages);
/*
* do a synchronous write on N pages
*/
int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *snapc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec64 *mtime,
struct page **pages, int num_pages)
{
struct ceph_osd_request *req;
int rc = 0;
int page_align = off & ~PAGE_MASK;
req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
snapc, truncate_seq, truncate_size,
true);
if (IS_ERR(req))
return PTR_ERR(req);
/* it may be a short write due to an object boundary */
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
false, false);
dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
req->r_mtime = *mtime;
rc = ceph_osdc_start_request(osdc, req, true);
if (!rc)
rc = ceph_osdc_wait_request(osdc, req);
ceph_osdc_put_request(req);
if (rc == 0)
rc = len;
dout("writepages result %d\n", rc);
return rc;
}
EXPORT_SYMBOL(ceph_osdc_writepages);
static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
u64 src_snapid, u64 src_version,
struct ceph_object_id *src_oid,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment