Commit cc8362b1 authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph changes from Sage Weil:
 "Lots of stuff this time around:

   - lots of cleanup and refactoring in the libceph messenger code, and
     many hard to hit races and bugs closed as a result.
   - lots of cleanup and refactoring in the rbd code from Alex Elder,
     mostly in preparation for the layering functionality that will be
     coming in 3.7.
   - some misc rbd cleanups from Josh Durgin that are finally going
     upstream
   - support for CRUSH tunables (used by newer clusters to improve the
     data placement)
   - some cleanup in our use of d_parent that Al brought up a while back
   - a random collection of fixes across the tree

  There is another patch coming that fixes up our ->atomic_open()
  behavior, but I'm going to hammer on it a bit more before sending it."

Fix up conflicts due to commits that were already committed earlier in
drivers/block/rbd.c, net/ceph/{messenger.c, osd_client.c}

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (132 commits)
  rbd: create rbd_refresh_helper()
  rbd: return obj version in __rbd_refresh_header()
  rbd: fixes in rbd_header_from_disk()
  rbd: always pass ops array to rbd_req_sync_op()
  rbd: pass null version pointer in add_snap()
  rbd: make rbd_create_rw_ops() return a pointer
  rbd: have __rbd_add_snap_dev() return a pointer
  libceph: recheck con state after allocating incoming message
  libceph: change ceph_con_in_msg_alloc convention to be less weird
  libceph: avoid dropping con mutex before fault
  libceph: verify state after retaking con lock after dispatch
  libceph: revoke mon_client messages on session restart
  libceph: fix handling of immediate socket connect failure
  ceph: update MAINTAINERS file
  libceph: be less chatty about stray replies
  libceph: clear all flags on con_close
  libceph: clean up con flags
  libceph: replace connection state bits with states
  libceph: drop unnecessary CLOSED check in socket state change callback
  libceph: close socket directly from ceph_con_close()
  ...
parents 2e3ee613 1fe5e993
...@@ -35,8 +35,14 @@ name ...@@ -35,8 +35,14 @@ name
pool pool
The pool where this rbd image resides. The pool-name pair is unique The name of the storage pool where this rbd image resides.
per rados system. An rbd image name is unique within its pool.
pool_id
The unique identifier for the rbd image's pool. This is
a permanent attribute of the pool. A pool's id will never
change.
size size
......
...@@ -1789,15 +1789,16 @@ F: arch/powerpc/oprofile/*cell* ...@@ -1789,15 +1789,16 @@ F: arch/powerpc/oprofile/*cell*
F: arch/powerpc/platforms/cell/ F: arch/powerpc/platforms/cell/
CEPH DISTRIBUTED FILE SYSTEM CLIENT CEPH DISTRIBUTED FILE SYSTEM CLIENT
M: Sage Weil <sage@newdream.net> M: Sage Weil <sage@inktank.com>
L: ceph-devel@vger.kernel.org L: ceph-devel@vger.kernel.org
W: http://ceph.newdream.net/ W: http://ceph.com/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported S: Supported
F: Documentation/filesystems/ceph.txt F: Documentation/filesystems/ceph.txt
F: fs/ceph F: fs/ceph
F: net/ceph F: net/ceph
F: include/linux/ceph F: include/linux/ceph
F: include/linux/crush
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
L: linux-usb@vger.kernel.org L: linux-usb@vger.kernel.org
...@@ -5639,10 +5640,12 @@ S: Supported ...@@ -5639,10 +5640,12 @@ S: Supported
F: arch/hexagon/ F: arch/hexagon/
RADOS BLOCK DEVICE (RBD) RADOS BLOCK DEVICE (RBD)
F: include/linux/qnxtypes.h M: Yehuda Sadeh <yehuda@inktank.com>
M: Yehuda Sadeh <yehuda@hq.newdream.net> M: Sage Weil <sage@inktank.com>
M: Sage Weil <sage@newdream.net> M: Alex Elder <elder@inktank.com>
M: ceph-devel@vger.kernel.org M: ceph-devel@vger.kernel.org
W: http://ceph.com/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
S: Supported S: Supported
F: drivers/block/rbd.c F: drivers/block/rbd.c
F: drivers/block/rbd_types.h F: drivers/block/rbd_types.h
......
...@@ -55,8 +55,6 @@ ...@@ -55,8 +55,6 @@
#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN 64
#define RBD_MAX_SNAP_NAME_LEN 32 #define RBD_MAX_SNAP_NAME_LEN 32
#define RBD_MAX_OPT_LEN 1024 #define RBD_MAX_OPT_LEN 1024
...@@ -78,13 +76,12 @@ ...@@ -78,13 +76,12 @@
*/ */
struct rbd_image_header { struct rbd_image_header {
u64 image_size; u64 image_size;
char block_name[32]; char *object_prefix;
__u8 obj_order; __u8 obj_order;
__u8 crypt_type; __u8 crypt_type;
__u8 comp_type; __u8 comp_type;
struct ceph_snap_context *snapc; struct ceph_snap_context *snapc;
size_t snap_names_len; size_t snap_names_len;
u64 snap_seq;
u32 total_snaps; u32 total_snaps;
char *snap_names; char *snap_names;
...@@ -150,7 +147,7 @@ struct rbd_snap { ...@@ -150,7 +147,7 @@ struct rbd_snap {
* a single device * a single device
*/ */
struct rbd_device { struct rbd_device {
int id; /* blkdev unique id */ int dev_id; /* blkdev unique id */
int major; /* blkdev assigned major */ int major; /* blkdev assigned major */
struct gendisk *disk; /* blkdev's gendisk and rq */ struct gendisk *disk; /* blkdev's gendisk and rq */
...@@ -163,20 +160,24 @@ struct rbd_device { ...@@ -163,20 +160,24 @@ struct rbd_device {
spinlock_t lock; /* queue lock */ spinlock_t lock; /* queue lock */
struct rbd_image_header header; struct rbd_image_header header;
char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ char *image_name;
int obj_len; size_t image_name_len;
char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ char *header_name;
char pool_name[RBD_MAX_POOL_NAME_LEN]; char *pool_name;
int poolid; int pool_id;
struct ceph_osd_event *watch_event; struct ceph_osd_event *watch_event;
struct ceph_osd_request *watch_request; struct ceph_osd_request *watch_request;
/* protects updating the header */ /* protects updating the header */
struct rw_semaphore header_rwsem; struct rw_semaphore header_rwsem;
char snap_name[RBD_MAX_SNAP_NAME_LEN]; /* name of the snapshot this device reads from */
char *snap_name;
/* id of the snapshot this device reads from */
u64 snap_id; /* current snapshot id */ u64 snap_id; /* current snapshot id */
int read_only; /* whether the snap_id this device reads from still exists */
bool snap_exists;
int read_only;
struct list_head node; struct list_head node;
...@@ -201,8 +202,7 @@ static ssize_t rbd_snap_add(struct device *dev, ...@@ -201,8 +202,7 @@ static ssize_t rbd_snap_add(struct device *dev,
struct device_attribute *attr, struct device_attribute *attr,
const char *buf, const char *buf,
size_t count); size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, static void __rbd_remove_snap_dev(struct rbd_snap *snap);
struct rbd_snap *snap);
static ssize_t rbd_add(struct bus_type *bus, const char *buf, static ssize_t rbd_add(struct bus_type *bus, const char *buf,
size_t count); size_t count);
...@@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev) ...@@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev)
put_device(&rbd_dev->dev); put_device(&rbd_dev->dev);
} }
static int __rbd_refresh_header(struct rbd_device *rbd_dev); static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_open(struct block_device *bdev, fmode_t mode) static int rbd_open(struct block_device *bdev, fmode_t mode)
{ {
...@@ -273,9 +273,9 @@ static const struct block_device_operations rbd_bd_ops = { ...@@ -273,9 +273,9 @@ static const struct block_device_operations rbd_bd_ops = {
/* /*
* Initialize an rbd client instance. * Initialize an rbd client instance.
* We own *opt. * We own *ceph_opts.
*/ */
static struct rbd_client *rbd_client_create(struct ceph_options *opt, static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
struct rbd_options *rbd_opts) struct rbd_options *rbd_opts)
{ {
struct rbd_client *rbdc; struct rbd_client *rbdc;
...@@ -291,10 +291,10 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, ...@@ -291,10 +291,10 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
rbdc->client = ceph_create_client(opt, rbdc, 0, 0); rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
if (IS_ERR(rbdc->client)) if (IS_ERR(rbdc->client))
goto out_mutex; goto out_mutex;
opt = NULL; /* Now rbdc->client is responsible for opt */ ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
ret = ceph_open_session(rbdc->client); ret = ceph_open_session(rbdc->client);
if (ret < 0) if (ret < 0)
...@@ -317,23 +317,23 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, ...@@ -317,23 +317,23 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt,
mutex_unlock(&ctl_mutex); mutex_unlock(&ctl_mutex);
kfree(rbdc); kfree(rbdc);
out_opt: out_opt:
if (opt) if (ceph_opts)
ceph_destroy_options(opt); ceph_destroy_options(ceph_opts);
return ERR_PTR(ret); return ERR_PTR(ret);
} }
/* /*
* Find a ceph client with specific addr and configuration. * Find a ceph client with specific addr and configuration.
*/ */
static struct rbd_client *__rbd_client_find(struct ceph_options *opt) static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
{ {
struct rbd_client *client_node; struct rbd_client *client_node;
if (opt->flags & CEPH_OPT_NOSHARE) if (ceph_opts->flags & CEPH_OPT_NOSHARE)
return NULL; return NULL;
list_for_each_entry(client_node, &rbd_client_list, node) list_for_each_entry(client_node, &rbd_client_list, node)
if (ceph_compare_options(opt, client_node->client) == 0) if (!ceph_compare_options(ceph_opts, client_node->client))
return client_node; return client_node;
return NULL; return NULL;
} }
...@@ -349,7 +349,7 @@ enum { ...@@ -349,7 +349,7 @@ enum {
/* string args above */ /* string args above */
}; };
static match_table_t rbdopt_tokens = { static match_table_t rbd_opts_tokens = {
{Opt_notify_timeout, "notify_timeout=%d"}, {Opt_notify_timeout, "notify_timeout=%d"},
/* int args above */ /* int args above */
/* string args above */ /* string args above */
...@@ -358,11 +358,11 @@ static match_table_t rbdopt_tokens = { ...@@ -358,11 +358,11 @@ static match_table_t rbdopt_tokens = {
static int parse_rbd_opts_token(char *c, void *private) static int parse_rbd_opts_token(char *c, void *private)
{ {
struct rbd_options *rbdopt = private; struct rbd_options *rbd_opts = private;
substring_t argstr[MAX_OPT_ARGS]; substring_t argstr[MAX_OPT_ARGS];
int token, intval, ret; int token, intval, ret;
token = match_token(c, rbdopt_tokens, argstr); token = match_token(c, rbd_opts_tokens, argstr);
if (token < 0) if (token < 0)
return -EINVAL; return -EINVAL;
...@@ -383,7 +383,7 @@ static int parse_rbd_opts_token(char *c, void *private) ...@@ -383,7 +383,7 @@ static int parse_rbd_opts_token(char *c, void *private)
switch (token) { switch (token) {
case Opt_notify_timeout: case Opt_notify_timeout:
rbdopt->notify_timeout = intval; rbd_opts->notify_timeout = intval;
break; break;
default: default:
BUG_ON(token); BUG_ON(token);
...@@ -400,7 +400,7 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, ...@@ -400,7 +400,7 @@ static struct rbd_client *rbd_get_client(const char *mon_addr,
char *options) char *options)
{ {
struct rbd_client *rbdc; struct rbd_client *rbdc;
struct ceph_options *opt; struct ceph_options *ceph_opts;
struct rbd_options *rbd_opts; struct rbd_options *rbd_opts;
rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
...@@ -409,29 +409,29 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, ...@@ -409,29 +409,29 @@ static struct rbd_client *rbd_get_client(const char *mon_addr,
rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
opt = ceph_parse_options(options, mon_addr, ceph_opts = ceph_parse_options(options, mon_addr,
mon_addr + mon_addr_len, mon_addr + mon_addr_len,
parse_rbd_opts_token, rbd_opts); parse_rbd_opts_token, rbd_opts);
if (IS_ERR(opt)) { if (IS_ERR(ceph_opts)) {
kfree(rbd_opts); kfree(rbd_opts);
return ERR_CAST(opt); return ERR_CAST(ceph_opts);
} }
spin_lock(&rbd_client_list_lock); spin_lock(&rbd_client_list_lock);
rbdc = __rbd_client_find(opt); rbdc = __rbd_client_find(ceph_opts);
if (rbdc) { if (rbdc) {
/* using an existing client */ /* using an existing client */
kref_get(&rbdc->kref); kref_get(&rbdc->kref);
spin_unlock(&rbd_client_list_lock); spin_unlock(&rbd_client_list_lock);
ceph_destroy_options(opt); ceph_destroy_options(ceph_opts);
kfree(rbd_opts); kfree(rbd_opts);
return rbdc; return rbdc;
} }
spin_unlock(&rbd_client_list_lock); spin_unlock(&rbd_client_list_lock);
rbdc = rbd_client_create(opt, rbd_opts); rbdc = rbd_client_create(ceph_opts, rbd_opts);
if (IS_ERR(rbdc)) if (IS_ERR(rbdc))
kfree(rbd_opts); kfree(rbd_opts);
...@@ -480,46 +480,60 @@ static void rbd_coll_release(struct kref *kref) ...@@ -480,46 +480,60 @@ static void rbd_coll_release(struct kref *kref)
kfree(coll); kfree(coll);
} }
/*
 * Report whether an on-disk rbd image header carries the expected
 * signature: returns true when the header's text field compares equal
 * to RBD_HEADER_TEXT over sizeof (RBD_HEADER_TEXT) bytes, false
 * otherwise.
 *
 * NOTE(review): RBD_HEADER_TEXT is defined outside this view; if it is
 * a string literal, the sizeof-based length also covers its terminating
 * NUL -- confirm against the on-disk format definition.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
return !memcmp(&ondisk->text,
RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
}
/* /*
* Create a new header structure, translate header format from the on-disk * Create a new header structure, translate header format from the on-disk
* header. * header.
*/ */
static int rbd_header_from_disk(struct rbd_image_header *header, static int rbd_header_from_disk(struct rbd_image_header *header,
struct rbd_image_header_ondisk *ondisk, struct rbd_image_header_ondisk *ondisk,
u32 allocated_snaps, u32 allocated_snaps)
gfp_t gfp_flags)
{ {
u32 i, snap_count; u32 snap_count;
if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) if (!rbd_dev_ondisk_valid(ondisk))
return -ENXIO; return -ENXIO;
snap_count = le32_to_cpu(ondisk->snap_count); snap_count = le32_to_cpu(ondisk->snap_count);
if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context)) if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
/ sizeof (*ondisk)) / sizeof (u64))
return -EINVAL; return -EINVAL;
header->snapc = kmalloc(sizeof(struct ceph_snap_context) + header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
snap_count * sizeof(u64), snap_count * sizeof(u64),
gfp_flags); GFP_KERNEL);
if (!header->snapc) if (!header->snapc)
return -ENOMEM; return -ENOMEM;
header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
if (snap_count) { if (snap_count) {
header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
header->snap_names = kmalloc(header->snap_names_len, header->snap_names = kmalloc(header->snap_names_len,
gfp_flags); GFP_KERNEL);
if (!header->snap_names) if (!header->snap_names)
goto err_snapc; goto err_snapc;
header->snap_sizes = kmalloc(snap_count * sizeof(u64), header->snap_sizes = kmalloc(snap_count * sizeof(u64),
gfp_flags); GFP_KERNEL);
if (!header->snap_sizes) if (!header->snap_sizes)
goto err_names; goto err_names;
} else { } else {
WARN_ON(ondisk->snap_names_len);
header->snap_names_len = 0;
header->snap_names = NULL; header->snap_names = NULL;
header->snap_sizes = NULL; header->snap_sizes = NULL;
} }
memcpy(header->block_name, ondisk->block_name,
header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
GFP_KERNEL);
if (!header->object_prefix)
goto err_sizes;
memcpy(header->object_prefix, ondisk->block_name,
sizeof(ondisk->block_name)); sizeof(ondisk->block_name));
header->object_prefix[sizeof (ondisk->block_name)] = '\0';
header->image_size = le64_to_cpu(ondisk->image_size); header->image_size = le64_to_cpu(ondisk->image_size);
header->obj_order = ondisk->options.order; header->obj_order = ondisk->options.order;
...@@ -527,11 +541,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, ...@@ -527,11 +541,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
header->comp_type = ondisk->options.comp_type; header->comp_type = ondisk->options.comp_type;
atomic_set(&header->snapc->nref, 1); atomic_set(&header->snapc->nref, 1);
header->snap_seq = le64_to_cpu(ondisk->snap_seq); header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
header->snapc->num_snaps = snap_count; header->snapc->num_snaps = snap_count;
header->total_snaps = snap_count; header->total_snaps = snap_count;
if (snap_count && allocated_snaps == snap_count) { if (snap_count && allocated_snaps == snap_count) {
int i;
for (i = 0; i < snap_count; i++) { for (i = 0; i < snap_count; i++) {
header->snapc->snaps[i] = header->snapc->snaps[i] =
le64_to_cpu(ondisk->snaps[i].id); le64_to_cpu(ondisk->snaps[i].id);
...@@ -540,16 +556,22 @@ static int rbd_header_from_disk(struct rbd_image_header *header, ...@@ -540,16 +556,22 @@ static int rbd_header_from_disk(struct rbd_image_header *header,
} }
/* copy snapshot names */ /* copy snapshot names */
memcpy(header->snap_names, &ondisk->snaps[i], memcpy(header->snap_names, &ondisk->snaps[snap_count],
header->snap_names_len); header->snap_names_len);
} }
return 0; return 0;
err_sizes:
kfree(header->snap_sizes);
header->snap_sizes = NULL;
err_names: err_names:
kfree(header->snap_names); kfree(header->snap_names);
header->snap_names = NULL;
err_snapc: err_snapc:
kfree(header->snapc); kfree(header->snapc);
header->snapc = NULL;
return -ENOMEM; return -ENOMEM;
} }
...@@ -575,52 +597,50 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, ...@@ -575,52 +597,50 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
return -ENOENT; return -ENOENT;
} }
static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{ {
struct rbd_image_header *header = &dev->header; int ret;
struct ceph_snap_context *snapc = header->snapc;
int ret = -ENOENT;
BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
down_write(&dev->header_rwsem); down_write(&rbd_dev->header_rwsem);
if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME, if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
sizeof (RBD_SNAP_HEAD_NAME))) { sizeof (RBD_SNAP_HEAD_NAME))) {
if (header->total_snaps) rbd_dev->snap_id = CEPH_NOSNAP;
snapc->seq = header->snap_seq; rbd_dev->snap_exists = false;
else rbd_dev->read_only = 0;
snapc->seq = 0;
dev->snap_id = CEPH_NOSNAP;
dev->read_only = 0;
if (size) if (size)
*size = header->image_size; *size = rbd_dev->header.image_size;
} else { } else {
ret = snap_by_name(header, dev->snap_name, &snapc->seq, size); u64 snap_id = 0;
ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
&snap_id, size);
if (ret < 0) if (ret < 0)
goto done; goto done;
dev->snap_id = snapc->seq; rbd_dev->snap_id = snap_id;
dev->read_only = 1; rbd_dev->snap_exists = true;
rbd_dev->read_only = 1;
} }
ret = 0; ret = 0;
done: done:
up_write(&dev->header_rwsem); up_write(&rbd_dev->header_rwsem);
return ret; return ret;
} }
static void rbd_header_free(struct rbd_image_header *header) static void rbd_header_free(struct rbd_image_header *header)
{ {
kfree(header->snapc); kfree(header->object_prefix);
kfree(header->snap_names);
kfree(header->snap_sizes); kfree(header->snap_sizes);
kfree(header->snap_names);
ceph_put_snap_context(header->snapc);
} }
/* /*
* get the actual striped segment name, offset and length * get the actual striped segment name, offset and length
*/ */
static u64 rbd_get_segment(struct rbd_image_header *header, static u64 rbd_get_segment(struct rbd_image_header *header,
const char *block_name, const char *object_prefix,
u64 ofs, u64 len, u64 ofs, u64 len,
char *seg_name, u64 *segofs) char *seg_name, u64 *segofs)
{ {
...@@ -628,7 +648,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header, ...@@ -628,7 +648,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header,
if (seg_name) if (seg_name)
snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
"%s.%012llx", block_name, seg); "%s.%012llx", object_prefix, seg);
ofs = ofs & ((1 << header->obj_order) - 1); ofs = ofs & ((1 << header->obj_order) - 1);
len = min_t(u64, len, (1 << header->obj_order) - ofs); len = min_t(u64, len, (1 << header->obj_order) - ofs);
...@@ -726,9 +746,8 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, ...@@ -726,9 +746,8 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
* split_bio will BUG_ON if this is not the case * split_bio will BUG_ON if this is not the case
*/ */
dout("bio_chain_clone split! total=%d remaining=%d" dout("bio_chain_clone split! total=%d remaining=%d"
"bi_size=%d\n", "bi_size=%u\n",
(int)total, (int)len-total, total, len - total, old_chain->bi_size);
(int)old_chain->bi_size);
/* split the bio. We'll release it either in the next /* split the bio. We'll release it either in the next
call, or it will have to be released outside */ call, or it will have to be released outside */
...@@ -777,22 +796,24 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, ...@@ -777,22 +796,24 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
/* /*
* helpers for osd request op vectors. * helpers for osd request op vectors.
*/ */
static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
int num_ops, int opcode, u32 payload_len)
int opcode, {
u32 payload_len) struct ceph_osd_req_op *ops;
{
*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
GFP_NOIO); if (!ops)
if (!*ops) return NULL;
return -ENOMEM;
(*ops)[0].op = opcode; ops[0].op = opcode;
/* /*
* op extent offset and length will be set later on * op extent offset and length will be set later on
* in calc_raw_layout() * in calc_raw_layout()
*/ */
(*ops)[0].payload_len = payload_len; ops[0].payload_len = payload_len;
return 0;
return ops;
} }
static void rbd_destroy_ops(struct ceph_osd_req_op *ops) static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
...@@ -808,8 +829,8 @@ static void rbd_coll_end_req_index(struct request *rq, ...@@ -808,8 +829,8 @@ static void rbd_coll_end_req_index(struct request *rq,
struct request_queue *q; struct request_queue *q;
int min, max, i; int min, max, i;
dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
coll, index, ret, len); coll, index, ret, (unsigned long long) len);
if (!rq) if (!rq)
return; return;
...@@ -848,16 +869,15 @@ static void rbd_coll_end_req(struct rbd_request *req, ...@@ -848,16 +869,15 @@ static void rbd_coll_end_req(struct rbd_request *req,
* Send ceph osd request * Send ceph osd request
*/ */
static int rbd_do_request(struct request *rq, static int rbd_do_request(struct request *rq,
struct rbd_device *dev, struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc, struct ceph_snap_context *snapc,
u64 snapid, u64 snapid,
const char *obj, u64 ofs, u64 len, const char *object_name, u64 ofs, u64 len,
struct bio *bio, struct bio *bio,
struct page **pages, struct page **pages,
int num_pages, int num_pages,
int flags, int flags,
struct ceph_osd_req_op *ops, struct ceph_osd_req_op *ops,
int num_reply,
struct rbd_req_coll *coll, struct rbd_req_coll *coll,
int coll_index, int coll_index,
void (*rbd_cb)(struct ceph_osd_request *req, void (*rbd_cb)(struct ceph_osd_request *req,
...@@ -887,15 +907,13 @@ static int rbd_do_request(struct request *rq, ...@@ -887,15 +907,13 @@ static int rbd_do_request(struct request *rq,
req_data->coll_index = coll_index; req_data->coll_index = coll_index;
} }
dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
(unsigned long long) ofs, (unsigned long long) len);
down_read(&dev->header_rwsem);
osdc = &dev->rbd_client->client->osdc; osdc = &rbd_dev->rbd_client->client->osdc;
req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
false, GFP_NOIO, pages, bio); false, GFP_NOIO, pages, bio);
if (!req) { if (!req) {
up_read(&dev->header_rwsem);
ret = -ENOMEM; ret = -ENOMEM;
goto done_pages; goto done_pages;
} }
...@@ -912,7 +930,7 @@ static int rbd_do_request(struct request *rq, ...@@ -912,7 +930,7 @@ static int rbd_do_request(struct request *rq,
reqhead = req->r_request->front.iov_base; reqhead = req->r_request->front.iov_base;
reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
strncpy(req->r_oid, obj, sizeof(req->r_oid)); strncpy(req->r_oid, object_name, sizeof(req->r_oid));
req->r_oid_len = strlen(req->r_oid); req->r_oid_len = strlen(req->r_oid);
layout = &req->r_file_layout; layout = &req->r_file_layout;
...@@ -920,7 +938,7 @@ static int rbd_do_request(struct request *rq, ...@@ -920,7 +938,7 @@ static int rbd_do_request(struct request *rq,
layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
layout->fl_stripe_count = cpu_to_le32(1); layout->fl_stripe_count = cpu_to_le32(1);
layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
layout->fl_pg_pool = cpu_to_le32(dev->poolid); layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
req, ops); req, ops);
...@@ -929,7 +947,6 @@ static int rbd_do_request(struct request *rq, ...@@ -929,7 +947,6 @@ static int rbd_do_request(struct request *rq,
snapc, snapc,
&mtime, &mtime,
req->r_oid, req->r_oid_len); req->r_oid, req->r_oid_len);
up_read(&dev->header_rwsem);
if (linger_req) { if (linger_req) {
ceph_osdc_set_request_linger(osdc, req); ceph_osdc_set_request_linger(osdc, req);
...@@ -944,8 +961,9 @@ static int rbd_do_request(struct request *rq, ...@@ -944,8 +961,9 @@ static int rbd_do_request(struct request *rq,
ret = ceph_osdc_wait_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req);
if (ver) if (ver)
*ver = le64_to_cpu(req->r_reassert_version.version); *ver = le64_to_cpu(req->r_reassert_version.version);
dout("reassert_ver=%lld\n", dout("reassert_ver=%llu\n",
le64_to_cpu(req->r_reassert_version.version)); (unsigned long long)
le64_to_cpu(req->r_reassert_version.version));
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
} }
return ret; return ret;
...@@ -979,7 +997,8 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) ...@@ -979,7 +997,8 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
bytes = le64_to_cpu(op->extent.length); bytes = le64_to_cpu(op->extent.length);
read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
(unsigned long long) bytes, read_op, (int) rc);
if (rc == -ENOENT && read_op) { if (rc == -ENOENT && read_op) {
zero_bio_chain(req_data->bio, 0); zero_bio_chain(req_data->bio, 0);
...@@ -1006,14 +1025,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg ...@@ -1006,14 +1025,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg
/* /*
* Do a synchronous ceph osd operation * Do a synchronous ceph osd operation
*/ */
static int rbd_req_sync_op(struct rbd_device *dev, static int rbd_req_sync_op(struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc, struct ceph_snap_context *snapc,
u64 snapid, u64 snapid,
int opcode,
int flags, int flags,
struct ceph_osd_req_op *orig_ops, struct ceph_osd_req_op *ops,
int num_reply, const char *object_name,
const char *obj,
u64 ofs, u64 len, u64 ofs, u64 len,
char *buf, char *buf,
struct ceph_osd_request **linger_req, struct ceph_osd_request **linger_req,
...@@ -1022,45 +1039,28 @@ static int rbd_req_sync_op(struct rbd_device *dev, ...@@ -1022,45 +1039,28 @@ static int rbd_req_sync_op(struct rbd_device *dev,
int ret; int ret;
struct page **pages; struct page **pages;
int num_pages; int num_pages;
struct ceph_osd_req_op *ops = orig_ops;
u32 payload_len; BUG_ON(ops == NULL);
num_pages = calc_pages_for(ofs , len); num_pages = calc_pages_for(ofs , len);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) if (IS_ERR(pages))
return PTR_ERR(pages); return PTR_ERR(pages);
if (!orig_ops) { ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); object_name, ofs, len, NULL,
ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
if (ret < 0)
goto done;
if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
if (ret < 0)
goto done_ops;
}
}
ret = rbd_do_request(NULL, dev, snapc, snapid,
obj, ofs, len, NULL,
pages, num_pages, pages, num_pages,
flags, flags,
ops, ops,
2,
NULL, 0, NULL, 0,
NULL, NULL,
linger_req, ver); linger_req, ver);
if (ret < 0) if (ret < 0)
goto done_ops; goto done;
if ((flags & CEPH_OSD_FLAG_READ) && buf) if ((flags & CEPH_OSD_FLAG_READ) && buf)
ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
done_ops:
if (!orig_ops)
rbd_destroy_ops(ops);
done: done:
ceph_release_page_vector(pages, num_pages); ceph_release_page_vector(pages, num_pages);
return ret; return ret;
...@@ -1070,10 +1070,10 @@ static int rbd_req_sync_op(struct rbd_device *dev, ...@@ -1070,10 +1070,10 @@ static int rbd_req_sync_op(struct rbd_device *dev,
* Do an asynchronous ceph osd operation * Do an asynchronous ceph osd operation
*/ */
static int rbd_do_op(struct request *rq, static int rbd_do_op(struct request *rq,
struct rbd_device *rbd_dev , struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc, struct ceph_snap_context *snapc,
u64 snapid, u64 snapid,
int opcode, int flags, int num_reply, int opcode, int flags,
u64 ofs, u64 len, u64 ofs, u64 len,
struct bio *bio, struct bio *bio,
struct rbd_req_coll *coll, struct rbd_req_coll *coll,
...@@ -1091,14 +1091,15 @@ static int rbd_do_op(struct request *rq, ...@@ -1091,14 +1091,15 @@ static int rbd_do_op(struct request *rq,
return -ENOMEM; return -ENOMEM;
seg_len = rbd_get_segment(&rbd_dev->header, seg_len = rbd_get_segment(&rbd_dev->header,
rbd_dev->header.block_name, rbd_dev->header.object_prefix,
ofs, len, ofs, len,
seg_name, &seg_ofs); seg_name, &seg_ofs);
payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); ret = -ENOMEM;
if (ret < 0) ops = rbd_create_rw_ops(1, opcode, payload_len);
if (!ops)
goto done; goto done;
/* we've taken care of segment sizes earlier when we /* we've taken care of segment sizes earlier when we
...@@ -1112,7 +1113,6 @@ static int rbd_do_op(struct request *rq, ...@@ -1112,7 +1113,6 @@ static int rbd_do_op(struct request *rq,
NULL, 0, NULL, 0,
flags, flags,
ops, ops,
num_reply,
coll, coll_index, coll, coll_index,
rbd_req_cb, 0, NULL); rbd_req_cb, 0, NULL);
...@@ -1136,7 +1136,6 @@ static int rbd_req_write(struct request *rq, ...@@ -1136,7 +1136,6 @@ static int rbd_req_write(struct request *rq,
return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
CEPH_OSD_OP_WRITE, CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
2,
ofs, len, bio, coll, coll_index); ofs, len, bio, coll, coll_index);
} }
...@@ -1155,55 +1154,58 @@ static int rbd_req_read(struct request *rq, ...@@ -1155,55 +1154,58 @@ static int rbd_req_read(struct request *rq,
snapid, snapid,
CEPH_OSD_OP_READ, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ, CEPH_OSD_FLAG_READ,
2,
ofs, len, bio, coll, coll_index); ofs, len, bio, coll, coll_index);
} }
/* /*
* Request sync osd read * Request sync osd read
*/ */
static int rbd_req_sync_read(struct rbd_device *dev, static int rbd_req_sync_read(struct rbd_device *rbd_dev,
struct ceph_snap_context *snapc,
u64 snapid, u64 snapid,
const char *obj, const char *object_name,
u64 ofs, u64 len, u64 ofs, u64 len,
char *buf, char *buf,
u64 *ver) u64 *ver)
{ {
return rbd_req_sync_op(dev, NULL, struct ceph_osd_req_op *ops;
int ret;
ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
if (!ops)
return -ENOMEM;
ret = rbd_req_sync_op(rbd_dev, NULL,
snapid, snapid,
CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ, CEPH_OSD_FLAG_READ,
NULL, ops, object_name, ofs, len, buf, NULL, ver);
1, obj, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops);
return ret;
} }
/* /*
* Request sync osd watch * Request sync osd watch
*/ */
static int rbd_req_sync_notify_ack(struct rbd_device *dev, static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
u64 ver, u64 ver,
u64 notify_id, u64 notify_id)
const char *obj)
{ {
struct ceph_osd_req_op *ops; struct ceph_osd_req_op *ops;
struct page **pages = NULL;
int ret; int ret;
ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
if (ret < 0) if (!ops)
return ret; return -ENOMEM;
ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); ops[0].watch.ver = cpu_to_le64(ver);
ops[0].watch.cookie = notify_id; ops[0].watch.cookie = notify_id;
ops[0].watch.flag = 0; ops[0].watch.flag = 0;
ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP, ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
obj, 0, 0, NULL, rbd_dev->header_name, 0, 0, NULL,
pages, 0, NULL, 0,
CEPH_OSD_FLAG_READ, CEPH_OSD_FLAG_READ,
ops, ops,
1,
NULL, 0, NULL, 0,
rbd_simple_req_cb, 0, NULL); rbd_simple_req_cb, 0, NULL);
...@@ -1213,54 +1215,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, ...@@ -1213,54 +1215,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev,
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{ {
struct rbd_device *dev = (struct rbd_device *)data; struct rbd_device *rbd_dev = (struct rbd_device *)data;
u64 hver;
int rc; int rc;
if (!dev) if (!rbd_dev)
return; return;
dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
notify_id, (int)opcode); rbd_dev->header_name, (unsigned long long) notify_id,
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); (unsigned int) opcode);
rc = __rbd_refresh_header(dev); rc = rbd_refresh_header(rbd_dev, &hver);
mutex_unlock(&ctl_mutex);
if (rc) if (rc)
pr_warning(RBD_DRV_NAME "%d got notification but failed to " pr_warning(RBD_DRV_NAME "%d got notification but failed to "
" update snaps: %d\n", dev->major, rc); " update snaps: %d\n", rbd_dev->major, rc);
rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
} }
/* /*
* Request sync osd watch * Request sync osd watch
*/ */
static int rbd_req_sync_watch(struct rbd_device *dev, static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
const char *obj,
u64 ver)
{ {
struct ceph_osd_req_op *ops; struct ceph_osd_req_op *ops;
struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
int ret;
int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
if (ret < 0) if (!ops)
return ret; return -ENOMEM;
ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
(void *)dev, &dev->watch_event); (void *)rbd_dev, &rbd_dev->watch_event);
if (ret < 0) if (ret < 0)
goto fail; goto fail;
ops[0].watch.ver = cpu_to_le64(ver); ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
ops[0].watch.flag = 1; ops[0].watch.flag = 1;
ret = rbd_req_sync_op(dev, NULL, ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP, CEPH_NOSNAP,
0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops, ops,
1, obj, 0, 0, NULL, rbd_dev->header_name,
&dev->watch_request, NULL); 0, 0, NULL,
&rbd_dev->watch_request, NULL);
if (ret < 0) if (ret < 0)
goto fail_event; goto fail_event;
...@@ -1269,8 +1270,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev, ...@@ -1269,8 +1270,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
return 0; return 0;
fail_event: fail_event:
ceph_osdc_cancel_event(dev->watch_event); ceph_osdc_cancel_event(rbd_dev->watch_event);
dev->watch_event = NULL; rbd_dev->watch_event = NULL;
fail: fail:
rbd_destroy_ops(ops); rbd_destroy_ops(ops);
return ret; return ret;
...@@ -1279,64 +1280,65 @@ static int rbd_req_sync_watch(struct rbd_device *dev, ...@@ -1279,64 +1280,65 @@ static int rbd_req_sync_watch(struct rbd_device *dev,
/* /*
* Request sync osd unwatch * Request sync osd unwatch
*/ */
static int rbd_req_sync_unwatch(struct rbd_device *dev, static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
const char *obj)
{ {
struct ceph_osd_req_op *ops; struct ceph_osd_req_op *ops;
int ret;
int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
if (ret < 0) if (!ops)
return ret; return -ENOMEM;
ops[0].watch.ver = 0; ops[0].watch.ver = 0;
ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
ops[0].watch.flag = 0; ops[0].watch.flag = 0;
ret = rbd_req_sync_op(dev, NULL, ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP, CEPH_NOSNAP,
0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops, ops,
1, obj, 0, 0, NULL, NULL, NULL); rbd_dev->header_name,
0, 0, NULL, NULL, NULL);
rbd_destroy_ops(ops); rbd_destroy_ops(ops);
ceph_osdc_cancel_event(dev->watch_event); ceph_osdc_cancel_event(rbd_dev->watch_event);
dev->watch_event = NULL; rbd_dev->watch_event = NULL;
return ret; return ret;
} }
struct rbd_notify_info { struct rbd_notify_info {
struct rbd_device *dev; struct rbd_device *rbd_dev;
}; };
static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{ {
struct rbd_device *dev = (struct rbd_device *)data; struct rbd_device *rbd_dev = (struct rbd_device *)data;
if (!dev) if (!rbd_dev)
return; return;
dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
notify_id, (int)opcode); rbd_dev->header_name, (unsigned long long) notify_id,
(unsigned int) opcode);
} }
/* /*
* Request sync osd notify * Request sync osd notify
*/ */
static int rbd_req_sync_notify(struct rbd_device *dev, static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
const char *obj)
{ {
struct ceph_osd_req_op *ops; struct ceph_osd_req_op *ops;
struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
struct ceph_osd_event *event; struct ceph_osd_event *event;
struct rbd_notify_info info; struct rbd_notify_info info;
int payload_len = sizeof(u32) + sizeof(u32); int payload_len = sizeof(u32) + sizeof(u32);
int ret; int ret;
ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
if (ret < 0) if (!ops)
return ret; return -ENOMEM;
info.dev = dev; info.rbd_dev = rbd_dev;
ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
(void *)&info, &event); (void *)&info, &event);
...@@ -1349,12 +1351,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev, ...@@ -1349,12 +1351,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
ops[0].watch.prot_ver = RADOS_NOTIFY_VER; ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
ops[0].watch.timeout = 12; ops[0].watch.timeout = 12;
ret = rbd_req_sync_op(dev, NULL, ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP, CEPH_NOSNAP,
0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops, ops,
1, obj, 0, 0, NULL, NULL, NULL); rbd_dev->header_name,
0, 0, NULL, NULL, NULL);
if (ret < 0) if (ret < 0)
goto fail_event; goto fail_event;
...@@ -1373,36 +1375,37 @@ static int rbd_req_sync_notify(struct rbd_device *dev, ...@@ -1373,36 +1375,37 @@ static int rbd_req_sync_notify(struct rbd_device *dev,
/* /*
* Request sync osd read * Request sync osd read
*/ */
static int rbd_req_sync_exec(struct rbd_device *dev, static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
const char *obj, const char *object_name,
const char *cls, const char *class_name,
const char *method, const char *method_name,
const char *data, const char *data,
int len, int len,
u64 *ver) u64 *ver)
{ {
struct ceph_osd_req_op *ops; struct ceph_osd_req_op *ops;
int cls_len = strlen(cls); int class_name_len = strlen(class_name);
int method_len = strlen(method); int method_name_len = strlen(method_name);
int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, int ret;
cls_len + method_len + len);
if (ret < 0)
return ret;
ops[0].cls.class_name = cls; ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
ops[0].cls.class_len = (__u8)cls_len; class_name_len + method_name_len + len);
ops[0].cls.method_name = method; if (!ops)
ops[0].cls.method_len = (__u8)method_len; return -ENOMEM;
ops[0].cls.class_name = class_name;
ops[0].cls.class_len = (__u8) class_name_len;
ops[0].cls.method_name = method_name;
ops[0].cls.method_len = (__u8) method_name_len;
ops[0].cls.argc = 0; ops[0].cls.argc = 0;
ops[0].cls.indata = data; ops[0].cls.indata = data;
ops[0].cls.indata_len = len; ops[0].cls.indata_len = len;
ret = rbd_req_sync_op(dev, NULL, ret = rbd_req_sync_op(rbd_dev, NULL,
CEPH_NOSNAP, CEPH_NOSNAP,
0,
CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
ops, ops,
1, obj, 0, 0, NULL, NULL, ver); object_name, 0, 0, NULL, NULL, ver);
rbd_destroy_ops(ops); rbd_destroy_ops(ops);
...@@ -1437,10 +1440,12 @@ static void rbd_rq_fn(struct request_queue *q) ...@@ -1437,10 +1440,12 @@ static void rbd_rq_fn(struct request_queue *q)
struct bio *bio; struct bio *bio;
struct bio *rq_bio, *next_bio = NULL; struct bio *rq_bio, *next_bio = NULL;
bool do_write; bool do_write;
int size, op_size = 0; unsigned int size;
u64 op_size = 0;
u64 ofs; u64 ofs;
int num_segs, cur_seg = 0; int num_segs, cur_seg = 0;
struct rbd_req_coll *coll; struct rbd_req_coll *coll;
struct ceph_snap_context *snapc;
/* peek at request from block layer */ /* peek at request from block layer */
if (!rq) if (!rq)
...@@ -1467,23 +1472,38 @@ static void rbd_rq_fn(struct request_queue *q) ...@@ -1467,23 +1472,38 @@ static void rbd_rq_fn(struct request_queue *q)
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
down_read(&rbd_dev->header_rwsem);
if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
up_read(&rbd_dev->header_rwsem);
dout("request for non-existent snapshot");
spin_lock_irq(q->queue_lock);
__blk_end_request_all(rq, -ENXIO);
continue;
}
snapc = ceph_get_snap_context(rbd_dev->header.snapc);
up_read(&rbd_dev->header_rwsem);
dout("%s 0x%x bytes at 0x%llx\n", dout("%s 0x%x bytes at 0x%llx\n",
do_write ? "write" : "read", do_write ? "write" : "read",
size, blk_rq_pos(rq) * SECTOR_SIZE); size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
coll = rbd_alloc_coll(num_segs); coll = rbd_alloc_coll(num_segs);
if (!coll) { if (!coll) {
spin_lock_irq(q->queue_lock); spin_lock_irq(q->queue_lock);
__blk_end_request_all(rq, -ENOMEM); __blk_end_request_all(rq, -ENOMEM);
ceph_put_snap_context(snapc);
continue; continue;
} }
do { do {
/* a bio clone to be passed down to OSD req */ /* a bio clone to be passed down to OSD req */
dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
op_size = rbd_get_segment(&rbd_dev->header, op_size = rbd_get_segment(&rbd_dev->header,
rbd_dev->header.block_name, rbd_dev->header.object_prefix,
ofs, size, ofs, size,
NULL, NULL); NULL, NULL);
kref_get(&coll->kref); kref_get(&coll->kref);
...@@ -1499,7 +1519,7 @@ static void rbd_rq_fn(struct request_queue *q) ...@@ -1499,7 +1519,7 @@ static void rbd_rq_fn(struct request_queue *q)
/* init OSD command: write or read */ /* init OSD command: write or read */
if (do_write) if (do_write)
rbd_req_write(rq, rbd_dev, rbd_req_write(rq, rbd_dev,
rbd_dev->header.snapc, snapc,
ofs, ofs,
op_size, bio, op_size, bio,
coll, cur_seg); coll, cur_seg);
...@@ -1522,6 +1542,8 @@ static void rbd_rq_fn(struct request_queue *q) ...@@ -1522,6 +1542,8 @@ static void rbd_rq_fn(struct request_queue *q)
if (bp) if (bp)
bio_pair_release(bp); bio_pair_release(bp);
spin_lock_irq(q->queue_lock); spin_lock_irq(q->queue_lock);
ceph_put_snap_context(snapc);
} }
} }
...@@ -1592,18 +1614,19 @@ static int rbd_read_header(struct rbd_device *rbd_dev, ...@@ -1592,18 +1614,19 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
return -ENOMEM; return -ENOMEM;
rc = rbd_req_sync_read(rbd_dev, rc = rbd_req_sync_read(rbd_dev,
NULL, CEPH_NOSNAP, CEPH_NOSNAP,
rbd_dev->obj_md_name, rbd_dev->header_name,
0, len, 0, len,
(char *)dh, &ver); (char *)dh, &ver);
if (rc < 0) if (rc < 0)
goto out_dh; goto out_dh;
rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); rc = rbd_header_from_disk(header, dh, snap_count);
if (rc < 0) { if (rc < 0) {
if (rc == -ENXIO) if (rc == -ENXIO)
pr_warning("unrecognized header format" pr_warning("unrecognized header format"
" for image %s", rbd_dev->obj); " for image %s\n",
rbd_dev->image_name);
goto out_dh; goto out_dh;
} }
...@@ -1628,7 +1651,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev, ...@@ -1628,7 +1651,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev,
/* /*
* create a snapshot * create a snapshot
*/ */
static int rbd_header_add_snap(struct rbd_device *dev, static int rbd_header_add_snap(struct rbd_device *rbd_dev,
const char *snap_name, const char *snap_name,
gfp_t gfp_flags) gfp_t gfp_flags)
{ {
...@@ -1636,16 +1659,15 @@ static int rbd_header_add_snap(struct rbd_device *dev, ...@@ -1636,16 +1659,15 @@ static int rbd_header_add_snap(struct rbd_device *dev,
u64 new_snapid; u64 new_snapid;
int ret; int ret;
void *data, *p, *e; void *data, *p, *e;
u64 ver;
struct ceph_mon_client *monc; struct ceph_mon_client *monc;
/* we should create a snapshot only if we're pointing at the head */ /* we should create a snapshot only if we're pointing at the head */
if (dev->snap_id != CEPH_NOSNAP) if (rbd_dev->snap_id != CEPH_NOSNAP)
return -EINVAL; return -EINVAL;
monc = &dev->rbd_client->client->monc; monc = &rbd_dev->rbd_client->client->monc;
ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid); ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
dout("created snapid=%lld\n", new_snapid); dout("created snapid=%llu\n", (unsigned long long) new_snapid);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -1659,19 +1681,13 @@ static int rbd_header_add_snap(struct rbd_device *dev, ...@@ -1659,19 +1681,13 @@ static int rbd_header_add_snap(struct rbd_device *dev,
ceph_encode_string_safe(&p, e, snap_name, name_len, bad); ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
ceph_encode_64_safe(&p, e, new_snapid, bad); ceph_encode_64_safe(&p, e, new_snapid, bad);
ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
data, p - data, &ver); "rbd", "snap_add",
data, p - data, NULL);
kfree(data); kfree(data);
if (ret < 0) return ret < 0 ? ret : 0;
return ret;
down_write(&dev->header_rwsem);
dev->header.snapc->seq = new_snapid;
up_write(&dev->header_rwsem);
return 0;
bad: bad:
return -ERANGE; return -ERANGE;
} }
...@@ -1679,52 +1695,52 @@ static int rbd_header_add_snap(struct rbd_device *dev, ...@@ -1679,52 +1695,52 @@ static int rbd_header_add_snap(struct rbd_device *dev,
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{ {
struct rbd_snap *snap; struct rbd_snap *snap;
struct rbd_snap *next;
while (!list_empty(&rbd_dev->snaps)) { list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node); __rbd_remove_snap_dev(snap);
__rbd_remove_snap_dev(rbd_dev, snap);
}
} }
/* /*
* only read the first part of the ondisk header, without the snaps info * only read the first part of the ondisk header, without the snaps info
*/ */
static int __rbd_refresh_header(struct rbd_device *rbd_dev) static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{ {
int ret; int ret;
struct rbd_image_header h; struct rbd_image_header h;
u64 snap_seq;
int follow_seq = 0;
ret = rbd_read_header(rbd_dev, &h); ret = rbd_read_header(rbd_dev, &h);
if (ret < 0) if (ret < 0)
return ret; return ret;
/* resized? */
set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
down_write(&rbd_dev->header_rwsem); down_write(&rbd_dev->header_rwsem);
snap_seq = rbd_dev->header.snapc->seq; /* resized? */
if (rbd_dev->header.total_snaps && if (rbd_dev->snap_id == CEPH_NOSNAP) {
rbd_dev->header.snapc->snaps[0] == snap_seq) sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
/* pointing at the head, will need to follow that
if head moves */
follow_seq = 1;
kfree(rbd_dev->header.snapc); dout("setting size to %llu sectors", (unsigned long long) size);
kfree(rbd_dev->header.snap_names); set_capacity(rbd_dev->disk, size);
}
/* rbd_dev->header.object_prefix shouldn't change */
kfree(rbd_dev->header.snap_sizes); kfree(rbd_dev->header.snap_sizes);
kfree(rbd_dev->header.snap_names);
/* osd requests may still refer to snapc */
ceph_put_snap_context(rbd_dev->header.snapc);
if (hver)
*hver = h.obj_version;
rbd_dev->header.obj_version = h.obj_version;
rbd_dev->header.image_size = h.image_size;
rbd_dev->header.total_snaps = h.total_snaps; rbd_dev->header.total_snaps = h.total_snaps;
rbd_dev->header.snapc = h.snapc; rbd_dev->header.snapc = h.snapc;
rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_names = h.snap_names;
rbd_dev->header.snap_names_len = h.snap_names_len; rbd_dev->header.snap_names_len = h.snap_names_len;
rbd_dev->header.snap_sizes = h.snap_sizes; rbd_dev->header.snap_sizes = h.snap_sizes;
if (follow_seq) /* Free the extra copy of the object prefix */
rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0]; WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
else kfree(h.object_prefix);
rbd_dev->header.snapc->seq = snap_seq;
ret = __rbd_init_snaps_header(rbd_dev); ret = __rbd_init_snaps_header(rbd_dev);
...@@ -1733,6 +1749,17 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev) ...@@ -1733,6 +1749,17 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev)
return ret; return ret;
} }
/*
 * Locked wrapper around __rbd_refresh_header(): holds ctl_mutex
 * (taken with the SINGLE_DEPTH_NESTING subclass) for the duration of
 * the refresh and passes hver straight through to the helper.
 *
 * Returns whatever __rbd_refresh_header() returned.
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int rc;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rc = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return rc;
}
static int rbd_init_disk(struct rbd_device *rbd_dev) static int rbd_init_disk(struct rbd_device *rbd_dev)
{ {
struct gendisk *disk; struct gendisk *disk;
...@@ -1762,7 +1789,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) ...@@ -1762,7 +1789,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
goto out; goto out;
snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
rbd_dev->id); rbd_dev->dev_id);
disk->major = rbd_dev->major; disk->major = rbd_dev->major;
disk->first_minor = 0; disk->first_minor = 0;
disk->fops = &rbd_bd_ops; disk->fops = &rbd_bd_ops;
...@@ -1819,8 +1846,13 @@ static ssize_t rbd_size_show(struct device *dev, ...@@ -1819,8 +1846,13 @@ static ssize_t rbd_size_show(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
sector_t size;
down_read(&rbd_dev->header_rwsem);
size = get_capacity(rbd_dev->disk);
up_read(&rbd_dev->header_rwsem);
return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
} }
static ssize_t rbd_major_show(struct device *dev, static ssize_t rbd_major_show(struct device *dev,
...@@ -1848,12 +1880,20 @@ static ssize_t rbd_pool_show(struct device *dev, ...@@ -1848,12 +1880,20 @@ static ssize_t rbd_pool_show(struct device *dev,
return sprintf(buf, "%s\n", rbd_dev->pool_name); return sprintf(buf, "%s\n", rbd_dev->pool_name);
} }
static ssize_t rbd_pool_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
static ssize_t rbd_name_show(struct device *dev, static ssize_t rbd_name_show(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
return sprintf(buf, "%s\n", rbd_dev->obj); return sprintf(buf, "%s\n", rbd_dev->image_name);
} }
static ssize_t rbd_snap_show(struct device *dev, static ssize_t rbd_snap_show(struct device *dev,
...@@ -1871,23 +1911,18 @@ static ssize_t rbd_image_refresh(struct device *dev, ...@@ -1871,23 +1911,18 @@ static ssize_t rbd_image_refresh(struct device *dev,
size_t size) size_t size)
{ {
struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
int rc; int ret;
int ret = size;
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
rc = __rbd_refresh_header(rbd_dev); ret = rbd_refresh_header(rbd_dev, NULL);
if (rc < 0)
ret = rc;
mutex_unlock(&ctl_mutex); return ret < 0 ? ret : size;
return ret;
} }
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
...@@ -1898,6 +1933,7 @@ static struct attribute *rbd_attrs[] = { ...@@ -1898,6 +1933,7 @@ static struct attribute *rbd_attrs[] = {
&dev_attr_major.attr, &dev_attr_major.attr,
&dev_attr_client_id.attr, &dev_attr_client_id.attr,
&dev_attr_pool.attr, &dev_attr_pool.attr,
&dev_attr_pool_id.attr,
&dev_attr_name.attr, &dev_attr_name.attr,
&dev_attr_current_snap.attr, &dev_attr_current_snap.attr,
&dev_attr_refresh.attr, &dev_attr_refresh.attr,
...@@ -1977,15 +2013,13 @@ static struct device_type rbd_snap_device_type = { ...@@ -1977,15 +2013,13 @@ static struct device_type rbd_snap_device_type = {
.release = rbd_snap_dev_release, .release = rbd_snap_dev_release,
}; };
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, static void __rbd_remove_snap_dev(struct rbd_snap *snap)
struct rbd_snap *snap)
{ {
list_del(&snap->node); list_del(&snap->node);
device_unregister(&snap->dev); device_unregister(&snap->dev);
} }
static int rbd_register_snap_dev(struct rbd_device *rbd_dev, static int rbd_register_snap_dev(struct rbd_snap *snap,
struct rbd_snap *snap,
struct device *parent) struct device *parent)
{ {
struct device *dev = &snap->dev; struct device *dev = &snap->dev;
...@@ -2000,29 +2034,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev, ...@@ -2000,29 +2034,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
return ret; return ret;
} }
static int __rbd_add_snap_dev(struct rbd_device *rbd_dev, static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
int i, const char *name, int i, const char *name)
struct rbd_snap **snapp)
{ {
struct rbd_snap *snap;
int ret; int ret;
struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
snap = kzalloc(sizeof (*snap), GFP_KERNEL);
if (!snap) if (!snap)
return -ENOMEM; return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
snap->name = kstrdup(name, GFP_KERNEL); snap->name = kstrdup(name, GFP_KERNEL);
if (!snap->name)
goto err;
snap->size = rbd_dev->header.snap_sizes[i]; snap->size = rbd_dev->header.snap_sizes[i];
snap->id = rbd_dev->header.snapc->snaps[i]; snap->id = rbd_dev->header.snapc->snaps[i];
if (device_is_registered(&rbd_dev->dev)) { if (device_is_registered(&rbd_dev->dev)) {
ret = rbd_register_snap_dev(rbd_dev, snap, ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
&rbd_dev->dev);
if (ret < 0) if (ret < 0)
goto err; goto err;
} }
*snapp = snap;
return 0; return snap;
err: err:
kfree(snap->name); kfree(snap->name);
kfree(snap); kfree(snap);
return ret;
return ERR_PTR(ret);
} }
/* /*
...@@ -2055,7 +2096,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) ...@@ -2055,7 +2096,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
const char *name, *first_name; const char *name, *first_name;
int i = rbd_dev->header.total_snaps; int i = rbd_dev->header.total_snaps;
struct rbd_snap *snap, *old_snap = NULL; struct rbd_snap *snap, *old_snap = NULL;
int ret;
struct list_head *p, *n; struct list_head *p, *n;
first_name = rbd_dev->header.snap_names; first_name = rbd_dev->header.snap_names;
...@@ -2070,8 +2110,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) ...@@ -2070,8 +2110,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
cur_id = rbd_dev->header.snapc->snaps[i - 1]; cur_id = rbd_dev->header.snapc->snaps[i - 1];
if (!i || old_snap->id < cur_id) { if (!i || old_snap->id < cur_id) {
/* old_snap->id was skipped, thus was removed */ /*
__rbd_remove_snap_dev(rbd_dev, old_snap); * old_snap->id was skipped, thus was
* removed. If this rbd_dev is mapped to
* the removed snapshot, record that it no
* longer exists, to prevent further I/O.
*/
if (rbd_dev->snap_id == old_snap->id)
rbd_dev->snap_exists = false;
__rbd_remove_snap_dev(old_snap);
continue; continue;
} }
if (old_snap->id == cur_id) { if (old_snap->id == cur_id) {
...@@ -2091,9 +2138,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) ...@@ -2091,9 +2138,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
if (cur_id >= old_snap->id) if (cur_id >= old_snap->id)
break; break;
/* a new snapshot */ /* a new snapshot */
ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
if (ret < 0) if (IS_ERR(snap))
return ret; return PTR_ERR(snap);
/* note that we add it backward so using n and not p */ /* note that we add it backward so using n and not p */
list_add(&snap->node, n); list_add(&snap->node, n);
...@@ -2107,9 +2154,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) ...@@ -2107,9 +2154,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
WARN_ON(1); WARN_ON(1);
return -EINVAL; return -EINVAL;
} }
ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
if (ret < 0) if (IS_ERR(snap))
return ret; return PTR_ERR(snap);
list_add(&snap->node, &rbd_dev->snaps); list_add(&snap->node, &rbd_dev->snaps);
} }
...@@ -2129,14 +2176,13 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) ...@@ -2129,14 +2176,13 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
dev->type = &rbd_device_type; dev->type = &rbd_device_type;
dev->parent = &rbd_root_dev; dev->parent = &rbd_root_dev;
dev->release = rbd_dev_release; dev->release = rbd_dev_release;
dev_set_name(dev, "%d", rbd_dev->id); dev_set_name(dev, "%d", rbd_dev->dev_id);
ret = device_register(dev); ret = device_register(dev);
if (ret < 0) if (ret < 0)
goto out; goto out;
list_for_each_entry(snap, &rbd_dev->snaps, node) { list_for_each_entry(snap, &rbd_dev->snaps, node) {
ret = rbd_register_snap_dev(rbd_dev, snap, ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
&rbd_dev->dev);
if (ret < 0) if (ret < 0)
break; break;
} }
...@@ -2155,12 +2201,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) ...@@ -2155,12 +2201,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
int ret, rc; int ret, rc;
do { do {
ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, ret = rbd_req_sync_watch(rbd_dev);
rbd_dev->header.obj_version);
if (ret == -ERANGE) { if (ret == -ERANGE) {
mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); rc = rbd_refresh_header(rbd_dev, NULL);
rc = __rbd_refresh_header(rbd_dev);
mutex_unlock(&ctl_mutex);
if (rc < 0) if (rc < 0)
return rc; return rc;
} }
...@@ -2177,7 +2220,7 @@ static atomic64_t rbd_id_max = ATOMIC64_INIT(0); ...@@ -2177,7 +2220,7 @@ static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
*/ */
static void rbd_id_get(struct rbd_device *rbd_dev) static void rbd_id_get(struct rbd_device *rbd_dev)
{ {
rbd_dev->id = atomic64_inc_return(&rbd_id_max); rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
spin_lock(&rbd_dev_list_lock); spin_lock(&rbd_dev_list_lock);
list_add_tail(&rbd_dev->node, &rbd_dev_list); list_add_tail(&rbd_dev->node, &rbd_dev_list);
...@@ -2191,7 +2234,7 @@ static void rbd_id_get(struct rbd_device *rbd_dev) ...@@ -2191,7 +2234,7 @@ static void rbd_id_get(struct rbd_device *rbd_dev)
static void rbd_id_put(struct rbd_device *rbd_dev) static void rbd_id_put(struct rbd_device *rbd_dev)
{ {
struct list_head *tmp; struct list_head *tmp;
int rbd_id = rbd_dev->id; int rbd_id = rbd_dev->dev_id;
int max_id; int max_id;
BUG_ON(rbd_id < 1); BUG_ON(rbd_id < 1);
...@@ -2282,19 +2325,58 @@ static inline size_t copy_token(const char **buf, ...@@ -2282,19 +2325,58 @@ static inline size_t copy_token(const char **buf,
} }
/* /*
* This fills in the pool_name, obj, obj_len, snap_name, obj_len, * Finds the next token in *buf, dynamically allocates a buffer big
* enough to hold a copy of it, and copies the token into the new
* buffer. The copy is guaranteed to be terminated with '\0'. Note
* that a duplicate buffer is created even for a zero-length token.
*
* Returns a pointer to the newly-allocated duplicate, or a null
* pointer if memory for the duplicate was not available. If
* the lenp argument is a non-null pointer, the length of the token
* (not including the '\0') is returned in *lenp.
*
* If successful, the *buf pointer will be updated to point beyond
* the end of the found token.
*
* Note: uses GFP_KERNEL for allocation.
*/
static inline char *dup_token(const char **buf, size_t *lenp)
{
	/*
	 * NOTE(review): next_token() is defined elsewhere; presumably it
	 * leaves *buf at the start of the token and returns its length.
	 */
	size_t token_len = next_token(buf);
	char *copy = kmalloc(token_len + 1, GFP_KERNEL);

	if (!copy)
		return NULL;

	/* Copy the token bytes and terminate the duplicate ourselves */
	memcpy(copy, *buf, token_len);
	copy[token_len] = '\0';

	/* Step the caller's cursor past the token just consumed */
	*buf += token_len;

	if (lenp)
		*lenp = token_len;

	return copy;
}
/*
* This fills in the pool_name, image_name, image_name_len, snap_name,
* rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
* on the list of monitor addresses and other options provided via * on the list of monitor addresses and other options provided via
* /sys/bus/rbd/add. * /sys/bus/rbd/add.
*
* Note: rbd_dev is assumed to have been initially zero-filled.
*/ */
static int rbd_add_parse_args(struct rbd_device *rbd_dev, static int rbd_add_parse_args(struct rbd_device *rbd_dev,
const char *buf, const char *buf,
const char **mon_addrs, const char **mon_addrs,
size_t *mon_addrs_size, size_t *mon_addrs_size,
char *options, char *options,
size_t options_size) size_t options_size)
{ {
size_t len; size_t len;
int ret;
/* The first four tokens are required */ /* The first four tokens are required */
...@@ -2310,56 +2392,74 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, ...@@ -2310,56 +2392,74 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev,
if (!len || len >= options_size) if (!len || len >= options_size)
return -EINVAL; return -EINVAL;
len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name)); ret = -ENOMEM;
if (!len || len >= sizeof (rbd_dev->pool_name)) rbd_dev->pool_name = dup_token(&buf, NULL);
return -EINVAL; if (!rbd_dev->pool_name)
goto out_err;
len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
if (!len || len >= sizeof (rbd_dev->obj))
return -EINVAL;
/* We have the object length in hand, save it. */ rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
if (!rbd_dev->image_name)
goto out_err;
rbd_dev->obj_len = len; /* Create the name of the header object */
BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
< RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX)); + sizeof (RBD_SUFFIX),
sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX); GFP_KERNEL);
if (!rbd_dev->header_name)
goto out_err;
sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
/* /*
* The snapshot name is optional, but it's an error if it's * The snapshot name is optional. If none is supplied,
* too long. If no snapshot is supplied, fill in the default. * we use the default value.
*/ */
len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name)); rbd_dev->snap_name = dup_token(&buf, &len);
if (!len) if (!rbd_dev->snap_name)
goto out_err;
if (!len) {
/* Replace the empty name with the default */
kfree(rbd_dev->snap_name);
rbd_dev->snap_name
= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
if (!rbd_dev->snap_name)
goto out_err;
memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
sizeof (RBD_SNAP_HEAD_NAME)); sizeof (RBD_SNAP_HEAD_NAME));
else if (len >= sizeof (rbd_dev->snap_name)) }
return -EINVAL;
return 0; return 0;
out_err:
kfree(rbd_dev->header_name);
kfree(rbd_dev->image_name);
kfree(rbd_dev->pool_name);
rbd_dev->pool_name = NULL;
return ret;
} }
static ssize_t rbd_add(struct bus_type *bus, static ssize_t rbd_add(struct bus_type *bus,
const char *buf, const char *buf,
size_t count) size_t count)
{ {
struct rbd_device *rbd_dev; char *options;
struct rbd_device *rbd_dev = NULL;
const char *mon_addrs = NULL; const char *mon_addrs = NULL;
size_t mon_addrs_size = 0; size_t mon_addrs_size = 0;
char *options = NULL;
struct ceph_osd_client *osdc; struct ceph_osd_client *osdc;
int rc = -ENOMEM; int rc = -ENOMEM;
if (!try_module_get(THIS_MODULE)) if (!try_module_get(THIS_MODULE))
return -ENODEV; return -ENODEV;
rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
if (!rbd_dev)
goto err_nomem;
options = kmalloc(count, GFP_KERNEL); options = kmalloc(count, GFP_KERNEL);
if (!options) if (!options)
goto err_nomem; goto err_nomem;
rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
if (!rbd_dev)
goto err_nomem;
/* static rbd_device initialization */ /* static rbd_device initialization */
spin_lock_init(&rbd_dev->lock); spin_lock_init(&rbd_dev->lock);
...@@ -2367,15 +2467,13 @@ static ssize_t rbd_add(struct bus_type *bus, ...@@ -2367,15 +2467,13 @@ static ssize_t rbd_add(struct bus_type *bus,
INIT_LIST_HEAD(&rbd_dev->snaps); INIT_LIST_HEAD(&rbd_dev->snaps);
init_rwsem(&rbd_dev->header_rwsem); init_rwsem(&rbd_dev->header_rwsem);
init_rwsem(&rbd_dev->header_rwsem);
/* generate unique id: find highest unique id, add one */ /* generate unique id: find highest unique id, add one */
rbd_id_get(rbd_dev); rbd_id_get(rbd_dev);
/* Fill in the device name, now that we have its id. */ /* Fill in the device name, now that we have its id. */
BUILD_BUG_ON(DEV_NAME_LEN BUILD_BUG_ON(DEV_NAME_LEN
< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
/* parse add command */ /* parse add command */
rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
...@@ -2395,7 +2493,7 @@ static ssize_t rbd_add(struct bus_type *bus, ...@@ -2395,7 +2493,7 @@ static ssize_t rbd_add(struct bus_type *bus,
rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
if (rc < 0) if (rc < 0)
goto err_out_client; goto err_out_client;
rbd_dev->poolid = rc; rbd_dev->pool_id = rc;
/* register our block device */ /* register our block device */
rc = register_blkdev(0, rbd_dev->name); rc = register_blkdev(0, rbd_dev->name);
...@@ -2435,10 +2533,16 @@ static ssize_t rbd_add(struct bus_type *bus, ...@@ -2435,10 +2533,16 @@ static ssize_t rbd_add(struct bus_type *bus,
err_out_client: err_out_client:
rbd_put_client(rbd_dev); rbd_put_client(rbd_dev);
err_put_id: err_put_id:
if (rbd_dev->pool_name) {
kfree(rbd_dev->snap_name);
kfree(rbd_dev->header_name);
kfree(rbd_dev->image_name);
kfree(rbd_dev->pool_name);
}
rbd_id_put(rbd_dev); rbd_id_put(rbd_dev);
err_nomem: err_nomem:
kfree(options);
kfree(rbd_dev); kfree(rbd_dev);
kfree(options);
dout("Error adding device %s\n", buf); dout("Error adding device %s\n", buf);
module_put(THIS_MODULE); module_put(THIS_MODULE);
...@@ -2446,7 +2550,7 @@ static ssize_t rbd_add(struct bus_type *bus, ...@@ -2446,7 +2550,7 @@ static ssize_t rbd_add(struct bus_type *bus,
return (ssize_t) rc; return (ssize_t) rc;
} }
static struct rbd_device *__rbd_get_dev(unsigned long id) static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{ {
struct list_head *tmp; struct list_head *tmp;
struct rbd_device *rbd_dev; struct rbd_device *rbd_dev;
...@@ -2454,7 +2558,7 @@ static struct rbd_device *__rbd_get_dev(unsigned long id) ...@@ -2454,7 +2558,7 @@ static struct rbd_device *__rbd_get_dev(unsigned long id)
spin_lock(&rbd_dev_list_lock); spin_lock(&rbd_dev_list_lock);
list_for_each(tmp, &rbd_dev_list) { list_for_each(tmp, &rbd_dev_list) {
rbd_dev = list_entry(tmp, struct rbd_device, node); rbd_dev = list_entry(tmp, struct rbd_device, node);
if (rbd_dev->id == id) { if (rbd_dev->dev_id == dev_id) {
spin_unlock(&rbd_dev_list_lock); spin_unlock(&rbd_dev_list_lock);
return rbd_dev; return rbd_dev;
} }
...@@ -2474,7 +2578,7 @@ static void rbd_dev_release(struct device *dev) ...@@ -2474,7 +2578,7 @@ static void rbd_dev_release(struct device *dev)
rbd_dev->watch_request); rbd_dev->watch_request);
} }
if (rbd_dev->watch_event) if (rbd_dev->watch_event)
rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); rbd_req_sync_unwatch(rbd_dev);
rbd_put_client(rbd_dev); rbd_put_client(rbd_dev);
...@@ -2483,6 +2587,10 @@ static void rbd_dev_release(struct device *dev) ...@@ -2483,6 +2587,10 @@ static void rbd_dev_release(struct device *dev)
unregister_blkdev(rbd_dev->major, rbd_dev->name); unregister_blkdev(rbd_dev->major, rbd_dev->name);
/* done with the id, and with the rbd_dev */ /* done with the id, and with the rbd_dev */
kfree(rbd_dev->snap_name);
kfree(rbd_dev->header_name);
kfree(rbd_dev->pool_name);
kfree(rbd_dev->image_name);
rbd_id_put(rbd_dev); rbd_id_put(rbd_dev);
kfree(rbd_dev); kfree(rbd_dev);
...@@ -2544,7 +2652,7 @@ static ssize_t rbd_snap_add(struct device *dev, ...@@ -2544,7 +2652,7 @@ static ssize_t rbd_snap_add(struct device *dev,
if (ret < 0) if (ret < 0)
goto err_unlock; goto err_unlock;
ret = __rbd_refresh_header(rbd_dev); ret = __rbd_refresh_header(rbd_dev, NULL);
if (ret < 0) if (ret < 0)
goto err_unlock; goto err_unlock;
...@@ -2553,7 +2661,7 @@ static ssize_t rbd_snap_add(struct device *dev, ...@@ -2553,7 +2661,7 @@ static ssize_t rbd_snap_add(struct device *dev,
mutex_unlock(&ctl_mutex); mutex_unlock(&ctl_mutex);
/* make a best effort, don't error if failed */ /* make a best effort, don't error if failed */
rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); rbd_req_sync_notify(rbd_dev);
ret = count; ret = count;
kfree(name); kfree(name);
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#define RBD_MIN_OBJ_ORDER 16 #define RBD_MIN_OBJ_ORDER 16
#define RBD_MAX_OBJ_ORDER 30 #define RBD_MAX_OBJ_ORDER 30
#define RBD_MAX_OBJ_NAME_LEN 96
#define RBD_MAX_SEG_NAME_LEN 128 #define RBD_MAX_SEG_NAME_LEN 128
#define RBD_COMP_NONE 0 #define RBD_COMP_NONE 0
......
...@@ -51,8 +51,7 @@ int ceph_init_dentry(struct dentry *dentry) ...@@ -51,8 +51,7 @@ int ceph_init_dentry(struct dentry *dentry)
goto out_unlock; goto out_unlock;
} }
if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
d_set_d_op(dentry, &ceph_dentry_ops); d_set_d_op(dentry, &ceph_dentry_ops);
else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
d_set_d_op(dentry, &ceph_snapdir_dentry_ops); d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
...@@ -79,7 +78,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) ...@@ -79,7 +78,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
return NULL; return NULL;
spin_lock(&dentry->d_lock); spin_lock(&dentry->d_lock);
if (dentry->d_parent) { if (!IS_ROOT(dentry)) {
inode = dentry->d_parent->d_inode; inode = dentry->d_parent->d_inode;
ihold(inode); ihold(inode);
} }
...@@ -1154,7 +1153,7 @@ static void ceph_d_prune(struct dentry *dentry) ...@@ -1154,7 +1153,7 @@ static void ceph_d_prune(struct dentry *dentry)
dout("ceph_d_prune %p\n", dentry); dout("ceph_d_prune %p\n", dentry);
/* do we have a valid parent? */ /* do we have a valid parent? */
if (!dentry->d_parent || IS_ROOT(dentry)) if (IS_ROOT(dentry))
return; return;
/* if we are not hashed, we don't affect D_COMPLETE */ /* if we are not hashed, we don't affect D_COMPLETE */
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "super.h" #include "super.h"
#include "mds_client.h" #include "mds_client.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/messenger.h> #include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h> #include <linux/ceph/decode.h>
#include <linux/ceph/pagelist.h> #include <linux/ceph/pagelist.h>
...@@ -394,11 +395,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ...@@ -394,11 +395,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_seq = 0; s->s_seq = 0;
mutex_init(&s->s_mutex); mutex_init(&s->s_mutex);
ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
s->s_con.private = s;
s->s_con.ops = &mds_con_ops;
s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
s->s_con.peer_name.num = cpu_to_le64(mds);
spin_lock_init(&s->s_gen_ttl_lock); spin_lock_init(&s->s_gen_ttl_lock);
s->s_cap_gen = 0; s->s_cap_gen = 0;
...@@ -440,7 +437,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ...@@ -440,7 +437,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->sessions[mds] = s; mdsc->sessions[mds] = s;
atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
return s; return s;
...@@ -1472,11 +1470,6 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, ...@@ -1472,11 +1470,6 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
else else
len += 1 + temp->d_name.len; len += 1 + temp->d_name.len;
temp = temp->d_parent; temp = temp->d_parent;
if (temp == NULL) {
rcu_read_unlock();
pr_err("build_path corrupt dentry %p\n", dentry);
return ERR_PTR(-EINVAL);
}
} }
rcu_read_unlock(); rcu_read_unlock();
if (len) if (len)
...@@ -1513,12 +1506,6 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, ...@@ -1513,12 +1506,6 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
if (pos) if (pos)
path[--pos] = '/'; path[--pos] = '/';
temp = temp->d_parent; temp = temp->d_parent;
if (temp == NULL) {
rcu_read_unlock();
pr_err("build_path corrupt dentry\n");
kfree(path);
return ERR_PTR(-EINVAL);
}
} }
rcu_read_unlock(); rcu_read_unlock();
if (pos != 0 || read_seqretry(&rename_lock, seq)) { if (pos != 0 || read_seqretry(&rename_lock, seq)) {
...@@ -2531,7 +2518,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, ...@@ -2531,7 +2518,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0; session->s_seq = 0;
ceph_con_close(&session->s_con);
ceph_con_open(&session->s_con, ceph_con_open(&session->s_con,
CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
/* replay unsafe requests */ /* replay unsafe requests */
......
...@@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) ...@@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc; struct ceph_snap_context *snapc;
int err = 0; int err = 0;
int i; u32 num = realm->num_prior_parent_snaps + realm->num_snaps;
int num = realm->num_prior_parent_snaps + realm->num_snaps;
/* /*
* build parent context, if it hasn't been built. * build parent context, if it hasn't been built.
...@@ -321,11 +320,11 @@ static int build_snap_context(struct ceph_snap_realm *realm) ...@@ -321,11 +320,11 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->cached_context->seq == realm->seq && realm->cached_context->seq == realm->seq &&
(!parent || (!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) { realm->cached_context->seq >= parent->cached_context->seq)) {
dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
" (unchanged)\n", " (unchanged)\n",
realm->ino, realm, realm->cached_context, realm->ino, realm, realm->cached_context,
realm->cached_context->seq, realm->cached_context->seq,
realm->cached_context->num_snaps); (unsigned int) realm->cached_context->num_snaps);
return 0; return 0;
} }
...@@ -342,6 +341,8 @@ static int build_snap_context(struct ceph_snap_realm *realm) ...@@ -342,6 +341,8 @@ static int build_snap_context(struct ceph_snap_realm *realm)
num = 0; num = 0;
snapc->seq = realm->seq; snapc->seq = realm->seq;
if (parent) { if (parent) {
u32 i;
/* include any of parent's snaps occurring _after_ my /* include any of parent's snaps occurring _after_ my
parent became my parent */ parent became my parent */
for (i = 0; i < parent->cached_context->num_snaps; i++) for (i = 0; i < parent->cached_context->num_snaps; i++)
...@@ -361,8 +362,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) ...@@ -361,8 +362,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num; snapc->num_snaps = num;
dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
if (realm->cached_context) if (realm->cached_context)
ceph_put_snap_context(realm->cached_context); ceph_put_snap_context(realm->cached_context);
...@@ -402,9 +404,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm) ...@@ -402,9 +404,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm)
* helper to allocate and decode an array of snapids. free prior * helper to allocate and decode an array of snapids. free prior
* instance, if any. * instance, if any.
*/ */
static int dup_array(u64 **dst, __le64 *src, int num) static int dup_array(u64 **dst, __le64 *src, u32 num)
{ {
int i; u32 i;
kfree(*dst); kfree(*dst);
if (num) { if (num) {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "super.h" #include "super.h"
#include "mds_client.h" #include "mds_client.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h> #include <linux/ceph/decode.h>
#include <linux/ceph/mon_client.h> #include <linux/ceph/mon_client.h>
#include <linux/ceph/auth.h> #include <linux/ceph/auth.h>
......
...@@ -612,9 +612,9 @@ struct ceph_snap_realm { ...@@ -612,9 +612,9 @@ struct ceph_snap_realm {
u64 parent_since; /* snapid when our current parent became so */ u64 parent_since; /* snapid when our current parent became so */
u64 *prior_parent_snaps; /* snaps inherited from any parents we */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */
int num_prior_parent_snaps; /* had prior to parent_since */ u32 num_prior_parent_snaps; /* had prior to parent_since */
u64 *snaps; /* snaps specific to this realm */ u64 *snaps; /* snaps specific to this realm */
int num_snaps; u32 num_snaps;
struct ceph_snap_realm *parent; struct ceph_snap_realm *parent;
struct list_head children; /* list of child realms */ struct list_head children; /* list of child realms */
......
...@@ -457,6 +457,7 @@ static int __build_xattrs(struct inode *inode) ...@@ -457,6 +457,7 @@ static int __build_xattrs(struct inode *inode)
for (i = 0; i < numattr; i++) for (i = 0; i < numattr; i++)
kfree(xattrs[i]); kfree(xattrs[i]);
kfree(xattrs); kfree(xattrs);
xattrs = NULL;
goto start; goto start;
} }
err = -EIO; err = -EIO;
......
#ifndef __CEPH_FEATURES
#define __CEPH_FEATURES

/*
 * Feature bits.  Each feature is a single bit, so the values can be
 * OR-ed together into a feature mask.
 */
#define CEPH_FEATURE_UID		(1 << 0)
#define CEPH_FEATURE_NOSRCADDR		(1 << 1)
#define CEPH_FEATURE_MONCLOCKCHECK	(1 << 2)
#define CEPH_FEATURE_FLOCK		(1 << 3)
#define CEPH_FEATURE_SUBSCRIBE2		(1 << 4)
#define CEPH_FEATURE_MONNAMES		(1 << 5)
#define CEPH_FEATURE_RECONNECT_SEQ	(1 << 6)
#define CEPH_FEATURE_DIRLAYOUTHASH	(1 << 7)
/* bits 8-17 defined by user-space; not supported yet here */
#define CEPH_FEATURE_CRUSH_TUNABLES	(1 << 18)

/*
 * Default feature masks: the set of features supported, and the
 * subset of features required.
 */
#define CEPH_FEATURES_SUPPORTED_DEFAULT		\
	(CEPH_FEATURE_NOSRCADDR |		\
	 CEPH_FEATURE_CRUSH_TUNABLES)

#define CEPH_FEATURES_REQUIRED_DEFAULT		\
	(CEPH_FEATURE_NOSRCADDR)

#endif
...@@ -35,20 +35,6 @@ ...@@ -35,20 +35,6 @@
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31 #define CEPH_MAX_MON 31
/*
* feature bits
*/
#define CEPH_FEATURE_UID (1<<0)
#define CEPH_FEATURE_NOSRCADDR (1<<1)
#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
#define CEPH_FEATURE_FLOCK (1<<3)
#define CEPH_FEATURE_SUBSCRIBE2 (1<<4)
#define CEPH_FEATURE_MONNAMES (1<<5)
#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
/* /*
* ceph_file_layout - describe data layout for a file/inode * ceph_file_layout - describe data layout for a file/inode
*/ */
......
#ifndef __CEPH_DECODE_H #ifndef __CEPH_DECODE_H
#define __CEPH_DECODE_H #define __CEPH_DECODE_H
#include <linux/err.h>
#include <linux/bug.h> #include <linux/bug.h>
#include <linux/time.h> #include <linux/time.h>
#include <asm/unaligned.h> #include <asm/unaligned.h>
...@@ -84,6 +85,52 @@ static inline int ceph_has_room(void **p, void *end, size_t n) ...@@ -84,6 +85,52 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
ceph_decode_copy(p, pv, n); \ ceph_decode_copy(p, pv, n); \
} while (0) } while (0)
/*
 * Decode a wire-encoded string (a 32-bit length followed by that many
 * bytes) found at *p into a newly allocated buffer.  The returned
 * copy is always terminated with '\0'; a zero-length string is a
 * valid result.
 *
 * p:    in/out cursor into the encoded data.  On success it is
 *       advanced past the length field and the string bytes.
 * end:  limit of the encoded data; decoding never reads at or beyond
 *       this pointer.
 * lenp: if not a null pointer, receives the string length (not
 *       counting the terminating '\0') on success.
 * gfp:  allocation flags handed to kmalloc().
 *
 * Returns a pointer to the newly-allocated string, or a pointer-coded
 * errno on failure:
 *   - ERR_PTR(-ERANGE) if decoding the string would require accessing
 *     memory at or beyond the "end" pointer provided
 *   - ERR_PTR(-ENOMEM) if memory could not be allocated for the result
 * Neither *p nor *lenp is updated when an error is returned.
 */
static inline char *ceph_extract_encoded_string(void **p, void *end,
						size_t *lenp, gfp_t gfp)
{
	void *cursor = *p;	/* scratch position; *p changes only on success */
	char *str;
	u32 str_len;

	/* Read the length prefix; jumps to "bad" if it does not fit */
	ceph_decode_32_safe(&cursor, end, str_len, bad);

	/* The string bytes themselves must also fit before "end" */
	if (!ceph_has_room(&cursor, end, str_len))
		return ERR_PTR(-ERANGE);

	/* One extra byte for the terminating '\0' */
	str = kmalloc(str_len + 1, gfp);
	if (!str)
		return ERR_PTR(-ENOMEM);

	if (str_len)
		memcpy(str, cursor, str_len);
	str[str_len] = '\0';

	/* Success: commit the advance past the length field and the data */
	*p = (char *) *p + sizeof (u32) + str_len;
	if (lenp)
		*lenp = (size_t) str_len;

	return str;

bad:
	return ERR_PTR(-ERANGE);
}
/* /*
* struct ceph_timespec <-> struct timespec * struct ceph_timespec <-> struct timespec
*/ */
...@@ -151,7 +198,7 @@ static inline void ceph_encode_filepath(void **p, void *end, ...@@ -151,7 +198,7 @@ static inline void ceph_encode_filepath(void **p, void *end,
u64 ino, const char *path) u64 ino, const char *path)
{ {
u32 len = path ? strlen(path) : 0; u32 len = path ? strlen(path) : 0;
BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); BUG_ON(*p + 1 + sizeof(ino) + sizeof(len) + len > end);
ceph_encode_8(p, 1); ceph_encode_8(p, 1);
ceph_encode_64(p, ino); ceph_encode_64(p, ino);
ceph_encode_32(p, len); ceph_encode_32(p, len);
......
...@@ -22,12 +22,6 @@ ...@@ -22,12 +22,6 @@
#include "osd_client.h" #include "osd_client.h"
#include "ceph_fs.h" #include "ceph_fs.h"
/*
* Supported features
*/
#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR
/* /*
* mount options * mount options
*/ */
...@@ -132,7 +126,7 @@ struct ceph_client { ...@@ -132,7 +126,7 @@ struct ceph_client {
u32 supported_features; u32 supported_features;
u32 required_features; u32 required_features;
struct ceph_messenger *msgr; /* messenger instance */ struct ceph_messenger msgr; /* messenger instance */
struct ceph_mon_client monc; struct ceph_mon_client monc;
struct ceph_osd_client osdc; struct ceph_osd_client osdc;
...@@ -160,7 +154,7 @@ struct ceph_client { ...@@ -160,7 +154,7 @@ struct ceph_client {
struct ceph_snap_context { struct ceph_snap_context {
atomic_t nref; atomic_t nref;
u64 seq; u64 seq;
int num_snaps; u32 num_snaps;
u64 snaps[]; u64 snaps[];
}; };
......
...@@ -31,9 +31,6 @@ struct ceph_connection_operations { ...@@ -31,9 +31,6 @@ struct ceph_connection_operations {
int (*verify_authorizer_reply) (struct ceph_connection *con, int len); int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
int (*invalidate_authorizer)(struct ceph_connection *con); int (*invalidate_authorizer)(struct ceph_connection *con);
/* protocol version mismatch */
void (*bad_proto) (struct ceph_connection *con);
/* there was some error on the socket (disconnect, whatever) */ /* there was some error on the socket (disconnect, whatever) */
void (*fault) (struct ceph_connection *con); void (*fault) (struct ceph_connection *con);
...@@ -53,6 +50,7 @@ struct ceph_messenger { ...@@ -53,6 +50,7 @@ struct ceph_messenger {
struct ceph_entity_inst inst; /* my name+address */ struct ceph_entity_inst inst; /* my name+address */
struct ceph_entity_addr my_enc_addr; struct ceph_entity_addr my_enc_addr;
atomic_t stopping;
bool nocrc; bool nocrc;
/* /*
...@@ -80,7 +78,10 @@ struct ceph_msg { ...@@ -80,7 +78,10 @@ struct ceph_msg {
unsigned nr_pages; /* size of page array */ unsigned nr_pages; /* size of page array */
unsigned page_alignment; /* io offset in first page */ unsigned page_alignment; /* io offset in first page */
struct ceph_pagelist *pagelist; /* instead of pages */ struct ceph_pagelist *pagelist; /* instead of pages */
struct ceph_connection *con;
struct list_head list_head; struct list_head list_head;
struct kref kref; struct kref kref;
struct bio *bio; /* instead of pages/pagelist */ struct bio *bio; /* instead of pages/pagelist */
struct bio *bio_iter; /* bio iterator */ struct bio *bio_iter; /* bio iterator */
...@@ -105,23 +106,6 @@ struct ceph_msg_pos { ...@@ -105,23 +106,6 @@ struct ceph_msg_pos {
#define BASE_DELAY_INTERVAL (HZ/2) #define BASE_DELAY_INTERVAL (HZ/2)
#define MAX_DELAY_INTERVAL (5 * 60 * HZ) #define MAX_DELAY_INTERVAL (5 * 60 * HZ)
/*
* ceph_connection state bit flags
*/
#define LOSSYTX 0 /* we can close channel or drop messages on errors */
#define CONNECTING 1
#define NEGOTIATING 2
#define KEEPALIVE_PENDING 3
#define WRITE_PENDING 4 /* we have data ready to send */
#define STANDBY 8 /* no outgoing messages, socket closed. we keep
* the ceph_connection around to maintain shared
* state with the peer. */
#define CLOSED 10 /* we've closed the connection */
#define SOCK_CLOSED 11 /* socket state changed to closed */
#define OPENING 13 /* open connection w/ (possibly new) peer */
#define DEAD 14 /* dead, about to kfree */
#define BACKOFF 15
/* /*
* A single connection with another host. * A single connection with another host.
* *
...@@ -131,18 +115,22 @@ struct ceph_msg_pos { ...@@ -131,18 +115,22 @@ struct ceph_msg_pos {
*/ */
struct ceph_connection { struct ceph_connection {
void *private; void *private;
atomic_t nref;
const struct ceph_connection_operations *ops; const struct ceph_connection_operations *ops;
struct ceph_messenger *msgr; struct ceph_messenger *msgr;
atomic_t sock_state;
struct socket *sock; struct socket *sock;
unsigned long state; /* connection state (see flags above) */ struct ceph_entity_addr peer_addr; /* peer address */
struct ceph_entity_addr peer_addr_for_me;
unsigned long flags;
unsigned long state;
const char *error_msg; /* error message, if any */ const char *error_msg; /* error message, if any */
struct ceph_entity_addr peer_addr; /* peer address */
struct ceph_entity_name peer_name; /* peer name */ struct ceph_entity_name peer_name; /* peer name */
struct ceph_entity_addr peer_addr_for_me;
unsigned peer_features; unsigned peer_features;
u32 connect_seq; /* identify the most recent connection u32 connect_seq; /* identify the most recent connection
attempt for this connection, client */ attempt for this connection, client */
...@@ -207,24 +195,26 @@ extern int ceph_msgr_init(void); ...@@ -207,24 +195,26 @@ extern int ceph_msgr_init(void);
extern void ceph_msgr_exit(void); extern void ceph_msgr_exit(void);
extern void ceph_msgr_flush(void); extern void ceph_msgr_flush(void);
extern struct ceph_messenger *ceph_messenger_create( extern void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr, struct ceph_entity_addr *myaddr,
u32 features, u32 required); u32 supported_features,
extern void ceph_messenger_destroy(struct ceph_messenger *); u32 required_features,
bool nocrc);
extern void ceph_con_init(struct ceph_messenger *msgr, extern void ceph_con_init(struct ceph_connection *con, void *private,
struct ceph_connection *con); const struct ceph_connection_operations *ops,
struct ceph_messenger *msgr);
extern void ceph_con_open(struct ceph_connection *con, extern void ceph_con_open(struct ceph_connection *con,
__u8 entity_type, __u64 entity_num,
struct ceph_entity_addr *addr); struct ceph_entity_addr *addr);
extern bool ceph_con_opened(struct ceph_connection *con); extern bool ceph_con_opened(struct ceph_connection *con);
extern void ceph_con_close(struct ceph_connection *con); extern void ceph_con_close(struct ceph_connection *con);
extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
extern void ceph_con_revoke_message(struct ceph_connection *con, extern void ceph_msg_revoke(struct ceph_msg *msg);
struct ceph_msg *msg); extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
extern void ceph_con_keepalive(struct ceph_connection *con); extern void ceph_con_keepalive(struct ceph_connection *con);
extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
extern void ceph_con_put(struct ceph_connection *con);
extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
bool can_fail); bool can_fail);
......
...@@ -70,7 +70,7 @@ struct ceph_mon_client { ...@@ -70,7 +70,7 @@ struct ceph_mon_client {
bool hunting; bool hunting;
int cur_mon; /* last monitor i contacted */ int cur_mon; /* last monitor i contacted */
unsigned long sub_sent, sub_renew_after; unsigned long sub_sent, sub_renew_after;
struct ceph_connection *con; struct ceph_connection con;
bool have_fsid; bool have_fsid;
/* pending generic requests */ /* pending generic requests */
......
...@@ -11,10 +11,11 @@ ...@@ -11,10 +11,11 @@
struct ceph_msgpool { struct ceph_msgpool {
const char *name; const char *name;
mempool_t *pool; mempool_t *pool;
int type; /* preallocated message type */
int front_len; /* preallocated payload size */ int front_len; /* preallocated payload size */
}; };
extern int ceph_msgpool_init(struct ceph_msgpool *pool, extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
int front_len, int size, bool blocking, int front_len, int size, bool blocking,
const char *name); const char *name);
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
......
...@@ -154,6 +154,14 @@ struct crush_map { ...@@ -154,6 +154,14 @@ struct crush_map {
__s32 max_buckets; __s32 max_buckets;
__u32 max_rules; __u32 max_rules;
__s32 max_devices; __s32 max_devices;
/* choose local retries before re-descent */
__u32 choose_local_tries;
/* choose local attempts using a fallback permutation before
* re-descent */
__u32 choose_local_fallback_tries;
/* choose attempts before giving up */
__u32 choose_total_tries;
}; };
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <linux/string.h> #include <linux/string.h>
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h> #include <linux/ceph/libceph.h>
#include <linux/ceph/debugfs.h> #include <linux/ceph/debugfs.h>
#include <linux/ceph/decode.h> #include <linux/ceph/decode.h>
...@@ -460,27 +461,23 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, ...@@ -460,27 +461,23 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
client->auth_err = 0; client->auth_err = 0;
client->extra_mon_dispatch = NULL; client->extra_mon_dispatch = NULL;
client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
supported_features; supported_features;
client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
required_features; required_features;
/* msgr */ /* msgr */
if (ceph_test_opt(client, MYIP)) if (ceph_test_opt(client, MYIP))
myaddr = &client->options->my_addr; myaddr = &client->options->my_addr;
client->msgr = ceph_messenger_create(myaddr, ceph_messenger_init(&client->msgr, myaddr,
client->supported_features, client->supported_features,
client->required_features); client->required_features,
if (IS_ERR(client->msgr)) { ceph_test_opt(client, NOCRC));
err = PTR_ERR(client->msgr);
goto fail;
}
client->msgr->nocrc = ceph_test_opt(client, NOCRC);
/* subsystems */ /* subsystems */
err = ceph_monc_init(&client->monc, client); err = ceph_monc_init(&client->monc, client);
if (err < 0) if (err < 0)
goto fail_msgr; goto fail;
err = ceph_osdc_init(&client->osdc, client); err = ceph_osdc_init(&client->osdc, client);
if (err < 0) if (err < 0)
goto fail_monc; goto fail_monc;
...@@ -489,8 +486,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, ...@@ -489,8 +486,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
fail_monc: fail_monc:
ceph_monc_stop(&client->monc); ceph_monc_stop(&client->monc);
fail_msgr:
ceph_messenger_destroy(client->msgr);
fail: fail:
kfree(client); kfree(client);
return ERR_PTR(err); return ERR_PTR(err);
...@@ -501,6 +496,8 @@ void ceph_destroy_client(struct ceph_client *client) ...@@ -501,6 +496,8 @@ void ceph_destroy_client(struct ceph_client *client)
{ {
dout("destroy_client %p\n", client); dout("destroy_client %p\n", client);
atomic_set(&client->msgr.stopping, 1);
/* unmount */ /* unmount */
ceph_osdc_stop(&client->osdc); ceph_osdc_stop(&client->osdc);
...@@ -508,8 +505,6 @@ void ceph_destroy_client(struct ceph_client *client) ...@@ -508,8 +505,6 @@ void ceph_destroy_client(struct ceph_client *client)
ceph_debugfs_client_cleanup(client); ceph_debugfs_client_cleanup(client);
ceph_messenger_destroy(client->msgr);
ceph_destroy_options(client->options); ceph_destroy_options(client->options);
kfree(client); kfree(client);
......
...@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map, ...@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
int item = 0; int item = 0;
int itemtype; int itemtype;
int collide, reject; int collide, reject;
const unsigned int orig_tries = 5; /* attempts before we fall back to search */
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep); bucket->id, x, outpos, numrep);
...@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map, ...@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
reject = 1; reject = 1;
goto reject; goto reject;
} }
if (flocal >= (in->size>>1) && if (map->choose_local_fallback_tries > 0 &&
flocal > orig_tries) flocal >= (in->size>>1) &&
flocal > map->choose_local_fallback_tries)
item = bucket_perm_choose(in, x, r); item = bucket_perm_choose(in, x, r);
else else
item = crush_bucket_choose(in, x, r); item = crush_bucket_choose(in, x, r);
...@@ -422,13 +422,14 @@ static int crush_choose(const struct crush_map *map, ...@@ -422,13 +422,14 @@ static int crush_choose(const struct crush_map *map,
ftotal++; ftotal++;
flocal++; flocal++;
if (collide && flocal < 3) if (collide && flocal <= map->choose_local_tries)
/* retry locally a few times */ /* retry locally a few times */
retry_bucket = 1; retry_bucket = 1;
else if (flocal <= in->size + orig_tries) else if (map->choose_local_fallback_tries > 0 &&
flocal <= in->size + map->choose_local_fallback_tries)
/* exhaustive bucket search */ /* exhaustive bucket search */
retry_bucket = 1; retry_bucket = 1;
else if (ftotal < 20) else if (ftotal <= map->choose_total_tries)
/* then retry descent */ /* then retry descent */
retry_descent = 1; retry_descent = 1;
else else
......
...@@ -29,6 +29,74 @@ ...@@ -29,6 +29,74 @@
* the sender. * the sender.
*/ */
/*
* We track the state of the socket on a given connection using
* values defined below. The transition to a new socket state is
* handled by a function which verifies we aren't coming from an
* unexpected state.
*
*      --------
*      | NEW* |  transient initial state
*      --------
*          | con_sock_state_init()
*          v
*      ----------
*      | CLOSED |  initialized, but no socket (and no
*      ----------  TCP connection)
*       ^      \
*       |       \ con_sock_state_connecting()
*       |        ----------------------
*       |                              \
*       + con_sock_state_closed()       \
*       |+---------------------------    \
*       | \                          \    \
*       |  -----------                \    \
*       |  | CLOSING |  socket event;  \    \
*       |  -----------  await close     \    \
*       |       ^                        \   |
*       |       |                         \  |
*       |       + con_sock_state_closing() \ |
*       |      / \                         | |
*       |     /   ---------------          | |
*       |    /                   \         v v
*       |   /                    --------------
*       |  /    -----------------| CONNECTING |  socket created, TCP
*       |  |   /                 --------------  connect initiated
*       |  |   | con_sock_state_connected()
*       |  |   v
*      -------------
*      | CONNECTED |  TCP connection established
*      -------------
*
* State values for ceph_connection->sock_state; NEW is assumed to be 0.
*/
/*
 * The arrows list the old -> new transitions that the
 * con_sock_state_*() helpers accept without warning.
 */
#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */
#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING (or stays CLOSED) */
#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED, -> CLOSING or -> CLOSED */
#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED (or stays CLOSING) */
/*
* connection states
*
* These are plain values: con->state is assigned one of them directly
* and compared with == / !=, rather than being treated as a set of bits.
*/
#define CON_STATE_CLOSED 1 /* -> PREOPEN */
#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
/*
* ceph_connection flag bits
*
* These are bit numbers (not masks) within con->flags; they are used
* with set_bit(), clear_bit() and test_bit().
*/
#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
* messages on errors */
#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
/* static tag bytes (protocol control messages) */ /* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_ack = CEPH_MSGR_TAG_ACK;
...@@ -147,72 +215,130 @@ void ceph_msgr_flush(void) ...@@ -147,72 +215,130 @@ void ceph_msgr_flush(void)
} }
EXPORT_SYMBOL(ceph_msgr_flush); EXPORT_SYMBOL(ceph_msgr_flush);
/* Connection socket state transition functions */
/*
 * Move the connection's socket state from NEW to CLOSED.
 *
 * The new state is installed unconditionally; a previous state other
 * than NEW only produces a warning and a log line.
 */
static void con_sock_state_init(struct ceph_connection *con)
{
	const int next = CON_SOCK_STATE_CLOSED;
	const int prev = atomic_xchg(&con->sock_state, next);

	if (WARN_ON(prev != CON_SOCK_STATE_NEW))
		printk("%s: unexpected old state %d\n", __func__, prev);

	dout("%s con %p sock %d -> %d\n", __func__, con, prev, next);
}
/*
 * Record that a connect has been started: socket state becomes
 * CONNECTING.
 *
 * Only CLOSED is an expected previous state; anything else is reported
 * with a warning, but the new state is stored regardless.
 */
static void con_sock_state_connecting(struct ceph_connection *con)
{
	const int next = CON_SOCK_STATE_CONNECTING;
	const int prev = atomic_xchg(&con->sock_state, next);

	if (WARN_ON(prev != CON_SOCK_STATE_CLOSED))
		printk("%s: unexpected old state %d\n", __func__, prev);

	dout("%s con %p sock %d -> %d\n", __func__, con, prev, next);
}
/*
 * Record that the connection has been established: socket state
 * becomes CONNECTED.
 *
 * Only CONNECTING is an expected previous state; anything else is
 * reported with a warning, but the new state is stored regardless.
 */
static void con_sock_state_connected(struct ceph_connection *con)
{
	const int next = CON_SOCK_STATE_CONNECTED;
	const int prev = atomic_xchg(&con->sock_state, next);

	if (WARN_ON(prev != CON_SOCK_STATE_CONNECTING))
		printk("%s: unexpected old state %d\n", __func__, prev);

	dout("%s con %p sock %d -> %d\n", __func__, con, prev, next);
}
/*
 * Record that the socket is going away: socket state becomes CLOSING.
 *
 * Expected previous states are CONNECTING, CONNECTED and CLOSING
 * itself; any other previous state is reported with a warning.  The
 * new state is stored in every case.
 */
static void con_sock_state_closing(struct ceph_connection *con)
{
	const int next = CON_SOCK_STATE_CLOSING;
	const int prev = atomic_xchg(&con->sock_state, next);

	if (WARN_ON(!(prev == CON_SOCK_STATE_CONNECTING ||
		      prev == CON_SOCK_STATE_CONNECTED ||
		      prev == CON_SOCK_STATE_CLOSING)))
		printk("%s: unexpected old state %d\n", __func__, prev);

	dout("%s con %p sock %d -> %d\n", __func__, con, prev, next);
}
/*
 * Record that the connection no longer has a socket: socket state
 * becomes CLOSED.
 *
 * Expected previous states are CONNECTED, CLOSING, CONNECTING and
 * CLOSED itself; any other previous state is reported with a warning.
 * The new state is stored in every case.
 */
static void con_sock_state_closed(struct ceph_connection *con)
{
	const int next = CON_SOCK_STATE_CLOSED;
	const int prev = atomic_xchg(&con->sock_state, next);

	if (WARN_ON(!(prev == CON_SOCK_STATE_CONNECTED ||
		      prev == CON_SOCK_STATE_CLOSING ||
		      prev == CON_SOCK_STATE_CONNECTING ||
		      prev == CON_SOCK_STATE_CLOSED)))
		printk("%s: unexpected old state %d\n", __func__, prev);

	dout("%s con %p sock %d -> %d\n", __func__, con, prev, next);
}
/* /*
* socket callback functions * socket callback functions
*/ */
/* data available on socket, or listen socket received a connect */ /* data available on socket, or listen socket received a connect */
static void ceph_data_ready(struct sock *sk, int count_unused) static void ceph_sock_data_ready(struct sock *sk, int count_unused)
{ {
struct ceph_connection *con = sk->sk_user_data; struct ceph_connection *con = sk->sk_user_data;
if (atomic_read(&con->msgr->stopping)) {
return;
}
if (sk->sk_state != TCP_CLOSE_WAIT) { if (sk->sk_state != TCP_CLOSE_WAIT) {
dout("ceph_data_ready on %p state = %lu, queueing work\n", dout("%s on %p state = %lu, queueing work\n", __func__,
con, con->state); con, con->state);
queue_con(con); queue_con(con);
} }
} }
/* socket has buffer space for writing */ /* socket has buffer space for writing */
static void ceph_write_space(struct sock *sk) static void ceph_sock_write_space(struct sock *sk)
{ {
struct ceph_connection *con = sk->sk_user_data; struct ceph_connection *con = sk->sk_user_data;
/* only queue to workqueue if there is data we want to write, /* only queue to workqueue if there is data we want to write,
* and there is sufficient space in the socket buffer to accept * and there is sufficient space in the socket buffer to accept
* more data. clear SOCK_NOSPACE so that ceph_write_space() * more data. clear SOCK_NOSPACE so that ceph_sock_write_space()
* doesn't get called again until try_write() fills the socket * doesn't get called again until try_write() fills the socket
* buffer. See net/ipv4/tcp_input.c:tcp_check_space() * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
* and net/core/stream.c:sk_stream_write_space(). * and net/core/stream.c:sk_stream_write_space().
*/ */
if (test_bit(WRITE_PENDING, &con->state)) { if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) {
if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
dout("ceph_write_space %p queueing write work\n", con); dout("%s %p queueing write work\n", __func__, con);
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
queue_con(con); queue_con(con);
} }
} else { } else {
dout("ceph_write_space %p nothing to write\n", con); dout("%s %p nothing to write\n", __func__, con);
} }
} }
/* socket's state has changed */ /* socket's state has changed */
static void ceph_state_change(struct sock *sk) static void ceph_sock_state_change(struct sock *sk)
{ {
struct ceph_connection *con = sk->sk_user_data; struct ceph_connection *con = sk->sk_user_data;
dout("ceph_state_change %p state = %lu sk_state = %u\n", dout("%s %p state = %lu sk_state = %u\n", __func__,
con, con->state, sk->sk_state); con, con->state, sk->sk_state);
if (test_bit(CLOSED, &con->state))
return;
switch (sk->sk_state) { switch (sk->sk_state) {
case TCP_CLOSE: case TCP_CLOSE:
dout("ceph_state_change TCP_CLOSE\n"); dout("%s TCP_CLOSE\n", __func__);
case TCP_CLOSE_WAIT: case TCP_CLOSE_WAIT:
dout("ceph_state_change TCP_CLOSE_WAIT\n"); dout("%s TCP_CLOSE_WAIT\n", __func__);
if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { con_sock_state_closing(con);
if (test_bit(CONNECTING, &con->state)) set_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
con->error_msg = "connection failed"; queue_con(con);
else
con->error_msg = "socket closed";
queue_con(con);
}
break; break;
case TCP_ESTABLISHED: case TCP_ESTABLISHED:
dout("ceph_state_change TCP_ESTABLISHED\n"); dout("%s TCP_ESTABLISHED\n", __func__);
con_sock_state_connected(con);
queue_con(con); queue_con(con);
break; break;
default: /* Everything else is uninteresting */ default: /* Everything else is uninteresting */
...@@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock, ...@@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock,
{ {
struct sock *sk = sock->sk; struct sock *sk = sock->sk;
sk->sk_user_data = con; sk->sk_user_data = con;
sk->sk_data_ready = ceph_data_ready; sk->sk_data_ready = ceph_sock_data_ready;
sk->sk_write_space = ceph_write_space; sk->sk_write_space = ceph_sock_write_space;
sk->sk_state_change = ceph_state_change; sk->sk_state_change = ceph_sock_state_change;
} }
...@@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) ...@@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
con_sock_state_connecting(con);
ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
O_NONBLOCK); O_NONBLOCK);
if (ret == -EINPROGRESS) { if (ret == -EINPROGRESS) {
...@@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) ...@@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret; return ret;
} }
con->sock = sock; con->sock = sock;
return 0; return 0;
} }
...@@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, ...@@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
*/ */
static int con_close_socket(struct ceph_connection *con) static int con_close_socket(struct ceph_connection *con)
{ {
int rc; int rc = 0;
dout("con_close_socket on %p sock %p\n", con, con->sock); dout("con_close_socket on %p sock %p\n", con, con->sock);
if (!con->sock) if (con->sock) {
return 0; rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
set_bit(SOCK_CLOSED, &con->state); sock_release(con->sock);
rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); con->sock = NULL;
sock_release(con->sock); }
con->sock = NULL;
clear_bit(SOCK_CLOSED, &con->state); /*
* Forcibly clear the SOCK_CLOSED flag. It gets set
* independent of the connection mutex, and we could have
* received a socket close event before we had the chance to
* shut the socket down.
*/
clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
con_sock_state_closed(con);
return rc; return rc;
} }
...@@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con) ...@@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con)
static void ceph_msg_remove(struct ceph_msg *msg) static void ceph_msg_remove(struct ceph_msg *msg)
{ {
list_del_init(&msg->list_head); list_del_init(&msg->list_head);
BUG_ON(msg->con == NULL);
msg->con->ops->put(msg->con);
msg->con = NULL;
ceph_msg_put(msg); ceph_msg_put(msg);
} }
static void ceph_msg_remove_list(struct list_head *head) static void ceph_msg_remove_list(struct list_head *head)
...@@ -372,8 +510,11 @@ static void reset_connection(struct ceph_connection *con) ...@@ -372,8 +510,11 @@ static void reset_connection(struct ceph_connection *con)
ceph_msg_remove_list(&con->out_sent); ceph_msg_remove_list(&con->out_sent);
if (con->in_msg) { if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
con->in_msg->con = NULL;
ceph_msg_put(con->in_msg); ceph_msg_put(con->in_msg);
con->in_msg = NULL; con->in_msg = NULL;
con->ops->put(con);
} }
con->connect_seq = 0; con->connect_seq = 0;
...@@ -391,32 +532,44 @@ static void reset_connection(struct ceph_connection *con) ...@@ -391,32 +532,44 @@ static void reset_connection(struct ceph_connection *con)
*/ */
void ceph_con_close(struct ceph_connection *con) void ceph_con_close(struct ceph_connection *con)
{ {
mutex_lock(&con->mutex);
dout("con_close %p peer %s\n", con, dout("con_close %p peer %s\n", con,
ceph_pr_addr(&con->peer_addr.in_addr)); ceph_pr_addr(&con->peer_addr.in_addr));
set_bit(CLOSED, &con->state); /* in case there's queued work */ con->state = CON_STATE_CLOSED;
clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ clear_bit(CON_FLAG_LOSSYTX, &con->flags); /* so we retry next connect */
clear_bit(KEEPALIVE_PENDING, &con->state); clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
clear_bit(WRITE_PENDING, &con->state); clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
mutex_lock(&con->mutex); clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
clear_bit(CON_FLAG_BACKOFF, &con->flags);
reset_connection(con); reset_connection(con);
con->peer_global_seq = 0; con->peer_global_seq = 0;
cancel_delayed_work(&con->work); cancel_delayed_work(&con->work);
con_close_socket(con);
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
queue_con(con);
} }
EXPORT_SYMBOL(ceph_con_close); EXPORT_SYMBOL(ceph_con_close);
/* /*
* Reopen a closed connection, with a new peer address. * Reopen a closed connection, with a new peer address.
*/ */
void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) void ceph_con_open(struct ceph_connection *con,
__u8 entity_type, __u64 entity_num,
struct ceph_entity_addr *addr)
{ {
mutex_lock(&con->mutex);
dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
set_bit(OPENING, &con->state);
clear_bit(CLOSED, &con->state); BUG_ON(con->state != CON_STATE_CLOSED);
con->state = CON_STATE_PREOPEN;
con->peer_name.type = (__u8) entity_type;
con->peer_name.num = cpu_to_le64(entity_num);
memcpy(&con->peer_addr, addr, sizeof(*addr)); memcpy(&con->peer_addr, addr, sizeof(*addr));
con->delay = 0; /* reset backoff memory */ con->delay = 0; /* reset backoff memory */
mutex_unlock(&con->mutex);
queue_con(con); queue_con(con);
} }
EXPORT_SYMBOL(ceph_con_open); EXPORT_SYMBOL(ceph_con_open);
...@@ -429,43 +582,27 @@ bool ceph_con_opened(struct ceph_connection *con) ...@@ -429,43 +582,27 @@ bool ceph_con_opened(struct ceph_connection *con)
return con->connect_seq > 0; return con->connect_seq > 0;
} }
/*
* generic get/put
*/
/*
 * Take a reference on @con.
 *
 * Returns @con when the count reported by __atomic_add_unless() was
 * nonzero, NULL otherwise.
 *
 * NOTE(review): presumably __atomic_add_unless(v, 1, 0) returns the old
 * count and leaves it untouched when it is already 0 -- confirm.  If so,
 * the debug line below reports "0 -> 1" even though nothing was taken.
 */
struct ceph_connection *ceph_con_get(struct ceph_connection *con)
{
int nref = __atomic_add_unless(&con->nref, 1, 0);
dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1);
return nref ? con : NULL;
}
/*
 * Drop a reference on @con and free it when the count reaches zero.
 *
 * A negative count is a bug, as is freeing a connection that still has
 * a socket attached.
 */
void ceph_con_put(struct ceph_connection *con)
{
int nref = atomic_dec_return(&con->nref);
BUG_ON(nref < 0);
if (nref == 0) {
BUG_ON(con->sock);
kfree(con);
}
/* con may have been freed above; only its address is printed here */
dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref);
}
/* /*
* initialize a new connection. * initialize a new connection.
*/ */
void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) void ceph_con_init(struct ceph_connection *con, void *private,
const struct ceph_connection_operations *ops,
struct ceph_messenger *msgr)
{ {
dout("con_init %p\n", con); dout("con_init %p\n", con);
memset(con, 0, sizeof(*con)); memset(con, 0, sizeof(*con));
atomic_set(&con->nref, 1); con->private = private;
con->ops = ops;
con->msgr = msgr; con->msgr = msgr;
con_sock_state_init(con);
mutex_init(&con->mutex); mutex_init(&con->mutex);
INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_queue);
INIT_LIST_HEAD(&con->out_sent); INIT_LIST_HEAD(&con->out_sent);
INIT_DELAYED_WORK(&con->work, con_work); INIT_DELAYED_WORK(&con->work, con_work);
con->state = CON_STATE_CLOSED;
} }
EXPORT_SYMBOL(ceph_con_init); EXPORT_SYMBOL(ceph_con_init);
...@@ -486,14 +623,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) ...@@ -486,14 +623,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
return ret; return ret;
} }
static void ceph_con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_reset(struct ceph_connection *con)
{ {
con->out_kvec_left = 0; con->out_kvec_left = 0;
con->out_kvec_bytes = 0; con->out_kvec_bytes = 0;
con->out_kvec_cur = &con->out_kvec[0]; con->out_kvec_cur = &con->out_kvec[0];
} }
static void ceph_con_out_kvec_add(struct ceph_connection *con, static void con_out_kvec_add(struct ceph_connection *con,
size_t size, void *data) size_t size, void *data)
{ {
int index; int index;
...@@ -507,6 +644,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con, ...@@ -507,6 +644,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con,
con->out_kvec_bytes += size; con->out_kvec_bytes += size;
} }
#ifdef CONFIG_BLOCK
/*
 * Point the (*iter, *seg) cursor at the start of @bio.
 *
 * With a NULL @bio the cursor is cleared: *iter becomes NULL and *seg
 * becomes 0.  Otherwise *seg starts at the bio's own bi_idx.
 */
static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
{
	*iter = bio;
	*seg = bio ? bio->bi_idx : 0;
}
/*
 * Advance the (*bio_iter, *seg) cursor by one segment.
 *
 * Does nothing when the cursor is already NULL.  When the last segment
 * of the current bio has been consumed, the cursor is re-initialized on
 * the next bio in the chain (bi_next), which may be NULL.
 */
static void iter_bio_next(struct bio **bio_iter, int *seg)
{
	struct bio *cur = *bio_iter;

	if (!cur)
		return;

	BUG_ON(*seg >= cur->bi_vcnt);

	if (++(*seg) == cur->bi_vcnt)
		init_bio_iter(cur->bi_next, bio_iter, seg);
}
#endif
/*
 * Reset the output position of @con so the data portion of its current
 * outgoing message can be sent from the beginning.
 *
 * The connection must have an outgoing message, and that message must
 * have a nonzero data length.
 */
static void prepare_write_message_data(struct ceph_connection *con)
{
	struct ceph_msg *out = con->out_msg;

	BUG_ON(!out);
	BUG_ON(!out->hdr.data_len);

	/* start at page 0; only a page array supplies a starting offset */
	con->out_msg_pos.page = 0;
	con->out_msg_pos.page_pos = out->pages ? out->page_alignment : 0;
	con->out_msg_pos.data_pos = 0;
	con->out_msg_pos.did_page_crc = false;

#ifdef CONFIG_BLOCK
	if (out->bio)
		init_bio_iter(out->bio, &out->bio_iter, &out->bio_seg);
#endif

	con->out_more = 1;	/* data + footer will follow */
}
/* /*
* Prepare footer for currently outgoing message, and finish things * Prepare footer for currently outgoing message, and finish things
* off. Assumes out_kvec* are already valid.. we just add on to the end. * off. Assumes out_kvec* are already valid.. we just add on to the end.
...@@ -516,6 +700,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) ...@@ -516,6 +700,8 @@ static void prepare_write_message_footer(struct ceph_connection *con)
struct ceph_msg *m = con->out_msg; struct ceph_msg *m = con->out_msg;
int v = con->out_kvec_left; int v = con->out_kvec_left;
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con); dout("prepare_write_message_footer %p\n", con);
con->out_kvec_is_msg = true; con->out_kvec_is_msg = true;
con->out_kvec[v].iov_base = &m->footer; con->out_kvec[v].iov_base = &m->footer;
...@@ -534,7 +720,7 @@ static void prepare_write_message(struct ceph_connection *con) ...@@ -534,7 +720,7 @@ static void prepare_write_message(struct ceph_connection *con)
struct ceph_msg *m; struct ceph_msg *m;
u32 crc; u32 crc;
ceph_con_out_kvec_reset(con); con_out_kvec_reset(con);
con->out_kvec_is_msg = true; con->out_kvec_is_msg = true;
con->out_msg_done = false; con->out_msg_done = false;
...@@ -542,14 +728,16 @@ static void prepare_write_message(struct ceph_connection *con) ...@@ -542,14 +728,16 @@ static void prepare_write_message(struct ceph_connection *con)
* TCP packet that's a good thing. */ * TCP packet that's a good thing. */
if (con->in_seq > con->in_seq_acked) { if (con->in_seq > con->in_seq_acked) {
con->in_seq_acked = con->in_seq; con->in_seq_acked = con->in_seq;
ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
con->out_temp_ack = cpu_to_le64(con->in_seq_acked); con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), con_out_kvec_add(con, sizeof (con->out_temp_ack),
&con->out_temp_ack); &con->out_temp_ack);
} }
BUG_ON(list_empty(&con->out_queue));
m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
con->out_msg = m; con->out_msg = m;
BUG_ON(m->con != con);
/* put message on sent list */ /* put message on sent list */
ceph_msg_get(m); ceph_msg_get(m);
...@@ -576,18 +764,18 @@ static void prepare_write_message(struct ceph_connection *con) ...@@ -576,18 +764,18 @@ static void prepare_write_message(struct ceph_connection *con)
BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
/* tag + hdr + front + middle */ /* tag + hdr + front + middle */
ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
if (m->middle) if (m->middle)
ceph_con_out_kvec_add(con, m->middle->vec.iov_len, con_out_kvec_add(con, m->middle->vec.iov_len,
m->middle->vec.iov_base); m->middle->vec.iov_base);
/* fill in crc (except data pages), footer */ /* fill in crc (except data pages), footer */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
con->out_msg->hdr.crc = cpu_to_le32(crc); con->out_msg->hdr.crc = cpu_to_le32(crc);
con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; con->out_msg->footer.flags = 0;
crc = crc32c(0, m->front.iov_base, m->front.iov_len); crc = crc32c(0, m->front.iov_base, m->front.iov_len);
con->out_msg->footer.front_crc = cpu_to_le32(crc); con->out_msg->footer.front_crc = cpu_to_le32(crc);
...@@ -597,28 +785,19 @@ static void prepare_write_message(struct ceph_connection *con) ...@@ -597,28 +785,19 @@ static void prepare_write_message(struct ceph_connection *con)
con->out_msg->footer.middle_crc = cpu_to_le32(crc); con->out_msg->footer.middle_crc = cpu_to_le32(crc);
} else } else
con->out_msg->footer.middle_crc = 0; con->out_msg->footer.middle_crc = 0;
con->out_msg->footer.data_crc = 0; dout("%s front_crc %u middle_crc %u\n", __func__,
dout("prepare_write_message front_crc %u data_crc %u\n",
le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.front_crc),
le32_to_cpu(con->out_msg->footer.middle_crc)); le32_to_cpu(con->out_msg->footer.middle_crc));
/* is there a data payload? */ /* is there a data payload? */
if (le32_to_cpu(m->hdr.data_len) > 0) { con->out_msg->footer.data_crc = 0;
/* initialize page iterator */ if (m->hdr.data_len)
con->out_msg_pos.page = 0; prepare_write_message_data(con);
if (m->pages) else
con->out_msg_pos.page_pos = m->page_alignment;
else
con->out_msg_pos.page_pos = 0;
con->out_msg_pos.data_pos = 0;
con->out_msg_pos.did_page_crc = false;
con->out_more = 1; /* data + footer will follow */
} else {
/* no, queue up footer too and be done */ /* no, queue up footer too and be done */
prepare_write_message_footer(con); prepare_write_message_footer(con);
}
set_bit(WRITE_PENDING, &con->state); set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
} }
/* /*
...@@ -630,16 +809,16 @@ static void prepare_write_ack(struct ceph_connection *con) ...@@ -630,16 +809,16 @@ static void prepare_write_ack(struct ceph_connection *con)
con->in_seq_acked, con->in_seq); con->in_seq_acked, con->in_seq);
con->in_seq_acked = con->in_seq; con->in_seq_acked = con->in_seq;
ceph_con_out_kvec_reset(con); con_out_kvec_reset(con);
ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
con->out_temp_ack = cpu_to_le64(con->in_seq_acked); con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), con_out_kvec_add(con, sizeof (con->out_temp_ack),
&con->out_temp_ack); &con->out_temp_ack);
con->out_more = 1; /* more will follow.. eventually.. */ con->out_more = 1; /* more will follow.. eventually.. */
set_bit(WRITE_PENDING, &con->state); set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
} }
/* /*
...@@ -648,9 +827,9 @@ static void prepare_write_ack(struct ceph_connection *con) ...@@ -648,9 +827,9 @@ static void prepare_write_ack(struct ceph_connection *con)
static void prepare_write_keepalive(struct ceph_connection *con) static void prepare_write_keepalive(struct ceph_connection *con)
{ {
dout("prepare_write_keepalive %p\n", con); dout("prepare_write_keepalive %p\n", con);
ceph_con_out_kvec_reset(con); con_out_kvec_reset(con);
ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
set_bit(WRITE_PENDING, &con->state); set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
} }
/* /*
...@@ -665,27 +844,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection ...@@ -665,27 +844,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
if (!con->ops->get_authorizer) { if (!con->ops->get_authorizer) {
con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
con->out_connect.authorizer_len = 0; con->out_connect.authorizer_len = 0;
return NULL; return NULL;
} }
/* Can't hold the mutex while getting authorizer */ /* Can't hold the mutex while getting authorizer */
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (IS_ERR(auth)) if (IS_ERR(auth))
return auth; return auth;
if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) if (con->state != CON_STATE_NEGOTIATING)
return ERR_PTR(-EAGAIN); return ERR_PTR(-EAGAIN);
con->auth_reply_buf = auth->authorizer_reply_buf; con->auth_reply_buf = auth->authorizer_reply_buf;
con->auth_reply_buf_len = auth->authorizer_reply_buf_len; con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
return auth; return auth;
} }
...@@ -694,12 +867,12 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection ...@@ -694,12 +867,12 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
*/ */
static void prepare_write_banner(struct ceph_connection *con) static void prepare_write_banner(struct ceph_connection *con)
{ {
ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
&con->msgr->my_enc_addr); &con->msgr->my_enc_addr);
con->out_more = 0; con->out_more = 0;
set_bit(WRITE_PENDING, &con->state); set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
} }
static int prepare_write_connect(struct ceph_connection *con) static int prepare_write_connect(struct ceph_connection *con)
...@@ -742,14 +915,15 @@ static int prepare_write_connect(struct ceph_connection *con) ...@@ -742,14 +915,15 @@ static int prepare_write_connect(struct ceph_connection *con)
con->out_connect.authorizer_len = auth ? con->out_connect.authorizer_len = auth ?
cpu_to_le32(auth->authorizer_buf_len) : 0; cpu_to_le32(auth->authorizer_buf_len) : 0;
ceph_con_out_kvec_add(con, sizeof (con->out_connect), con_out_kvec_reset(con);
con_out_kvec_add(con, sizeof (con->out_connect),
&con->out_connect); &con->out_connect);
if (auth && auth->authorizer_buf_len) if (auth && auth->authorizer_buf_len)
ceph_con_out_kvec_add(con, auth->authorizer_buf_len, con_out_kvec_add(con, auth->authorizer_buf_len,
auth->authorizer_buf); auth->authorizer_buf);
con->out_more = 0; con->out_more = 0;
set_bit(WRITE_PENDING, &con->state); set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
return 0; return 0;
} }
...@@ -797,30 +971,34 @@ static int write_partial_kvec(struct ceph_connection *con) ...@@ -797,30 +971,34 @@ static int write_partial_kvec(struct ceph_connection *con)
return ret; /* done! */ return ret; /* done! */
} }
#ifdef CONFIG_BLOCK static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) size_t len, size_t sent, bool in_trail)
{ {
if (!bio) { struct ceph_msg *msg = con->out_msg;
*iter = NULL;
*seg = 0;
return;
}
*iter = bio;
*seg = bio->bi_idx;
}
static void iter_bio_next(struct bio **bio_iter, int *seg) BUG_ON(!msg);
{ BUG_ON(!sent);
if (*bio_iter == NULL)
return;
BUG_ON(*seg >= (*bio_iter)->bi_vcnt); con->out_msg_pos.data_pos += sent;
con->out_msg_pos.page_pos += sent;
if (sent < len)
return;
(*seg)++; BUG_ON(sent != len);
if (*seg == (*bio_iter)->bi_vcnt) con->out_msg_pos.page_pos = 0;
init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); con->out_msg_pos.page++;
} con->out_msg_pos.did_page_crc = false;
if (in_trail)
list_move_tail(&page->lru,
&msg->trail->head);
else if (msg->pagelist)
list_move_tail(&page->lru,
&msg->pagelist->head);
#ifdef CONFIG_BLOCK
else if (msg->bio)
iter_bio_next(&msg->bio_iter, &msg->bio_seg);
#endif #endif
}
/* /*
* Write as much message data payload as we can. If we finish, queue * Write as much message data payload as we can. If we finish, queue
...@@ -837,41 +1015,36 @@ static int write_partial_msg_pages(struct ceph_connection *con) ...@@ -837,41 +1015,36 @@ static int write_partial_msg_pages(struct ceph_connection *con)
bool do_datacrc = !con->msgr->nocrc; bool do_datacrc = !con->msgr->nocrc;
int ret; int ret;
int total_max_write; int total_max_write;
int in_trail = 0; bool in_trail = false;
size_t trail_len = (msg->trail ? msg->trail->length : 0); const size_t trail_len = (msg->trail ? msg->trail->length : 0);
const size_t trail_off = data_len - trail_len;
dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, con, msg, con->out_msg_pos.page, msg->nr_pages,
con->out_msg_pos.page_pos); con->out_msg_pos.page_pos);
#ifdef CONFIG_BLOCK /*
if (msg->bio && !msg->bio_iter) * Iterate through each page that contains data to be
init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); * written, and send as much as possible for each.
#endif *
* If we are calculating the data crc (the default), we will
* need to map the page. If we have no pages, they have
* been revoked, so use the zero page.
*/
while (data_len > con->out_msg_pos.data_pos) { while (data_len > con->out_msg_pos.data_pos) {
struct page *page = NULL; struct page *page = NULL;
int max_write = PAGE_SIZE; int max_write = PAGE_SIZE;
int bio_offset = 0; int bio_offset = 0;
total_max_write = data_len - trail_len - in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off;
con->out_msg_pos.data_pos; if (!in_trail)
total_max_write = trail_off - con->out_msg_pos.data_pos;
/*
* if we are calculating the data crc (the default), we need
* to map the page. if our pages[] has been revoked, use the
* zero page.
*/
/* have we reached the trail part of the data? */
if (con->out_msg_pos.data_pos >= data_len - trail_len) {
in_trail = 1;
if (in_trail) {
total_max_write = data_len - con->out_msg_pos.data_pos; total_max_write = data_len - con->out_msg_pos.data_pos;
page = list_first_entry(&msg->trail->head, page = list_first_entry(&msg->trail->head,
struct page, lru); struct page, lru);
max_write = PAGE_SIZE;
} else if (msg->pages) { } else if (msg->pages) {
page = msg->pages[con->out_msg_pos.page]; page = msg->pages[con->out_msg_pos.page];
} else if (msg->pagelist) { } else if (msg->pagelist) {
...@@ -894,15 +1067,14 @@ static int write_partial_msg_pages(struct ceph_connection *con) ...@@ -894,15 +1067,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
if (do_datacrc && !con->out_msg_pos.did_page_crc) { if (do_datacrc && !con->out_msg_pos.did_page_crc) {
void *base; void *base;
u32 crc; u32 crc = le32_to_cpu(msg->footer.data_crc);
u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
char *kaddr; char *kaddr;
kaddr = kmap(page); kaddr = kmap(page);
BUG_ON(kaddr == NULL); BUG_ON(kaddr == NULL);
base = kaddr + con->out_msg_pos.page_pos + bio_offset; base = kaddr + con->out_msg_pos.page_pos + bio_offset;
crc = crc32c(tmpcrc, base, len); crc = crc32c(crc, base, len);
con->out_msg->footer.data_crc = cpu_to_le32(crc); msg->footer.data_crc = cpu_to_le32(crc);
con->out_msg_pos.did_page_crc = true; con->out_msg_pos.did_page_crc = true;
} }
ret = ceph_tcp_sendpage(con->sock, page, ret = ceph_tcp_sendpage(con->sock, page,
...@@ -915,31 +1087,15 @@ static int write_partial_msg_pages(struct ceph_connection *con) ...@@ -915,31 +1087,15 @@ static int write_partial_msg_pages(struct ceph_connection *con)
if (ret <= 0) if (ret <= 0)
goto out; goto out;
con->out_msg_pos.data_pos += ret; out_msg_pos_next(con, page, len, (size_t) ret, in_trail);
con->out_msg_pos.page_pos += ret;
if (ret == len) {
con->out_msg_pos.page_pos = 0;
con->out_msg_pos.page++;
con->out_msg_pos.did_page_crc = false;
if (in_trail)
list_move_tail(&page->lru,
&msg->trail->head);
else if (msg->pagelist)
list_move_tail(&page->lru,
&msg->pagelist->head);
#ifdef CONFIG_BLOCK
else if (msg->bio)
iter_bio_next(&msg->bio_iter, &msg->bio_seg);
#endif
}
} }
dout("write_partial_msg_pages %p msg %p done\n", con, msg); dout("write_partial_msg_pages %p msg %p done\n", con, msg);
/* prepare and queue up footer, too */ /* prepare and queue up footer, too */
if (!do_datacrc) if (!do_datacrc)
con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
ceph_con_out_kvec_reset(con); con_out_kvec_reset(con);
prepare_write_message_footer(con); prepare_write_message_footer(con);
ret = 1; ret = 1;
out: out:
...@@ -1351,20 +1507,14 @@ static int process_banner(struct ceph_connection *con) ...@@ -1351,20 +1507,14 @@ static int process_banner(struct ceph_connection *con)
ceph_pr_addr(&con->msgr->inst.addr.in_addr)); ceph_pr_addr(&con->msgr->inst.addr.in_addr));
} }
set_bit(NEGOTIATING, &con->state);
prepare_read_connect(con);
return 0; return 0;
} }
static void fail_protocol(struct ceph_connection *con) static void fail_protocol(struct ceph_connection *con)
{ {
reset_connection(con); reset_connection(con);
set_bit(CLOSED, &con->state); /* in case there's queued work */ BUG_ON(con->state != CON_STATE_NEGOTIATING);
con->state = CON_STATE_CLOSED;
mutex_unlock(&con->mutex);
if (con->ops->bad_proto)
con->ops->bad_proto(con);
mutex_lock(&con->mutex);
} }
static int process_connect(struct ceph_connection *con) static int process_connect(struct ceph_connection *con)
...@@ -1407,7 +1557,6 @@ static int process_connect(struct ceph_connection *con) ...@@ -1407,7 +1557,6 @@ static int process_connect(struct ceph_connection *con)
return -1; return -1;
} }
con->auth_retry = 1; con->auth_retry = 1;
ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con); ret = prepare_write_connect(con);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -1428,7 +1577,6 @@ static int process_connect(struct ceph_connection *con) ...@@ -1428,7 +1577,6 @@ static int process_connect(struct ceph_connection *con)
ENTITY_NAME(con->peer_name), ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr.in_addr)); ceph_pr_addr(&con->peer_addr.in_addr));
reset_connection(con); reset_connection(con);
ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con); ret = prepare_write_connect(con);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -1440,8 +1588,7 @@ static int process_connect(struct ceph_connection *con) ...@@ -1440,8 +1588,7 @@ static int process_connect(struct ceph_connection *con)
if (con->ops->peer_reset) if (con->ops->peer_reset)
con->ops->peer_reset(con); con->ops->peer_reset(con);
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (test_bit(CLOSED, &con->state) || if (con->state != CON_STATE_NEGOTIATING)
test_bit(OPENING, &con->state))
return -EAGAIN; return -EAGAIN;
break; break;
...@@ -1454,7 +1601,6 @@ static int process_connect(struct ceph_connection *con) ...@@ -1454,7 +1601,6 @@ static int process_connect(struct ceph_connection *con)
le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->out_connect.connect_seq),
le32_to_cpu(con->in_reply.connect_seq)); le32_to_cpu(con->in_reply.connect_seq));
con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con); ret = prepare_write_connect(con);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -1471,7 +1617,6 @@ static int process_connect(struct ceph_connection *con) ...@@ -1471,7 +1617,6 @@ static int process_connect(struct ceph_connection *con)
le32_to_cpu(con->in_reply.global_seq)); le32_to_cpu(con->in_reply.global_seq));
get_global_seq(con->msgr, get_global_seq(con->msgr,
le32_to_cpu(con->in_reply.global_seq)); le32_to_cpu(con->in_reply.global_seq));
ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con); ret = prepare_write_connect(con);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -1489,7 +1634,10 @@ static int process_connect(struct ceph_connection *con) ...@@ -1489,7 +1634,10 @@ static int process_connect(struct ceph_connection *con)
fail_protocol(con); fail_protocol(con);
return -1; return -1;
} }
clear_bit(CONNECTING, &con->state);
BUG_ON(con->state != CON_STATE_NEGOTIATING);
con->state = CON_STATE_OPEN;
con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
con->connect_seq++; con->connect_seq++;
con->peer_features = server_feat; con->peer_features = server_feat;
...@@ -1501,7 +1649,9 @@ static int process_connect(struct ceph_connection *con) ...@@ -1501,7 +1649,9 @@ static int process_connect(struct ceph_connection *con)
le32_to_cpu(con->in_reply.connect_seq)); le32_to_cpu(con->in_reply.connect_seq));
if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
set_bit(LOSSYTX, &con->state); set_bit(CON_FLAG_LOSSYTX, &con->flags);
con->delay = 0; /* reset backoff memory */
prepare_read_tag(con); prepare_read_tag(con);
break; break;
...@@ -1587,10 +1737,7 @@ static int read_partial_message_section(struct ceph_connection *con, ...@@ -1587,10 +1737,7 @@ static int read_partial_message_section(struct ceph_connection *con,
return 1; return 1;
} }
static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
struct ceph_msg_header *hdr,
int *skip);
static int read_partial_message_pages(struct ceph_connection *con, static int read_partial_message_pages(struct ceph_connection *con,
struct page **pages, struct page **pages,
...@@ -1633,9 +1780,6 @@ static int read_partial_message_bio(struct ceph_connection *con, ...@@ -1633,9 +1780,6 @@ static int read_partial_message_bio(struct ceph_connection *con,
void *p; void *p;
int ret, left; int ret, left;
if (IS_ERR(bv))
return PTR_ERR(bv);
left = min((int)(data_len - con->in_msg_pos.data_pos), left = min((int)(data_len - con->in_msg_pos.data_pos),
(int)(bv->bv_len - con->in_msg_pos.page_pos)); (int)(bv->bv_len - con->in_msg_pos.page_pos));
...@@ -1672,7 +1816,6 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1672,7 +1816,6 @@ static int read_partial_message(struct ceph_connection *con)
int ret; int ret;
unsigned int front_len, middle_len, data_len; unsigned int front_len, middle_len, data_len;
bool do_datacrc = !con->msgr->nocrc; bool do_datacrc = !con->msgr->nocrc;
int skip;
u64 seq; u64 seq;
u32 crc; u32 crc;
...@@ -1723,10 +1866,13 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1723,10 +1866,13 @@ static int read_partial_message(struct ceph_connection *con)
/* allocate message? */ /* allocate message? */
if (!con->in_msg) { if (!con->in_msg) {
int skip = 0;
dout("got hdr type %d front %d data %d\n", con->in_hdr.type, dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
con->in_hdr.front_len, con->in_hdr.data_len); con->in_hdr.front_len, con->in_hdr.data_len);
skip = 0; ret = ceph_con_in_msg_alloc(con, &skip);
con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (ret < 0)
return ret;
if (skip) { if (skip) {
/* skip this message */ /* skip this message */
dout("alloc_msg said skip message\n"); dout("alloc_msg said skip message\n");
...@@ -1737,11 +1883,9 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1737,11 +1883,9 @@ static int read_partial_message(struct ceph_connection *con)
con->in_seq++; con->in_seq++;
return 0; return 0;
} }
if (!con->in_msg) {
con->error_msg = BUG_ON(!con->in_msg);
"error allocating memory for incoming message"; BUG_ON(con->in_msg->con != con);
return -ENOMEM;
}
m = con->in_msg; m = con->in_msg;
m->front.iov_len = 0; /* haven't read it yet */ m->front.iov_len = 0; /* haven't read it yet */
if (m->middle) if (m->middle)
...@@ -1753,6 +1897,11 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1753,6 +1897,11 @@ static int read_partial_message(struct ceph_connection *con)
else else
con->in_msg_pos.page_pos = 0; con->in_msg_pos.page_pos = 0;
con->in_msg_pos.data_pos = 0; con->in_msg_pos.data_pos = 0;
#ifdef CONFIG_BLOCK
if (m->bio)
init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
#endif
} }
/* front */ /* front */
...@@ -1769,10 +1918,6 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1769,10 +1918,6 @@ static int read_partial_message(struct ceph_connection *con)
if (ret <= 0) if (ret <= 0)
return ret; return ret;
} }
#ifdef CONFIG_BLOCK
if (m->bio && !m->bio_iter)
init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
#endif
/* (page) data */ /* (page) data */
while (con->in_msg_pos.data_pos < data_len) { while (con->in_msg_pos.data_pos < data_len) {
...@@ -1783,7 +1928,7 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -1783,7 +1928,7 @@ static int read_partial_message(struct ceph_connection *con)
return ret; return ret;
#ifdef CONFIG_BLOCK #ifdef CONFIG_BLOCK
} else if (m->bio) { } else if (m->bio) {
BUG_ON(!m->bio_iter);
ret = read_partial_message_bio(con, ret = read_partial_message_bio(con,
&m->bio_iter, &m->bio_seg, &m->bio_iter, &m->bio_seg,
data_len, do_datacrc); data_len, do_datacrc);
...@@ -1837,8 +1982,11 @@ static void process_message(struct ceph_connection *con) ...@@ -1837,8 +1982,11 @@ static void process_message(struct ceph_connection *con)
{ {
struct ceph_msg *msg; struct ceph_msg *msg;
BUG_ON(con->in_msg->con != con);
con->in_msg->con = NULL;
msg = con->in_msg; msg = con->in_msg;
con->in_msg = NULL; con->in_msg = NULL;
con->ops->put(con);
/* if first message, set peer_name */ /* if first message, set peer_name */
if (con->peer_name.type == 0) if (con->peer_name.type == 0)
...@@ -1858,7 +2006,6 @@ static void process_message(struct ceph_connection *con) ...@@ -1858,7 +2006,6 @@ static void process_message(struct ceph_connection *con)
con->ops->dispatch(con, msg); con->ops->dispatch(con, msg);
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
prepare_read_tag(con);
} }
...@@ -1870,22 +2017,19 @@ static int try_write(struct ceph_connection *con) ...@@ -1870,22 +2017,19 @@ static int try_write(struct ceph_connection *con)
{ {
int ret = 1; int ret = 1;
dout("try_write start %p state %lu nref %d\n", con, con->state, dout("try_write start %p state %lu\n", con, con->state);
atomic_read(&con->nref));
more: more:
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
/* open the socket first? */ /* open the socket first? */
if (con->sock == NULL) { if (con->state == CON_STATE_PREOPEN) {
ceph_con_out_kvec_reset(con); BUG_ON(con->sock);
con->state = CON_STATE_CONNECTING;
con_out_kvec_reset(con);
prepare_write_banner(con); prepare_write_banner(con);
ret = prepare_write_connect(con);
if (ret < 0)
goto out;
prepare_read_banner(con); prepare_read_banner(con);
set_bit(CONNECTING, &con->state);
clear_bit(NEGOTIATING, &con->state);
BUG_ON(con->in_msg); BUG_ON(con->in_msg);
con->in_tag = CEPH_MSGR_TAG_READY; con->in_tag = CEPH_MSGR_TAG_READY;
...@@ -1932,7 +2076,7 @@ static int try_write(struct ceph_connection *con) ...@@ -1932,7 +2076,7 @@ static int try_write(struct ceph_connection *con)
} }
do_next: do_next:
if (!test_bit(CONNECTING, &con->state)) { if (con->state == CON_STATE_OPEN) {
/* is anything else pending? */ /* is anything else pending? */
if (!list_empty(&con->out_queue)) { if (!list_empty(&con->out_queue)) {
prepare_write_message(con); prepare_write_message(con);
...@@ -1942,14 +2086,15 @@ static int try_write(struct ceph_connection *con) ...@@ -1942,14 +2086,15 @@ static int try_write(struct ceph_connection *con)
prepare_write_ack(con); prepare_write_ack(con);
goto more; goto more;
} }
if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING,
&con->flags)) {
prepare_write_keepalive(con); prepare_write_keepalive(con);
goto more; goto more;
} }
} }
/* Nothing to do! */ /* Nothing to do! */
clear_bit(WRITE_PENDING, &con->state); clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
dout("try_write nothing else to write.\n"); dout("try_write nothing else to write.\n");
ret = 0; ret = 0;
out: out:
...@@ -1966,38 +2111,42 @@ static int try_read(struct ceph_connection *con) ...@@ -1966,38 +2111,42 @@ static int try_read(struct ceph_connection *con)
{ {
int ret = -1; int ret = -1;
if (!con->sock) more:
return 0; dout("try_read start on %p state %lu\n", con, con->state);
if (con->state != CON_STATE_CONNECTING &&
if (test_bit(STANDBY, &con->state)) con->state != CON_STATE_NEGOTIATING &&
con->state != CON_STATE_OPEN)
return 0; return 0;
dout("try_read start on %p\n", con); BUG_ON(!con->sock);
more:
dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
con->in_base_pos); con->in_base_pos);
/* if (con->state == CON_STATE_CONNECTING) {
* process_connect and process_message drop and re-take dout("try_read connecting\n");
* con->mutex. make sure we handle a racing close or reopen. ret = read_partial_banner(con);
*/ if (ret <= 0)
if (test_bit(CLOSED, &con->state) || goto out;
test_bit(OPENING, &con->state)) { ret = process_banner(con);
ret = -EAGAIN; if (ret < 0)
goto out;
BUG_ON(con->state != CON_STATE_CONNECTING);
con->state = CON_STATE_NEGOTIATING;
/* Banner is good, exchange connection info */
ret = prepare_write_connect(con);
if (ret < 0)
goto out;
prepare_read_connect(con);
/* Send connection info before awaiting response */
goto out; goto out;
} }
if (test_bit(CONNECTING, &con->state)) { if (con->state == CON_STATE_NEGOTIATING) {
if (!test_bit(NEGOTIATING, &con->state)) { dout("try_read negotiating\n");
dout("try_read connecting\n");
ret = read_partial_banner(con);
if (ret <= 0)
goto out;
ret = process_banner(con);
if (ret < 0)
goto out;
}
ret = read_partial_connect(con); ret = read_partial_connect(con);
if (ret <= 0) if (ret <= 0)
goto out; goto out;
...@@ -2007,6 +2156,8 @@ static int try_read(struct ceph_connection *con) ...@@ -2007,6 +2156,8 @@ static int try_read(struct ceph_connection *con)
goto more; goto more;
} }
BUG_ON(con->state != CON_STATE_OPEN);
if (con->in_base_pos < 0) { if (con->in_base_pos < 0) {
/* /*
* skipping + discarding content. * skipping + discarding content.
...@@ -2040,7 +2191,8 @@ static int try_read(struct ceph_connection *con) ...@@ -2040,7 +2191,8 @@ static int try_read(struct ceph_connection *con)
prepare_read_ack(con); prepare_read_ack(con);
break; break;
case CEPH_MSGR_TAG_CLOSE: case CEPH_MSGR_TAG_CLOSE:
set_bit(CLOSED, &con->state); /* fixme */ con_close_socket(con);
con->state = CON_STATE_CLOSED;
goto out; goto out;
default: default:
goto bad_tag; goto bad_tag;
...@@ -2063,6 +2215,8 @@ static int try_read(struct ceph_connection *con) ...@@ -2063,6 +2215,8 @@ static int try_read(struct ceph_connection *con)
if (con->in_tag == CEPH_MSGR_TAG_READY) if (con->in_tag == CEPH_MSGR_TAG_READY)
goto more; goto more;
process_message(con); process_message(con);
if (con->state == CON_STATE_OPEN)
prepare_read_tag(con);
goto more; goto more;
} }
if (con->in_tag == CEPH_MSGR_TAG_ACK) { if (con->in_tag == CEPH_MSGR_TAG_ACK) {
...@@ -2091,12 +2245,6 @@ static int try_read(struct ceph_connection *con) ...@@ -2091,12 +2245,6 @@ static int try_read(struct ceph_connection *con)
*/ */
static void queue_con(struct ceph_connection *con) static void queue_con(struct ceph_connection *con)
{ {
if (test_bit(DEAD, &con->state)) {
dout("queue_con %p ignoring: DEAD\n",
con);
return;
}
if (!con->ops->get(con)) { if (!con->ops->get(con)) {
dout("queue_con %p ref count 0\n", con); dout("queue_con %p ref count 0\n", con);
return; return;
...@@ -2121,7 +2269,26 @@ static void con_work(struct work_struct *work) ...@@ -2121,7 +2269,26 @@ static void con_work(struct work_struct *work)
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
restart: restart:
if (test_and_clear_bit(BACKOFF, &con->state)) { if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
switch (con->state) {
case CON_STATE_CONNECTING:
con->error_msg = "connection failed";
break;
case CON_STATE_NEGOTIATING:
con->error_msg = "negotiation failed";
break;
case CON_STATE_OPEN:
con->error_msg = "socket closed";
break;
default:
dout("unrecognized con state %d\n", (int)con->state);
con->error_msg = "unrecognized con state";
BUG();
}
goto fault;
}
if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
dout("con_work %p backing off\n", con); dout("con_work %p backing off\n", con);
if (queue_delayed_work(ceph_msgr_wq, &con->work, if (queue_delayed_work(ceph_msgr_wq, &con->work,
round_jiffies_relative(con->delay))) { round_jiffies_relative(con->delay))) {
...@@ -2135,35 +2302,35 @@ static void con_work(struct work_struct *work) ...@@ -2135,35 +2302,35 @@ static void con_work(struct work_struct *work)
} }
} }
if (test_bit(STANDBY, &con->state)) { if (con->state == CON_STATE_STANDBY) {
dout("con_work %p STANDBY\n", con); dout("con_work %p STANDBY\n", con);
goto done; goto done;
} }
if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ if (con->state == CON_STATE_CLOSED) {
dout("con_work CLOSED\n"); dout("con_work %p CLOSED\n", con);
con_close_socket(con); BUG_ON(con->sock);
goto done; goto done;
} }
if (test_and_clear_bit(OPENING, &con->state)) { if (con->state == CON_STATE_PREOPEN) {
/* reopen w/ new peer */
dout("con_work OPENING\n"); dout("con_work OPENING\n");
con_close_socket(con); BUG_ON(con->sock);
} }
if (test_and_clear_bit(SOCK_CLOSED, &con->state))
goto fault;
ret = try_read(con); ret = try_read(con);
if (ret == -EAGAIN) if (ret == -EAGAIN)
goto restart; goto restart;
if (ret < 0) if (ret < 0) {
con->error_msg = "socket error on read";
goto fault; goto fault;
}
ret = try_write(con); ret = try_write(con);
if (ret == -EAGAIN) if (ret == -EAGAIN)
goto restart; goto restart;
if (ret < 0) if (ret < 0) {
con->error_msg = "socket error on write";
goto fault; goto fault;
}
done: done:
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
...@@ -2172,7 +2339,6 @@ static void con_work(struct work_struct *work) ...@@ -2172,7 +2339,6 @@ static void con_work(struct work_struct *work)
return; return;
fault: fault:
mutex_unlock(&con->mutex);
ceph_fault(con); /* error/fault path */ ceph_fault(con); /* error/fault path */
goto done_unlocked; goto done_unlocked;
} }
...@@ -2183,26 +2349,31 @@ static void con_work(struct work_struct *work) ...@@ -2183,26 +2349,31 @@ static void con_work(struct work_struct *work)
* exponential backoff * exponential backoff
*/ */
static void ceph_fault(struct ceph_connection *con) static void ceph_fault(struct ceph_connection *con)
__releases(con->mutex)
{ {
pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
dout("fault %p state %lu to peer %s\n", dout("fault %p state %lu to peer %s\n",
con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
if (test_bit(LOSSYTX, &con->state)) { BUG_ON(con->state != CON_STATE_CONNECTING &&
dout("fault on LOSSYTX channel\n"); con->state != CON_STATE_NEGOTIATING &&
goto out; con->state != CON_STATE_OPEN);
}
mutex_lock(&con->mutex);
if (test_bit(CLOSED, &con->state))
goto out_unlock;
con_close_socket(con); con_close_socket(con);
if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) {
dout("fault on LOSSYTX channel, marking CLOSED\n");
con->state = CON_STATE_CLOSED;
goto out_unlock;
}
if (con->in_msg) { if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
con->in_msg->con = NULL;
ceph_msg_put(con->in_msg); ceph_msg_put(con->in_msg);
con->in_msg = NULL; con->in_msg = NULL;
con->ops->put(con);
} }
/* Requeue anything that hasn't been acked */ /* Requeue anything that hasn't been acked */
...@@ -2211,12 +2382,13 @@ static void ceph_fault(struct ceph_connection *con) ...@@ -2211,12 +2382,13 @@ static void ceph_fault(struct ceph_connection *con)
/* If there are no messages queued or keepalive pending, place /* If there are no messages queued or keepalive pending, place
* the connection in a STANDBY state */ * the connection in a STANDBY state */
if (list_empty(&con->out_queue) && if (list_empty(&con->out_queue) &&
!test_bit(KEEPALIVE_PENDING, &con->state)) { !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) {
dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
clear_bit(WRITE_PENDING, &con->state); clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
set_bit(STANDBY, &con->state); con->state = CON_STATE_STANDBY;
} else { } else {
/* retry after a delay. */ /* retry after a delay. */
con->state = CON_STATE_PREOPEN;
if (con->delay == 0) if (con->delay == 0)
con->delay = BASE_DELAY_INTERVAL; con->delay = BASE_DELAY_INTERVAL;
else if (con->delay < MAX_DELAY_INTERVAL) else if (con->delay < MAX_DELAY_INTERVAL)
...@@ -2237,13 +2409,12 @@ static void ceph_fault(struct ceph_connection *con) ...@@ -2237,13 +2409,12 @@ static void ceph_fault(struct ceph_connection *con)
* that when con_work restarts we schedule the * that when con_work restarts we schedule the
* delay then. * delay then.
*/ */
set_bit(BACKOFF, &con->state); set_bit(CON_FLAG_BACKOFF, &con->flags);
} }
} }
out_unlock: out_unlock:
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
out:
/* /*
* in case we faulted due to authentication, invalidate our * in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones. * current tickets so that we can get new ones.
...@@ -2260,18 +2431,14 @@ static void ceph_fault(struct ceph_connection *con) ...@@ -2260,18 +2431,14 @@ static void ceph_fault(struct ceph_connection *con)
/* /*
* create a new messenger instance * initialize a new messenger instance
*/ */
struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, void ceph_messenger_init(struct ceph_messenger *msgr,
u32 supported_features, struct ceph_entity_addr *myaddr,
u32 required_features) u32 supported_features,
u32 required_features,
bool nocrc)
{ {
struct ceph_messenger *msgr;
msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
if (msgr == NULL)
return ERR_PTR(-ENOMEM);
msgr->supported_features = supported_features; msgr->supported_features = supported_features;
msgr->required_features = required_features; msgr->required_features = required_features;
...@@ -2284,30 +2451,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, ...@@ -2284,30 +2451,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
msgr->inst.addr.type = 0; msgr->inst.addr.type = 0;
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
encode_my_addr(msgr); encode_my_addr(msgr);
msgr->nocrc = nocrc;
dout("messenger_create %p\n", msgr); atomic_set(&msgr->stopping, 0);
return msgr;
}
EXPORT_SYMBOL(ceph_messenger_create);
void ceph_messenger_destroy(struct ceph_messenger *msgr) dout("%s %p\n", __func__, msgr);
{
dout("destroy %p\n", msgr);
kfree(msgr);
dout("destroyed messenger %p\n", msgr);
} }
EXPORT_SYMBOL(ceph_messenger_destroy); EXPORT_SYMBOL(ceph_messenger_init);
static void clear_standby(struct ceph_connection *con) static void clear_standby(struct ceph_connection *con)
{ {
/* come back from STANDBY? */ /* come back from STANDBY? */
if (test_and_clear_bit(STANDBY, &con->state)) { if (con->state == CON_STATE_STANDBY) {
mutex_lock(&con->mutex);
dout("clear_standby %p and ++connect_seq\n", con); dout("clear_standby %p and ++connect_seq\n", con);
con->state = CON_STATE_PREOPEN;
con->connect_seq++; con->connect_seq++;
WARN_ON(test_bit(WRITE_PENDING, &con->state)); WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags));
WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags));
mutex_unlock(&con->mutex);
} }
} }
...@@ -2316,21 +2476,24 @@ static void clear_standby(struct ceph_connection *con) ...@@ -2316,21 +2476,24 @@ static void clear_standby(struct ceph_connection *con)
*/ */
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{ {
if (test_bit(CLOSED, &con->state)) {
dout("con_send %p closed, dropping %p\n", con, msg);
ceph_msg_put(msg);
return;
}
/* set src+dst */ /* set src+dst */
msg->hdr.src = con->msgr->inst.name; msg->hdr.src = con->msgr->inst.name;
BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
msg->needs_out_seq = true; msg->needs_out_seq = true;
/* queue */
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (con->state == CON_STATE_CLOSED) {
dout("con_send %p closed, dropping %p\n", con, msg);
ceph_msg_put(msg);
mutex_unlock(&con->mutex);
return;
}
BUG_ON(msg->con != NULL);
msg->con = con->ops->get(con);
BUG_ON(msg->con == NULL);
BUG_ON(!list_empty(&msg->list_head)); BUG_ON(!list_empty(&msg->list_head));
list_add_tail(&msg->list_head, &con->out_queue); list_add_tail(&msg->list_head, &con->out_queue);
dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
...@@ -2339,12 +2502,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -2339,12 +2502,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.front_len),
le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.middle_len),
le32_to_cpu(msg->hdr.data_len)); le32_to_cpu(msg->hdr.data_len));
clear_standby(con);
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
/* if there wasn't anything waiting to send before, queue /* if there wasn't anything waiting to send before, queue
* new work */ * new work */
clear_standby(con); if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
queue_con(con); queue_con(con);
} }
EXPORT_SYMBOL(ceph_con_send); EXPORT_SYMBOL(ceph_con_send);
...@@ -2352,24 +2516,34 @@ EXPORT_SYMBOL(ceph_con_send); ...@@ -2352,24 +2516,34 @@ EXPORT_SYMBOL(ceph_con_send);
/* /*
* Revoke a message that was previously queued for send * Revoke a message that was previously queued for send
*/ */
void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) void ceph_msg_revoke(struct ceph_msg *msg)
{ {
struct ceph_connection *con = msg->con;
if (!con)
return; /* Message not in our possession */
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) { if (!list_empty(&msg->list_head)) {
dout("con_revoke %p msg %p - was on queue\n", con, msg); dout("%s %p msg %p - was on queue\n", __func__, con, msg);
list_del_init(&msg->list_head); list_del_init(&msg->list_head);
ceph_msg_put(msg); BUG_ON(msg->con == NULL);
msg->con->ops->put(msg->con);
msg->con = NULL;
msg->hdr.seq = 0; msg->hdr.seq = 0;
ceph_msg_put(msg);
} }
if (con->out_msg == msg) { if (con->out_msg == msg) {
dout("con_revoke %p msg %p - was sending\n", con, msg); dout("%s %p msg %p - was sending\n", __func__, con, msg);
con->out_msg = NULL; con->out_msg = NULL;
if (con->out_kvec_is_msg) { if (con->out_kvec_is_msg) {
con->out_skip = con->out_kvec_bytes; con->out_skip = con->out_kvec_bytes;
con->out_kvec_is_msg = false; con->out_kvec_is_msg = false;
} }
ceph_msg_put(msg);
msg->hdr.seq = 0; msg->hdr.seq = 0;
ceph_msg_put(msg);
} }
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
} }
...@@ -2377,17 +2551,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -2377,17 +2551,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
/* /*
* Revoke a message that we may be reading data into * Revoke a message that we may be reading data into
*/ */
void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) void ceph_msg_revoke_incoming(struct ceph_msg *msg)
{ {
struct ceph_connection *con;
BUG_ON(msg == NULL);
if (!msg->con) {
dout("%s msg %p null con\n", __func__, msg);
return; /* Message not in our possession */
}
con = msg->con;
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (con->in_msg && con->in_msg == msg) { if (con->in_msg == msg) {
unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
/* skip rest of message */ /* skip rest of message */
dout("con_revoke_pages %p msg %p revoked\n", con, msg); dout("%s %p msg %p revoked\n", __func__, con, msg);
con->in_base_pos = con->in_base_pos - con->in_base_pos = con->in_base_pos -
sizeof(struct ceph_msg_header) - sizeof(struct ceph_msg_header) -
front_len - front_len -
middle_len - middle_len -
...@@ -2398,8 +2582,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -2398,8 +2582,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
con->in_tag = CEPH_MSGR_TAG_READY; con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++; con->in_seq++;
} else { } else {
dout("con_revoke_pages %p msg %p pages %p no-op\n", dout("%s %p in_msg %p msg %p no-op\n",
con, con->in_msg, msg); __func__, con, con->in_msg, msg);
} }
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
} }
...@@ -2410,9 +2594,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -2410,9 +2594,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
void ceph_con_keepalive(struct ceph_connection *con) void ceph_con_keepalive(struct ceph_connection *con)
{ {
dout("con_keepalive %p\n", con); dout("con_keepalive %p\n", con);
mutex_lock(&con->mutex);
clear_standby(con); clear_standby(con);
if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && mutex_unlock(&con->mutex);
test_and_set_bit(WRITE_PENDING, &con->state) == 0) if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 &&
test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
queue_con(con); queue_con(con);
} }
EXPORT_SYMBOL(ceph_con_keepalive); EXPORT_SYMBOL(ceph_con_keepalive);
...@@ -2431,6 +2617,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, ...@@ -2431,6 +2617,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
if (m == NULL) if (m == NULL)
goto out; goto out;
kref_init(&m->kref); kref_init(&m->kref);
m->con = NULL;
INIT_LIST_HEAD(&m->list_head); INIT_LIST_HEAD(&m->list_head);
m->hdr.tid = 0; m->hdr.tid = 0;
...@@ -2526,46 +2714,77 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -2526,46 +2714,77 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
} }
/* /*
* Generic message allocator, for incoming messages. * Allocate a message for receiving an incoming message on a
* connection, and save the result in con->in_msg. Uses the
* connection's private alloc_msg op if available.
*
* Returns 0 on success, or a negative error code.
*
* On success, if we set *skip = 1:
* - the next message should be skipped and ignored.
* - con->in_msg == NULL
* or if we set *skip = 0:
* - con->in_msg is non-null.
* On error (ENOMEM, EAGAIN, ...),
* - con->in_msg == NULL
*/ */
static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
struct ceph_msg_header *hdr,
int *skip)
{ {
struct ceph_msg_header *hdr = &con->in_hdr;
int type = le16_to_cpu(hdr->type); int type = le16_to_cpu(hdr->type);
int front_len = le32_to_cpu(hdr->front_len); int front_len = le32_to_cpu(hdr->front_len);
int middle_len = le32_to_cpu(hdr->middle_len); int middle_len = le32_to_cpu(hdr->middle_len);
struct ceph_msg *msg = NULL; int ret = 0;
int ret;
BUG_ON(con->in_msg != NULL);
if (con->ops->alloc_msg) { if (con->ops->alloc_msg) {
struct ceph_msg *msg;
mutex_unlock(&con->mutex); mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip); msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex); mutex_lock(&con->mutex);
if (!msg || *skip) if (con->state != CON_STATE_OPEN) {
return NULL; ceph_msg_put(msg);
return -EAGAIN;
}
con->in_msg = msg;
if (con->in_msg) {
con->in_msg->con = con->ops->get(con);
BUG_ON(con->in_msg->con == NULL);
}
if (*skip) {
con->in_msg = NULL;
return 0;
}
if (!con->in_msg) {
con->error_msg =
"error allocating memory for incoming message";
return -ENOMEM;
}
} }
if (!msg) { if (!con->in_msg) {
*skip = 0; con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
msg = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!con->in_msg) {
if (!msg) {
pr_err("unable to allocate msg type %d len %d\n", pr_err("unable to allocate msg type %d len %d\n",
type, front_len); type, front_len);
return NULL; return -ENOMEM;
} }
msg->page_alignment = le16_to_cpu(hdr->data_off); con->in_msg->con = con->ops->get(con);
BUG_ON(con->in_msg->con == NULL);
con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
} }
memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
if (middle_len && !msg->middle) { if (middle_len && !con->in_msg->middle) {
ret = ceph_alloc_middle(con, msg); ret = ceph_alloc_middle(con, con->in_msg);
if (ret < 0) { if (ret < 0) {
ceph_msg_put(msg); ceph_msg_put(con->in_msg);
return NULL; con->in_msg = NULL;
} }
} }
return msg; return ret;
} }
......
...@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) ...@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
monc->pending_auth = 1; monc->pending_auth = 1;
monc->m_auth->front.iov_len = len; monc->m_auth->front.iov_len = len;
monc->m_auth->hdr.front_len = cpu_to_le32(len); monc->m_auth->hdr.front_len = cpu_to_le32(len);
ceph_con_revoke(monc->con, monc->m_auth); ceph_msg_revoke(monc->m_auth);
ceph_msg_get(monc->m_auth); /* keep our ref */ ceph_msg_get(monc->m_auth); /* keep our ref */
ceph_con_send(monc->con, monc->m_auth); ceph_con_send(&monc->con, monc->m_auth);
} }
/* /*
...@@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) ...@@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
static void __close_session(struct ceph_mon_client *monc) static void __close_session(struct ceph_mon_client *monc)
{ {
dout("__close_session closing mon%d\n", monc->cur_mon); dout("__close_session closing mon%d\n", monc->cur_mon);
ceph_con_revoke(monc->con, monc->m_auth); ceph_msg_revoke(monc->m_auth);
ceph_con_close(monc->con); ceph_msg_revoke_incoming(monc->m_auth_reply);
ceph_msg_revoke(monc->m_subscribe);
ceph_msg_revoke_incoming(monc->m_subscribe_ack);
ceph_con_close(&monc->con);
monc->cur_mon = -1; monc->cur_mon = -1;
monc->pending_auth = 0; monc->pending_auth = 0;
ceph_auth_reset(monc->auth); ceph_auth_reset(monc->auth);
...@@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc) ...@@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc)
monc->want_next_osdmap = !!monc->want_next_osdmap; monc->want_next_osdmap = !!monc->want_next_osdmap;
dout("open_session mon%d opening\n", monc->cur_mon); dout("open_session mon%d opening\n", monc->cur_mon);
monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; ceph_con_open(&monc->con,
monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); CEPH_ENTITY_TYPE_MON, monc->cur_mon,
ceph_con_open(monc->con,
&monc->monmap->mon_inst[monc->cur_mon].addr); &monc->monmap->mon_inst[monc->cur_mon].addr);
/* initiatiate authentication handshake */ /* initiatiate authentication handshake */
...@@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) ...@@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
msg->front.iov_len = p - msg->front.iov_base; msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_con_revoke(monc->con, msg); ceph_msg_revoke(msg);
ceph_con_send(monc->con, ceph_msg_get(msg)); ceph_con_send(&monc->con, ceph_msg_get(msg));
monc->sub_sent = jiffies | 1; /* never 0 */ monc->sub_sent = jiffies | 1; /* never 0 */
} }
...@@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, ...@@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
if (monc->hunting) { if (monc->hunting) {
pr_info("mon%d %s session established\n", pr_info("mon%d %s session established\n",
monc->cur_mon, monc->cur_mon,
ceph_pr_addr(&monc->con->peer_addr.in_addr)); ceph_pr_addr(&monc->con.peer_addr.in_addr));
monc->hunting = false; monc->hunting = false;
} }
dout("handle_subscribe_ack after %d seconds\n", seconds); dout("handle_subscribe_ack after %d seconds\n", seconds);
...@@ -439,6 +441,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, ...@@ -439,6 +441,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
m = NULL; m = NULL;
} else { } else {
dout("get_generic_reply %lld got %p\n", tid, req->reply); dout("get_generic_reply %lld got %p\n", tid, req->reply);
*skip = 0;
m = ceph_msg_get(req->reply); m = ceph_msg_get(req->reply);
/* /*
* we don't need to track the connection reading into * we don't need to track the connection reading into
...@@ -461,7 +464,7 @@ static int do_generic_request(struct ceph_mon_client *monc, ...@@ -461,7 +464,7 @@ static int do_generic_request(struct ceph_mon_client *monc,
req->request->hdr.tid = cpu_to_le64(req->tid); req->request->hdr.tid = cpu_to_le64(req->tid);
__insert_generic_request(monc, req); __insert_generic_request(monc, req);
monc->num_generic_requests++; monc->num_generic_requests++;
ceph_con_send(monc->con, ceph_msg_get(req->request)); ceph_con_send(&monc->con, ceph_msg_get(req->request));
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
err = wait_for_completion_interruptible(&req->completion); err = wait_for_completion_interruptible(&req->completion);
...@@ -684,8 +687,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc) ...@@ -684,8 +687,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
req = rb_entry(p, struct ceph_mon_generic_request, node); req = rb_entry(p, struct ceph_mon_generic_request, node);
ceph_con_revoke(monc->con, req->request); ceph_msg_revoke(req->request);
ceph_con_send(monc->con, ceph_msg_get(req->request)); ceph_msg_revoke_incoming(req->reply);
ceph_con_send(&monc->con, ceph_msg_get(req->request));
} }
} }
...@@ -705,7 +709,7 @@ static void delayed_work(struct work_struct *work) ...@@ -705,7 +709,7 @@ static void delayed_work(struct work_struct *work)
__close_session(monc); __close_session(monc);
__open_session(monc); /* continue hunting */ __open_session(monc); /* continue hunting */
} else { } else {
ceph_con_keepalive(monc->con); ceph_con_keepalive(&monc->con);
__validate_auth(monc); __validate_auth(monc);
...@@ -760,19 +764,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -760,19 +764,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
goto out; goto out;
/* connection */ /* connection */
monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
if (!monc->con)
goto out_monmap;
ceph_con_init(monc->client->msgr, monc->con);
monc->con->private = monc;
monc->con->ops = &mon_con_ops;
/* authentication */ /* authentication */
monc->auth = ceph_auth_init(cl->options->name, monc->auth = ceph_auth_init(cl->options->name,
cl->options->key); cl->options->key);
if (IS_ERR(monc->auth)) { if (IS_ERR(monc->auth)) {
err = PTR_ERR(monc->auth); err = PTR_ERR(monc->auth);
goto out_con; goto out_monmap;
} }
monc->auth->want_keys = monc->auth->want_keys =
CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
...@@ -801,6 +798,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -801,6 +798,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
if (!monc->m_auth) if (!monc->m_auth)
goto out_auth_reply; goto out_auth_reply;
ceph_con_init(&monc->con, monc, &mon_con_ops,
&monc->client->msgr);
monc->cur_mon = -1; monc->cur_mon = -1;
monc->hunting = true; monc->hunting = true;
monc->sub_renew_after = jiffies; monc->sub_renew_after = jiffies;
...@@ -824,8 +824,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -824,8 +824,6 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
ceph_msg_put(monc->m_subscribe_ack); ceph_msg_put(monc->m_subscribe_ack);
out_auth: out_auth:
ceph_auth_destroy(monc->auth); ceph_auth_destroy(monc->auth);
out_con:
monc->con->ops->put(monc->con);
out_monmap: out_monmap:
kfree(monc->monmap); kfree(monc->monmap);
out: out:
...@@ -841,10 +839,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ...@@ -841,10 +839,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
__close_session(monc); __close_session(monc);
monc->con->private = NULL;
monc->con->ops->put(monc->con);
monc->con = NULL;
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
/* /*
...@@ -888,8 +882,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ...@@ -888,8 +882,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
} else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
dout("authenticated, starting session\n"); dout("authenticated, starting session\n");
monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
monc->client->msgr->inst.name.num = monc->client->msgr.inst.name.num =
cpu_to_le64(monc->auth->global_id); cpu_to_le64(monc->auth->global_id);
__send_subscribe(monc); __send_subscribe(monc);
...@@ -1000,6 +994,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, ...@@ -1000,6 +994,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MDS_MAP: case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP: case CEPH_MSG_OSD_MAP:
m = ceph_msg_new(type, front_len, GFP_NOFS, false); m = ceph_msg_new(type, front_len, GFP_NOFS, false);
if (!m)
return NULL; /* ENOMEM--return skip == 0 */
break; break;
} }
...@@ -1029,7 +1025,7 @@ static void mon_fault(struct ceph_connection *con) ...@@ -1029,7 +1025,7 @@ static void mon_fault(struct ceph_connection *con)
if (!monc->hunting) if (!monc->hunting)
pr_info("mon%d %s session lost, " pr_info("mon%d %s session lost, "
"hunting for new mon\n", monc->cur_mon, "hunting for new mon\n", monc->cur_mon,
ceph_pr_addr(&monc->con->peer_addr.in_addr)); ceph_pr_addr(&monc->con.peer_addr.in_addr));
__close_session(monc); __close_session(monc);
if (!monc->hunting) { if (!monc->hunting) {
...@@ -1044,9 +1040,23 @@ static void mon_fault(struct ceph_connection *con) ...@@ -1044,9 +1040,23 @@ static void mon_fault(struct ceph_connection *con)
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
/*
* We can ignore refcounting on the connection struct, as all references
* will come from the messenger workqueue, which is drained prior to
* mon_client destruction.
*/
static struct ceph_connection *con_get(struct ceph_connection *con)
{
/* No reference count is touched; the caller gets back the pointer it passed in. */
return con;
}
static void con_put(struct ceph_connection *con)
{
/* Intentionally empty: con_get() takes no reference, so there is nothing to drop. */
}
static const struct ceph_connection_operations mon_con_ops = { static const struct ceph_connection_operations mon_con_ops = {
.get = ceph_con_get, .get = con_get,
.put = ceph_con_put, .put = con_put,
.dispatch = dispatch, .dispatch = dispatch,
.fault = mon_fault, .fault = mon_fault,
.alloc_msg = mon_alloc_msg, .alloc_msg = mon_alloc_msg,
......
...@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) ...@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
struct ceph_msgpool *pool = arg; struct ceph_msgpool *pool = arg;
struct ceph_msg *msg; struct ceph_msg *msg;
msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
if (!msg) { if (!msg) {
dout("msgpool_alloc %s failed\n", pool->name); dout("msgpool_alloc %s failed\n", pool->name);
} else { } else {
...@@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg) ...@@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg)
ceph_msg_put(msg); ceph_msg_put(msg);
} }
int ceph_msgpool_init(struct ceph_msgpool *pool, int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
int front_len, int size, bool blocking, const char *name) int front_len, int size, bool blocking, const char *name)
{ {
dout("msgpool %s init\n", name); dout("msgpool %s init\n", name);
pool->type = type;
pool->front_len = front_len; pool->front_len = front_len;
pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
if (!pool->pool) if (!pool->pool)
...@@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, ...@@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
WARN_ON(1); WARN_ON(1);
/* try to alloc a fresh message */ /* try to alloc a fresh message */
return ceph_msg_new(0, front_len, GFP_NOFS, false); return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
} }
msg = mempool_alloc(pool->pool, GFP_NOFS); msg = mempool_alloc(pool->pool, GFP_NOFS);
......
...@@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref) ...@@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref)
if (req->r_request) if (req->r_request)
ceph_msg_put(req->r_request); ceph_msg_put(req->r_request);
if (req->r_con_filling_msg) { if (req->r_con_filling_msg) {
dout("release_request revoking pages %p from con %p\n", dout("%s revoking pages %p from con %p\n", __func__,
req->r_pages, req->r_con_filling_msg); req->r_pages, req->r_con_filling_msg);
ceph_con_revoke_message(req->r_con_filling_msg, ceph_msg_revoke_incoming(req->r_reply);
req->r_reply);
req->r_con_filling_msg->ops->put(req->r_con_filling_msg); req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
} }
if (req->r_reply) if (req->r_reply)
...@@ -214,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ...@@ -214,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
kref_init(&req->r_kref); kref_init(&req->r_kref);
init_completion(&req->r_completion); init_completion(&req->r_completion);
init_completion(&req->r_safe_completion); init_completion(&req->r_safe_completion);
rb_init_node(&req->r_node);
INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_unsafe_item);
INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_item);
INIT_LIST_HEAD(&req->r_linger_osd); INIT_LIST_HEAD(&req->r_linger_osd);
INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_req_lru_item);
INIT_LIST_HEAD(&req->r_osd_item);
req->r_flags = flags; req->r_flags = flags;
WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
...@@ -243,6 +245,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ...@@ -243,6 +245,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
} }
ceph_pagelist_init(req->r_trail); ceph_pagelist_init(req->r_trail);
} }
/* create request message; allow space for oid */ /* create request message; allow space for oid */
msg_size += MAX_OBJ_NAME_SIZE; msg_size += MAX_OBJ_NAME_SIZE;
if (snapc) if (snapc)
...@@ -256,7 +259,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ...@@ -256,7 +259,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
return NULL; return NULL;
} }
msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
memset(msg->front.iov_base, 0, msg->front.iov_len); memset(msg->front.iov_base, 0, msg->front.iov_len);
req->r_request = msg; req->r_request = msg;
...@@ -624,7 +626,7 @@ static void osd_reset(struct ceph_connection *con) ...@@ -624,7 +626,7 @@ static void osd_reset(struct ceph_connection *con)
/* /*
* Track open sessions with osds. * Track open sessions with osds.
*/ */
static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
{ {
struct ceph_osd *osd; struct ceph_osd *osd;
...@@ -634,15 +636,13 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) ...@@ -634,15 +636,13 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
atomic_set(&osd->o_ref, 1); atomic_set(&osd->o_ref, 1);
osd->o_osdc = osdc; osd->o_osdc = osdc;
osd->o_osd = onum;
INIT_LIST_HEAD(&osd->o_requests); INIT_LIST_HEAD(&osd->o_requests);
INIT_LIST_HEAD(&osd->o_linger_requests); INIT_LIST_HEAD(&osd->o_linger_requests);
INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_osd_lru);
osd->o_incarnation = 1; osd->o_incarnation = 1;
ceph_con_init(osdc->client->msgr, &osd->o_con); ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
osd->o_con.private = osd;
osd->o_con.ops = &osd_con_ops;
osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
INIT_LIST_HEAD(&osd->o_keepalive_item); INIT_LIST_HEAD(&osd->o_keepalive_item);
return osd; return osd;
...@@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) ...@@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
static void remove_all_osds(struct ceph_osd_client *osdc) static void remove_all_osds(struct ceph_osd_client *osdc)
{ {
dout("__remove_old_osds %p\n", osdc); dout("%s %p\n", __func__, osdc);
mutex_lock(&osdc->request_mutex); mutex_lock(&osdc->request_mutex);
while (!RB_EMPTY_ROOT(&osdc->osds)) { while (!RB_EMPTY_ROOT(&osdc->osds)) {
struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
...@@ -752,7 +752,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) ...@@ -752,7 +752,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
ret = -EAGAIN; ret = -EAGAIN;
} else { } else {
ceph_con_close(&osd->o_con); ceph_con_close(&osd->o_con);
ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
&osdc->osdmap->osd_addr[osd->o_osd]);
osd->o_incarnation++; osd->o_incarnation++;
} }
return ret; return ret;
...@@ -853,7 +854,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, ...@@ -853,7 +854,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
if (req->r_osd) { if (req->r_osd) {
/* make sure the original request isn't in flight. */ /* make sure the original request isn't in flight. */
ceph_con_revoke(&req->r_osd->o_con, req->r_request); ceph_msg_revoke(req->r_request);
list_del_init(&req->r_osd_item); list_del_init(&req->r_osd_item);
if (list_empty(&req->r_osd->o_requests) && if (list_empty(&req->r_osd->o_requests) &&
...@@ -880,7 +881,7 @@ static void __unregister_request(struct ceph_osd_client *osdc, ...@@ -880,7 +881,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
static void __cancel_request(struct ceph_osd_request *req) static void __cancel_request(struct ceph_osd_request *req)
{ {
if (req->r_sent && req->r_osd) { if (req->r_sent && req->r_osd) {
ceph_con_revoke(&req->r_osd->o_con, req->r_request); ceph_msg_revoke(req->r_request);
req->r_sent = 0; req->r_sent = 0;
} }
} }
...@@ -890,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc, ...@@ -890,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
{ {
dout("__register_linger_request %p\n", req); dout("__register_linger_request %p\n", req);
list_add_tail(&req->r_linger_item, &osdc->req_linger); list_add_tail(&req->r_linger_item, &osdc->req_linger);
list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); if (req->r_osd)
list_add_tail(&req->r_linger_osd,
&req->r_osd->o_linger_requests);
} }
static void __unregister_linger_request(struct ceph_osd_client *osdc, static void __unregister_linger_request(struct ceph_osd_client *osdc,
...@@ -998,18 +1001,18 @@ static int __map_request(struct ceph_osd_client *osdc, ...@@ -998,18 +1001,18 @@ static int __map_request(struct ceph_osd_client *osdc,
req->r_osd = __lookup_osd(osdc, o); req->r_osd = __lookup_osd(osdc, o);
if (!req->r_osd && o >= 0) { if (!req->r_osd && o >= 0) {
err = -ENOMEM; err = -ENOMEM;
req->r_osd = create_osd(osdc); req->r_osd = create_osd(osdc, o);
if (!req->r_osd) { if (!req->r_osd) {
list_move(&req->r_req_lru_item, &osdc->req_notarget); list_move(&req->r_req_lru_item, &osdc->req_notarget);
goto out; goto out;
} }
dout("map_request osd %p is osd%d\n", req->r_osd, o); dout("map_request osd %p is osd%d\n", req->r_osd, o);
req->r_osd->o_osd = o;
req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
__insert_osd(osdc, req->r_osd); __insert_osd(osdc, req->r_osd);
ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); ceph_con_open(&req->r_osd->o_con,
CEPH_ENTITY_TYPE_OSD, o,
&osdc->osdmap->osd_addr[o]);
} }
if (req->r_osd) { if (req->r_osd) {
...@@ -1304,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) ...@@ -1304,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
mutex_lock(&osdc->request_mutex); mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { for (p = rb_first(&osdc->requests); p; ) {
req = rb_entry(p, struct ceph_osd_request, r_node); req = rb_entry(p, struct ceph_osd_request, r_node);
p = rb_next(p);
err = __map_request(osdc, req, force_resend); err = __map_request(osdc, req, force_resend);
if (err < 0) if (err < 0)
continue; /* error */ continue; /* error */
...@@ -1313,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) ...@@ -1313,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
dout("%p tid %llu maps to no osd\n", req, req->r_tid); dout("%p tid %llu maps to no osd\n", req, req->r_tid);
needmap++; /* request a newer map */ needmap++; /* request a newer map */
} else if (err > 0) { } else if (err > 0) {
dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, if (!req->r_linger) {
req->r_osd ? req->r_osd->o_osd : -1); dout("%p tid %llu requeued on osd%d\n", req,
if (!req->r_linger) req->r_tid,
req->r_osd ? req->r_osd->o_osd : -1);
req->r_flags |= CEPH_OSD_FLAG_RETRY; req->r_flags |= CEPH_OSD_FLAG_RETRY;
}
}
if (req->r_linger && list_empty(&req->r_linger_item)) {
/*
* register as a linger so that we will
* re-submit below and get a new tid
*/
dout("%p tid %llu restart on osd%d\n",
req, req->r_tid,
req->r_osd ? req->r_osd->o_osd : -1);
__register_linger_request(osdc, req);
__unregister_request(osdc, req);
} }
} }
...@@ -1391,7 +1408,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1391,7 +1408,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
epoch, maplen); epoch, maplen);
newmap = osdmap_apply_incremental(&p, next, newmap = osdmap_apply_incremental(&p, next,
osdc->osdmap, osdc->osdmap,
osdc->client->msgr); &osdc->client->msgr);
if (IS_ERR(newmap)) { if (IS_ERR(newmap)) {
err = PTR_ERR(newmap); err = PTR_ERR(newmap);
goto bad; goto bad;
...@@ -1839,11 +1856,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) ...@@ -1839,11 +1856,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->req_mempool) if (!osdc->req_mempool)
goto out; goto out;
err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
OSD_OP_FRONT_LEN, 10, true,
"osd_op"); "osd_op");
if (err < 0) if (err < 0)
goto out_mempool; goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
OSD_OPREPLY_FRONT_LEN, 10, true, OSD_OPREPLY_FRONT_LEN, 10, true,
"osd_op_reply"); "osd_op_reply");
if (err < 0) if (err < 0)
...@@ -2019,15 +2037,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, ...@@ -2019,15 +2037,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
if (!req) { if (!req) {
*skip = 1; *skip = 1;
m = NULL; m = NULL;
pr_info("get_reply unknown tid %llu from osd%d\n", tid, dout("get_reply unknown tid %llu from osd%d\n", tid,
osd->o_osd); osd->o_osd);
goto out; goto out;
} }
if (req->r_con_filling_msg) { if (req->r_con_filling_msg) {
dout("get_reply revoking msg %p from old con %p\n", dout("%s revoking msg %p from old con %p\n", __func__,
req->r_reply, req->r_con_filling_msg); req->r_reply, req->r_con_filling_msg);
ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); ceph_msg_revoke_incoming(req->r_reply);
req->r_con_filling_msg->ops->put(req->r_con_filling_msg); req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
req->r_con_filling_msg = NULL; req->r_con_filling_msg = NULL;
} }
...@@ -2080,6 +2098,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, ...@@ -2080,6 +2098,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
int type = le16_to_cpu(hdr->type); int type = le16_to_cpu(hdr->type);
int front = le32_to_cpu(hdr->front_len); int front = le32_to_cpu(hdr->front_len);
*skip = 0;
switch (type) { switch (type) {
case CEPH_MSG_OSD_MAP: case CEPH_MSG_OSD_MAP:
case CEPH_MSG_WATCH_NOTIFY: case CEPH_MSG_WATCH_NOTIFY:
......
...@@ -135,6 +135,21 @@ static int crush_decode_straw_bucket(void **p, void *end, ...@@ -135,6 +135,21 @@ static int crush_decode_straw_bucket(void **p, void *end,
return -EINVAL; return -EINVAL;
} }
static int skip_name_map(void **p, void *end)
{
int len;
ceph_decode_32_safe(p, end, len ,bad);
while (len--) {
int strlen;
*p += sizeof(u32);
ceph_decode_32_safe(p, end, strlen, bad);
*p += strlen;
}
return 0;
bad:
return -EINVAL;
}
static struct crush_map *crush_decode(void *pbyval, void *end) static struct crush_map *crush_decode(void *pbyval, void *end)
{ {
struct crush_map *c; struct crush_map *c;
...@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
void **p = &pbyval; void **p = &pbyval;
void *start = pbyval; void *start = pbyval;
u32 magic; u32 magic;
u32 num_name_maps;
dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
...@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
if (c == NULL) if (c == NULL)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
/* set tunables to default values */
c->choose_local_tries = 2;
c->choose_local_fallback_tries = 5;
c->choose_total_tries = 19;
ceph_decode_need(p, end, 4*sizeof(u32), bad); ceph_decode_need(p, end, 4*sizeof(u32), bad);
magic = ceph_decode_32(p); magic = ceph_decode_32(p);
if (magic != CRUSH_MAGIC) { if (magic != CRUSH_MAGIC) {
...@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
} }
/* ignore trailing name maps. */ /* ignore trailing name maps. */
for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
err = skip_name_map(p, end);
if (err < 0)
goto done;
}
/* tunables */
ceph_decode_need(p, end, 3*sizeof(u32), done);
c->choose_local_tries = ceph_decode_32(p);
c->choose_local_fallback_tries = ceph_decode_32(p);
c->choose_total_tries = ceph_decode_32(p);
dout("crush decode tunable choose_local_tries = %d",
c->choose_local_tries);
dout("crush decode tunable choose_local_fallback_tries = %d",
c->choose_local_fallback_tries);
dout("crush decode tunable choose_total_tries = %d",
c->choose_total_tries);
done:
dout("crush_decode success\n"); dout("crush_decode success\n");
return c; return c;
...@@ -488,15 +527,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) ...@@ -488,15 +527,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
ceph_decode_32_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, pool, bad);
ceph_decode_32_safe(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad);
dout(" pool %d len %d\n", pool, len); dout(" pool %d len %d\n", pool, len);
ceph_decode_need(p, end, len, bad);
pi = __lookup_pg_pool(&map->pg_pools, pool); pi = __lookup_pg_pool(&map->pg_pools, pool);
if (pi) { if (pi) {
char *name = kstrndup(*p, len, GFP_NOFS);
if (!name)
return -ENOMEM;
kfree(pi->name); kfree(pi->name);
pi->name = kmalloc(len + 1, GFP_NOFS); pi->name = name;
if (pi->name) { dout(" name is %s\n", pi->name);
memcpy(pi->name, *p, len);
pi->name[len] = '\0';
dout(" name is %s\n", pi->name);
}
} }
*p += len; *p += len;
} }
...@@ -666,6 +706,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ...@@ -666,6 +706,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
ceph_decode_copy(p, &pgid, sizeof(pgid)); ceph_decode_copy(p, &pgid, sizeof(pgid));
n = ceph_decode_32(p); n = ceph_decode_32(p);
err = -EINVAL;
if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
goto bad;
ceph_decode_need(p, end, n * sizeof(u32), bad); ceph_decode_need(p, end, n * sizeof(u32), bad);
err = -ENOMEM; err = -ENOMEM;
pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
...@@ -889,6 +932,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -889,6 +932,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
(void) __remove_pg_mapping(&map->pg_temp, pgid); (void) __remove_pg_mapping(&map->pg_temp, pgid);
/* insert */ /* insert */
if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
err = -EINVAL;
goto bad;
}
pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
if (!pg) { if (!pg) {
err = -ENOMEM; err = -ENOMEM;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment