Commit f24e9980 authored by Sage Weil

ceph: OSD client

The OSD client is responsible for reading and writing data from/to the
object storage pool.  This includes determining where objects are
stored in the cluster, and ensuring that requests are retried or
redirected in the event of a node failure or data migration.

If an OSD does not respond before a timeout expires, keepalive
messages are sent across the lossless, ordered communications channel
to ensure that any break in the TCP connection is discovered.  If the
session does reset, a reconnection is attempted and affected requests
are resent (by the message transport layer).
Signed-off-by: Sage Weil <sage@newdream.net>
parent 2f2dc053
This diff is collapsed.
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H
#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/rbtree.h>
#include "types.h"
#include "osdmap.h"
#include "messenger.h"
struct ceph_msg;
struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;
/*
* completion callback for async writepages
*
* Invoked with the request and a message pointer.  Used as the type of
* the r_callback and r_safe_callback members of struct ceph_osd_request.
* NOTE(review): the message argument is presumably the reply -- confirm
* against the call sites in the .c file.
*/
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
struct ceph_msg *);
/*
* a given osd we're communicating with
*
* NOTE(review): the per-field notes below are inferred from member
* types and names only; confirm against the implementation.
*/
struct ceph_osd {
atomic_t o_ref; /* presumably a reference count */
struct ceph_osd_client *o_osdc; /* back-pointer to an osd client */
int o_osd; /* presumably the osd id */
int o_incarnation;
struct rb_node o_node; /* presumably links into ceph_osd_client.osds */
struct ceph_connection o_con; /* embedded messenger connection */
struct list_head o_requests; /* presumably requests via r_osd_item */
};
/*
* an in-flight request
*
* Carries the request/reply messages, completion state and callbacks,
* the target object name and layout, and the page array holding the
* data payload.  Comments marked "presumably" are inferred from member
* names and types only and should be confirmed against the .c file.
*/
struct ceph_osd_request {
u64 r_tid; /* unique for this client */
struct rb_node r_node; /* presumably in ceph_osd_client.requests */
struct list_head r_osd_item; /* presumably on ceph_osd.o_requests */
struct ceph_osd *r_osd;
struct ceph_msg *r_request, *r_reply;
int r_result;
int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */
int r_prepared_pages, r_got_reply;
struct ceph_osd_client *r_osdc;
atomic_t r_ref; /* incremented by ceph_osdc_get_request() */
bool r_mempool;
struct completion r_completion, r_safe_completion;
ceph_osdc_callback_t r_callback, r_safe_callback;
struct ceph_eversion r_reassert_version;
struct list_head r_unsafe_item;
struct inode *r_inode; /* for use by callbacks */
struct writeback_control *r_wbc; /* ditto */
char r_oid[40]; /* object name */
int r_oid_len; /* presumably the used length of r_oid */
unsigned long r_timeout_stamp;
bool r_resend; /* msg send failed, needs retry */
struct ceph_file_layout r_file_layout;
struct ceph_snap_context *r_snapc; /* snap context for writes */
unsigned r_num_pages; /* size of page array (follows) */
struct page **r_pages; /* pages for data payload */
int r_pages_from_pool;
int r_own_pages; /* if true, i own page list */
};
/*
* Per-client OSD state: the current osd map, the tree of known osds,
* the tree of pending requests with tid bookkeeping, delayed timeout
* work, and the memory/message pools used for requests.
*
* NOTE(review): which lock protects which member is not visible in
* this header; confirm against the .c file before relying on it.
*/
struct ceph_osd_client {
struct ceph_client *client;
struct ceph_osdmap *osdmap; /* current map */
struct rw_semaphore map_sem;
struct completion map_waiters;
u64 last_requested_map;
struct mutex request_mutex;
struct rb_root osds; /* osds */
u64 timeout_tid; /* tid of timeout triggering rq */
u64 last_tid; /* tid of last request */
struct rb_root requests; /* pending requests */
int num_requests;
struct delayed_work timeout_work;
struct dentry *debugfs_file;
mempool_t *req_mempool;
struct ceph_msgpool msgpool_op;
struct ceph_msgpool msgpool_op_reply;
};
/*
* Lifecycle and message-handling entry points.  These are defined
* outside this header, so only the signatures are documented here.
*/
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
/*
* Build a new request for a file extent.  The length is passed by
* pointer.  NOTE(review): presumably so it can be trimmed to what one
* object can hold -- confirm in the implementation.
*/
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout,
struct ceph_vino vino,
u64 offset, u64 *len, int op, int flags,
struct ceph_snap_context *snapc,
int do_sync, u32 truncate_seq,
u64 truncate_size,
struct timespec *mtime,
bool use_mempool, int num_reply);
/*
* Atomically increment the r_ref counter of @req.
* NOTE(review): presumably paired with ceph_osdc_put_request() -- that
* function's body is not visible in this header.
*/
static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
{
atomic_inc(&req->r_ref);
}
/*
* Request submission and synchronous page I/O entry points.  These are
* defined outside this header, so only the signatures are documented.
*/
extern void ceph_osdc_put_request(struct ceph_osd_request *req);
extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req,
bool nofail);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
struct ceph_osd_request *req);
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
/* note: readpages takes the length by pointer, writepages by value */
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
u64 off, u64 *plen,
u32 truncate_seq, u64 truncate_size,
struct page **pages, int nr_pages);
extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct ceph_vino vino,
struct ceph_file_layout *layout,
struct ceph_snap_context *sc,
u64 off, u64 len,
u32 truncate_seq, u64 truncate_size,
struct timespec *mtime,
struct page **pages, int nr_pages,
int flags, int do_sync, bool nofail);
#endif
This diff is collapsed.
#ifndef _FS_CEPH_OSDMAP_H
#define _FS_CEPH_OSDMAP_H
#include <linux/rbtree.h>
#include "types.h"
#include "ceph_fs.h"
#include "crush/crush.h"
/*
* The osd map describes the current membership of the osd cluster and
* specifies the mapping of objects to placement groups and placement
* groups to (sets of) osds. That is, it completely specifies the
* (desired) distribution of all data objects in the system at some
* point in time.
*
* Each map version is identified by an epoch, which increases monotonically.
*
* The map can be updated either via an incremental map (diff) describing
* the change between two successive epochs, or as a fully encoded map.
*/
/*
* A pool description (v) together with four per-pool mask values.
* NOTE(review): the masks are presumably derived from the pg counts in
* v -- confirm where they are computed.
*/
struct ceph_pg_pool_info {
struct ceph_pg_pool v;
int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
};
/*
* An rb-tree node mapping a pgid to a variable-length array of osd ids.
* osds[] is a flexible array member, so the structure must be
* allocated with room for the trailing entries.
* NOTE(review): presumably stored in ceph_osdmap.pg_temp, with len
* giving the number of entries in osds[] -- confirm.
*/
struct ceph_pg_mapping {
struct rb_node node;
u64 pgid;
int len;
int osds[];
};
/*
* One decoded osd map: cluster identity and epoch, per-osd arrays
* indexed by osd id and bounded by max_osd, the pg_temp tree, the pool
* table and the CRUSH map.
*/
struct ceph_osdmap {
struct ceph_fsid fsid;
u32 epoch;
u32 mkfs_epoch;
struct ceph_timespec created, modified;
u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, osd_weight, osd_addr arrays */
u8 *osd_state; /* CEPH_OSD_* */
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr;
struct rb_root pg_temp;
u32 num_pools;
struct ceph_pg_pool_info *pg_pool;
/* the CRUSH map specifies the mapping of placement groups to
* the list of osds that store+replicate them. */
struct crush_map *crush;
};
/*
* file layout helpers
*
* Each macro takes a struct ceph_file_layout by value (note the "(l)."
* member access, not "->"), converts the named little-endian field to
* CPU byte order with le32_to_cpu(), and casts the result to __s32.
* The argument is evaluated exactly once.
*/
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
#define ceph_file_layout_stripe_count(l) \
((__s32)le32_to_cpu((l).fl_stripe_count))
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
#define ceph_file_layout_object_su(l) \
((__s32)le32_to_cpu((l).fl_object_stripe_unit))
#define ceph_file_layout_pg_preferred(l) \
((__s32)le32_to_cpu((l).fl_pg_preferred))
#define ceph_file_layout_pg_pool(l) \
((__s32)le32_to_cpu((l).fl_pg_pool))
/*
 * Stripe width of a layout: the stripe unit multiplied by the stripe
 * count, both converted from little-endian first.  The product is
 * computed in unsigned arithmetic.
 */
static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
{
	unsigned unit = le32_to_cpu(l->fl_stripe_unit);
	unsigned count = le32_to_cpu(l->fl_stripe_count);

	return unit * count;
}
/*
 * "period" == bytes before i start on a new set of objects
 *
 * Computed as the object size multiplied by the stripe count, both
 * converted from little-endian first, in unsigned arithmetic.
 */
static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
{
	unsigned object_size = le32_to_cpu(l->fl_object_size);
	unsigned count = le32_to_cpu(l->fl_stripe_count);

	return object_size * count;
}
/*
 * Report whether @osd is marked up in @map.
 *
 * Returns 1 when @osd is below map->max_osd and its osd_state entry has
 * CEPH_OSD_UP set, and 0 otherwise.  The id is compared against the
 * unsigned max_osd as an unsigned value, so a negative @osd falls out
 * of range and yields 0 without touching osd_state.
 */
static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
{
	if ((unsigned)osd >= map->max_osd)
		return 0;

	return (map->osd_state[osd] & CEPH_OSD_UP) != 0;
}
/*
 * Test whether any bit of @flag is set in map->flags.
 *
 * A NULL @map is tolerated and reported as "flag not set".
 */
static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
{
	if (!map)
		return false;

	return (map->flags & flag) != 0;
}
/*
* NOTE(review): presumably formats an osd state bitmask into the
* caller-supplied buffer str of size len and returns it -- the
* implementation is not visible here; confirm.
*/
extern char *ceph_osdmap_state_str(char *str, int len, int state);
/*
 * Look up the address entry for @osd in @map.
 *
 * Returns a pointer into map->osd_addr, or NULL when @osd is not below
 * map->max_osd.  The id is compared against the unsigned max_osd as an
 * unsigned value, so a negative @osd also yields NULL.
 */
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
						     int osd)
{
	return ((unsigned)osd < map->max_osd) ? &map->osd_addr[osd] : NULL;
}
/*
* Map construction, update and teardown.  The decoders take a cursor
* (void **p) and an end pointer.  NOTE(review): the cursor is
* presumably advanced past the consumed bytes, and how failure is
* reported (NULL vs ERR_PTR) is not visible here -- confirm.
*/
extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_osdmap *map,
struct ceph_messenger *msgr);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
/* calculate mapping of a file extent to an object */
extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 *plen,
u64 *bno, u64 *oxoff, u64 *oxlen);
/* calculate mapping of object to a placement group */
extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
const char *oid,
struct ceph_file_layout *fl,
struct ceph_osdmap *osdmap);
/* NOTE(review): presumably returns the primary osd id for pgid -- confirm */
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, union ceph_pg pgid);
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment