Commit 83ca14fd authored by Sage Weil

libceph: add support for HASHPSPOOL pool flag

The legacy behavior adds the pgid seed and pool together as the input for
CRUSH.  That is problematic because each pool's PGs end up mapping to the
same OSDs: 1.5 == 2.4 == 3.3 == ...

Instead, if the HASHPSPOOL flag is set, we hash the ps and pool together and
feed that into CRUSH.  This ensures that two adjacent pools will map to
an independent pseudorandom set of OSDs.

Advertise our support for this via a protocol feature flag.
Signed-off-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@inktank.com>
parent 1b83bef2
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
#define CEPH_FEATURE_OSD_HBMSGS (1<<28) #define CEPH_FEATURE_OSD_HBMSGS (1<<28)
#define CEPH_FEATURE_MDSENC (1<<29) #define CEPH_FEATURE_MDSENC (1<<29)
#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
/* /*
* Features supported. * Features supported.
...@@ -45,7 +46,8 @@ ...@@ -45,7 +46,8 @@
CEPH_FEATURE_OSDENC | \ CEPH_FEATURE_OSDENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \ CEPH_FEATURE_CRUSH_TUNABLES | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE) CEPH_FEATURE_REPLY_CREATE_INODE | \
CEPH_FEATURE_OSDHASHPSPOOL)
#define CEPH_FEATURES_REQUIRED_DEFAULT \ #define CEPH_FEATURES_REQUIRED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \ (CEPH_FEATURE_NOSRCADDR | \
......
...@@ -23,6 +23,8 @@ struct ceph_pg { ...@@ -23,6 +23,8 @@ struct ceph_pg {
uint32_t seed; uint32_t seed;
}; };
#define CEPH_POOL_FLAG_HASHPSPOOL 1
struct ceph_pg_pool_info { struct ceph_pg_pool_info {
struct rb_node node; struct rb_node node;
s64 id; s64 id;
......
...@@ -1127,18 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, ...@@ -1127,18 +1127,16 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
struct ceph_pg_mapping *pg; struct ceph_pg_mapping *pg;
struct ceph_pg_pool_info *pool; struct ceph_pg_pool_info *pool;
int ruleno; int ruleno;
unsigned int poolid, ps, pps, t, r; int r;
u32 pps;
poolid = pgid.pool; pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
ps = pgid.seed;
pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
if (!pool) if (!pool)
return NULL; return NULL;
/* pg_temp? */ /* pg_temp? */
t = ceph_stable_mod(ps, pool->pg_num, pool->pgp_num_mask); pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
pgid.seed = t; pool->pgp_num_mask);
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) { if (pg) {
*num = pg->len; *num = pg->len;
...@@ -1149,20 +1147,35 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, ...@@ -1149,20 +1147,35 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
pool->type, pool->size); pool->type, pool->size);
if (ruleno < 0) { if (ruleno < 0) {
pr_err("no crush rule pool %d ruleset %d type %d size %d\n", pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
poolid, pool->crush_ruleset, pool->type, pgid.pool, pool->crush_ruleset, pool->type,
pool->size); pool->size);
return NULL; return NULL;
} }
pps = ceph_stable_mod(ps, pool->pgp_num, pool->pgp_num_mask); if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
pps += poolid; /* hash pool id and seed so that pool PGs do not overlap */
pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
ceph_stable_mod(pgid.seed, pool->pgp_num,
pool->pgp_num_mask),
pgid.pool);
} else {
/*
* legacy behavior: add ps and pool together. this is
* not a great approach because the PGs from each pool
* will overlap on top of each other: 0.5 == 1.4 ==
* 2.3 == ...
*/
pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
pool->pgp_num_mask) +
(unsigned)pgid.pool;
}
r = crush_do_rule(osdmap->crush, ruleno, pps, osds, r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
min_t(int, pool->size, *num), min_t(int, pool->size, *num),
osdmap->osd_weight); osdmap->osd_weight);
if (r < 0) { if (r < 0) {
pr_err("error %d from crush rule: pool %d ruleset %d type %d" pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
" size %d\n", r, poolid, pool->crush_ruleset, " size %d\n", r, pgid.pool, pool->crush_ruleset,
pool->type, pool->size); pool->type, pool->size);
return NULL; return NULL;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment