Commit 4d7ace02 authored by Xiubo Li, committed by Ilya Dryomov

ceph: fix mdsmap cluster available check based on laggy number

When max_mds > 1 in the MDS cluster, there are no standby MDSs, and
all max_mds MDSs are in the up:active state, the death of a single
up:active MDS makes m->m_num_laggy in the kclient become 1. The
mount then fails without considering the other healthy MDSs.

There may be some MDSs that are still "in" the cluster but not in the
up:active state; we ignore them. The cluster is treated as unavailable
only when all of the up:active MDSs in the cluster are laggy.

When max_mds is decreased, the cluster does not stop the extra
up:active MDSs immediately, so there is some latency. During that
window the number of up:active MDSs is larger than max_mds, so the
m_info memory will always be reallocated later.

This patch picks out the up:active MDSs as m_num_mds and allocates
the needed memory once.
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent d80865bf
...@@ -113,6 +113,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -113,6 +113,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
int err; int err;
u8 mdsmap_v, mdsmap_cv; u8 mdsmap_v, mdsmap_cv;
u16 mdsmap_ev; u16 mdsmap_ev;
u32 possible_max_rank;
m = kzalloc(sizeof(*m), GFP_NOFS); m = kzalloc(sizeof(*m), GFP_NOFS);
if (!m) if (!m)
...@@ -138,14 +139,30 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -138,14 +139,30 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_session_autoclose = ceph_decode_32(p); m->m_session_autoclose = ceph_decode_32(p);
m->m_max_file_size = ceph_decode_64(p); m->m_max_file_size = ceph_decode_64(p);
m->m_max_mds = ceph_decode_32(p); m->m_max_mds = ceph_decode_32(p);
m->m_num_mds = m->m_max_mds;
/*
* pick out the active nodes as the m_num_mds, the m_num_mds
* maybe larger than m_max_mds when decreasing the max_mds in
* cluster side, in other case it should less than or equal
* to m_max_mds.
*/
m->m_num_mds = n = ceph_decode_32(p);
m->m_num_active_mds = m->m_num_mds;
/*
* the possible max rank, it maybe larger than the m->m_num_mds,
* for example if the mds_max == 2 in the cluster, when the MDS(0)
* was laggy and being replaced by a new MDS, we will temporarily
* receive a new mds map with n_num_mds == 1 and the active MDS(1),
* and the mds rank >= m->m_num_mds.
*/
possible_max_rank = max((u32)m->m_num_mds, m->m_max_mds);
m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
if (!m->m_info) if (!m->m_info)
goto nomem; goto nomem;
/* pick out active nodes from mds_info (state > 0) */ /* pick out active nodes from mds_info (state > 0) */
n = ceph_decode_32(p);
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
u64 global_id; u64 global_id;
u32 namelen; u32 namelen;
...@@ -215,18 +232,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -215,18 +232,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_mds_state_name(state), ceph_mds_state_name(state),
laggy ? "(laggy)" : ""); laggy ? "(laggy)" : "");
if (mds < 0 || state <= 0) if (mds < 0 || mds >= possible_max_rank) {
pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
continue; continue;
}
if (mds >= m->m_num_mds) { if (state <= 0) {
int new_num = max(mds + 1, m->m_num_mds * 2); pr_warn("mdsmap_decode got incorrect state(%s)\n",
void *new_m_info = krealloc(m->m_info, ceph_mds_state_name(state));
new_num * sizeof(*m->m_info), continue;
GFP_NOFS | __GFP_ZERO);
if (!new_m_info)
goto nomem;
m->m_info = new_m_info;
m->m_num_mds = new_num;
} }
info = &m->m_info[mds]; info = &m->m_info[mds];
...@@ -247,14 +261,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -247,14 +261,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->export_targets = NULL; info->export_targets = NULL;
} }
} }
if (m->m_num_mds > m->m_max_mds) {
/* find max up mds */
for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
if (i == 0 || m->m_info[i-1].state > 0)
break;
}
m->m_num_mds = i;
}
/* pg_pools */ /* pg_pools */
ceph_decode_32_safe(p, end, n, bad); ceph_decode_32_safe(p, end, n, bad);
...@@ -396,7 +402,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) ...@@ -396,7 +402,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
return false; return false;
if (m->m_damaged) if (m->m_damaged)
return false; return false;
if (m->m_num_laggy > 0) if (m->m_num_laggy == m->m_num_active_mds)
return false; return false;
for (i = 0; i < m->m_num_mds; i++) { for (i = 0; i < m->m_num_mds; i++) {
if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
......
...@@ -25,8 +25,9 @@ struct ceph_mdsmap { ...@@ -25,8 +25,9 @@ struct ceph_mdsmap {
u32 m_session_timeout; /* seconds */ u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */ u32 m_session_autoclose; /* seconds */
u64 m_max_file_size; u64 m_max_file_size;
u32 m_max_mds; /* size of m_addr, m_state arrays */ u32 m_max_mds; /* expected up:active mds number */
int m_num_mds; int m_num_active_mds; /* actual up:active mds number */
int m_num_mds; /* size of m_info array */
struct ceph_mds_info *m_info; struct ceph_mds_info *m_info;
/* which object pools file data can be stored in */ /* which object pools file data can be stored in */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment