Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
a02a946d
Commit
a02a946d
authored
Jun 19, 2017
by
Ilya Dryomov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
libceph: respect RADOS_BACKOFF backoffs
Signed-off-by:
Ilya Dryomov
<
idryomov@gmail.com
>
parent
76f827a7
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
737 additions
and
0 deletions
+737
-0
include/linux/ceph/ceph_fs.h
include/linux/ceph/ceph_fs.h
+1
-0
include/linux/ceph/osd_client.h
include/linux/ceph/osd_client.h
+45
-0
include/linux/ceph/osdmap.h
include/linux/ceph/osdmap.h
+1
-0
include/linux/ceph/rados.h
include/linux/ceph/rados.h
+6
-0
net/ceph/ceph_common.c
net/ceph/ceph_common.c
+1
-0
net/ceph/debugfs.c
net/ceph/debugfs.c
+74
-0
net/ceph/osd_client.c
net/ceph/osd_client.c
+593
-0
net/ceph/osdmap.c
net/ceph/osdmap.c
+16
-0
No files found.
include/linux/ceph/ceph_fs.h
View file @
a02a946d
...
...
@@ -147,6 +147,7 @@ struct ceph_dir_layout {
#define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43
#define CEPH_MSG_WATCH_NOTIFY 44
#define CEPH_MSG_OSD_BACKOFF 61
/* watch-notify operations */
...
...
include/linux/ceph/osd_client.h
View file @
a02a946d
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H
#include <linux/bitrev.h>
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/mempool.h>
...
...
@@ -36,6 +37,8 @@ struct ceph_osd {
struct
ceph_connection
o_con
;
struct
rb_root
o_requests
;
struct
rb_root
o_linger_requests
;
struct
rb_root
o_backoff_mappings
;
struct
rb_root
o_backoffs_by_id
;
struct
list_head
o_osd_lru
;
struct
ceph_auth_handshake
o_auth
;
unsigned
long
lru_ttl
;
...
...
@@ -275,6 +278,48 @@ struct ceph_watch_item {
struct
ceph_entity_addr
addr
;
};
/*
 * Per-spgid container for OSD backoffs.  Instances are linked into a
 * ceph_osd's ->o_backoff_mappings tree and keyed by ->spgid.
 */
struct ceph_spg_mapping {
	struct rb_node node;		/* linkage in ->o_backoff_mappings */

	struct ceph_spg spgid;		/* lookup key */
	struct rb_root backoffs;	/* ceph_osd_backoff, via ->spg_node */
};
/*
 * Object id as carried in MOSDBackoff ranges.  The three names are
 * length-counted byte strings; when a hoid is only a view built from
 * a request target they may be NULL with a zero length.
 */
struct ceph_hobject_id {
	void *key;		/* optional locator key */
	size_t key_len;
	void *oid;		/* object name */
	size_t oid_len;
	u64 snapid;
	u32 hash;
	u8 is_max;		/* non-zero: sorts after every other hoid */
	void *nspace;		/* namespace */
	size_t nspace_len;
	s64 pool;

	/* cache */
	u32 hash_reverse_bits;	/* bitrev32(hash), used for ordering */
};
/*
 * Refresh the cached bit-reversed hash.  Must be called whenever
 * ->hash has been (re)assigned and before the hoid is compared.
 */
static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
{
	hoid->hash_reverse_bits = bitrev32(hoid->hash);
}
/*
 * A single backoff installed at the request of an OSD.
 *
 * Range semantics:
 *   - PG-wide backoff:     [begin, end)
 *   - per-object backoff:  begin == end
 *
 * Every backoff is indexed twice: by ->begin inside its spgid mapping
 * (->spg_node) and by ->id within the OSD session (->id_node).
 */
struct ceph_osd_backoff {
	struct rb_node spg_node;	/* in ceph_spg_mapping->backoffs */
	struct rb_node id_node;		/* in ceph_osd->o_backoffs_by_id */

	struct ceph_spg spgid;
	u64 id;
	struct ceph_hobject_id *begin;	/* owned, released by free_hoid() */
	struct ceph_hobject_id *end;	/* owned, released by free_hoid() */
};
#define CEPH_LINGER_ID_START 0xffff000000000000ULL
struct
ceph_osd_client
{
...
...
include/linux/ceph/osdmap.h
View file @
a02a946d
...
...
@@ -32,6 +32,7 @@ struct ceph_spg {
};
int
ceph_pg_compare
(
const
struct
ceph_pg
*
lhs
,
const
struct
ceph_pg
*
rhs
);
int
ceph_spg_compare
(
const
struct
ceph_spg
*
lhs
,
const
struct
ceph_spg
*
rhs
);
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0)
/* hash pg seed and pool id
together */
...
...
include/linux/ceph/rados.h
View file @
a02a946d
...
...
@@ -439,6 +439,12 @@ enum {
const
char
*
ceph_osd_watch_op_name
(
int
o
);
/* MOSDBackoff operation codes (the "op" byte of the message) */
enum {
	CEPH_OSD_BACKOFF_OP_BLOCK     = 1,	/* install a backoff */
	CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,	/* acknowledgement of BLOCK */
	CEPH_OSD_BACKOFF_OP_UNBLOCK   = 3,	/* remove a backoff */
};
/*
* an individual object operation. each may be accompanied by some data
* payload
...
...
net/ceph/ceph_common.c
View file @
a02a946d
...
...
@@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type)
case
CEPH_MSG_OSD_OP
:
return
"osd_op"
;
case
CEPH_MSG_OSD_OPREPLY
:
return
"osd_opreply"
;
case
CEPH_MSG_WATCH_NOTIFY
:
return
"watch_notify"
;
case
CEPH_MSG_OSD_BACKOFF
:
return
"osd_backoff"
;
default:
return
"unknown"
;
}
}
...
...
net/ceph/debugfs.c
View file @
a02a946d
...
...
@@ -243,6 +243,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
mutex_unlock
(
&
osd
->
lock
);
}
/*
 * Print a snapshot id: the two special values get symbolic names,
 * anything else is printed in hex.
 */
static void dump_snapid(struct seq_file *s, u64 snapid)
{
	if (snapid == CEPH_NOSNAP) {
		seq_puts(s, "head");
		return;
	}
	if (snapid == CEPH_SNAPDIR) {
		seq_puts(s, "snapdir");
		return;
	}

	seq_printf(s, "%llx", snapid);
}
/*
 * Print @len bytes of @name.  '%', ':' and '/' as well as anything
 * outside of the printable ASCII range [32, 127) are emitted as a
 * %xx hex escape, every other byte is emitted verbatim.
 */
static void dump_name_escaped(struct seq_file *s, unsigned char *name,
			      size_t len)
{
	size_t pos = 0;

	while (pos < len) {
		unsigned char c = name[pos++];
		bool plain = c >= 32 && c < 127 &&
			     c != '%' && c != ':' && c != '/';

		if (plain)
			seq_putc(s, c);
		else
			seq_printf(s, "%%%02x", c);
	}
}
static
void
dump_hoid
(
struct
seq_file
*
s
,
const
struct
ceph_hobject_id
*
hoid
)
{
if
(
hoid
->
snapid
==
0
&&
hoid
->
hash
==
0
&&
!
hoid
->
is_max
&&
hoid
->
pool
==
S64_MIN
)
{
seq_puts
(
s
,
"MIN"
);
return
;
}
if
(
hoid
->
is_max
)
{
seq_puts
(
s
,
"MAX"
);
return
;
}
seq_printf
(
s
,
"%lld:%08x:"
,
hoid
->
pool
,
hoid
->
hash_reverse_bits
);
dump_name_escaped
(
s
,
hoid
->
nspace
,
hoid
->
nspace_len
);
seq_putc
(
s
,
':'
);
dump_name_escaped
(
s
,
hoid
->
key
,
hoid
->
key_len
);
seq_putc
(
s
,
':'
);
dump_name_escaped
(
s
,
hoid
->
oid
,
hoid
->
oid_len
);
seq_putc
(
s
,
':'
);
dump_snapid
(
s
,
hoid
->
snapid
);
}
/*
 * Print one line per backoff installed on @osd, in id order:
 *
 *   osd<N> \t <spgid> \t <id> \t <begin> \t <end>
 *
 * Takes osd->lock for the duration of the walk.
 */
static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
{
	struct rb_node *n;

	mutex_lock(&osd->lock);

	n = rb_first(&osd->o_backoffs_by_id);
	while (n) {
		struct ceph_osd_backoff *backoff =
		    rb_entry(n, struct ceph_osd_backoff, id_node);

		seq_printf(s, "osd%d\t", osd->o_osd);
		dump_spgid(s, &backoff->spgid);
		seq_printf(s, "\t%llu\t", backoff->id);
		dump_hoid(s, backoff->begin);
		seq_putc(s, '\t');
		dump_hoid(s, backoff->end);
		seq_putc(s, '\n');

		n = rb_next(n);
	}

	mutex_unlock(&osd->lock);
}
static
int
osdc_show
(
struct
seq_file
*
s
,
void
*
pp
)
{
struct
ceph_client
*
client
=
s
->
private
;
...
...
@@ -268,6 +335,13 @@ static int osdc_show(struct seq_file *s, void *pp)
}
dump_linger_requests
(
s
,
&
osdc
->
homeless_osd
);
seq_puts
(
s
,
"BACKOFFS
\n
"
);
for
(
n
=
rb_first
(
&
osdc
->
osds
);
n
;
n
=
rb_next
(
n
))
{
struct
ceph_osd
*
osd
=
rb_entry
(
n
,
struct
ceph_osd
,
o_node
);
dump_backoffs
(
s
,
osd
);
}
up_read
(
&
osdc
->
lock
);
return
0
;
}
...
...
net/ceph/osd_client.c
View file @
a02a946d
...
...
@@ -50,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
struct
ceph_osd_linger_request
*
lreq
);
static
void
unlink_linger
(
struct
ceph_osd
*
osd
,
struct
ceph_osd_linger_request
*
lreq
);
static
void
clear_backoffs
(
struct
ceph_osd
*
osd
);
#if 1
static
inline
bool
rwsem_is_wrlocked
(
struct
rw_semaphore
*
sem
)
...
...
@@ -1019,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd)
RB_CLEAR_NODE
(
&
osd
->
o_node
);
osd
->
o_requests
=
RB_ROOT
;
osd
->
o_linger_requests
=
RB_ROOT
;
osd
->
o_backoff_mappings
=
RB_ROOT
;
osd
->
o_backoffs_by_id
=
RB_ROOT
;
INIT_LIST_HEAD
(
&
osd
->
o_osd_lru
);
INIT_LIST_HEAD
(
&
osd
->
o_keepalive_item
);
osd
->
o_incarnation
=
1
;
...
...
@@ -1030,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd)
WARN_ON
(
!
RB_EMPTY_NODE
(
&
osd
->
o_node
));
WARN_ON
(
!
RB_EMPTY_ROOT
(
&
osd
->
o_requests
));
WARN_ON
(
!
RB_EMPTY_ROOT
(
&
osd
->
o_linger_requests
));
WARN_ON
(
!
RB_EMPTY_ROOT
(
&
osd
->
o_backoff_mappings
));
WARN_ON
(
!
RB_EMPTY_ROOT
(
&
osd
->
o_backoffs_by_id
));
WARN_ON
(
!
list_empty
(
&
osd
->
o_osd_lru
));
WARN_ON
(
!
list_empty
(
&
osd
->
o_keepalive_item
));
...
...
@@ -1150,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd)
unlink_linger
(
osd
,
lreq
);
link_linger
(
&
osdc
->
homeless_osd
,
lreq
);
}
clear_backoffs
(
osd
);
__remove_osd_from_lru
(
osd
);
erase_osd
(
&
osdc
->
osds
,
osd
);
...
...
@@ -1431,6 +1437,328 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
return
ct_res
;
}
/*
 * Allocate an unlinked spg mapping with an empty backoff tree.
 * ->spgid is left for the caller to fill in.  Returns NULL if the
 * allocation fails.
 */
static struct ceph_spg_mapping *alloc_spg_mapping(void)
{
	struct ceph_spg_mapping *spg = kmalloc(sizeof(*spg), GFP_NOIO);

	if (spg) {
		RB_CLEAR_NODE(&spg->node);
		spg->backoffs = RB_ROOT;
	}

	return spg;
}
/*
 * Release an spg mapping.  Warns if it is still linked into a tree or
 * still has backoffs attached -- the memory is freed regardless.
 */
static void free_spg_mapping(struct ceph_spg_mapping *spg)
{
	WARN_ON(!RB_EMPTY_NODE(&spg->node));
	WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));

	kfree(spg);
}
/*
 * map<spg_t, ...> implemented as an rbtree of ceph_spg_mapping, along
 * the lines of ceph_pg_mapping.  It tracks OSD backoffs: a backoff
 * [range] only has meaning within one specific spgid and is not
 * handed down to children on split or over to another primary.
 *
 * Generates the insert/erase/lookup helpers for ->node, keyed by
 * ->spgid and ordered by ceph_spg_compare().
 */
DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid,
		 ceph_spg_compare, RB_BYPTR, const struct ceph_spg *, node)
/*
 * Sort key derived from the hash: the bit-reversed hash for regular
 * hoids, or a value just above the 32-bit range if ->is_max is set.
 */
static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
{
	if (hoid->is_max)
		return 0x100000000ull;

	return hoid->hash_reverse_bits;
}
/*
 * Return (through @pkey and @pkey_len) the name that @hoid is ordered
 * by: the explicit key if it is non-empty, the object name otherwise.
 * The returned pointer aliases @hoid's own storage.
 */
static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
				   void **pkey, size_t *pkey_len)
{
	bool has_key = hoid->key_len != 0;

	*pkey = has_key ? hoid->key : hoid->oid;
	*pkey_len = has_key ? hoid->key_len : hoid->oid_len;
}
/*
 * Three-way comparison of two length-counted byte strings.
 *
 * The common prefix is compared with memcmp(); if it is identical the
 * shorter name sorts first.  Returns a negative value, 0 or a positive
 * value if @name1 is less than, equal to or greater than @name2.
 *
 * An empty name may be passed as NULL with a length of 0 (this is
 * what a hoid view built from a request target carries for a missing
 * key or namespace).  memcmp() is skipped in that case because its
 * pointer arguments are required to be valid even for a zero length.
 */
static int compare_names(const void *name1, size_t name1_len,
			 const void *name2, size_t name2_len)
{
	size_t common_len = name1_len < name2_len ? name1_len : name2_len;
	int ret = 0;

	if (common_len)
		ret = memcmp(name1, name2, common_len);
	if (ret)
		return ret;

	/* identical prefix -- the length decides */
	if (name1_len < name2_len)
		return -1;
	if (name1_len > name2_len)
		return 1;

	return 0;
}
static
int
hoid_compare
(
const
struct
ceph_hobject_id
*
lhs
,
const
struct
ceph_hobject_id
*
rhs
)
{
void
*
effective_key1
,
*
effective_key2
;
size_t
effective_key1_len
,
effective_key2_len
;
int
ret
;
if
(
lhs
->
is_max
<
rhs
->
is_max
)
return
-
1
;
if
(
lhs
->
is_max
>
rhs
->
is_max
)
return
1
;
if
(
lhs
->
pool
<
rhs
->
pool
)
return
-
1
;
if
(
lhs
->
pool
>
rhs
->
pool
)
return
1
;
if
(
hoid_get_bitwise_key
(
lhs
)
<
hoid_get_bitwise_key
(
rhs
))
return
-
1
;
if
(
hoid_get_bitwise_key
(
lhs
)
>
hoid_get_bitwise_key
(
rhs
))
return
1
;
ret
=
compare_names
(
lhs
->
nspace
,
lhs
->
nspace_len
,
rhs
->
nspace
,
rhs
->
nspace_len
);
if
(
ret
)
return
ret
;
hoid_get_effective_key
(
lhs
,
&
effective_key1
,
&
effective_key1_len
);
hoid_get_effective_key
(
rhs
,
&
effective_key2
,
&
effective_key2_len
);
ret
=
compare_names
(
effective_key1
,
effective_key1_len
,
effective_key2
,
effective_key2_len
);
if
(
ret
)
return
ret
;
ret
=
compare_names
(
lhs
->
oid
,
lhs
->
oid_len
,
rhs
->
oid
,
rhs
->
oid_len
);
if
(
ret
)
return
ret
;
if
(
lhs
->
snapid
<
rhs
->
snapid
)
return
-
1
;
if
(
lhs
->
snapid
>
rhs
->
snapid
)
return
1
;
return
0
;
}
/*
* For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
* compat stuff here.
*
* Assumes @hoid is zero-initialized.
*/
static
int
decode_hoid
(
void
**
p
,
void
*
end
,
struct
ceph_hobject_id
*
hoid
)
{
u8
struct_v
;
u32
struct_len
;
int
ret
;
ret
=
ceph_start_decoding
(
p
,
end
,
4
,
"hobject_t"
,
&
struct_v
,
&
struct_len
);
if
(
ret
)
return
ret
;
if
(
struct_v
<
4
)
{
pr_err
(
"got struct_v %d < 4 of hobject_t
\n
"
,
struct_v
);
goto
e_inval
;
}
hoid
->
key
=
ceph_extract_encoded_string
(
p
,
end
,
&
hoid
->
key_len
,
GFP_NOIO
);
if
(
IS_ERR
(
hoid
->
key
))
{
ret
=
PTR_ERR
(
hoid
->
key
);
hoid
->
key
=
NULL
;
return
ret
;
}
hoid
->
oid
=
ceph_extract_encoded_string
(
p
,
end
,
&
hoid
->
oid_len
,
GFP_NOIO
);
if
(
IS_ERR
(
hoid
->
oid
))
{
ret
=
PTR_ERR
(
hoid
->
oid
);
hoid
->
oid
=
NULL
;
return
ret
;
}
ceph_decode_64_safe
(
p
,
end
,
hoid
->
snapid
,
e_inval
);
ceph_decode_32_safe
(
p
,
end
,
hoid
->
hash
,
e_inval
);
ceph_decode_8_safe
(
p
,
end
,
hoid
->
is_max
,
e_inval
);
hoid
->
nspace
=
ceph_extract_encoded_string
(
p
,
end
,
&
hoid
->
nspace_len
,
GFP_NOIO
);
if
(
IS_ERR
(
hoid
->
nspace
))
{
ret
=
PTR_ERR
(
hoid
->
nspace
);
hoid
->
nspace
=
NULL
;
return
ret
;
}
ceph_decode_64_safe
(
p
,
end
,
hoid
->
pool
,
e_inval
);
ceph_hoid_build_hash_cache
(
hoid
);
return
0
;
e_inval:
return
-
EINVAL
;
}
static
int
hoid_encoding_size
(
const
struct
ceph_hobject_id
*
hoid
)
{
return
8
+
4
+
1
+
8
+
/* snapid, hash, is_max, pool */
4
+
hoid
->
key_len
+
4
+
hoid
->
oid_len
+
4
+
hoid
->
nspace_len
;
}
/*
 * Encode @hoid as a hobject_t (struct_v 4, compat 3) at *@p and
 * advance *@p past it.  The field order mirrors decode_hoid():
 * key, oid, snapid, hash, is_max, nspace, pool.
 */
static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
{
	int payload_len = hoid_encoding_size(hoid);

	ceph_start_encoding(p, 4, 3, payload_len);

	ceph_encode_string(p, end, hoid->key, hoid->key_len);
	ceph_encode_string(p, end, hoid->oid, hoid->oid_len);

	ceph_encode_64(p, hoid->snapid);
	ceph_encode_32(p, hoid->hash);
	ceph_encode_8(p, hoid->is_max);

	ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
	ceph_encode_64(p, hoid->pool);
}
/*
 * Release a heap-allocated hoid together with its three name buffers.
 * A NULL @hoid is a no-op.
 */
static void free_hoid(struct ceph_hobject_id *hoid)
{
	if (!hoid)
		return;

	kfree(hoid->key);
	kfree(hoid->oid);
	kfree(hoid->nspace);
	kfree(hoid);
}
/*
 * Allocate a zeroed backoff that is not linked into either tree.
 * Returns NULL if the allocation fails.
 */
static struct ceph_osd_backoff *alloc_backoff(void)
{
	struct ceph_osd_backoff *backoff = kzalloc(sizeof(*backoff), GFP_NOIO);

	if (backoff) {
		RB_CLEAR_NODE(&backoff->spg_node);
		RB_CLEAR_NODE(&backoff->id_node);
	}

	return backoff;
}
/*
 * Release a backoff along with its ->begin and ->end hoids.  Warns if
 * the backoff is still linked into the per-spgid or the by-id tree.
 */
static void free_backoff(struct ceph_osd_backoff *backoff)
{
	WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
	WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));

	/* both hoids are owned by the backoff */
	free_hoid(backoff->begin);
	free_hoid(backoff->end);

	kfree(backoff);
}
/*
 * Backoffs of one spgid live in an rbtree ordered by their ->begin
 * hoid (compared with hoid_compare()) and linked through ->spg_node.
 * Only the insert/erase helpers are generated; lookups go through
 * lookup_containing_backoff().
 */
DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin,
			hoid_compare, RB_BYVAL, spg_node);
/*
 * Find a backoff in @root (a ceph_spg_mapping->backoffs tree) that
 * applies to @hoid: either @hoid equals the backoff's ->begin, or it
 * lies strictly between ->begin and ->end.  Returns NULL if the
 * descent finds no such backoff.
 */
static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
					const struct ceph_hobject_id *hoid)
{
	struct rb_node *n;

	for (n = root->rb_node; n; ) {
		struct ceph_osd_backoff *cur =
		    rb_entry(n, struct ceph_osd_backoff, spg_node);
		int cmp = hoid_compare(hoid, cur->begin);

		if (!cmp)
			return cur;

		if (cmp < 0) {
			n = n->rb_left;
			continue;
		}

		/* past ->begin: a hit if still short of ->end */
		if (hoid_compare(hoid, cur->end) < 0)
			return cur;

		n = n->rb_right;
	}

	return NULL;
}
/*
 * Backoff ids are unique within an OSD session, so a second rbtree
 * keyed by ->id (linked through ->id_node) is kept per OSD.
 */
DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
static
void
clear_backoffs
(
struct
ceph_osd
*
osd
)
{
while
(
!
RB_EMPTY_ROOT
(
&
osd
->
o_backoff_mappings
))
{
struct
ceph_spg_mapping
*
spg
=
rb_entry
(
rb_first
(
&
osd
->
o_backoff_mappings
),
struct
ceph_spg_mapping
,
node
);
while
(
!
RB_EMPTY_ROOT
(
&
spg
->
backoffs
))
{
struct
ceph_osd_backoff
*
backoff
=
rb_entry
(
rb_first
(
&
spg
->
backoffs
),
struct
ceph_osd_backoff
,
spg_node
);
erase_backoff
(
&
spg
->
backoffs
,
backoff
);
erase_backoff_by_id
(
&
osd
->
o_backoffs_by_id
,
backoff
);
free_backoff
(
backoff
);
}
erase_spg_mapping
(
&
osd
->
o_backoff_mappings
,
spg
);
free_spg_mapping
(
spg
);
}
}
/*
 * Fill @hoid with a temporary view of the object addressed by @t.
 * Nothing is copied: ->oid and ->nspace alias @t's storage, so the
 * result does not own its buffers and must not be freed.  There is
 * never a key; the namespace is empty if @t has no pool_ns.
 */
static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
				  const struct ceph_osd_request_target *t)
{
	/* location */
	hoid->pool = t->target_oloc.pool;
	hoid->nspace = NULL;
	hoid->nspace_len = 0;
	if (t->target_oloc.pool_ns) {
		hoid->nspace = t->target_oloc.pool_ns->str;
		hoid->nspace_len = t->target_oloc.pool_ns->len;
	}

	/* names */
	hoid->key = NULL;
	hoid->key_len = 0;
	hoid->oid = t->target_oid.name;
	hoid->oid_len = t->target_oid.name_len;

	hoid->snapid = CEPH_NOSNAP;
	hoid->is_max = false;

	hoid->hash = t->pgid.seed;
	ceph_hoid_build_hash_cache(hoid);
}
/*
 * Check whether @req is covered by a backoff installed on its OSD.
 *
 * Returns true if a backoff for the request's spgid applies to the
 * target object, in which case the request should not be sent for
 * now; false otherwise.
 */
static bool should_plug_request(struct ceph_osd_request *req)
{
	struct ceph_osd *osd = req->r_osd;
	struct ceph_spg_mapping *spg;
	struct ceph_osd_backoff *backoff;
	struct ceph_hobject_id hoid;

	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
	if (!spg)
		return false;

	hoid_fill_from_target(&hoid, &req->r_t);
	backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
	if (backoff) {
		dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
		     __func__, req, req->r_tid, osd->o_osd,
		     backoff->spgid.pgid.pool, backoff->spgid.pgid.seed,
		     backoff->spgid.shard, backoff->id);
		return true;
	}

	return false;
}
static
void
setup_request_data
(
struct
ceph_osd_request
*
req
,
struct
ceph_msg
*
msg
)
{
...
...
@@ -1707,6 +2035,10 @@ static void send_request(struct ceph_osd_request *req)
verify_osd_locked
(
osd
);
WARN_ON
(
osd
->
o_osd
!=
req
->
r_t
.
osd
);
/* backoff? */
if
(
should_plug_request
(
req
))
return
;
/*
* We may have a previously queued request message hanging
* around. Cancel it to avoid corrupting the msgr.
...
...
@@ -3527,6 +3859,8 @@ static void kick_osd_requests(struct ceph_osd *osd)
{
struct
rb_node
*
n
;
clear_backoffs
(
osd
);
for
(
n
=
rb_first
(
&
osd
->
o_requests
);
n
;
)
{
struct
ceph_osd_request
*
req
=
rb_entry
(
n
,
struct
ceph_osd_request
,
r_node
);
...
...
@@ -3572,6 +3906,261 @@ static void osd_fault(struct ceph_connection *con)
up_write
(
&
osdc
->
lock
);
}
/*
 * Decoded form of an MOSDBackoff message.  ->begin and ->end are
 * allocated by decode_MOSDBackoff() and released with free_hoid()
 * unless ownership has been moved elsewhere (the pointer is set to
 * NULL in that case).
 */
struct MOSDBackoff {
	struct ceph_spg spgid;
	u32 map_epoch;
	u8 op;				/* CEPH_OSD_BACKOFF_OP_* */
	u64 id;
	struct ceph_hobject_id *begin;
	struct ceph_hobject_id *end;
};
/*
 * Decode the front of @msg into @m: spgid, map_epoch, op, id, begin
 * and end, in that order.
 *
 * On success @m->begin and @m->end are allocated and become the
 * caller's to free.  On failure everything allocated here has been
 * released again and @m must not be used.
 *
 * Returns 0, -EINVAL on a short message, -ENOMEM, or the error from
 * one of the nested decoders.
 */
static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
{
	void *p = msg->front.iov_base;
	void *const end = p + msg->front.iov_len;
	u8 struct_v;
	u32 struct_len;
	int ret;

	/* spgid: versioned block wrapping pgid + shard */
	ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
	if (ret)
		return ret;

	ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
	if (ret)
		return ret;

	ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
	ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
	ceph_decode_8_safe(&p, end, m->op, e_inval);
	ceph_decode_64_safe(&p, end, m->id, e_inval);

	/* decode_hoid() wants zeroed hoids, hence kzalloc() */
	m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
	if (!m->begin)
		return -ENOMEM;

	ret = decode_hoid(&p, end, m->begin);
	if (ret)
		goto err_free_begin;

	m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
	if (!m->end) {
		ret = -ENOMEM;
		goto err_free_begin;
	}

	ret = decode_hoid(&p, end, m->end);
	if (ret)
		goto err_free_end;

	return 0;

err_free_end:
	free_hoid(m->end);
err_free_begin:
	free_hoid(m->begin);
	return ret;

e_inval:
	return -EINVAL;
}
/*
 * Build the ACK_BLOCK reply for @backoff: spgid, @map_epoch, the
 * ACK_BLOCK op code, the backoff id and the begin/end hoids.
 *
 * The front is sized exactly; a mismatch between the computed size
 * and what was encoded trips the BUG_ON().  Returns NULL if the
 * message cannot be allocated.
 */
static struct ceph_msg *create_backoff_message(
				const struct ceph_osd_backoff *backoff,
				u32 map_epoch)
{
	struct ceph_msg *msg;
	void *p, *end;
	int spgid_len, fixed_len, begin_len, end_len;
	int msg_size;

	spgid_len = CEPH_ENCODING_START_BLK_LEN + CEPH_PGID_ENCODING_LEN + 1;
	fixed_len = 4 + 1 + 8;		/* map_epoch, op, id */
	begin_len = CEPH_ENCODING_START_BLK_LEN +
		    hoid_encoding_size(backoff->begin);
	end_len = CEPH_ENCODING_START_BLK_LEN +
		  hoid_encoding_size(backoff->end);
	msg_size = spgid_len + fixed_len + begin_len + end_len;

	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
	if (!msg)
		return NULL;

	p = msg->front.iov_base;
	end = p + msg->front_alloc_len;

	encode_spgid(&p, &backoff->spgid);
	ceph_encode_32(&p, map_epoch);
	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
	ceph_encode_64(&p, backoff->id);
	encode_hoid(&p, end, backoff->begin);
	encode_hoid(&p, end, backoff->end);
	BUG_ON(p != end);

	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.version = cpu_to_le16(1);	/* MOSDBackoff v1 */
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

	return msg;
}
/*
 * Handle a BLOCK request from @osd: install the backoff described by
 * @m and acknowledge it.
 *
 * The backoff is linked into the mapping for @m->spgid (created on
 * demand) and into the OSD's by-id tree.  Ownership of @m->begin and
 * @m->end moves to the new backoff; both pointers are set to NULL in
 * @m so that the caller's free_hoid() calls become no-ops.  If an
 * allocation fails before that point @m is left untouched.
 *
 * Allocation failures are logged and otherwise ignored.  If the ack
 * message cannot be allocated the backoff stays installed but no ack
 * is sent.
 *
 * handle_backoff() calls this with osd->lock held.
 */
static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
{
	struct ceph_spg_mapping *spg;
	struct ceph_osd_backoff *backoff;
	struct ceph_msg *msg;

	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
	     m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);

	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
	if (!spg) {
		spg = alloc_spg_mapping();
		if (!spg) {
			pr_err("%s failed to allocate spg\n", __func__);
			return;
		}
		spg->spgid = m->spgid; /* struct */
		insert_spg_mapping(&osd->o_backoff_mappings, spg);
	}

	backoff = alloc_backoff();
	if (!backoff) {
		pr_err("%s failed to allocate backoff\n", __func__);
		/*
		 * A mapping is only kept around while it has at least
		 * one backoff (the unblock path erases it as soon as
		 * it becomes empty).  Don't leave behind the empty one
		 * that was just created above.
		 */
		if (RB_EMPTY_ROOT(&spg->backoffs)) {
			erase_spg_mapping(&osd->o_backoff_mappings, spg);
			free_spg_mapping(spg);
		}
		return;
	}
	backoff->spgid = m->spgid; /* struct */
	backoff->id = m->id;
	backoff->begin = m->begin;
	m->begin = NULL; /* backoff now owns this */
	backoff->end = m->end;
	m->end = NULL;   /* ditto */

	insert_backoff(&spg->backoffs, backoff);
	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);

	/*
	 * Ack with original backoff's epoch so that the OSD can
	 * discard this if there was a PG split.
	 */
	msg = create_backoff_message(backoff, m->map_epoch);
	if (!msg) {
		pr_err("%s failed to allocate msg\n", __func__);
		return;
	}
	ceph_con_send(&osd->o_con, msg);
}
static
bool
target_contained_by
(
const
struct
ceph_osd_request_target
*
t
,
const
struct
ceph_hobject_id
*
begin
,
const
struct
ceph_hobject_id
*
end
)
{
struct
ceph_hobject_id
hoid
;
int
cmp
;
hoid_fill_from_target
(
&
hoid
,
t
);
cmp
=
hoid_compare
(
&
hoid
,
begin
);
return
!
cmp
||
(
cmp
>
0
&&
hoid_compare
(
&
hoid
,
end
)
<
0
);
}
static
void
handle_backoff_unblock
(
struct
ceph_osd
*
osd
,
const
struct
MOSDBackoff
*
m
)
{
struct
ceph_spg_mapping
*
spg
;
struct
ceph_osd_backoff
*
backoff
;
struct
rb_node
*
n
;
dout
(
"%s osd%d spgid %llu.%xs%d id %llu
\n
"
,
__func__
,
osd
->
o_osd
,
m
->
spgid
.
pgid
.
pool
,
m
->
spgid
.
pgid
.
seed
,
m
->
spgid
.
shard
,
m
->
id
);
backoff
=
lookup_backoff_by_id
(
&
osd
->
o_backoffs_by_id
,
m
->
id
);
if
(
!
backoff
)
{
pr_err
(
"%s osd%d spgid %llu.%xs%d id %llu backoff dne
\n
"
,
__func__
,
osd
->
o_osd
,
m
->
spgid
.
pgid
.
pool
,
m
->
spgid
.
pgid
.
seed
,
m
->
spgid
.
shard
,
m
->
id
);
return
;
}
if
(
hoid_compare
(
backoff
->
begin
,
m
->
begin
)
&&
hoid_compare
(
backoff
->
end
,
m
->
end
))
{
pr_err
(
"%s osd%d spgid %llu.%xs%d id %llu bad range?
\n
"
,
__func__
,
osd
->
o_osd
,
m
->
spgid
.
pgid
.
pool
,
m
->
spgid
.
pgid
.
seed
,
m
->
spgid
.
shard
,
m
->
id
);
/* unblock it anyway... */
}
spg
=
lookup_spg_mapping
(
&
osd
->
o_backoff_mappings
,
&
backoff
->
spgid
);
BUG_ON
(
!
spg
);
erase_backoff
(
&
spg
->
backoffs
,
backoff
);
erase_backoff_by_id
(
&
osd
->
o_backoffs_by_id
,
backoff
);
free_backoff
(
backoff
);
if
(
RB_EMPTY_ROOT
(
&
spg
->
backoffs
))
{
erase_spg_mapping
(
&
osd
->
o_backoff_mappings
,
spg
);
free_spg_mapping
(
spg
);
}
for
(
n
=
rb_first
(
&
osd
->
o_requests
);
n
;
n
=
rb_next
(
n
))
{
struct
ceph_osd_request
*
req
=
rb_entry
(
n
,
struct
ceph_osd_request
,
r_node
);
if
(
!
ceph_spg_compare
(
&
req
->
r_t
.
spgid
,
&
m
->
spgid
))
{
/*
* Match against @m, not @backoff -- the PG may
* have split on the OSD.
*/
if
(
target_contained_by
(
&
req
->
r_t
,
m
->
begin
,
m
->
end
))
{
/*
* If no other installed backoff applies,
* resend.
*/
send_request
(
req
);
}
}
}
}
/*
 * Entry point for CEPH_MSG_OSD_BACKOFF messages.
 *
 * Takes osdc->lock for reading and, once @osd is known to still be
 * registered, osd->lock; the message is decoded and dispatched on its
 * op under both.  Whatever hoids the handler did not take over are
 * freed here.  Undecodable messages are logged and dumped, unknown
 * ops are logged.
 */
static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct MOSDBackoff m;
	int ret;

	down_read(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown\n", __func__, osd->o_osd);
		goto out_unlock_osdc;
	}
	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));

	mutex_lock(&osd->lock);
	ret = decode_MOSDBackoff(msg, &m);
	if (ret) {
		pr_err("failed to decode MOSDBackoff: %d\n", ret);
		ceph_msg_dump(msg);
		goto out_unlock_osd;
	}

	if (m.op == CEPH_OSD_BACKOFF_OP_BLOCK)
		handle_backoff_block(osd, &m);
	else if (m.op == CEPH_OSD_BACKOFF_OP_UNBLOCK)
		handle_backoff_unblock(osd, &m);
	else
		pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);

	/* NULL by now if the block handler took ownership */
	free_hoid(m.begin);
	free_hoid(m.end);

out_unlock_osd:
	mutex_unlock(&osd->lock);
out_unlock_osdc:
	up_read(&osdc->lock);
}
/*
* Process osd watch notifications
*/
...
...
@@ -4509,6 +5098,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
case
CEPH_MSG_OSD_OPREPLY
:
handle_reply
(
osd
,
msg
);
break
;
case
CEPH_MSG_OSD_BACKOFF
:
handle_backoff
(
osd
,
msg
);
break
;
case
CEPH_MSG_WATCH_NOTIFY
:
handle_watch_notify
(
osdc
,
msg
);
break
;
...
...
@@ -4631,6 +5223,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
*
skip
=
0
;
switch
(
type
)
{
case
CEPH_MSG_OSD_MAP
:
case
CEPH_MSG_OSD_BACKOFF
:
case
CEPH_MSG_WATCH_NOTIFY
:
return
alloc_msg_with_page_vector
(
hdr
);
case
CEPH_MSG_OSD_OPREPLY
:
...
...
net/ceph/osdmap.c
View file @
a02a946d
...
...
@@ -418,6 +418,22 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
return
0
;
}
/*
 * Three-way comparison of two spgids: by pgid first (as ordered by
 * ceph_pg_compare()), then by shard.
 */
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
{
	int ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);

	if (ret)
		return ret;

	if (lhs->shard != rhs->shard)
		return lhs->shard < rhs->shard ? -1 : 1;

	return 0;
}
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds) and primary_temp (explicit primary setting)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment