Commit a1f0358b authored by Kent Overstreet

bcache: Incremental gc

Big garbage collection rewrite; now, garbage collection uses the same
mechanisms as used elsewhere for inserting/updating btree node pointers,
instead of rewriting interior btree nodes in place.

This makes the code significantly cleaner and less fragile, and means we
can now make garbage collection incremental - it doesn't have to hold a
write lock on the root of the btree for the entire duration of garbage
collection.

This means there's less of a latency hit for doing garbage
collection, so we can gc more frequently (and do a better job
of reclaiming from the cache), and we can coalesce across more btree
nodes (improving our space efficiency).

Signed-off-by: Kent Overstreet <kmo@daterainc.com>
parent 8835c123
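The shape of the change, reduced to its essence: each GC pass now does a bounded chunk of work from a saved cursor and returns an "again" code instead of holding the btree root write-locked until it finishes, and the caller simply re-invokes it until a pass runs to completion (the do/while loop in bch_btree_gc in the diff below). What follows is a minimal stand-alone sketch of that retry pattern, with made-up names (gc_pass, gc_cursor, BATCH, NR_NODES) rather than the actual bcache functions:

#include <errno.h>
#include <stdio.h>

#define NR_NODES 1000   /* pretend btree size */
#define BATCH      64   /* bounded work per pass before yielding */

static unsigned gc_cursor;      /* plays the role of c->gc_done */

/* One bounded GC pass: process up to BATCH nodes, then yield. */
static int gc_pass(void)
{
        unsigned done = 0;

        while (gc_cursor < NR_NODES) {
                /* mark/copy the node at gc_cursor here */
                gc_cursor++;

                if (++done == BATCH)
                        return -EAGAIN; /* drop locks, resume later */
        }

        gc_cursor = 0;                  /* full pass finished */
        return 0;
}

int main(void)
{
        int ret, passes = 0;

        /* mirrors: do { ret = btree_root(gc_root, ...); } while (ret); */
        do {
                ret = gc_pass();
                passes++;
        } while (ret == -EAGAIN);

        printf("gc finished after %d passes\n", passes);
        return 0;
}

Because each pass picks up where the previous one left off, other btree users only ever wait for one bounded pass rather than for an entire collection.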
@@ -477,7 +477,6 @@ struct gc_stat {
         size_t                  nkeys;
         uint64_t                data;   /* sectors */
-        uint64_t                dirty;  /* sectors */
         unsigned                in_use; /* percent */
 };
@@ -1176,12 +1176,10 @@ uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
 #define btree_mark_key(b, k)    __bch_btree_mark_key(b->c, b->level, k)
 
-static int btree_gc_mark_node(struct btree *b, unsigned *keys,
-                              struct gc_stat *gc)
+static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
 {
         uint8_t stale = 0;
-        unsigned last_dev = -1;
-        struct bcache_device *d = NULL;
+        unsigned keys = 0, good_keys = 0;
         struct bkey *k;
         struct btree_iter iter;
         struct bset_tree *t;
@@ -1189,27 +1187,17 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
         gc->nodes++;
 
         for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
-                if (last_dev != KEY_INODE(k)) {
-                        last_dev = KEY_INODE(k);
-
-                        d = KEY_INODE(k) < b->c->nr_uuids
-                                ? b->c->devices[last_dev]
-                                : NULL;
-                }
-
                 stale = max(stale, btree_mark_key(b, k));
+                keys++;
 
                 if (bch_ptr_bad(b, k))
                         continue;
 
-                *keys += bkey_u64s(k);
-
                 gc->key_bytes += bkey_u64s(k);
                 gc->nkeys++;
+                good_keys++;
 
                 gc->data += KEY_SIZE(k);
-                if (KEY_DIRTY(k))
-                        gc->dirty += KEY_SIZE(k);
         }
 
         for (t = b->sets; t <= &b->sets[b->nsets]; t++)
@@ -1218,79 +1206,74 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys,
                              bkey_cmp(&b->key, &t->end) < 0,
                              b, "found short btree key in gc");
 
-        return stale;
-}
-
-static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k)
-{
-        /*
-         * We block priorities from being written for the duration of garbage
-         * collection, so we can't sleep in btree_alloc() ->
-         * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
-         * our closure.
-         */
-        struct btree *n = btree_node_alloc_replacement(b);
-
-        if (!IS_ERR_OR_NULL(n)) {
-                swap(b, n);
-
-                memcpy(k->ptr, b->key.ptr,
-                       sizeof(uint64_t) * KEY_PTRS(&b->key));
-
-                btree_node_free(n);
-                up_write(&n->lock);
-        }
-
-        return b;
-}
-
-/*
- * Leaving this at 2 until we've got incremental garbage collection done; it
- * could be higher (and has been tested with 4) except that garbage collection
- * could take much longer, adversely affecting latency.
- */
-#define GC_MERGE_NODES  2U
+        if (b->c->gc_always_rewrite)
+                return true;
+
+        if (stale > 10)
+                return true;
+
+        if ((keys - good_keys) * 2 > keys)
+                return true;
+
+        return false;
+}
+
+#define GC_MERGE_NODES  4U
 
 struct gc_merge_info {
         struct btree    *b;
-        struct bkey     *k;
         unsigned        keys;
 };
 
-static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
-                              struct gc_merge_info *r)
+static int bch_btree_insert_node(struct btree *, struct btree_op *,
+                                 struct keylist *, atomic_t *, struct bkey *);
+
+static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
+                             struct keylist *keylist, struct gc_stat *gc,
+                             struct gc_merge_info *r)
 {
-        unsigned nodes = 0, keys = 0, blocks;
-        int i;
+        unsigned i, nodes = 0, keys = 0, blocks;
+        struct btree *new_nodes[GC_MERGE_NODES];
         struct closure cl;
+        struct bkey *k;
 
+        memset(new_nodes, 0, sizeof(new_nodes));
         closure_init_stack(&cl);
 
-        while (nodes < GC_MERGE_NODES && r[nodes].b)
+        while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
                 keys += r[nodes++].keys;
 
         blocks = btree_default_blocks(b->c) * 2 / 3;
 
         if (nodes < 2 ||
             __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
-                return;
-
-        for (i = nodes - 1; i >= 0; --i) {
-                if (r[i].b->written)
-                        r[i].b = btree_gc_alloc(r[i].b, r[i].k);
+                return 0;
 
-                if (r[i].b->written)
-                        return;
+        for (i = 0; i < nodes; i++) {
+                new_nodes[i] = btree_node_alloc_replacement(r[i].b);
+                if (IS_ERR_OR_NULL(new_nodes[i]))
+                        goto out_nocoalesce;
         }
 
         for (i = nodes - 1; i > 0; --i) {
-                struct bset *n1 = r[i].b->sets->data;
-                struct bset *n2 = r[i - 1].b->sets->data;
+                struct bset *n1 = new_nodes[i]->sets->data;
+                struct bset *n2 = new_nodes[i - 1]->sets->data;
                 struct bkey *k, *last = NULL;
 
                 keys = 0;
 
-                if (i == 1) {
+                if (i > 1) {
+                        for (k = n2->start;
+                             k < end(n2);
+                             k = bkey_next(k)) {
+                                if (__set_blocks(n1, n1->keys + keys +
+                                                 bkey_u64s(k), b->c) > blocks)
+                                        break;
+
+                                last = k;
+                                keys += bkey_u64s(k);
+                        }
+                } else {
                         /*
                          * Last node we're not getting rid of - we're getting
                          * rid of the node at r[0]. Have to try and fit all of
@@ -1299,37 +1282,27 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
                          * length keys (shouldn't be possible in practice,
                          * though)
                          */
-                        if (__set_blocks(n1, n1->keys + r->keys,
-                                         b->c) > btree_blocks(r[i].b))
-                                return;
+                        if (__set_blocks(n1, n1->keys + n2->keys,
+                                         b->c) > btree_blocks(new_nodes[i]))
+                                goto out_nocoalesce;
 
                         keys = n2->keys;
+                        /* Take the key of the node we're getting rid of */
                         last = &r->b->key;
-                } else
-                        for (k = n2->start;
-                             k < end(n2);
-                             k = bkey_next(k)) {
-                                if (__set_blocks(n1, n1->keys + keys +
-                                                 bkey_u64s(k), b->c) > blocks)
-                                        break;
-
-                                last = k;
-                                keys += bkey_u64s(k);
-                        }
+                }
 
                 BUG_ON(__set_blocks(n1, n1->keys + keys,
-                                    b->c) > btree_blocks(r[i].b));
+                                    b->c) > btree_blocks(new_nodes[i]));
 
-                if (last) {
-                        bkey_copy_key(&r[i].b->key, last);
-                        bkey_copy_key(r[i].k, last);
-                }
+                if (last)
+                        bkey_copy_key(&new_nodes[i]->key, last);
 
                 memcpy(end(n1),
                        n2->start,
                        (void *) node(n2, keys) - (void *) n2->start);
 
                 n1->keys += keys;
+                r[i].keys = n1->keys;
 
                 memmove(n2->start,
                         node(n2, keys),
@@ -1337,93 +1310,175 @@ static void btree_gc_coalesce(struct btree *b, struct gc_stat *gc,
                 n2->keys -= keys;
 
-                r[i].keys       = n1->keys;
-                r[i - 1].keys   = n2->keys;
+                if (bch_keylist_realloc(keylist,
+                                        KEY_PTRS(&new_nodes[i]->key), b->c))
+                        goto out_nocoalesce;
+
+                bch_btree_node_write(new_nodes[i], &cl);
+                bch_keylist_add(keylist, &new_nodes[i]->key);
         }
 
-        btree_node_free(r->b);
-        up_write(&r->b->lock);
+        for (i = 0; i < nodes; i++) {
+                if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
+                        goto out_nocoalesce;
 
-        trace_bcache_btree_gc_coalesce(nodes);
+                make_btree_freeing_key(r[i].b, keylist->top);
+                bch_keylist_push(keylist);
+        }
+
+        /* We emptied out this node */
+        BUG_ON(new_nodes[0]->sets->data->keys);
+        btree_node_free(new_nodes[0]);
+        rw_unlock(true, new_nodes[0]);
 
+        closure_sync(&cl);
+
+        for (i = 0; i < nodes; i++) {
+                btree_node_free(r[i].b);
+                rw_unlock(true, r[i].b);
+
+                r[i].b = new_nodes[i];
+        }
+
+        bch_btree_insert_node(b, op, keylist, NULL, NULL);
+        BUG_ON(!bch_keylist_empty(keylist));
+
+        memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
+        r[nodes - 1].b = ERR_PTR(-EINTR);
+
+        trace_bcache_btree_gc_coalesce(nodes);
         gc->nodes--;
-        nodes--;
 
-        memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes);
-        memset(&r[nodes], 0, sizeof(struct gc_merge_info));
+        /* Invalidated our iterator */
+        return -EINTR;
+
+out_nocoalesce:
+        closure_sync(&cl);
+
+        while ((k = bch_keylist_pop(keylist)))
+                if (!bkey_cmp(k, &ZERO_KEY))
+                        atomic_dec(&b->c->prio_blocked);
+
+        for (i = 0; i < nodes; i++)
+                if (!IS_ERR_OR_NULL(new_nodes[i])) {
+                        btree_node_free(new_nodes[i]);
+                        rw_unlock(true, new_nodes[i]);
+                }
+        return 0;
 }
 
-static int btree_gc_recurse(struct btree *b, struct btree_op *op,
-                            struct closure *writes, struct gc_stat *gc)
+static unsigned btree_gc_count_keys(struct btree *b)
 {
-        void write(struct btree *r)
-        {
-                if (!r->written || btree_node_dirty(r))
-                        bch_btree_node_write(r, writes);
-
-                up_write(&r->lock);
-        }
+        struct bkey *k;
+        struct btree_iter iter;
+        unsigned ret = 0;
 
-        int ret = 0, stale;
+        for_each_key_filter(b, k, &iter, bch_ptr_bad)
+                ret += bkey_u64s(k);
+
+        return ret;
+}
+
+static int btree_gc_recurse(struct btree *b, struct btree_op *op,
+                            struct closure *writes, struct gc_stat *gc)
+{
         unsigned i;
+        int ret = 0;
+        bool should_rewrite;
+        struct btree *n;
+        struct bkey *k;
+        struct keylist keys;
+        struct btree_iter iter;
         struct gc_merge_info r[GC_MERGE_NODES];
+        struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
 
-        memset(r, 0, sizeof(r));
+        bch_keylist_init(&keys);
+        bch_btree_iter_init(b, &iter, &b->c->gc_done);
 
-        while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) {
-                r->b = bch_btree_node_get(b->c, r->k, b->level - 1, true);
+        for (i = 0; i < GC_MERGE_NODES; i++)
+                r[i].b = ERR_PTR(-EINTR);
 
-                if (IS_ERR(r->b)) {
-                        ret = PTR_ERR(r->b);
-                        break;
+        while (1) {
+                k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
+                if (k) {
+                        r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
+                        if (IS_ERR(r->b)) {
+                                ret = PTR_ERR(r->b);
+                                break;
+                        }
+
+                        r->keys = btree_gc_count_keys(r->b);
+
+                        ret = btree_gc_coalesce(b, op, &keys, gc, r);
+                        if (ret)
+                                break;
                 }
 
-                r->keys = 0;
-                stale = btree_gc_mark_node(r->b, &r->keys, gc);
+                if (!last->b)
+                        break;
 
-                if (!b->written &&
-                    (r->b->level || stale > 10 ||
-                     b->c->gc_always_rewrite))
-                        r->b = btree_gc_alloc(r->b, r->k);
+                if (!IS_ERR(last->b)) {
+                        should_rewrite = btree_gc_mark_node(last->b, gc);
+                        if (should_rewrite) {
+                                n = btree_node_alloc_replacement(last->b);
 
-                if (r->b->level)
-                        ret = btree_gc_recurse(r->b, op, writes, gc);
+                                if (!IS_ERR_OR_NULL(n)) {
+                                        bch_btree_node_write_sync(n);
+                                        bch_keylist_add(&keys, &n->key);
 
-                if (ret) {
-                        write(r->b);
-                        break;
-                }
+                                        make_btree_freeing_key(last->b,
+                                                               keys.top);
+                                        bch_keylist_push(&keys);
 
-                bkey_copy_key(&b->c->gc_done, r->k);
+                                        btree_node_free(last->b);
 
-                if (!b->written)
-                        btree_gc_coalesce(b, gc, r);
+                                        bch_btree_insert_node(b, op, &keys,
+                                                              NULL, NULL);
+                                        BUG_ON(!bch_keylist_empty(&keys));
 
-                if (r[GC_MERGE_NODES - 1].b)
-                        write(r[GC_MERGE_NODES - 1].b);
+                                        rw_unlock(true, last->b);
+                                        last->b = n;
 
-                memmove(&r[1], &r[0],
-                        sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1));
+                                        /* Invalidated our iterator */
+                                        ret = -EINTR;
+                                        break;
+                                }
+                        }
 
-                /* When we've got incremental GC working, we'll want to do
-                 * if (should_resched())
-                 *      return -EAGAIN;
-                 */
-                cond_resched();
-#if 0
+                        if (last->b->level) {
+                                ret = btree_gc_recurse(last->b, op, writes, gc);
+                                if (ret)
+                                        break;
+                        }
+
+                        bkey_copy_key(&b->c->gc_done, &last->b->key);
+
+                        /*
+                         * Must flush leaf nodes before gc ends, since replace
+                         * operations aren't journalled
+                         */
+                        if (btree_node_dirty(last->b))
+                                bch_btree_node_write(last->b, writes);
+                        rw_unlock(true, last->b);
+                }
+
+                memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
+                r->b = NULL;
+
                 if (need_resched()) {
                         ret = -EAGAIN;
                         break;
                 }
-#endif
         }
 
-        for (i = 1; i < GC_MERGE_NODES && r[i].b; i++)
-                write(r[i].b);
+        for (i = 0; i < GC_MERGE_NODES; i++)
+                if (!IS_ERR_OR_NULL(r[i].b)) {
+                        if (btree_node_dirty(r[i].b))
+                                bch_btree_node_write(r[i].b, writes);
+                        rw_unlock(true, r[i].b);
+                }
 
-        /* Might have freed some children, must remove their keys */
-        if (!b->written)
-                bch_btree_sort(b);
+        bch_keylist_free(&keys);
 
         return ret;
 }
@@ -1432,27 +1487,31 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
                              struct closure *writes, struct gc_stat *gc)
 {
         struct btree *n = NULL;
-        unsigned keys = 0;
-        int ret = 0, stale = btree_gc_mark_node(b, &keys, gc);
+        int ret = 0;
+        bool should_rewrite;
 
-        if (b->level || stale > 10)
+        should_rewrite = btree_gc_mark_node(b, gc);
+        if (should_rewrite) {
                 n = btree_node_alloc_replacement(b);
 
-        if (!IS_ERR_OR_NULL(n))
-                swap(b, n);
+                if (!IS_ERR_OR_NULL(n)) {
+                        bch_btree_node_write_sync(n);
+                        bch_btree_set_root(n);
+                        btree_node_free(b);
+                        rw_unlock(true, n);
 
-        if (b->level)
-                ret = btree_gc_recurse(b, op, writes, gc);
+                        return -EINTR;
+                }
+        }
 
-        if (!b->written || btree_node_dirty(b))
-                bch_btree_node_write_sync(b);
+        if (b->level) {
+                ret = btree_gc_recurse(b, op, writes, gc);
+                if (ret)
+                        return ret;
+        }
 
-        if (!IS_ERR_OR_NULL(n)) {
-                bch_btree_set_root(b);
-                btree_node_free(n);
-                rw_unlock(true, b);
-        }
+        bkey_copy_key(&b->c->gc_done, &b->key);
 
         return ret;
 }
@@ -1550,29 +1609,20 @@ static void bch_btree_gc(struct cache_set *c)
         btree_gc_start(c);
 
-        atomic_inc(&c->prio_blocked);
-
-        ret = btree_root(gc_root, c, &op, &writes, &stats);
-        closure_sync(&writes);
-
-        if (ret) {
-                pr_warn("gc failed!");
-                return;
-        }
+        do {
+                ret = btree_root(gc_root, c, &op, &writes, &stats);
+                closure_sync(&writes);
 
-        /* Possibly wait for new UUIDs or whatever to hit disk */
-        bch_journal_meta(c, &writes);
-        closure_sync(&writes);
+                if (ret && ret != -EAGAIN)
+                        pr_warn("gc failed!");
+        } while (ret);
 
         available = bch_btree_gc_finish(c);
-
-        atomic_dec(&c->prio_blocked);
         wake_up_allocators(c);
 
         bch_time_stats_update(&c->btree_gc_time, start_time);
 
         stats.key_bytes *= sizeof(uint64_t);
-        stats.dirty     <<= 9;
         stats.data      <<= 9;
         stats.in_use    = (c->nbuckets - available) * 100 / c->nbuckets;
 
         memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
@@ -1585,14 +1635,28 @@ static void bch_btree_gc(struct cache_set *c)
 static int bch_gc_thread(void *arg)
 {
         struct cache_set *c = arg;
+        struct cache *ca;
+        unsigned i;
 
         while (1) {
+again:
                 bch_btree_gc(c);
 
                 set_current_state(TASK_INTERRUPTIBLE);
                 if (kthread_should_stop())
                         break;
 
+                mutex_lock(&c->bucket_lock);
+
+                for_each_cache(ca, c, i)
+                        if (ca->invalidate_needs_gc) {
+                                mutex_unlock(&c->bucket_lock);
+                                set_current_state(TASK_RUNNING);
+                                goto again;
+                        }
+
+                mutex_unlock(&c->bucket_lock);
+
                 try_to_freeze();
                 schedule();
         }
@@ -2083,8 +2147,6 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
         bch_keylist_init(&split_keys);
 
-        BUG_ON(b->level);
-
         do {
                 BUG_ON(b->level && replace_key);
@@ -201,7 +201,7 @@ static inline bool bkey_written(struct btree *b, struct bkey *k)
 static inline void set_gc_sectors(struct cache_set *c)
 {
-        atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8);
+        atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
 }
 
 static inline struct bkey *bch_btree_iter_init(struct btree *b,
@@ -489,7 +489,6 @@ SHOW(__bch_cache_set)
         sysfs_print(btree_used_percent, btree_used(c));
         sysfs_print(btree_nodes,        c->gc_stats.nodes);
-        sysfs_hprint(dirty_data,        c->gc_stats.dirty);
         sysfs_hprint(average_key_size,  average_key_size(c));
 
         sysfs_print(cache_read_races,
@@ -642,7 +641,6 @@ static struct attribute *bch_cache_set_files[] = {
         &sysfs_cache_available_percent,
         &sysfs_average_key_size,
-        &sysfs_dirty_data,
         &sysfs_errors,
         &sysfs_io_error_limit,