Commit b20ba1bc authored by Javier González, committed by Jens Axboe

lightnvm: pblk: redesign GC algorithm

At the moment, in order to get enough read parallelism, we have recycled
several lines at the same time. This approach has proven not to work
well when reaching capacity, since we end up mixing valid data from all
lines, thus not maintaining a sustainable free/recycled line ratio.

The new design relies on a two-level workqueue mechanism. In the first
level, we read the metadata for a number of lines based on the GC list
they reside on (this is governed by the number of valid sectors in each
line). In the second level, we recycle a single line at a time. Here, we
issue reads in parallel, while a single GC write thread places data in
the write buffer. This design allows us to (i) only move data from one
line at a time, thus maintaining a sane free/recycled ratio, and (ii)
keep the GC writer busy with recycled data.
Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 476118c9
......@@ -302,12 +302,12 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
line->gc_group = PBLK_LINEGC_FULL;
move_list = &l_mg->gc_full_list;
}
} else if (vsc < lm->mid_thrs) {
} else if (vsc < lm->high_thrs) {
if (line->gc_group != PBLK_LINEGC_HIGH) {
line->gc_group = PBLK_LINEGC_HIGH;
move_list = &l_mg->gc_high_list;
}
} else if (vsc < lm->high_thrs) {
} else if (vsc < lm->mid_thrs) {
if (line->gc_group != PBLK_LINEGC_MID) {
line->gc_group = PBLK_LINEGC_MID;
move_list = &l_mg->gc_mid_list;
......@@ -1199,6 +1199,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
if (pblk_line_prepare(pblk, line)) {
pr_err("pblk: failed to prepare line %d\n", line->id);
list_add(&line->list, &l_mg->free_list);
l_mg->nr_free_lines++;
return NULL;
}
......@@ -1465,6 +1466,8 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
spin_unlock(&line->lock);
spin_unlock(&l_mg->gc_lock);
pblk_gc_should_kick(pblk);
}
void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line)
......
This diff is collapsed.
......@@ -199,12 +199,22 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
struct pblk_line *line;
struct pblk_rb_entry *entry;
struct pblk_w_ctx *w_ctx;
unsigned int user_io = 0, gc_io = 0;
unsigned int i;
int flags;
for (i = 0; i < to_update; i++) {
entry = &rb->entries[*l2p_upd];
w_ctx = &entry->w_ctx;
flags = READ_ONCE(entry->w_ctx.flags);
if (flags & PBLK_IOTYPE_USER)
user_io++;
else if (flags & PBLK_IOTYPE_GC)
gc_io++;
else
WARN(1, "pblk: unknown IO type\n");
pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
entry->cacheline);
......@@ -214,6 +224,8 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
*l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
}
pblk_rl_out(&pblk->rl, user_io, gc_io);
return 0;
}
......@@ -531,7 +543,6 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
struct pblk_rb_entry *entry;
struct page *page;
unsigned int pad = 0, to_read = nr_entries;
unsigned int user_io = 0, gc_io = 0;
unsigned int i;
int flags;
......@@ -555,13 +566,6 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
if (!(flags & PBLK_WRITTEN_DATA))
goto try;
if (flags & PBLK_IOTYPE_USER)
user_io++;
else if (flags & PBLK_IOTYPE_GC)
gc_io++;
else
WARN(1, "pblk: unknown IO type\n");
page = virt_to_page(entry->data);
if (!page) {
pr_err("pblk: could not allocate write bio page\n");
......@@ -613,7 +617,6 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
}
}
pblk_rl_out(&pblk->rl, user_io, gc_io);
#ifdef CONFIG_NVM_DEBUG
atomic_long_add(pad, &((struct pblk *)
(container_of(rb, struct pblk, rwb)))->padded_writes);
......
......@@ -27,7 +27,7 @@ int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
{
int rb_user_cnt = atomic_read(&rl->rb_user_cnt);
return (!(rb_user_cnt + nr_entries > rl->rb_user_max));
return (!(rb_user_cnt >= rl->rb_user_max));
}
int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
......@@ -37,7 +37,7 @@ int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
/* If there is no user I/O let GC take over space on the write buffer */
rb_user_active = READ_ONCE(rl->rb_user_active);
return (!(rb_gc_cnt + nr_entries > rl->rb_gc_max && rb_user_active));
return (!(rb_gc_cnt >= rl->rb_gc_max && rb_user_active));
}
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
......@@ -77,33 +77,32 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
if (free_blocks >= rl->high) {
rl->rb_user_max = max - rl->rb_gc_rsv;
rl->rb_gc_max = rl->rb_gc_rsv;
rl->rb_user_max = max;
rl->rb_gc_max = 0;
rl->rb_state = PBLK_RL_HIGH;
} else if (free_blocks < rl->high) {
int shift = rl->high_pw - rl->rb_windows_pw;
int user_windows = free_blocks >> shift;
int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
int gc_max;
rl->rb_user_max = user_max;
gc_max = max - rl->rb_user_max;
rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);
rl->rb_gc_max = max - user_max;
if (free_blocks > rl->low)
rl->rb_state = PBLK_RL_MID;
else
if (free_blocks <= rl->rsv_blocks) {
rl->rb_user_max = 0;
rl->rb_gc_max = max;
}
/* In the worst case, we will need to GC lines in the low list
* (high valid sector count). If there are lines to GC on high
* or mid lists, these will be prioritized
*/
rl->rb_state = PBLK_RL_LOW;
}
return rl->rb_state;
}
/*
 * Store @rsv in both rl->rb_gc_rsv and rl->rb_gc_max.
 *
 * No locking is done here; the sysfs store path shown in this diff calls
 * it with gc->lock held.
 */
void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
{
rl->rb_gc_rsv = rl->rb_gc_max = rsv;
}
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
......@@ -122,11 +121,15 @@ void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
int blk_in_line = atomic_read(&line->blk_in_line);
int ret;
atomic_sub(blk_in_line, &rl->free_blocks);
}
void pblk_gc_should_kick(struct pblk *pblk)
{
struct pblk_rl *rl = &pblk->rl;
int ret;
/* Rates will not change that often - no need to lock update */
ret = pblk_rl_update_rates(rl, rl->rb_budget);
......@@ -136,11 +139,16 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
pblk_gc_should_stop(pblk);
}
int pblk_rl_gc_thrs(struct pblk_rl *rl)
int pblk_rl_high_thrs(struct pblk_rl *rl)
{
return rl->high;
}
/*
 * Return the rate limiter's low threshold (rl->low), which pblk_rl_init()
 * derives from rl->total_blocks and therefore is expressed in blocks.
 */
int pblk_rl_low_thrs(struct pblk_rl *rl)
{
return rl->low;
}
int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
{
return rl->rb_user_max;
......@@ -161,15 +169,23 @@ void pblk_rl_free(struct pblk_rl *rl)
void pblk_rl_init(struct pblk_rl *rl, int budget)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
struct pblk_line_meta *lm = &pblk->lm;
int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
unsigned int rb_windows;
rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
rl->high_pw = get_count_order(rl->high);
rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
if (rl->low < min_blocks)
rl->low = min_blocks;
rl->rsv_blocks = min_blocks;
/* This will always be a power-of-2 */
rb_windows = budget / PBLK_MAX_REQ_ADDRS;
rl->rb_windows_pw = get_count_order(rb_windows) + 1;
rl->rb_windows_pw = get_count_order(rb_windows);
/* To start with, all buffer is available to user I/O writers */
rl->rb_budget = budget;
......@@ -180,5 +196,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
atomic_set(&rl->rb_gc_cnt, 0);
setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
rl->rb_user_active = 0;
rl->rb_gc_active = 0;
}
......@@ -49,30 +49,26 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
int free_blocks, total_blocks;
int rb_user_max, rb_user_cnt;
int rb_gc_max, rb_gc_rsv, rb_gc_cnt, rb_budget, rb_state;
int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
free_blocks = atomic_read(&pblk->rl.free_blocks);
rb_user_max = pblk->rl.rb_user_max;
rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
rb_gc_max = pblk->rl.rb_gc_max;
rb_gc_rsv = pblk->rl.rb_gc_rsv;
rb_gc_cnt = atomic_read(&pblk->rl.rb_gc_cnt);
rb_budget = pblk->rl.rb_budget;
rb_state = pblk->rl.rb_state;
total_blocks = geo->blks_per_lun * geo->nr_luns;
total_blocks = pblk->rl.total_blocks;
return snprintf(page, PAGE_SIZE,
"u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
"u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
rb_user_cnt,
rb_user_max,
rb_gc_cnt,
rb_gc_max,
rb_gc_rsv,
rb_state,
rb_budget,
pblk->rl.low,
......@@ -237,7 +233,8 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
spin_unlock(&l_mg->free_lock);
if (nr_free_lines != free_line_cnt)
pr_err("pblk: corrupted free line list\n");
pr_err("pblk: corrupted free line list:%d/%d\n",
nr_free_lines, free_line_cnt);
sz = snprintf(page, PAGE_SIZE - sz,
"line: nluns:%d, nblks:%d, nsecs:%d\n",
......@@ -319,32 +316,11 @@ static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
}
#endif
/*
 * sysfs store handler reached from pblk_sysfs_store() for the "gc_rl_max"
 * attribute: parse a number from @page and hand it to pblk_rl_set_gc_rsc().
 *
 * @pblk: target instance whose rate limiter is updated
 * @page: user-supplied buffer
 * @len:  number of bytes in @page
 *
 * Returns @len on success, or -EINVAL when the run of non-newline characters
 * at the start of @page is at least @len long or the number fails to parse.
 */
static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
size_t len)
{
struct pblk_gc *gc = &pblk->gc;
size_t c_len;
int value;
/* Length of the leading segment of page that contains no '\n' */
c_len = strcspn(page, "\n");
if (c_len >= len)
return -EINVAL;
/*
 * NOTE(review): kstrtouint() is declared to write through an
 * unsigned int *, while value is a plain int - confirm whether this
 * mismatch is intended.
 */
if (kstrtouint(page, 0, &value))
return -EINVAL;
/* The update is applied with gc->lock held */
spin_lock(&gc->lock);
pblk_rl_set_gc_rsc(&pblk->rl, value);
spin_unlock(&gc->lock);
return len;
}
static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
size_t len)
{
size_t c_len;
int force;
int ret, force;
c_len = strcspn(page, "\n");
if (c_len >= len)
......@@ -353,10 +329,7 @@ static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
if (kstrtouint(page, 0, &force))
return -EINVAL;
if (force < 0 || force > 1)
return -EINVAL;
pblk_gc_sysfs_force(pblk, force);
ret = pblk_gc_sysfs_force(pblk, force);
return len;
}
......@@ -434,11 +407,6 @@ static struct attribute sys_max_sec_per_write = {
.mode = 0644,
};
/* sysfs attribute "gc_rl_max"; mode 0200 grants write permission to the owner only */
static struct attribute sys_gc_rl_max = {
.name = "gc_rl_max",
.mode = 0200,
};
#ifdef CONFIG_NVM_DEBUG
static struct attribute sys_stats_debug_attr = {
.name = "stats",
......@@ -453,7 +421,6 @@ static struct attribute *pblk_attrs[] = {
&sys_gc_state,
&sys_gc_force,
&sys_max_sec_per_write,
&sys_gc_rl_max,
&sys_rb_attr,
&sys_stats_ppaf_attr,
&sys_lines_attr,
......@@ -499,9 +466,7 @@ static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
{
struct pblk *pblk = container_of(kobj, struct pblk, kobj);
if (strcmp(attr->name, "gc_rl_max") == 0)
return pblk_sysfs_rate_store(pblk, buf, len);
else if (strcmp(attr->name, "gc_force") == 0)
if (strcmp(attr->name, "gc_force") == 0)
return pblk_sysfs_gc_force(pblk, buf, len);
else if (strcmp(attr->name, "max_sec_per_write") == 0)
return pblk_sysfs_set_sec_per_write(pblk, buf, len);
......
......@@ -72,11 +72,15 @@ enum {
PBLK_BLK_ST_CLOSED = 0x2,
};
/*
 * Metadata record made of a reserved 64-bit word followed by a
 * little-endian 64-bit lba. pblk_dma_meta_size is computed as
 * PBLK_MAX_REQ_ADDRS of these records.
 */
struct pblk_sec_meta {
u64 reserved;
__le64 lba;
};
/* The number of GC lists and the rate-limiter states go together. This way the
* rate-limiter can dictate how much GC is needed based on resource utilization.
*/
#define PBLK_NR_GC_LISTS 3
#define PBLK_MAX_GC_JOBS 32
#define PBLK_GC_NR_LISTS 3
enum {
PBLK_RL_HIGH = 1,
......@@ -84,11 +88,6 @@ enum {
PBLK_RL_LOW = 3,
};
/*
 * Metadata record made of a reserved 64-bit word followed by a
 * little-endian 64-bit lba. pblk_dma_meta_size is computed as
 * PBLK_MAX_REQ_ADDRS of these records.
 */
struct pblk_sec_meta {
u64 reserved;
__le64 lba;
};
#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
/* write buffer completion context */
......@@ -195,29 +194,39 @@ struct pblk_lun {
struct pblk_gc_rq {
struct pblk_line *line;
void *data;
u64 *lba_list;
u64 lba_list[PBLK_MAX_REQ_ADDRS];
int nr_secs;
int secs_to_gc;
struct list_head list;
};
struct pblk_gc {
/* These states are not protected by a lock since (i) they are in the
* fast path, and (ii) they are not critical.
*/
int gc_active;
int gc_enabled;
int gc_forced;
int gc_jobs_active;
atomic_t inflight_gc;
struct task_struct *gc_ts;
struct task_struct *gc_writer_ts;
struct task_struct *gc_reader_ts;
struct workqueue_struct *gc_line_reader_wq;
struct workqueue_struct *gc_reader_wq;
struct timer_list gc_timer;
struct semaphore gc_sem;
atomic_t inflight_gc;
int w_entries;
struct list_head w_list;
struct list_head r_list;
spinlock_t lock;
spinlock_t w_lock;
spinlock_t r_lock;
};
struct pblk_rl {
......@@ -229,10 +238,8 @@ struct pblk_rl {
*/
unsigned int high_pw; /* High rounded up as a power of 2 */
#define PBLK_USER_HIGH_THRS 2 /* Begin write limit at 50 percent
* available blks
*/
#define PBLK_USER_LOW_THRS 20 /* Aggressive GC at 5% available blocks */
#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
#define PBLK_USER_LOW_THRS 10 /* Aggressive GC at 10% available blocks */
int rb_windows_pw; /* Number of rate windows in the write buffer
* given as a power-of-2. This guarantees that
......@@ -250,7 +257,11 @@ struct pblk_rl {
int rb_state; /* Rate-limiter current state */
atomic_t rb_gc_cnt; /* GC I/O buffer counter */
int rsv_blocks; /* Reserved blocks for GC */
int rb_user_active;
int rb_gc_active;
struct timer_list u_timer;
unsigned long long nr_secs;
......@@ -428,7 +439,7 @@ struct pblk_line_mgmt {
struct list_head bad_list; /* Full lines bad */
/* GC lists - use gc_lock */
struct list_head *gc_lists[PBLK_NR_GC_LISTS];
struct list_head *gc_lists[PBLK_GC_NR_LISTS];
struct list_head gc_high_list; /* Full lines ready to GC, high isc */
struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */
struct list_head gc_low_list; /* Full lines ready to GC, low isc */
......@@ -768,30 +779,34 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
/*
* pblk gc
*/
#define PBLK_GC_TRIES 3
#define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */
#define PBLK_GC_W_QD 1024 /* Queue depth for inflight GC write I/Os */
#define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */
#define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */
int pblk_gc_init(struct pblk *pblk);
void pblk_gc_exit(struct pblk *pblk);
void pblk_gc_should_start(struct pblk *pblk);
void pblk_gc_should_stop(struct pblk *pblk);
int pblk_gc_status(struct pblk *pblk);
void pblk_gc_should_kick(struct pblk *pblk);
void pblk_gc_kick(struct pblk *pblk);
void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
int *gc_active);
void pblk_gc_sysfs_force(struct pblk *pblk, int force);
int pblk_gc_sysfs_force(struct pblk *pblk, int force);
/*
* pblk rate limiter
*/
void pblk_rl_init(struct pblk_rl *rl, int budget);
void pblk_rl_free(struct pblk_rl *rl);
int pblk_rl_gc_thrs(struct pblk_rl *rl);
int pblk_rl_high_thrs(struct pblk_rl *rl);
int pblk_rl_low_thrs(struct pblk_rl *rl);
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv);
int pblk_rl_sysfs_rate_show(struct pblk_rl *rl);
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
......@@ -837,6 +852,17 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta)
return (emeta_to_lbas(pblk, emeta) + pblk->lm.emeta_len[2]);
}
/*
 * Return the value stored at line->vsc (presumably the line's valid sector
 * count - confirm against the struct definition), converted from
 * little-endian to CPU byte order.
 *
 * The read is done with line->lock held, so the caller must not already
 * hold that lock.
 */
static inline int pblk_line_vsc(struct pblk_line *line)
{
int vsc;
spin_lock(&line->lock);
vsc = le32_to_cpu(*line->vsc);
spin_unlock(&line->lock);
return vsc;
}
#define NVM_MEM_PAGE_WRITE (8)
static inline int pblk_pad_distance(struct pblk *pblk)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment