Commit 7ad651b5 authored by Lars Ellenberg, committed by Philipp Reisner

drbd: new on-disk activity log transaction format

Use a new on-disk transaction format for the activity log, which allows
for multiple changes to the active set per transaction.

Using 4k transaction blocks, we can now get rid of the work-around code
to deal with devices not supporting 512 byte logical block size.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
parent 46a15bc3
This diff is collapsed.
...@@ -1069,7 +1069,6 @@ struct drbd_conf { ...@@ -1069,7 +1069,6 @@ struct drbd_conf {
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait; wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
struct mutex md_io_mutex; /* protects the md_io_buffer */ struct mutex md_io_mutex; /* protects the md_io_buffer */
spinlock_t al_lock; spinlock_t al_lock;
wait_queue_head_t al_wait; wait_queue_head_t al_wait;
...@@ -1259,22 +1258,39 @@ extern void drbd_ldev_destroy(struct drbd_conf *mdev); ...@@ -1259,22 +1258,39 @@ extern void drbd_ldev_destroy(struct drbd_conf *mdev);
* either at the end of the backing device * either at the end of the backing device
* or on a separate meta data device. */ * or on a separate meta data device. */
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
/* The following numbers are sectors */ /* The following numbers are sectors */
/* Allows up to about 3.8TB, so if you want more,
* you need to use the "flexible" meta data format. */
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ #define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ #define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
/* Allows up to about 3.8TB */ #define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
/* Since the smalles IO unit is usually 512 byte */ /* we do all meta data IO in 4k blocks */
#define MD_SECTOR_SHIFT 9 #define MD_BLOCK_SHIFT 12
#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT) #define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
/* activity log */ /* One activity log extent represents 4M of storage */
#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */ #define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT) #define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
/* We could make these currently hardcoded constants configurable
* variables at create-md time (or even re-configurable at runtime?).
* Which will require some more changes to the DRBD "super block"
* and attach code.
*
* updates per transaction:
* This many changes to the active set can be logged with one transaction.
* This number is arbitrary.
* context per transaction:
* This many context extent numbers are logged with each transaction.
* This number is resulting from the transaction block size (4k), the layout
* of the transaction header, and the number of updates per transaction.
* See drbd_actlog.c:struct al_transaction_on_disk
* */
#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
#if BITS_PER_LONG == 32 #if BITS_PER_LONG == 32
#define LN2_BPL 5 #define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A) #define cpu_to_lel(A) cpu_to_le32(A)
......
...@@ -2841,10 +2841,6 @@ void drbd_ldev_destroy(struct drbd_conf *mdev) ...@@ -2841,10 +2841,6 @@ void drbd_ldev_destroy(struct drbd_conf *mdev)
drbd_free_bc(mdev->ldev); drbd_free_bc(mdev->ldev);
mdev->ldev = NULL;); mdev->ldev = NULL;);
if (mdev->md_io_tmpp) {
__free_page(mdev->md_io_tmpp);
mdev->md_io_tmpp = NULL;
}
clear_bit(GO_DISKLESS, &mdev->flags); clear_bit(GO_DISKLESS, &mdev->flags);
} }
......
...@@ -527,7 +527,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, ...@@ -527,7 +527,7 @@ static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
case DRBD_MD_INDEX_FLEX_INT: case DRBD_MD_INDEX_FLEX_INT:
bdev->md.md_offset = drbd_md_ss__(mdev, bdev); bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */ /* al size is still fixed */
bdev->md.al_offset = -MD_AL_MAX_SIZE; bdev->md.al_offset = -MD_AL_SECTORS;
/* we need (slightly less than) ~ this much bitmap sectors: */ /* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev); md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
...@@ -751,8 +751,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev) ...@@ -751,8 +751,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
unsigned int in_use; unsigned int in_use;
int i; int i;
if (!expect(mdev->sync_conf.al_extents >= 7)) if (!expect(mdev->sync_conf.al_extents >= DRBD_AL_EXTENTS_MIN))
mdev->sync_conf.al_extents = 127; mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_MIN;
if (mdev->act_log && if (mdev->act_log &&
mdev->act_log->nr_elements == mdev->sync_conf.al_extents) mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
...@@ -760,7 +760,7 @@ static int drbd_check_al_size(struct drbd_conf *mdev) ...@@ -760,7 +760,7 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
in_use = 0; in_use = 0;
t = mdev->act_log; t = mdev->act_log;
n = lc_create("act_log", drbd_al_ext_cache, 1, n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
if (n == NULL) { if (n == NULL) {
...@@ -932,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp ...@@ -932,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
union drbd_state ns, os; union drbd_state ns, os;
enum drbd_state_rv rv; enum drbd_state_rv rv;
int cp_discovered = 0; int cp_discovered = 0;
int logical_block_size;
drbd_reconfig_start(mdev); drbd_reconfig_start(mdev);
...@@ -1087,25 +1086,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp ...@@ -1087,25 +1086,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_md_set_sector_offsets(mdev, nbc); drbd_md_set_sector_offsets(mdev, nbc);
/* allocate a second IO page if logical_block_size != 512 */
logical_block_size = bdev_logical_block_size(nbc->md_bdev);
if (logical_block_size == 0)
logical_block_size = MD_SECTOR_SIZE;
if (logical_block_size != MD_SECTOR_SIZE) {
if (!mdev->md_io_tmpp) {
struct page *page = alloc_page(GFP_NOIO);
if (!page)
goto force_diskless_dec;
dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
logical_block_size, MD_SECTOR_SIZE);
dev_warn(DEV, "Workaround engaged (has performance impact).\n");
mdev->md_io_tmpp = page;
}
}
if (!mdev->bitmap) { if (!mdev->bitmap) {
if (drbd_bm_init(mdev)) { if (drbd_bm_init(mdev)) {
retcode = ERR_NOMEM; retcode = ERR_NOMEM;
...@@ -1804,14 +1784,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n ...@@ -1804,14 +1784,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
if (!expect(sc.rate >= 1)) if (!expect(sc.rate >= 1))
sc.rate = 1; sc.rate = 1;
if (!expect(sc.al_extents >= 7))
sc.al_extents = 127; /* arbitrary minimum */ /* clip to allowed range */
#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) if (!expect(sc.al_extents >= DRBD_AL_EXTENTS_MIN))
if (sc.al_extents > AL_MAX) { sc.al_extents = DRBD_AL_EXTENTS_MIN;
dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); if (!expect(sc.al_extents <= DRBD_AL_EXTENTS_MAX))
sc.al_extents = AL_MAX; sc.al_extents = DRBD_AL_EXTENTS_MAX;
}
#undef AL_MAX
/* to avoid spurious errors when configuring minors before configuring /* to avoid spurious errors when configuring minors before configuring
* the minors they depend on: if necessary, first create the minor we * the minors they depend on: if necessary, first create the minor we
......
...@@ -336,6 +336,10 @@ enum drbd_timeout_flag { ...@@ -336,6 +336,10 @@ enum drbd_timeout_flag {
#define DRBD_MAGIC 0x83740267 #define DRBD_MAGIC 0x83740267
#define DRBD_MAGIC_BIG 0x835a #define DRBD_MAGIC_BIG 0x835a
/* how I came up with this magic?
* base64 decode "actlog==" ;) */
#define DRBD_AL_MAGIC 0x69cb65a2
/* these are of type "int" */ /* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1 #define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2 #define DRBD_MD_INDEX_FLEX_EXT -2
......
...@@ -102,10 +102,12 @@ ...@@ -102,10 +102,12 @@
#define DRBD_RATE_DEF 250 /* kb/second */ #define DRBD_RATE_DEF 250 /* kb/second */
/* less than 7 would hit performance unnecessarily. /* less than 7 would hit performance unnecessarily.
* 3833 is the largest prime that still does fit * 919 slots context information per transaction,
* into 64 sectors of activity log */ * 32k activity log, 4k transaction size,
* one transaction in flight:
* 919 * 7 = 6433 */
#define DRBD_AL_EXTENTS_MIN 7 #define DRBD_AL_EXTENTS_MIN 7
#define DRBD_AL_EXTENTS_MAX 3833 #define DRBD_AL_EXTENTS_MAX 6433
#define DRBD_AL_EXTENTS_DEF 127 #define DRBD_AL_EXTENTS_DEF 127
#define DRBD_AFTER_MIN -1 #define DRBD_AFTER_MIN -1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment