Commit 98683650 authored by Philipp Reisner's avatar Philipp Reisner

Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6

parents ccae7868 328e0f12
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
......@@ -24,21 +24,73 @@
*/
#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/drbd.h>
#include <linux/drbd_limits.h>
#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"
/* We maintain a trivial checksum in our on disk activity log.
* With that we can ensure correct operation even when the storage
* device might do a partial (last) sector write while losing power.
*/
struct __packed al_transaction {
u32 magic;
u32 tr_number;
struct __packed {
u32 pos;
u32 extent; } updates[1 + AL_EXTENTS_PT];
u32 xor_sum;
enum al_transaction_types {
AL_TR_UPDATE = 0,
AL_TR_INITIALIZED = 0xffff
};
/* all fields on disc in big endian */
struct __packed al_transaction_on_disk {
/* don't we all like magic */
__be32 magic;
/* to identify the most recent transaction block
* in the on disk ring buffer */
__be32 tr_number;
/* checksum on the full 4k block, with this field set to 0. */
__be32 crc32c;
/* type of transaction, special transaction types like:
* purge-all, set-all-idle, set-all-active, ... to-be-defined
* see also enum al_transaction_types */
__be16 transaction_type;
/* we currently allow only a few thousand extents,
* so 16bit will be enough for the slot number. */
/* how many updates in this transaction */
__be16 n_updates;
/* maximum slot number, "al-extents" in drbd.conf speak.
* Having this in each transaction should make reconfiguration
* of that parameter easier. */
__be16 context_size;
/* slot number the context starts with */
__be16 context_start_slot_nr;
/* Some reserved bytes. Expected usage is a 64bit counter of
* sectors-written since device creation, and other data generation tag
* supporting usage */
__be32 __reserved[4];
/* --- 36 byte used --- */
/* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
* in one transaction, then use the remaining byte in the 4k block for
* context information. "Flexible" number of updates per transaction
* does not help, as we have to account for the case when all update
* slots are used anyways, so it would only complicate code without
* additional benefit.
*/
__be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION];
/* but the extent number is 32bit, which at an extent size of 4 MiB
* allows to cover device sizes of up to 2**54 Byte (16 PiB) */
__be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION];
/* --- 420 bytes used (36 + 64*6) --- */
/* 4096 - 420 = 3676 = 919 * 4 */
__be32 context[AL_CONTEXT_PER_TRANSACTION];
};
struct update_odbm_work {
......@@ -48,22 +100,11 @@ struct update_odbm_work {
struct update_al_work {
struct drbd_work w;
struct lc_element *al_ext;
struct completion event;
unsigned int enr;
/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
unsigned int old_enr;
};
struct drbd_atodb_wait {
atomic_t count;
struct completion io_done;
struct drbd_conf *mdev;
int error;
int err;
};
int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
static int al_write_transaction(struct drbd_conf *mdev);
void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
......@@ -85,12 +126,17 @@ void drbd_md_put_buffer(struct drbd_conf *mdev)
void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
unsigned int *done)
{
long dt = bdev->dc.disk_timeout * HZ / 10;
long dt;
rcu_read_lock();
dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
rcu_read_unlock();
dt = dt * HZ / 10;
if (dt == 0)
dt = MAX_SCHEDULE_TIMEOUT;
dt = wait_event_timeout(mdev->misc_wait,
*done || drbd_test_flag(mdev, FORCE_DETACH), dt);
*done || test_bit(FORCE_DETACH, &mdev->flags), dt);
if (dt == 0) {
dev_err(DEV, "meta-data IO operation timed out\n");
drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH);
......@@ -103,20 +149,20 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
int rw, int size)
{
struct bio *bio;
int ok;
int err;
mdev->md_io.done = 0;
mdev->md_io.error = -ENODEV;
if ((rw & WRITE) && !drbd_test_flag(mdev, MD_NO_FUA))
if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
rw |= REQ_FUA | REQ_FLUSH;
rw |= REQ_SYNC;
bio = bio_alloc_drbd(GFP_NOIO);
bio->bi_bdev = bdev->md_bdev;
bio->bi_sector = sector;
ok = (bio_add_page(bio, page, size, 0) == size);
if (!ok)
err = -EIO;
if (bio_add_page(bio, page, size, 0) != size)
goto out;
bio->bi_private = &mdev->md_io;
bio->bi_end_io = drbd_md_io_complete;
......@@ -124,7 +170,7 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
ok = 0;
err = -ENODEV;
goto out;
}
......@@ -135,85 +181,46 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
else
submit_bio(rw, bio);
wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done);
ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
if (bio_flagged(bio, BIO_UPTODATE))
err = mdev->md_io.error;
out:
bio_put(bio);
return ok;
return err;
}
int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
sector_t sector, int rw)
{
int logical_block_size, mask, ok;
int offset = 0;
int err;
struct page *iop = mdev->md_io_page;
D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
BUG_ON(!bdev->md_bdev);
logical_block_size = bdev_logical_block_size(bdev->md_bdev);
if (logical_block_size == 0)
logical_block_size = MD_SECTOR_SIZE;
/* in case logical_block_size != 512 [ s390 only? ] */
if (logical_block_size != MD_SECTOR_SIZE) {
mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
D_ASSERT(mask == 1 || mask == 3 || mask == 7);
D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
offset = sector & mask;
sector = sector & ~mask;
iop = mdev->md_io_tmpp;
if (rw & WRITE) {
/* these are GFP_KERNEL pages, pre-allocated
* on device initialization */
void *p = page_address(mdev->md_io_page);
void *hp = page_address(mdev->md_io_tmpp);
ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
READ, logical_block_size);
if (unlikely(!ok)) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
"READ [logical_block_size!=512]) failed!\n",
(unsigned long long)sector);
return 0;
}
memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
}
}
dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
if (sector < drbd_md_first_sector(bdev) ||
sector > drbd_md_last_sector(bdev))
sector + 7 > drbd_md_last_sector(bdev))
dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
if (unlikely(!ok)) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
return 0;
err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
if (err) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
}
if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
void *p = page_address(mdev->md_io_page);
void *hp = page_address(mdev->md_io_tmpp);
memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
}
return ok;
return err;
}
static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *al_ext;
struct lc_element *tmp;
unsigned long al_flags = 0;
int wake;
spin_lock_irq(&mdev->al_lock);
......@@ -228,76 +235,92 @@ static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
return NULL;
}
}
al_ext = lc_get(mdev->act_log, enr);
al_flags = mdev->act_log->flags;
al_ext = lc_get(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
/*
if (!al_ext) {
if (al_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
if (al_flags & LC_DIRTY)
dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
}
*/
return al_ext;
}
void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
struct lc_element *al_ext;
struct update_al_work al_work;
/* for bios crossing activity log extent boundaries,
* we may need to activate two extents in one go */
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
unsigned enr;
bool locked = false;
D_ASSERT(first <= last);
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
for (enr = first; enr <= last; enr++)
wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
if (al_ext->lc_number != enr) {
/* Serialize multiple transactions.
* This uses test_and_set_bit, memory barrier is implicit.
*/
wait_event(mdev->al_wait,
mdev->act_log->pending_changes == 0 ||
(locked = lc_try_lock_for_transaction(mdev->act_log)));
if (locked) {
/* drbd_al_write_transaction(mdev,al_ext,enr);
* recurses into generic_make_request(), which
* disallows recursion, bios being serialized on the
* current->bio_tail list now.
* we have to delegate updates to the activity log
* to the worker thread. */
init_completion(&al_work.event);
al_work.al_ext = al_ext;
al_work.enr = enr;
al_work.old_enr = al_ext->lc_number;
al_work.w.cb = w_al_write_transaction;
drbd_queue_work_front(&mdev->data.work, &al_work.w);
wait_for_completion(&al_work.event);
mdev->al_writ_cnt++;
spin_lock_irq(&mdev->al_lock);
lc_changed(mdev->act_log, al_ext);
spin_unlock_irq(&mdev->al_lock);
/* Double check: it may have been committed by someone else,
* while we have been waiting for the lock. */
if (mdev->act_log->pending_changes) {
bool write_al_updates;
rcu_read_lock();
write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
rcu_read_unlock();
if (write_al_updates) {
al_write_transaction(mdev);
mdev->al_writ_cnt++;
}
spin_lock_irq(&mdev->al_lock);
/* FIXME
if (err)
we need an "lc_cancel" here;
*/
lc_committed(mdev->act_log);
spin_unlock_irq(&mdev->al_lock);
}
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
}
}
void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
/* for bios crossing activity log extent boundaries,
* we may need to activate two extents in one go */
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
unsigned enr;
struct lc_element *extent;
unsigned long flags;
D_ASSERT(first <= last);
spin_lock_irqsave(&mdev->al_lock, flags);
extent = lc_find(mdev->act_log, enr);
if (!extent) {
spin_unlock_irqrestore(&mdev->al_lock, flags);
dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
return;
for (enr = first; enr <= last; enr++) {
extent = lc_find(mdev->act_log, enr);
if (!extent) {
dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
continue;
}
lc_put(mdev->act_log, extent);
}
if (lc_put(mdev->act_log, extent) == 0)
wake_up(&mdev->al_wait);
spin_unlock_irqrestore(&mdev->al_lock, flags);
wake_up(&mdev->al_wait);
}
#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
......@@ -323,296 +346,148 @@ static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
return rs_enr >>
/* bit to page */
((PAGE_SHIFT + 3) -
/* al extent number to bit */
/* resync extent number to bit */
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
int
w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
static int
_al_write_transaction(struct drbd_conf *mdev)
{
struct update_al_work *aw = container_of(w, struct update_al_work, w);
struct lc_element *updated = aw->al_ext;
const unsigned int new_enr = aw->enr;
const unsigned int evicted = aw->old_enr;
struct al_transaction *buffer;
struct al_transaction_on_disk *buffer;
struct lc_element *e;
sector_t sector;
int i, n, mx;
unsigned int extent_nr;
u32 xor_sum = 0;
int i, mx;
unsigned extent_nr;
unsigned crc = 0;
int err = 0;
if (!get_ldev(mdev)) {
dev_err(DEV,
"disk is %s, cannot start al transaction (-%d +%d)\n",
drbd_disk_str(mdev->state.disk), evicted, new_enr);
complete(&((struct update_al_work *)w)->event);
return 1;
dev_err(DEV, "disk is %s, cannot start al transaction\n",
drbd_disk_str(mdev->state.disk));
return -EIO;
}
/* do we have to do a bitmap write, first?
* TODO reduce maximum latency:
* submit both bios, then wait for both,
* instead of doing two synchronous sector writes.
* For now, we must not write the transaction,
* if we cannot write out the bitmap of the evicted extent. */
if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
/* The bitmap write may have failed, causing a state change. */
if (mdev->state.disk < D_INCONSISTENT) {
dev_err(DEV,
"disk is %s, cannot write al transaction (-%d +%d)\n",
drbd_disk_str(mdev->state.disk), evicted, new_enr);
complete(&((struct update_al_work *)w)->event);
"disk is %s, cannot write al transaction\n",
drbd_disk_str(mdev->state.disk));
put_ldev(mdev);
return 1;
return -EIO;
}
buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
if (!buffer) {
dev_err(DEV, "disk failed while waiting for md_io buffer\n");
complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
return 1;
return -ENODEV;
}
buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
memset(buffer, 0, sizeof(*buffer));
buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
n = lc_index_of(mdev->act_log, updated);
i = 0;
buffer->updates[0].pos = cpu_to_be32(n);
buffer->updates[0].extent = cpu_to_be32(new_enr);
/* Even though no one can start to change this list
* once we set the LC_LOCKED -- from drbd_al_begin_io(),
* lc_try_lock_for_transaction() --, someone may still
* be in the process of changing it. */
spin_lock_irq(&mdev->al_lock);
list_for_each_entry(e, &mdev->act_log->to_be_changed, list) {
if (i == AL_UPDATES_PER_TRANSACTION) {
i++;
break;
}
buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
if (e->lc_number != LC_FREE)
drbd_bm_mark_for_writeout(mdev,
al_extent_to_bm_page(e->lc_number));
i++;
}
spin_unlock_irq(&mdev->al_lock);
BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
xor_sum ^= new_enr;
buffer->n_updates = cpu_to_be16(i);
for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
buffer->update_slot_nr[i] = cpu_to_be16(-1);
buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
}
buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements);
buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle);
mx = min_t(int, AL_EXTENTS_PT,
mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
mdev->act_log->nr_elements - mdev->al_tr_cycle);
for (i = 0; i < mx; i++) {
unsigned idx = mdev->al_tr_cycle + i;
extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
buffer->updates[i+1].pos = cpu_to_be32(idx);
buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
xor_sum ^= extent_nr;
}
for (; i < AL_EXTENTS_PT; i++) {
buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
xor_sum ^= LC_FREE;
buffer->context[i] = cpu_to_be32(extent_nr);
}
mdev->al_tr_cycle += AL_EXTENTS_PT;
for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
buffer->context[i] = cpu_to_be32(LC_FREE);
mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
mdev->al_tr_cycle = 0;
buffer->xor_sum = cpu_to_be32(xor_sum);
sector = mdev->ldev->md.md_offset
+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
+ mdev->ldev->md.al_offset
+ mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
if (++mdev->al_tr_pos >
div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
mdev->al_tr_pos = 0;
crc = crc32c(0, buffer, 4096);
buffer->crc32c = cpu_to_be32(crc);
D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
mdev->al_tr_number++;
if (drbd_bm_write_hinted(mdev))
err = -EIO;
/* drbd_chk_io_error done already */
else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
err = -EIO;
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
} else {
/* advance ringbuffer position and transaction counter */
mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
mdev->al_tr_number++;
}
drbd_md_put_buffer(mdev);
complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
return 1;
return err;
}
/**
* drbd_al_read_tr() - Read a single transaction from the on disk activity log
* @mdev: DRBD device.
* @bdev: Block device to read form.
* @b: pointer to an al_transaction.
* @index: On disk slot of the transaction to read.
*
* Returns -1 on IO error, 0 on checksum error and 1 upon success.
*/
static int drbd_al_read_tr(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev,
struct al_transaction *b,
int index)
{
sector_t sector;
int rv, i;
u32 xor_sum = 0;
sector = bdev->md.md_offset + bdev->md.al_offset + index;
/* Dont process error normally,
* as this is done before disk is attached! */
if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
return -1;
rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
for (i = 0; i < AL_EXTENTS_PT + 1; i++)
xor_sum ^= be32_to_cpu(b->updates[i].extent);
rv &= (xor_sum == be32_to_cpu(b->xor_sum));
return rv;
}
/**
* drbd_al_read_log() - Restores the activity log from its on disk representation.
* @mdev: DRBD device.
* @bdev: Block device to read form.
*
* Returns 1 on success, returns 0 when reading the log failed due to IO errors.
*/
int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
static int w_al_write_transaction(struct drbd_work *w, int unused)
{
struct al_transaction *buffer;
int i;
int rv;
int mx;
int active_extents = 0;
int transactions = 0;
int found_valid = 0;
int from = 0;
int to = 0;
u32 from_tnr = 0;
u32 to_tnr = 0;
u32 cnr;
mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
/* lock out all other meta data io for now,
* and make sure the page is mapped.
*/
buffer = drbd_md_get_buffer(mdev);
if (!buffer)
return 0;
/* Find the valid transaction in the log */
for (i = 0; i <= mx; i++) {
rv = drbd_al_read_tr(mdev, bdev, buffer, i);
if (rv == 0)
continue;
if (rv == -1) {
drbd_md_put_buffer(mdev);
return 0;
}
cnr = be32_to_cpu(buffer->tr_number);
if (++found_valid == 1) {
from = i;
to = i;
from_tnr = cnr;
to_tnr = cnr;
continue;
}
if ((int)cnr - (int)from_tnr < 0) {
D_ASSERT(from_tnr - cnr + i - from == mx+1);
from = i;
from_tnr = cnr;
}
if ((int)cnr - (int)to_tnr > 0) {
D_ASSERT(cnr - to_tnr == i - to);
to = i;
to_tnr = cnr;
}
}
if (!found_valid) {
dev_warn(DEV, "No usable activity log found.\n");
drbd_md_put_buffer(mdev);
return 1;
}
/* Read the valid transactions.
* dev_info(DEV, "Reading from %d to %d.\n",from,to); */
i = from;
while (1) {
int j, pos;
unsigned int extent_nr;
unsigned int trn;
rv = drbd_al_read_tr(mdev, bdev, buffer, i);
ERR_IF(rv == 0) goto cancel;
if (rv == -1) {
drbd_md_put_buffer(mdev);
return 0;
}
trn = be32_to_cpu(buffer->tr_number);
spin_lock_irq(&mdev->al_lock);
/* This loop runs backwards because in the cyclic
elements there might be an old version of the
updated element (in slot 0). So the element in slot 0
can overwrite old versions. */
for (j = AL_EXTENTS_PT; j >= 0; j--) {
pos = be32_to_cpu(buffer->updates[j].pos);
extent_nr = be32_to_cpu(buffer->updates[j].extent);
if (extent_nr == LC_FREE)
continue;
lc_set(mdev->act_log, extent_nr, pos);
active_extents++;
}
spin_unlock_irq(&mdev->al_lock);
transactions++;
cancel:
if (i == to)
break;
i++;
if (i > mx)
i = 0;
}
mdev->al_tr_number = to_tnr+1;
mdev->al_tr_pos = to;
if (++mdev->al_tr_pos >
div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
mdev->al_tr_pos = 0;
/* ok, we are done with it */
drbd_md_put_buffer(mdev);
struct update_al_work *aw = container_of(w, struct update_al_work, w);
struct drbd_conf *mdev = w->mdev;
int err;
dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
transactions, active_extents);
err = _al_write_transaction(mdev);
aw->err = err;
complete(&aw->event);
return 1;
return err != -EIO ? err : 0;
}
/**
* drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
* @mdev: DRBD device.
*/
void drbd_al_apply_to_bm(struct drbd_conf *mdev)
/* Calls from worker context (see w_restart_disk_io()) need to write the
transaction directly. Others came through generic_make_request(),
those need to delegate it to the worker. */
static int al_write_transaction(struct drbd_conf *mdev)
{
unsigned int enr;
unsigned long add = 0;
char ppb[10];
int i, tmp;
struct update_al_work al_work;
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
if (current == mdev->tconn->worker.task)
return _al_write_transaction(mdev);
for (i = 0; i < mdev->act_log->nr_elements; i++) {
enr = lc_element_by_index(mdev->act_log, i)->lc_number;
if (enr == LC_FREE)
continue;
tmp = drbd_bm_ALe_set_all(mdev, enr);
dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
add += tmp;
}
init_completion(&al_work.event);
al_work.w.cb = w_al_write_transaction;
al_work.w.mdev = mdev;
drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
wait_for_completion(&al_work.event);
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
ppsize(ppb, Bit2KB(add)));
return al_work.err;
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
......@@ -642,7 +517,7 @@ void drbd_al_shrink(struct drbd_conf *mdev)
struct lc_element *al_ext;
int i;
D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags));
for (i = 0; i < mdev->act_log->nr_elements; i++) {
al_ext = lc_element_by_index(mdev->act_log, i);
......@@ -654,15 +529,17 @@ void drbd_al_shrink(struct drbd_conf *mdev)
wake_up(&mdev->al_wait);
}
static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
static int w_update_odbm(struct drbd_work *w, int unused)
{
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
struct drbd_conf *mdev = w->mdev;
struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
kfree(udw);
return 1;
return 0;
}
drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
......@@ -680,9 +557,9 @@ static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused
break;
}
}
drbd_bcast_sync_progress(mdev);
drbd_bcast_event(mdev, &sib);
return 1;
return 0;
}
......@@ -752,7 +629,9 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
}
ext->rs_left = rs_left;
ext->rs_failed = success ? 0 : count;
lc_changed(mdev->resync, &ext->lce);
/* we don't keep a persistent log of the resync lru,
* we can commit any change right away. */
lc_committed(mdev->resync);
}
lc_put(mdev->resync, &ext->lce);
/* no race, we are within the al_lock! */
......@@ -764,7 +643,8 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
if (udw) {
udw->enr = ext->lce.lc_number;
udw->w.cb = w_update_odbm;
drbd_queue_work_front(&mdev->data.work, &udw->w);
udw->w.mdev = mdev;
drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w);
} else {
dev_warn(DEV, "Could not kmalloc an udw\n");
}
......@@ -810,16 +690,22 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
int wake_up = 0;
unsigned long flags;
if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
if (!get_ldev(mdev))
return; /* no disk, no metadata, no bitmap to clear bits in */
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
ERR_IF(sector >= nr_sectors) return;
ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
if (!expect(sector < nr_sectors))
goto out;
if (!expect(esector < nr_sectors))
esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
......@@ -827,7 +713,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
* round up start sector, round down end sector. we make sure we only
* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
if (unlikely(esector < BM_SECT_PER_BIT-1))
return;
goto out;
if (unlikely(esector == (nr_sectors-1)))
ebnr = lbnr;
else
......@@ -835,14 +721,14 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
if (sbnr > ebnr)
return;
goto out;
/*
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
if (count && get_ldev(mdev)) {
if (count) {
drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
spin_lock_irqsave(&mdev->al_lock, flags);
drbd_try_clear_on_disk_bm(mdev, sector, count, true);
......@@ -851,8 +737,9 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
put_ldev(mdev);
}
out:
put_ldev(mdev);
if (wake_up)
wake_up(&mdev->al_wait);
}
......@@ -868,7 +755,7 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
const char *file, const unsigned int line)
{
unsigned long sbnr, ebnr, lbnr, flags;
unsigned long sbnr, ebnr, flags;
sector_t esector, nr_sectors;
unsigned int enr, count = 0;
struct lc_element *e;
......@@ -877,7 +764,7 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
if (size == 0)
return 0;
if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
return 0;
......@@ -889,12 +776,10 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
ERR_IF(sector >= nr_sectors)
if (!expect(sector < nr_sectors))
goto out;
ERR_IF(esector >= nr_sectors)
esector = (nr_sectors-1);
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
if (!expect(esector < nr_sectors))
esector = nr_sectors - 1;
/* we set it out of sync,
* we do not need to round anything here */
......@@ -937,7 +822,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
lc_changed(mdev->resync, &bm_ext->lce);
lc_committed(mdev->resync);
wakeup = 1;
}
if (bm_ext->lce.refcnt == 1)
......@@ -953,7 +838,7 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
BUG_ON(rs_flags & LC_DIRTY);
BUG_ON(rs_flags & LC_LOCKED);
}
return bm_ext;
......@@ -961,26 +846,12 @@ struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *al_ext;
int rv = 0;
int rv;
spin_lock_irq(&mdev->al_lock);
if (unlikely(enr == mdev->act_log->new_number))
rv = 1;
else {
al_ext = lc_find(mdev->act_log, enr);
if (al_ext) {
if (al_ext->refcnt)
rv = 1;
}
}
rv = lc_is_used(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
/*
if (unlikely(rv)) {
dev_info(DEV, "Delaying sync read until app's write is done\n");
}
*/
return rv;
}
......@@ -1110,13 +981,13 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
BUG_ON(rs_flags & LC_DIRTY);
BUG_ON(rs_flags & LC_LOCKED);
goto try_again;
}
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
lc_changed(mdev->resync, &bm_ext->lce);
lc_committed(mdev->resync);
wake_up(&mdev->al_wait);
D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
}
......@@ -1127,8 +998,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
}
check_al:
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
if (unlikely(al_enr+i == mdev->act_log->new_number))
goto try_again;
if (lc_is_used(mdev->act_log, al_enr+i))
goto try_again;
}
......@@ -1263,7 +1132,7 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
sector_t esector, nr_sectors;
int wake_up = 0;
if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
......@@ -1271,8 +1140,10 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
ERR_IF(sector >= nr_sectors) return;
ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
if (!expect(sector < nr_sectors))
return;
if (!expect(esector < nr_sectors))
esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
......
......@@ -119,13 +119,9 @@ static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
func, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
drbd_task_to_thread_name(mdev->tconn, current),
func, b->bm_why ?: "?",
drbd_task_to_thread_name(mdev->tconn, b->bm_task));
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
......@@ -142,13 +138,9 @@ void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
if (trylock_failed) {
dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
why, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
drbd_task_to_thread_name(mdev->tconn, current),
why, b->bm_why ?: "?",
drbd_task_to_thread_name(mdev->tconn, b->bm_task));
mutex_lock(&b->bm_change);
}
if (BM_LOCKED_MASK & b->bm_flags)
......@@ -196,6 +188,9 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
/* to mark for lazy writeout once syncer cleared all clearable bits,
* we if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT 28
/* pages marked with this "HINT" will be considered for writeout
* on activity log transactions */
#define BM_PAGE_HINT_WRITEOUT 27
/* store_page_idx uses non-atomic assignment. It is only used directly after
* allocating the page. All other bm_set_page_* and bm_clear_page_* need to
......@@ -227,8 +222,7 @@ static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
{
struct drbd_bitmap *b = mdev->bitmap;
void *addr = &page_private(b->bm_pages[page_nr]);
clear_bit(BM_PAGE_IO_LOCK, addr);
smp_mb__after_clear_bit();
clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
wake_up(&mdev->bitmap->bm_io_wait);
}
......@@ -246,6 +240,27 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
/**
* drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
* @mdev: DRBD device.
* @page_nr: the bitmap page to mark with the "hint" flag
*
* From within an activity log transaction, we mark a few pages with these
* hints, then call drbd_bm_write_hinted(), which will only write out changed
* pages which are flagged with this mark.
*/
void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr)
{
struct page *page;
if (page_nr >= mdev->bitmap->bm_number_of_pages) {
dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n",
page_nr, (int)mdev->bitmap->bm_number_of_pages);
return;
}
page = mdev->bitmap->bm_pages[page_nr];
set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
}
static int bm_test_page_unchanged(struct page *page)
{
volatile const unsigned long *addr = &page_private(page);
......@@ -376,7 +391,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
* GFP_NOIO, as this is called while drbd IO is "suspended",
* and during resize or attach on diskless Primary,
* we must not block on IO to ourselves.
* Context is receiver thread or cqueue thread/dmsetup. */
* Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages) {
......@@ -441,7 +456,8 @@ int drbd_bm_init(struct drbd_conf *mdev)
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
ERR_IF(!mdev->bitmap) return 0;
if (!expect(mdev->bitmap))
return 0;
return mdev->bitmap->bm_dev_capacity;
}
......@@ -449,7 +465,8 @@ sector_t drbd_bm_capacity(struct drbd_conf *mdev)
*/
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
ERR_IF (!mdev->bitmap) return;
if (!expect(mdev->bitmap))
return;
bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
kfree(mdev->bitmap);
......@@ -612,7 +629,8 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
int err = 0, growing;
int opages_vmalloced;
ERR_IF(!b) return -ENOMEM;
if (!expect(b))
return -ENOMEM;
drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
......@@ -734,8 +752,10 @@ unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
unsigned long s;
unsigned long flags;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
......@@ -758,8 +778,10 @@ unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
size_t drbd_bm_words(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
return b->bm_words;
}
......@@ -767,7 +789,8 @@ size_t drbd_bm_words(struct drbd_conf *mdev)
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
if (!expect(b))
return 0;
return b->bm_bits;
}
......@@ -788,8 +811,10 @@ void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
......@@ -833,8 +858,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
......@@ -862,8 +889,10 @@ void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
void drbd_bm_set_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
......@@ -876,8 +905,10 @@ void drbd_bm_set_all(struct drbd_conf *mdev)
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (!expect(b))
return;
if (!expect(b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
......@@ -891,7 +922,8 @@ struct bm_aio_ctx {
unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
#define BM_WRITE_ALL_PAGES 2
#define BM_AIO_WRITE_HINTED 2
#define BM_WRITE_ALL_PAGES 4
int error;
struct kref kref;
};
......@@ -1062,6 +1094,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (rw & WRITE) {
if ((flags & BM_AIO_WRITE_HINTED) &&
!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
if (!(flags & BM_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
......@@ -1094,9 +1131,11 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
else
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
/* summary for global bitmap IO */
if (flags == 0)
dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
rw == WRITE ? "WRITE" : "READ",
count, jiffies - now);
if (ctx->error) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
......@@ -1117,8 +1156,9 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
}
now = b->bm_set;
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
if (flags == 0)
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
......@@ -1181,9 +1221,17 @@ int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}
/**
* drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
* @mdev: DRBD device.
*/
int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local)
{
return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
}
/**
* drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
* drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
* @mdev: DRBD device.
* @idx: bitmap page index
*
......@@ -1291,8 +1339,10 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
ERR_IF(!b) return i;
ERR_IF(!b->bm_pages) return i;
if (!expect(b))
return i;
if (!expect(b->bm_pages))
return i;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1393,8 +1443,10 @@ static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
struct drbd_bitmap *b = mdev->bitmap;
int c = 0;
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 1;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
......@@ -1425,13 +1477,21 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
{
int i;
int bits;
int changed = 0;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
b->bm_set += BITS_PER_LONG - bits;
changed += BITS_PER_LONG - bits;
}
kunmap_atomic(paddr);
if (changed) {
/* We only need lazy writeout, the information is still in the
* remote bitmap as well, and is reconstructed during the next
* bitmap exchange, if lost locally due to a crash. */
bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
b->bm_set += changed;
}
}
/* Same thing as drbd_bm_set_bits,
......@@ -1526,8 +1586,10 @@ int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
unsigned long *p_addr;
int i;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1561,8 +1623,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 1;
if (!expect(b))
return 1;
if (!expect(b->bm_pages))
return 1;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1575,11 +1639,10 @@ int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsi
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
ERR_IF (bitnr >= b->bm_bits) {
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
} else {
if (expect(bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
}
else
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
}
if (p_addr)
bm_unmap(p_addr);
......@@ -1609,8 +1672,10 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
unsigned long flags;
unsigned long *p_addr, *bm;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
if (!expect(b))
return 0;
if (!expect(b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
......@@ -1632,47 +1697,3 @@ int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
/* Set all bits covered by the AL-extent al_enr.
* Returns number of bits changed. */
unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr, *bm;
unsigned long weight;
unsigned long s, e;
int count, i, do_now;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_SET & b->bm_flags)
bm_print_lock_info(mdev);
weight = b->bm_set;
s = al_enr * BM_WORDS_PER_AL_EXT;
e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
/* assert that s and e are on the same page */
D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
== s >> (PAGE_SHIFT - LN2_BPL + 3));
count = 0;
if (s < b->bm_words) {
i = do_now = e-s;
p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
bm = p_addr + MLPP(s);
while (i--) {
count += hweight_long(*bm);
*bm = -1UL;
bm++;
}
bm_unmap(p_addr);
b->bm_set += do_now*BITS_PER_LONG - count;
if (e == b->bm_words)
b->bm_set -= bm_clear_surplus(b);
} else {
dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
}
weight = b->bm_set - weight;
spin_unlock_irq(&b->bm_lock);
return weight;
}
......@@ -39,9 +39,13 @@
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <net/tcp.h>
#include <linux/lru_cache.h>
#include <linux/prefetch.h>
#include <linux/drbd_genl_api.h>
#include <linux/drbd.h>
#include "drbd_state.h"
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
......@@ -61,7 +65,6 @@
extern unsigned int minor_count;
extern bool disable_sendpage;
extern bool allow_oos;
extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
......@@ -86,34 +89,44 @@ extern char usermode_helper[];
*/
#define DRBD_SIGKILL SIGHUP
/* All EEs on the free list should have ID_VACANT (== 0)
* freshly allocated EEs get !ID_VACANT (== 1)
* so if it says "cannot dereference null pointer at address 0x00000001",
* it is most likely one of these :( */
#define ID_IN_SYNC (4711ULL)
#define ID_OUT_OF_SYNC (4712ULL)
#define ID_SYNCER (-1ULL)
#define ID_VACANT 0
#define is_syncer_block_id(id) ((id) == ID_SYNCER)
#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
struct drbd_conf;
struct drbd_tconn;
/* to shorten dev_warn(DEV, "msg"); and relatives statements */
#define DEV (disk_to_dev(mdev->vdisk))
#define conn_printk(LEVEL, TCONN, FMT, ARGS...) \
printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS)
#define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS)
#define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS)
#define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS)
#define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS)
#define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS)
#define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS)
#define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS)
#define D_ASSERT(exp) if (!(exp)) \
dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
#define ERR_IF(exp) if (({ \
int _b = (exp) != 0; \
if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
__func__, #exp, __FILE__, __LINE__); \
_b; \
}))
/**
* expect - Make an assertion
*
* Unlike the assert macro, this macro returns a boolean result.
*/
#define expect(exp) ({ \
bool _bool = (exp); \
if (!_bool) \
dev_err(DEV, "ASSERTION %s FAILED in %s\n", \
#exp, __func__); \
_bool; \
})
/* Defines to control fault insertion */
enum {
......@@ -150,15 +163,12 @@ drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
/* usual integer division */
#define div_floor(A, B) ((A)/(B))
/* drbd_meta-data.c (still in drbd_main.c) */
/* 4th incarnation of the disk layout. */
#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
extern struct drbd_conf **minor_table;
extern struct ratelimit_state drbd_ratelimit_state;
extern struct idr minors; /* RCU, updates: genl_lock() */
extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */
/* on the wire */
enum drbd_packets {
enum drbd_packet {
/* receiver (data socket) */
P_DATA = 0x00,
P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
......@@ -186,7 +196,7 @@ enum drbd_packets {
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
......@@ -207,77 +217,23 @@ enum drbd_packets {
P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
P_MAX_CMD = 0x2A,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
/* special command ids for handshake */
P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
};
static inline const char *cmdname(enum drbd_packets cmd)
{
/* THINK may need to become several global tables
* when we want to support more than
* one PRO_VERSION */
static const char *cmdnames[] = {
[P_DATA] = "Data",
[P_DATA_REPLY] = "DataReply",
[P_RS_DATA_REPLY] = "RSDataReply",
[P_BARRIER] = "Barrier",
[P_BITMAP] = "ReportBitMap",
[P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
[P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
[P_UNPLUG_REMOTE] = "UnplugRemote",
[P_DATA_REQUEST] = "DataRequest",
[P_RS_DATA_REQUEST] = "RSDataRequest",
[P_SYNC_PARAM] = "SyncParam",
[P_SYNC_PARAM89] = "SyncParam89",
[P_PROTOCOL] = "ReportProtocol",
[P_UUIDS] = "ReportUUIDs",
[P_SIZES] = "ReportSizes",
[P_STATE] = "ReportState",
[P_SYNC_UUID] = "ReportSyncUUID",
[P_AUTH_CHALLENGE] = "AuthChallenge",
[P_AUTH_RESPONSE] = "AuthResponse",
[P_PING] = "Ping",
[P_PING_ACK] = "PingAck",
[P_RECV_ACK] = "RecvAck",
[P_WRITE_ACK] = "WriteAck",
[P_RS_WRITE_ACK] = "RSWriteAck",
[P_DISCARD_ACK] = "DiscardAck",
[P_NEG_ACK] = "NegAck",
[P_NEG_DREPLY] = "NegDReply",
[P_NEG_RS_DREPLY] = "NegRSDReply",
[P_BARRIER_ACK] = "BarrierAck",
[P_STATE_CHG_REQ] = "StateChgRequest",
[P_STATE_CHG_REPLY] = "StateChgReply",
[P_OV_REQUEST] = "OVRequest",
[P_OV_REPLY] = "OVReply",
[P_OV_RESULT] = "OVResult",
[P_CSUM_RS_REQUEST] = "CsumRSRequest",
[P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
[P_COMPRESSED_BITMAP] = "CBitmap",
[P_DELAY_PROBE] = "DelayProbe",
[P_OUT_OF_SYNC] = "OutOfSync",
[P_MAX_CMD] = NULL,
};
if (cmd == P_HAND_SHAKE_M)
return "HandShakeM";
if (cmd == P_HAND_SHAKE_S)
return "HandShakeS";
if (cmd == P_HAND_SHAKE)
return "HandShake";
if (cmd >= P_MAX_CMD)
return "Unknown";
return cmdnames[cmd];
}
extern const char *cmdname(enum drbd_packet cmd);
/* for sending/receiving the bitmap,
* possibly in some encoding scheme */
......@@ -337,37 +293,24 @@ struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
u8 payload[0];
} __packed;
/* Header for big packets, Used for data packets exceeding 64kB */
struct p_header95 {
u16 magic; /* use DRBD_MAGIC_BIG here */
u16 command;
u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
u8 payload[0];
u32 length;
} __packed;
union p_header {
struct p_header80 h80;
struct p_header95 h95;
};
/*
* short commands, packets without payload, plain p_header:
* P_PING
* P_PING_ACK
* P_BECOME_SYNC_TARGET
* P_BECOME_SYNC_SOURCE
* P_UNPLUG_REMOTE
*/
struct p_header100 {
u32 magic;
u16 volume;
u16 command;
u32 length;
u32 pad;
} __packed;
/*
* commands with out-of-struct payload:
* P_BITMAP (no additional fields)
* P_DATA, P_DATA_REPLY (see p_data)
* P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
*/
extern unsigned int drbd_header_size(struct drbd_tconn *tconn);
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1 /* depricated */
......@@ -377,9 +320,10 @@ union p_header {
#define DP_FUA 16 /* equals REQ_FUA */
#define DP_FLUSH 32 /* equals REQ_FLUSH */
#define DP_DISCARD 64 /* equals REQ_DISCARD */
#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
struct p_data {
union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
......@@ -390,21 +334,18 @@ struct p_data {
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
* P_DISCARD_ACK (proto C, two-primaries conflict detection)
* P_SUPERSEDED (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
u32 seq_num;
} __packed;
struct p_block_req {
struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
......@@ -413,59 +354,52 @@ struct p_block_req {
/*
* commands with their own struct for additional fields:
* P_HAND_SHAKE
* P_CONNECTION_FEATURES
* P_BARRIER
* P_BARRIER_ACK
* P_SYNC_PARAM
* ReportParams
*/
struct p_handshake {
struct p_header80 head; /* 8 bytes */
struct p_connection_features {
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
/* should be more than enough for future enhancements
* for now, feature_flags and the reserverd array shall be zero.
* for now, feature_flags and the reserved array shall be zero.
*/
u32 _pad;
u64 reserverd[7];
u64 reserved[7];
} __packed;
/* 80 bytes, FIXED for the next century */
struct p_barrier {
struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
struct p_header80 head;
u32 rate;
u32 resync_rate;
/* Since protocol version 88 and higher. */
char verify_alg[0];
} __packed;
struct p_rs_param_89 {
struct p_header80 head;
u32 rate;
u32 resync_rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
struct p_rs_param_95 {
struct p_header80 head;
u32 rate;
u32 resync_rate;
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
u32 c_plan_ahead;
......@@ -475,12 +409,11 @@ struct p_rs_param_95 {
} __packed;
enum drbd_conn_flags {
CF_WANT_LOSE = 1,
CF_DISCARD_MY_DATA = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
......@@ -494,17 +427,14 @@ struct p_protocol {
} __packed;
struct p_uuids {
struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
......@@ -514,18 +444,15 @@ struct p_sizes {
} __packed;
struct p_state {
struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
struct p_header80 head;
u32 retcode;
} __packed;
......@@ -539,15 +466,7 @@ struct p_drbd06_param {
u32 bit_map_gen[5];
} __packed;
struct p_discard {
struct p_header80 head;
u64 block_id;
u32 seq_num;
u32 pad;
} __packed;
struct p_block_desc {
struct p_header80 head;
u64 sector;
u32 blksize;
u32 pad; /* to multiple of 8 Byte */
......@@ -563,7 +482,6 @@ enum drbd_bitmap_code {
};
struct p_compressed_bm {
struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
......@@ -575,90 +493,22 @@ struct p_compressed_bm {
} __packed;
struct p_delay_probe93 {
struct p_header80 head;
u32 seq_num; /* sequence number to match the two probe packets */
u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
/* DCBP: Drbd Compressed Bitmap Packet ... */
static inline enum drbd_bitmap_code
DCBP_get_code(struct p_compressed_bm *p)
{
return (enum drbd_bitmap_code)(p->encoding & 0x0f);
}
static inline void
DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
BUG_ON(code & ~0xf);
p->encoding = (p->encoding & ~0xf) | code;
}
static inline int
DCBP_get_start(struct p_compressed_bm *p)
{
return (p->encoding & 0x80) != 0;
}
static inline void
DCBP_set_start(struct p_compressed_bm *p, int set)
{
p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
}
static inline int
DCBP_get_pad_bits(struct p_compressed_bm *p)
{
return (p->encoding >> 4) & 0x7;
}
static inline void
DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
{
BUG_ON(n & ~0x7);
p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}
/* one bitmap packet, including the p_header,
* should fit within one _architecture independend_ page.
* so we need to use the fixed size 4KiB page size
* most architectures have used for a long time.
/*
* Bitmap packets need to fit within a single page on the sender and receiver,
* so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
*/
#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
/* drbd_send_bitmap / receive_bitmap would break horribly */
#error "PAGE_SIZE too small"
#endif
union p_polymorph {
union p_header header;
struct p_handshake handshake;
struct p_data data;
struct p_block_ack block_ack;
struct p_barrier barrier;
struct p_barrier_ack barrier_ack;
struct p_rs_param_89 rs_param_89;
struct p_rs_param_95 rs_param_95;
struct p_protocol protocol;
struct p_sizes sizes;
struct p_uuids uuids;
struct p_state state;
struct p_req_state req_state;
struct p_req_state_reply req_state_reply;
struct p_block_req block_req;
struct p_delay_probe93 delay_probe93;
struct p_rs_uuid rs_uuid;
struct p_block_desc block_desc;
} __packed;
#define DRBD_SOCKET_BUFFER_SIZE 4096
/**********************************************************************/
enum drbd_thread_state {
None,
Running,
Exiting,
Restarting
NONE,
RUNNING,
EXITING,
RESTARTING
};
struct drbd_thread {
......@@ -667,8 +517,9 @@ struct drbd_thread {
struct completion stop;
enum drbd_thread_state t_state;
int (*function) (struct drbd_thread *);
struct drbd_conf *mdev;
struct drbd_tconn *tconn;
int reset_cpu_mask;
char name[9];
};
static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
......@@ -681,58 +532,54 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
return thi->t_state;
}
struct drbd_work;
typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
struct drbd_work {
struct list_head list;
drbd_work_cb cb;
int (*cb)(struct drbd_work *, int cancel);
union {
struct drbd_conf *mdev;
struct drbd_tconn *tconn;
};
};
struct drbd_tl_epoch;
#include "drbd_interval.h"
extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *);
struct drbd_request {
struct drbd_work w;
struct drbd_conf *mdev;
/* if local IO is not allowed, will be NULL.
* if local IO _is_ allowed, holds the locally submitted bio clone,
* or, after local IO completion, the ERR_PTR(error).
* see drbd_endio_pri(). */
* see drbd_request_endio(). */
struct bio *private_bio;
struct hlist_node collision;
sector_t sector;
unsigned int size;
unsigned int epoch; /* barrier_nr */
struct drbd_interval i;
/* barrier_nr: used to check on "completion" whether this req was in
/* epoch: used to check on "completion" whether this req was in
* the current epoch, and we therefore have to close it,
* starting a new epoch...
* causing a p_barrier packet to be send, starting a new epoch.
*
* This corresponds to "barrier" in struct p_barrier[_ack],
* and to "barrier_nr" in struct drbd_epoch (and various
* comments/function parameters/local variable names).
*/
unsigned int epoch;
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
unsigned long rq_state; /* see comments above _req_mod() */
unsigned long start_time;
};
struct drbd_tl_epoch {
struct drbd_work w;
struct list_head requests; /* requests before */
struct drbd_tl_epoch *next; /* pointer to the next barrier */
unsigned int br_number; /* the barriers identifier. */
int n_writes; /* number of requests attached before this barrier */
};
/* once it hits 0, we may complete the master_bio */
atomic_t completion_ref;
/* once it hits 0, we may destroy this drbd_request object */
struct kref kref;
struct drbd_request;
/* These Tl_epoch_entries may be in one of 6 lists:
active_ee .. data packet being written
sync_ee .. syncer block being written
done_ee .. block written, need to send P_WRITE_ACK
read_ee .. [RS]P_DATA_REQUEST being read
*/
unsigned rq_state; /* see comments above _req_mod() */
};
struct drbd_epoch {
struct drbd_tconn *tconn;
struct list_head list;
unsigned int barrier_nr;
atomic_t epoch_size; /* increased on every request added. */
......@@ -762,17 +609,14 @@ struct digest_info {
void *digest;
};
struct drbd_epoch_entry {
struct drbd_peer_request {
struct drbd_work w;
struct hlist_node collision;
struct drbd_epoch *epoch; /* for writes */
struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
unsigned int size;
struct drbd_interval i;
/* see comments on ee flag bits below */
unsigned long flags;
sector_t sector;
union {
u64 block_id;
struct digest_info *digest;
......@@ -793,31 +637,37 @@ enum {
* we need to resubmit without the barrier flag. */
__EE_RESUBMITTED,
/* we may have several bios per epoch entry.
/* we may have several bios per peer request.
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
/* Conflicting local requests need to be restarted after this request */
__EE_RESTART_REQUESTS,
/* The peer wants a write ACK for this (wire proto C) */
__EE_SEND_WRITE_ACK,
/* Is set when net_conf had two_primaries set while creating this peer_req */
__EE_IN_INTERVAL_TREE,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
/* global flag bits */
enum drbd_flag {
CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
/* flag bits per mdev */
enum {
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
CL_ST_CHG_SUCCESS,
CL_ST_CHG_FAIL,
CRASHED_PRIMARY, /* This node was a crashed primary.
......@@ -835,33 +685,14 @@ enum drbd_flag {
WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
CONFIG_PENDING, /* serialization of (re)configuration requests.
* if set, also prevents the device from dying */
DEVICE_DYING, /* device became unconfigured,
* but worker thread is still handling the cleanup.
* reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
* while this is set. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from
* the peer, if it changed there as well. */
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
NEW_CUR_UUID, /* Create new current UUID when thawing IO */
AL_SUSPENDED, /* Activity logging is currently suspended. */
AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
STATE_SENT, /* Do not change state/UUIDs while this is set */
CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
* pending, from drbd worker context.
* If set, bdi_write_congested() returns true,
* so shrink_page_list() would not recurse into,
* and potentially deadlock on, this drbd worker.
*/
DISCONNECT_SENT, /* Currently the last bit in this 32bit word */
/* keep last */
DRBD_N_FLAGS,
B_RS_H_DONE, /* Before resync handler done (already executed) */
DISCARD_MY_DATA, /* discard_my_data flag per volume */
READ_BALANCE_RR,
};
struct drbd_bitmap; /* opaque for drbd_conf */
......@@ -899,18 +730,17 @@ enum bm_flag {
struct drbd_work_queue {
struct list_head q;
struct semaphore s; /* producers up it, worker down()s it */
spinlock_t q_lock; /* to protect the list. */
wait_queue_head_t q_wait;
};
struct drbd_socket {
struct drbd_work_queue work;
struct mutex mutex;
struct socket *socket;
/* this way we get our
* send/receive buffers off the stack */
union p_polymorph sbuf;
union p_polymorph rbuf;
void *sbuf;
void *rbuf;
};
struct drbd_md {
......@@ -927,24 +757,16 @@ struct drbd_md {
s32 bm_offset; /* signed relative sector offset to bitmap */
/* u32 al_nr_extents; important for restoring the AL
* is stored into sync_conf.al_extents, which in turn
* is stored into ldev->dc.al_extents, which in turn
* gets applied to act_log->nr_elements
*/
};
/* for sync_conf and other types... */
#define NL_PACKET(name, number, fields) struct name { fields };
#define NL_INTEGER(pn,pr,member) int member;
#define NL_INT64(pn,pr,member) __u64 member;
#define NL_BIT(pn,pr,member) unsigned member:1;
#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
#include <linux/drbd_nl.h>
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
struct drbd_md md;
struct disk_conf dc; /* The user provided config... */
struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */
sector_t known_size; /* last known size of that backing device */
};
......@@ -968,17 +790,116 @@ enum write_ordering_e {
};
struct fifo_buffer {
int *values;
unsigned int head_index;
unsigned int size;
int total; /* sum of all values */
int values[0];
};
extern struct fifo_buffer *fifo_alloc(int fifo_size);
/* flag bits per tconn */
enum {
NET_CONGESTED, /* The data socket is congested */
RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
SEND_PING, /* whether asender should send a ping asap */
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
CONN_WD_ST_CHG_OKAY,
CONN_WD_ST_CHG_FAIL,
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
STATE_SENT, /* Do not change state/UUIDs while this is set */
CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
* pending, from drbd worker context.
* If set, bdi_write_congested() returns true,
* so shrink_page_list() would not recurse into,
* and potentially deadlock on, this drbd worker.
*/
DISCONNECT_SENT,
};
struct drbd_tconn { /* is a resource from the config file */
char *name; /* Resource name */
struct list_head all_tconn; /* linked on global drbd_tconns */
struct kref kref;
struct idr volumes; /* <tconn, vnr> to mdev mapping */
enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
unsigned susp:1; /* IO suspended by user */
unsigned susp_nod:1; /* IO suspended because no data */
unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
struct mutex cstate_mutex; /* Protects graceful disconnects */
unsigned long flags;
struct net_conf *net_conf; /* content protected by rcu */
struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
struct res_opts res_opts;
struct sockaddr_storage my_addr;
int my_addr_len;
struct sockaddr_storage peer_addr;
int peer_addr_len;
struct drbd_socket data; /* data/barrier/cstate/parameter packets */
struct drbd_socket meta; /* ping/ack (metadata) packets */
int agreed_pro_version; /* actually used protocol version */
unsigned long last_received; /* in jiffies, either socket */
unsigned int ko_count;
spinlock_t req_lock;
struct list_head transfer_log; /* all requests not yet fully processed */
struct crypto_hash *cram_hmac_tfm;
struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */
struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
struct crypto_hash *csums_tfm;
struct crypto_hash *verify_tfm;
void *int_dig_in;
void *int_dig_vv;
/* receiver side */
struct drbd_epoch *current_epoch;
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
atomic_t current_tle_nr; /* transfer log epoch number */
unsigned current_tle_writes; /* writes seen within this tl epoch */
unsigned long last_reconnect_jif;
struct drbd_thread receiver;
struct drbd_thread worker;
struct drbd_thread asender;
cpumask_var_t cpu_mask;
/* sender side */
struct drbd_work_queue sender_work;
struct {
/* whether this sender thread
* has processed a single write yet. */
bool seen_any_write_yet;
/* Which barrier number to send with the next P_BARRIER */
int current_epoch_nr;
/* how many write requests have been sent
* with req->epoch == current_epoch_nr.
* If none, no P_BARRIER will be sent. */
unsigned current_epoch_writes;
} send;
};
struct drbd_conf {
unsigned long drbd_flags[(DRBD_N_FLAGS + BITS_PER_LONG -1)/BITS_PER_LONG];
struct drbd_tconn *tconn;
int vnr; /* volume number within the connection */
struct kref kref;
/* things that are stored as / read from meta data on disk */
unsigned long flags;
/* configured by drbdsetup */
struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
struct syncer_conf sync_conf;
struct drbd_backing_dev *ldev __protected_by(local);
sector_t p_size; /* partner's disk size */
......@@ -986,11 +907,7 @@ struct drbd_conf {
struct block_device *this_bdev;
struct gendisk *vdisk;
struct drbd_socket data; /* data/barrier/cstate/parameter packets */
struct drbd_socket meta; /* ping/ack (metadata) packets */
int agreed_pro_version; /* actually used protocol version */
unsigned long last_received; /* in jiffies, either socket */
unsigned int ko_count;
unsigned long last_reattach_jif;
struct drbd_work resync_work,
unplug_work,
go_diskless,
......@@ -1010,10 +927,9 @@ struct drbd_conf {
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
union drbd_state state;
union drbd_dev_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
......@@ -1023,17 +939,12 @@ struct drbd_conf {
atomic_t ap_bio_cnt; /* Requests we need to complete */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
atomic_t unacked_cnt; /* Need to send replys for */
atomic_t unacked_cnt; /* Need to send replies for */
atomic_t local_cnt; /* Waiting for local completion */
atomic_t net_cnt; /* Users of net_conf */
spinlock_t req_lock;
struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
struct drbd_tl_epoch *newest_tle;
struct drbd_tl_epoch *oldest_tle;
struct list_head out_of_sequence_requests;
struct list_head barrier_acked_requests;
struct hlist_head *tl_hash;
unsigned int tl_hash_s;
/* Interval tree of pending local requests */
struct rb_root read_requests;
struct rb_root write_requests;
/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
unsigned long rs_total;
......@@ -1053,6 +964,7 @@ struct drbd_conf {
unsigned long rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
unsigned long rs_last_bcast; /* [unit jiffies] */
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
......@@ -1064,14 +976,7 @@ struct drbd_conf {
/* size of out-of-sync range in sectors. */
sector_t ov_last_oos_size;
unsigned long ov_left; /* in bits */
struct crypto_hash *csums_tfm;
struct crypto_hash *verify_tfm;
unsigned long last_reattach_jif;
unsigned long last_reconnect_jif;
struct drbd_thread receiver;
struct drbd_thread worker;
struct drbd_thread asender;
struct drbd_bitmap *bitmap;
unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
......@@ -1084,29 +989,19 @@ struct drbd_conf {
int open_cnt;
u64 *p_uuid;
struct drbd_epoch *current_epoch;
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* send ack */
struct list_head read_ee; /* IO in progress (any read) */
struct list_head done_ee; /* need to send P_WRITE_ACK */
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
struct hlist_head *ee_hash; /* is proteced by req_lock! */
unsigned int ee_hash_s;
/* this one is protected by ee_lock, single thread */
struct drbd_epoch_entry *last_write_w_barrier;
int next_barrier_nr;
struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
struct drbd_md_io md_io;
atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
spinlock_t al_lock;
......@@ -1115,22 +1010,16 @@ struct drbd_conf {
unsigned int al_tr_number;
int al_tr_cycle;
int al_tr_pos; /* position of the next transaction in the journal */
struct crypto_hash *cram_hmac_tfm;
struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
void *int_dig_out;
void *int_dig_in;
void *int_dig_vv;
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
spinlock_t peer_seq_lock;
unsigned int minor;
unsigned long comm_bm_set; /* communicated number of set bits. */
cpumask_var_t cpu_mask;
struct bm_io_work bm_io_work;
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
struct mutex own_state_mutex;
struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */
char congestion_reason; /* Why we where congested... */
atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
atomic_t rs_sect_ev; /* for submitted resync data rate, both */
......@@ -1138,46 +1027,16 @@ struct drbd_conf {
int rs_last_events; /* counter of read or write "events" (unit sectors)
* on the lower level device when we last looked. */
int c_sync_rate; /* current resync rate after syncer throttle magic */
struct fifo_buffer rs_plan_s; /* correction values of resync planer */
struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
int rs_planed; /* resync sectors already planned */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
unsigned int peer_max_bio_size;
unsigned int local_max_bio_size;
};
static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
{
set_bit(f, &mdev->drbd_flags[0]);
}
static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
{
clear_bit(f, &mdev->drbd_flags[0]);
}
static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f)
{
return test_bit(f, &mdev->drbd_flags[0]);
}
static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
{
return test_and_set_bit(f, &mdev->drbd_flags[0]);
}
static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
{
return test_and_clear_bit(f, &mdev->drbd_flags[0]);
}
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
{
struct drbd_conf *mdev;
mdev = minor < minor_count ? minor_table[minor] : NULL;
return mdev;
return (struct drbd_conf *)idr_find(&minors, minor);
}
static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
......@@ -1185,29 +1044,9 @@ static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
return mdev->minor;
}
/* returns 1 if it was successful,
* returns 0 if there was no data socket.
* so wherever you are going to use the data.socket, e.g. do
* if (!drbd_get_data_sock(mdev))
* return 0;
* CODE();
* drbd_put_data_sock(mdev);
*/
static inline int drbd_get_data_sock(struct drbd_conf *mdev)
{
mutex_lock(&mdev->data.mutex);
/* drbd_disconnect() could have called drbd_free_sock()
* while we were waiting in down()... */
if (unlikely(mdev->data.socket == NULL)) {
mutex_unlock(&mdev->data.mutex);
return 0;
}
return 1;
}
static inline void drbd_put_data_sock(struct drbd_conf *mdev)
static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr)
{
mutex_unlock(&mdev->data.mutex);
return (struct drbd_conf *)idr_find(&tconn->volumes, vnr);
}
/*
......@@ -1216,99 +1055,69 @@ static inline void drbd_put_data_sock(struct drbd_conf *mdev)
/* drbd_main.c */
enum chg_state_flags {
CS_HARD = 1,
CS_VERBOSE = 2,
CS_WAIT_COMPLETE = 4,
CS_SERIALIZE = 8,
CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
};
enum dds_flags {
DDSF_FORCED = 1,
DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
};
extern void drbd_init_set_defaults(struct drbd_conf *mdev);
extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
enum chg_state_flags f,
union drbd_state mask,
union drbd_state val);
extern void drbd_force_state(struct drbd_conf *, union drbd_state,
union drbd_state);
extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
union drbd_state,
union drbd_state,
enum chg_state_flags);
extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
enum chg_state_flags,
struct completion *done);
extern void print_st_err(struct drbd_conf *, union drbd_state,
union drbd_state, int);
extern int drbd_thread_start(struct drbd_thread *thi);
extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task);
#ifdef CONFIG_SMP
extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn);
#else
#define drbd_thread_current_set_cpu(A) ({})
#define drbd_calc_cpu_mask(A) ({})
#endif
extern void drbd_free_resources(struct drbd_conf *mdev);
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
void *buf, size_t size, unsigned msg_flags);
extern int drbd_send_protocol(struct drbd_conf *mdev);
extern void tl_clear(struct drbd_tconn *);
extern void drbd_free_sock(struct drbd_tconn *tconn);
extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
void *buf, size_t size, unsigned msg_flags);
extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t,
unsigned);
extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd);
extern int drbd_send_protocol(struct drbd_tconn *tconn);
extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
extern int drbd_send_current_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags);
#define USE_DATA_SOCKET 1
#define USE_META_SOCKET 0
extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
enum drbd_packets cmd, struct p_header80 *h,
size_t size);
extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
char *data, size_t size);
extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
u32 set_size);
extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
struct drbd_epoch_entry *e);
extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_block_req *rp);
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_data *dp, int data_size);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
extern int drbd_send_sync_param(struct drbd_conf *mdev);
extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr,
u32 set_size);
extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet,
struct drbd_peer_request *);
extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
struct p_block_req *rp);
extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
struct p_data *dp, int data_size);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
sector_t sector, int blksize, u64 block_id);
extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
struct drbd_epoch_entry *e);
extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *);
extern int drbd_send_block(struct drbd_conf *, enum drbd_packet,
struct drbd_peer_request *);
extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
sector_t sector, int size, u64 block_id);
extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
sector_t sector,int size,
void *digest, int digest_size,
enum drbd_packets cmd);
extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector,
int size, void *digest, int digest_size,
enum drbd_packet cmd);
extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
extern int drbd_send_bitmap(struct drbd_conf *mdev);
extern int _drbd_send_bitmap(struct drbd_conf *mdev);
extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode);
extern void drbd_free_bc(struct drbd_backing_dev *ldev);
extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
extern void conn_md_sync(struct drbd_tconn *tconn);
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
......@@ -1334,33 +1143,52 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
extern int drbd_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags);
extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern void drbd_go_diskless(struct drbd_conf *mdev);
extern void drbd_ldev_destroy(struct drbd_conf *mdev);
/* Meta data layout
We reserve a 128MB Block (4k aligned)
* either at the end of the backing device
* or on a separate meta data device. */
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
/* The following numbers are sectors */
#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
/* Allows up to about 3.8TB */
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
/* Since the smalles IO unit is usually 512 byte */
#define MD_SECTOR_SHIFT 9
#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
/* activity log */
#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
/* Allows up to about 3.8TB, so if you want more,
* you need to use the "flexible" meta data format. */
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
/* we do all meta data IO in 4k blocks */
#define MD_BLOCK_SHIFT 12
#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
/* One activity log extent represents 4M of storage */
#define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
/* We could make these currently hardcoded constants configurable
* variables at create-md time (or even re-configurable at runtime?).
* Which will require some more changes to the DRBD "super block"
* and attach code.
*
* updates per transaction:
* This many changes to the active set can be logged with one transaction.
* This number is arbitrary.
* context per transaction:
* This many context extent numbers are logged with each transaction.
* This number is resulting from the transaction block size (4k), the layout
* of the transaction header, and the number of updates per transaction.
* See drbd_actlog.c:struct al_transaction_on_disk
* */
#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
#if BITS_PER_LONG == 32
#define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A)
......@@ -1396,11 +1224,14 @@ struct bm_extent {
#define SLEEP_TIME (HZ/10)
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
/* We do bitmap IO in units of 4k blocks.
* We also still have a hardcoded 4k per bit relation. */
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
* per sector of on disk bitmap */
#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
/* mostly arbitrarily set the represented size of one bitmap extent,
* aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
* at 4k per bit resolution) */
#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
......@@ -1468,17 +1299,20 @@ struct bm_extent {
#endif
#endif
/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
* With a value of 8 all IO in one 128K block make it to the same slot of the
* hash table. */
#define HT_SHIFT 8
#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
* so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
* Since we may live in a mixed-platform cluster,
* we limit us to a platform agnostic constant here for now.
* A followup commit may allow even bigger BIO sizes,
* once we thought that through. */
#define DRBD_MAX_BIO_SIZE (1U << 20)
#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
#endif
#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
extern int drbd_bm_init(struct drbd_conf *mdev);
extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
......@@ -1500,11 +1334,11 @@ extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr);
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
unsigned long al_enr);
extern size_t drbd_bm_words(struct drbd_conf *mdev);
extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
......@@ -1529,7 +1363,7 @@ extern void drbd_bm_unlock(struct drbd_conf *mdev);
/* drbd_main.c */
extern struct kmem_cache *drbd_request_cache;
extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
extern struct kmem_cache *drbd_ee_cache; /* peer requests */
extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
......@@ -1569,12 +1403,22 @@ extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
extern rwlock_t global_state_lock;
extern struct drbd_conf *drbd_new_device(unsigned int minor);
extern void drbd_free_mdev(struct drbd_conf *mdev);
extern int conn_lowest_minor(struct drbd_tconn *tconn);
enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr);
extern void drbd_minor_destroy(struct kref *kref);
extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts);
extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts);
extern void conn_destroy(struct kref *kref);
struct drbd_tconn *conn_get_by_name(const char *name);
extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
void *peer_addr, int peer_addr_len);
extern void conn_free_crypto(struct drbd_tconn *tconn);
extern int proc_details;
/* drbd_req */
extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
......@@ -1582,10 +1426,11 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t);
/* drbd_nl.c */
extern int drbd_msg_put_info(const char *info);
extern void drbd_suspend_io(struct drbd_conf *mdev);
extern void drbd_resume_io(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size);
extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *);
......@@ -1593,13 +1438,14 @@ extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
enum drbd_role new_role,
int force);
extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
extern bool conn_try_outdate_peer(struct drbd_tconn *tconn);
extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
extern int drbd_worker(struct drbd_thread *thi);
extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor);
void drbd_resync_after_changed(struct drbd_conf *mdev);
extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
extern void resume_next_sg(struct drbd_conf *mdev);
extern void suspend_other_sg(struct drbd_conf *mdev);
......@@ -1608,13 +1454,13 @@ extern int drbd_resync_finished(struct drbd_conf *mdev);
extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
extern void drbd_md_put_buffer(struct drbd_conf *mdev);
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev, sector_t sector, int rw);
struct drbd_backing_dev *bdev, sector_t sector, int rw);
extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int);
extern void wait_until_done_or_force_detached(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev, unsigned int *done);
extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
static inline void ov_oos_print(struct drbd_conf *mdev)
static inline void ov_out_of_sync_print(struct drbd_conf *mdev)
{
if (mdev->ov_last_oos_size) {
dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
......@@ -1626,97 +1472,102 @@ static inline void ov_oos_print(struct drbd_conf *mdev)
extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *,
struct drbd_peer_request *, void *);
/* worker callbacks */
extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_data_req(struct drbd_work *, int);
extern int w_e_end_rsdata_req(struct drbd_work *, int);
extern int w_e_end_csum_rs_req(struct drbd_work *, int);
extern int w_e_end_ov_reply(struct drbd_work *, int);
extern int w_e_end_ov_req(struct drbd_work *, int);
extern int w_ov_finished(struct drbd_work *, int);
extern int w_resync_timer(struct drbd_work *, int);
extern int w_send_write_hint(struct drbd_work *, int);
extern int w_make_resync_request(struct drbd_work *, int);
extern int w_send_dblock(struct drbd_work *, int);
extern int w_send_read_req(struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_work *, int);
extern int w_e_reissue(struct drbd_work *, int);
extern int w_restart_disk_io(struct drbd_work *, int);
extern int w_send_out_of_sync(struct drbd_work *, int);
extern int w_start_resync(struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
extern void start_resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
u64 id,
sector_t sector,
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local);
extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
int is_net);
#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern int drbd_submit_peer_request(struct drbd_conf *,
struct drbd_peer_request *, const unsigned,
const int);
extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64,
sector_t, unsigned int,
gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *,
int);
#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool);
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
extern void drbd_flush_workqueue(struct drbd_conf *mdev);
extern void drbd_free_tl_hash(struct drbd_conf *mdev);
extern void conn_flush_workqueue(struct drbd_tconn *tconn);
extern int drbd_connected(struct drbd_conf *mdev);
static inline void drbd_flush_workqueue(struct drbd_conf *mdev)
{
conn_flush_workqueue(mdev->tconn);
}
/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
* mess with get_fs/set_fs, we know we are KERNEL_DS always. */
/* Yes, there is kernel_setsockopt, but only since 2.6.18.
* So we have our own copy of it here. */
static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, int optlen)
char *optval, int optlen)
{
mm_segment_t oldfs = get_fs();
char __user *uoptval;
int err;
uoptval = (char __user __force *)optval;
set_fs(KERNEL_DS);
if (level == SOL_SOCKET)
err = sock_setsockopt(sock, level, optname, optval, optlen);
err = sock_setsockopt(sock, level, optname, uoptval, optlen);
else
err = sock->ops->setsockopt(sock, level, optname, optval,
err = sock->ops->setsockopt(sock, level, optname, uoptval,
optlen);
set_fs(oldfs);
return err;
}
static inline void drbd_tcp_cork(struct socket *sock)
{
int __user val = 1;
int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
(char __user *)&val, sizeof(val));
(char*)&val, sizeof(val));
}
static inline void drbd_tcp_uncork(struct socket *sock)
{
int __user val = 0;
int val = 0;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
(char __user *)&val, sizeof(val));
(char*)&val, sizeof(val));
}
static inline void drbd_tcp_nodelay(struct socket *sock)
{
int __user val = 1;
int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
(char __user *)&val, sizeof(val));
(char*)&val, sizeof(val));
}
static inline void drbd_tcp_quickack(struct socket *sock)
{
int __user val = 2;
int val = 2;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
(char __user *)&val, sizeof(val));
(char*)&val, sizeof(val));
}
void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo);
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
......@@ -1725,8 +1576,8 @@ extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
......@@ -1734,7 +1585,6 @@ extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
extern int drbd_rs_del_all(struct drbd_conf *mdev);
extern void drbd_rs_failed_io(struct drbd_conf *mdev,
sector_t sector, int size);
extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
......@@ -1744,73 +1594,24 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
#define drbd_set_out_of_sync(mdev, sector, size) \
__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
extern void drbd_al_shrink(struct drbd_conf *mdev);
/* drbd_nl.c */
void drbd_nl_cleanup(void);
int __init drbd_nl_init(void);
void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
void drbd_bcast_sync_progress(struct drbd_conf *mdev);
void drbd_bcast_ee(struct drbd_conf *mdev,
const char *reason, const int dgs,
const char* seen_hash, const char* calc_hash,
const struct drbd_epoch_entry* e);
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
*
* The NS macros expand to a mask and a value, that can be bit ored onto the
* current state as soon as the spinlock (req_lock) was taken.
*
* The _NS macros are used for state functions that get called with the
* spinlock. These macros expand directly to the new state value.
*
* Besides the basic forms NS() and _NS() additional _?NS[23] are defined
* to express state changes that affect more than one aspect of the state.
*
* E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
* Means that the network connection was established and that the peer
* is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define susp_nod_MASK 1
#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val.T3 = (S3); val; })
#define _NS(D, T, S) \
D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
/* state info broadcast */
struct sib_info {
enum drbd_state_info_bcast_reason sib_reason;
union {
struct {
char *helper_name;
unsigned helper_exit_code;
};
struct {
union drbd_state os;
union drbd_state ns;
};
};
};
void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib);
/*
* inline helper functions
......@@ -1827,9 +1628,10 @@ static inline struct page *page_chain_next(struct page *page)
#define page_chain_for_each_safe(page, n) \
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
{
struct page *page = e->pages;
struct page *page = peer_req->pages;
page_chain_for_each(page) {
if (page_count(page) > 1)
return 1;
......@@ -1837,18 +1639,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
return 0;
}
static inline void drbd_state_lock(struct drbd_conf *mdev)
{
wait_event(mdev->misc_wait,
!drbd_test_and_set_flag(mdev, CLUSTER_ST_CHANGE));
}
static inline void drbd_state_unlock(struct drbd_conf *mdev)
{
drbd_clear_flag(mdev, CLUSTER_ST_CHANGE);
wake_up(&mdev->misc_wait);
}
static inline enum drbd_state_rv
_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
enum chg_state_flags flags, struct completion *done)
......@@ -1862,21 +1652,16 @@ _drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
return rv;
}
/**
* drbd_request_state() - Reqest a state change
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*
* This is the most graceful way of requesting a state change. It is verbose
* quite verbose in case the state change is not possible, and all those
* state changes are globally serialized.
*/
static inline int drbd_request_state(struct drbd_conf *mdev,
union drbd_state mask,
union drbd_state val)
static inline union drbd_state drbd_read_state(struct drbd_conf *mdev)
{
return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
union drbd_state rv;
rv.i = mdev->state.i;
rv.susp = mdev->tconn->susp;
rv.susp_nod = mdev->tconn->susp_nod;
rv.susp_fen = mdev->tconn->susp_fen;
return rv;
}
enum drbd_force_detach_flags {
......@@ -1891,8 +1676,13 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
enum drbd_force_detach_flags df,
const char *where)
{
switch (mdev->ldev->dc.on_io_error) {
case EP_PASS_ON:
enum drbd_io_error_p ep;
rcu_read_lock();
ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error;
rcu_read_unlock();
switch (ep) {
case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where);
......@@ -1923,11 +1713,11 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
* we read meta data only once during attach,
* which will fail in case of errors.
*/
drbd_set_flag(mdev, WAS_IO_ERROR);
set_bit(WAS_IO_ERROR, &mdev->flags);
if (df == DRBD_READ_ERROR)
drbd_set_flag(mdev, WAS_READ_ERROR);
set_bit(WAS_READ_ERROR, &mdev->flags);
if (df == DRBD_FORCE_DETACH)
drbd_set_flag(mdev, FORCE_DETACH);
set_bit(FORCE_DETACH, &mdev->flags);
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
dev_err(DEV,
......@@ -1951,9 +1741,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
{
if (error) {
unsigned long flags;
spin_lock_irqsave(&mdev->req_lock, flags);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
__drbd_chk_io_error_(mdev, forcedetach, where);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
}
}
......@@ -1965,9 +1755,9 @@ static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
* BTW, for internal meta data, this happens to be the maximum capacity
* we could agree upon with our peer node.
*/
static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
{
switch (bdev->dc.meta_dev_idx) {
switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
......@@ -1977,13 +1767,30 @@ static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
}
}
static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
{
int meta_dev_idx;
rcu_read_lock();
meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
rcu_read_unlock();
return _drbd_md_first_sector(meta_dev_idx, bdev);
}
/**
* drbd_md_last_sector() - Return the last sector number of the meta data area
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
switch (bdev->dc.meta_dev_idx) {
int meta_dev_idx;
rcu_read_lock();
meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
rcu_read_unlock();
switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + MD_AL_OFFSET - 1;
......@@ -2011,12 +1818,18 @@ static inline sector_t drbd_get_capacity(struct block_device *bdev)
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
switch (bdev->dc.meta_dev_idx) {
int meta_dev_idx;
rcu_read_lock();
meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
rcu_read_unlock();
switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
drbd_md_first_sector(bdev))
_drbd_md_first_sector(meta_dev_idx, bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
......@@ -2042,9 +1855,15 @@ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
switch (bdev->dc.meta_dev_idx) {
int meta_dev_idx;
rcu_read_lock();
meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
rcu_read_unlock();
switch (meta_dev_idx) {
default: /* external, some index */
return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
return MD_RESERVED_SECT * meta_dev_idx;
case DRBD_MD_INDEX_INTERNAL:
/* with drbd08, internal meta data is always "flexible" */
case DRBD_MD_INDEX_FLEX_INT:
......@@ -2070,9 +1889,8 @@ drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add(&w->list, &q->q);
up(&q->s); /* within the spinlock,
see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
wake_up(&q->q_wait);
}
static inline void
......@@ -2081,41 +1899,35 @@ drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add_tail(&w->list, &q->q);
up(&q->s); /* within the spinlock,
see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
wake_up(&q->q_wait);
}
static inline void wake_asender(struct drbd_conf *mdev)
{
if (drbd_test_flag(mdev, SIGNAL_ASENDER))
force_sig(DRBD_SIG, mdev->asender.task);
}
static inline void request_ping(struct drbd_conf *mdev)
static inline void wake_asender(struct drbd_tconn *tconn)
{
drbd_set_flag(mdev, SEND_PING);
wake_asender(mdev);
if (test_bit(SIGNAL_ASENDER, &tconn->flags))
force_sig(DRBD_SIG, tconn->asender.task);
}
static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
enum drbd_packets cmd)
static inline void request_ping(struct drbd_tconn *tconn)
{
struct p_header80 h;
return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
set_bit(SEND_PING, &tconn->flags);
wake_asender(tconn);
}
static inline int drbd_send_ping(struct drbd_conf *mdev)
{
struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
}
extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *);
extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *);
extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *,
enum drbd_packet, unsigned int, void *,
unsigned int);
extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *,
enum drbd_packet, unsigned int, void *,
unsigned int);
static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
{
struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
}
extern int drbd_send_ping(struct drbd_tconn *tconn);
extern int drbd_send_ping_ack(struct drbd_tconn *tconn);
extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state);
extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state);
static inline void drbd_thread_stop(struct drbd_thread *thi)
{
......@@ -2137,21 +1949,21 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
* or implicit barrier packets as necessary.
* increased:
* w_send_barrier
* _req_mod(req, queue_for_net_write or queue_for_net_read);
* _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
* it is much easier and equally valid to count what we queue for the
* worker, even before it actually was queued or send.
* (drbd_make_request_common; recovery path on read io-error)
* decreased:
* got_BarrierAck (respective tl_clear, tl_clear_barrier)
* _req_mod(req, data_received)
* _req_mod(req, DATA_RECEIVED)
* [from receive_DataReply]
* _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
* _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
* [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
* for some reason it is NOT decreased in got_NegAck,
* but in the resulting cleanup code from report_params.
* we should try to remember the reason for that...
* _req_mod(req, send_failed or send_canceled)
* _req_mod(req, connection_lost_while_pending)
* _req_mod(req, SEND_FAILED or SEND_CANCELED)
* _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
* [from tl_clear_barrier]
*/
static inline void inc_ap_pending(struct drbd_conf *mdev)
......@@ -2159,17 +1971,19 @@ static inline void inc_ap_pending(struct drbd_conf *mdev)
atomic_inc(&mdev->ap_pending_cnt);
}
#define ERR_IF_CNT_IS_NEGATIVE(which) \
if (atomic_read(&mdev->which) < 0) \
#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \
if (atomic_read(&mdev->which) < 0) \
dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
__func__ , __LINE__ , \
atomic_read(&mdev->which))
func, line, \
atomic_read(&mdev->which))
#define dec_ap_pending(mdev) do { \
typecheck(struct drbd_conf *, mdev); \
if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
wake_up(&mdev->misc_wait); \
ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
#define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__)
static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line)
{
if (atomic_dec_and_test(&mdev->ap_pending_cnt))
wake_up(&mdev->misc_wait);
ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
}
/* counts how many resync-related answers we still expect from the peer
* increase decrease
......@@ -2182,10 +1996,12 @@ static inline void inc_rs_pending(struct drbd_conf *mdev)
atomic_inc(&mdev->rs_pending_cnt);
}
#define dec_rs_pending(mdev) do { \
typecheck(struct drbd_conf *, mdev); \
atomic_dec(&mdev->rs_pending_cnt); \
ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
#define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__)
static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line)
{
atomic_dec(&mdev->rs_pending_cnt);
ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
}
/* counts how many answers we still need to send to the peer.
* increased on
......@@ -2201,38 +2017,18 @@ static inline void inc_unacked(struct drbd_conf *mdev)
atomic_inc(&mdev->unacked_cnt);
}
#define dec_unacked(mdev) do { \
typecheck(struct drbd_conf *, mdev); \
atomic_dec(&mdev->unacked_cnt); \
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
#define sub_unacked(mdev, n) do { \
typecheck(struct drbd_conf *, mdev); \
atomic_sub(n, &mdev->unacked_cnt); \
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
static inline void put_net_conf(struct drbd_conf *mdev)
#define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__)
static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line)
{
if (atomic_dec_and_test(&mdev->net_cnt))
wake_up(&mdev->net_cnt_wait);
atomic_dec(&mdev->unacked_cnt);
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
/**
* get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
* @mdev: DRBD device.
*
* You have to call put_net_conf() when finished working with mdev->net_conf.
*/
static inline int get_net_conf(struct drbd_conf *mdev)
#define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__)
static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line)
{
int have_net_conf;
atomic_inc(&mdev->net_cnt);
have_net_conf = mdev->state.conn >= C_UNCONNECTED;
if (!have_net_conf)
put_net_conf(mdev);
return have_net_conf;
atomic_sub(n, &mdev->unacked_cnt);
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
/**
......@@ -2336,17 +2132,20 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
* maybe re-implement using semaphores? */
static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
{
int mxb = 1000000; /* arbitrary limit on open requests */
if (get_net_conf(mdev)) {
mxb = mdev->net_conf->max_buffers;
put_net_conf(mdev);
}
struct net_conf *nc;
int mxb;
rcu_read_lock();
nc = rcu_dereference(mdev->tconn->net_conf);
mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
rcu_read_unlock();
return mxb;
}
static inline int drbd_state_is_stable(struct drbd_conf *mdev)
{
union drbd_state s = mdev->state;
union drbd_dev_state s = mdev->state;
/* DO NOT add a default clause, we want the compiler to warn us
* for any newly introduced state we may have forgotten to add here */
......@@ -2380,7 +2179,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
/* Allow IO in BM exchange states with new protocols */
case C_WF_BITMAP_S:
if (mdev->agreed_pro_version < 96)
if (mdev->tconn->agreed_pro_version < 96)
return 0;
break;
......@@ -2402,7 +2201,7 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
/* disk state is stable as well. */
break;
/* no new io accepted during tansitional states */
/* no new io accepted during transitional states */
case D_ATTACHING:
case D_NEGOTIATING:
case D_UNKNOWN:
......@@ -2414,18 +2213,20 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
return 1;
}
static inline int is_susp(union drbd_state s)
static inline int drbd_suspended(struct drbd_conf *mdev)
{
return s.susp || s.susp_nod || s.susp_fen;
struct drbd_tconn *tconn = mdev->tconn;
return tconn->susp || tconn->susp_fen || tconn->susp_nod;
}
static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
if (is_susp(mdev->state))
if (drbd_suspended(mdev))
return false;
if (drbd_test_flag(mdev, SUSPEND_IO))
if (test_bit(SUSPEND_IO, &mdev->flags))
return false;
/* to avoid potential deadlock or bitmap corruption,
......@@ -2440,35 +2241,35 @@ static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
* and we are within the spinlock anyways, we have this workaround. */
if (atomic_read(&mdev->ap_bio_cnt) > mxb)
return false;
if (drbd_test_flag(mdev, BITMAP_IO))
if (test_bit(BITMAP_IO, &mdev->flags))
return false;
return true;
}
static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
static inline bool inc_ap_bio_cond(struct drbd_conf *mdev)
{
bool rv = false;
spin_lock_irq(&mdev->req_lock);
spin_lock_irq(&mdev->tconn->req_lock);
rv = may_inc_ap_bio(mdev);
if (rv)
atomic_add(count, &mdev->ap_bio_cnt);
spin_unlock_irq(&mdev->req_lock);
atomic_inc(&mdev->ap_bio_cnt);
spin_unlock_irq(&mdev->tconn->req_lock);
return rv;
}
static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
static inline void inc_ap_bio(struct drbd_conf *mdev)
{
/* we wait here
* as long as the device is suspended
* until the bitmap is no longer on the fly during connection
* handshake as long as we would exeed the max_buffer limit.
* handshake as long as we would exceed the max_buffer limit.
*
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev));
}
static inline void dec_ap_bio(struct drbd_conf *mdev)
......@@ -2478,9 +2279,9 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
D_ASSERT(ap_bio >= 0);
if (ap_bio == 0 && drbd_test_flag(mdev, BITMAP_IO)) {
if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
}
/* this currently does wake_up for every dec_ap_bio!
......@@ -2490,6 +2291,12 @@ static inline void dec_ap_bio(struct drbd_conf *mdev)
wake_up(&mdev->misc_wait);
}
static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev)
{
return mdev->tconn->agreed_pro_version >= 97 &&
mdev->tconn->agreed_pro_version != 100;
}
static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
{
int changed = mdev->ed_uuid != val;
......@@ -2497,40 +2304,6 @@ static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
return changed;
}
static inline int seq_cmp(u32 a, u32 b)
{
/* we assume wrap around at 32bit.
* for wrap around at 24bit (old atomic_t),
* we'd have to
* a <<= 8; b <<= 8;
*/
return (s32)(a) - (s32)(b);
}
#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
/* CAUTION: please no side effects in arguments! */
#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
{
unsigned int m;
spin_lock(&mdev->peer_seq_lock);
m = seq_max(mdev->peer_seq, new_seq);
mdev->peer_seq = m;
spin_unlock(&mdev->peer_seq_lock);
if (m == new_seq)
wake_up(&mdev->seq_wait);
}
static inline void drbd_update_congested(struct drbd_conf *mdev)
{
struct sock *sk = mdev->data.socket->sk;
if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
drbd_set_flag(mdev, NET_CONGESTED);
}
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
/* sorry, we currently have no working implementation
......@@ -2545,15 +2318,46 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
{
int r;
if (drbd_test_flag(mdev, MD_NO_FUA))
if (mdev->ldev == NULL) {
dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n");
return;
}
if (test_bit(MD_NO_FUA, &mdev->flags))
return;
r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL);
if (r) {
drbd_set_flag(mdev, MD_NO_FUA);
set_bit(MD_NO_FUA, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
}
#endif
/* This is defined in drivers/md/md.h as well. Should go into wait.h */
#define __wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
wait_queue_t __wait; \
init_waitqueue_entry(&__wait, current); \
\
add_wait_queue(&wq, &__wait); \
for (;;) { \
set_current_state(TASK_UNINTERRUPTIBLE); \
if (condition) \
break; \
spin_unlock_irq(&lock); \
cmd; \
schedule(); \
spin_lock_irq(&lock); \
} \
current->state = TASK_RUNNING; \
remove_wait_queue(&wq, &__wait); \
} while (0)
#define wait_event_lock_irq(wq, condition, lock, cmd) \
do { \
if (condition) \
break; \
__wait_event_lock_irq(wq, condition, lock, cmd); \
} while (0)
#include <asm/bug.h>
#include <linux/rbtree_augmented.h>
#include "drbd_interval.h"
/**
* interval_end - return end of @node
*/
static inline
sector_t interval_end(struct rb_node *node)
{
struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
return this->end;
}
/**
* compute_subtree_last - compute end of @node
*
* The end of an interval is the highest (start + (size >> 9)) value of this
* node and of its children. Called for @node and its parents whenever the end
* may have changed.
*/
static inline sector_t
compute_subtree_last(struct drbd_interval *node)
{
sector_t max = node->sector + (node->size >> 9);
if (node->rb.rb_left) {
sector_t left = interval_end(node->rb.rb_left);
if (left > max)
max = left;
}
if (node->rb.rb_right) {
sector_t right = interval_end(node->rb.rb_right);
if (right > max)
max = right;
}
return max;
}
static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
{
while (rb != stop) {
struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
sector_t subtree_last = compute_subtree_last(node);
if (node->end == subtree_last)
break;
node->end = subtree_last;
rb = rb_parent(&node->rb);
}
}
static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
{
struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
new->end = old->end;
}
static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
{
struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
new->end = old->end;
old->end = compute_subtree_last(old);
}
static const struct rb_augment_callbacks augment_callbacks = {
augment_propagate,
augment_copy,
augment_rotate,
};
/**
* drbd_insert_interval - insert a new interval into a tree
*/
bool
drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
{
struct rb_node **new = &root->rb_node, *parent = NULL;
BUG_ON(!IS_ALIGNED(this->size, 512));
while (*new) {
struct drbd_interval *here =
rb_entry(*new, struct drbd_interval, rb);
parent = *new;
if (this->sector < here->sector)
new = &(*new)->rb_left;
else if (this->sector > here->sector)
new = &(*new)->rb_right;
else if (this < here)
new = &(*new)->rb_left;
else if (this > here)
new = &(*new)->rb_right;
else
return false;
}
rb_link_node(&this->rb, parent, new);
rb_insert_augmented(&this->rb, root, &augment_callbacks);
return true;
}
/**
* drbd_contains_interval - check if a tree contains a given interval
* @sector: start sector of @interval
* @interval: may not be a valid pointer
*
* Returns if the tree contains the node @interval with start sector @start.
* Does not dereference @interval until @interval is known to be a valid object
* in @tree. Returns %false if @interval is in the tree but with a different
* sector number.
*/
bool
drbd_contains_interval(struct rb_root *root, sector_t sector,
struct drbd_interval *interval)
{
struct rb_node *node = root->rb_node;
while (node) {
struct drbd_interval *here =
rb_entry(node, struct drbd_interval, rb);
if (sector < here->sector)
node = node->rb_left;
else if (sector > here->sector)
node = node->rb_right;
else if (interval < here)
node = node->rb_left;
else if (interval > here)
node = node->rb_right;
else
return true;
}
return false;
}
/**
* drbd_remove_interval - remove an interval from a tree
*/
void
drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
{
rb_erase_augmented(&this->rb, root, &augment_callbacks);
}
/**
* drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
* @sector: start sector
* @size: size, aligned to 512 bytes
*
* Returns an interval overlapping with [sector, sector + size), or NULL if
* there is none. When there is more than one overlapping interval in the
* tree, the interval with the lowest start sector is returned, and all other
* overlapping intervals will be on the right side of the tree, reachable with
* rb_next().
*/
struct drbd_interval *
drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
{
struct rb_node *node = root->rb_node;
struct drbd_interval *overlap = NULL;
sector_t end = sector + (size >> 9);
BUG_ON(!IS_ALIGNED(size, 512));
while (node) {
struct drbd_interval *here =
rb_entry(node, struct drbd_interval, rb);
if (node->rb_left &&
sector < interval_end(node->rb_left)) {
/* Overlap if any must be on left side */
node = node->rb_left;
} else if (here->sector < end &&
sector < here->sector + (here->size >> 9)) {
overlap = here;
break;
} else if (sector >= here->sector) {
/* Overlap if any must be on right side */
node = node->rb_right;
} else
break;
}
return overlap;
}
struct drbd_interval *
drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
{
sector_t end = sector + (size >> 9);
struct rb_node *node;
for (;;) {
node = rb_next(&i->rb);
if (!node)
return NULL;
i = rb_entry(node, struct drbd_interval, rb);
if (i->sector >= end)
return NULL;
if (sector < i->sector + (i->size >> 9))
return i;
}
}
#ifndef __DRBD_INTERVAL_H
#define __DRBD_INTERVAL_H
#include <linux/types.h>
#include <linux/rbtree.h>
struct drbd_interval {
struct rb_node rb;
sector_t sector; /* start sector of the interval */
unsigned int size; /* size in bytes */
sector_t end; /* highest interval end in subtree */
int local:1 /* local or remote request? */;
int waiting:1;
};
static inline void drbd_clear_interval(struct drbd_interval *i)
{
RB_CLEAR_NODE(&i->rb);
}
static inline bool drbd_interval_empty(struct drbd_interval *i)
{
return RB_EMPTY_NODE(&i->rb);
}
extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
extern bool drbd_contains_interval(struct rb_root *, sector_t,
struct drbd_interval *);
extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
unsigned int);
extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
unsigned int);
#define drbd_for_each_overlap(i, root, sector, size) \
for (i = drbd_find_overlap(root, sector, size); \
i; \
i = drbd_next_overlap(i, sector, size))
#endif /* __DRBD_INTERVAL_H */
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#include "drbd_wrappers.h"
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
#include "drbd_nla.h"
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
{
struct nlattr *head = nla_data(nla);
int len = nla_len(nla);
int rem;
/*
* validate_nla (called from nla_parse_nested) ignores attributes
* beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
* In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
* flag set also, check and remove that flag before calling
* nla_parse_nested.
*/
nla_for_each_attr(nla, head, len, rem) {
if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
if (nla_type(nla) > maxtype)
return -EOPNOTSUPP;
}
}
return 0;
}
int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy)
{
int err;
err = drbd_nla_check_mandatory(maxtype, nla);
if (!err)
err = nla_parse_nested(tb, maxtype, nla, policy);
return err;
}
struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
{
int err;
/*
* If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
* we don't know about that attribute, reject all the nested
* attributes.
*/
err = drbd_nla_check_mandatory(maxtype, nla);
if (err)
return ERR_PTR(err);
return nla_find_nested(nla, attrtype);
}
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy);
extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
#endif /* __DRBD_NLA_H */
......@@ -171,7 +171,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
if (mdev->state.conn == C_VERIFY_S ||
mdev->state.conn == C_VERIFY_T) {
bit_pos = bm_bits - mdev->ov_left;
if (mdev->agreed_pro_version >= 97)
if (verify_can_do_stop_sector(mdev))
stop_sector = mdev->ov_stop_sector;
} else
bit_pos = mdev->bm_resync_fo;
......@@ -200,9 +200,11 @@ static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
static int drbd_seq_show(struct seq_file *seq, void *v)
{
int i, hole = 0;
int i, prev_i = -1;
const char *sn;
struct drbd_conf *mdev;
struct net_conf *nc;
char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
......@@ -233,16 +235,11 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
oos .. known out-of-sync kB
*/
for (i = 0; i < minor_count; i++) {
mdev = minor_to_mdev(i);
if (!mdev) {
hole = 1;
continue;
}
if (hole) {
hole = 0;
rcu_read_lock();
idr_for_each_entry(&minors, mdev, i) {
if (prev_i != i - 1)
seq_printf(seq, "\n");
}
prev_i = i;
sn = drbd_conn_str(mdev->state.conn);
......@@ -254,6 +251,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
/* reset mdev->congestion_reason */
bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
nc = rcu_dereference(mdev->tconn->net_conf);
wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
......@@ -263,14 +262,13 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
is_susp(mdev->state) ? 's' : 'r',
wp,
drbd_suspended(mdev) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
drbd_test_flag(mdev, AL_SUSPENDED) ? 's' : '-',
test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
......@@ -282,8 +280,8 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
mdev->epochs,
write_ordering_chars[mdev->write_ordering]
mdev->tconn->epochs,
write_ordering_chars[mdev->tconn->write_ordering]
);
seq_printf(seq, " oos:%llu\n",
Bit2KB((unsigned long long)
......@@ -308,6 +306,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
}
}
}
rcu_read_unlock();
return 0;
}
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -31,6 +31,8 @@
#include "drbd_req.h"
static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
/* Update disk stats at start of I/O request */
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
{
......@@ -40,6 +42,8 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
part_round_stats(cpu, &mdev->vdisk->part0);
part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
(void) cpu; /* The macro invocations above want the cpu argument, I do not like
the compiler warning about cpu only assigned but never used... */
part_inc_in_flight(&mdev->vdisk->part0, rw);
part_stat_unlock();
}
......@@ -57,9 +61,51 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
part_stat_unlock();
}
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
static struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
struct drbd_request *req;
req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (!req)
return NULL;
drbd_req_make_private_bio(req, bio_src);
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->w.mdev = mdev;
req->master_bio = bio_src;
req->epoch = 0;
drbd_clear_interval(&req->i);
req->i.sector = bio_src->bi_sector;
req->i.size = bio_src->bi_size;
req->i.local = true;
req->i.waiting = false;
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
/* one reference to be put by __drbd_make_request */
atomic_set(&req->completion_ref, 1);
/* one kref as long as completion_ref > 0 */
kref_init(&req->kref);
return req;
}
void drbd_req_destroy(struct kref *kref)
{
const unsigned long s = req->rq_state;
struct drbd_request *req = container_of(kref, struct drbd_request, kref);
struct drbd_conf *mdev = req->w.mdev;
const unsigned s = req->rq_state;
if ((req->master_bio && !(s & RQ_POSTPONED)) ||
atomic_read(&req->completion_ref) ||
(s & RQ_LOCAL_PENDING) ||
((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
s, atomic_read(&req->completion_ref));
return;
}
/* remove it from the transfer log.
* well, only if it had been there in the first
......@@ -67,24 +113,33 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
* and never sent), it should still be "empty" as
* initialized in drbd_req_new(), so we can list_del() it
* here unconditionally */
list_del(&req->tl_requests);
list_del_init(&req->tl_requests);
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
if (rw == WRITE) {
if (s & RQ_WRITE) {
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
* READ with local io-error */
if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
drbd_set_out_of_sync(mdev, req->sector, req->size);
if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
drbd_set_in_sync(mdev, req->sector, req->size);
/* There is a special case:
* we may notice late that IO was suspended,
* and postpone, or schedule for retry, a write,
* before it even was submitted or sent.
* In that case we do not want to touch the bitmap at all.
*/
if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
drbd_set_in_sync(mdev, req->i.sector, req->i.size);
}
/* one might be tempted to move the drbd_al_complete_io
* to the local io completion callback drbd_endio_pri.
* to the local io completion callback drbd_request_endio.
* but, if this was a mirror write, we may only
* drbd_al_complete_io after this is RQ_NET_DONE,
* otherwise the extent could be dropped from the al
......@@ -93,109 +148,35 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
* but after the extent has been dropped from the al,
* we would forget to resync the corresponding extent.
*/
if (s & RQ_LOCAL_MASK) {
if (s & RQ_IN_ACT_LOG) {
if (get_ldev_if_state(mdev, D_FAILED)) {
if (s & RQ_IN_ACT_LOG)
drbd_al_complete_io(mdev, req->sector);
drbd_al_complete_io(mdev, &req->i);
put_ldev(mdev);
} else if (__ratelimit(&drbd_ratelimit_state)) {
dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
"but my Disk seems to have failed :(\n",
(unsigned long long) req->sector);
dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), "
"but my Disk seems to have failed :(\n",
(unsigned long long) req->i.sector, req->i.size);
}
}
}
drbd_req_free(req);
mempool_free(req, drbd_request_mempool);
}
static void queue_barrier(struct drbd_conf *mdev)
{
struct drbd_tl_epoch *b;
/* We are within the req_lock. Once we queued the barrier for sending,
* we set the CREATE_BARRIER bit. It is cleared as soon as a new
* barrier/epoch object is added. This is the only place this bit is
* set. It indicates that the barrier for this epoch is already queued,
* and no new epoch has been created yet. */
if (drbd_test_flag(mdev, CREATE_BARRIER))
return;
b = mdev->newest_tle;
b->w.cb = w_send_barrier;
/* inc_ap_pending done here, so we won't
* get imbalanced on connection loss.
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in tl_clear. */
inc_ap_pending(mdev);
drbd_queue_work(&mdev->data.work, &b->w);
drbd_set_flag(mdev, CREATE_BARRIER);
static void wake_all_senders(struct drbd_tconn *tconn) {
wake_up(&tconn->sender_work.q_wait);
}
static void _about_to_complete_local_write(struct drbd_conf *mdev,
struct drbd_request *req)
/* must hold resource->req_lock */
static void start_new_tl_epoch(struct drbd_tconn *tconn)
{
const unsigned long s = req->rq_state;
struct drbd_request *i;
struct drbd_epoch_entry *e;
struct hlist_node *n;
struct hlist_head *slot;
/* Before we can signal completion to the upper layers,
* we may need to close the current epoch.
* We can skip this, if this request has not even been sent, because we
* did not have a fully established connection yet/anymore, during
* bitmap exchange, or while we are C_AHEAD due to congestion policy.
*/
if (mdev->state.conn >= C_CONNECTED &&
(s & RQ_NET_SENT) != 0 &&
req->epoch == mdev->newest_tle->br_number)
queue_barrier(mdev);
/* we need to do the conflict detection stuff,
* if we have the ee_hash (two_primaries) and
* this has been on the network */
if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
const sector_t sector = req->sector;
const int size = req->size;
/* ASSERT:
* there must be no conflicting requests, since
* they must have been failed on the spot */
#define OVERLAPS overlaps(sector, size, i->sector, i->size)
slot = tl_hash_slot(mdev, sector);
hlist_for_each_entry(i, n, slot, collision) {
if (OVERLAPS) {
dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
"other: %p %llus +%u\n",
req, (unsigned long long)sector, size,
i, (unsigned long long)i->sector, i->size);
}
}
/* no point closing an epoch, if it is empty, anyways. */
if (tconn->current_tle_writes == 0)
return;
/* maybe "wake" those conflicting epoch entries
* that wait for this request to finish.
*
* currently, there can be only _one_ such ee
* (well, or some more, which would be pending
* P_DISCARD_ACK not yet sent by the asender...),
* since we block the receiver thread upon the
* first conflict detection, which will wait on
* misc_wait. maybe we want to assert that?
*
* anyways, if we found one,
* we just have to do a wake_up. */
#undef OVERLAPS
#define OVERLAPS overlaps(sector, size, e->sector, e->size)
slot = ee_hash_slot(mdev, req->sector);
hlist_for_each_entry(e, n, slot, collision) {
if (OVERLAPS) {
wake_up(&mdev->misc_wait);
break;
}
}
}
#undef OVERLAPS
tconn->current_tle_writes = 0;
atomic_inc(&tconn->current_tle_nr);
wake_all_senders(tconn);
}
void complete_master_bio(struct drbd_conf *mdev,
......@@ -205,17 +186,33 @@ void complete_master_bio(struct drbd_conf *mdev,
dec_ap_bio(mdev);
}
static void drbd_remove_request_interval(struct rb_root *root,
struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
struct drbd_interval *i = &req->i;
drbd_remove_interval(root, i);
/* Wake up any processes waiting for this request to complete. */
if (i->waiting)
wake_up(&mdev->misc_wait);
}
/* Helper for __req_mod().
* Set m->bio to the master bio, if it is fit to be completed,
* or leave it alone (it is initialized to NULL in __req_mod),
* if it has already been completed, or cannot be completed yet.
* If m->bio is set, the error status to be returned is placed in m->error.
*/
void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
static
void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
{
const unsigned long s = req->rq_state;
struct drbd_conf *mdev = req->mdev;
int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
const unsigned s = req->rq_state;
struct drbd_conf *mdev = req->w.mdev;
int rw;
int error, ok;
/* we must not complete the master bio, while it is
* still being processed by _drbd_send_zc_bio (drbd_send_dblock)
......@@ -226,178 +223,219 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
* the receiver,
* the bio_endio completion callbacks.
*/
if (s & RQ_NET_QUEUED)
return;
if (s & RQ_NET_PENDING)
if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
(s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
(s & RQ_COMPLETION_SUSP)) {
dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
return;
if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
}
if (!req->master_bio) {
dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
return;
}
if (req->master_bio) {
/* this is data_received (remote read)
* or protocol C P_WRITE_ACK
* or protocol B P_RECV_ACK
* or protocol A "handed_over_to_network" (SendAck)
* or canceled or failed,
* or killed from the transfer log due to connection loss.
*/
rw = bio_rw(req->master_bio);
/*
* figure out whether to report success or failure.
*
* report success when at least one of the operations succeeded.
* or, to put the other way,
* only report failure, when both operations failed.
*
* what to do about the failures is handled elsewhere.
* what we need to do here is just: complete the master_bio.
*
* local completion error, if any, has been stored as ERR_PTR
* in private_bio within drbd_endio_pri.
*/
int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
int error = PTR_ERR(req->private_bio);
/*
* figure out whether to report success or failure.
*
* report success when at least one of the operations succeeded.
* or, to put the other way,
* only report failure, when both operations failed.
*
* what to do about the failures is handled elsewhere.
* what we need to do here is just: complete the master_bio.
*
* local completion error, if any, has been stored as ERR_PTR
* in private_bio within drbd_request_endio.
*/
ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
error = PTR_ERR(req->private_bio);
/* remove the request from the conflict detection
* respective block_id verification hash */
if (!hlist_unhashed(&req->collision))
hlist_del(&req->collision);
else
D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
/* remove the request from the conflict detection
* respective block_id verification hash */
if (!drbd_interval_empty(&req->i)) {
struct rb_root *root;
/* for writes we need to do some extra housekeeping */
if (rw == WRITE)
_about_to_complete_local_write(mdev, req);
root = &mdev->write_requests;
else
root = &mdev->read_requests;
drbd_remove_request_interval(root, req);
} else if (!(s & RQ_POSTPONED))
D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
/* Update disk stats */
_drbd_end_io_acct(mdev, req);
/* Before we can signal completion to the upper layers,
* we may need to close the current transfer log epoch.
* We are within the request lock, so we can simply compare
* the request epoch number with the current transfer log
* epoch number. If they match, increase the current_tle_nr,
* and reset the transfer log epoch write_cnt.
*/
if (rw == WRITE &&
req->epoch == atomic_read(&mdev->tconn->current_tle_nr))
start_new_tl_epoch(mdev->tconn);
/* Update disk stats */
_drbd_end_io_acct(mdev, req);
/* If READ failed,
* have it be pushed back to the retry work queue,
* so it will re-enter __drbd_make_request(),
* and be re-assigned to a suitable local or remote path,
* or failed if we do not have access to good data anymore.
*
* Unless it was failed early by __drbd_make_request(),
* because no path was available, in which case
* it was not even added to the transfer_log.
*
* READA may fail, and will not be retried.
*
* WRITE should have used all available paths already.
*/
if (!ok && rw == READ && !list_empty(&req->tl_requests))
req->rq_state |= RQ_POSTPONED;
if (!(req->rq_state & RQ_POSTPONED)) {
m->error = ok ? 0 : (error ?: -EIO);
m->bio = req->master_bio;
req->master_bio = NULL;
}
}
if (s & RQ_LOCAL_PENDING)
return;
static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
{
struct drbd_conf *mdev = req->w.mdev;
D_ASSERT(m || (req->rq_state & RQ_POSTPONED));
if (!atomic_sub_and_test(put, &req->completion_ref))
return 0;
if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
/* this is disconnected (local only) operation,
* or protocol C P_WRITE_ACK,
* or protocol A or B P_BARRIER_ACK,
* or killed from the transfer log due to connection loss. */
_req_is_done(mdev, req, rw);
drbd_req_complete(req, m);
if (req->rq_state & RQ_POSTPONED) {
/* don't destroy the req object just yet,
* but queue it for retry */
drbd_restart_request(req);
return 0;
}
/* else: network part and not DONE yet. that is
* protocol A or B, barrier ack still pending... */
return 1;
}
static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
/* I'd like this to be the only place that manipulates
* req->completion_ref and req->kref. */
static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
int clear, int set)
{
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
unsigned s = req->rq_state;
int c_put = 0;
int k_put = 0;
if (!is_susp(mdev->state))
_req_may_be_done(req, m);
}
if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP))
set |= RQ_COMPLETION_SUSP;
/*
* checks whether there was an overlapping request
* or ee already registered.
*
* if so, return 1, in which case this request is completed on the spot,
* without ever being submitted or send.
*
* return 0 if it is ok to submit this request.
*
* NOTE:
* paranoia: assume something above us is broken, and issues different write
* requests for the same block simultaneously...
*
* To ensure these won't be reordered differently on both nodes, resulting in
* diverging data sets, we discard the later one(s). Not that this is supposed
* to happen, but this is the rationale why we also have to check for
* conflicting requests with local origin, and why we have to do so regardless
* of whether we allowed multiple primaries.
*
* BTW, in case we only have one primary, the ee_hash is empty anyways, and the
* second hlist_for_each_entry becomes a noop. This is even simpler than to
* grab a reference on the net_conf, and check for the two_primaries flag...
*/
static int _req_conflicts(struct drbd_request *req)
{
struct drbd_conf *mdev = req->mdev;
const sector_t sector = req->sector;
const int size = req->size;
struct drbd_request *i;
struct drbd_epoch_entry *e;
struct hlist_node *n;
struct hlist_head *slot;
/* apply */
D_ASSERT(hlist_unhashed(&req->collision));
req->rq_state &= ~clear;
req->rq_state |= set;
if (!get_net_conf(mdev))
return 0;
/* no change? */
if (req->rq_state == s)
return;
/* BUG_ON */
ERR_IF (mdev->tl_hash_s == 0)
goto out_no_conflict;
BUG_ON(mdev->tl_hash == NULL);
#define OVERLAPS overlaps(i->sector, i->size, sector, size)
slot = tl_hash_slot(mdev, sector);
hlist_for_each_entry(i, n, slot, collision) {
if (OVERLAPS) {
dev_alert(DEV, "%s[%u] Concurrent local write detected! "
"[DISCARD L] new: %llus +%u; "
"pending: %llus +%u\n",
current->comm, current->pid,
(unsigned long long)sector, size,
(unsigned long long)i->sector, i->size);
goto out_conflict;
}
/* intent: get references */
if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
atomic_inc(&req->completion_ref);
if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
inc_ap_pending(mdev);
atomic_inc(&req->completion_ref);
}
if (mdev->ee_hash_s) {
/* now, check for overlapping requests with remote origin */
BUG_ON(mdev->ee_hash == NULL);
#undef OVERLAPS
#define OVERLAPS overlaps(e->sector, e->size, sector, size)
slot = ee_hash_slot(mdev, sector);
hlist_for_each_entry(e, n, slot, collision) {
if (OVERLAPS) {
dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
" [DISCARD L] new: %llus +%u; "
"pending: %llus +%u\n",
current->comm, current->pid,
(unsigned long long)sector, size,
(unsigned long long)e->sector, e->size);
goto out_conflict;
}
}
if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
atomic_inc(&req->completion_ref);
if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
kref_get(&req->kref); /* wait for the DONE */
if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
atomic_add(req->i.size >> 9, &mdev->ap_in_flight);
if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
atomic_inc(&req->completion_ref);
/* progress: put references */
if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
++c_put;
if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
D_ASSERT(req->rq_state & RQ_LOCAL_PENDING);
/* local completion may still come in later,
* we need to keep the req object around. */
kref_get(&req->kref);
++c_put;
}
#undef OVERLAPS
out_no_conflict:
/* this is like it should be, and what we expected.
* our users do behave after all... */
put_net_conf(mdev);
return 0;
if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
if (req->rq_state & RQ_LOCAL_ABORTED)
++k_put;
else
++c_put;
}
out_conflict:
put_net_conf(mdev);
return 1;
if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
dec_ap_pending(mdev);
++c_put;
}
if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
++c_put;
if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
if (req->rq_state & RQ_NET_SENT)
atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
++k_put;
}
/* potentially complete and destroy */
if (k_put || c_put) {
/* Completion does it's own kref_put. If we are going to
* kref_sub below, we need req to be still around then. */
int at_least = k_put + !!c_put;
int refcount = atomic_read(&req->kref.refcount);
if (refcount < at_least)
dev_err(DEV,
"mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
s, req->rq_state, refcount, at_least);
}
/* If we made progress, retry conflicting peer requests, if any. */
if (req->i.waiting)
wake_up(&mdev->misc_wait);
if (c_put)
k_put += drbd_req_put_completion_ref(req, m, c_put);
if (k_put)
kref_sub(&req->kref, k_put, drbd_req_destroy);
}
static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req)
{
char b[BDEVNAME_SIZE];
if (__ratelimit(&drbd_ratelimit_state))
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n",
(req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
(unsigned long long)req->sector,
req->size >> 9,
(unsigned long long)req->i.sector,
req->i.size >> 9,
bdevname(mdev->ldev->backing_bdev, b));
}
......@@ -416,9 +454,12 @@ static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *re
int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
struct drbd_conf *mdev = req->mdev;
int rv = 0;
m->bio = NULL;
struct drbd_conf *mdev = req->w.mdev;
struct net_conf *nc;
int p, rv = 0;
if (m)
m->bio = NULL;
switch (what) {
default:
......@@ -427,118 +468,91 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* does not happen...
* initialization done in drbd_req_new
case created:
case CREATED:
break;
*/
case to_be_send: /* via network */
/* reached via drbd_make_request_common
case TO_BE_SENT: /* via network */
/* reached via __drbd_make_request
* and from w_read_retry_remote */
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
req->rq_state |= RQ_NET_PENDING;
inc_ap_pending(mdev);
rcu_read_lock();
nc = rcu_dereference(mdev->tconn->net_conf);
p = nc->wire_protocol;
rcu_read_unlock();
req->rq_state |=
p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
mod_rq_state(req, m, 0, RQ_NET_PENDING);
break;
case to_be_submitted: /* locally */
/* reached via drbd_make_request_common */
case TO_BE_SUBMITTED: /* locally */
/* reached via __drbd_make_request */
D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
req->rq_state |= RQ_LOCAL_PENDING;
mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
break;
case completed_ok:
case COMPLETED_OK:
if (req->rq_state & RQ_WRITE)
mdev->writ_cnt += req->size>>9;
mdev->writ_cnt += req->i.size >> 9;
else
mdev->read_cnt += req->size>>9;
mdev->read_cnt += req->i.size >> 9;
req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_LOCAL_PENDING,
RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
break;
case abort_disk_io:
req->rq_state |= RQ_LOCAL_ABORTED;
if (req->rq_state & RQ_WRITE)
_req_may_be_done_not_susp(req, m);
else
goto goto_queue_for_net_read;
case ABORT_DISK_IO:
mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
break;
case write_completed_with_error:
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
case WRITE_COMPLETED_WITH_ERROR:
drbd_report_io_error(mdev, req);
__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
case read_ahead_completed_with_error:
/* it is legal to fail READA */
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done_not_susp(req, m);
break;
case read_completed_with_error:
drbd_set_out_of_sync(mdev, req->sector, req->size);
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
if (req->rq_state & RQ_LOCAL_ABORTED) {
_req_may_be_done(req, m);
break;
}
case READ_COMPLETED_WITH_ERROR:
drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
drbd_report_io_error(mdev, req);
__drbd_chk_io_error(mdev, DRBD_READ_ERROR);
/* fall through. */
case READ_AHEAD_COMPLETED_WITH_ERROR:
/* it is legal to fail READA, no __drbd_chk_io_error in that case. */
mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
goto_queue_for_net_read:
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
/* no point in retrying if there is no good remote data,
* or we have no connection. */
if (mdev->state.pdsk != D_UP_TO_DATE) {
_req_may_be_done_not_susp(req, m);
break;
}
/* _req_mod(req,to_be_send); oops, recursion... */
req->rq_state |= RQ_NET_PENDING;
inc_ap_pending(mdev);
/* fall through: _req_mod(req,queue_for_net_read); */
case queue_for_net_read:
case QUEUE_FOR_NET_READ:
/* READ or READA, and
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
/* from drbd_make_request_common
/* from __drbd_make_request
* or from bio_endio during read io-error recovery */
/* so we can verify the handle in the answer packet
* corresponding hlist_del is in _req_may_be_done() */
hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
/* So we can verify the handle in the answer packet.
* Corresponding drbd_remove_request_interval is in
* drbd_req_complete() */
D_ASSERT(drbd_interval_empty(&req->i));
drbd_insert_interval(&mdev->read_requests, &req->i);
drbd_set_flag(mdev, UNPLUG_REMOTE);
set_bit(UNPLUG_REMOTE, &mdev->flags);
D_ASSERT(req->rq_state & RQ_NET_PENDING);
req->rq_state |= RQ_NET_QUEUED;
req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
? w_read_retry_remote
: w_send_read_req;
drbd_queue_work(&mdev->data.work, &req->w);
D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0);
mod_rq_state(req, m, 0, RQ_NET_QUEUED);
req->w.cb = w_send_read_req;
drbd_queue_work(&mdev->tconn->sender_work, &req->w);
break;
case queue_for_net_write:
case QUEUE_FOR_NET_WRITE:
/* assert something? */
/* from drbd_make_request_common only */
/* from __drbd_make_request only */
hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
/* corresponding hlist_del is in _req_may_be_done() */
/* Corresponding drbd_remove_request_interval is in
* drbd_req_complete() */
D_ASSERT(drbd_interval_empty(&req->i));
drbd_insert_interval(&mdev->write_requests, &req->i);
/* NOTE
* In case the req ended up on the transfer log before being
......@@ -549,7 +563,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
*
* _req_add_to_epoch(req); this has to be after the
* _maybe_start_new_epoch(req); which happened in
* drbd_make_request_common, because we now may set the bit
* __drbd_make_request, because we now may set the bit
* again ourselves to close the current epoch.
*
* Add req to the (now) current epoch (barrier). */
......@@ -557,204 +571,189 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* otherwise we may lose an unplug, which may cause some remote
* io-scheduler timeout to expire, increasing maximum latency,
* hurting performance. */
drbd_set_flag(mdev, UNPLUG_REMOTE);
/* see drbd_make_request_common,
* just after it grabs the req_lock */
D_ASSERT(drbd_test_flag(mdev, CREATE_BARRIER) == 0);
req->epoch = mdev->newest_tle->br_number;
/* increment size of current epoch */
mdev->newest_tle->n_writes++;
set_bit(UNPLUG_REMOTE, &mdev->flags);
/* queue work item to send data */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
req->rq_state |= RQ_NET_QUEUED;
mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
req->w.cb = w_send_dblock;
drbd_queue_work(&mdev->data.work, &req->w);
drbd_queue_work(&mdev->tconn->sender_work, &req->w);
/* close the epoch, in case it outgrew the limit */
if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
queue_barrier(mdev);
rcu_read_lock();
nc = rcu_dereference(mdev->tconn->net_conf);
p = nc->max_epoch_size;
rcu_read_unlock();
if (mdev->tconn->current_tle_writes >= p)
start_new_tl_epoch(mdev->tconn);
break;
case queue_for_send_oos:
req->rq_state |= RQ_NET_QUEUED;
req->w.cb = w_send_oos;
drbd_queue_work(&mdev->data.work, &req->w);
case QUEUE_FOR_SEND_OOS:
mod_rq_state(req, m, 0, RQ_NET_QUEUED);
req->w.cb = w_send_out_of_sync;
drbd_queue_work(&mdev->tconn->sender_work, &req->w);
break;
case read_retry_remote_canceled:
case send_canceled:
case send_failed:
case READ_RETRY_REMOTE_CANCELED:
case SEND_CANCELED:
case SEND_FAILED:
/* real cleanup will be done from tl_clear. just update flags
* so it is no longer marked as on the worker queue */
req->rq_state &= ~RQ_NET_QUEUED;
/* if we did it right, tl_clear should be scheduled only after
* this, so this should not be necessary! */
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_NET_QUEUED, 0);
break;
case handed_over_to_network:
case HANDED_OVER_TO_NETWORK:
/* assert something? */
if (bio_data_dir(req->master_bio) == WRITE)
atomic_add(req->size>>9, &mdev->ap_in_flight);
if (bio_data_dir(req->master_bio) == WRITE &&
mdev->net_conf->wire_protocol == DRBD_PROT_A) {
!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
/* this is what is dangerous about protocol A:
* pretend it was successfully written on the peer. */
if (req->rq_state & RQ_NET_PENDING) {
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= RQ_NET_OK;
} /* else: neg-ack was faster... */
if (req->rq_state & RQ_NET_PENDING)
mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
/* else: neg-ack was faster... */
/* it is still not yet RQ_NET_DONE until the
* corresponding epoch barrier got acked as well,
* so we know what to dirty on connection loss */
}
req->rq_state &= ~RQ_NET_QUEUED;
req->rq_state |= RQ_NET_SENT;
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
break;
case oos_handed_to_network:
case OOS_HANDED_TO_NETWORK:
/* Was not set PENDING, no longer QUEUED, so is now DONE
* as far as this connection is concerned. */
req->rq_state &= ~RQ_NET_QUEUED;
req->rq_state |= RQ_NET_DONE;
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
break;
case connection_lost_while_pending:
case CONNECTION_LOST_WHILE_PENDING:
/* transfer log cleanup after connection loss */
/* assert something? */
if (req->rq_state & RQ_NET_PENDING)
dec_ap_pending(mdev);
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
req->rq_state |= RQ_NET_DONE;
if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
atomic_sub(req->size>>9, &mdev->ap_in_flight);
/* if it is still queued, we may not complete it here.
* it will be canceled soon. */
if (!(req->rq_state & RQ_NET_QUEUED))
_req_may_be_done(req, m); /* Allowed while state.susp */
mod_rq_state(req, m,
RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
RQ_NET_DONE);
break;
case conflict_discarded_by_peer:
/* for discarded conflicting writes of multiple primaries,
case CONFLICT_RESOLVED:
/* for superseded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl, potential
* node crashes are covered by the activity log. */
if (what == conflict_discarded_by_peer)
dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
" DRBD is not a random data generator!\n",
(unsigned long long)req->sector, req->size);
req->rq_state |= RQ_NET_DONE;
/* fall through */
case write_acked_by_peer_and_sis:
case write_acked_by_peer:
if (what == write_acked_by_peer_and_sis)
req->rq_state |= RQ_NET_SIS;
* node crashes are covered by the activity log.
*
* If this request had been marked as RQ_POSTPONED before,
* it will actually not be completed, but "restarted",
* resubmitted from the retry worker context. */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
break;
case WRITE_ACKED_BY_PEER_AND_SIS:
req->rq_state |= RQ_NET_SIS;
case WRITE_ACKED_BY_PEER:
D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
/* protocol C; successfully written on peer.
* Nothing more to do here.
* We want to keep the tl in place for all protocols, to cater
* for volatile write-back caches on lower level devices. */
case recv_acked_by_peer:
goto ack_common;
case RECV_ACKED_BY_PEER:
D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK);
/* protocol B; pretends to be successfully written on peer.
* see also notes above in handed_over_to_network about
* see also notes above in HANDED_OVER_TO_NETWORK about
* protocol != C */
req->rq_state |= RQ_NET_OK;
ack_common:
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
atomic_sub(req->size>>9, &mdev->ap_in_flight);
req->rq_state &= ~RQ_NET_PENDING;
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
break;
case neg_acked:
/* assert something? */
if (req->rq_state & RQ_NET_PENDING) {
dec_ap_pending(mdev);
atomic_sub(req->size>>9, &mdev->ap_in_flight);
}
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
case POSTPONE_WRITE:
D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
/* If this node has already detected the write conflict, the
* worker will be waiting on misc_wait. Wake it up once this
* request has completed locally.
*/
D_ASSERT(req->rq_state & RQ_NET_PENDING);
req->rq_state |= RQ_POSTPONED;
if (req->i.waiting)
wake_up(&mdev->misc_wait);
/* Do not clear RQ_NET_PENDING. This request will make further
* progress via restart_conflicting_writes() or
* fail_postponed_requests(). Hopefully. */
break;
req->rq_state |= RQ_NET_DONE;
_req_may_be_done_not_susp(req, m);
/* else: done by handed_over_to_network */
case NEG_ACKED:
mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
break;
case fail_frozen_disk_io:
case FAIL_FROZEN_DISK_IO:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
_req_may_be_done(req, m); /* Allowed while state.susp */
mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
break;
case restart_frozen_disk_io:
case RESTART_FROZEN_DISK_IO:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
req->rq_state &= ~RQ_LOCAL_COMPLETED;
mod_rq_state(req, m,
RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
RQ_LOCAL_PENDING);
rv = MR_READ;
if (bio_data_dir(req->master_bio) == WRITE)
rv = MR_WRITE;
get_ldev(mdev);
get_ldev(mdev); /* always succeeds in this call path */
req->w.cb = w_restart_disk_io;
drbd_queue_work(&mdev->data.work, &req->w);
drbd_queue_work(&mdev->tconn->sender_work, &req->w);
break;
case resend:
case RESEND:
/* Simply complete (local only) READs. */
if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
_req_may_be_done(req, m);
mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
break;
}
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
before the connection loss (B&C only); only P_BARRIER_ACK was missing.
Trowing them out of the TL here by pretending we got a BARRIER_ACK
We ensure that the peer was not rebooted */
before the connection loss (B&C only); only P_BARRIER_ACK
(or the local completion?) was missing when we suspended.
Throwing them out of the TL here by pretending we got a BARRIER_ACK.
During connection handshake, we ensure that the peer was not rebooted. */
if (!(req->rq_state & RQ_NET_OK)) {
/* FIXME could this possibly be a req->w.cb == w_send_out_of_sync?
* in that case we must not set RQ_NET_PENDING. */
mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
if (req->w.cb) {
drbd_queue_work(&mdev->data.work, &req->w);
drbd_queue_work(&mdev->tconn->sender_work, &req->w);
rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
}
} /* else: FIXME can this happen? */
break;
}
/* else, fall through to barrier_acked */
/* else, fall through to BARRIER_ACKED */
case barrier_acked:
case BARRIER_ACKED:
/* barrier ack for READ requests does not make sense */
if (!(req->rq_state & RQ_WRITE))
break;
if (req->rq_state & RQ_NET_PENDING) {
/* barrier came in before all requests have been acked.
/* barrier came in before all requests were acked.
* this is bad, because if the connection is lost now,
* we won't be able to clean them up... */
dev_err(DEV, "FIXME (barrier_acked but pending)\n");
list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n");
}
if ((req->rq_state & RQ_NET_MASK) != 0) {
req->rq_state |= RQ_NET_DONE;
if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
atomic_sub(req->size>>9, &mdev->ap_in_flight);
}
_req_may_be_done(req, m); /* Allowed while state.susp */
/* Allowed to complete requests, even while suspended.
* As this is called for all requests within a matching epoch,
* we need to filter, and only set RQ_NET_DONE for those that
* have actually been on the wire. */
mod_rq_state(req, m, RQ_COMPLETION_SUSP,
(req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
break;
case data_received:
case DATA_RECEIVED:
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
_req_may_be_done_not_susp(req, m);
mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
break;
};
......@@ -768,75 +767,265 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
* since size may be bigger than BM_BLOCK_SIZE,
* we may need to check several bits.
*/
static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
{
unsigned long sbnr, ebnr;
sector_t esector, nr_sectors;
if (mdev->state.disk == D_UP_TO_DATE)
return 1;
if (mdev->state.disk >= D_OUTDATED)
return 0;
if (mdev->state.disk < D_INCONSISTENT)
return 0;
/* state.disk == D_INCONSISTENT We will have a look at the BitMap */
nr_sectors = drbd_get_capacity(mdev->this_bdev);
return true;
if (mdev->state.disk != D_INCONSISTENT)
return false;
esector = sector + (size >> 9) - 1;
nr_sectors = drbd_get_capacity(mdev->this_bdev);
D_ASSERT(sector < nr_sectors);
D_ASSERT(esector < nr_sectors);
sbnr = BM_SECT_TO_BIT(sector);
ebnr = BM_SECT_TO_BIT(esector);
return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
}
static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
enum drbd_read_balancing rbm)
{
struct backing_dev_info *bdi;
int stripe_shift;
switch (rbm) {
case RB_CONGESTED_REMOTE:
bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
return bdi_read_congested(bdi);
case RB_LEAST_PENDING:
return atomic_read(&mdev->local_cnt) >
atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt);
case RB_32K_STRIPING: /* stripe_shift = 15 */
case RB_64K_STRIPING:
case RB_128K_STRIPING:
case RB_256K_STRIPING:
case RB_512K_STRIPING:
case RB_1M_STRIPING: /* stripe_shift = 20 */
stripe_shift = (rbm - RB_32K_STRIPING + 15);
return (sector >> (stripe_shift - 9)) & 1;
case RB_ROUND_ROBIN:
return test_and_change_bit(READ_BALANCE_RR, &mdev->flags);
case RB_PREFER_REMOTE:
return true;
case RB_PREFER_LOCAL:
default:
return false;
}
}
/*
* complete_conflicting_writes - wait for any conflicting write requests
*
* The write_requests tree contains all active write requests which we
* currently know about. Wait for any requests to complete which conflict with
* the new one.
*
* Only way out: remove the conflicting intervals from the tree.
*/
static void complete_conflicting_writes(struct drbd_request *req)
{
DEFINE_WAIT(wait);
struct drbd_conf *mdev = req->w.mdev;
struct drbd_interval *i;
sector_t sector = req->i.sector;
int size = req->i.size;
i = drbd_find_overlap(&mdev->write_requests, sector, size);
if (!i)
return;
for (;;) {
prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
i = drbd_find_overlap(&mdev->write_requests, sector, size);
if (!i)
break;
/* Indicate to wake up device->misc_wait on progress. */
i->waiting = true;
spin_unlock_irq(&mdev->tconn->req_lock);
schedule();
spin_lock_irq(&mdev->tconn->req_lock);
}
finish_wait(&mdev->misc_wait, &wait);
}
/* called within req_lock and rcu_read_lock() */
static void maybe_pull_ahead(struct drbd_conf *mdev)
{
int congested = 0;
struct drbd_tconn *tconn = mdev->tconn;
struct net_conf *nc;
bool congested = false;
enum drbd_on_congestion on_congestion;
nc = rcu_dereference(tconn->net_conf);
on_congestion = nc ? nc->on_congestion : OC_BLOCK;
if (on_congestion == OC_BLOCK ||
tconn->agreed_pro_version < 96)
return;
/* If I don't even have good local storage, we can not reasonably try
* to pull ahead of the peer. We also need the local reference to make
* sure mdev->act_log is there.
* Note: caller has to make sure that net_conf is there.
*/
if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
return;
if (mdev->net_conf->cong_fill &&
atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
if (nc->cong_fill &&
atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
dev_info(DEV, "Congestion-fill threshold reached\n");
congested = 1;
congested = true;
}
if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
if (mdev->act_log->used >= nc->cong_extents) {
dev_info(DEV, "Congestion-extents threshold reached\n");
congested = 1;
congested = true;
}
if (congested) {
queue_barrier(mdev); /* last barrier, after mirrored writes */
/* start a new epoch for non-mirrored writes */
start_new_tl_epoch(mdev->tconn);
if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
if (on_congestion == OC_PULL_AHEAD)
_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
else /*mdev->net_conf->on_congestion == OC_DISCONNECT */
else /*nc->on_congestion == OC_DISCONNECT */
_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
}
put_ldev(mdev);
}
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
/* If this returns false, and req->private_bio is still set,
* this should be submitted locally.
*
* If it returns false, but req->private_bio is not set,
* we do not have access to good data :(
*
* Otherwise, this destroys req->private_bio, if any,
* and returns true.
*/
static bool do_remote_read(struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
enum drbd_read_balancing rbm;
if (req->private_bio) {
if (!drbd_may_do_local_read(mdev,
req->i.sector, req->i.size)) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
}
if (mdev->state.pdsk != D_UP_TO_DATE)
return false;
if (req->private_bio == NULL)
return true;
/* TODO: improve read balancing decisions, take into account drbd
* protocol, pending requests etc. */
rcu_read_lock();
rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
rcu_read_unlock();
if (rbm == RB_PREFER_LOCAL && req->private_bio)
return false; /* submit locally */
if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) {
if (req->private_bio) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
return true;
}
return false;
}
/* returns number of connections (== 1, for drbd 8.4)
* expected to actually write this data,
* which does NOT include those that we are L_AHEAD for. */
static int drbd_process_write_request(struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
int remote, send_oos;
rcu_read_lock();
remote = drbd_should_do_remote(mdev->state);
if (remote) {
maybe_pull_ahead(mdev);
remote = drbd_should_do_remote(mdev->state);
}
send_oos = drbd_should_send_out_of_sync(mdev->state);
rcu_read_unlock();
/* Need to replicate writes. Unless it is an empty flush,
* which is better mapped to a DRBD P_BARRIER packet,
* also for drbd wire protocol compatibility reasons.
* If this was a flush, just start a new epoch.
* Unless the current epoch was empty anyways, or we are not currently
* replicating, in which case there is no point. */
if (unlikely(req->i.size == 0)) {
/* The only size==0 bios we expect are empty flushes. */
D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
if (remote)
start_new_tl_epoch(mdev->tconn);
return 0;
}
if (!remote && !send_oos)
return 0;
D_ASSERT(!(remote && send_oos));
if (remote) {
_req_mod(req, TO_BE_SENT);
_req_mod(req, QUEUE_FOR_NET_WRITE);
} else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size))
_req_mod(req, QUEUE_FOR_SEND_OOS);
return remote;
}
static void
drbd_submit_req_private_bio(struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
struct bio *bio = req->private_bio;
const int rw = bio_rw(bio);
bio->bi_bdev = mdev->ldev->backing_bdev;
/* State may have changed since we grabbed our reference on the
* ->ldev member. Double check, and short-circuit to endio.
* In case the last activity log transaction failed to get on
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(mdev)) {
if (drbd_insert_fault(mdev,
rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA))
bio_endio(bio, -EIO);
else
generic_make_request(bio);
put_ldev(mdev);
} else
bio_endio(bio, -EIO);
}
void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
const int rw = bio_rw(bio);
const int size = bio->bi_size;
const sector_t sector = bio->bi_sector;
struct drbd_tl_epoch *b = NULL;
struct bio_and_error m = { NULL, };
struct drbd_request *req;
int local, remote, send_oos = 0;
int err = -EIO;
int ret = 0;
union drbd_state s;
bool no_remote = false;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
......@@ -846,55 +1035,14 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
* if user cannot handle io errors, that's not our business. */
dev_err(DEV, "could not kmalloc() req\n");
bio_endio(bio, -ENOMEM);
return 0;
return;
}
req->start_time = start_time;
local = get_ldev(mdev);
if (!local) {
bio_put(req->private_bio); /* or we get a bio leak */
if (!get_ldev(mdev)) {
bio_put(req->private_bio);
req->private_bio = NULL;
}
if (rw == WRITE) {
/* Need to replicate writes. Unless it is an empty flush,
* which is better mapped to a DRBD P_BARRIER packet,
* also for drbd wire protocol compatibility reasons. */
if (unlikely(size == 0)) {
/* The only size==0 bios we expect are empty flushes. */
D_ASSERT(bio->bi_rw & REQ_FLUSH);
remote = 0;
} else
remote = 1;
} else {
/* READ || READA */
if (local) {
if (!drbd_may_do_local_read(mdev, sector, size)) {
/* we could kick the syncer to
* sync this extent asap, wait for
* it, then continue locally.
* Or just issue the request remotely.
*/
local = 0;
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
}
remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
}
/* If we have a disk, but a READA request is mapped to remote,
* we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
* Just fail that READA request right here.
*
* THINK: maybe fail all READA when not local?
* or make this configurable...
* if network is slow, READA won't do any good.
*/
if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
err = -EWOULDBLOCK;
goto fail_and_free_req;
}
/* For WRITES going to the local disk, grab a reference on the target
* extent. This waits for any resync activity in the corresponding
......@@ -903,349 +1051,131 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
* of transactional on-disk meta data updates.
* Empty flushes don't need to go into the activity log, they can only
* flush data for pending writes which are already in there. */
if (rw == WRITE && local && size
&& !drbd_test_flag(mdev, AL_SUSPENDED)) {
if (rw == WRITE && req->private_bio && req->i.size
&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
req->rq_state |= RQ_IN_ACT_LOG;
drbd_al_begin_io(mdev, sector);
drbd_al_begin_io(mdev, &req->i);
}
s = mdev->state;
remote = remote && drbd_should_do_remote(s);
send_oos = rw == WRITE && drbd_should_send_oos(s);
D_ASSERT(!(remote && send_oos));
if (!(local || remote) && !is_susp(mdev->state)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
(unsigned long long)req->sector, req->size >> 9);
goto fail_free_complete;
}
/* For WRITE request, we have to make sure that we have an
* unused_spare_tle, in case we need to start a new epoch.
* I try to be smart and avoid to pre-allocate always "just in case",
* but there is a race between testing the bit and pointer outside the
* spinlock, and grabbing the spinlock.
* if we lost that race, we retry. */
if (rw == WRITE && (remote || send_oos) &&
mdev->unused_spare_tle == NULL &&
drbd_test_flag(mdev, CREATE_BARRIER)) {
allocate_barrier:
b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
if (!b) {
dev_err(DEV, "Failed to alloc barrier.\n");
err = -ENOMEM;
goto fail_free_complete;
}
spin_lock_irq(&mdev->tconn->req_lock);
if (rw == WRITE) {
/* This may temporarily give up the req_lock,
* but will re-aquire it before it returns here.
* Needs to be before the check on drbd_suspended() */
complete_conflicting_writes(req);
}
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->req_lock);
if (is_susp(mdev->state)) {
/* If we got suspended, use the retry mechanism of
drbd_make_request() to restart processing of this
bio. In the next call to drbd_make_request
we sleep in inc_ap_bio() */
ret = 1;
spin_unlock_irq(&mdev->req_lock);
goto fail_free_complete;
}
/* no more giving up req_lock from now on! */
if (remote || send_oos) {
remote = drbd_should_do_remote(mdev->state);
send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
D_ASSERT(!(remote && send_oos));
if (!(remote || send_oos))
dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
if (!(local || remote)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
spin_unlock_irq(&mdev->req_lock);
goto fail_free_complete;
if (drbd_suspended(mdev)) {
/* push back and retry: */
req->rq_state |= RQ_POSTPONED;
if (req->private_bio) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
goto out;
}
if (b && mdev->unused_spare_tle == NULL) {
mdev->unused_spare_tle = b;
b = NULL;
}
if (rw == WRITE && (remote || send_oos) &&
mdev->unused_spare_tle == NULL &&
drbd_test_flag(mdev, CREATE_BARRIER)) {
/* someone closed the current epoch
* while we were grabbing the spinlock */
spin_unlock_irq(&mdev->req_lock);
goto allocate_barrier;
}
/* Update disk stats */
_drbd_start_io_acct(mdev, req, bio);
/* _maybe_start_new_epoch(mdev);
* If we need to generate a write barrier packet, we have to add the
* new epoch (barrier) object, and queue the barrier packet for sending,
* and queue the req's data after it _within the same lock_, otherwise
* we have race conditions were the reorder domains could be mixed up.
*
* Even read requests may start a new epoch and queue the corresponding
* barrier packet. To get the write ordering right, we only have to
* make sure that, if this is a write request and it triggered a
* barrier packet, this request is queued within the same spinlock. */
if ((remote || send_oos) && mdev->unused_spare_tle &&
drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
_tl_add_barrier(mdev, mdev->unused_spare_tle);
mdev->unused_spare_tle = NULL;
} else {
D_ASSERT(!(remote && rw == WRITE &&
drbd_test_flag(mdev, CREATE_BARRIER)));
/* We fail READ/READA early, if we can not serve it.
* We must do this before req is registered on any lists.
* Otherwise, drbd_req_complete() will queue failed READ for retry. */
if (rw != WRITE) {
if (!do_remote_read(req) && !req->private_bio)
goto nodata;
}
/* NOTE
* Actually, 'local' may be wrong here already, since we may have failed
* to write to the meta data, and may become wrong anytime because of
* local io-error for some other request, which would lead to us
* "detaching" the local disk.
*
* 'remote' may become wrong any time because the network could fail.
*
* This is a harmless race condition, though, since it is handled
* correctly at the appropriate places; so it just defers the failure
* of the respective operation.
*/
/* mark them early for readability.
* this just sets some state flags. */
if (remote)
_req_mod(req, to_be_send);
if (local)
_req_mod(req, to_be_submitted);
/* check this request on the collision detection hash tables.
* if we have a conflict, just complete it here.
* THINK do we want to check reads, too? (I don't think so...) */
if (rw == WRITE && _req_conflicts(req))
goto fail_conflicting;
/* which transfer log epoch does this belong to? */
req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
/* no point in adding empty flushes to the transfer log,
* they are mapped to drbd barriers already. */
if (likely(size!=0))
list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
if (likely(req->i.size!=0)) {
if (rw == WRITE)
mdev->tconn->current_tle_writes++;
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
if (remote) {
/* either WRITE and C_CONNECTED,
* or READ, and no local disk,
* or READ, but not in sync.
*/
_req_mod(req, (rw == WRITE)
? queue_for_net_write
: queue_for_net_read);
list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
}
if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
_req_mod(req, queue_for_send_oos);
if (remote &&
mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
maybe_pull_ahead(mdev);
/* If this was a flush, queue a drbd barrier/start a new epoch.
* Unless the current epoch was empty anyways, or we are not currently
* replicating, in which case there is no point. */
if (unlikely(bio->bi_rw & REQ_FLUSH)
&& mdev->newest_tle->n_writes
&& drbd_should_do_remote(mdev->state))
queue_barrier(mdev);
spin_unlock_irq(&mdev->req_lock);
kfree(b); /* if someone else has beaten us to it... */
if (local) {
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
/* State may have changed since we grabbed our reference on the
* mdev->ldev member. Double check, and short-circuit to endio.
* In case the last activity log transaction failed to get on
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(mdev)) {
if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA))
bio_endio(req->private_bio, -EIO);
else
generic_make_request(req->private_bio);
put_ldev(mdev);
if (rw == WRITE) {
if (!drbd_process_write_request(req))
no_remote = true;
} else {
/* We either have a private_bio, or we can read from remote.
* Otherwise we had done the goto nodata above. */
if (req->private_bio == NULL) {
_req_mod(req, TO_BE_SENT);
_req_mod(req, QUEUE_FOR_NET_READ);
} else
bio_endio(req->private_bio, -EIO);
no_remote = true;
}
return 0;
fail_conflicting:
/* this is a conflicting request.
* even though it may have been only _partially_
* overlapping with one of the currently pending requests,
* without even submitting or sending it, we will
* pretend that it was successfully served right now.
*/
_drbd_end_io_acct(mdev, req);
spin_unlock_irq(&mdev->req_lock);
if (remote)
dec_ap_pending(mdev);
/* THINK: do we want to fail it (-EIO), or pretend success?
* this pretends success. */
err = 0;
fail_free_complete:
if (req->rq_state & RQ_IN_ACT_LOG)
drbd_al_complete_io(mdev, sector);
fail_and_free_req:
if (local) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
if (req->private_bio) {
/* needs to be marked within the same spinlock */
_req_mod(req, TO_BE_SUBMITTED);
/* but we need to give up the spinlock to submit */
spin_unlock_irq(&mdev->tconn->req_lock);
drbd_submit_req_private_bio(req);
spin_lock_irq(&mdev->tconn->req_lock);
} else if (no_remote) {
nodata:
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
(unsigned long long)req->i.sector, req->i.size >> 9);
/* A write may have been queued for send_oos, however.
* So we can not simply free it, we must go through drbd_req_put_completion_ref() */
}
if (!ret)
bio_endio(bio, err);
drbd_req_free(req);
dec_ap_bio(mdev);
kfree(b);
return ret;
}
/* helper function for drbd_make_request
* if we can determine just by the mdev (state) that this request will fail,
* return 1
* otherwise return 0
*/
static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
{
if (mdev->state.role != R_PRIMARY &&
(!allow_oos || is_write)) {
if (__ratelimit(&drbd_ratelimit_state)) {
dev_err(DEV, "Process %s[%u] tried to %s; "
"since we are not in Primary state, "
"we cannot allow this\n",
current->comm, current->pid,
is_write ? "WRITE" : "READ");
}
return 1;
}
out:
if (drbd_req_put_completion_ref(req, &m, 1))
kref_put(&req->kref, drbd_req_destroy);
spin_unlock_irq(&mdev->tconn->req_lock);
return 0;
if (m.bio)
complete_master_bio(mdev, &m);
return;
}
void drbd_make_request(struct request_queue *q, struct bio *bio)
{
unsigned int s_enr, e_enr;
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
unsigned long start_time;
if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
bio_endio(bio, -EPERM);
return;
}
start_time = jiffies;
/*
* what we "blindly" assume:
*/
D_ASSERT((bio->bi_size & 0x1ff) == 0);
/* to make some things easier, force alignment of requests within the
* granularity of our hash tables */
s_enr = bio->bi_sector >> HT_SHIFT;
e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
if (likely(s_enr == e_enr)) {
do {
inc_ap_bio(mdev, 1);
} while (drbd_make_request_common(mdev, bio, start_time));
return;
}
/* can this bio be split generically?
* Maybe add our own split-arbitrary-bios function. */
if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
/* rather error out here than BUG in bio_split */
dev_err(DEV, "bio would need to, but cannot, be split: "
"(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
bio->bi_vcnt, bio->bi_idx, bio->bi_size,
(unsigned long long)bio->bi_sector);
bio_endio(bio, -EINVAL);
} else {
/* This bio crosses some boundary, so we have to split it. */
struct bio_pair *bp;
/* works for the "do not cross hash slot boundaries" case
* e.g. sector 262269, size 4096
* s_enr = 262269 >> 6 = 4097
* e_enr = (262269+8-1) >> 6 = 4098
* HT_SHIFT = 6
* sps = 64, mask = 63
* first_sectors = 64 - (262269 & 63) = 3
*/
const sector_t sect = bio->bi_sector;
const int sps = 1 << HT_SHIFT; /* sectors per slot */
const int mask = sps - 1;
const sector_t first_sectors = sps - (sect & mask);
bp = bio_split(bio, first_sectors);
D_ASSERT(IS_ALIGNED(bio->bi_size, 512));
/* we need to get a "reference count" (ap_bio_cnt)
* to avoid races with the disconnect/reconnect/suspend code.
* In case we need to split the bio here, we need to get three references
* atomically, otherwise we might deadlock when trying to submit the
* second one! */
inc_ap_bio(mdev, 3);
D_ASSERT(e_enr == s_enr + 1);
while (drbd_make_request_common(mdev, &bp->bio1, start_time))
inc_ap_bio(mdev, 1);
while (drbd_make_request_common(mdev, &bp->bio2, start_time))
inc_ap_bio(mdev, 1);
dec_ap_bio(mdev);
bio_pair_release(bp);
}
inc_ap_bio(mdev);
__drbd_make_request(mdev, bio, start_time);
}
/* This is called by bio_add_page(). With this function we reduce
* the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
* units (was AL_EXTENTs).
/* This is called by bio_add_page().
*
* q->max_hw_sectors and other global limits are already enforced there.
*
* we do the calculation within the lower 32bit of the byte offsets,
* since we don't care for actual offset, but only check whether it
* would cross "activity log extent" boundaries.
* We need to call down to our lower level device,
* in case it has special restrictions.
*
* We also may need to enforce configured max-bio-bvecs limits.
*
* As long as the BIO is empty we have to allow at least one bvec,
* regardless of size and offset. so the resulting bio may still
* cross extent boundaries. those are dealt with (bio_split) in
* drbd_make_request.
* regardless of size and offset, so no need to ask lower levels.
*/
int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
{
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
unsigned int bio_offset =
(unsigned int)bvm->bi_sector << 9; /* 32 bit */
unsigned int bio_size = bvm->bi_size;
int limit, backing_limit;
limit = DRBD_MAX_BIO_SIZE
- ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
if (limit < 0)
limit = 0;
if (bio_size == 0) {
if (limit <= bvec->bv_len)
limit = bvec->bv_len;
} else if (limit && get_ldev(mdev)) {
int limit = DRBD_MAX_BIO_SIZE;
int backing_limit;
if (bio_size && get_ldev(mdev)) {
struct request_queue * const b =
mdev->ldev->backing_bdev->bd_disk->queue;
if (b->merge_bvec_fn) {
......@@ -1257,24 +1187,38 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct
return limit;
}
struct drbd_request *find_oldest_request(struct drbd_tconn *tconn)
{
/* Walk the transfer log,
* and find the oldest not yet completed request */
struct drbd_request *r;
list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
if (atomic_read(&r->completion_ref))
return r;
}
return NULL;
}
void request_timer_fn(unsigned long data)
{
struct drbd_conf *mdev = (struct drbd_conf *) data;
struct drbd_tconn *tconn = mdev->tconn;
struct drbd_request *req; /* oldest request */
struct list_head *le;
struct net_conf *nc;
unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
unsigned long now;
if (get_net_conf(mdev)) {
if (mdev->state.conn >= C_WF_REPORT_PARAMS)
ent = mdev->net_conf->timeout*HZ/10
* mdev->net_conf->ko_count;
put_net_conf(mdev);
}
rcu_read_lock();
nc = rcu_dereference(tconn->net_conf);
if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS)
ent = nc->timeout * HZ/10 * nc->ko_count;
if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
dt = mdev->ldev->dc.disk_timeout * HZ / 10;
dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10;
put_ldev(mdev);
}
rcu_read_unlock();
et = min_not_zero(dt, ent);
if (!et)
......@@ -1282,17 +1226,14 @@ void request_timer_fn(unsigned long data)
now = jiffies;
spin_lock_irq(&mdev->req_lock);
le = &mdev->oldest_tle->requests;
if (list_empty(le)) {
spin_unlock_irq(&mdev->req_lock);
spin_lock_irq(&tconn->req_lock);
req = find_oldest_request(tconn);
if (!req) {
spin_unlock_irq(&tconn->req_lock);
mod_timer(&mdev->request_timer, now + et);
return;
}
le = le->prev;
req = list_entry(le, struct drbd_request, tl_requests);
/* The request is considered timed out, if
* - we have some effective timeout from the configuration,
* with above state restrictions applied,
......@@ -1311,17 +1252,17 @@ void request_timer_fn(unsigned long data)
*/
if (ent && req->rq_state & RQ_NET_PENDING &&
time_after(now, req->start_time + ent) &&
!time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
!time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) {
dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
}
if (dt && req->rq_state & RQ_LOCAL_PENDING &&
if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev &&
time_after(now, req->start_time + dt) &&
!time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
__drbd_chk_io_error(mdev, DRBD_FORCE_DETACH);
}
nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
spin_unlock_irq(&mdev->req_lock);
spin_unlock_irq(&tconn->req_lock);
mod_timer(&mdev->request_timer, nt);
}
......@@ -77,40 +77,41 @@
*/
enum drbd_req_event {
created,
to_be_send,
to_be_submitted,
CREATED,
TO_BE_SENT,
TO_BE_SUBMITTED,
/* XXX yes, now I am inconsistent...
* these are not "events" but "actions"
* oh, well... */
queue_for_net_write,
queue_for_net_read,
queue_for_send_oos,
send_canceled,
send_failed,
handed_over_to_network,
oos_handed_to_network,
connection_lost_while_pending,
read_retry_remote_canceled,
recv_acked_by_peer,
write_acked_by_peer,
write_acked_by_peer_and_sis, /* and set_in_sync */
conflict_discarded_by_peer,
neg_acked,
barrier_acked, /* in protocol A and B */
data_received, /* (remote read) */
read_completed_with_error,
read_ahead_completed_with_error,
write_completed_with_error,
abort_disk_io,
completed_ok,
resend,
fail_frozen_disk_io,
restart_frozen_disk_io,
nothing, /* for tracing only */
QUEUE_FOR_NET_WRITE,
QUEUE_FOR_NET_READ,
QUEUE_FOR_SEND_OOS,
SEND_CANCELED,
SEND_FAILED,
HANDED_OVER_TO_NETWORK,
OOS_HANDED_TO_NETWORK,
CONNECTION_LOST_WHILE_PENDING,
READ_RETRY_REMOTE_CANCELED,
RECV_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
CONFLICT_RESOLVED,
POSTPONE_WRITE,
NEG_ACKED,
BARRIER_ACKED, /* in protocol A and B */
DATA_RECEIVED, /* (remote read) */
READ_COMPLETED_WITH_ERROR,
READ_AHEAD_COMPLETED_WITH_ERROR,
WRITE_COMPLETED_WITH_ERROR,
ABORT_DISK_IO,
COMPLETED_OK,
RESEND,
FAIL_FROZEN_DISK_IO,
RESTART_FROZEN_DISK_IO,
NOTHING,
};
/* encoding of request states for now. we don't actually need that many bits.
......@@ -142,8 +143,8 @@ enum drbd_req_state_bits {
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
* 11100: write_acked (C),
* data_received (for remote read, any protocol)
* 11100: write acked (C),
* data received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
......@@ -198,6 +199,22 @@ enum drbd_req_state_bits {
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
/* The peer has sent a retry ACK */
__RQ_POSTPONED,
/* would have been completed,
* but was not, because of drbd_suspended() */
__RQ_COMPLETION_SUSP,
/* We expect a receive ACK (wire proto B) */
__RQ_EXP_RECEIVE_ACK,
/* We expect a write ACK (wite proto C) */
__RQ_EXP_WRITE_ACK,
/* waiting for a barrier ack, did an extra kref_get */
__RQ_EXP_BARR_ACK,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
......@@ -219,56 +236,16 @@ enum drbd_req_state_bits {
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
/* For waking up the frozen transfer log mod_req() has to return if the request
should be counted in the epoch object*/
#define MR_WRITE_SHIFT 0
#define MR_WRITE (1 << MR_WRITE_SHIFT)
#define MR_READ_SHIFT 1
#define MR_READ (1 << MR_READ_SHIFT)
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->ee_hash_s == 0);
return mdev->ee_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
}
/* transfer log (drbd_request objects) */
static inline
struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->tl_hash_s == 0);
return mdev->tl_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
}
/* application reads (drbd_request objects) */
static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
return mdev->app_reads_hash
+ ((unsigned int)(sector) % APP_R_HSIZE);
}
/* when we receive the answer for a read request,
* verify that we actually know about it */
static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
u64 id, sector_t sector)
{
struct hlist_head *slot = ar_hash_slot(mdev, sector);
struct hlist_node *n;
struct drbd_request *req;
hlist_for_each_entry(req, n, slot, collision) {
if ((unsigned long)req == (unsigned long)id) {
D_ASSERT(req->sector == sector);
return req;
}
}
return NULL;
}
#define MR_WRITE 1
#define MR_READ 2
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
......@@ -278,41 +255,10 @@ static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bi
req->private_bio = bio;
bio->bi_private = req;
bio->bi_end_io = drbd_endio_pri;
bio->bi_end_io = drbd_request_endio;
bio->bi_next = NULL;
}
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
drbd_req_make_private_bio(req, bio_src);
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->mdev = mdev;
req->master_bio = bio_src;
req->epoch = 0;
req->sector = bio_src->bi_sector;
req->size = bio_src->bi_size;
INIT_HLIST_NODE(&req->collision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
}
return req;
}
static inline void drbd_req_free(struct drbd_request *req)
{
mempool_free(req, drbd_request_mempool);
}
static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
}
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_size, or similar. But that would be too ugly. */
......@@ -321,6 +267,7 @@ struct bio_and_error {
int error;
};
extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
......@@ -328,13 +275,17 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
extern void request_timer_fn(unsigned long data);
extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
/* this is in drbd_main.c */
extern void drbd_restart_request(struct drbd_request *req);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
......@@ -354,13 +305,13 @@ static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
unsigned long flags;
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
spin_lock_irqsave(&mdev->req_lock, flags);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
rv = __req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (m.bio)
complete_master_bio(mdev, &m);
......@@ -368,7 +319,7 @@ static inline int req_mod(struct drbd_request *req,
return rv;
}
static inline bool drbd_should_do_remote(union drbd_state s)
static inline bool drbd_should_do_remote(union drbd_dev_state s)
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
......@@ -378,7 +329,7 @@ static inline bool drbd_should_do_remote(union drbd_state s)
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
static inline bool drbd_should_send_oos(union drbd_state s)
static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
......
/*
drbd_state.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
from Logicworks, Inc. for making SDP replication support possible.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h"
/* in drbd_main.c */
extern void tl_abort_disk_io(struct drbd_conf *mdev);
struct after_state_chg_work {
struct drbd_work w;
union drbd_state os;
union drbd_state ns;
enum chg_state_flags flags;
struct completion *done;
};
enum sanitize_state_warnings {
NO_WARNING,
ABORTED_ONLINE_VERIFY,
ABORTED_RESYNC,
CONNECTION_LOST_NEGOTIATING,
IMPLICITLY_UPGRADED_DISK,
IMPLICITLY_UPGRADED_PDSK,
};
static int w_after_state_ch(struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags);
static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_tconn *);
static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns,
enum sanitize_state_warnings *warn);
static inline bool is_susp(union drbd_state s)
{
return s.susp || s.susp_nod || s.susp_fen;
}
bool conn_all_vols_unconf(struct drbd_tconn *tconn)
{
struct drbd_conf *mdev;
bool rv = true;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
if (mdev->state.disk != D_DISKLESS ||
mdev->state.conn != C_STANDALONE ||
mdev->state.role != R_SECONDARY) {
rv = false;
break;
}
}
rcu_read_unlock();
return rv;
}
/* Unfortunately the states where not correctly ordered, when
they where defined. therefore can not use max_t() here. */
static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
{
if (role1 == R_PRIMARY || role2 == R_PRIMARY)
return R_PRIMARY;
if (role1 == R_SECONDARY || role2 == R_SECONDARY)
return R_SECONDARY;
return R_UNKNOWN;
}
static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
{
if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
return R_UNKNOWN;
if (role1 == R_SECONDARY || role2 == R_SECONDARY)
return R_SECONDARY;
return R_PRIMARY;
}
enum drbd_role conn_highest_role(struct drbd_tconn *tconn)
{
enum drbd_role role = R_UNKNOWN;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
role = max_role(role, mdev->state.role);
rcu_read_unlock();
return role;
}
enum drbd_role conn_highest_peer(struct drbd_tconn *tconn)
{
enum drbd_role peer = R_UNKNOWN;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
peer = max_role(peer, mdev->state.peer);
rcu_read_unlock();
return peer;
}
enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn)
{
enum drbd_disk_state ds = D_DISKLESS;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
ds = max_t(enum drbd_disk_state, ds, mdev->state.disk);
rcu_read_unlock();
return ds;
}
enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn)
{
enum drbd_disk_state ds = D_MASK;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
ds = min_t(enum drbd_disk_state, ds, mdev->state.disk);
rcu_read_unlock();
return ds;
}
enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn)
{
enum drbd_disk_state ds = D_DISKLESS;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk);
rcu_read_unlock();
return ds;
}
enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn)
{
enum drbd_conns conn = C_MASK;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
conn = min_t(enum drbd_conns, conn, mdev->state.conn);
rcu_read_unlock();
return conn;
}
static bool no_peer_wf_report_params(struct drbd_tconn *tconn)
{
struct drbd_conf *mdev;
int vnr;
bool rv = true;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr)
if (mdev->state.conn == C_WF_REPORT_PARAMS) {
rv = false;
break;
}
rcu_read_unlock();
return rv;
}
/**
* cl_wide_st_chg() - true if the state change is a cluster wide one
* @mdev: DRBD device.
* @os: old (current) state.
* @ns: new (wanted) state.
*/
static int cl_wide_st_chg(struct drbd_conf *mdev,
union drbd_state os, union drbd_state ns)
{
return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
(os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
(os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
(os.disk != D_FAILED && ns.disk == D_FAILED))) ||
(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
(os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
}
static union drbd_state
apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
{
union drbd_state ns;
ns.i = (os.i & ~mask.i) | val.i;
return ns;
}
enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
union drbd_state mask, union drbd_state val)
{
unsigned long flags;
union drbd_state ns;
enum drbd_state_rv rv;
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
ns = apply_mask_val(drbd_read_state(mdev), mask, val);
rv = _drbd_set_state(mdev, ns, f, NULL);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
return rv;
}
/**
* drbd_force_state() - Impose a change which happens outside our control on our state
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*/
void drbd_force_state(struct drbd_conf *mdev,
union drbd_state mask, union drbd_state val)
{
drbd_change_state(mdev, CS_HARD, mask, val);
}
static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
union drbd_state val)
{
union drbd_state os, ns;
unsigned long flags;
enum drbd_state_rv rv;
if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
return SS_CW_SUCCESS;
if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
return SS_CW_FAILED_BY_PEER;
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
os = drbd_read_state(mdev);
ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL);
rv = is_valid_transition(os, ns);
if (rv >= SS_SUCCESS)
rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
if (!cl_wide_st_chg(mdev, os, ns))
rv = SS_CW_NO_NEED;
if (rv == SS_UNKNOWN_ERROR) {
rv = is_valid_state(mdev, ns);
if (rv >= SS_SUCCESS) {
rv = is_valid_soft_transition(os, ns, mdev->tconn);
if (rv >= SS_SUCCESS)
rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
}
}
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
return rv;
}
/**
* drbd_req_state() - Perform an eventually cluster wide state change
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
* @f: flags
*
* Should not be called directly, use drbd_request_state() or
* _drbd_request_state().
*/
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
union drbd_state val, enum chg_state_flags f)
{
struct completion done;
unsigned long flags;
union drbd_state os, ns;
enum drbd_state_rv rv;
init_completion(&done);
if (f & CS_SERIALIZE)
mutex_lock(mdev->state_mutex);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
os = drbd_read_state(mdev);
ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL);
rv = is_valid_transition(os, ns);
if (rv < SS_SUCCESS) {
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
goto abort;
}
if (cl_wide_st_chg(mdev, os, ns)) {
rv = is_valid_state(mdev, ns);
if (rv == SS_SUCCESS)
rv = is_valid_soft_transition(os, ns, mdev->tconn);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (rv < SS_SUCCESS) {
if (f & CS_VERBOSE)
print_st_err(mdev, os, ns, rv);
goto abort;
}
if (drbd_send_state_req(mdev, mask, val)) {
rv = SS_CW_FAILED_BY_PEER;
if (f & CS_VERBOSE)
print_st_err(mdev, os, ns, rv);
goto abort;
}
wait_event(mdev->state_wait,
(rv = _req_st_cond(mdev, mask, val)));
if (rv < SS_SUCCESS) {
if (f & CS_VERBOSE)
print_st_err(mdev, os, ns, rv);
goto abort;
}
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
ns = apply_mask_val(drbd_read_state(mdev), mask, val);
rv = _drbd_set_state(mdev, ns, f, &done);
} else {
rv = _drbd_set_state(mdev, ns, f, &done);
}
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
D_ASSERT(current != mdev->tconn->worker.task);
wait_for_completion(&done);
}
abort:
if (f & CS_SERIALIZE)
mutex_unlock(mdev->state_mutex);
return rv;
}
/**
* _drbd_request_state() - Request a state change (with flags)
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
* @f: flags
*
* Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
* flag, or when logging of failed state change requests is not desired.
*/
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
union drbd_state val, enum chg_state_flags f)
{
enum drbd_state_rv rv;
wait_event(mdev->state_wait,
(rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
return rv;
}
static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
name,
drbd_conn_str(ns.conn),
drbd_role_str(ns.role),
drbd_role_str(ns.peer),
drbd_disk_str(ns.disk),
drbd_disk_str(ns.pdsk),
is_susp(ns) ? 's' : 'r',
ns.aftr_isp ? 'a' : '-',
ns.peer_isp ? 'p' : '-',
ns.user_isp ? 'u' : '-',
ns.susp_fen ? 'F' : '-',
ns.susp_nod ? 'N' : '-'
);
}
void print_st_err(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, enum drbd_state_rv err)
{
if (err == SS_IN_TRANSIENT_STATE)
return;
dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
print_st(mdev, " state", os);
print_st(mdev, "wanted", ns);
}
static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
enum chg_state_flags flags)
{
char *pbp;
pbp = pb;
*pbp = 0;
if (ns.role != os.role && flags & CS_DC_ROLE)
pbp += sprintf(pbp, "role( %s -> %s ) ",
drbd_role_str(os.role),
drbd_role_str(ns.role));
if (ns.peer != os.peer && flags & CS_DC_PEER)
pbp += sprintf(pbp, "peer( %s -> %s ) ",
drbd_role_str(os.peer),
drbd_role_str(ns.peer));
if (ns.conn != os.conn && flags & CS_DC_CONN)
pbp += sprintf(pbp, "conn( %s -> %s ) ",
drbd_conn_str(os.conn),
drbd_conn_str(ns.conn));
if (ns.disk != os.disk && flags & CS_DC_DISK)
pbp += sprintf(pbp, "disk( %s -> %s ) ",
drbd_disk_str(os.disk),
drbd_disk_str(ns.disk));
if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
drbd_disk_str(os.pdsk),
drbd_disk_str(ns.pdsk));
return pbp - pb;
}
static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns,
enum chg_state_flags flags)
{
char pb[300];
char *pbp = pb;
pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
if (ns.aftr_isp != os.aftr_isp)
pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
os.aftr_isp,
ns.aftr_isp);
if (ns.peer_isp != os.peer_isp)
pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
os.peer_isp,
ns.peer_isp);
if (ns.user_isp != os.user_isp)
pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
os.user_isp,
ns.user_isp);
if (pbp != pb)
dev_info(DEV, "%s\n", pb);
}
static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns,
enum chg_state_flags flags)
{
char pb[300];
char *pbp = pb;
pbp += print_state_change(pbp, os, ns, flags);
if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
pbp += sprintf(pbp, "susp( %d -> %d ) ",
is_susp(os),
is_susp(ns));
if (pbp != pb)
conn_info(tconn, "%s\n", pb);
}
/**
* is_valid_state() - Returns an SS_ error code if ns is not valid
* @mdev: DRBD device.
* @ns: State to consider.
*/
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
/* See drbd_state_sw_errors in drbd_strings.c */
enum drbd_fencing_p fp;
enum drbd_state_rv rv = SS_SUCCESS;
struct net_conf *nc;
rcu_read_lock();
fp = FP_DONT_CARE;
if (get_ldev(mdev)) {
fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
put_ldev(mdev);
}
nc = rcu_dereference(mdev->tconn->net_conf);
if (nc) {
if (!nc->two_primaries && ns.role == R_PRIMARY) {
if (ns.peer == R_PRIMARY)
rv = SS_TWO_PRIMARIES;
else if (conn_highest_peer(mdev->tconn) == R_PRIMARY)
rv = SS_O_VOL_PEER_PRI;
}
}
if (rv <= 0)
/* already found a reason to abort */;
else if (ns.role == R_SECONDARY && mdev->open_cnt)
rv = SS_DEVICE_IN_USE;
else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
rv = SS_NO_UP_TO_DATE_DISK;
else if (fp >= FP_RESOURCE &&
ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
rv = SS_PRIMARY_NOP;
else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
rv = SS_NO_UP_TO_DATE_DISK;
else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
rv = SS_NO_LOCAL_DISK;
else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
rv = SS_NO_REMOTE_DISK;
else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
rv = SS_NO_UP_TO_DATE_DISK;
else if ((ns.conn == C_CONNECTED ||
ns.conn == C_WF_BITMAP_S ||
ns.conn == C_SYNC_SOURCE ||
ns.conn == C_PAUSED_SYNC_S) &&
ns.disk == D_OUTDATED)
rv = SS_CONNECTED_OUTDATES;
else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
(nc->verify_alg[0] == 0))
rv = SS_NO_VERIFY_ALG;
else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
mdev->tconn->agreed_pro_version < 88)
rv = SS_NOT_SUPPORTED;
else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
rv = SS_CONNECTED_OUTDATES;
rcu_read_unlock();
return rv;
}
/**
* is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
* This function limits state transitions that may be declined by DRBD. I.e.
* user requests (aka soft transitions).
* @mdev: DRBD device.
* @ns: new state.
* @os: old state.
*/
static enum drbd_state_rv
is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_tconn *tconn)
{
enum drbd_state_rv rv = SS_SUCCESS;
if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
os.conn > C_CONNECTED)
rv = SS_RESYNC_RUNNING;
if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
rv = SS_ALREADY_STANDALONE;
if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
rv = SS_IS_DISKLESS;
if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
rv = SS_NO_NET_CONFIG;
if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
rv = SS_LOWER_THAN_OUTDATED;
if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
rv = SS_IN_TRANSIENT_STATE;
/* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
rv = SS_IN_TRANSIENT_STATE; */
/* While establishing a connection only allow cstate to change.
Delay/refuse role changes, detach attach etc... */
if (test_bit(STATE_SENT, &tconn->flags) &&
!(os.conn == C_WF_REPORT_PARAMS ||
(ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
rv = SS_IN_TRANSIENT_STATE;
if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
rv = SS_NEED_CONNECTION;
if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
ns.conn != os.conn && os.conn > C_CONNECTED)
rv = SS_RESYNC_RUNNING;
if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
os.conn < C_CONNECTED)
rv = SS_NEED_CONNECTION;
if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
&& os.conn < C_WF_REPORT_PARAMS)
rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
return rv;
}
static enum drbd_state_rv
is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
{
/* no change -> nothing to do, at least for the connection part */
if (oc == nc)
return SS_NOTHING_TO_DO;
/* disconnect of an unconfigured connection does not make sense */
if (oc == C_STANDALONE && nc == C_DISCONNECTING)
return SS_ALREADY_STANDALONE;
/* from C_STANDALONE, we start with C_UNCONNECTED */
if (oc == C_STANDALONE && nc != C_UNCONNECTED)
return SS_NEED_CONNECTION;
/* When establishing a connection we need to go through WF_REPORT_PARAMS!
Necessary to do the right thing upon invalidate-remote on a disconnected resource */
if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
return SS_NEED_CONNECTION;
/* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
return SS_IN_TRANSIENT_STATE;
/* After C_DISCONNECTING only C_STANDALONE may follow */
if (oc == C_DISCONNECTING && nc != C_STANDALONE)
return SS_IN_TRANSIENT_STATE;
return SS_SUCCESS;
}
/**
* is_valid_transition() - Returns an SS_ error code if the state transition is not possible
* This limits hard state transitions. Hard state transitions are facts there are
* imposed on DRBD by the environment. E.g. disk broke or network broke down.
* But those hard state transitions are still not allowed to do everything.
* @ns: new state.
* @os: old state.
*/
static enum drbd_state_rv
is_valid_transition(union drbd_state os, union drbd_state ns)
{
enum drbd_state_rv rv;
rv = is_valid_conn_transition(os.conn, ns.conn);
/* we cannot fail (again) if we already detached */
if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
rv = SS_IS_DISKLESS;
return rv;
}
static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
{
static const char *msg_table[] = {
[NO_WARNING] = "",
[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
[ABORTED_RESYNC] = "Resync aborted.",
[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
};
if (warn != NO_WARNING)
dev_warn(DEV, "%s\n", msg_table[warn]);
}
/**
* sanitize_state() - Resolves implicitly necessary additional changes to a state transition
* @mdev: DRBD device.
* @os: old state.
* @ns: new state.
* @warn_sync_abort:
*
* When we loose connection, we have to set the state of the peers disk (pdsk)
* to D_UNKNOWN. This rule and many more along those lines are in this function.
*/
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns,
enum sanitize_state_warnings *warn)
{
enum drbd_fencing_p fp;
enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
if (warn)
*warn = NO_WARNING;
fp = FP_DONT_CARE;
if (get_ldev(mdev)) {
rcu_read_lock();
fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
rcu_read_unlock();
put_ldev(mdev);
}
/* Implications from connection to peer and peer_isp */
if (ns.conn < C_CONNECTED) {
ns.peer_isp = 0;
ns.peer = R_UNKNOWN;
if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
ns.pdsk = D_UNKNOWN;
}
/* Clear the aftr_isp when becoming unconfigured */
if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
ns.aftr_isp = 0;
/* An implication of the disk states onto the connection state */
/* Abort resync if a disk fails/detaches */
if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
if (warn)
*warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
ns.conn = C_CONNECTED;
}
/* Connection breaks down before we finished "Negotiating" */
if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
get_ldev_if_state(mdev, D_NEGOTIATING)) {
if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
ns.disk = mdev->new_state_tmp.disk;
ns.pdsk = mdev->new_state_tmp.pdsk;
} else {
if (warn)
*warn = CONNECTION_LOST_NEGOTIATING;
ns.disk = D_DISKLESS;
ns.pdsk = D_UNKNOWN;
}
put_ldev(mdev);
}
/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
ns.disk = D_UP_TO_DATE;
if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
ns.pdsk = D_UP_TO_DATE;
}
/* Implications of the connection stat on the disk states */
disk_min = D_DISKLESS;
disk_max = D_UP_TO_DATE;
pdsk_min = D_INCONSISTENT;
pdsk_max = D_UNKNOWN;
switch ((enum drbd_conns)ns.conn) {
case C_WF_BITMAP_T:
case C_PAUSED_SYNC_T:
case C_STARTING_SYNC_T:
case C_WF_SYNC_UUID:
case C_BEHIND:
disk_min = D_INCONSISTENT;
disk_max = D_OUTDATED;
pdsk_min = D_UP_TO_DATE;
pdsk_max = D_UP_TO_DATE;
break;
case C_VERIFY_S:
case C_VERIFY_T:
disk_min = D_UP_TO_DATE;
disk_max = D_UP_TO_DATE;
pdsk_min = D_UP_TO_DATE;
pdsk_max = D_UP_TO_DATE;
break;
case C_CONNECTED:
disk_min = D_DISKLESS;
disk_max = D_UP_TO_DATE;
pdsk_min = D_DISKLESS;
pdsk_max = D_UP_TO_DATE;
break;
case C_WF_BITMAP_S:
case C_PAUSED_SYNC_S:
case C_STARTING_SYNC_S:
case C_AHEAD:
disk_min = D_UP_TO_DATE;
disk_max = D_UP_TO_DATE;
pdsk_min = D_INCONSISTENT;
pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
break;
case C_SYNC_TARGET:
disk_min = D_INCONSISTENT;
disk_max = D_INCONSISTENT;
pdsk_min = D_UP_TO_DATE;
pdsk_max = D_UP_TO_DATE;
break;
case C_SYNC_SOURCE:
disk_min = D_UP_TO_DATE;
disk_max = D_UP_TO_DATE;
pdsk_min = D_INCONSISTENT;
pdsk_max = D_INCONSISTENT;
break;
case C_STANDALONE:
case C_DISCONNECTING:
case C_UNCONNECTED:
case C_TIMEOUT:
case C_BROKEN_PIPE:
case C_NETWORK_FAILURE:
case C_PROTOCOL_ERROR:
case C_TEAR_DOWN:
case C_WF_CONNECTION:
case C_WF_REPORT_PARAMS:
case C_MASK:
break;
}
if (ns.disk > disk_max)
ns.disk = disk_max;
if (ns.disk < disk_min) {
if (warn)
*warn = IMPLICITLY_UPGRADED_DISK;
ns.disk = disk_min;
}
if (ns.pdsk > pdsk_max)
ns.pdsk = pdsk_max;
if (ns.pdsk < pdsk_min) {
if (warn)
*warn = IMPLICITLY_UPGRADED_PDSK;
ns.pdsk = pdsk_min;
}
if (fp == FP_STONITH &&
(ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED))
ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO &&
(ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
if (ns.conn == C_SYNC_SOURCE)
ns.conn = C_PAUSED_SYNC_S;
if (ns.conn == C_SYNC_TARGET)
ns.conn = C_PAUSED_SYNC_T;
} else {
if (ns.conn == C_PAUSED_SYNC_S)
ns.conn = C_SYNC_SOURCE;
if (ns.conn == C_PAUSED_SYNC_T)
ns.conn = C_SYNC_TARGET;
}
return ns;
}
void drbd_resume_al(struct drbd_conf *mdev)
{
if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
dev_info(DEV, "Resumed AL updates\n");
}
/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
if (mdev->tconn->agreed_pro_version < 90)
mdev->ov_start_sector = 0;
mdev->rs_total = drbd_bm_bits(mdev);
mdev->ov_position = 0;
if (cs == C_VERIFY_T) {
/* starting online verify from an arbitrary position
* does not fit well into the existing protocol.
* on C_VERIFY_T, we initialize ov_left and friends
* implicitly in receive_DataRequest once the
* first P_OV_REQUEST is received */
mdev->ov_start_sector = ~(sector_t)0;
} else {
unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
if (bit >= mdev->rs_total) {
mdev->ov_start_sector =
BM_BIT_TO_SECT(mdev->rs_total - 1);
mdev->rs_total = 1;
} else
mdev->rs_total -= bit;
mdev->ov_position = mdev->ov_start_sector;
}
mdev->ov_left = mdev->rs_total;
}
/**
* __drbd_set_state() - Set a new DRBD state
* @mdev: DRBD device.
* @ns: new state.
* @flags: Flags
* @done: Optional completion, that will get completed after the after_state_ch() finished
*
* Caller needs to hold req_lock, and global_state_lock. Do not call directly.
*/
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
enum chg_state_flags flags, struct completion *done)
{
union drbd_state os;
enum drbd_state_rv rv = SS_SUCCESS;
enum sanitize_state_warnings ssw;
struct after_state_chg_work *ascw;
os = drbd_read_state(mdev);
ns = sanitize_state(mdev, ns, &ssw);
if (ns.i == os.i)
return SS_NOTHING_TO_DO;
rv = is_valid_transition(os, ns);
if (rv < SS_SUCCESS)
return rv;
if (!(flags & CS_HARD)) {
/* pre-state-change checks ; only look at ns */
/* See drbd_state_sw_errors in drbd_strings.c */
rv = is_valid_state(mdev, ns);
if (rv < SS_SUCCESS) {
/* If the old state was illegal as well, then let
this happen...*/
if (is_valid_state(mdev, os) == rv)
rv = is_valid_soft_transition(os, ns, mdev->tconn);
} else
rv = is_valid_soft_transition(os, ns, mdev->tconn);
}
if (rv < SS_SUCCESS) {
if (flags & CS_VERBOSE)
print_st_err(mdev, os, ns, rv);
return rv;
}
print_sanitize_warnings(mdev, ssw);
drbd_pr_state_change(mdev, os, ns, flags);
/* Display changes to the susp* flags that where caused by the call to
sanitize_state(). Only display it here if we where not called from
_conn_request_state() */
if (!(flags & CS_DC_SUSP))
conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP);
/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
* on the ldev here, to be sure the transition -> D_DISKLESS resp.
* drbd_ldev_destroy() won't happen before our corresponding
* after_state_ch works run, where we put_ldev again. */
if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
(os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
atomic_inc(&mdev->local_cnt);
mdev->state.i = ns.i;
mdev->tconn->susp = ns.susp;
mdev->tconn->susp_nod = ns.susp_nod;
mdev->tconn->susp_fen = ns.susp_fen;
if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
drbd_print_uuids(mdev, "attached to UUIDs");
/* Wake up role changes, that were delayed because of connection establishing */
if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
no_peer_wf_report_params(mdev->tconn))
clear_bit(STATE_SENT, &mdev->tconn->flags);
wake_up(&mdev->misc_wait);
wake_up(&mdev->state_wait);
wake_up(&mdev->tconn->ping_wait);
/* Aborted verify run, or we reached the stop sector.
* Log the last position, unless end-of-device. */
if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
ns.conn <= C_CONNECTED) {
mdev->ov_start_sector =
BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
if (mdev->ov_left)
dev_info(DEV, "Online Verify reached sector %llu\n",
(unsigned long long)mdev->ov_start_sector);
}
if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
(ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
dev_info(DEV, "Syncer continues.\n");
mdev->rs_paused += (long)jiffies
-(long)mdev->rs_mark_time[mdev->rs_last_mark];
if (ns.conn == C_SYNC_TARGET)
mod_timer(&mdev->resync_timer, jiffies);
}
if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
(ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
dev_info(DEV, "Resync suspended\n");
mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
}
if (os.conn == C_CONNECTED &&
(ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
unsigned long now = jiffies;
int i;
set_ov_position(mdev, ns.conn);
mdev->rs_start = now;
mdev->rs_last_events = 0;
mdev->rs_last_sect_ev = 0;
mdev->ov_last_oos_size = 0;
mdev->ov_last_oos_start = 0;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
mdev->rs_mark_left[i] = mdev->ov_left;
mdev->rs_mark_time[i] = now;
}
drbd_rs_controller_reset(mdev);
if (ns.conn == C_VERIFY_S) {
dev_info(DEV, "Starting Online Verify from sector %llu\n",
(unsigned long long)mdev->ov_position);
mod_timer(&mdev->resync_timer, jiffies);
}
}
if (get_ldev(mdev)) {
u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
mdf &= ~MDF_AL_CLEAN;
if (test_bit(CRASHED_PRIMARY, &mdev->flags))
mdf |= MDF_CRASHED_PRIMARY;
if (mdev->state.role == R_PRIMARY ||
(mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
mdf |= MDF_PRIMARY_IND;
if (mdev->state.conn > C_WF_REPORT_PARAMS)
mdf |= MDF_CONNECTED_IND;
if (mdev->state.disk > D_INCONSISTENT)
mdf |= MDF_CONSISTENT;
if (mdev->state.disk > D_OUTDATED)
mdf |= MDF_WAS_UP_TO_DATE;
if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
mdf |= MDF_PEER_OUT_DATED;
if (mdf != mdev->ldev->md.flags) {
mdev->ldev->md.flags = mdf;
drbd_md_mark_dirty(mdev);
}
if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
put_ldev(mdev);
}
/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
set_bit(CONSIDER_RESYNC, &mdev->flags);
/* Receiver should clean up itself */
if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
drbd_thread_stop_nowait(&mdev->tconn->receiver);
/* Now the receiver finished cleaning up itself, it should die */
if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
drbd_thread_stop_nowait(&mdev->tconn->receiver);
/* Upon network failure, we need to restart the receiver. */
if (os.conn > C_WF_CONNECTION &&
ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
drbd_thread_restart_nowait(&mdev->tconn->receiver);
/* Resume AL writing if we get a connection */
if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
drbd_resume_al(mdev);
/* remember last attach time so request_timer_fn() won't
* kill newly established sessions while we are still trying to thaw
* previously frozen IO */
if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
ns.disk > D_NEGOTIATING)
mdev->last_reattach_jif = jiffies;
ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
if (ascw) {
ascw->os = os;
ascw->ns = ns;
ascw->flags = flags;
ascw->w.cb = w_after_state_ch;
ascw->w.mdev = mdev;
ascw->done = done;
drbd_queue_work(&mdev->tconn->sender_work, &ascw->w);
} else {
dev_err(DEV, "Could not kmalloc an ascw\n");
}
return rv;
}
static int w_after_state_ch(struct drbd_work *w, int unused)
{
struct after_state_chg_work *ascw =
container_of(w, struct after_state_chg_work, w);
struct drbd_conf *mdev = w->mdev;
after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
if (ascw->flags & CS_WAIT_COMPLETE) {
D_ASSERT(ascw->done != NULL);
complete(ascw->done);
}
kfree(ascw);
return 0;
}
static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
if (rv) {
dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
return;
}
switch (mdev->state.conn) {
case C_STARTING_SYNC_T:
_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
break;
case C_STARTING_SYNC_S:
drbd_start_resync(mdev, C_SYNC_SOURCE);
break;
}
}
int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags)
{
int rv;
D_ASSERT(current == mdev->tconn->worker.task);
/* open coded non-blocking drbd_suspend_io(mdev); */
set_bit(SUSPEND_IO, &mdev->flags);
drbd_bm_lock(mdev, why, flags);
rv = io_fn(mdev);
drbd_bm_unlock(mdev);
drbd_resume_io(mdev);
return rv;
}
/**
* after_state_ch() - Perform after state change actions that may sleep
* @mdev: DRBD device.
* @os: old state.
* @ns: new state.
* @flags: Flags
*/
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags)
{
struct sib_info sib;
sib.sib_reason = SIB_STATE_CHANGE;
sib.os = os;
sib.ns = ns;
if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
clear_bit(CRASHED_PRIMARY, &mdev->flags);
if (mdev->p_uuid)
mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
}
/* Inform userspace about the change... */
drbd_bcast_event(mdev, &sib);
if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
(ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
drbd_khelper(mdev, "pri-on-incon-degr");
/* Here we have the actions that are performed after a
state change. This function might sleep */
if (ns.susp_nod) {
struct drbd_tconn *tconn = mdev->tconn;
enum drbd_req_event what = NOTHING;
spin_lock_irq(&tconn->req_lock);
if (os.conn < C_CONNECTED && conn_lowest_conn(tconn) >= C_CONNECTED)
what = RESEND;
if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
conn_lowest_disk(tconn) > D_NEGOTIATING)
what = RESTART_FROZEN_DISK_IO;
if (tconn->susp_nod && what != NOTHING) {
_tl_restart(tconn, what);
_conn_request_state(tconn,
(union drbd_state) { { .susp_nod = 1 } },
(union drbd_state) { { .susp_nod = 0 } },
CS_VERBOSE);
}
spin_unlock_irq(&tconn->req_lock);
}
if (ns.susp_fen) {
struct drbd_tconn *tconn = mdev->tconn;
spin_lock_irq(&tconn->req_lock);
if (tconn->susp_fen && conn_lowest_conn(tconn) >= C_CONNECTED) {
/* case2: The connection was established again: */
struct drbd_conf *odev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, odev, vnr)
clear_bit(NEW_CUR_UUID, &odev->flags);
rcu_read_unlock();
_tl_restart(tconn, RESEND);
_conn_request_state(tconn,
(union drbd_state) { { .susp_fen = 1 } },
(union drbd_state) { { .susp_fen = 0 } },
CS_VERBOSE);
}
spin_unlock_irq(&tconn->req_lock);
}
/* Became sync source. With protocol >= 96, we still need to send out
* the sync uuid now. Need to do that before any drbd_send_state, or
* the other side may go "paused sync" before receiving the sync uuids,
* which is unexpected. */
if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
(ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) {
drbd_gen_and_send_sync_uuid(mdev);
put_ldev(mdev);
}
/* Do not change the order of the if above and the two below... */
if (os.pdsk == D_DISKLESS &&
ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */
/* we probably will start a resync soon.
* make sure those things are properly reset. */
mdev->rs_total = 0;
mdev->rs_failed = 0;
atomic_set(&mdev->rs_pending_cnt, 0);
drbd_rs_cancel_all(mdev);
drbd_send_uuids(mdev);
drbd_send_state(mdev, ns);
}
/* No point in queuing send_bitmap if we don't have a connection
* anymore, so check also the _current_ state, not only the new state
* at the time this work was queued. */
if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
mdev->state.conn == C_WF_BITMAP_S)
drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
"send_bitmap (WFBitMapS)",
BM_LOCKED_TEST_ALLOWED);
/* Lost contact to peer's copy of the data */
if ((os.pdsk >= D_INCONSISTENT &&
os.pdsk != D_UNKNOWN &&
os.pdsk != D_OUTDATED)
&& (ns.pdsk < D_INCONSISTENT ||
ns.pdsk == D_UNKNOWN ||
ns.pdsk == D_OUTDATED)) {
if (get_ldev(mdev)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
if (drbd_suspended(mdev)) {
set_bit(NEW_CUR_UUID, &mdev->flags);
} else {
drbd_uuid_new_current(mdev);
drbd_send_uuids(mdev);
}
}
put_ldev(mdev);
}
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
drbd_uuid_new_current(mdev);
drbd_send_uuids(mdev);
}
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
/* We may still be Primary ourselves.
* No harm done if the bitmap still changes,
* redirtied pages will follow later. */
drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
"demote diskless peer", BM_LOCKED_SET_ALLOWED);
put_ldev(mdev);
}
/* Write out all changed bits on demote.
* Though, no need to da that just yet
* if there is a resync going on still */
if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
/* No changes to the bitmap expected this time, so assert that,
* even though no harm was done if it did change. */
drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
"demote", BM_LOCKED_TEST_ALLOWED);
put_ldev(mdev);
}
/* Last part of the attaching process ... */
if (ns.conn >= C_CONNECTED &&
os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
drbd_send_sizes(mdev, 0, 0); /* to start sync... */
drbd_send_uuids(mdev);
drbd_send_state(mdev, ns);
}
/* We want to pause/continue resync, tell peer. */
if (ns.conn >= C_CONNECTED &&
((os.aftr_isp != ns.aftr_isp) ||
(os.user_isp != ns.user_isp)))
drbd_send_state(mdev, ns);
/* In case one of the isp bits got set, suspend other devices. */
if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
(ns.aftr_isp || ns.peer_isp || ns.user_isp))
suspend_other_sg(mdev);
/* Make sure the peer gets informed about eventual state
changes (ISP bits) while we were in WFReportParams. */
if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
drbd_send_state(mdev, ns);
if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
drbd_send_state(mdev, ns);
/* We are in the progress to start a full sync... */
if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
(os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
/* no other bitmap changes expected during this phase */
drbd_queue_bitmap_io(mdev,
&drbd_bmio_set_n_write, &abw_start_sync,
"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
/* We are invalidating our self... */
if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
/* other bitmap operation expected during this phase */
drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
"set_n_write from invalidate", BM_LOCKED_MASK);
/* first half of local IO error, failure to attach,
* or administrative detach */
if (os.disk != D_FAILED && ns.disk == D_FAILED) {
enum drbd_io_error_p eh = EP_PASS_ON;
int was_io_error = 0;
/* corresponding get_ldev was in __drbd_set_state, to serialize
* our cleanup here with the transition to D_DISKLESS.
* But is is still not save to dreference ldev here, since
* we might come from an failed Attach before ldev was set. */
if (mdev->ldev) {
rcu_read_lock();
eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error;
rcu_read_unlock();
was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
if (was_io_error && eh == EP_CALL_HELPER)
drbd_khelper(mdev, "local-io-error");
/* Immediately allow completion of all application IO,
* that waits for completion from the local disk,
* if this was a force-detach due to disk_timeout
* or administrator request (drbdsetup detach --force).
* Do NOT abort otherwise.
* Aborting local requests may cause serious problems,
* if requests are completed to upper layers already,
* and then later the already submitted local bio completes.
* This can cause DMA into former bio pages that meanwhile
* have been re-used for other things.
* So aborting local requests may cause crashes,
* or even worse, silent data corruption.
*/
if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
tl_abort_disk_io(mdev);
/* current state still has to be D_FAILED,
* there is only one way out: to D_DISKLESS,
* and that may only happen after our put_ldev below. */
if (mdev->state.disk != D_FAILED)
dev_err(DEV,
"ASSERT FAILED: disk is %s during detach\n",
drbd_disk_str(mdev->state.disk));
if (ns.conn >= C_CONNECTED)
drbd_send_state(mdev, ns);
drbd_rs_cancel_all(mdev);
/* In case we want to get something to stable storage still,
* this may be the last chance.
* Following put_ldev may transition to D_DISKLESS. */
drbd_md_sync(mdev);
}
put_ldev(mdev);
}
/* second half of local IO error, failure to attach,
* or administrative detach,
* after local_cnt references have reached zero again */
if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
/* We must still be diskless,
* re-attach has to be serialized with this! */
if (mdev->state.disk != D_DISKLESS)
dev_err(DEV,
"ASSERT FAILED: disk is %s while going diskless\n",
drbd_disk_str(mdev->state.disk));
if (ns.conn >= C_CONNECTED)
drbd_send_state(mdev, ns);
/* corresponding get_ldev in __drbd_set_state
* this may finally trigger drbd_ldev_destroy. */
put_ldev(mdev);
}
/* Notify peer that I had a local IO error, and did not detached.. */
if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
drbd_send_state(mdev, ns);
/* Disks got bigger while they were detached */
if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
if (ns.conn == C_CONNECTED)
resync_after_online_grow(mdev);
}
/* A resync finished or aborted, wake paused devices... */
if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
(os.peer_isp && !ns.peer_isp) ||
(os.user_isp && !ns.user_isp))
resume_next_sg(mdev);
/* sync target done with resync. Explicitly notify peer, even though
* it should (at least for non-empty resyncs) already know itself. */
if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
drbd_send_state(mdev, ns);
/* Verify finished, or reached stop sector. Peer did not know about
* the stop sector, and we may even have changed the stop sector during
* verify to interrupt/stop early. Send the new state. */
if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
&& verify_can_do_stop_sector(mdev))
drbd_send_state(mdev, ns);
/* This triggers bitmap writeout of potentially still unwritten pages
* if the resync finished cleanly, or aborted because of peer disk
* failure, or because of connection loss.
* For resync aborted because of local disk failure, we cannot do
* any bitmap writeout anymore.
* No harm done if some bits change during this phase.
*/
if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
put_ldev(mdev);
}
if (ns.disk == D_DISKLESS &&
ns.conn == C_STANDALONE &&
ns.role == R_SECONDARY) {
if (os.aftr_isp != ns.aftr_isp)
resume_next_sg(mdev);
}
drbd_md_sync(mdev);
}
struct after_conn_state_chg_work {
struct drbd_work w;
enum drbd_conns oc;
union drbd_state ns_min;
union drbd_state ns_max; /* new, max state, over all mdevs */
enum chg_state_flags flags;
};
static int w_after_conn_state_ch(struct drbd_work *w, int unused)
{
struct after_conn_state_chg_work *acscw =
container_of(w, struct after_conn_state_chg_work, w);
struct drbd_tconn *tconn = w->tconn;
enum drbd_conns oc = acscw->oc;
union drbd_state ns_max = acscw->ns_max;
struct drbd_conf *mdev;
int vnr;
kfree(acscw);
/* Upon network configuration, we need to start the receiver */
if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
drbd_thread_start(&tconn->receiver);
if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
struct net_conf *old_conf;
mutex_lock(&tconn->conf_update);
old_conf = tconn->net_conf;
tconn->my_addr_len = 0;
tconn->peer_addr_len = 0;
rcu_assign_pointer(tconn->net_conf, NULL);
conn_free_crypto(tconn);
mutex_unlock(&tconn->conf_update);
synchronize_rcu();
kfree(old_conf);
}
if (ns_max.susp_fen) {
/* case1: The outdate peer handler is successful: */
if (ns_max.pdsk <= D_OUTDATED) {
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
drbd_uuid_new_current(mdev);
clear_bit(NEW_CUR_UUID, &mdev->flags);
}
}
rcu_read_unlock();
spin_lock_irq(&tconn->req_lock);
_tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
_conn_request_state(tconn,
(union drbd_state) { { .susp_fen = 1 } },
(union drbd_state) { { .susp_fen = 0 } },
CS_VERBOSE);
spin_unlock_irq(&tconn->req_lock);
}
}
kref_put(&tconn->kref, &conn_destroy);
conn_md_sync(tconn);
return 0;
}
void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf)
{
enum chg_state_flags flags = ~0;
struct drbd_conf *mdev;
int vnr, first_vol = 1;
union drbd_dev_state os, cs = {
{ .role = R_SECONDARY,
.peer = R_UNKNOWN,
.conn = tconn->cstate,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
} };
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
os = mdev->state;
if (first_vol) {
cs = os;
first_vol = 0;
continue;
}
if (cs.role != os.role)
flags &= ~CS_DC_ROLE;
if (cs.peer != os.peer)
flags &= ~CS_DC_PEER;
if (cs.conn != os.conn)
flags &= ~CS_DC_CONN;
if (cs.disk != os.disk)
flags &= ~CS_DC_DISK;
if (cs.pdsk != os.pdsk)
flags &= ~CS_DC_PDSK;
}
rcu_read_unlock();
*pf |= CS_DC_MASK;
*pf &= flags;
(*pcs).i = cs.i;
}
static enum drbd_state_rv
conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags)
{
enum drbd_state_rv rv = SS_SUCCESS;
union drbd_state ns, os;
struct drbd_conf *mdev;
int vnr;
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
os = drbd_read_state(mdev);
ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL);
if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
ns.disk = os.disk;
if (ns.i == os.i)
continue;
rv = is_valid_transition(os, ns);
if (rv < SS_SUCCESS)
break;
if (!(flags & CS_HARD)) {
rv = is_valid_state(mdev, ns);
if (rv < SS_SUCCESS) {
if (is_valid_state(mdev, os) == rv)
rv = is_valid_soft_transition(os, ns, tconn);
} else
rv = is_valid_soft_transition(os, ns, tconn);
}
if (rv < SS_SUCCESS)
break;
}
rcu_read_unlock();
if (rv < SS_SUCCESS && flags & CS_VERBOSE)
print_st_err(mdev, os, ns, rv);
return rv;
}
void
conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
{
union drbd_state ns, os, ns_max = { };
union drbd_state ns_min = {
{ .role = R_MASK,
.peer = R_MASK,
.conn = val.conn,
.disk = D_MASK,
.pdsk = D_MASK
} };
struct drbd_conf *mdev;
enum drbd_state_rv rv;
int vnr, number_of_volumes = 0;
if (mask.conn == C_MASK) {
/* remember last connect time so request_timer_fn() won't
* kill newly established sessions while we are still trying to thaw
* previously frozen IO */
if (tconn->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
tconn->last_reconnect_jif = jiffies;
tconn->cstate = val.conn;
}
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
number_of_volumes++;
os = drbd_read_state(mdev);
ns = apply_mask_val(os, mask, val);
ns = sanitize_state(mdev, ns, NULL);
if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
ns.disk = os.disk;
rv = __drbd_set_state(mdev, ns, flags, NULL);
if (rv < SS_SUCCESS)
BUG();
ns.i = mdev->state.i;
ns_max.role = max_role(ns.role, ns_max.role);
ns_max.peer = max_role(ns.peer, ns_max.peer);
ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
ns_min.role = min_role(ns.role, ns_min.role);
ns_min.peer = min_role(ns.peer, ns_min.peer);
ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
}
rcu_read_unlock();
if (number_of_volumes == 0) {
ns_min = ns_max = (union drbd_state) { {
.role = R_SECONDARY,
.peer = R_UNKNOWN,
.conn = val.conn,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN
} };
}
ns_min.susp = ns_max.susp = tconn->susp;
ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod;
ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen;
*pns_min = ns_min;
*pns_max = ns_max;
}
static enum drbd_state_rv
_conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
{
enum drbd_state_rv rv;
if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags))
return SS_CW_SUCCESS;
if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
return SS_CW_FAILED_BY_PEER;
rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR;
if (rv == SS_UNKNOWN_ERROR)
rv = conn_is_valid_transition(tconn, mask, val, 0);
if (rv == SS_SUCCESS)
rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
return rv;
}
enum drbd_state_rv
_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags)
{
enum drbd_state_rv rv = SS_SUCCESS;
struct after_conn_state_chg_work *acscw;
enum drbd_conns oc = tconn->cstate;
union drbd_state ns_max, ns_min, os;
bool have_mutex = false;
if (mask.conn) {
rv = is_valid_conn_transition(oc, val.conn);
if (rv < SS_SUCCESS)
goto abort;
}
rv = conn_is_valid_transition(tconn, mask, val, flags);
if (rv < SS_SUCCESS)
goto abort;
if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
!(flags & (CS_LOCAL_ONLY | CS_HARD))) {
/* This will be a cluster-wide state change.
* Need to give up the spinlock, grab the mutex,
* then send the state change request, ... */
spin_unlock_irq(&tconn->req_lock);
mutex_lock(&tconn->cstate_mutex);
have_mutex = true;
set_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
if (conn_send_state_req(tconn, mask, val)) {
/* sending failed. */
clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
rv = SS_CW_FAILED_BY_PEER;
/* need to re-aquire the spin lock, though */
goto abort_unlocked;
}
if (val.conn == C_DISCONNECTING)
set_bit(DISCONNECT_SENT, &tconn->flags);
/* ... and re-aquire the spinlock.
* If _conn_rq_cond() returned >= SS_SUCCESS, we must call
* conn_set_state() within the same spinlock. */
spin_lock_irq(&tconn->req_lock);
wait_event_lock_irq(tconn->ping_wait,
(rv = _conn_rq_cond(tconn, mask, val)),
tconn->req_lock,
);
clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
if (rv < SS_SUCCESS)
goto abort;
}
conn_old_common_state(tconn, &os, &flags);
flags |= CS_DC_SUSP;
conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags);
conn_pr_state_change(tconn, os, ns_max, flags);
acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
if (acscw) {
acscw->oc = os.conn;
acscw->ns_min = ns_min;
acscw->ns_max = ns_max;
acscw->flags = flags;
acscw->w.cb = w_after_conn_state_ch;
kref_get(&tconn->kref);
acscw->w.tconn = tconn;
drbd_queue_work(&tconn->sender_work, &acscw->w);
} else {
conn_err(tconn, "Could not kmalloc an acscw\n");
}
abort:
if (have_mutex) {
/* mutex_unlock() "... must not be used in interrupt context.",
* so give up the spinlock, then re-aquire it */
spin_unlock_irq(&tconn->req_lock);
abort_unlocked:
mutex_unlock(&tconn->cstate_mutex);
spin_lock_irq(&tconn->req_lock);
}
if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv));
conn_err(tconn, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
conn_err(tconn, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
}
return rv;
}
enum drbd_state_rv
conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags)
{
enum drbd_state_rv rv;
spin_lock_irq(&tconn->req_lock);
rv = _conn_request_state(tconn, mask, val, flags);
spin_unlock_irq(&tconn->req_lock);
return rv;
}
#ifndef DRBD_STATE_H
#define DRBD_STATE_H
struct drbd_conf;
struct drbd_tconn;
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
*
* The NS macros expand to a mask and a value, that can be bit ored onto the
* current state as soon as the spinlock (req_lock) was taken.
*
* The _NS macros are used for state functions that get called with the
* spinlock. These macros expand directly to the new state value.
*
* Besides the basic forms NS() and _NS() additional _?NS[23] are defined
* to express state changes that affect more than one aspect of the state.
*
* E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
* Means that the network connection was established and that the peer
* is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define susp_nod_MASK 1
#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val.T3 = (S3); val; })
#define _NS(D, T, S) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
enum chg_state_flags {
CS_HARD = 1 << 0,
CS_VERBOSE = 1 << 1,
CS_WAIT_COMPLETE = 1 << 2,
CS_SERIALIZE = 1 << 3,
CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
CS_DC_PEER = 1 << 6,
CS_DC_CONN = 1 << 7,
CS_DC_DISK = 1 << 8,
CS_DC_PDSK = 1 << 9,
CS_DC_SUSP = 1 << 10,
CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
CS_IGN_OUTD_FAIL = 1 << 11,
};
/* drbd_dev_state and drbd_state are different types. This is to stress the
small difference. There is no suspended flag (.susp), and no suspended
while fence handler runs flas (susp_fen). */
union drbd_dev_state {
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned _unused:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11;
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned _unused:1 ;
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned conn:5 ; /* 17/32 cstates */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else
# error "this endianess is not supported"
#endif
};
unsigned int i;
};
extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
enum chg_state_flags f,
union drbd_state mask,
union drbd_state val);
extern void drbd_force_state(struct drbd_conf *, union drbd_state,
union drbd_state);
extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
union drbd_state,
union drbd_state,
enum chg_state_flags);
extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
enum chg_state_flags,
struct completion *done);
extern void print_st_err(struct drbd_conf *, union drbd_state,
union drbd_state, int);
enum drbd_state_rv
_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
enum drbd_state_rv
conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
enum chg_state_flags flags);
extern void drbd_resume_al(struct drbd_conf *mdev);
extern bool conn_all_vols_unconf(struct drbd_tconn *tconn);
/**
* drbd_request_state() - Reqest a state change
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*
* This is the most graceful way of requesting a state change. It is verbose
* quite verbose in case the state change is not possible, and all those
* state changes are globally serialized.
*/
static inline int drbd_request_state(struct drbd_conf *mdev,
union drbd_state mask,
union drbd_state val)
{
return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
}
enum drbd_role conn_highest_role(struct drbd_tconn *tconn);
enum drbd_role conn_highest_peer(struct drbd_tconn *tconn);
enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn);
enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn);
enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn);
enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn);
#endif
......@@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = {
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
const char *drbd_conn_str(enum drbd_conns s)
......
......@@ -38,16 +38,13 @@
#include "drbd_int.h"
#include "drbd_req.h"
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
static int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel);
static int w_make_ov_request(struct drbd_work *w, int cancel);
/* endio handlers:
* drbd_md_io_complete (defined here)
* drbd_endio_pri (defined here)
* drbd_endio_sec (defined here)
* drbd_request_endio (defined here)
* drbd_peer_request_endio (defined here)
* bm_async_io_complete (defined in drbd_bitmap.c)
*
* For all these callbacks, note the following:
......@@ -60,7 +57,7 @@ static int w_make_resync_request(struct drbd_conf *mdev,
/* About the global_state_lock
Each state transition on an device holds a read lock. In case we have
to evaluate the sync after dependencies, we grab a write lock, because
to evaluate the resync after dependencies, we grab a write lock, because
we need stable states on all devices for that. */
rwlock_t global_state_lock;
......@@ -98,97 +95,93 @@ void drbd_md_io_complete(struct bio *bio, int error)
/* reads on behalf of the partner,
* "submitted" by the receiver
*/
void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
struct drbd_conf *mdev = e->mdev;
D_ASSERT(e->block_id != ID_VACANT);
struct drbd_conf *mdev = peer_req->w.mdev;
spin_lock_irqsave(&mdev->req_lock, flags);
mdev->read_cnt += e->size >> 9;
list_del(&e->w.list);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
mdev->read_cnt += peer_req->i.size >> 9;
list_del(&peer_req->w.list);
if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait);
if (test_bit(__EE_WAS_ERROR, &e->flags))
if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
__drbd_chk_io_error(mdev, DRBD_READ_ERROR);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
drbd_queue_work(&mdev->data.work, &e->w);
drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w);
put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver, final stage. */
static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
struct drbd_conf *mdev = e->mdev;
sector_t e_sector;
struct drbd_conf *mdev = peer_req->w.mdev;
struct drbd_interval i;
int do_wake;
int is_syncer_req;
u64 block_id;
int do_al_complete_io;
D_ASSERT(e->block_id != ID_VACANT);
/* after we moved e to done_ee,
/* after we moved peer_req to done_ee,
* we may no longer access it,
* it may be freed/reused already!
* (as soon as we release the req_lock) */
e_sector = e->sector;
do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
is_syncer_req = is_syncer_block_id(e->block_id);
i = peer_req->i;
do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
block_id = peer_req->block_id;
spin_lock_irqsave(&mdev->req_lock, flags);
mdev->writ_cnt += e->size >> 9;
list_del(&e->w.list); /* has been on active_ee or sync_ee */
list_add_tail(&e->w.list, &mdev->done_ee);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
mdev->writ_cnt += peer_req->i.size >> 9;
list_move_tail(&peer_req->w.list, &mdev->done_ee);
/* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
* neither did we wake possibly waiting conflicting requests.
* done from "drbd_process_done_ee" within the appropriate w.cb
* (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
/*
* Do not remove from the write_requests tree here: we did not send the
* Ack yet and did not wake possibly waiting conflicting requests.
* Removed from the tree from "drbd_process_done_ee" within the
* appropriate w.cb (e_end_block/e_end_resync_block) or from
* _drbd_clear_done_ee.
*/
do_wake = is_syncer_req
? list_empty(&mdev->sync_ee)
: list_empty(&mdev->active_ee);
do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);
if (test_bit(__EE_WAS_ERROR, &e->flags))
if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (is_syncer_req)
drbd_rs_complete_io(mdev, e_sector);
if (block_id == ID_SYNCER)
drbd_rs_complete_io(mdev, i.sector);
if (do_wake)
wake_up(&mdev->ee_wait);
if (do_al_complete_io)
drbd_al_complete_io(mdev, e_sector);
drbd_al_complete_io(mdev, &i);
wake_asender(mdev);
wake_asender(mdev->tconn);
put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver.
*/
void drbd_endio_sec(struct bio *bio, int error)
void drbd_peer_request_endio(struct bio *bio, int error)
{
struct drbd_epoch_entry *e = bio->bi_private;
struct drbd_conf *mdev = e->mdev;
struct drbd_peer_request *peer_req = bio->bi_private;
struct drbd_conf *mdev = peer_req->w.mdev;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
int is_write = bio_data_dir(bio) == WRITE;
if (error && __ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "%s: error=%d s=%llus\n",
is_write ? "write" : "read", error,
(unsigned long long)e->sector);
(unsigned long long)peer_req->i.sector);
if (!error && !uptodate) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
is_write ? "write" : "read",
(unsigned long long)e->sector);
(unsigned long long)peer_req->i.sector);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
......@@ -196,24 +189,24 @@ void drbd_endio_sec(struct bio *bio, int error)
}
if (error)
set_bit(__EE_WAS_ERROR, &e->flags);
set_bit(__EE_WAS_ERROR, &peer_req->flags);
bio_put(bio); /* no need for the bio anymore */
if (atomic_dec_and_test(&e->pending_bios)) {
if (atomic_dec_and_test(&peer_req->pending_bios)) {
if (is_write)
drbd_endio_write_sec_final(e);
drbd_endio_write_sec_final(peer_req);
else
drbd_endio_read_sec_final(e);
drbd_endio_read_sec_final(peer_req);
}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/
void drbd_endio_pri(struct bio *bio, int error)
void drbd_request_endio(struct bio *bio, int error)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
struct drbd_conf *mdev = req->mdev;
struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
......@@ -227,6 +220,7 @@ void drbd_endio_pri(struct bio *bio, int error)
error = -EIO;
}
/* If this request was aborted locally before,
* but now was completed "successfully",
* chances are that this caused arbitrary data corruption.
......@@ -266,50 +260,32 @@ void drbd_endio_pri(struct bio *bio, int error)
/* to avoid recursion in __req_mod */
if (unlikely(error)) {
what = (bio_data_dir(bio) == WRITE)
? write_completed_with_error
? WRITE_COMPLETED_WITH_ERROR
: (bio_rw(bio) == READ)
? read_completed_with_error
: read_ahead_completed_with_error;
? READ_COMPLETED_WITH_ERROR
: READ_AHEAD_COMPLETED_WITH_ERROR;
} else
what = completed_ok;
what = COMPLETED_OK;
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
/* not req_mod(), we need irqsave here! */
spin_lock_irqsave(&mdev->req_lock, flags);
spin_lock_irqsave(&mdev->tconn->req_lock, flags);
__req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
put_ldev(mdev);
if (m.bio)
complete_master_bio(mdev, &m);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
/* We should not detach for read io-error,
* but try to WRITE the P_DATA_REPLY to the failed location,
* to give the disk the chance to relocate that block */
spin_lock_irq(&mdev->req_lock);
if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
_req_mod(req, read_retry_remote_canceled);
spin_unlock_irq(&mdev->req_lock);
return 1;
}
spin_unlock_irq(&mdev->req_lock);
return w_send_read_req(mdev, w, 0);
}
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm,
struct drbd_peer_request *peer_req, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
struct page *page = e->pages;
struct page *page = peer_req->pages;
struct page *tmp;
unsigned len;
......@@ -326,7 +302,7 @@ void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_e
page = tmp;
}
/* and now the last, possibly only partially used page */
len = e->size & (PAGE_SIZE - 1);
len = peer_req->i.size & (PAGE_SIZE - 1);
sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
crypto_hash_update(&desc, &sg, sg.length);
crypto_hash_final(&desc, digest);
......@@ -352,59 +328,58 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
crypto_hash_final(&desc, digest);
}
/* TODO merge common code with w_e_end_ov_req */
int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
/* MAYBE merge common code with w_e_end_ov_req */
static int w_e_send_csum(struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
struct drbd_conf *mdev = w->mdev;
int digest_size;
void *digest;
int ok = 1;
D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
int err = 0;
if (unlikely(cancel))
goto out;
if (likely((e->flags & EE_WAS_ERROR) != 0))
if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
goto out;
digest_size = crypto_hash_digestsize(mdev->csums_tfm);
digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
sector_t sector = e->sector;
unsigned int size = e->size;
drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
/* Free e and pages before send.
sector_t sector = peer_req->i.sector;
unsigned int size = peer_req->i.size;
drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
/* Free peer_req and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
* drbd_pp_alloc due to pp_in_use > max_buffers. */
drbd_free_ee(mdev, e);
e = NULL;
* drbd_alloc_pages due to pp_in_use > max_buffers. */
drbd_free_peer_req(mdev, peer_req);
peer_req = NULL;
inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev, sector, size,
digest, digest_size,
P_CSUM_RS_REQUEST);
err = drbd_send_drequest_csum(mdev, sector, size,
digest, digest_size,
P_CSUM_RS_REQUEST);
kfree(digest);
} else {
dev_err(DEV, "kmalloc() of digest failed.\n");
ok = 0;
err = -ENOMEM;
}
out:
if (e)
drbd_free_ee(mdev, e);
if (peer_req)
drbd_free_peer_req(mdev, peer_req);
if (unlikely(!ok))
if (unlikely(err))
dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
return ok;
return err;
}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
struct drbd_epoch_entry *e;
struct drbd_peer_request *peer_req;
if (!get_ldev(mdev))
return -EIO;
......@@ -414,45 +389,47 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
if (!e)
peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
size, GFP_TRY);
if (!peer_req)
goto defer;
e->w.cb = w_e_send_csum;
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
peer_req->w.cb = w_e_send_csum;
spin_lock_irq(&mdev->tconn->req_lock);
list_add(&peer_req->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->tconn->req_lock);
atomic_add(size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
return 0;
/* If it failed because of ENOMEM, retry should help. If it failed
* because bio_add_page failed (probably broken lower level driver),
* retry may or may not help.
* If it does not, you may need to force disconnect. */
spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list);
spin_unlock_irq(&mdev->req_lock);
spin_lock_irq(&mdev->tconn->req_lock);
list_del(&peer_req->w.list);
spin_unlock_irq(&mdev->tconn->req_lock);
drbd_free_ee(mdev, e);
drbd_free_peer_req(mdev, peer_req);
defer:
put_ldev(mdev);
return -EAGAIN;
}
int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_resync_timer(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
switch (mdev->state.conn) {
case C_VERIFY_S:
w_make_ov_request(mdev, w, cancel);
w_make_ov_request(w, cancel);
break;
case C_SYNC_TARGET:
w_make_resync_request(mdev, w, cancel);
w_make_resync_request(w, cancel);
break;
}
return 1;
return 0;
}
void resync_timer_fn(unsigned long data)
......@@ -460,7 +437,7 @@ void resync_timer_fn(unsigned long data)
struct drbd_conf *mdev = (struct drbd_conf *) data;
if (list_empty(&mdev->resync_work.list))
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
......@@ -492,8 +469,24 @@ static void fifo_add_val(struct fifo_buffer *fb, int value)
fb->values[i] += value;
}
struct fifo_buffer *fifo_alloc(int fifo_size)
{
struct fifo_buffer *fb;
fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
if (!fb)
return NULL;
fb->head_index = 0;
fb->size = fifo_size;
fb->total = 0;
return fb;
}
static int drbd_rs_controller(struct drbd_conf *mdev)
{
struct disk_conf *dc;
unsigned int sect_in; /* Number of sectors that came in since the last turn */
unsigned int want; /* The number of sectors we want in the proxy */
int req_sect; /* Number of sectors to request in this turn */
......@@ -502,38 +495,39 @@ static int drbd_rs_controller(struct drbd_conf *mdev)
int steps; /* Number of time steps to plan ahead */
int curr_corr;
int max_sect;
struct fifo_buffer *plan;
sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
mdev->rs_in_flight -= sect_in;
spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
dc = rcu_dereference(mdev->ldev->disk_conf);
plan = rcu_dereference(mdev->rs_plan_s);
steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
} else { /* normal path */
want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
want = dc->c_fill_target ? dc->c_fill_target :
sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
}
correction = want - mdev->rs_in_flight - mdev->rs_planed;
correction = want - mdev->rs_in_flight - plan->total;
/* Plan ahead */
cps = correction / steps;
fifo_add_val(&mdev->rs_plan_s, cps);
mdev->rs_planed += cps * steps;
fifo_add_val(plan, cps);
plan->total += cps * steps;
/* What we do in this step */
curr_corr = fifo_push(&mdev->rs_plan_s, 0);
spin_unlock(&mdev->peer_seq_lock);
mdev->rs_planed -= curr_corr;
curr_corr = fifo_push(plan, 0);
plan->total -= curr_corr;
req_sect = sect_in + curr_corr;
if (req_sect < 0)
req_sect = 0;
max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
if (req_sect > max_sect)
req_sect = max_sect;
......@@ -549,22 +543,25 @@ static int drbd_rs_controller(struct drbd_conf *mdev)
static int drbd_rs_number_requests(struct drbd_conf *mdev)
{
int number;
if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
rcu_read_lock();
if (rcu_dereference(mdev->rs_plan_s)->size) {
number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
} else {
mdev->c_sync_rate = mdev->sync_conf.rate;
mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate;
number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
}
rcu_read_unlock();
/* ignore the amount of pending requests, the resync controller should
* throttle down to incoming reply rate soon enough anyways. */
return number;
}
static int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
int w_make_resync_request(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
unsigned long bit;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
......@@ -574,12 +571,12 @@ static int w_make_resync_request(struct drbd_conf *mdev,
int i = 0;
if (unlikely(cancel))
return 1;
return 0;
if (mdev->rs_total == 0) {
/* empty resync? */
drbd_resync_finished(mdev);
return 1;
return 0;
}
if (!get_ldev(mdev)) {
......@@ -588,7 +585,7 @@ static int w_make_resync_request(struct drbd_conf *mdev,
to continue resync with a broken disk makes no sense at
all */
dev_err(DEV, "Disk broke down during resync!\n");
return 1;
return 0;
}
max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
......@@ -598,15 +595,15 @@ static int w_make_resync_request(struct drbd_conf *mdev,
for (i = 0; i < number; i++) {
/* Stop generating RS requests, when half of the send buffer is filled */
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket) {
queued = mdev->data.socket->sk->sk_wmem_queued;
sndbuf = mdev->data.socket->sk->sk_sndbuf;
mutex_lock(&mdev->tconn->data.mutex);
if (mdev->tconn->data.socket) {
queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
} else {
queued = 1;
sndbuf = 0;
}
mutex_unlock(&mdev->data.mutex);
mutex_unlock(&mdev->tconn->data.mutex);
if (queued > sndbuf / 2)
goto requeue;
......@@ -617,7 +614,7 @@ static int w_make_resync_request(struct drbd_conf *mdev,
if (bit == DRBD_END_OF_BITMAP) {
mdev->bm_resync_fo = drbd_bm_bits(mdev);
put_ldev(mdev);
return 1;
return 0;
}
sector = BM_BIT_TO_SECT(bit);
......@@ -676,11 +673,11 @@ static int w_make_resync_request(struct drbd_conf *mdev,
/* adjust very last sectors, in case we are oddly sized */
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
case -EIO: /* Disk failure */
put_ldev(mdev);
return 0;
return -EIO;
case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
......@@ -693,13 +690,16 @@ static int w_make_resync_request(struct drbd_conf *mdev,
BUG();
}
} else {
int err;
inc_rs_pending(mdev);
if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
sector, size, ID_SYNCER)) {
err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
sector, size, ID_SYNCER);
if (err) {
dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
dec_rs_pending(mdev);
put_ldev(mdev);
return 0;
return err;
}
}
}
......@@ -712,18 +712,19 @@ static int w_make_resync_request(struct drbd_conf *mdev,
* until then resync "work" is "inactive" ...
*/
put_ldev(mdev);
return 1;
return 0;
}
requeue:
mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
return 1;
return 0;
}
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
static int w_make_ov_request(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
int number, i, size;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
......@@ -743,7 +744,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
* w_e_end_ov_reply().
* We need to send at least one request out. */
stop_sector_reached = i > 0
&& mdev->agreed_pro_version >= 97
&& verify_can_do_stop_sector(mdev)
&& sector >= mdev->ov_stop_sector;
if (stop_sector_reached)
break;
......@@ -760,7 +761,7 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
size = (capacity-sector)<<9;
inc_rs_pending(mdev);
if (!drbd_send_ov_request(mdev, sector, size)) {
if (drbd_send_ov_request(mdev, sector, size)) {
dec_rs_pending(mdev);
return 0;
}
......@@ -775,52 +776,34 @@ static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}
void start_resync_timer_fn(unsigned long data)
{
struct drbd_conf *mdev = (struct drbd_conf *) data;
drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
}
int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
dev_warn(DEV, "w_start_resync later...\n");
mdev->start_resync_timer.expires = jiffies + HZ/10;
add_timer(&mdev->start_resync_timer);
return 1;
}
drbd_start_resync(mdev, C_SYNC_SOURCE);
drbd_clear_flag(mdev, AHEAD_TO_SYNC_SOURCE);
return 1;
}
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_ov_finished(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
kfree(w);
ov_oos_print(mdev);
ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
return 1;
return 0;
}
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
static int w_resync_finished(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
kfree(w);
drbd_resync_finished(mdev);
return 1;
return 0;
}
static void ping_peer(struct drbd_conf *mdev)
{
drbd_clear_flag(mdev, GOT_PING_ACK);
request_ping(mdev);
wait_event(mdev->misc_wait,
drbd_test_flag(mdev, GOT_PING_ACK) || mdev->state.conn < C_CONNECTED);
struct drbd_tconn *tconn = mdev->tconn;
clear_bit(GOT_PING_ACK, &tconn->flags);
request_ping(tconn);
wait_event(tconn->ping_wait,
test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
......@@ -845,7 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
if (w) {
w->cb = w_resync_finished;
drbd_queue_work(&mdev->data.work, w);
w->mdev = mdev;
drbd_queue_work(&mdev->tconn->sender_work, w);
return 1;
}
dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
......@@ -868,8 +852,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
ping_peer(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
spin_lock_irq(&mdev->tconn->req_lock);
os = drbd_read_state(mdev);
verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
......@@ -899,7 +883,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
khelper_cmd = "after-resync-target";
if (mdev->csums_tfm && mdev->rs_total) {
if (mdev->tconn->csums_tfm && mdev->rs_total) {
const unsigned long s = mdev->rs_same_csum;
const unsigned long t = mdev->rs_total;
const int ratio =
......@@ -957,7 +941,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
spin_unlock_irq(&mdev->req_lock);
spin_unlock_irq(&mdev->tconn->req_lock);
put_ldev(mdev);
out:
mdev->rs_total = 0;
......@@ -977,19 +961,19 @@ int drbd_resync_finished(struct drbd_conf *mdev)
}
/* helper */
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
{
if (drbd_ee_has_active_page(e)) {
if (drbd_peer_req_has_active_page(peer_req)) {
/* This might happen if sendpage() has not finished */
int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
atomic_add(i, &mdev->pp_in_use_by_net);
atomic_sub(i, &mdev->pp_in_use);
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->req_lock);
spin_lock_irq(&mdev->tconn->req_lock);
list_add_tail(&peer_req->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->tconn->req_lock);
wake_up(&drbd_pp_wait);
} else
drbd_free_ee(mdev, e);
drbd_free_peer_req(mdev, peer_req);
}
/**
......@@ -998,174 +982,177 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_e_end_data_req(struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int ok;
struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
struct drbd_conf *mdev = w->mdev;
int err;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
return 1;
return 0;
}
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
ok = drbd_send_block(mdev, P_DATA_REPLY, e);
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
(unsigned long long)e->sector);
(unsigned long long)peer_req->i.sector);
ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
}
dec_unacked(mdev);
move_to_net_ee_or_free(mdev, e);
move_to_net_ee_or_free(mdev, peer_req);
if (unlikely(!ok))
if (unlikely(err))
dev_err(DEV, "drbd_send_block() failed\n");
return ok;
return err;
}
/**
* w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
* w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int ok;
struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
struct drbd_conf *mdev = w->mdev;
int err;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
return 1;
return 0;
}
if (get_ldev_if_state(mdev, D_FAILED)) {
drbd_rs_complete_io(mdev, e->sector);
drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
if (mdev->state.conn == C_AHEAD) {
ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
} else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(mdev);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Not sending RSDataReply, "
"partner DISKLESS!\n");
ok = 1;
err = 0;
}
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
(unsigned long long)e->sector);
(unsigned long long)peer_req->i.sector);
ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
/* update resync data with failure */
drbd_rs_failed_io(mdev, e->sector, e->size);
drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
}
dec_unacked(mdev);
move_to_net_ee_or_free(mdev, e);
move_to_net_ee_or_free(mdev, peer_req);
if (unlikely(!ok))
if (unlikely(err))
dev_err(DEV, "drbd_send_block() failed\n");
return ok;
return err;
}
int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
struct drbd_conf *mdev = w->mdev;
struct digest_info *di;
int digest_size;
void *digest = NULL;
int ok, eq = 0;
int err, eq = 0;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
return 1;
return 0;
}
if (get_ldev(mdev)) {
drbd_rs_complete_io(mdev, e->sector);
drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
di = e->digest;
di = peer_req->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
* a real fix would be much more involved,
* introducing more locking mechanisms */
if (mdev->csums_tfm) {
digest_size = crypto_hash_digestsize(mdev->csums_tfm);
if (mdev->tconn->csums_tfm) {
digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
D_ASSERT(digest_size == di->digest_size);
digest = kmalloc(digest_size, GFP_NOIO);
}
if (digest) {
drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
if (eq) {
drbd_set_in_sync(mdev, e->sector, e->size);
drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
/* rs_same_csums unit is BM_BLOCK_SIZE */
mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
} else {
inc_rs_pending(mdev);
e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
kfree(di);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
}
} else {
ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
}
dec_unacked(mdev);
move_to_net_ee_or_free(mdev, e);
move_to_net_ee_or_free(mdev, peer_req);
if (unlikely(!ok))
if (unlikely(err))
dev_err(DEV, "drbd_send_block/ack() failed\n");
return ok;
return err;
}
/* TODO merge common code with w_e_send_csum */
int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
sector_t sector = e->sector;
unsigned int size = e->size;
struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
struct drbd_conf *mdev = w->mdev;
sector_t sector = peer_req->i.sector;
unsigned int size = peer_req->i.size;
int digest_size;
void *digest;
int ok = 1;
int err = 0;
if (unlikely(cancel))
goto out;
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (!digest) {
ok = 0; /* terminate the connection in case the allocation failed */
err = 1; /* terminate the connection in case the allocation failed */
goto out;
}
if (likely(!(e->flags & EE_WAS_ERROR)))
drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
if (likely(!(peer_req->flags & EE_WAS_ERROR)))
drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
else
memset(digest, 0, digest_size);
......@@ -1173,25 +1160,23 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
* drbd_pp_alloc due to pp_in_use > max_buffers. */
drbd_free_ee(mdev, e);
e = NULL;
* drbd_alloc_pages due to pp_in_use > max_buffers. */
drbd_free_peer_req(mdev, peer_req);
peer_req = NULL;
inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev, sector, size,
digest, digest_size,
P_OV_REPLY);
if (!ok)
err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
if (err)
dec_rs_pending(mdev);
kfree(digest);
out:
if (e)
drbd_free_ee(mdev, e);
if (peer_req)
drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
return ok;
return err;
}
void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size)
{
if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
mdev->ov_last_oos_size += size>>9;
......@@ -1202,37 +1187,38 @@ void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
drbd_set_out_of_sync(mdev, sector, size);
}
int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
struct drbd_conf *mdev = w->mdev;
struct digest_info *di;
void *digest;
sector_t sector = e->sector;
unsigned int size = e->size;
sector_t sector = peer_req->i.sector;
unsigned int size = peer_req->i.size;
int digest_size;
int ok, eq = 0;
int err, eq = 0;
bool stop_sector_reached = false;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
return 1;
return 0;
}
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
if (get_ldev(mdev)) {
drbd_rs_complete_io(mdev, e->sector);
drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
di = e->digest;
di = peer_req->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
D_ASSERT(digest_size == di->digest_size);
eq = !memcmp(digest, di->digest, digest_size);
......@@ -1240,19 +1226,19 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
}
/* Free e and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
* drbd_pp_alloc due to pp_in_use > max_buffers. */
drbd_free_ee(mdev, e);
/* Free peer_req and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
* drbd_alloc_pages due to pp_in_use > max_buffers. */
drbd_free_peer_req(mdev, peer_req);
if (!eq)
drbd_ov_oos_found(mdev, sector, size);
drbd_ov_out_of_sync_found(mdev, sector, size);
else
ov_oos_print(mdev);
ov_out_of_sync_print(mdev);
ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
dec_unacked(mdev);
......@@ -1262,76 +1248,102 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
if ((mdev->ov_left & 0x200) == 0x200)
drbd_advance_rs_marks(mdev, mdev->ov_left);
stop_sector_reached = mdev->agreed_pro_version >= 97 &&
stop_sector_reached = verify_can_do_stop_sector(mdev) &&
(sector + (size>>9)) >= mdev->ov_stop_sector;
if (mdev->ov_left == 0 || stop_sector_reached) {
ov_oos_print(mdev);
ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
}
return ok;
return err;
}
int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_prev_work_done(struct drbd_work *w, int cancel)
{
struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
complete(&b->done);
return 1;
return 0;
}
int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
/* FIXME
* We need to track the number of pending barrier acks,
* and to be able to wait for them.
* See also comment in drbd_adm_attach before drbd_suspend_io.
*/
int drbd_send_barrier(struct drbd_tconn *tconn)
{
struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
struct p_barrier *p = &mdev->data.sbuf.barrier;
int ok = 1;
/* really avoid racing with tl_clear. w.cb may have been referenced
* just before it was reassigned and re-queued, so double check that.
* actually, this race was harmless, since we only try to send the
* barrier packet here, and otherwise do nothing with the object.
* but compare with the head of w_clear_epoch */
spin_lock_irq(&mdev->req_lock);
if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
cancel = 1;
spin_unlock_irq(&mdev->req_lock);
if (cancel)
return 1;
struct p_barrier *p;
struct drbd_socket *sock;
if (!drbd_get_data_sock(mdev))
return 0;
p->barrier = b->br_number;
/* inc_ap_pending was done where this was queued.
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in w_clear_epoch. */
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
(struct p_header80 *)p, sizeof(*p), 0);
drbd_put_data_sock(mdev);
return ok;
sock = &tconn->data;
p = conn_prepare_command(tconn, sock);
if (!p)
return -EIO;
p->barrier = tconn->send.current_epoch_nr;
p->pad = 0;
tconn->send.current_epoch_writes = 0;
return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0);
}
int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_send_write_hint(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
struct drbd_socket *sock;
if (cancel)
return 1;
return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
return 0;
sock = &mdev->tconn->data;
if (!drbd_prepare_command(mdev, sock))
return -EIO;
return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}
static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch)
{
if (!tconn->send.seen_any_write_yet) {
tconn->send.seen_any_write_yet = true;
tconn->send.current_epoch_nr = epoch;
tconn->send.current_epoch_writes = 0;
}
}
static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch)
{
/* re-init if first write on this connection */
if (!tconn->send.seen_any_write_yet)
return;
if (tconn->send.current_epoch_nr != epoch) {
if (tconn->send.current_epoch_writes)
drbd_send_barrier(tconn);
tconn->send.current_epoch_nr = epoch;
}
}
int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
int ok;
struct drbd_conf *mdev = w->mdev;
struct drbd_tconn *tconn = mdev->tconn;
int err;
if (unlikely(cancel)) {
req_mod(req, send_canceled);
return 1;
req_mod(req, SEND_CANCELED);
return 0;
}
ok = drbd_send_oos(mdev, req);
req_mod(req, oos_handed_to_network);
/* this time, no tconn->send.current_epoch_writes++;
* If it was sent, it was the closing barrier for the last
* replicated epoch, before we went into AHEAD mode.
* No more barriers will be sent, until we leave AHEAD mode again. */
maybe_send_barrier(tconn, req->epoch);
err = drbd_send_out_of_sync(mdev, req);
req_mod(req, OOS_HANDED_TO_NETWORK);
return ok;
return err;
}
/**
......@@ -1340,20 +1352,26 @@ int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_send_dblock(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
int ok;
struct drbd_conf *mdev = w->mdev;
struct drbd_tconn *tconn = mdev->tconn;
int err;
if (unlikely(cancel)) {
req_mod(req, send_canceled);
return 1;
req_mod(req, SEND_CANCELED);
return 0;
}
ok = drbd_send_dblock(mdev, req);
req_mod(req, ok ? handed_over_to_network : send_failed);
re_init_if_first_write(tconn, req->epoch);
maybe_send_barrier(tconn, req->epoch);
tconn->send.current_epoch_writes++;
err = drbd_send_dblock(mdev, req);
req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
return ok;
return err;
}
/**
......@@ -1362,57 +1380,61 @@ int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_send_read_req(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
int ok;
struct drbd_conf *mdev = w->mdev;
struct drbd_tconn *tconn = mdev->tconn;
int err;
if (unlikely(cancel)) {
req_mod(req, send_canceled);
return 1;
req_mod(req, SEND_CANCELED);
return 0;
}
ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
(unsigned long)req);
/* Even read requests may close a write epoch,
* if there was any yet. */
maybe_send_barrier(tconn, req->epoch);
if (!ok) {
/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
* so this is probably redundant */
if (mdev->state.conn >= C_CONNECTED)
drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
}
req_mod(req, ok ? handed_over_to_network : send_failed);
err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
(unsigned long)req);
return ok;
req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
return err;
}
int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
int w_restart_disk_io(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
struct drbd_conf *mdev = w->mdev;
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
drbd_al_begin_io(mdev, req->sector);
/* Calling drbd_al_begin_io() out of the worker might deadlocks
theoretically. Practically it can not deadlock, since this is
only used when unfreezing IOs. All the extents of the requests
that made it into the TL are already active */
drbd_al_begin_io(mdev, &req->i);
drbd_req_make_private_bio(req, req->master_bio);
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
generic_make_request(req->private_bio);
return 1;
return 0;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
int resync_after;
while (1) {
if (odev->sync_conf.after == -1)
if (!odev->ldev)
return 1;
rcu_read_lock();
resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
rcu_read_unlock();
if (resync_after == -1)
return 1;
odev = minor_to_mdev(resync_after);
if (!expect(odev))
return 1;
odev = minor_to_mdev(odev->sync_conf.after);
ERR_IF(!odev) return 1;
if ((odev->state.conn >= C_SYNC_SOURCE &&
odev->state.conn <= C_PAUSED_SYNC_T) ||
odev->state.aftr_isp || odev->state.peer_isp ||
......@@ -1432,16 +1454,15 @@ static int _drbd_pause_after(struct drbd_conf *mdev)
struct drbd_conf *odev;
int i, rv = 0;
for (i = 0; i < minor_count; i++) {
odev = minor_to_mdev(i);
if (!odev)
continue;
rcu_read_lock();
idr_for_each_entry(&minors, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (!_drbd_may_sync_now(odev))
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
!= SS_NOTHING_TO_DO);
}
rcu_read_unlock();
return rv;
}
......@@ -1457,10 +1478,8 @@ static int _drbd_resume_next(struct drbd_conf *mdev)
struct drbd_conf *odev;
int i, rv = 0;
for (i = 0; i < minor_count; i++) {
odev = minor_to_mdev(i);
if (!odev)
continue;
rcu_read_lock();
idr_for_each_entry(&minors, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (odev->state.aftr_isp) {
......@@ -1470,6 +1489,7 @@ static int _drbd_resume_next(struct drbd_conf *mdev)
!= SS_NOTHING_TO_DO) ;
}
}
rcu_read_unlock();
return rv;
}
......@@ -1487,57 +1507,86 @@ void suspend_other_sg(struct drbd_conf *mdev)
write_unlock_irq(&global_state_lock);
}
static int sync_after_error(struct drbd_conf *mdev, int o_minor)
/* caller must hold global_state_lock */
enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
{
struct drbd_conf *odev;
int resync_after;
if (o_minor == -1)
return NO_ERROR;
if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
return ERR_SYNC_AFTER;
return ERR_RESYNC_AFTER;
/* check for loops */
odev = minor_to_mdev(o_minor);
while (1) {
if (odev == mdev)
return ERR_SYNC_AFTER_CYCLE;
return ERR_RESYNC_AFTER_CYCLE;
rcu_read_lock();
resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
rcu_read_unlock();
/* dependency chain ends here, no cycles. */
if (odev->sync_conf.after == -1)
if (resync_after == -1)
return NO_ERROR;
/* follow the dependency chain */
odev = minor_to_mdev(odev->sync_conf.after);
odev = minor_to_mdev(resync_after);
}
}
int drbd_alter_sa(struct drbd_conf *mdev, int na)
/* caller must hold global_state_lock */
void drbd_resync_after_changed(struct drbd_conf *mdev)
{
int changes;
int retcode;
write_lock_irq(&global_state_lock);
retcode = sync_after_error(mdev, na);
if (retcode == NO_ERROR) {
mdev->sync_conf.after = na;
do {
changes = _drbd_pause_after(mdev);
changes |= _drbd_resume_next(mdev);
} while (changes);
}
write_unlock_irq(&global_state_lock);
return retcode;
do {
changes = _drbd_pause_after(mdev);
changes |= _drbd_resume_next(mdev);
} while (changes);
}
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
struct fifo_buffer *plan;
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
mdev->rs_in_flight = 0;
mdev->rs_planed = 0;
spin_lock(&mdev->peer_seq_lock);
fifo_set(&mdev->rs_plan_s, 0);
spin_unlock(&mdev->peer_seq_lock);
/* Updating the RCU protected object in place is necessary since
this function gets called from atomic context.
It is valid since all other updates also lead to an completely
empty fifo */
rcu_read_lock();
plan = rcu_dereference(mdev->rs_plan_s);
plan->total = 0;
fifo_set(plan, 0);
rcu_read_unlock();
}
void start_resync_timer_fn(unsigned long data)
{
struct drbd_conf *mdev = (struct drbd_conf *) data;
drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work);
}
int w_start_resync(struct drbd_work *w, int cancel)
{
struct drbd_conf *mdev = w->mdev;
if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
dev_warn(DEV, "w_start_resync later...\n");
mdev->start_resync_timer.expires = jiffies + HZ/10;
add_timer(&mdev->start_resync_timer);
return 0;
}
drbd_start_resync(mdev, C_SYNC_SOURCE);
clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
return 0;
}
/**
......@@ -1558,43 +1607,58 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
return;
}
if (side == C_SYNC_TARGET) {
/* Since application IO was locked out during C_WF_BITMAP_T and
C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
we check that we might make the data inconsistent. */
r = drbd_khelper(mdev, "before-resync-target");
r = (r >> 8) & 0xff;
if (r > 0) {
dev_info(DEV, "before-resync-target handler returned %d, "
"dropping connection.\n", r);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return;
}
} else /* C_SYNC_SOURCE */ {
r = drbd_khelper(mdev, "before-resync-source");
r = (r >> 8) & 0xff;
if (r > 0) {
if (r == 3) {
dev_info(DEV, "before-resync-source handler returned %d, "
"ignoring. Old userland tools?", r);
} else {
dev_info(DEV, "before-resync-source handler returned %d, "
if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
if (side == C_SYNC_TARGET) {
/* Since application IO was locked out during C_WF_BITMAP_T and
C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
we check that we might make the data inconsistent. */
r = drbd_khelper(mdev, "before-resync-target");
r = (r >> 8) & 0xff;
if (r > 0) {
dev_info(DEV, "before-resync-target handler returned %d, "
"dropping connection.\n", r);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
return;
}
} else /* C_SYNC_SOURCE */ {
r = drbd_khelper(mdev, "before-resync-source");
r = (r >> 8) & 0xff;
if (r > 0) {
if (r == 3) {
dev_info(DEV, "before-resync-source handler returned %d, "
"ignoring. Old userland tools?", r);
} else {
dev_info(DEV, "before-resync-source handler returned %d, "
"dropping connection.\n", r);
conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
return;
}
}
}
}
drbd_state_lock(mdev);
if (current == mdev->tconn->worker.task) {
/* The worker should not sleep waiting for state_mutex,
that can take long */
if (!mutex_trylock(mdev->state_mutex)) {
set_bit(B_RS_H_DONE, &mdev->flags);
mdev->start_resync_timer.expires = jiffies + HZ/5;
add_timer(&mdev->start_resync_timer);
return;
}
} else {
mutex_lock(mdev->state_mutex);
}
clear_bit(B_RS_H_DONE, &mdev->flags);
write_lock_irq(&global_state_lock);
if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
write_unlock_irq(&global_state_lock);
drbd_state_unlock(mdev);
mutex_unlock(mdev->state_mutex);
return;
}
ns.i = mdev->state.i;
ns = drbd_read_state(mdev);
ns.aftr_isp = !_drbd_may_sync_now(mdev);
......@@ -1606,7 +1670,7 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
ns.pdsk = D_INCONSISTENT;
r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
ns = mdev->state;
ns = drbd_read_state(mdev);
if (ns.conn < C_CONNECTED)
r = SS_UNKNOWN_ERROR;
......@@ -1632,6 +1696,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
write_unlock_irq(&global_state_lock);
if (r == SS_SUCCESS) {
/* reset rs_last_bcast when a resync or verify is started,
* to deal with potential jiffies wrap. */
mdev->rs_last_bcast = jiffies - HZ;
dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
drbd_conn_str(ns.conn),
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
......@@ -1646,10 +1714,10 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
* drbd_resync_finished from here in that case.
* We drbd_gen_and_send_sync_uuid here for protocol < 96,
* and from after_state_ch otherwise. */
if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
drbd_gen_and_send_sync_uuid(mdev);
if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
......@@ -1660,10 +1728,16 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
if (side == C_SYNC_SOURCE)
schedule_timeout_interruptible(
mdev->net_conf->ping_int * HZ +
mdev->net_conf->ping_timeo*HZ/9);
if (side == C_SYNC_SOURCE) {
struct net_conf *nc;
int timeo;
rcu_read_lock();
nc = rcu_dereference(mdev->tconn->net_conf);
timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
rcu_read_unlock();
schedule_timeout_interruptible(timeo);
}
drbd_resync_finished(mdev);
}
......@@ -1678,114 +1752,180 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
drbd_md_sync(mdev);
}
put_ldev(mdev);
drbd_state_unlock(mdev);
mutex_unlock(mdev->state_mutex);
}
int drbd_worker(struct drbd_thread *thi)
/* If the resource already closed the current epoch, but we did not
* (because we have not yet seen new requests), we should send the
* corresponding barrier now. Must be checked within the same spinlock
* that is used to check for new requests. */
bool need_to_send_barrier(struct drbd_tconn *connection)
{
struct drbd_conf *mdev = thi->mdev;
struct drbd_work *w = NULL;
LIST_HEAD(work_list);
int intr = 0, i;
if (!connection->send.seen_any_write_yet)
return false;
/* Skip barriers that do not contain any writes.
* This may happen during AHEAD mode. */
if (!connection->send.current_epoch_writes)
return false;
/* ->req_lock is held when requests are queued on
* connection->sender_work, and put into ->transfer_log.
* It is also held when ->current_tle_nr is increased.
* So either there are already new requests queued,
* and corresponding barriers will be send there.
* Or nothing new is queued yet, so the difference will be 1.
*/
if (atomic_read(&connection->current_tle_nr) !=
connection->send.current_epoch_nr + 1)
return false;
return true;
}
sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
{
spin_lock_irq(&queue->q_lock);
list_splice_init(&queue->q, work_list);
spin_unlock_irq(&queue->q_lock);
return !list_empty(work_list);
}
bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
{
spin_lock_irq(&queue->q_lock);
if (!list_empty(&queue->q))
list_move(queue->q.next, work_list);
spin_unlock_irq(&queue->q_lock);
return !list_empty(work_list);
}
while (get_t_state(thi) == Running) {
drbd_thread_current_set_cpu(mdev);
void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list)
{
DEFINE_WAIT(wait);
struct net_conf *nc;
int uncork, cork;
if (down_trylock(&mdev->data.work.s)) {
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket && !mdev->net_conf->no_cork)
drbd_tcp_uncork(mdev->data.socket);
mutex_unlock(&mdev->data.mutex);
dequeue_work_item(&connection->sender_work, work_list);
if (!list_empty(work_list))
return;
intr = down_interruptible(&mdev->data.work.s);
/* Still nothing to do?
* Maybe we still need to close the current epoch,
* even if no new requests are queued yet.
*
* Also, poke TCP, just in case.
* Then wait for new work (or signal). */
rcu_read_lock();
nc = rcu_dereference(connection->net_conf);
uncork = nc ? nc->tcp_cork : 0;
rcu_read_unlock();
if (uncork) {
mutex_lock(&connection->data.mutex);
if (connection->data.socket)
drbd_tcp_uncork(connection->data.socket);
mutex_unlock(&connection->data.mutex);
}
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket && !mdev->net_conf->no_cork)
drbd_tcp_cork(mdev->data.socket);
mutex_unlock(&mdev->data.mutex);
for (;;) {
int send_barrier;
prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
spin_lock_irq(&connection->req_lock);
spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
/* dequeue single item only,
* we still use drbd_queue_work_front() in some places */
if (!list_empty(&connection->sender_work.q))
list_move(connection->sender_work.q.next, work_list);
spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
if (!list_empty(work_list) || signal_pending(current)) {
spin_unlock_irq(&connection->req_lock);
break;
}
send_barrier = need_to_send_barrier(connection);
spin_unlock_irq(&connection->req_lock);
if (send_barrier) {
drbd_send_barrier(connection);
connection->send.current_epoch_nr++;
}
schedule();
/* may be woken up for other things but new work, too,
* e.g. if the current epoch got closed.
* In which case we send the barrier above. */
}
finish_wait(&connection->sender_work.q_wait, &wait);
/* someone may have changed the config while we have been waiting above. */
rcu_read_lock();
nc = rcu_dereference(connection->net_conf);
cork = nc ? nc->tcp_cork : 0;
rcu_read_unlock();
mutex_lock(&connection->data.mutex);
if (connection->data.socket) {
if (cork)
drbd_tcp_cork(connection->data.socket);
else if (!uncork)
drbd_tcp_uncork(connection->data.socket);
}
mutex_unlock(&connection->data.mutex);
}
int drbd_worker(struct drbd_thread *thi)
{
struct drbd_tconn *tconn = thi->tconn;
struct drbd_work *w = NULL;
struct drbd_conf *mdev;
LIST_HEAD(work_list);
int vnr;
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
if (intr) {
D_ASSERT(intr == -EINTR);
/* as long as we use drbd_queue_work_front(),
* we may only dequeue single work items here, not batches. */
if (list_empty(&work_list))
wait_for_work(tconn, &work_list);
if (signal_pending(current)) {
flush_signals(current);
ERR_IF (get_t_state(thi) == Running)
if (get_t_state(thi) == RUNNING) {
conn_warn(tconn, "Worker got an unexpected signal\n");
continue;
}
break;
}
if (get_t_state(thi) != Running)
if (get_t_state(thi) != RUNNING)
break;
/* With this break, we have done a down() but not consumed
the entry from the list. The cleanup code takes care of
this... */
w = NULL;
spin_lock_irq(&mdev->data.work.q_lock);
ERR_IF(list_empty(&mdev->data.work.q)) {
/* something terribly wrong in our logic.
* we were able to down() the semaphore,
* but the list is empty... doh.
*
* what is the best thing to do now?
* try again from scratch, restarting the receiver,
* asender, whatnot? could break even more ugly,
* e.g. when we are primary, but no good local data.
*
* I'll try to get away just starting over this loop.
*/
spin_unlock_irq(&mdev->data.work.q_lock);
continue;
}
w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
list_del_init(&w->list);
spin_unlock_irq(&mdev->data.work.q_lock);
if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
/* dev_warn(DEV, "worker: a callback failed! \n"); */
if (mdev->state.conn >= C_CONNECTED)
drbd_force_state(mdev,
NS(conn, C_NETWORK_FAILURE));
while (!list_empty(&work_list)) {
w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0)
continue;
if (tconn->cstate >= C_WF_REPORT_PARAMS)
conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
}
}
D_ASSERT(drbd_test_flag(mdev, DEVICE_DYING));
D_ASSERT(drbd_test_flag(mdev, CONFIG_PENDING));
spin_lock_irq(&mdev->data.work.q_lock);
i = 0;
while (!list_empty(&mdev->data.work.q)) {
list_splice_init(&mdev->data.work.q, &work_list);
spin_unlock_irq(&mdev->data.work.q_lock);
do {
while (!list_empty(&work_list)) {
w = list_entry(work_list.next, struct drbd_work, list);
w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
w->cb(mdev, w, 1);
i++; /* dead debugging code */
w->cb(w, 1);
}
spin_lock_irq(&mdev->data.work.q_lock);
dequeue_work_batch(&tconn->sender_work, &work_list);
} while (!list_empty(&work_list));
rcu_read_lock();
idr_for_each_entry(&tconn->volumes, mdev, vnr) {
D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
kref_get(&mdev->kref);
rcu_read_unlock();
drbd_mdev_cleanup(mdev);
kref_put(&mdev->kref, &drbd_minor_destroy);
rcu_read_lock();
}
sema_init(&mdev->data.work.s, 0);
/* DANGEROUS race: if someone did queue his work within the spinlock,
* but up() ed outside the spinlock, we could get an up() on the
* semaphore without corresponding list entry.
* So don't do that.
*/
spin_unlock_irq(&mdev->data.work.q_lock);
D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
/* _drbd_set_state only uses stop_nowait.
* wait here for the Exiting receiver. */
drbd_thread_stop(&mdev->receiver);
drbd_mdev_cleanup(mdev);
dev_info(DEV, "worker terminated\n");
drbd_clear_flag(mdev, DEVICE_DYING);
drbd_clear_flag(mdev, CONFIG_PENDING);
wake_up(&mdev->state_wait);
rcu_read_unlock();
return 0;
}
......@@ -3,6 +3,7 @@
#include <linux/ctype.h>
#include <linux/mm.h>
#include "drbd_int.h"
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
......@@ -20,8 +21,8 @@ static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
extern void drbd_peer_request_endio(struct bio *bio, int error);
extern void drbd_request_endio(struct bio *bio, int error);
/*
* used to submit our private bio
......@@ -45,12 +46,6 @@ static inline void drbd_generic_make_request(struct drbd_conf *mdev,
generic_make_request(bio);
}
static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
{
return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
== CRYPTO_ALG_TYPE_HASH;
}
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
......
......@@ -51,12 +51,11 @@
#endif
extern const char *drbd_buildtag(void);
#define REL_VERSION "8.3.14"
#define API_VERSION 88
#define REL_VERSION "8.4.2"
#define API_VERSION 1
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 97
#define PRO_VERSION_MAX 101
enum drbd_io_error_p {
......@@ -66,7 +65,8 @@ enum drbd_io_error_p {
};
enum drbd_fencing_p {
FP_DONT_CARE,
FP_NOT_AVAIL = -1, /* Not a policy */
FP_DONT_CARE = 0,
FP_RESOURCE,
FP_STONITH
};
......@@ -102,6 +102,20 @@ enum drbd_on_congestion {
OC_DISCONNECT,
};
enum drbd_read_balancing {
RB_PREFER_LOCAL,
RB_PREFER_REMOTE,
RB_ROUND_ROBIN,
RB_LEAST_PENDING,
RB_CONGESTED_REMOTE,
RB_32K_STRIPING,
RB_64K_STRIPING,
RB_128K_STRIPING,
RB_256K_STRIPING,
RB_512K_STRIPING,
RB_1M_STRIPING,
};
/* KEEP the order, do not delete or insert. Only append. */
enum drbd_ret_code {
ERR_CODE_BASE = 100,
......@@ -122,7 +136,7 @@ enum drbd_ret_code {
ERR_AUTH_ALG = 120,
ERR_AUTH_ALG_ND = 121,
ERR_NOMEM = 122,
ERR_DISCARD = 123,
ERR_DISCARD_IMPOSSIBLE = 123,
ERR_DISK_CONFIGURED = 124,
ERR_NET_CONFIGURED = 125,
ERR_MANDATORY_TAG = 126,
......@@ -130,8 +144,8 @@ enum drbd_ret_code {
ERR_INTR = 129, /* EINTR */
ERR_RESIZE_RESYNC = 130,
ERR_NO_PRIMARY = 131,
ERR_SYNC_AFTER = 132,
ERR_SYNC_AFTER_CYCLE = 133,
ERR_RESYNC_AFTER = 132,
ERR_RESYNC_AFTER_CYCLE = 133,
ERR_PAUSE_IS_SET = 134,
ERR_PAUSE_IS_CLEAR = 135,
ERR_PACKET_NR = 137,
......@@ -155,6 +169,14 @@ enum drbd_ret_code {
ERR_CONG_NOT_PROTO_A = 155,
ERR_PIC_AFTER_DEP = 156,
ERR_PIC_PEER_DEP = 157,
ERR_RES_NOT_KNOWN = 158,
ERR_RES_IN_USE = 159,
ERR_MINOR_CONFIGURED = 160,
ERR_MINOR_EXISTS = 161,
ERR_INVALID_REQUEST = 162,
ERR_NEED_APV_100 = 163,
ERR_NEED_ALLOW_TWO_PRI = 164,
ERR_MD_UNCLEAN = 165,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
......@@ -296,7 +318,8 @@ enum drbd_state_rv {
SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
SS_O_VOL_PEER_PRI = -20,
SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */
};
/* from drbd_strings.c */
......@@ -313,7 +336,9 @@ extern const char *drbd_set_st_err_str(enum drbd_state_rv);
#define MDF_FULL_SYNC (1 << 3)
#define MDF_WAS_UP_TO_DATE (1 << 4)
#define MDF_PEER_OUT_DATED (1 << 5)
#define MDF_CRASHED_PRIMARY (1 << 6)
#define MDF_CRASHED_PRIMARY (1 << 6)
#define MDF_AL_CLEAN (1 << 7)
#define MDF_AL_DISABLED (1 << 8)
enum drbd_uuid_index {
UI_CURRENT,
......@@ -333,37 +358,23 @@ enum drbd_timeout_flag {
#define UUID_JUST_CREATED ((__u64)4)
/* magic numbers used in meta data and network packets */
#define DRBD_MAGIC 0x83740267
#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
#define DRBD_MAGIC_BIG 0x835a
#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG)
#define DRBD_MAGIC_100 0x8620ec20
#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3)
#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4)
#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5)
/* how I came up with this magic?
* base64 decode "actlog==" ;) */
#define DRBD_AL_MAGIC 0x69cb65a2
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2
#define DRBD_MD_INDEX_FLEX_INT -3
/* Start of the new netlink/connector stuff */
#define DRBD_NL_CREATE_DEVICE 0x01
#define DRBD_NL_SET_DEFAULTS 0x02
/* For searching a vacant cn_idx value */
#define CN_IDX_STEP 6977
struct drbd_nl_cfg_req {
int packet_type;
unsigned int drbd_minor;
int flags;
unsigned short tag_list[];
};
struct drbd_nl_cfg_reply {
int packet_type;
unsigned int minor;
int ret_code; /* enum ret_code or set_st_err_t */
unsigned short tag_list[]; /* only used with get_* calls */
};
#endif
/*
* General overview:
* full generic netlink message:
* |nlmsghdr|genlmsghdr|<payload>
*
* payload:
* |optional fixed size family header|<sequence of netlink attributes>
*
* sequence of netlink attributes:
* I chose to have all "top level" attributes NLA_NESTED,
* corresponding to some real struct.
* So we have a sequence of |tla, len|<nested nla sequence>
*
* nested nla sequence:
* may be empty, or contain a sequence of netlink attributes
* representing the struct fields.
*
* The tag number of any field (regardless of containing struct)
* will be available as T_ ## field_name,
* so you cannot have the same field name in two differnt structs.
*
* The tag numbers themselves are per struct, though,
* so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type,
* which we won't use here).
* The tag numbers are used as index in the respective nla_policy array.
*
* GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy
* genl_magic_struct.h
* generates the struct declaration,
* generates an entry in the tla enum,
* genl_magic_func.h
* generates an entry in the static tla policy
* with .type = NLA_NESTED
* generates the static <struct_name>_nl_policy definition,
* and static conversion functions
*
* genl_magic_func.h
*
* GENL_mc_group(group)
* genl_magic_struct.h
* does nothing
* genl_magic_func.h
* defines and registers the mcast group,
* and provides a send helper
*
* GENL_notification(op_name, op_num, mcast_group, tla list)
* These are notifications to userspace.
*
* genl_magic_struct.h
* generates an entry in the genl_ops enum,
* genl_magic_func.h
* does nothing
*
* mcast group: the name of the mcast group this notification should be
* expected on
* tla list: the list of expected top level attributes,
* for documentation and sanity checking.
*
* GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations"
* These are requests from userspace.
*
* _op and _notification share the same "number space",
* op_nr will be assigned to "genlmsghdr->cmd"
*
* genl_magic_struct.h
* generates an entry in the genl_ops enum,
* genl_magic_func.h
* generates an entry in the static genl_ops array,
* and static register/unregister functions to
* genl_register_family_with_ops().
*
* flags and handler:
* GENL_op_init( .doit = x, .dumpit = y, .flags = something)
* GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM
* tla list: the list of expected top level attributes,
* for documentation and sanity checking.
*/
/*
* STRUCTS
*/
/* this is sent kernel -> userland on various error conditions, and contains
* informational textual info, which is supposedly human readable.
* The computer relevant return code is in the drbd_genlmsghdr.
*/
GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply,
/* "arbitrary" size strings, nla_policy.len = 0 */
__str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0)
)
/* Configuration requests typically need a context to operate on.
* Possible keys are device minor (fits in the drbd_genlmsghdr),
* the replication link (aka connection) name,
* and/or the replication group (aka resource) name,
* and the volume id within the resource. */
GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context,
__u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume)
__str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128)
__bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128)
__bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128)
)
GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
__str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128)
__str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128)
__s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx)
/* use the resize command to try and change the disk_size */
__u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size)
/* we could change the max_bio_bvecs,
* but it won't propagate through the stack */
__u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs)
__u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF)
__u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF)
__u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF)
__s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF)
__u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF)
__u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF)
__u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF)
__u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
__u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
__u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
__flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF)
__flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF)
__flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF)
__flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF)
__u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
__u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
/* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */
__flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF)
)
GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
__str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32)
__u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF)
)
GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
__str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE,
shared_secret, SHARED_SECRET_MAX)
__str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX)
__str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX)
__str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX)
__str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX)
__u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF)
__u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF)
__u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF)
__u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF)
__u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF)
__u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF)
__u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF)
__u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF)
__u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF)
__u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF)
__u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF)
__u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF)
__u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF)
__u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF)
__u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF)
__u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF)
__u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF)
__u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF)
__flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF)
__flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data)
__flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF)
__flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF)
__flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative)
__flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF)
/* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */
)
GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate)
)
GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
__u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size)
__flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force)
__flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync)
)
GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
/* the reason of the broadcast,
* if this is an event triggered broadcast. */
__u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason)
__u32_field(2, DRBD_F_REQUIRED, current_state)
__u64_field(3, DRBD_GENLA_F_MANDATORY, capacity)
__u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid)
/* These are for broadcast from after state change work.
* prev_state and new_state are from the moment the state change took
* place, new_state is not neccessarily the same as current_state,
* there may have been more state changes since. Which will be
* broadcasted soon, in their respective after state change work. */
__u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state)
__u32_field(6, DRBD_GENLA_F_MANDATORY, new_state)
/* if we have a local disk: */
__bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64)))
__u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags)
__u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total)
__u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos)
/* and in case resync or online verify is active */
__u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total)
__u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed)
/* for pre and post notifications of helper execution */
__str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32)
__u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code)
__u64_field(15, 0, send_cnt)
__u64_field(16, 0, recv_cnt)
__u64_field(17, 0, read_cnt)
__u64_field(18, 0, writ_cnt)
__u64_field(19, 0, al_writ_cnt)
__u64_field(20, 0, bm_writ_cnt)
__u32_field(21, 0, ap_bio_cnt)
__u32_field(22, 0, ap_pending_cnt)
__u32_field(23, 0, rs_pending_cnt)
)
GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms,
__u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector)
__u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector)
)
GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm)
)
GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms,
__u32_field(1, DRBD_F_REQUIRED, timeout_type)
)
GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect)
)
GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
)
/*
* Notifications and commands (genlmsghdr->cmd)
*/
GENL_mc_group(events)
/* kernel -> userspace announcement of changes */
GENL_notification(
DRBD_EVENT, 1, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY)
)
/* query kernel for specific or all info */
GENL_op(
DRBD_ADM_GET_STATUS, 2,
GENL_op_init(
.doit = drbd_adm_get_status,
.dumpit = drbd_adm_get_status_all,
/* anyone may ask for the status,
* it is broadcasted anyways */
),
/* To select the object .doit.
* Or a subset of objects in .dumpit. */
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
)
/* add DRBD minor devices as volumes to resources */
GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
/* add or delete resources */
GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESOURCE_OPTS, 9,
GENL_doit(drbd_adm_resource_opts),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
DRBD_ADM_CONNECT, 10,
GENL_doit(drbd_adm_connect),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_CHG_NET_OPTS, 29,
GENL_doit(drbd_adm_net_opts),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
)
GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_ATTACH, 12,
GENL_doit(drbd_adm_attach),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED)
)
GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28,
GENL_doit(drbd_adm_disk_opts),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_RESIZE, 13,
GENL_doit(drbd_adm_resize),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
DRBD_ADM_PRIMARY, 14,
GENL_doit(drbd_adm_set_role),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_SECONDARY, 15,
GENL_doit(drbd_adm_set_role),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED)
)
GENL_op(
DRBD_ADM_NEW_C_UUID, 16,
GENL_doit(drbd_adm_new_c_uuid),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
DRBD_ADM_START_OV, 17,
GENL_doit(drbd_adm_start_ov),
GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESUME_IO, 24, GENL_doit(drbd_adm_resume_io),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
#ifndef DRBD_GENL_STRUCT_H
#define DRBD_GENL_STRUCT_H
/**
* struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests
* @minor:
* For admin requests (user -> kernel): which minor device to operate on.
* For (unicast) replies or informational (broadcast) messages
* (kernel -> user): which minor device the information is about.
* If we do not operate on minors, but on connections or resources,
* the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT
* is used instead.
* @flags: possible operation modifiers (relevant only for user->kernel):
* DRBD_GENL_F_SET_DEFAULTS
* @volume:
* When creating a new minor (adding it to a resource), the resource needs
* to know which volume number within the resource this is supposed to be.
* The volume number corresponds to the same volume number on the remote side,
* whereas the minor number on the remote side may be different
* (union with flags).
* @ret_code: kernel->userland unicast cfg reply return code (union with flags);
*/
struct drbd_genlmsghdr {
__u32 minor;
union {
__u32 flags;
__s32 ret_code;
};
};
/* To be used in drbd_genlmsghdr.flags */
enum {
DRBD_GENL_F_SET_DEFAULTS = 1,
};
enum drbd_state_info_bcast_reason {
SIB_GET_STATUS_REPLY = 1,
SIB_STATE_CHANGE = 2,
SIB_HELPER_PRE = 3,
SIB_HELPER_POST = 4,
SIB_SYNC_PROGRESS = 5,
};
/* hack around predefined gcc/cpp "linux=1",
* we cannot possibly include <1/drbd_genl.h> */
#undef linux
#include <linux/drbd.h>
#define GENL_MAGIC_VERSION API_VERSION
#define GENL_MAGIC_FAMILY drbd
#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr)
#define GENL_MAGIC_INCLUDE_FILE <linux/drbd_genl.h>
#include <linux/genl_magic_struct.h>
#endif
......@@ -16,29 +16,37 @@
#define DEBUG_RANGE_CHECK 0
#define DRBD_MINOR_COUNT_MIN 1
#define DRBD_MINOR_COUNT_MAX 256
#define DRBD_MINOR_COUNT_MAX 255
#define DRBD_MINOR_COUNT_DEF 32
#define DRBD_MINOR_COUNT_SCALE '1'
#define DRBD_VOLUME_MAX 65535
#define DRBD_DIALOG_REFRESH_MIN 0
#define DRBD_DIALOG_REFRESH_MAX 600
#define DRBD_DIALOG_REFRESH_SCALE '1'
/* valid port number */
#define DRBD_PORT_MIN 1
#define DRBD_PORT_MAX 0xffff
#define DRBD_PORT_SCALE '1'
/* startup { */
/* if you want more than 3.4 days, disable */
#define DRBD_WFC_TIMEOUT_MIN 0
#define DRBD_WFC_TIMEOUT_MAX 300000
#define DRBD_WFC_TIMEOUT_DEF 0
#define DRBD_WFC_TIMEOUT_SCALE '1'
#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1'
#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
#define DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1'
/* }*/
/* net { */
......@@ -47,75 +55,91 @@
#define DRBD_TIMEOUT_MIN 1
#define DRBD_TIMEOUT_MAX 600
#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
#define DRBD_TIMEOUT_SCALE '1'
/* If backing disk takes longer than disk_timeout, mark the disk as failed */
#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */
#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */
#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */
#define DRBD_DISK_TIMEOUT_SCALE '1'
/* active connection retries when C_WF_CONNECTION */
#define DRBD_CONNECT_INT_MIN 1
#define DRBD_CONNECT_INT_MAX 120
#define DRBD_CONNECT_INT_DEF 10 /* seconds */
#define DRBD_CONNECT_INT_SCALE '1'
/* keep-alive probes when idle */
#define DRBD_PING_INT_MIN 1
#define DRBD_PING_INT_MAX 120
#define DRBD_PING_INT_DEF 10
#define DRBD_PING_INT_SCALE '1'
/* timeout for the ping packets.*/
#define DRBD_PING_TIMEO_MIN 1
#define DRBD_PING_TIMEO_MAX 300
#define DRBD_PING_TIMEO_DEF 5
#define DRBD_PING_TIMEO_SCALE '1'
/* max number of write requests between write barriers */
#define DRBD_MAX_EPOCH_SIZE_MIN 1
#define DRBD_MAX_EPOCH_SIZE_MAX 20000
#define DRBD_MAX_EPOCH_SIZE_DEF 2048
#define DRBD_MAX_EPOCH_SIZE_SCALE '1'
/* I don't think that a tcp send buffer of more than 10M is useful */
#define DRBD_SNDBUF_SIZE_MIN 0
#define DRBD_SNDBUF_SIZE_MAX (10<<20)
#define DRBD_SNDBUF_SIZE_DEF 0
#define DRBD_SNDBUF_SIZE_SCALE '1'
#define DRBD_RCVBUF_SIZE_MIN 0
#define DRBD_RCVBUF_SIZE_MAX (10<<20)
#define DRBD_RCVBUF_SIZE_DEF 0
#define DRBD_RCVBUF_SIZE_SCALE '1'
/* @4k PageSize -> 128kB - 512MB */
#define DRBD_MAX_BUFFERS_MIN 32
#define DRBD_MAX_BUFFERS_MAX 131072
#define DRBD_MAX_BUFFERS_DEF 2048
#define DRBD_MAX_BUFFERS_SCALE '1'
/* @4k PageSize -> 4kB - 512MB */
#define DRBD_UNPLUG_WATERMARK_MIN 1
#define DRBD_UNPLUG_WATERMARK_MAX 131072
#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
#define DRBD_UNPLUG_WATERMARK_SCALE '1'
/* 0 is disabled.
* 200 should be more than enough even for very short timeouts */
#define DRBD_KO_COUNT_MIN 0
#define DRBD_KO_COUNT_MAX 200
#define DRBD_KO_COUNT_DEF 0
#define DRBD_KO_COUNT_DEF 7
#define DRBD_KO_COUNT_SCALE '1'
/* } */
/* syncer { */
/* FIXME allow rate to be zero? */
#define DRBD_RATE_MIN 1
#define DRBD_RESYNC_RATE_MIN 1
/* channel bonding 10 GbE, or other hardware */
#define DRBD_RATE_MAX (4 << 20)
#define DRBD_RATE_DEF 250 /* kb/second */
#define DRBD_RESYNC_RATE_MAX (4 << 20)
#define DRBD_RESYNC_RATE_DEF 250
#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
/* less than 7 would hit performance unnecessarily.
* 3833 is the largest prime that still does fit
* into 64 sectors of activity log */
* 919 slots context information per transaction,
* 32k activity log, 4k transaction size,
* one transaction in flight:
* 919 * 7 = 6433 */
#define DRBD_AL_EXTENTS_MIN 7
#define DRBD_AL_EXTENTS_MAX 3833
#define DRBD_AL_EXTENTS_DEF 127
#define DRBD_AL_EXTENTS_MAX 6433
#define DRBD_AL_EXTENTS_DEF 1237
#define DRBD_AL_EXTENTS_SCALE '1'
#define DRBD_AFTER_MIN -1
#define DRBD_AFTER_MAX 255
#define DRBD_AFTER_DEF -1
#define DRBD_MINOR_NUMBER_MIN -1
#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1)
#define DRBD_MINOR_NUMBER_DEF -1
#define DRBD_MINOR_NUMBER_SCALE '1'
/* } */
......@@ -124,11 +148,12 @@
* the upper limit with 64bit kernel, enough ram and flexible meta data
* is 1 PiB, currently. */
/* DRBD_MAX_SECTORS */
#define DRBD_DISK_SIZE_SECT_MIN 0
#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40))
#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
#define DRBD_DISK_SIZE_MIN 0
#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40))
#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */
#define DRBD_DISK_SIZE_SCALE 's' /* sectors */
#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
#define DRBD_ON_IO_ERROR_DEF EP_DETACH
#define DRBD_FENCING_DEF FP_DONT_CARE
#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
......@@ -136,38 +161,59 @@
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
#define DRBD_ON_CONGESTION_DEF OC_BLOCK
#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL
#define DRBD_MAX_BIO_BVECS_MIN 0
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
#define DRBD_MAX_BIO_BVECS_SCALE '1'
#define DRBD_C_PLAN_AHEAD_MIN 0
#define DRBD_C_PLAN_AHEAD_MAX 300
#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */
#define DRBD_C_PLAN_AHEAD_DEF 20
#define DRBD_C_PLAN_AHEAD_SCALE '1'
#define DRBD_C_DELAY_TARGET_MIN 1
#define DRBD_C_DELAY_TARGET_MAX 100
#define DRBD_C_DELAY_TARGET_DEF 10
#define DRBD_C_DELAY_TARGET_SCALE '1'
#define DRBD_C_FILL_TARGET_MIN 0
#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */
#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */
#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */
#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */
#define DRBD_C_MAX_RATE_MIN 250
#define DRBD_C_MAX_RATE_MAX (4 << 20)
#define DRBD_C_MAX_RATE_DEF 102400
#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */
#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */
#define DRBD_C_MIN_RATE_MIN 0
#define DRBD_C_MIN_RATE_MAX (4 << 20)
#define DRBD_C_MIN_RATE_DEF 4096
#define DRBD_C_MIN_RATE_DEF 250
#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */
#define DRBD_CONG_FILL_MIN 0
#define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */
#define DRBD_CONG_FILL_DEF 0
#define DRBD_CONG_FILL_SCALE 's' /* sectors */
#define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN
#define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX
#define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF
#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE
#define DRBD_PROTOCOL_DEF DRBD_PROT_C
#define DRBD_DISK_BARRIER_DEF 0
#define DRBD_DISK_FLUSHES_DEF 1
#define DRBD_DISK_DRAIN_DEF 1
#define DRBD_MD_FLUSHES_DEF 1
#define DRBD_TCP_CORK_DEF 1
#define DRBD_AL_UPDATES_DEF 1
#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0
#define DRBD_ALWAYS_ASBP_DEF 0
#define DRBD_USE_RLE_DEF 1
#undef RANGE
#endif
/*
PAKET( name,
TYPE ( pn, pr, member )
...
)
You may never reissue one of the pn arguments
*/
#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
#endif
NL_PACKET(primary, 1,
NL_BIT( 1, T_MAY_IGNORE, primary_force)
)
NL_PACKET(secondary, 2, )
NL_PACKET(disk_conf, 3,
NL_INT64( 2, T_MAY_IGNORE, disk_size)
NL_STRING( 3, T_MANDATORY, backing_dev, 128)
NL_STRING( 4, T_MANDATORY, meta_dev, 128)
NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
NL_INTEGER( 7, T_MAY_IGNORE, fencing)
NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
/* 55 max_bio_size was available in 8.2.6rc2 */
NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout)
)
NL_PACKET(detach, 4,
NL_BIT( 88, T_MANDATORY, detach_force)
)
NL_PACKET(net_conf, 5,
NL_STRING( 8, T_MANDATORY, my_addr, 128)
NL_STRING( 9, T_MANDATORY, peer_addr, 128)
NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
NL_INTEGER( 14, T_MAY_IGNORE, timeout)
NL_INTEGER( 15, T_MANDATORY, wire_protocol)
NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
NL_INTEGER( 81, T_MAY_IGNORE, on_congestion)
NL_INTEGER( 82, T_MAY_IGNORE, cong_fill)
NL_INTEGER( 83, T_MAY_IGNORE, cong_extents)
/* 59 addr_family was available in GIT, never released */
NL_BIT( 60, T_MANDATORY, mind_af)
NL_BIT( 27, T_MAY_IGNORE, want_lose)
NL_BIT( 28, T_MAY_IGNORE, two_primaries)
NL_BIT( 41, T_MAY_IGNORE, always_asbp)
NL_BIT( 61, T_MAY_IGNORE, no_cork)
NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
NL_BIT( 70, T_MANDATORY, dry_run)
)
NL_PACKET(disconnect, 6,
NL_BIT( 84, T_MAY_IGNORE, force)
)
NL_PACKET(resize, 7,
NL_INT64( 29, T_MAY_IGNORE, resize_size)
NL_BIT( 68, T_MAY_IGNORE, resize_force)
NL_BIT( 69, T_MANDATORY, no_resync)
)
NL_PACKET(syncer_conf, 8,
NL_INTEGER( 30, T_MAY_IGNORE, rate)
NL_INTEGER( 31, T_MAY_IGNORE, after)
NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
/* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume)
* NL_INTEGER( 72, T_MAY_IGNORE, dp_interval)
* NL_INTEGER( 73, T_MAY_IGNORE, throttle_th)
* NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th)
* feature will be reimplemented differently with 8.3.9 */
NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
NL_BIT( 65, T_MAY_IGNORE, use_rle)
NL_INTEGER( 75, T_MAY_IGNORE, on_no_data)
NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead)
NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target)
NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target)
NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate)
NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate)
)
NL_PACKET(invalidate, 9, )
NL_PACKET(invalidate_peer, 10, )
NL_PACKET(pause_sync, 11, )
NL_PACKET(resume_sync, 12, )
NL_PACKET(suspend_io, 13, )
NL_PACKET(resume_io, 14, )
NL_PACKET(outdate, 15, )
NL_PACKET(get_config, 16, )
NL_PACKET(get_state, 17,
NL_INTEGER( 33, T_MAY_IGNORE, state_i)
)
NL_PACKET(get_uuids, 18,
NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
)
NL_PACKET(get_timeout_flag, 19,
NL_BIT( 36, T_MAY_IGNORE, use_degraded)
)
NL_PACKET(call_helper, 20,
NL_STRING( 38, T_MAY_IGNORE, helper, 32)
)
/* Tag nr 42 already allocated in drbd-8.1 development. */
NL_PACKET(sync_progress, 23,
NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
)
NL_PACKET(dump_ee, 24,
NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
NL_INT64( 48, T_MAY_IGNORE, ee_sector)
NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
)
NL_PACKET(start_ov, 25,
NL_INT64( 66, T_MAY_IGNORE, start_sector)
NL_INT64( 90, T_MANDATORY, stop_sector)
)
NL_PACKET(new_c_uuid, 26,
NL_BIT( 63, T_MANDATORY, clear_bm)
)
#ifdef NL_RESPONSE
NL_RESPONSE(return_code_only, 27)
#endif
#undef NL_PACKET
#undef NL_INTEGER
#undef NL_INT64
#undef NL_BIT
#undef NL_STRING
#undef NL_RESPONSE
#ifndef DRBD_TAG_MAGIC_H
#define DRBD_TAG_MAGIC_H
#define TT_END 0
#define TT_REMOVED 0xE000
/* declare packet_type enums */
enum packet_types {
#define NL_PACKET(name, number, fields) P_ ## name = number,
#define NL_RESPONSE(name, number) P_ ## name = number,
#define NL_INTEGER(pn, pr, member)
#define NL_INT64(pn, pr, member)
#define NL_BIT(pn, pr, member)
#define NL_STRING(pn, pr, member, len)
#include <linux/drbd_nl.h>
P_nl_after_last_packet,
};
/* These struct are used to deduce the size of the tag lists: */
#define NL_PACKET(name, number, fields) \
struct name ## _tag_len_struct { fields };
#define NL_INTEGER(pn, pr, member) \
int member; int tag_and_len ## member;
#define NL_INT64(pn, pr, member) \
__u64 member; int tag_and_len ## member;
#define NL_BIT(pn, pr, member) \
unsigned char member:1; int tag_and_len ## member;
#define NL_STRING(pn, pr, member, len) \
unsigned char member[len]; int member ## _len; \
int tag_and_len ## member;
#include <linux/drbd_nl.h>
/* declare tag-list-sizes */
static const int tag_list_sizes[] = {
#define NL_PACKET(name, number, fields) 2 fields ,
#define NL_INTEGER(pn, pr, member) + 4 + 4
#define NL_INT64(pn, pr, member) + 4 + 8
#define NL_BIT(pn, pr, member) + 4 + 1
#define NL_STRING(pn, pr, member, len) + 4 + (len)
#include <linux/drbd_nl.h>
};
/* The two highest bits are used for the tag type */
#define TT_MASK 0xC000
#define TT_INTEGER 0x0000
#define TT_INT64 0x4000
#define TT_BIT 0x8000
#define TT_STRING 0xC000
/* The next bit indicates if processing of the tag is mandatory */
#define T_MANDATORY 0x2000
#define T_MAY_IGNORE 0x0000
#define TN_MASK 0x1fff
/* The remaining 13 bits are used to enumerate the tags */
#define tag_type(T) ((T) & TT_MASK)
#define tag_number(T) ((T) & TN_MASK)
/* declare tag enums */
#define NL_PACKET(name, number, fields) fields
enum drbd_tags {
#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
#include <linux/drbd_nl.h>
};
struct tag {
const char *name;
int type_n_flags;
int max_len;
};
/* declare tag names */
#define NL_PACKET(name, number, fields) fields
static const struct tag tag_descriptions[] = {
#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
#include <linux/drbd_nl.h>
};
#endif
#ifndef GENL_MAGIC_FUNC_H
#define GENL_MAGIC_FUNC_H
#include <linux/genl_magic_struct.h>
/*
* Magic: declare tla policy {{{1
* Magic: declare nested policies
* {{{2
*/
#undef GENL_mc_group
#define GENL_mc_group(group)
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list)
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
[tag_name] = { .type = NLA_NESTED },
static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = {
#include GENL_MAGIC_INCLUDE_FILE
};
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static struct nla_policy s_name ## _nl_policy[] __read_mostly = \
{ s_fields };
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \
__put, __is_signed) \
[attr_nr] = { .type = nla_type },
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \
__get, __put, __is_signed) \
[attr_nr] = { .type = nla_type, \
.len = maxlen - (nla_type == NLA_NUL_STRING) },
#include GENL_MAGIC_INCLUDE_FILE
#ifndef __KERNEL__
#ifndef pr_info
#define pr_info(args...) fprintf(stderr, args);
#endif
#endif
#ifdef GENL_MAGIC_DEBUG
static void dprint_field(const char *dir, int nla_type,
const char *name, void *valp)
{
__u64 val = valp ? *(__u32 *)valp : 1;
switch (nla_type) {
case NLA_U8: val = (__u8)val;
case NLA_U16: val = (__u16)val;
case NLA_U32: val = (__u32)val;
pr_info("%s attr %s: %d 0x%08x\n", dir,
name, (int)val, (unsigned)val);
break;
case NLA_U64:
val = *(__u64*)valp;
pr_info("%s attr %s: %lld 0x%08llx\n", dir,
name, (long long)val, (unsigned long long)val);
break;
case NLA_FLAG:
if (val)
pr_info("%s attr %s: set\n", dir, name);
break;
}
}
static void dprint_array(const char *dir, int nla_type,
const char *name, const char *val, unsigned len)
{
switch (nla_type) {
case NLA_NUL_STRING:
if (len && val[len-1] == '\0')
len--;
pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val);
break;
default:
/* we can always show 4 byte,
* thats what nlattr are aligned to. */
pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n",
dir, name, len, val[0], val[1], val[2], val[3]);
}
}
#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b);
/* Name is a member field name of the struct s.
* If s is NULL (only parsing, no copy requested in *_from_attrs()),
* nla is supposed to point to the attribute containing the information
* corresponding to that struct member. */
#define DPRINT_FIELD(dir, nla_type, name, s, nla) \
do { \
if (s) \
dprint_field(dir, nla_type, #name, &s->name); \
else if (nla) \
dprint_field(dir, nla_type, #name, \
(nla_type == NLA_FLAG) ? NULL \
: nla_data(nla)); \
} while (0)
#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \
do { \
if (s) \
dprint_array(dir, nla_type, #name, \
s->name, s->name ## _len); \
else if (nla) \
dprint_array(dir, nla_type, #name, \
nla_data(nla), nla_len(nla)); \
} while (0)
#else
#define DPRINT_TLA(a, op, b) do {} while (0)
#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0)
#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0)
#endif
/*
* Magic: provide conversion functions {{{1
* populate struct from attribute table:
* {{{2
*/
/* processing of generic netlink messages is serialized.
* use one static buffer for parsing of nested attributes */
static struct nlattr *nested_attr_tb[128];
#ifndef BUILD_BUG_ON
/* Force a compilation error if condition is true */
#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
/* Force a compilation error if condition is true, but also produce a
result (of value 0 and type size_t), so the expression can be used
e.g. in a structure initializer (or where-ever else comma expressions
aren't permitted). */
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
#endif
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
/* *_from_attrs functions are static, but potentially unused */ \
static int __ ## s_name ## _from_attrs(struct s_name *s, \
struct genl_info *info, bool exclude_invariants) \
{ \
const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \
struct nlattr *tla = info->attrs[tag_number]; \
struct nlattr **ntb = nested_attr_tb; \
struct nlattr *nla; \
int err; \
BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \
if (!tla) \
return -ENOMSG; \
DPRINT_TLA(#s_name, "<=-", #tag_name); \
err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \
if (err) \
return err; \
\
s_fields \
return 0; \
} __attribute__((unused)) \
static int s_name ## _from_attrs(struct s_name *s, \
struct genl_info *info) \
{ \
return __ ## s_name ## _from_attrs(s, info, false); \
} __attribute__((unused)) \
static int s_name ## _from_attrs_for_change(struct s_name *s, \
struct genl_info *info) \
{ \
return __ ## s_name ## _from_attrs(s, info, true); \
} __attribute__((unused)) \
#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \
nla = ntb[attr_nr]; \
if (nla) { \
if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \
pr_info("<< must not change invariant attr: %s\n", #name); \
return -EEXIST; \
} \
assignment; \
} else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \
/* attribute missing from payload, */ \
/* which was expected */ \
} else if ((attr_flag) & DRBD_F_REQUIRED) { \
pr_info("<< missing attr: %s\n", #name); \
return -ENOMSG; \
}
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
__assign(attr_nr, attr_flag, name, nla_type, type, \
if (s) \
s->name = __get(nla); \
DPRINT_FIELD("<<", nla_type, name, s, nla))
/* validate_nla() already checked nla_len <= maxlen appropriately. */
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
__assign(attr_nr, attr_flag, name, nla_type, type, \
if (s) \
s->name ## _len = \
__get(s->name, nla, maxlen); \
DPRINT_ARRAY("<<", nla_type, name, s, nla))
#include GENL_MAGIC_INCLUDE_FILE
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields)
/*
* Magic: define op number to op name mapping {{{1
* {{{2
*/
const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd)
{
switch (cmd) {
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list) \
case op_num: return #op_name;
#include GENL_MAGIC_INCLUDE_FILE
default:
return "unknown";
}
}
#ifdef __KERNEL__
#include <linux/stringify.h>
/*
* Magic: define genl_ops {{{1
* {{{2
*/
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list) \
{ \
handler \
.cmd = op_name, \
.policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \
},
#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops)
static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
#include GENL_MAGIC_INCLUDE_FILE
};
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list)
/*
* Define the genl_family, multicast groups, {{{1
* and provide register/unregister functions.
* {{{2
*/
#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
static struct genl_family ZZZ_genl_family __read_mostly = {
.id = GENL_ID_GENERATE,
.name = __stringify(GENL_MAGIC_FAMILY),
.version = GENL_MAGIC_VERSION,
#ifdef GENL_MAGIC_FAMILY_HDRSZ
.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
#endif
.maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
};
/*
* Magic: define multicast groups
* Magic: define multicast group registration helper
*/
#undef GENL_mc_group
#define GENL_mc_group(group) \
static struct genl_multicast_group \
CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \
.name = #group, \
}; \
static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \
struct sk_buff *skb, gfp_t flags) \
{ \
unsigned int group_id = \
CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \
if (!group_id) \
return -EINVAL; \
return genlmsg_multicast(skb, 0, group_id, flags); \
}
#include GENL_MAGIC_INCLUDE_FILE
int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
{
int err = genl_register_family_with_ops(&ZZZ_genl_family,
ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops));
if (err)
return err;
#undef GENL_mc_group
#define GENL_mc_group(group) \
err = genl_register_mc_group(&ZZZ_genl_family, \
&CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \
if (err) \
goto fail; \
else \
pr_info("%s: mcg %s: %u\n", #group, \
__stringify(GENL_MAGIC_FAMILY), \
CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id);
#include GENL_MAGIC_INCLUDE_FILE
#undef GENL_mc_group
#define GENL_mc_group(group)
return 0;
fail:
genl_unregister_family(&ZZZ_genl_family);
return err;
}
void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void)
{
genl_unregister_family(&ZZZ_genl_family);
}
/*
* Magic: provide conversion functions {{{1
* populate skb from struct.
* {{{2
*/
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \
const bool exclude_sensitive) \
{ \
struct nlattr *tla = nla_nest_start(skb, tag_number); \
if (!tla) \
goto nla_put_failure; \
DPRINT_TLA(#s_name, "-=>", #tag_name); \
s_fields \
nla_nest_end(skb, tla); \
return 0; \
\
nla_put_failure: \
if (tla) \
nla_nest_cancel(skb, tla); \
return -EMSGSIZE; \
} \
static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \
struct s_name *s) \
{ \
return s_name ## _to_skb(skb, s, 0); \
} \
static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \
struct s_name *s) \
{ \
return s_name ## _to_skb(skb, s, 1); \
}
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \
DPRINT_FIELD(">>", nla_type, name, s, NULL); \
if (__put(skb, attr_nr, s->name)) \
goto nla_put_failure; \
}
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \
DPRINT_ARRAY(">>",nla_type, name, s, NULL); \
if (__put(skb, attr_nr, min_t(int, maxlen, \
s->name ## _len + (nla_type == NLA_NUL_STRING)),\
s->name)) \
goto nla_put_failure; \
}
#include GENL_MAGIC_INCLUDE_FILE
/* Functions for initializing structs to default values. */
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed)
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed)
#undef __u32_field_def
#define __u32_field_def(attr_nr, attr_flag, name, default) \
x->name = default;
#undef __s32_field_def
#define __s32_field_def(attr_nr, attr_flag, name, default) \
x->name = default;
#undef __flg_field_def
#define __flg_field_def(attr_nr, attr_flag, name, default) \
x->name = default;
#undef __str_field_def
#define __str_field_def(attr_nr, attr_flag, name, maxlen) \
memset(x->name, 0, sizeof(x->name)); \
x->name ## _len = 0;
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \
static void set_ ## s_name ## _defaults(struct s_name *x) { \
s_fields \
}
#include GENL_MAGIC_INCLUDE_FILE
#endif /* __KERNEL__ */
/* }}}1 */
#endif /* GENL_MAGIC_FUNC_H */
/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */
#ifndef GENL_MAGIC_STRUCT_H
#define GENL_MAGIC_STRUCT_H
#ifndef GENL_MAGIC_FAMILY
# error "you need to define GENL_MAGIC_FAMILY before inclusion"
#endif
#ifndef GENL_MAGIC_VERSION
# error "you need to define GENL_MAGIC_VERSION before inclusion"
#endif
#ifndef GENL_MAGIC_INCLUDE_FILE
# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion"
#endif
#include <linux/genetlink.h>
#include <linux/types.h>
#define CONCAT__(a,b) a ## b
#define CONCAT_(a,b) CONCAT__(a,b)
extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void);
extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
/*
* Extension of genl attribute validation policies {{{2
*/
/*
* @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not
* know about. This flag can be set in nlattr->nla_type to indicate that this
* attribute must not be ignored.
*
* We check and remove this flag in drbd_nla_check_mandatory() before
* validating the attribute types and lengths via nla_parse_nested().
*/
#define DRBD_GENLA_F_MANDATORY (1 << 14)
/*
* Flags specific to drbd and not visible at the netlink layer, used in
* <struct>_from_attrs and <struct>_to_skb:
*
* @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is
* invalid.
*
* @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be
* included in unpriviledged get requests or broadcasts.
*
* @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but
* cannot subsequently be changed.
*/
#define DRBD_F_REQUIRED (1 << 0)
#define DRBD_F_SENSITIVE (1 << 1)
#define DRBD_F_INVARIANT (1 << 2)
#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY))
/* }}}1
* MAGIC
* multi-include macro expansion magic starts here
*/
/* MAGIC helpers {{{2 */
/* possible field types */
#define __flg_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U8, char, \
nla_get_u8, nla_put_u8, false)
#define __u8_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \
nla_get_u8, nla_put_u8, false)
#define __u16_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U16, __u16, \
nla_get_u16, nla_put_u16, false)
#define __u32_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U32, __u32, \
nla_get_u32, nla_put_u32, false)
#define __s32_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U32, __s32, \
nla_get_u32, nla_put_u32, true)
#define __u64_field(attr_nr, attr_flag, name) \
__field(attr_nr, attr_flag, name, NLA_U64, __u64, \
nla_get_u64, nla_put_u64, false)
#define __str_field(attr_nr, attr_flag, name, maxlen) \
__array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \
nla_strlcpy, nla_put, false)
#define __bin_field(attr_nr, attr_flag, name, maxlen) \
__array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \
nla_memcpy, nla_put, false)
/* fields with default values */
#define __flg_field_def(attr_nr, attr_flag, name, default) \
__flg_field(attr_nr, attr_flag, name)
#define __u32_field_def(attr_nr, attr_flag, name, default) \
__u32_field(attr_nr, attr_flag, name)
#define __s32_field_def(attr_nr, attr_flag, name, default) \
__s32_field(attr_nr, attr_flag, name)
#define __str_field_def(attr_nr, attr_flag, name, maxlen) \
__str_field(attr_nr, attr_flag, name, maxlen)
#define GENL_op_init(args...) args
#define GENL_doit(handler) \
.doit = handler, \
.flags = GENL_ADMIN_PERM,
#define GENL_dumpit(handler) \
.dumpit = handler, \
.flags = GENL_ADMIN_PERM,
/* }}}1
* Magic: define the enum symbols for genl_ops
* Magic: define the enum symbols for top level attributes
* Magic: define the enum symbols for nested attributes
* {{{2
*/
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields)
#undef GENL_mc_group
#define GENL_mc_group(group)
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list) \
op_name = op_num,
#undef GENL_op
#define GENL_op(op_name, op_num, handler, tla_list) \
op_name = op_num,
enum {
#include GENL_MAGIC_INCLUDE_FILE
};
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list)
#undef GENL_op
#define GENL_op(op_name, op_num, handler, attr_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
tag_name = tag_number,
enum {
#include GENL_MAGIC_INCLUDE_FILE
};
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
enum { \
s_fields \
};
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, \
__get, __put, __is_signed) \
T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)),
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, \
maxlen, __get, __put, __is_signed) \
T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)),
#include GENL_MAGIC_INCLUDE_FILE
/* }}}1
* Magic: compile time assert unique numbers for operations
* Magic: -"- unique numbers for top level attributes
* Magic: -"- unique numbers for nested attributes
* {{{2
*/
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields)
#undef GENL_op
#define GENL_op(op_name, op_num, handler, attr_list) \
case op_name:
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list) \
case op_name:
static inline void ct_assert_unique_operations(void)
{
switch (0) {
#include GENL_MAGIC_INCLUDE_FILE
;
}
}
#undef GENL_op
#define GENL_op(op_name, op_num, handler, attr_list)
#undef GENL_notification
#define GENL_notification(op_name, op_num, mcast_group, tla_list)
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
case tag_number:
static inline void ct_assert_unique_top_level_attributes(void)
{
switch (0) {
#include GENL_MAGIC_INCLUDE_FILE
;
}
}
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
static inline void ct_assert_unique_ ## s_name ## _attributes(void) \
{ \
switch (0) { \
s_fields \
; \
} \
}
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
case attr_nr:
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
case attr_nr:
#include GENL_MAGIC_INCLUDE_FILE
/* }}}1
* Magic: declare structs
* struct <name> {
* fields
* };
* {{{2
*/
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
struct s_name { s_fields };
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
__is_signed) \
type name;
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, __is_signed) \
type name[maxlen]; \
__u32 name ## _len;
#include GENL_MAGIC_INCLUDE_FILE
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
enum { \
s_fields \
};
#undef __field
#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
is_signed) \
F_ ## name ## _IS_SIGNED = is_signed,
#undef __array
#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
__get, __put, is_signed) \
F_ ## name ## _IS_SIGNED = is_signed,
#include GENL_MAGIC_INCLUDE_FILE
/* }}}1 */
#endif /* GENL_MAGIC_STRUCT_H */
/* vim: set foldmethod=marker nofoldenable : */
......@@ -152,4 +152,15 @@ void ida_simple_remove(struct ida *ida, unsigned int id);
void __init idr_init_cache(void);
/**
* idr_for_each_entry - iterate over an idr's elements of a given type
* @idp: idr handle
* @entry: the type * to use as cursor
* @id: id entry's key
*/
#define idr_for_each_entry(idp, entry, id) \
for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \
entry != NULL; \
++id, entry = (typeof(entry))idr_get_next((idp), &(id)))
#endif /* __IDR_H__ */
......@@ -166,9 +166,11 @@ struct lc_element {
/* if we want to track a larger set of objects,
* it needs to become arch independend u64 */
unsigned lc_number;
/* special label when on free list */
#define LC_FREE (~0U)
/* for pending changes */
unsigned lc_new_number;
};
struct lru_cache {
......@@ -176,6 +178,7 @@ struct lru_cache {
struct list_head lru;
struct list_head free;
struct list_head in_use;
struct list_head to_be_changed;
/* the pre-created kmem cache to allocate the objects from */
struct kmem_cache *lc_cache;
......@@ -186,7 +189,7 @@ struct lru_cache {
size_t element_off;
/* number of elements (indices) */
unsigned int nr_elements;
unsigned int nr_elements;
/* Arbitrary limit on maximum tracked objects. Practical limit is much
* lower due to allocation failures, probably. For typical use cases,
* nr_elements should be a few thousand at most.
......@@ -194,18 +197,19 @@ struct lru_cache {
* 8 high bits of .lc_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE (1<<24)
/* allow to accumulate a few (index:label) changes,
* but no more than max_pending_changes */
unsigned int max_pending_changes;
/* number of elements currently on to_be_changed list */
unsigned int pending_changes;
/* statistics */
unsigned used; /* number of lelements currently on in_use list */
unsigned long hits, misses, starving, dirty, changed;
unsigned used; /* number of elements currently on in_use list */
unsigned long hits, misses, starving, locked, changed;
/* see below: flag-bits for lru_cache */
unsigned long flags;
/* when changing the label of an index element */
unsigned int new_number;
/* for paranoia when changing the label of an index element */
struct lc_element *changing_element;
void *lc_private;
const char *name;
......@@ -221,10 +225,15 @@ enum {
/* debugging aid, to catch concurrent access early.
* user needs to guarantee exclusive access by proper locking! */
__LC_PARANOIA,
/* if we need to change the set, but currently there is a changing
* transaction pending, we are "dirty", and must deferr further
* changing requests */
/* annotate that the set is "dirty", possibly accumulating further
* changes, until a transaction is finally triggered */
__LC_DIRTY,
/* Locked, no further changes allowed.
* Also used to serialize changing transactions. */
__LC_LOCKED,
/* if we need to change the set, but currently there is no free nor
* unused element available, we are "starving", and must not give out
* further references, to guarantee that eventually some refcnt will
......@@ -236,9 +245,11 @@ enum {
};
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY (1<<__LC_DIRTY)
#define LC_LOCKED (1<<__LC_LOCKED)
#define LC_STARVING (1<<__LC_STARVING)
extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned max_pending_changes,
unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
......@@ -249,7 +260,7 @@ extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
extern void lc_committed(struct lru_cache *lc);
struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
......@@ -258,17 +269,29 @@ extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char
void (*detail) (struct seq_file *, struct lc_element *));
/**
* lc_try_lock - can be used to stop lc_get() from changing the tracked set
* lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set
* @lc: the lru cache to operate on
*
* Note that the reference counts and order on the active and lru lists may
* still change. Returns true if we acquired the lock.
* Allows (expects) the set to be "dirty". Note that the reference counts and
* order on the active and lru lists may still change. Used to serialize
* changing transactions. Returns true if we aquired the lock.
*/
static inline int lc_try_lock(struct lru_cache *lc)
static inline int lc_try_lock_for_transaction(struct lru_cache *lc)
{
return !test_and_set_bit(__LC_DIRTY, &lc->flags);
return !test_and_set_bit(__LC_LOCKED, &lc->flags);
}
/**
* lc_try_lock - variant to stop lc_get() from changing the tracked set
* @lc: the lru cache to operate on
*
* Note that the reference counts and order on the active and lru lists may
* still change. Only works on a "clean" set. Returns true if we aquired the
* lock, which means there are no pending changes, and any further attempt to
* change the set will not succeed until the next lc_unlock().
*/
extern int lc_try_lock(struct lru_cache *lc);
/**
* lc_unlock - unlock @lc, allow lc_get() to change the set again
* @lc: the lru cache to operate on
......@@ -276,14 +299,10 @@ static inline int lc_try_lock(struct lru_cache *lc)
static inline void lc_unlock(struct lru_cache *lc)
{
clear_bit(__LC_DIRTY, &lc->flags);
smp_mb__after_clear_bit();
clear_bit_unlock(__LC_LOCKED, &lc->flags);
}
static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e = lc_find(lc, enr);
return e && e->refcnt;
}
extern bool lc_is_used(struct lru_cache *lc, unsigned int enr);
#define lc_entry(ptr, type, member) \
container_of(ptr, type, member)
......
......@@ -44,8 +44,8 @@ MODULE_LICENSE("GPL");
} while (0)
#define RETURN(x...) do { \
clear_bit(__LC_PARANOIA, &lc->flags); \
smp_mb__after_clear_bit(); return x ; } while (0)
clear_bit_unlock(__LC_PARANOIA, &lc->flags); \
return x ; } while (0)
/* BUG() if e is not one of the elements tracked by lc */
#define PARANOIA_LC_ELEMENT(lc, e) do { \
......@@ -55,9 +55,40 @@ MODULE_LICENSE("GPL");
BUG_ON(i >= lc_->nr_elements); \
BUG_ON(lc_->lc_element[i] != e_); } while (0)
/* We need to atomically
* - try to grab the lock (set LC_LOCKED)
* - only if there is no pending transaction
* (neither LC_DIRTY nor LC_STARVING is set)
* Because of PARANOIA_ENTRY() above abusing lc->flags as well,
* it is not sufficient to just say
* return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED);
*/
int lc_try_lock(struct lru_cache *lc)
{
unsigned long val;
do {
val = cmpxchg(&lc->flags, 0, LC_LOCKED);
} while (unlikely (val == LC_PARANOIA));
/* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */
return 0 == val;
#if 0
/* Alternative approach, spin in case someone enters or leaves a
* PARANOIA_ENTRY()/RETURN() section. */
unsigned long old, new, val;
do {
old = lc->flags & LC_PARANOIA;
new = old | LC_LOCKED;
val = cmpxchg(&lc->flags, old, new);
} while (unlikely (val == (old ^ LC_PARANOIA)));
return old == val;
#endif
}
/**
* lc_create - prepares to track objects in an active set
* @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
* @max_pending_changes: maximum changes to accumulate until a transaction is required
* @e_count: number of elements allowed to be active simultaneously
* @e_size: size of the tracked objects
* @e_off: offset to the &struct lc_element member in a tracked object
......@@ -66,6 +97,7 @@ MODULE_LICENSE("GPL");
* or NULL on (allocation) failure.
*/
struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned max_pending_changes,
unsigned e_count, size_t e_size, size_t e_off)
{
struct hlist_head *slot = NULL;
......@@ -98,12 +130,13 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
INIT_LIST_HEAD(&lc->to_be_changed);
lc->name = name;
lc->element_size = e_size;
lc->element_off = e_off;
lc->nr_elements = e_count;
lc->new_number = LC_FREE;
lc->max_pending_changes = max_pending_changes;
lc->lc_cache = cache;
lc->lc_element = element;
lc->lc_slot = slot;
......@@ -117,6 +150,7 @@ struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
e = p + e_off;
e->lc_index = i;
e->lc_number = LC_FREE;
e->lc_new_number = LC_FREE;
list_add(&e->list, &lc->free);
element[i] = e;
}
......@@ -175,15 +209,15 @@ void lc_reset(struct lru_cache *lc)
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
INIT_LIST_HEAD(&lc->to_be_changed);
lc->used = 0;
lc->hits = 0;
lc->misses = 0;
lc->starving = 0;
lc->dirty = 0;
lc->locked = 0;
lc->changed = 0;
lc->pending_changes = 0;
lc->flags = 0;
lc->changing_element = NULL;
lc->new_number = LC_FREE;
memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
for (i = 0; i < lc->nr_elements; i++) {
......@@ -194,6 +228,7 @@ void lc_reset(struct lru_cache *lc)
/* re-init it */
e->lc_index = i;
e->lc_number = LC_FREE;
e->lc_new_number = LC_FREE;
list_add(&e->list, &lc->free);
}
}
......@@ -208,14 +243,14 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
/* NOTE:
* total calls to lc_get are
* (starving + hits + misses)
* misses include "dirty" count (update from an other thread in
* misses include "locked" count (update from an other thread in
* progress) and "changed", when this in fact lead to an successful
* update of the cache.
*/
return seq_printf(seq, "\t%s: used:%u/%u "
"hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
"hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
lc->name, lc->used, lc->nr_elements,
lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
}
static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
......@@ -224,16 +259,8 @@ static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
}
/**
* lc_find - find element by label, if present in the hash table
* @lc: The lru_cache object
* @enr: element number
*
* Returns the pointer to an element, if the element with the requested
* "label" or element number is present in the hash table,
* or NULL if not found. Does not change the refcnt.
*/
struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr,
bool include_changing)
{
struct hlist_node *n;
struct lc_element *e;
......@@ -241,29 +268,48 @@ struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
BUG_ON(!lc);
BUG_ON(!lc->nr_elements);
hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
if (e->lc_number == enr)
/* "about to be changed" elements, pending transaction commit,
* are hashed by their "new number". "Normal" elements have
* lc_number == lc_new_number. */
if (e->lc_new_number != enr)
continue;
if (e->lc_new_number == e->lc_number || include_changing)
return e;
break;
}
return NULL;
}
/* returned element will be "recycled" immediately */
static struct lc_element *lc_evict(struct lru_cache *lc)
/**
* lc_find - find element by label, if present in the hash table
* @lc: The lru_cache object
* @enr: element number
*
* Returns the pointer to an element, if the element with the requested
* "label" or element number is present in the hash table,
* or NULL if not found. Does not change the refcnt.
* Ignores elements that are "about to be used", i.e. not yet in the active
* set, but still pending transaction commit.
*/
struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
{
struct list_head *n;
struct lc_element *e;
if (list_empty(&lc->lru))
return NULL;
n = lc->lru.prev;
e = list_entry(n, struct lc_element, list);
PARANOIA_LC_ELEMENT(lc, e);
return __lc_find(lc, enr, 0);
}
list_del(&e->list);
hlist_del(&e->colision);
return e;
/**
* lc_is_used - find element by label
* @lc: The lru_cache object
* @enr: element number
*
* Returns true, if the element with the requested "label" or element number is
* present in the hash table, and is used (refcnt > 0).
* Also finds elements that are not _currently_ used but only "about to be
* used", i.e. on the "to_be_changed" list, pending transaction commit.
*/
bool lc_is_used(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e = __lc_find(lc, enr, 1);
return e && e->refcnt;
}
/**
......@@ -280,22 +326,34 @@ void lc_del(struct lru_cache *lc, struct lc_element *e)
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt);
e->lc_number = LC_FREE;
e->lc_number = e->lc_new_number = LC_FREE;
hlist_del_init(&e->colision);
list_move(&e->list, &lc->free);
RETURN();
}
static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number)
{
struct list_head *n;
struct lc_element *e;
if (!list_empty(&lc->free))
n = lc->free.next;
else if (!list_empty(&lc->lru))
n = lc->lru.prev;
else
return NULL;
e = list_entry(n, struct lc_element, list);
PARANOIA_LC_ELEMENT(lc, e);
if (list_empty(&lc->free))
return lc_evict(lc);
e->lc_new_number = new_number;
if (!hlist_unhashed(&e->colision))
__hlist_del(&e->colision);
hlist_add_head(&e->colision, lc_hash_slot(lc, new_number));
list_move(&e->list, &lc->to_be_changed);
n = lc->free.next;
list_del(n);
return list_entry(n, struct lc_element, list);
return e;
}
static int lc_unused_element_available(struct lru_cache *lc)
......@@ -308,45 +366,7 @@ static int lc_unused_element_available(struct lru_cache *lc)
return 0;
}
/**
* lc_get - get element by label, maybe change the active set
* @lc: the lru cache to operate on
* @enr: the label to look up
*
* Finds an element in the cache, increases its usage count,
* "touches" and returns it.
*
* In case the requested number is not present, it needs to be added to the
* cache. Therefore it is possible that an other element becomes evicted from
* the cache. In either case, the user is notified so he is able to e.g. keep
* a persistent log of the cache changes, and therefore the objects in use.
*
* Return values:
* NULL
* The cache was marked %LC_STARVING,
* or the requested label was not in the active set
* and a changing transaction is still pending (@lc was marked %LC_DIRTY).
* Or no unused or free element could be recycled (@lc will be marked as
* %LC_STARVING, blocking further lc_get() operations).
*
* pointer to the element with the REQUESTED element number.
* In this case, it can be used right away
*
* pointer to an UNUSED element with some different element number,
* where that different number may also be %LC_FREE.
*
* In this case, the cache is marked %LC_DIRTY (blocking further changes),
* and the returned element pointer is removed from the lru list and
* hash collision chains. The user now should do whatever housekeeping
* is necessary.
* Then he must call lc_changed(lc,element_pointer), to finish
* the change.
*
* NOTE: The user needs to check the lc_number on EACH use, so he recognizes
* any cache set change.
*/
struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change)
{
struct lc_element *e;
......@@ -356,8 +376,12 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
RETURN(NULL);
}
e = lc_find(lc, enr);
if (e) {
e = __lc_find(lc, enr, 1);
/* if lc_new_number != lc_number,
* this enr is currently being pulled in already,
* and will be available once the pending transaction
* has been committed. */
if (e && e->lc_new_number == e->lc_number) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
......@@ -366,6 +390,26 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
}
++lc->misses;
if (!may_change)
RETURN(NULL);
/* It has been found above, but on the "to_be_changed" list, not yet
* committed. Don't pull it in twice, wait for the transaction, then
* try again */
if (e)
RETURN(NULL);
/* To avoid races with lc_try_lock(), first, mark us dirty
* (using test_and_set_bit, as it implies memory barriers), ... */
test_and_set_bit(__LC_DIRTY, &lc->flags);
/* ... only then check if it is locked anyways. If lc_unlock clears
* the dirty bit again, that's not a problem, we will come here again.
*/
if (test_bit(__LC_LOCKED, &lc->flags)) {
++lc->locked;
RETURN(NULL);
}
/* In case there is nothing available and we can not kick out
* the LRU element, we have to wait ...
......@@ -375,71 +419,109 @@ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
RETURN(NULL);
}
/* it was not present in the active set.
* we are going to recycle an unused (or even "free") element.
* user may need to commit a transaction to record that change.
* we serialize on flags & TF_DIRTY */
if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
++lc->dirty;
/* It was not present in the active set. We are going to recycle an
* unused (or even "free") element, but we won't accumulate more than
* max_pending_changes changes. */
if (lc->pending_changes >= lc->max_pending_changes)
RETURN(NULL);
}
e = lc_get_unused_element(lc);
e = lc_prepare_for_change(lc, enr);
BUG_ON(!e);
clear_bit(__LC_STARVING, &lc->flags);
BUG_ON(++e->refcnt != 1);
lc->used++;
lc->changing_element = e;
lc->new_number = enr;
lc->pending_changes++;
RETURN(e);
}
/* similar to lc_get,
* but only gets a new reference on an existing element.
* you either get the requested element, or NULL.
* will be consolidated into one function.
/**
* lc_get - get element by label, maybe change the active set
* @lc: the lru cache to operate on
* @enr: the label to look up
*
* Finds an element in the cache, increases its usage count,
* "touches" and returns it.
*
* In case the requested number is not present, it needs to be added to the
* cache. Therefore it is possible that an other element becomes evicted from
* the cache. In either case, the user is notified so he is able to e.g. keep
* a persistent log of the cache changes, and therefore the objects in use.
*
* Return values:
* NULL
* The cache was marked %LC_STARVING,
* or the requested label was not in the active set
* and a changing transaction is still pending (@lc was marked %LC_DIRTY).
* Or no unused or free element could be recycled (@lc will be marked as
* %LC_STARVING, blocking further lc_get() operations).
*
* pointer to the element with the REQUESTED element number.
* In this case, it can be used right away
*
* pointer to an UNUSED element with some different element number,
* where that different number may also be %LC_FREE.
*
* In this case, the cache is marked %LC_DIRTY,
* so lc_try_lock() will no longer succeed.
* The returned element pointer is moved to the "to_be_changed" list,
* and registered with the new element number on the hash collision chains,
* so it is possible to pick it up from lc_is_used().
* Up to "max_pending_changes" (see lc_create()) can be accumulated.
* The user now should do whatever housekeeping is necessary,
* typically serialize on lc_try_lock_for_transaction(), then call
* lc_committed(lc) and lc_unlock(), to finish the change.
*
* NOTE: The user needs to check the lc_number on EACH use, so he recognizes
* any cache set change.
*/
struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e;
PARANOIA_ENTRY();
if (lc->flags & LC_STARVING) {
++lc->starving;
RETURN(NULL);
}
return __lc_get(lc, enr, 1);
}
e = lc_find(lc, enr);
if (e) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
list_move(&e->list, &lc->in_use); /* Not evictable... */
}
RETURN(e);
/**
* lc_try_get - get element by label, if present; do not change the active set
* @lc: the lru cache to operate on
* @enr: the label to look up
*
* Finds an element in the cache, increases its usage count,
* "touches" and returns it.
*
* Return values:
* NULL
* The cache was marked %LC_STARVING,
* or the requested label was not in the active set
*
* pointer to the element with the REQUESTED element number.
* In this case, it can be used right away
*/
struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
{
return __lc_get(lc, enr, 0);
}
/**
* lc_changed - tell @lc that the change has been recorded
* lc_committed - tell @lc that pending changes have been recorded
* @lc: the lru cache to operate on
* @e: the element pending label change
*
* User is expected to serialize on explicit lc_try_lock_for_transaction()
* before the transaction is started, and later needs to lc_unlock() explicitly
* as well.
*/
void lc_changed(struct lru_cache *lc, struct lc_element *e)
void lc_committed(struct lru_cache *lc)
{
struct lc_element *e, *tmp;
PARANOIA_ENTRY();
BUG_ON(e != lc->changing_element);
PARANOIA_LC_ELEMENT(lc, e);
++lc->changed;
e->lc_number = lc->new_number;
list_add(&e->list, &lc->in_use);
hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
lc->changing_element = NULL;
lc->new_number = LC_FREE;
clear_bit(__LC_DIRTY, &lc->flags);
smp_mb__after_clear_bit();
list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) {
/* count number of changes, not number of transactions */
++lc->changed;
e->lc_number = e->lc_new_number;
list_move(&e->list, &lc->in_use);
}
lc->pending_changes = 0;
RETURN();
}
......@@ -458,13 +540,12 @@ unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
PARANOIA_ENTRY();
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt == 0);
BUG_ON(e == lc->changing_element);
BUG_ON(e->lc_number != e->lc_new_number);
if (--e->refcnt == 0) {
/* move it to the front of LRU. */
list_move(&e->list, &lc->lru);
lc->used--;
clear_bit(__LC_STARVING, &lc->flags);
smp_mb__after_clear_bit();
clear_bit_unlock(__LC_STARVING, &lc->flags);
}
RETURN(e->refcnt);
}
......@@ -504,16 +585,24 @@ unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
void lc_set(struct lru_cache *lc, unsigned int enr, int index)
{
struct lc_element *e;
struct list_head *lh;
if (index < 0 || index >= lc->nr_elements)
return;
e = lc_element_by_index(lc, index);
e->lc_number = enr;
BUG_ON(e->lc_number != e->lc_new_number);
BUG_ON(e->refcnt != 0);
e->lc_number = e->lc_new_number = enr;
hlist_del_init(&e->colision);
hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
if (enr == LC_FREE)
lh = &lc->free;
else {
hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
lh = &lc->lru;
}
list_move(&e->list, lh);
}
/**
......@@ -553,8 +642,10 @@ EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_put);
EXPORT_SYMBOL(lc_changed);
EXPORT_SYMBOL(lc_committed);
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);
EXPORT_SYMBOL(lc_try_lock);
EXPORT_SYMBOL(lc_is_used);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment