Commit 9104d31a authored by Lars Ellenberg, committed by Jens Axboe

drbd: introduce WRITE_SAME support

We will support WRITE_SAME if
 * all peers support WRITE_SAME (both in kernel and DRBD version),
 * all peer devices support WRITE_SAME, and
 * logical_block_size is identical on all peers.

We may at some point introduce a fallback on the receiving side
for devices/kernels that do not support WRITE_SAME,
by open-coding a submit loop. But not yet.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent 60bac040
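
For orientation before the diff: a minimal sketch of the decision this patch makes, using a hypothetical helper with flattened parameters. The real checks live in decide_on_write_same_support() further down in this diff, which additionally copes with a logical block size mismatch after a late attach.

/* Illustrative sketch only, not part of the patch: WRITE_SAME is used
 * towards a peer only if the feature was negotiated, the peer's backing
 * device is capable, and both sides agree on the logical block size. */
static bool sketch_may_use_write_same(bool agreed_ff_wsame,
				      bool peer_device_capable,
				      unsigned int my_logical_block_size,
				      unsigned int peer_logical_block_size)
{
	if (!agreed_ff_wsame)		/* peer DRBD/kernel too old for DRBD_FF_WSAME */
		return false;
	if (!peer_device_capable)	/* peer backend cannot do WRITE_SAME */
		return false;
	/* mismatched logical block sizes would change the replicated pattern */
	return my_logical_block_size == peer_logical_block_size;
}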
@@ -840,6 +840,13 @@ static int update_sync_bits(struct drbd_device *device,
 	return count;
 }
 
+static bool plausible_request_size(int size)
+{
+	return size > 0
+		&& size <= DRBD_MAX_BATCH_BIO_SIZE
+		&& IS_ALIGNED(size, 512);
+}
+
 /* clear the bit corresponding to the piece of storage in question:
  * size byte of data starting from sector. Only clear a bits of the affected
  * one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -859,7 +866,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
 	if ((mode == SET_OUT_OF_SYNC) && size == 0)
 		return 0;
 
-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+	if (!plausible_request_size(size)) {
 		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
 				drbd_change_sync_fname[mode],
 				(unsigned long long)sector, size);
...
@@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
 	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
 	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
-	if (f & EE_IS_TRIM) {
-		seq_putc(m, sep);
-		sep = '|';
-		if (f & EE_IS_TRIM_USE_ZEROOUT)
-			seq_puts(m, "zero-out");
-		else
-			seq_puts(m, "trim");
-	}
+	if (f & EE_IS_TRIM)
+		__seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
+	seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
 	seq_putc(m, '\n');
 }
...
@@ -468,6 +468,9 @@ enum {
 	/* this is/was a write request */
 	__EE_WRITE,
 
+	/* this is/was a write same request */
+	__EE_WRITE_SAME,
+
 	/* this originates from application on peer
 	 * (not some resync or verify or other DRBD internal request) */
 	__EE_APPLICATION,
@@ -487,6 +490,7 @@ enum {
 #define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
 #define EE_SUBMITTED		(1<<__EE_SUBMITTED)
 #define EE_WRITE		(1<<__EE_WRITE)
+#define EE_WRITE_SAME		(1<<__EE_WRITE_SAME)
 #define EE_APPLICATION		(1<<__EE_APPLICATION)
 #define EE_RS_THIN_REQ		(1<<__EE_RS_THIN_REQ)
@@ -1350,8 +1354,8 @@ struct bm_extent {
 /* For now, don't allow more than half of what we can "activate" in one
  * activity log transaction to be discarded in one go. We may need to rework
  * drbd_al_begin_io() to allow for even larger discard ranges */
-#define DRBD_MAX_DISCARD_SIZE	(AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
-#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+#define DRBD_MAX_BATCH_BIO_SIZE	(AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
+#define DRBD_MAX_BBIO_SECTORS	(DRBD_MAX_BATCH_BIO_SIZE >> 9)
 
 extern int drbd_bm_init(struct drbd_device *device);
 extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@@ -1488,7 +1492,8 @@ enum determine_dev_size {
 extern enum determine_dev_size
 drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
+			struct drbd_backing_dev *bdev, struct o_qlim *o);
 extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
 					enum drbd_role new_role,
 					int force);
@@ -1569,7 +1574,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
 extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
 extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
 						     sector_t, unsigned int,
-						     bool,
+						     unsigned int,
 						     gfp_t) __must_hold(local);
 extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
 				 int);
...
@@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
 	}
 }
 
+/* communicated if (agreed_features & DRBD_FF_WSAME) */
+void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
+{
+	if (q) {
+		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = blk_queue_discard(q);
+		p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
+		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
+	} else {
+		q = device->rq_queue;
+		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = 0;
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = 0;
+		p->qlim->discard_zeroes_data = 0;
+		p->qlim->write_same_capable = 0;
+	}
+}
+
 int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
 {
 	struct drbd_device *device = peer_device->device;
@@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
 	sector_t d_size, u_size;
 	int q_order_type;
 	unsigned int max_bio_size;
+	unsigned int packet_size;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	packet_size = sizeof(*p);
+	if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
+		packet_size += sizeof(p->qlim[0]);
 
+	memset(p, 0, packet_size);
 	if (get_ldev_if_state(device, D_NEGOTIATING)) {
-		D_ASSERT(device, device->ldev->backing_bdev);
+		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
 		d_size = drbd_get_max_capacity(device->ldev);
 		rcu_read_lock();
 		u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
 		rcu_read_unlock();
 		q_order_type = drbd_queue_order_type(device);
-		max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+		max_bio_size = queue_max_hw_sectors(q) << 9;
 		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+		assign_p_sizes_qlim(device, p, q);
 		put_ldev(device);
 	} else {
 		d_size = 0;
 		u_size = 0;
 		q_order_type = QUEUE_ORDERED_NONE;
 		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+		assign_p_sizes_qlim(device, p, NULL);
 	}
 
-	sock = &peer_device->connection->data;
-	p = drbd_prepare_command(peer_device, sock);
-	if (!p)
-		return -EIO;
-
 	if (peer_device->connection->agreed_pro_version <= 94)
 		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
 	else if (peer_device->connection->agreed_pro_version < 100)
@@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
 	p->max_bio_size = cpu_to_be32(max_bio_size);
 	p->queue_order_type = cpu_to_be16(q_order_type);
 	p->dds_flags = cpu_to_be16(flags);
-	return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+
+	return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
 }
 
 /**
@@ -1577,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
 					 ? 0 : MSG_MORE);
 		if (err)
 			return err;
+		/* REQ_OP_WRITE_SAME has only one segment */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
 	}
 	return 0;
 }
@@ -1595,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
 			       bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
 		if (err)
 			return err;
+		/* REQ_OP_WRITE_SAME has only one segment */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
 	}
 	return 0;
 }
@@ -1626,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 		return  (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
 			(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
 			(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
+			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
 			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
 	else
 		return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
@@ -1639,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 	struct drbd_device *device = peer_device->device;
 	struct drbd_socket *sock;
 	struct p_data *p;
+	struct p_wsame *wsame = NULL;
+	void *digest_out;
 	unsigned int dp_flags = 0;
 	int digest_size;
 	int err;
@@ -1674,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
 		goto out;
 	}
+	if (dp_flags & DP_WSAME) {
+		/* this will only work if DRBD_FF_WSAME is set AND the
+		 * handshake agreed that all nodes and backend devices are
+		 * WRITE_SAME capable and agree on logical_block_size */
+		wsame = (struct p_wsame*)p;
+		digest_out = wsame + 1;
+		wsame->size = cpu_to_be32(req->i.size);
+	} else
+		digest_out = p + 1;
 
 	/* our digest is still only over the payload.
 	 * TRIM does not carry any payload. */
 	if (digest_size)
-		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
-	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
+		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
+	if (wsame) {
+		err =
+		    __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
+				   sizeof(*wsame) + digest_size, NULL,
+				   bio_iovec(req->master_bio).bv_len);
+	} else
+		err =
+		    __send_command(peer_device->connection, device->vnr, sock, P_DATA,
+				   sizeof(*p) + digest_size, NULL, req->i.size);
 	if (!err) {
 		/* For protocol A, we have to memcpy the payload into
 		 * socket buffers, as we may complete right away
@@ -3660,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
 	 * one PRO_VERSION */
 	static const char *cmdnames[] = {
 		[P_DATA]	        = "Data",
+		[P_WSAME]	        = "WriteSame",
+		[P_TRIM]	        = "Trim",
 		[P_DATA_REPLY]	        = "DataReply",
 		[P_RS_DATA_REPLY]	= "RSDataReply",
 		[P_BARRIER]	        = "Barrier",
...
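A detail of drbd_send_dblock() above that is easy to miss: for a WRITE_SAME request the full extent goes into wsame->size, but only one bio segment (the block to be repeated) is sent as payload. A small illustration, with hypothetical numbers:

/* Hypothetical numbers, for illustration only: a 128 KiB WRITE_SAME
 * request against a backend with 512-byte logical blocks. */
unsigned int announced = 128 * 1024;	/* req->i.size, sent as wsame->size */
unsigned int payload   = 512;		/* bio_iovec(req->master_bio).bv_len,
					 * the single repeated block on the wire */
/* The receiving side then writes the announced 128 KiB, while only
 * 512 bytes of payload crossed the network. */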
@@ -1174,6 +1174,17 @@ static void blk_queue_discard_granularity(struct request_queue *q, unsigned int
 {
 	q->limits.discard_granularity = granularity;
 }
+
+static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
+{
+	/* when we introduced REQ_WRITE_SAME support, we also bumped
+	 * our maximum supported batch bio size used for discards. */
+	if (connection->agreed_features & DRBD_FF_WSAME)
+		return DRBD_MAX_BBIO_SECTORS;
+	/* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
+	return AL_EXTENT_SIZE >> 9;
+}
+
 static void decide_on_discard_support(struct drbd_device *device,
 			struct request_queue *q,
 			struct request_queue *b,
@@ -1190,7 +1201,7 @@ static void decide_on_discard_support(struct drbd_device *device,
 		can_do = false;
 		drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
 	}
-	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & FF_TRIM)) {
+	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
 		can_do = false;
 		drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
 	}
@@ -1202,7 +1213,7 @@ static void decide_on_discard_support(struct drbd_device *device,
 		 * you care, you need to use devices with similar
 		 * topology on all peers. */
 		blk_queue_discard_granularity(q, 512);
-		q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 	} else {
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
@@ -1223,8 +1234,67 @@ static void fixup_discard_if_not_supported(struct request_queue *q)
 	}
 }
 
+static void decide_on_write_same_support(struct drbd_device *device,
+			struct request_queue *q,
+			struct request_queue *b, struct o_qlim *o)
+{
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
+	bool can_do = b ? b->limits.max_write_same_sectors : true;
+
+	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
+		can_do = false;
+		drbd_info(peer_device, "peer does not support WRITE_SAME\n");
+	}
+
+	if (o) {
+		/* logical block size; queue_logical_block_size(NULL) is 512 */
+		unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
+		unsigned int me_lbs_b = queue_logical_block_size(b);
+		unsigned int me_lbs = queue_logical_block_size(q);
+
+		if (me_lbs_b != me_lbs) {
+			drbd_warn(device,
+				"logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
+				me_lbs, me_lbs_b);
+			/* rather disable write same than trigger some BUG_ON later in the scsi layer. */
+			can_do = false;
+		}
+		if (me_lbs_b != peer_lbs) {
+			drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
+				me_lbs, peer_lbs);
+			if (can_do) {
+				drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
+				can_do = false;
+			}
+			me_lbs = max(me_lbs, me_lbs_b);
+			/* We cannot change the logical block size of an in-use queue.
+			 * We can only hope that access happens to be properly aligned.
+			 * If not, the peer will likely produce an IO error, and detach. */
+			if (peer_lbs > me_lbs) {
+				if (device->state.role != R_PRIMARY) {
+					blk_queue_logical_block_size(q, peer_lbs);
+					drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
+				} else {
+					drbd_warn(peer_device,
+						"current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
+						me_lbs, peer_lbs);
+				}
+			}
+		}
+		if (can_do && !o->write_same_capable) {
+			/* If we introduce an open-coded write-same loop on the receiving side,
+			 * the peer would present itself as "capable". */
+			drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
+			can_do = false;
+		}
+	}
+
+	blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
+}
+
 static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
-				   unsigned int max_bio_size)
+				   unsigned int max_bio_size, struct o_qlim *o)
 {
 	struct request_queue * const q = device->rq_queue;
 	unsigned int max_hw_sectors = max_bio_size >> 9;
@@ -1244,15 +1314,15 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 		rcu_read_unlock();
 
 		blk_set_stacking_limits(&q->limits);
-		blk_queue_max_write_same_sectors(q, 0);
 	}
 
-	blk_queue_logical_block_size(q, 512);
 	blk_queue_max_hw_sectors(q, max_hw_sectors);
 	/* This is the workaround for "bio would need to, but cannot, be split" */
 	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
 	blk_queue_segment_boundary(q, PAGE_SIZE-1);
 	decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
+	decide_on_write_same_support(device, q, b, o);
 
 	if (b) {
 		blk_queue_stack_limits(q, b);
@@ -1266,7 +1336,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 	fixup_discard_if_not_supported(q);
 }
 
-void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev)
+void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
 {
 	unsigned int now, new, local, peer;
@@ -1309,7 +1379,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_ba
 	if (new != now)
 		drbd_info(device, "max BIO size = %u\n", new);
 
-	drbd_setup_queue_param(device, bdev, new);
+	drbd_setup_queue_param(device, bdev, new, o);
 }
 
 /* Starts the worker thread */
@@ -1542,7 +1612,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 		drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
 
 	if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
-		drbd_reconsider_queue_parameters(device, device->ldev);
+		drbd_reconsider_queue_parameters(device, device->ldev, NULL);
 
 	drbd_md_sync(device);
@@ -1922,7 +1992,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	device->read_cnt = 0;
 	device->writ_cnt = 0;
 
-	drbd_reconsider_queue_parameters(device, device->ldev);
+	drbd_reconsider_queue_parameters(device, device->ldev, NULL);
 
 	/* If I am currently not R_PRIMARY,
 	 * but meta data primary indicator is set,
...
@@ -64,6 +64,11 @@ enum drbd_packet {
 	P_RS_THIN_REQ         = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
 	P_RS_DEALLOCATED      = 0x33, /* Contains only zeros on sync source node */
 
+	/* REQ_WRITE_SAME.
+	 * On a receiving side without REQ_WRITE_SAME,
+	 * we may fall back to an opencoded loop instead. */
+	P_WSAME               = 0x34,
+
 	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
 	P_MAX_OPT_CMD	      = 0x101,
@@ -110,8 +115,11 @@ struct p_header100 {
 	u32	  pad;
 } __packed;
 
-/* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER	      1 /* depricated */
+/* These defines must not be changed without changing the protocol version.
+ * New defines may only be introduced together with protocol version bump or
+ * new protocol feature flags.
+ */
+#define DP_HARDBARRIER	      1 /* no longer used */
 #define DP_RW_SYNC	      2 /* equals REQ_SYNC */
 #define DP_MAY_SET_IN_SYNC    4
 #define DP_UNPLUG             8 /* not used anymore */
@@ -120,6 +128,7 @@ struct p_header100 {
 #define DP_DISCARD            64 /* equals REQ_DISCARD */
 #define DP_SEND_RECEIVE_ACK  128 /* This is a proto B write request */
 #define DP_SEND_WRITE_ACK    256 /* This is a proto C write request */
+#define DP_WSAME             512 /* equiv. REQ_WRITE_SAME */
 
 struct p_data {
 	u64	    sector;    /* 64 bits sector number */
@@ -133,6 +142,11 @@ struct p_trim {
 	u32	    size;	/* == bio->bi_size */
 } __packed;
 
+struct p_wsame {
+	struct p_data	p_data;
+	u32             size;     /* == bio->bi_size */
+} __packed;
+
 /*
  * commands which share a struct:
  *  p_block_ack:
@@ -164,8 +178,23 @@ struct p_block_req {
  *   ReportParams
  */
 
-#define FF_TRIM		1
-#define FF_THIN_RESYNC	2
+/* supports TRIM/DISCARD on the "wire" protocol */
+#define DRBD_FF_TRIM 1
+
+/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
+ * instead of fully allocate a supposedly thin volume on initial resync */
+#define DRBD_FF_THIN_RESYNC 2
+
+/* supports REQ_WRITE_SAME on the "wire" protocol.
+ * Note: this flag is overloaded,
+ * its presence also
+ *	- indicates support for 128 MiB "batch bios",
+ *	  max discard size of 128 MiB
+ *	  instead of 4M before that.
+ *	- indicates that we exchange additional settings in p_sizes
+ *	  drbd_send_sizes()/receive_sizes()
+ */
+#define DRBD_FF_WSAME 4
 
 struct p_connection_features {
 	u32 protocol_min;
@@ -240,6 +269,40 @@ struct p_rs_uuid {
 	u64	    uuid;
 } __packed;
 
+/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
+ * see also struct queue_limits, as of late 2015 */
+struct o_qlim {
+	/* we don't need it yet, but we may as well communicate it now */
+	u32 physical_block_size;
+
+	/* so the original in struct queue_limits is unsigned short,
+	 * but I'd have to put in padding anyways. */
+	u32 logical_block_size;
+
+	/* One incoming bio becomes one DRBD request,
+	 * which may be translated to several bio on the receiving side.
+	 * We don't need to communicate chunk/boundary/segment ... limits.
+	 */
+
+	/* various IO hints may be useful with "diskless client" setups */
+	u32 alignment_offset;
+	u32 io_min;
+	u32 io_opt;
+
+	/* We may need to communicate integrity stuff at some point,
+	 * but let's not get ahead of ourselves. */
+
+	/* Backend discard capabilities.
+	 * Receiving side uses "blkdev_issue_discard()", no need to communicate
+	 * more specifics. If the backend cannot do discards, the DRBD peer
+	 * may fall back to blkdev_issue_zeroout().
+	 */
+	u8 discard_enabled;
+	u8 discard_zeroes_data;
+	u8 write_same_capable;
+	u8 _pad;
+} __packed;
+
 struct p_sizes {
 	u64	    d_size;  /* size of disk */
 	u64	    u_size;  /* user requested size */
@@ -247,6 +310,9 @@ struct p_sizes {
 	u32	    max_bio_size;  /* Maximal size of a BIO */
 	u16	    queue_order_type;  /* not yet implemented in DRBD*/
 	u16	    dds_flags; /* use enum dds_flags here. */
+
+	/* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
+	struct o_qlim qlim[0];
 } __packed;
 
 struct p_state {
...
This diff is collapsed.
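The qlim[0] member declared above is a zero-length trailer: struct p_sizes keeps its old wire size unless DRBD_FF_WSAME was negotiated, in which case exactly one struct o_qlim follows it (see the packet_size handling in drbd_send_sizes() above). A hedged receiver-side sketch of how such a trailer could be located; the helper is illustrative only, the actual parsing happens in receive_sizes() inside the collapsed diff above.

/* Illustrative sketch only (hypothetical helper): find the optional
 * o_qlim trailer behind a received struct p_sizes.  Peers without
 * DRBD_FF_WSAME never send it. */
static struct o_qlim *sketch_p_sizes_qlim(struct drbd_connection *connection,
					  struct p_sizes *p, unsigned int pi_size)
{
	if (!(connection->agreed_features & DRBD_FF_WSAME))
		return NULL;		/* old peer: packet ends after p_sizes */
	if (pi_size < sizeof(*p) + sizeof(p->qlim[0]))
		return NULL;		/* trailer expected but not present */
	return p->qlim;			/* qlim[0] starts right after p_sizes */
}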
@@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
 		      &device->vdisk->part0, req->start_jif);
 }
 
-static struct drbd_request *drbd_req_new(struct drbd_device *device,
-					 struct bio *bio_src)
+static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
 {
 	struct drbd_request *req;
@@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
 	memset(req, 0, sizeof(*req));
 	drbd_req_make_private_bio(req, bio_src);
-	req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
-	req->device = device;
-	req->master_bio = bio_src;
-	req->epoch = 0;
+	req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
+		      | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+		      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
+	req->device = device;
+	req->master_bio = bio_src;
+	req->epoch = 0;
 
 	drbd_clear_interval(&req->i);
 	req->i.sector = bio_src->bi_iter.bi_sector;
...
@@ -206,6 +206,8 @@ enum drbd_req_state_bits {
 	/* Set when this is a write, clear for a read */
 	__RQ_WRITE,
+	__RQ_WSAME,
+	__RQ_UNMAP,
 
 	/* Should call drbd_al_complete_io() for this request... */
 	__RQ_IN_ACT_LOG,
@@ -241,10 +243,11 @@ enum drbd_req_state_bits {
 #define RQ_NET_OK          (1UL << __RQ_NET_OK)
 #define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
 
-/* 0x1f8 */
 #define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
 
 #define RQ_WRITE           (1UL << __RQ_WRITE)
+#define RQ_WSAME           (1UL << __RQ_WSAME)
+#define RQ_UNMAP           (1UL << __RQ_UNMAP)
 #define RQ_IN_ACT_LOG      (1UL << __RQ_IN_ACT_LOG)
 #define RQ_POSTPONED       (1UL << __RQ_POSTPONED)
 #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
...
@@ -320,6 +320,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
 		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
 		ahash_request_set_crypt(req, &sg, NULL, sg.length);
 		crypto_ahash_update(req);
+		/* REQ_OP_WRITE_SAME has only one segment,
+		 * checksum the payload only once. */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
 	}
 	ahash_request_set_crypt(req, NULL, digest, 0);
 	crypto_ahash_final(req);
@@ -387,7 +391,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
 	/* GFP_TRY, because if there is no memory available right now, this may
 	 * be rescheduled for later. It is "only" background resync, after all. */
 	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
-				       size, true /* has real payload */, GFP_TRY);
+				       size, size, GFP_TRY);
 	if (!peer_req)
 		goto defer;
@@ -603,7 +607,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
 			return 0;
 		}
 
-		if (connection->agreed_features & FF_THIN_RESYNC) {
+		if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
 			rcu_read_lock();
 			discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
 			rcu_read_unlock();
...