Commit 929cfdd5 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-2.6.40/drivers' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.40/drivers' of git://git.kernel.dk/linux-2.6-block: (110 commits)
  loop: handle on-demand devices correctly
  loop: limit 'max_part' module param to DISK_MAX_PARTS
  drbd: fix warning
  drbd: fix warning
  drbd: Fix spelling
  drbd: fix schedule in atomic
  drbd: Take a more conservative approach when deciding max_bio_size
  drbd: Fixed state transitions after async outdate-peer-handler returned
  drbd: Disallow the peer_disk_state to be D_OUTDATED while connected
  drbd: Fix for the connection problems on high latency links
  drbd: fix potential activity log refcount imbalance in error path
  drbd: Only downgrade the disk state in case of disk failures
  drbd: fix disconnect/reconnect loop, if ping-timeout == ping-int
  drbd: fix potential distributed deadlock
  lru_cache.h: fix comments referring to ts_ instead of lc_
  drbd: Fix for application IO with the on-io-error=pass-on policy
  xen/p2m: Add EXPORT_SYMBOL_GPL to the M2P override functions.
  xen/p2m/m2p/gnttab: Support GNTMAP_host_map in the M2P override.
  xen/blkback: don't fail empty barrier requests
  xen/blkback: fix xenbus_transaction_start() hang caused by double xenbus_transaction_end()
  ...
parents 798ce8f1 a1c15c59
...@@ -169,3 +169,18 @@ is issued which positions the tape to a known position. Typically you ...@@ -169,3 +169,18 @@ is issued which positions the tape to a known position. Typically you
must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example) must rewind the tape (by issuing "mt -f /dev/st0 rewind" for example)
before i/o can proceed again to a tape drive which was reset. before i/o can proceed again to a tape drive which was reset.
There is a cciss_tape_cmds module parameter which can be used to make cciss
allocate more commands for use by tape drives. Ordinarily only a few commands
(6) are allocated for tape drives because tape drives are slow and
infrequently used and the primary purpose of Smart Array controllers is to
act as a RAID controller for disk drives, so the vast majority of commands
are allocated for disk devices. However, if you have more than a few tape
drives attached to a smart array, the default number of commands may not be
enough (for example, if you have 8 tape drives, you could only rewind 6
at one time with the default number of commands.) The cciss_tape_cmds module
parameter allows more commands (up to 16 more) to be allocated for use by
tape drives. For example:
insmod cciss.ko cciss_tape_cmds=16
Or, as a kernel boot parameter passed in via grub: cciss.cciss_tape_cmds=8
...@@ -470,6 +470,27 @@ config XEN_BLKDEV_FRONTEND ...@@ -470,6 +470,27 @@ config XEN_BLKDEV_FRONTEND
block device driver. It communicates with a back-end driver block device driver. It communicates with a back-end driver
in another domain which drives the actual block device. in another domain which drives the actual block device.
config XEN_BLKDEV_BACKEND
tristate "Block-device backend driver"
depends on XEN_BACKEND
help
The block-device backend driver allows the kernel to export its
block devices to other guests via a high-performance shared-memory
interface.
The corresponding Linux frontend driver is enabled by the
CONFIG_XEN_BLKDEV_FRONTEND configuration option.
The backend driver attaches itself to any block device specified
in the XenBus configuration. There are no limits to what the block
device can be, as long as it has a major and minor.
If you are compiling a kernel to run in a Xen block backend driver
domain (often this is domain 0) you should say Y here. To
compile this driver as a module, choose M here: the module
will be called xen-blkback.
config VIRTIO_BLK config VIRTIO_BLK
tristate "Virtio block driver (EXPERIMENTAL)" tristate "Virtio block driver (EXPERIMENTAL)"
depends on EXPERIMENTAL && VIRTIO depends on EXPERIMENTAL && VIRTIO
......
...@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o ...@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
......
This diff is collapsed.
...@@ -200,7 +200,7 @@ struct ctlr_info ...@@ -200,7 +200,7 @@ struct ctlr_info
* the above. * the above.
*/ */
#define CCISS_BOARD_READY_WAIT_SECS (120) #define CCISS_BOARD_READY_WAIT_SECS (120)
#define CCISS_BOARD_NOT_READY_WAIT_SECS (10) #define CCISS_BOARD_NOT_READY_WAIT_SECS (100)
#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) #define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
#define CCISS_BOARD_READY_ITERATIONS \ #define CCISS_BOARD_READY_ITERATIONS \
((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
...@@ -209,8 +209,9 @@ struct ctlr_info ...@@ -209,8 +209,9 @@ struct ctlr_info
((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \ ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
CCISS_BOARD_READY_POLL_INTERVAL_MSECS) CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
#define CCISS_POST_RESET_PAUSE_MSECS (3000) #define CCISS_POST_RESET_PAUSE_MSECS (3000)
#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) #define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000)
#define CCISS_POST_RESET_NOOP_RETRIES (12) #define CCISS_POST_RESET_NOOP_RETRIES (12)
#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000)
/* /*
Send the command to the hardware Send the command to the hardware
...@@ -239,11 +240,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val) ...@@ -239,11 +240,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val)
{ /* Turn interrupts on */ { /* Turn interrupts on */
h->interrupts_enabled = 1; h->interrupts_enabled = 1;
writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
} else /* Turn them off */ } else /* Turn them off */
{ {
h->interrupts_enabled = 0; h->interrupts_enabled = 0;
writel( SA5_INTR_OFF, writel( SA5_INTR_OFF,
h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
} }
} }
/* /*
...@@ -257,11 +260,13 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val) ...@@ -257,11 +260,13 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val)
{ /* Turn interrupts on */ { /* Turn interrupts on */
h->interrupts_enabled = 1; h->interrupts_enabled = 1;
writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
} else /* Turn them off */ } else /* Turn them off */
{ {
h->interrupts_enabled = 0; h->interrupts_enabled = 0;
writel( SA5B_INTR_OFF, writel( SA5B_INTR_OFF,
h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
} }
} }
...@@ -271,10 +276,12 @@ static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val) ...@@ -271,10 +276,12 @@ static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val)
if (val) { /* turn on interrupts */ if (val) { /* turn on interrupts */
h->interrupts_enabled = 1; h->interrupts_enabled = 1;
writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
} else { } else {
h->interrupts_enabled = 0; h->interrupts_enabled = 0;
writel(SA5_PERF_INTR_OFF, writel(SA5_PERF_INTR_OFF,
h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
(void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
} }
} }
......
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#define CFGTBL_ChangeReq 0x00000001l #define CFGTBL_ChangeReq 0x00000001l
#define CFGTBL_AccCmds 0x00000001l #define CFGTBL_AccCmds 0x00000001l
#define DOORBELL_CTLR_RESET 0x00000004l #define DOORBELL_CTLR_RESET 0x00000004l
#define DOORBELL_CTLR_RESET2 0x00000020l
#define CFGTBL_Trans_Simple 0x00000002l #define CFGTBL_Trans_Simple 0x00000002l
#define CFGTBL_Trans_Performant 0x00000004l #define CFGTBL_Trans_Performant 0x00000004l
...@@ -142,6 +143,14 @@ typedef struct _ReadCapdata_struct_16 ...@@ -142,6 +143,14 @@ typedef struct _ReadCapdata_struct_16
#define BMIC_CACHE_FLUSH 0xc2 #define BMIC_CACHE_FLUSH 0xc2
#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ #define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */
#define CCISS_ABORT_MSG 0x00
#define CCISS_RESET_MSG 0x01
#define CCISS_RESET_TYPE_CONTROLLER 0x00
#define CCISS_RESET_TYPE_BUS 0x01
#define CCISS_RESET_TYPE_TARGET 0x03
#define CCISS_RESET_TYPE_LUN 0x04
#define CCISS_NOOP_MSG 0x03
/* Command List Structure */ /* Command List Structure */
#define CTLR_LUNID "\0\0\0\0\0\0\0\0" #define CTLR_LUNID "\0\0\0\0\0\0\0\0"
...@@ -235,6 +244,8 @@ typedef struct _CfgTable_struct { ...@@ -235,6 +244,8 @@ typedef struct _CfgTable_struct {
u8 reserved[0x78 - 0x58]; u8 reserved[0x78 - 0x58];
u32 misc_fw_support; /* offset 0x78 */ u32 misc_fw_support; /* offset 0x78 */
#define MISC_FW_DOORBELL_RESET (0x02) #define MISC_FW_DOORBELL_RESET (0x02)
#define MISC_FW_DOORBELL_RESET2 (0x10)
u8 driver_version[32];
} CfgTable_struct; } CfgTable_struct;
struct TransTable_struct { struct TransTable_struct {
......
...@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = { ...@@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = {
.proc_name = "cciss", .proc_name = "cciss",
.proc_info = cciss_scsi_proc_info, .proc_info = cciss_scsi_proc_info,
.queuecommand = cciss_scsi_queue_command, .queuecommand = cciss_scsi_queue_command,
.can_queue = SCSI_CCISS_CAN_QUEUE,
.this_id = 7, .this_id = 7,
.cmd_per_lun = 1, .cmd_per_lun = 1,
.use_clustering = DISABLE_CLUSTERING, .use_clustering = DISABLE_CLUSTERING,
...@@ -108,16 +107,13 @@ struct cciss_scsi_cmd_stack_elem_t { ...@@ -108,16 +107,13 @@ struct cciss_scsi_cmd_stack_elem_t {
#pragma pack() #pragma pack()
#define CMD_STACK_SIZE (SCSI_CCISS_CAN_QUEUE * \
CCISS_MAX_SCSI_DEVS_PER_HBA + 2)
// plus two for init time usage
#pragma pack(1) #pragma pack(1)
struct cciss_scsi_cmd_stack_t { struct cciss_scsi_cmd_stack_t {
struct cciss_scsi_cmd_stack_elem_t *pool; struct cciss_scsi_cmd_stack_elem_t *pool;
struct cciss_scsi_cmd_stack_elem_t *elem[CMD_STACK_SIZE]; struct cciss_scsi_cmd_stack_elem_t **elem;
dma_addr_t cmd_pool_handle; dma_addr_t cmd_pool_handle;
int top; int top;
int nelems;
}; };
#pragma pack() #pragma pack()
...@@ -191,7 +187,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c) ...@@ -191,7 +187,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
sa = h->scsi_ctlr; sa = h->scsi_ctlr;
stk = &sa->cmd_stack; stk = &sa->cmd_stack;
stk->top++; stk->top++;
if (stk->top >= CMD_STACK_SIZE) { if (stk->top >= stk->nelems) {
dev_err(&h->pdev->dev, dev_err(&h->pdev->dev,
"scsi_cmd_free called too many times.\n"); "scsi_cmd_free called too many times.\n");
BUG(); BUG();
...@@ -206,13 +202,14 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) ...@@ -206,13 +202,14 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
struct cciss_scsi_cmd_stack_t *stk; struct cciss_scsi_cmd_stack_t *stk;
size_t size; size_t size;
stk = &sa->cmd_stack;
stk->nelems = cciss_tape_cmds + 2;
sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
h->chainsize, CMD_STACK_SIZE); h->chainsize, stk->nelems);
if (!sa->cmd_sg_list && h->chainsize > 0) if (!sa->cmd_sg_list && h->chainsize > 0)
return -ENOMEM; return -ENOMEM;
stk = &sa->cmd_stack; size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE;
/* Check alignment, see cciss_cmd.h near CommandList_struct def. */ /* Check alignment, see cciss_cmd.h near CommandList_struct def. */
BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0);
...@@ -221,18 +218,23 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) ...@@ -221,18 +218,23 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle); pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle);
if (stk->pool == NULL) { if (stk->pool == NULL) {
cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
sa->cmd_sg_list = NULL; sa->cmd_sg_list = NULL;
return -ENOMEM; return -ENOMEM;
} }
stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL);
for (i=0; i<CMD_STACK_SIZE; i++) { if (!stk->elem) {
pci_free_consistent(h->pdev, size, stk->pool,
stk->cmd_pool_handle);
return -1;
}
for (i = 0; i < stk->nelems; i++) {
stk->elem[i] = &stk->pool[i]; stk->elem[i] = &stk->pool[i];
stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
(sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
stk->elem[i]->cmdindex = i; stk->elem[i]->cmdindex = i;
} }
stk->top = CMD_STACK_SIZE-1; stk->top = stk->nelems-1;
return 0; return 0;
} }
...@@ -245,16 +247,18 @@ scsi_cmd_stack_free(ctlr_info_t *h) ...@@ -245,16 +247,18 @@ scsi_cmd_stack_free(ctlr_info_t *h)
sa = h->scsi_ctlr; sa = h->scsi_ctlr;
stk = &sa->cmd_stack; stk = &sa->cmd_stack;
if (stk->top != CMD_STACK_SIZE-1) { if (stk->top != stk->nelems-1) {
dev_warn(&h->pdev->dev, dev_warn(&h->pdev->dev,
"bug: %d scsi commands are still outstanding.\n", "bug: %d scsi commands are still outstanding.\n",
CMD_STACK_SIZE - stk->top); stk->nelems - stk->top);
} }
size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle); pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle);
stk->pool = NULL; stk->pool = NULL;
cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
kfree(stk->elem);
stk->elem = NULL;
} }
#if 0 #if 0
...@@ -859,6 +863,7 @@ cciss_scsi_detect(ctlr_info_t *h) ...@@ -859,6 +863,7 @@ cciss_scsi_detect(ctlr_info_t *h)
sh->io_port = 0; // good enough? FIXME, sh->io_port = 0; // good enough? FIXME,
sh->n_io_port = 0; // I don't think we use these two... sh->n_io_port = 0; // I don't think we use these two...
sh->this_id = SELF_SCSI_ID; sh->this_id = SELF_SCSI_ID;
sh->can_queue = cciss_tape_cmds;
sh->sg_tablesize = h->maxsgentries; sh->sg_tablesize = h->maxsgentries;
sh->max_cmd_len = MAX_COMMAND_SIZE; sh->max_cmd_len = MAX_COMMAND_SIZE;
......
...@@ -36,13 +36,9 @@ ...@@ -36,13 +36,9 @@
addressible natively, and may in fact turn addressible natively, and may in fact turn
out to be not scsi at all. */ out to be not scsi at all. */
#define SCSI_CCISS_CAN_QUEUE 2
/* /*
Note, cmd_per_lun could give us some trouble, so I'm setting it very low.
Likewise, SCSI_CCISS_CAN_QUEUE is set very conservatively.
If the upper scsi layer tries to track how many commands we have If the upper scsi layer tries to track how many commands we have
outstanding, it will be operating under the misapprehension that it is outstanding, it will be operating under the misapprehension that it is
the only one sending us requests. We also have the block interface, the only one sending us requests. We also have the block interface,
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "drbd_int.h" #include "drbd_int.h"
#include "drbd_wrappers.h" #include "drbd_wrappers.h"
/* We maintain a trivial check sum in our on disk activity log. /* We maintain a trivial checksum in our on disk activity log.
* With that we can ensure correct operation even when the storage * With that we can ensure correct operation even when the storage
* device might do a partial (last) sector write while losing power. * device might do a partial (last) sector write while losing power.
*/ */
......
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
* as we are "attached" to a local disk, which at 32 GiB for 1PiB storage * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
* seems excessive. * seems excessive.
* *
* We plan to reduce the amount of in-core bitmap pages by pageing them in * We plan to reduce the amount of in-core bitmap pages by paging them in
* and out against their on-disk location as necessary, but need to make * and out against their on-disk location as necessary, but need to make
* sure we don't cause too much meta data IO, and must not deadlock in * sure we don't cause too much meta data IO, and must not deadlock in
* tight memory situations. This needs some more work. * tight memory situations. This needs some more work.
...@@ -200,7 +200,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) ...@@ -200,7 +200,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
* we if bits have been cleared since last IO. */ * we if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT 28 #define BM_PAGE_LAZY_WRITEOUT 28
/* store_page_idx uses non-atomic assingment. It is only used directly after /* store_page_idx uses non-atomic assignment. It is only used directly after
* allocating the page. All other bm_set_page_* and bm_clear_page_* need to * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
* use atomic bit manipulation, as set_out_of_sync (and therefore bitmap * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
* changes) may happen from various contexts, and wait_on_bit/wake_up_bit * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
...@@ -318,7 +318,7 @@ static void bm_unmap(unsigned long *p_addr) ...@@ -318,7 +318,7 @@ static void bm_unmap(unsigned long *p_addr)
/* word offset from start of bitmap to word number _in_page_ /* word offset from start of bitmap to word number _in_page_
* modulo longs per page * modulo longs per page
#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
so do it explicitly: so do it explicitly:
*/ */
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
......
...@@ -700,7 +700,7 @@ struct drbd_request { ...@@ -700,7 +700,7 @@ struct drbd_request {
* see drbd_endio_pri(). */ * see drbd_endio_pri(). */
struct bio *private_bio; struct bio *private_bio;
struct hlist_node colision; struct hlist_node collision;
sector_t sector; sector_t sector;
unsigned int size; unsigned int size;
unsigned int epoch; /* barrier_nr */ unsigned int epoch; /* barrier_nr */
...@@ -766,7 +766,7 @@ struct digest_info { ...@@ -766,7 +766,7 @@ struct digest_info {
struct drbd_epoch_entry { struct drbd_epoch_entry {
struct drbd_work w; struct drbd_work w;
struct hlist_node colision; struct hlist_node collision;
struct drbd_epoch *epoch; /* for writes */ struct drbd_epoch *epoch; /* for writes */
struct drbd_conf *mdev; struct drbd_conf *mdev;
struct page *pages; struct page *pages;
...@@ -1129,6 +1129,8 @@ struct drbd_conf { ...@@ -1129,6 +1129,8 @@ struct drbd_conf {
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
int rs_planed; /* resync sectors already planned */ int rs_planed; /* resync sectors already planned */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
int peer_max_bio_size;
int local_max_bio_size;
}; };
static inline struct drbd_conf *minor_to_mdev(unsigned int minor) static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
...@@ -1218,8 +1220,6 @@ extern void drbd_free_resources(struct drbd_conf *mdev); ...@@ -1218,8 +1220,6 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
unsigned int set_size); unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev); extern void tl_clear(struct drbd_conf *mdev);
enum drbd_req_event;
extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev); extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
...@@ -1434,6 +1434,7 @@ struct bm_extent { ...@@ -1434,6 +1434,7 @@ struct bm_extent {
* hash table. */ * hash table. */
#define HT_SHIFT 8 #define HT_SHIFT 8
#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ #define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
...@@ -1518,9 +1519,9 @@ extern void drbd_resume_io(struct drbd_conf *mdev); ...@@ -1518,9 +1519,9 @@ extern void drbd_resume_io(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size); extern char *ppsize(char *buf, unsigned long long size);
extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *); extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
enum drbd_role new_role, enum drbd_role new_role,
int force); int force);
...@@ -1828,6 +1829,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, ...@@ -1828,6 +1829,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach,
if (!forcedetach) { if (!forcedetach) {
if (__ratelimit(&drbd_ratelimit_state)) if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where); dev_err(DEV, "Local IO failed in %s.\n", where);
if (mdev->state.disk > D_INCONSISTENT)
_drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
break; break;
} }
/* NOTE fall through to detach case if forcedetach set */ /* NOTE fall through to detach case if forcedetach set */
...@@ -2153,6 +2156,10 @@ static inline int get_net_conf(struct drbd_conf *mdev) ...@@ -2153,6 +2156,10 @@ static inline int get_net_conf(struct drbd_conf *mdev)
static inline void put_ldev(struct drbd_conf *mdev) static inline void put_ldev(struct drbd_conf *mdev)
{ {
int i = atomic_dec_return(&mdev->local_cnt); int i = atomic_dec_return(&mdev->local_cnt);
/* This may be called from some endio handler,
* so we must not sleep here. */
__release(local); __release(local);
D_ASSERT(i >= 0); D_ASSERT(i >= 0);
if (i == 0) { if (i == 0) {
......
...@@ -745,6 +745,9 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) ...@@ -745,6 +745,9 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
mdev->agreed_pro_version < 88) mdev->agreed_pro_version < 88)
rv = SS_NOT_SUPPORTED; rv = SS_NOT_SUPPORTED;
else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
rv = SS_CONNECTED_OUTDATES;
return rv; return rv;
} }
...@@ -1565,6 +1568,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, ...@@ -1565,6 +1568,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
put_ldev(mdev); put_ldev(mdev);
} }
/* Notify peer that I had a local IO error, and did not detach. */
if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
drbd_send_state(mdev);
/* Disks got bigger while they were detached */ /* Disks got bigger while they were detached */
if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
...@@ -2064,7 +2071,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl ...@@ -2064,7 +2071,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
{ {
struct p_sizes p; struct p_sizes p;
sector_t d_size, u_size; sector_t d_size, u_size;
int q_order_type; int q_order_type, max_bio_size;
int ok; int ok;
if (get_ldev_if_state(mdev, D_NEGOTIATING)) { if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
...@@ -2072,17 +2079,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl ...@@ -2072,17 +2079,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
d_size = drbd_get_max_capacity(mdev->ldev); d_size = drbd_get_max_capacity(mdev->ldev);
u_size = mdev->ldev->dc.disk_size; u_size = mdev->ldev->dc.disk_size;
q_order_type = drbd_queue_order_type(mdev); q_order_type = drbd_queue_order_type(mdev);
max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
put_ldev(mdev); put_ldev(mdev);
} else { } else {
d_size = 0; d_size = 0;
u_size = 0; u_size = 0;
q_order_type = QUEUE_ORDERED_NONE; q_order_type = QUEUE_ORDERED_NONE;
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
} }
p.d_size = cpu_to_be64(d_size); p.d_size = cpu_to_be64(d_size);
p.u_size = cpu_to_be64(u_size); p.u_size = cpu_to_be64(u_size);
p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9); p.max_bio_size = cpu_to_be32(max_bio_size);
p.queue_order_type = cpu_to_be16(q_order_type); p.queue_order_type = cpu_to_be16(q_order_type);
p.dds_flags = cpu_to_be16(flags); p.dds_flags = cpu_to_be16(flags);
...@@ -2722,7 +2732,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) ...@@ -2722,7 +2732,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
/* double check digest, sometimes buffers have been modified in flight. */ /* double check digest, sometimes buffers have been modified in flight. */
if (dgs > 0 && dgs <= 64) { if (dgs > 0 && dgs <= 64) {
/* 64 byte, 512 bit, is the larges digest size /* 64 byte, 512 bit, is the largest digest size
* currently supported in kernel crypto. */ * currently supported in kernel crypto. */
unsigned char digest[64]; unsigned char digest[64];
drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
...@@ -3041,6 +3051,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) ...@@ -3041,6 +3051,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
mdev->agreed_pro_version = PRO_VERSION_MAX; mdev->agreed_pro_version = PRO_VERSION_MAX;
mdev->write_ordering = WO_bdev_flush; mdev->write_ordering = WO_bdev_flush;
mdev->resync_wenr = LC_FREE; mdev->resync_wenr = LC_FREE;
mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
} }
void drbd_mdev_cleanup(struct drbd_conf *mdev) void drbd_mdev_cleanup(struct drbd_conf *mdev)
...@@ -3275,7 +3287,7 @@ static void drbd_delete_device(unsigned int minor) ...@@ -3275,7 +3287,7 @@ static void drbd_delete_device(unsigned int minor)
drbd_release_ee_lists(mdev); drbd_release_ee_lists(mdev);
/* should be free'd on disconnect? */ /* should be freed on disconnect? */
kfree(mdev->ee_hash); kfree(mdev->ee_hash);
/* /*
mdev->ee_hash_s = 0; mdev->ee_hash_s = 0;
...@@ -3415,7 +3427,9 @@ struct drbd_conf *drbd_new_device(unsigned int minor) ...@@ -3415,7 +3427,9 @@ struct drbd_conf *drbd_new_device(unsigned int minor)
q->backing_dev_info.congested_data = mdev; q->backing_dev_info.congested_data = mdev;
blk_queue_make_request(q, drbd_make_request); blk_queue_make_request(q, drbd_make_request);
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9); /* Setting the max_hw_sectors to an odd value of 8kibyte here
This triggers a max_bio_size message upon first attach or connect */
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
blk_queue_merge_bvec(q, drbd_merge_bvec); blk_queue_merge_bvec(q, drbd_merge_bvec);
q->queue_lock = &mdev->req_lock; q->queue_lock = &mdev->req_lock;
...@@ -3627,7 +3641,8 @@ struct meta_data_on_disk { ...@@ -3627,7 +3641,8 @@ struct meta_data_on_disk {
/* `-- act_log->nr_elements <-- sync_conf.al_extents */ /* `-- act_log->nr_elements <-- sync_conf.al_extents */
u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_offset; /* offset to the bitmap, from here */
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
u32 reserved_u32[4]; u32 la_peer_max_bio_size; /* last peer max_bio_size */
u32 reserved_u32[3];
} __packed; } __packed;
...@@ -3668,6 +3683,7 @@ void drbd_md_sync(struct drbd_conf *mdev) ...@@ -3668,6 +3683,7 @@ void drbd_md_sync(struct drbd_conf *mdev)
buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset; sector = mdev->ldev->md.md_offset;
...@@ -3751,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) ...@@ -3751,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
spin_lock_irq(&mdev->req_lock);
if (mdev->state.conn < C_CONNECTED) {
int peer;
peer = be32_to_cpu(buffer->la_peer_max_bio_size);
peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
mdev->peer_max_bio_size = peer;
}
spin_unlock_irq(&mdev->req_lock);
if (mdev->sync_conf.al_extents < 7) if (mdev->sync_conf.al_extents < 7)
mdev->sync_conf.al_extents = 127; mdev->sync_conf.al_extents = 127;
......
...@@ -272,9 +272,28 @@ static int _try_outdate_peer_async(void *data) ...@@ -272,9 +272,28 @@ static int _try_outdate_peer_async(void *data)
{ {
struct drbd_conf *mdev = (struct drbd_conf *)data; struct drbd_conf *mdev = (struct drbd_conf *)data;
enum drbd_disk_state nps; enum drbd_disk_state nps;
union drbd_state ns;
nps = drbd_try_outdate_peer(mdev); nps = drbd_try_outdate_peer(mdev);
drbd_request_state(mdev, NS(pdsk, nps));
/* Not using
drbd_request_state(mdev, NS(pdsk, nps));
here, because we might have been able to re-establish the connection
in the meantime. This can only partially be solved in the state
engine's is_valid_state() and is_valid_state_transition()
functions.
nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
therefore we have to have the pre state change check here.
*/
spin_lock_irq(&mdev->req_lock);
ns = mdev->state;
if (ns.conn < C_WF_REPORT_PARAMS) {
ns.pdsk = nps;
_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
}
spin_unlock_irq(&mdev->req_lock);
return 0; return 0;
} }
...@@ -577,7 +596,7 @@ void drbd_resume_io(struct drbd_conf *mdev) ...@@ -577,7 +596,7 @@ void drbd_resume_io(struct drbd_conf *mdev)
* Returns 0 on success, negative return values indicate errors. * Returns 0 on success, negative return values indicate errors.
* You should call drbd_md_sync() after calling this function. * You should call drbd_md_sync() after calling this function.
*/ */
enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{ {
sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t prev_first_sect, prev_size; /* previous meta location */
sector_t la_size; sector_t la_size;
...@@ -773,30 +792,78 @@ static int drbd_check_al_size(struct drbd_conf *mdev) ...@@ -773,30 +792,78 @@ static int drbd_check_al_size(struct drbd_conf *mdev)
return 0; return 0;
} }
void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local) static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size)
{ {
struct request_queue * const q = mdev->rq_queue; struct request_queue * const q = mdev->rq_queue;
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; int max_hw_sectors = max_bio_size >> 9;
int max_segments = mdev->ldev->dc.max_bio_bvecs; int max_segments = 0;
int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
if (get_ldev_if_state(mdev, D_ATTACHING)) {
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
max_segments = mdev->ldev->dc.max_bio_bvecs;
put_ldev(mdev);
}
blk_queue_logical_block_size(q, 512); blk_queue_logical_block_size(q, 512);
blk_queue_max_hw_sectors(q, max_hw_sectors); blk_queue_max_hw_sectors(q, max_hw_sectors);
/* This is the workaround for "bio would need to, but cannot, be split" */ /* This is the workaround for "bio would need to, but cannot, be split" */
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
blk_queue_stack_limits(q, b);
dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9); if (get_ldev_if_state(mdev, D_ATTACHING)) {
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
blk_queue_stack_limits(q, b);
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
q->backing_dev_info.ra_pages, q->backing_dev_info.ra_pages,
b->backing_dev_info.ra_pages); b->backing_dev_info.ra_pages);
q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
}
put_ldev(mdev);
} }
} }
/**
 * drbd_reconsider_max_bio_size() - recompute and apply the maximum BIO size
 * @mdev:	DRBD device whose request queue limits are to be updated.
 *
 * Picks the smaller of the local limit and the peer limit (both in bytes)
 * and passes it to drbd_setup_queue_param().  Logs an error if the limit
 * would shrink while this node is Primary, and logs the new value whenever
 * it differs from the limit currently set on the queue.
 */
void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
{
	int now, new, local, peer;

	/* queue limits are kept in 512-byte sectors; "<< 9" converts to bytes */
	now = queue_max_hw_sectors(mdev->rq_queue) << 9;
	/* Start from the last known values; they may be refreshed below. */
	local = mdev->local_max_bio_size; /* possibly stale, kept in volatile memory */
	peer = mdev->peer_max_bio_size; /* possibly stale, last value from meta data */

	/* With a backing device available, use its real limit and remember it. */
	if (get_ldev_if_state(mdev, D_ATTACHING)) {
		local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
		mdev->local_max_bio_size = local;
		put_ldev(mdev);
	}

	/* We may ignore the peer's limit if the peer is modern enough:
	 * from 8.3.8 onwards the peer can use multiple BIOs for a single
	 * peer_request. */
	if (mdev->state.conn >= C_CONNECTED) {
		if (mdev->agreed_pro_version < 94)
			peer = mdev->peer_max_bio_size;
		else if (mdev->agreed_pro_version == 94)
			peer = DRBD_MAX_SIZE_H80_PACKET;
		else /* drbd 8.3.8 onwards */
			peer = DRBD_MAX_BIO_SIZE;
	}

	new = min_t(int, local, peer);

	/* Shrinking the limit while Primary is reported as a failed assertion. */
	if (mdev->state.role == R_PRIMARY && new < now)
		dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now);

	/* "new" is an int, so print it with %d, matching the dev_err() above */
	if (new != now)
		dev_info(DEV, "max BIO size = %d\n", new);

	drbd_setup_queue_param(mdev, new);
}
/* serialize deconfig (worker exiting, doing cleanup) /* serialize deconfig (worker exiting, doing cleanup)
* and reconfig (drbdsetup disk, drbdsetup net) * and reconfig (drbdsetup disk, drbdsetup net)
* *
...@@ -865,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp ...@@ -865,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
struct block_device *bdev; struct block_device *bdev;
struct lru_cache *resync_lru = NULL; struct lru_cache *resync_lru = NULL;
union drbd_state ns, os; union drbd_state ns, os;
unsigned int max_bio_size;
enum drbd_state_rv rv; enum drbd_state_rv rv;
int cp_discovered = 0; int cp_discovered = 0;
int logical_block_size; int logical_block_size;
...@@ -1117,20 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp ...@@ -1117,20 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
mdev->read_cnt = 0; mdev->read_cnt = 0;
mdev->writ_cnt = 0; mdev->writ_cnt = 0;
max_bio_size = DRBD_MAX_BIO_SIZE; drbd_reconsider_max_bio_size(mdev);
if (mdev->state.conn == C_CONNECTED) {
/* We are Primary, Connected, and now attach a new local
* backing store. We must not increase the user visible maximum
* bio size on this device to something the peer may not be
* able to handle. */
if (mdev->agreed_pro_version < 94)
max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
else if (mdev->agreed_pro_version == 94)
max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
/* else: drbd 8.3.9 and later, stay with default */
}
drbd_setup_queue_param(mdev, max_bio_size);
/* If I am currently not R_PRIMARY, /* If I am currently not R_PRIMARY,
* but meta data primary indicator is set, * but meta data primary indicator is set,
...@@ -1152,7 +1205,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp ...@@ -1152,7 +1205,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
!drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
set_bit(USE_DEGR_WFC_T, &mdev->flags); set_bit(USE_DEGR_WFC_T, &mdev->flags);
dd = drbd_determin_dev_size(mdev, 0); dd = drbd_determine_dev_size(mdev, 0);
if (dd == dev_size_error) { if (dd == dev_size_error) {
retcode = ERR_NOMEM_BITMAP; retcode = ERR_NOMEM_BITMAP;
goto force_diskless_dec; goto force_diskless_dec;
...@@ -1281,11 +1334,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp ...@@ -1281,11 +1334,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply) struct drbd_nl_cfg_reply *reply)
{ {
enum drbd_ret_code retcode;
int ret;
drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
if (mdev->state.disk == D_DISKLESS) /* D_FAILED will transition to DISKLESS. */
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); ret = wait_event_interruptible(mdev->misc_wait,
mdev->state.disk != D_FAILED);
drbd_resume_io(mdev); drbd_resume_io(mdev);
if ((int)retcode == (int)SS_IS_DISKLESS)
retcode = SS_NOTHING_TO_DO;
if (ret)
retcode = ERR_INTR;
reply->ret_code = retcode;
return 0; return 0;
} }
...@@ -1658,7 +1719,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, ...@@ -1658,7 +1719,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
dd = drbd_determin_dev_size(mdev, ddsf); dd = drbd_determine_dev_size(mdev, ddsf);
drbd_md_sync(mdev); drbd_md_sync(mdev);
put_ldev(mdev); put_ldev(mdev);
if (dd == dev_size_error) { if (dd == dev_size_error) {
......
...@@ -333,7 +333,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, ...@@ -333,7 +333,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
if (!page) if (!page)
goto fail; goto fail;
INIT_HLIST_NODE(&e->colision); INIT_HLIST_NODE(&e->collision);
e->epoch = NULL; e->epoch = NULL;
e->mdev = mdev; e->mdev = mdev;
e->pages = page; e->pages = page;
...@@ -356,7 +356,7 @@ void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int i ...@@ -356,7 +356,7 @@ void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int i
kfree(e->digest); kfree(e->digest);
drbd_pp_free(mdev, e->pages, is_net); drbd_pp_free(mdev, e->pages, is_net);
D_ASSERT(atomic_read(&e->pending_bios) == 0); D_ASSERT(atomic_read(&e->pending_bios) == 0);
D_ASSERT(hlist_unhashed(&e->colision)); D_ASSERT(hlist_unhashed(&e->collision));
mempool_free(e, drbd_ee_mempool); mempool_free(e, drbd_ee_mempool);
} }
...@@ -787,7 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev) ...@@ -787,7 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev)
} }
if (sock && msock) { if (sock && msock) {
schedule_timeout_interruptible(HZ / 10); schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
ok = drbd_socket_okay(mdev, &sock); ok = drbd_socket_okay(mdev, &sock);
ok = drbd_socket_okay(mdev, &msock) && ok; ok = drbd_socket_okay(mdev, &msock) && ok;
if (ok) if (ok)
...@@ -899,11 +899,6 @@ static int drbd_connect(struct drbd_conf *mdev) ...@@ -899,11 +899,6 @@ static int drbd_connect(struct drbd_conf *mdev)
drbd_thread_start(&mdev->asender); drbd_thread_start(&mdev->asender);
if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
put_ldev(mdev);
}
if (drbd_send_protocol(mdev) == -1) if (drbd_send_protocol(mdev) == -1)
return -1; return -1;
drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sync_param(mdev, &mdev->sync_conf);
...@@ -1418,7 +1413,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u ...@@ -1418,7 +1413,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u
sector_t sector = e->sector; sector_t sector = e->sector;
int ok; int ok;
D_ASSERT(hlist_unhashed(&e->colision)); D_ASSERT(hlist_unhashed(&e->collision));
if (likely((e->flags & EE_WAS_ERROR) == 0)) { if (likely((e->flags & EE_WAS_ERROR) == 0)) {
drbd_set_in_sync(mdev, sector, e->size); drbd_set_in_sync(mdev, sector, e->size);
...@@ -1487,7 +1482,7 @@ static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsi ...@@ -1487,7 +1482,7 @@ static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
return false; return false;
} }
/* hlist_del(&req->colision) is done in _req_may_be_done, to avoid /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
* special casing it there for the various failure cases. * special casing it there for the various failure cases.
* still no race with drbd_fail_pending_reads */ * still no race with drbd_fail_pending_reads */
ok = recv_dless_read(mdev, req, sector, data_size); ok = recv_dless_read(mdev, req, sector, data_size);
...@@ -1558,11 +1553,11 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ...@@ -1558,11 +1553,11 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
if (mdev->net_conf->two_primaries) { if (mdev->net_conf->two_primaries) {
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
D_ASSERT(!hlist_unhashed(&e->colision)); D_ASSERT(!hlist_unhashed(&e->collision));
hlist_del_init(&e->colision); hlist_del_init(&e->collision);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
} else { } else {
D_ASSERT(hlist_unhashed(&e->colision)); D_ASSERT(hlist_unhashed(&e->collision));
} }
drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
...@@ -1579,8 +1574,8 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u ...@@ -1579,8 +1574,8 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
D_ASSERT(!hlist_unhashed(&e->colision)); D_ASSERT(!hlist_unhashed(&e->collision));
hlist_del_init(&e->colision); hlist_del_init(&e->collision);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
dec_unacked(mdev); dec_unacked(mdev);
...@@ -1755,7 +1750,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -1755,7 +1750,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
#define OVERLAPS overlaps(i->sector, i->size, sector, size) #define OVERLAPS overlaps(i->sector, i->size, sector, size)
slot = tl_hash_slot(mdev, sector); slot = tl_hash_slot(mdev, sector);
...@@ -1765,7 +1760,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -1765,7 +1760,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
int have_conflict = 0; int have_conflict = 0;
prepare_to_wait(&mdev->misc_wait, &wait, prepare_to_wait(&mdev->misc_wait, &wait,
TASK_INTERRUPTIBLE); TASK_INTERRUPTIBLE);
hlist_for_each_entry(i, n, slot, colision) { hlist_for_each_entry(i, n, slot, collision) {
if (OVERLAPS) { if (OVERLAPS) {
/* only ALERT on first iteration, /* only ALERT on first iteration,
* we may be woken up early... */ * we may be woken up early... */
...@@ -1804,7 +1799,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -1804,7 +1799,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
} }
if (signal_pending(current)) { if (signal_pending(current)) {
hlist_del_init(&e->colision); hlist_del_init(&e->collision);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
...@@ -1862,7 +1857,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -1862,7 +1857,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
dev_err(DEV, "submit failed, triggering re-connect\n"); dev_err(DEV, "submit failed, triggering re-connect\n");
spin_lock_irq(&mdev->req_lock); spin_lock_irq(&mdev->req_lock);
list_del(&e->w.list); list_del(&e->w.list);
hlist_del_init(&e->colision); hlist_del_init(&e->collision);
spin_unlock_irq(&mdev->req_lock); spin_unlock_irq(&mdev->req_lock);
if (e->flags & EE_CALL_AL_COMPLETE_IO) if (e->flags & EE_CALL_AL_COMPLETE_IO)
drbd_al_complete_io(mdev, e->sector); drbd_al_complete_io(mdev, e->sector);
...@@ -2916,12 +2911,6 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi ...@@ -2916,12 +2911,6 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
return false; return false;
} }
/*
 * Placeholder for adopting the peer's queue order type.  It does nothing,
 * because there is currently no working implementation of distributed TCQ.
 */
static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
{
}
/* warn if the arguments differ by more than 12.5% */ /* warn if the arguments differ by more than 12.5% */
static void warn_if_differ_considerably(struct drbd_conf *mdev, static void warn_if_differ_considerably(struct drbd_conf *mdev,
const char *s, sector_t a, sector_t b) const char *s, sector_t a, sector_t b)
...@@ -2939,7 +2928,6 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -2939,7 +2928,6 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
{ {
struct p_sizes *p = &mdev->data.rbuf.sizes; struct p_sizes *p = &mdev->data.rbuf.sizes;
enum determine_dev_size dd = unchanged; enum determine_dev_size dd = unchanged;
unsigned int max_bio_size;
sector_t p_size, p_usize, my_usize; sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */ int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf; enum dds_flags ddsf;
...@@ -2994,7 +2982,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -2994,7 +2982,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
ddsf = be16_to_cpu(p->dds_flags); ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(mdev)) { if (get_ldev(mdev)) {
dd = drbd_determin_dev_size(mdev, ddsf); dd = drbd_determine_dev_size(mdev, ddsf);
put_ldev(mdev); put_ldev(mdev);
if (dd == dev_size_error) if (dd == dev_size_error)
return false; return false;
...@@ -3004,23 +2992,15 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ...@@ -3004,23 +2992,15 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
drbd_set_my_capacity(mdev, p_size); drbd_set_my_capacity(mdev, p_size);
} }
mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
drbd_reconsider_max_bio_size(mdev);
if (get_ldev(mdev)) { if (get_ldev(mdev)) {
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
ldsc = 1; ldsc = 1;
} }
if (mdev->agreed_pro_version < 94)
max_bio_size = be32_to_cpu(p->max_bio_size);
else if (mdev->agreed_pro_version == 94)
max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
max_bio_size = DRBD_MAX_BIO_SIZE;
if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
drbd_setup_queue_param(mdev, max_bio_size);
drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
put_ldev(mdev); put_ldev(mdev);
} }
...@@ -4275,7 +4255,7 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, ...@@ -4275,7 +4255,7 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
struct hlist_node *n; struct hlist_node *n;
struct drbd_request *req; struct drbd_request *req;
hlist_for_each_entry(req, n, slot, colision) { hlist_for_each_entry(req, n, slot, collision) {
if ((unsigned long)req == (unsigned long)id) { if ((unsigned long)req == (unsigned long)id) {
if (req->sector != sector) { if (req->sector != sector) {
dev_err(DEV, "_ack_id_to_req: found req %p but it has " dev_err(DEV, "_ack_id_to_req: found req %p but it has "
...@@ -4554,6 +4534,7 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -4554,6 +4534,7 @@ int drbd_asender(struct drbd_thread *thi)
int received = 0; int received = 0;
int expect = sizeof(struct p_header80); int expect = sizeof(struct p_header80);
int empty; int empty;
int ping_timeout_active = 0;
sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
...@@ -4566,6 +4547,7 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -4566,6 +4547,7 @@ int drbd_asender(struct drbd_thread *thi)
ERR_IF(!drbd_send_ping(mdev)) goto reconnect; ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
mdev->meta.socket->sk->sk_rcvtimeo = mdev->meta.socket->sk->sk_rcvtimeo =
mdev->net_conf->ping_timeo*HZ/10; mdev->net_conf->ping_timeo*HZ/10;
ping_timeout_active = 1;
} }
/* conditionally cork; /* conditionally cork;
...@@ -4620,8 +4602,7 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -4620,8 +4602,7 @@ int drbd_asender(struct drbd_thread *thi)
dev_err(DEV, "meta connection shut down by peer.\n"); dev_err(DEV, "meta connection shut down by peer.\n");
goto reconnect; goto reconnect;
} else if (rv == -EAGAIN) { } else if (rv == -EAGAIN) {
if (mdev->meta.socket->sk->sk_rcvtimeo == if (ping_timeout_active) {
mdev->net_conf->ping_timeo*HZ/10) {
dev_err(DEV, "PingAck did not arrive in time.\n"); dev_err(DEV, "PingAck did not arrive in time.\n");
goto reconnect; goto reconnect;
} }
...@@ -4660,6 +4641,11 @@ int drbd_asender(struct drbd_thread *thi) ...@@ -4660,6 +4641,11 @@ int drbd_asender(struct drbd_thread *thi)
if (!cmd->process(mdev, h)) if (!cmd->process(mdev, h))
goto reconnect; goto reconnect;
/* the idle_timeout (ping-int)
* has been restored in got_PingAck() */
if (cmd == get_asender_cmd(P_PING_ACK))
ping_timeout_active = 0;
buf = h; buf = h;
received = 0; received = 0;
expect = sizeof(struct p_header80); expect = sizeof(struct p_header80);
......
...@@ -163,7 +163,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, ...@@ -163,7 +163,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
* they must have been failed on the spot */ * they must have been failed on the spot */
#define OVERLAPS overlaps(sector, size, i->sector, i->size) #define OVERLAPS overlaps(sector, size, i->sector, i->size)
slot = tl_hash_slot(mdev, sector); slot = tl_hash_slot(mdev, sector);
hlist_for_each_entry(i, n, slot, colision) { hlist_for_each_entry(i, n, slot, collision) {
if (OVERLAPS) { if (OVERLAPS) {
dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
"other: %p %llus +%u\n", "other: %p %llus +%u\n",
...@@ -187,7 +187,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, ...@@ -187,7 +187,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev,
#undef OVERLAPS #undef OVERLAPS
#define OVERLAPS overlaps(sector, size, e->sector, e->size) #define OVERLAPS overlaps(sector, size, e->sector, e->size)
slot = ee_hash_slot(mdev, req->sector); slot = ee_hash_slot(mdev, req->sector);
hlist_for_each_entry(e, n, slot, colision) { hlist_for_each_entry(e, n, slot, collision) {
if (OVERLAPS) { if (OVERLAPS) {
wake_up(&mdev->misc_wait); wake_up(&mdev->misc_wait);
break; break;
...@@ -260,8 +260,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) ...@@ -260,8 +260,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
/* remove the request from the conflict detection /* remove the request from the conflict detection
* respective block_id verification hash */ * respective block_id verification hash */
if (!hlist_unhashed(&req->colision)) if (!hlist_unhashed(&req->collision))
hlist_del(&req->colision); hlist_del(&req->collision);
else else
D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
...@@ -329,7 +329,7 @@ static int _req_conflicts(struct drbd_request *req) ...@@ -329,7 +329,7 @@ static int _req_conflicts(struct drbd_request *req)
struct hlist_node *n; struct hlist_node *n;
struct hlist_head *slot; struct hlist_head *slot;
D_ASSERT(hlist_unhashed(&req->colision)); D_ASSERT(hlist_unhashed(&req->collision));
if (!get_net_conf(mdev)) if (!get_net_conf(mdev))
return 0; return 0;
...@@ -341,7 +341,7 @@ static int _req_conflicts(struct drbd_request *req) ...@@ -341,7 +341,7 @@ static int _req_conflicts(struct drbd_request *req)
#define OVERLAPS overlaps(i->sector, i->size, sector, size) #define OVERLAPS overlaps(i->sector, i->size, sector, size)
slot = tl_hash_slot(mdev, sector); slot = tl_hash_slot(mdev, sector);
hlist_for_each_entry(i, n, slot, colision) { hlist_for_each_entry(i, n, slot, collision) {
if (OVERLAPS) { if (OVERLAPS) {
dev_alert(DEV, "%s[%u] Concurrent local write detected! " dev_alert(DEV, "%s[%u] Concurrent local write detected! "
"[DISCARD L] new: %llus +%u; " "[DISCARD L] new: %llus +%u; "
...@@ -359,7 +359,7 @@ static int _req_conflicts(struct drbd_request *req) ...@@ -359,7 +359,7 @@ static int _req_conflicts(struct drbd_request *req)
#undef OVERLAPS #undef OVERLAPS
#define OVERLAPS overlaps(e->sector, e->size, sector, size) #define OVERLAPS overlaps(e->sector, e->size, sector, size)
slot = ee_hash_slot(mdev, sector); slot = ee_hash_slot(mdev, sector);
hlist_for_each_entry(e, n, slot, colision) { hlist_for_each_entry(e, n, slot, collision) {
if (OVERLAPS) { if (OVERLAPS) {
dev_alert(DEV, "%s[%u] Concurrent remote write detected!" dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
" [DISCARD L] new: %llus +%u; " " [DISCARD L] new: %llus +%u; "
...@@ -491,7 +491,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, ...@@ -491,7 +491,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* so we can verify the handle in the answer packet /* so we can verify the handle in the answer packet
* corresponding hlist_del is in _req_may_be_done() */ * corresponding hlist_del is in _req_may_be_done() */
hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
set_bit(UNPLUG_REMOTE, &mdev->flags); set_bit(UNPLUG_REMOTE, &mdev->flags);
...@@ -507,7 +507,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, ...@@ -507,7 +507,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* assert something? */ /* assert something? */
/* from drbd_make_request_common only */ /* from drbd_make_request_common only */
hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
/* corresponding hlist_del is in _req_may_be_done() */ /* corresponding hlist_del is in _req_may_be_done() */
/* NOTE /* NOTE
...@@ -1033,7 +1033,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns ...@@ -1033,7 +1033,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
err = 0; err = 0;
fail_free_complete: fail_free_complete:
if (rw == WRITE && local) if (req->rq_state & RQ_IN_ACT_LOG)
drbd_al_complete_io(mdev, sector); drbd_al_complete_io(mdev, sector);
fail_and_free_req: fail_and_free_req:
if (local) { if (local) {
......
...@@ -256,7 +256,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, ...@@ -256,7 +256,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
struct hlist_node *n; struct hlist_node *n;
struct drbd_request *req; struct drbd_request *req;
hlist_for_each_entry(req, n, slot, colision) { hlist_for_each_entry(req, n, slot, collision) {
if ((unsigned long)req == (unsigned long)id) { if ((unsigned long)req == (unsigned long)id) {
D_ASSERT(req->sector == sector); D_ASSERT(req->sector == sector);
return req; return req;
...@@ -291,7 +291,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, ...@@ -291,7 +291,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
req->epoch = 0; req->epoch = 0;
req->sector = bio_src->bi_sector; req->sector = bio_src->bi_sector;
req->size = bio_src->bi_size; req->size = bio_src->bi_size;
INIT_HLIST_NODE(&req->colision); INIT_HLIST_NODE(&req->collision);
INIT_LIST_HEAD(&req->tl_requests); INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list); INIT_LIST_HEAD(&req->w.list);
} }
...@@ -323,6 +323,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, ...@@ -323,6 +323,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
extern void complete_master_bio(struct drbd_conf *mdev, extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m); struct bio_and_error *m);
extern void request_timer_fn(unsigned long data); extern void request_timer_fn(unsigned long data);
extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
/* use this if you don't want to deal with calling complete_master_bio() /* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */ * outside the spinlock, e.g. when walking some list on cleanup. */
......
...@@ -126,7 +126,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo ...@@ -126,7 +126,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo
list_del(&e->w.list); /* has been on active_ee or sync_ee */ list_del(&e->w.list); /* has been on active_ee or sync_ee */
list_add_tail(&e->w.list, &mdev->done_ee); list_add_tail(&e->w.list, &mdev->done_ee);
/* No hlist_del_init(&e->colision) here, we did not send the Ack yet, /* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
* neither did we wake possibly waiting conflicting requests. * neither did we wake possibly waiting conflicting requests.
* done from "drbd_process_done_ee" within the appropriate w.cb * done from "drbd_process_done_ee" within the appropriate w.cb
* (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
...@@ -297,42 +297,48 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * ...@@ -297,42 +297,48 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
crypto_hash_final(&desc, digest); crypto_hash_final(&desc, digest);
} }
static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) /* TODO merge common code with w_e_end_ov_req */
int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{ {
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int digest_size; int digest_size;
void *digest; void *digest;
int ok; int ok = 1;
D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
if (unlikely(cancel)) { if (unlikely(cancel))
drbd_free_ee(mdev, e); goto out;
return 1;
}
if (likely((e->flags & EE_WAS_ERROR) == 0)) { if (likely((e->flags & EE_WAS_ERROR) != 0))
digest_size = crypto_hash_digestsize(mdev->csums_tfm); goto out;
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
inc_rs_pending(mdev); digest_size = crypto_hash_digestsize(mdev->csums_tfm);
ok = drbd_send_drequest_csum(mdev, digest = kmalloc(digest_size, GFP_NOIO);
e->sector, if (digest) {
e->size, sector_t sector = e->sector;
digest, unsigned int size = e->size;
digest_size, drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
P_CSUM_RS_REQUEST); /* Free e and pages before send.
kfree(digest); * In case we block on congestion, we could otherwise run into
} else { * some distributed deadlock, if the other side blocks on
dev_err(DEV, "kmalloc() of digest failed.\n"); * congestion as well, because our receiver blocks in
ok = 0; * drbd_pp_alloc due to pp_in_use > max_buffers. */
} drbd_free_ee(mdev, e);
} else e = NULL;
ok = 1; inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev, sector, size,
digest, digest_size,
P_CSUM_RS_REQUEST);
kfree(digest);
} else {
dev_err(DEV, "kmalloc() of digest failed.\n");
ok = 0;
}
drbd_free_ee(mdev, e); out:
if (e)
drbd_free_ee(mdev, e);
if (unlikely(!ok)) if (unlikely(!ok))
dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
...@@ -834,7 +840,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) ...@@ -834,7 +840,7 @@ int drbd_resync_finished(struct drbd_conf *mdev)
const int ratio = const int ratio =
(t == 0) ? 0 : (t == 0) ? 0 :
(t < 100000) ? ((s*100)/t) : (s/(t/100)); (t < 100000) ? ((s*100)/t) : (s/(t/100));
dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
"transferred %luK total %luK\n", "transferred %luK total %luK\n",
ratio, ratio,
Bit2KB(mdev->rs_same_csum), Bit2KB(mdev->rs_same_csum),
...@@ -1071,9 +1077,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ...@@ -1071,9 +1077,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return ok; return ok;
} }
/* TODO merge common code with w_e_send_csum */
int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{ {
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
sector_t sector = e->sector;
unsigned int size = e->size;
int digest_size; int digest_size;
void *digest; void *digest;
int ok = 1; int ok = 1;
...@@ -1093,17 +1102,25 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ...@@ -1093,17 +1102,25 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
else else
memset(digest, 0, digest_size); memset(digest, 0, digest_size);
/* Free e and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
* drbd_pp_alloc due to pp_in_use > max_buffers. */
drbd_free_ee(mdev, e);
e = NULL;
inc_rs_pending(mdev); inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev, e->sector, e->size, ok = drbd_send_drequest_csum(mdev, sector, size,
digest, digest_size, P_OV_REPLY); digest, digest_size,
P_OV_REPLY);
if (!ok) if (!ok)
dec_rs_pending(mdev); dec_rs_pending(mdev);
kfree(digest); kfree(digest);
out: out:
drbd_free_ee(mdev, e); if (e)
drbd_free_ee(mdev, e);
dec_unacked(mdev); dec_unacked(mdev);
return ok; return ok;
} }
...@@ -1122,8 +1139,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ...@@ -1122,8 +1139,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{ {
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
struct digest_info *di; struct digest_info *di;
int digest_size;
void *digest; void *digest;
sector_t sector = e->sector;
unsigned int size = e->size;
int digest_size;
int ok, eq = 0; int ok, eq = 0;
if (unlikely(cancel)) { if (unlikely(cancel)) {
...@@ -1153,16 +1172,21 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) ...@@ -1153,16 +1172,21 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
} }
} }
dec_unacked(mdev); /* Free e and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
* drbd_pp_alloc due to pp_in_use > max_buffers. */
drbd_free_ee(mdev, e);
if (!eq) if (!eq)
drbd_ov_oos_found(mdev, e->sector, e->size); drbd_ov_oos_found(mdev, sector, size);
else else
ov_oos_print(mdev); ov_oos_print(mdev);
ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
drbd_free_ee(mdev, e); dec_unacked(mdev);
--mdev->ov_left; --mdev->ov_left;
......
...@@ -1658,7 +1658,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) ...@@ -1658,7 +1658,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
struct kobject *kobj; struct kobject *kobj;
mutex_lock(&loop_devices_mutex); mutex_lock(&loop_devices_mutex);
lo = loop_init_one(dev & MINORMASK); lo = loop_init_one(MINOR(dev) >> part_shift);
kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
mutex_unlock(&loop_devices_mutex); mutex_unlock(&loop_devices_mutex);
...@@ -1691,15 +1691,18 @@ static int __init loop_init(void) ...@@ -1691,15 +1691,18 @@ static int __init loop_init(void)
if (max_part > 0) if (max_part > 0)
part_shift = fls(max_part); part_shift = fls(max_part);
if ((1UL << part_shift) > DISK_MAX_PARTS)
return -EINVAL;
if (max_loop > 1UL << (MINORBITS - part_shift)) if (max_loop > 1UL << (MINORBITS - part_shift))
return -EINVAL; return -EINVAL;
if (max_loop) { if (max_loop) {
nr = max_loop; nr = max_loop;
range = max_loop; range = max_loop << part_shift;
} else { } else {
nr = 8; nr = 8;
range = 1UL << (MINORBITS - part_shift); range = 1UL << MINORBITS;
} }
if (register_blkdev(LOOP_MAJOR, "loop")) if (register_blkdev(LOOP_MAJOR, "loop"))
...@@ -1738,7 +1741,7 @@ static void __exit loop_exit(void) ...@@ -1738,7 +1741,7 @@ static void __exit loop_exit(void)
unsigned long range; unsigned long range;
struct loop_device *lo, *next; struct loop_device *lo, *next;
range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift); range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
list_for_each_entry_safe(lo, next, &loop_devices, lo_list) list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
loop_del_one(lo); loop_del_one(lo);
......
obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
xen-blkback-y := blkback.o xenbus.o
This diff is collapsed.
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
#define __XEN_BLKIF__BACKEND__COMMON_H__
#include <linux/version.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/io.h>
#include <asm/setup.h>
#include <asm/pgalloc.h>
#include <asm/hypervisor.h>
#include <xen/grant_table.h>
#include <xen/xenbus.h>
#include <xen/interface/io/ring.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>
/* Prefix placed in front of every message emitted through DPRINTK(). */
#define DRV_PFX "xen-blkback:"
/*
 * Debug print helper: forwards to pr_debug(), prepending DRV_PFX followed by
 * the calling function name and line number in parentheses, and appending
 * ".\n" to the caller's format string (so callers pass fmt without a
 * trailing newline).
 */
#define DPRINTK(fmt, args...) \
pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \
__func__, __LINE__, ##args)
/*
 * Not a real protocol. Used to generate ring structs which contain
 * the elements common to all protocols only. This way we get a
 * compiler-checkable way to use common struct elements, so we can
 * avoid using switch(protocol) in a number of places.
 */
/* Request type for the protocol-independent ring view. */
struct blkif_common_request {
char dummy; /* sole member; no payload field is declared here */
};
/* Response type for the protocol-independent ring view. */
struct blkif_common_response {
char dummy; /* sole member; no payload field is declared here */
};
/*
 * i386 protocol version.
 *
 * Both structures are declared under #pragma pack(push, 4), which caps
 * member alignment at 4 bytes, so the 64-bit members are not padded out to
 * an 8-byte boundary.
 * NOTE(review): presumably this reproduces the layout a 32-bit x86 frontend
 * places on the shared ring -- confirm against the public blkif ABI.
 */
#pragma pack(push, 4)
/* Request as laid out by the i386 protocol. */
struct blkif_x86_32_request {
uint8_t operation; /* BLKIF_OP_??? */
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t id; /* private guest value, echoed in resp */
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
/* Response as laid out by the i386 protocol. */
struct blkif_x86_32_response {
uint64_t id; /* copied from request */
uint8_t operation; /* copied from request */
int16_t status; /* BLKIF_RSP_??? */
};
#pragma pack(pop)
/*
 * x86_64 protocol version.
 *
 * The id members carry an explicit __aligned__(8) attribute, so they are
 * placed on an 8-byte boundary whatever the default alignment of uint64_t
 * is for the current build.
 */
/* Request as laid out by the x86_64 protocol. */
struct blkif_x86_64_request {
uint8_t operation; /* BLKIF_OP_??? */
uint8_t nr_segments; /* number of segments */
blkif_vdev_t handle; /* only for read/write requests */
uint64_t __attribute__((__aligned__(8))) id;
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
/* Response as laid out by the x86_64 protocol. */
struct blkif_x86_64_response {
uint64_t __attribute__((__aligned__(8))) id;
uint8_t operation; /* copied from request */
int16_t status; /* BLKIF_RSP_??? */
};
/*
 * Instantiate ring types for each request/response layout declared above:
 * the protocol-independent "common" pair, the i386 pair and the x86_64 pair.
 * NOTE(review): presumably these invocations are what provide the
 * struct blkif_{common,x86_32,x86_64}_back_ring types used by
 * union blkif_back_rings -- confirm against xen/interface/io/ring.h.
 */
DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
struct blkif_common_response);
DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
struct blkif_x86_32_response);
DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
struct blkif_x86_64_response);
/*
 * All views of a backend ring overlaid in a single object: the native
 * layout, the protocol-independent "common" view, and the i386 and x86_64
 * layouts.  Embedded in struct xen_blkif as blk_rings.
 */
union blkif_back_rings {
struct blkif_back_ring native;
struct blkif_common_back_ring common;
struct blkif_x86_32_back_ring x86_32;
struct blkif_x86_64_back_ring x86_64;
};
/*
 * Ring protocol variants, stored in struct xen_blkif's blk_protocol field.
 * The names parallel the native, x86_32 and x86_64 members of
 * union blkif_back_rings.  Values start at 1, so 0 is not a valid protocol.
 */
enum blkif_protocol {
BLKIF_PROTOCOL_NATIVE = 1,
BLKIF_PROTOCOL_X86_32 = 2,
BLKIF_PROTOCOL_X86_64 = 3,
};
/*
 * Description of a virtual block device (VBD).  One instance is embedded in
 * struct xen_blkif as the VBD attached to that interface.
 */
struct xen_vbd {
/* What the domain refers to this vbd as. */
blkif_vdev_t handle;
/* Non-zero -> read-only */
unsigned char readonly;
/* VDISK_xxx */
unsigned char type;
/* phys device that this vbd maps to. */
u32 pdevice;
/* Block device handle; vbd_sz() derives the device size from it. */
struct block_device *bdev;
/* Cached size parameter. */
sector_t size;
/* NOTE(review): presumably true when flush requests can be serviced -- confirm where it is set. */
bool flush_support;
};
struct backend_info;
/*
 * Per-interface backend state: identity, ring/communication state, the
 * attached VBD, reference counting and request statistics.
 */
struct xen_blkif {
/* Unique identifier for this interface. */
domid_t domid;
unsigned int handle;
/* Physical parameters of the comms window. */
unsigned int irq;
/* Comms information. */
enum blkif_protocol blk_protocol;
union blkif_back_rings blk_rings;
struct vm_struct *blk_ring_area;
/* The VBD attached to this interface. */
struct xen_vbd vbd;
/* Back pointer to the backend_info. */
struct backend_info *be;
/* Private fields. */
spinlock_t blk_ring_lock;
/* Reference count: raised by xen_blkif_get(), dropped by xen_blkif_put(). */
atomic_t refcnt;
wait_queue_head_t wq;
/* One thread per one blkif. */
struct task_struct *xenblkd;
unsigned int waiting_reqs;
/* statistics */
/* NOTE(review): the st_* names suggest read/write/flush request and sector counters -- confirm against the users. */
unsigned long st_print;
int st_rd_req;
int st_wr_req;
int st_oo_req;
int st_f_req;
int st_rd_sect;
int st_wr_sect;
/* Woken by xen_blkif_put() when refcnt drops to zero. */
wait_queue_head_t waiting_to_free;
grant_handle_t shmem_handle;
grant_ref_t shmem_ref;
};
/*
 * Size of the device behind a struct xen_vbd pointer: the partition's
 * nr_sects when bdev->bd_part is set, otherwise the capacity of the whole
 * disk as reported by get_capacity().
 * Note: _v is evaluated more than once, so do not pass an expression with
 * side effects.
 */
#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
(_v)->bdev->bd_part->nr_sects : \
get_capacity((_v)->bdev->bd_disk))
/* Take a reference on a struct xen_blkif by incrementing its refcnt. */
#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
/*
 * Drop a reference on a struct xen_blkif.  When the decrement brings refcnt
 * to zero, wake up anyone sleeping on waiting_to_free.  Wrapped in
 * do { } while (0) so it behaves as a single statement.
 */
#define xen_blkif_put(_b) \
do { \
if (atomic_dec_and_test(&(_b)->refcnt)) \
wake_up(&(_b)->waiting_to_free);\
} while (0)
/*
 * Describes a request in terms of a block device and a sector range.
 * NOTE(review): the field meanings below are inferred from their names and
 * types only -- confirm against the code that fills this structure in.
 */
struct phys_req {
unsigned short dev; /* device identifier */
unsigned short nr_sects; /* number of sectors */
struct block_device *bdev; /* target block device */
blkif_sector_t sector_number; /* starting sector */
};
int xen_blkif_interface_init(void);
int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state);
struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
/*
 * Copy a request in i386 layout (@src) into a native struct blkif_request
 * (@dst).
 *
 * The scalar header fields are copied first.  The number of segment
 * descriptors copied afterwards is the segment count already stored in
 * @dst, capped at BLKIF_MAX_SEGMENTS_PER_REQUEST.
 */
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
					struct blkif_x86_32_request *src)
{
	int nseg;
	int idx = 0;

	dst->operation = src->operation;
	dst->nr_segments = src->nr_segments;
	dst->handle = src->handle;
	dst->id = src->id;
	dst->u.rw.sector_number = src->sector_number;

	/* Compiler barrier between the header copy and the count read below. */
	barrier();

	/* Use the count held in dst, never more than the array can hold. */
	nseg = dst->nr_segments;
	if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)
		nseg = BLKIF_MAX_SEGMENTS_PER_REQUEST;

	while (idx < nseg) {
		dst->u.rw.seg[idx] = src->seg[idx];
		idx++;
	}
}
/*
 * Copy a request in x86_64 layout (@src) into a native struct blkif_request
 * (@dst).
 *
 * The scalar header fields are copied first.  The number of segment
 * descriptors copied afterwards is the segment count already stored in
 * @dst, capped at BLKIF_MAX_SEGMENTS_PER_REQUEST.
 */
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
					struct blkif_x86_64_request *src)
{
	int nseg;
	int idx = 0;

	dst->operation = src->operation;
	dst->nr_segments = src->nr_segments;
	dst->handle = src->handle;
	dst->id = src->id;
	dst->u.rw.sector_number = src->sector_number;

	/* Compiler barrier between the header copy and the count read below. */
	barrier();

	/* Use the count held in dst, never more than the array can hold. */
	nseg = dst->nr_segments;
	if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)
		nseg = BLKIF_MAX_SEGMENTS_PER_REQUEST;

	while (idx < nseg) {
		dst->u.rw.seg[idx] = src->seg[idx];
		idx++;
	}
}
#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
This diff is collapsed.
...@@ -97,6 +97,7 @@ struct blkfront_info ...@@ -97,6 +97,7 @@ struct blkfront_info
struct blk_shadow shadow[BLK_RING_SIZE]; struct blk_shadow shadow[BLK_RING_SIZE];
unsigned long shadow_free; unsigned long shadow_free;
unsigned int feature_flush; unsigned int feature_flush;
unsigned int flush_op;
int is_ready; int is_ready;
}; };
...@@ -250,8 +251,7 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, ...@@ -250,8 +251,7 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
/* /*
* Generate a Xen blkfront IO request from a blk layer request. Reads * Generate a Xen blkfront IO request from a blk layer request. Reads
* and writes are handled as expected. Since we lack a loose flush * and writes are handled as expected.
* request, we map flushes into a full ordered barrier.
* *
* @req: a request struct * @req: a request struct
*/ */
...@@ -293,14 +293,13 @@ static int blkif_queue_request(struct request *req) ...@@ -293,14 +293,13 @@ static int blkif_queue_request(struct request *req)
if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
/* /*
* Ideally we could just do an unordered * Ideally we can do an unordered flush-to-disk. In case the
* flush-to-disk, but all we have is a full write * backend only supports barriers, use that. A barrier request is
* barrier at the moment. However, a barrier write is
* a superset of FUA, so we can implement it the same * a superset of FUA, so we can implement it the same
* way. (It's also a FLUSH+FUA, since it is * way. (It's also a FLUSH+FUA, since it is
* guaranteed ordered WRT previous writes.) * guaranteed ordered WRT previous writes.)
*/ */
ring_req->operation = BLKIF_OP_WRITE_BARRIER; ring_req->operation = info->flush_op;
} }
ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
...@@ -433,8 +432,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) ...@@ -433,8 +432,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
static void xlvbd_flush(struct blkfront_info *info) static void xlvbd_flush(struct blkfront_info *info)
{ {
blk_queue_flush(info->rq, info->feature_flush); blk_queue_flush(info->rq, info->feature_flush);
printk(KERN_INFO "blkfront: %s: barriers %s\n", printk(KERN_INFO "blkfront: %s: %s: %s\n",
info->gd->disk_name, info->gd->disk_name,
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
"flush diskcache" : "barrier or flush"),
info->feature_flush ? "enabled" : "disabled"); info->feature_flush ? "enabled" : "disabled");
} }
...@@ -720,15 +722,20 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -720,15 +722,20 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) { switch (bret->operation) {
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", printk(KERN_WARNING "blkfront: %s: write %s op failed\n",
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
"barrier" : "flush disk cache",
info->gd->disk_name); info->gd->disk_name);
error = -EOPNOTSUPP; error = -EOPNOTSUPP;
} }
if (unlikely(bret->status == BLKIF_RSP_ERROR && if (unlikely(bret->status == BLKIF_RSP_ERROR &&
info->shadow[id].req.nr_segments == 0)) { info->shadow[id].req.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n", printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n",
info->flush_op == BLKIF_OP_WRITE_BARRIER ?
"barrier" : "flush disk cache",
info->gd->disk_name); info->gd->disk_name);
error = -EOPNOTSUPP; error = -EOPNOTSUPP;
} }
...@@ -736,6 +743,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) ...@@ -736,6 +743,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
if (error == -EOPNOTSUPP) if (error == -EOPNOTSUPP)
error = 0; error = 0;
info->feature_flush = 0; info->feature_flush = 0;
info->flush_op = 0;
xlvbd_flush(info); xlvbd_flush(info);
} }
/* fall through */ /* fall through */
...@@ -1100,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info) ...@@ -1100,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size; unsigned long sector_size;
unsigned int binfo; unsigned int binfo;
int err; int err;
int barrier; int barrier, flush;
switch (info->connected) { switch (info->connected) {
case BLKIF_STATE_CONNECTED: case BLKIF_STATE_CONNECTED:
...@@ -1140,8 +1148,11 @@ static void blkfront_connect(struct blkfront_info *info) ...@@ -1140,8 +1148,11 @@ static void blkfront_connect(struct blkfront_info *info)
return; return;
} }
info->feature_flush = 0;
info->flush_op = 0;
err = xenbus_gather(XBT_NIL, info->xbdev->otherend, err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
"feature-barrier", "%lu", &barrier, "feature-barrier", "%d", &barrier,
NULL); NULL);
/* /*
...@@ -1151,11 +1162,23 @@ static void blkfront_connect(struct blkfront_info *info) ...@@ -1151,11 +1162,23 @@ static void blkfront_connect(struct blkfront_info *info)
* *
* If there are barriers, then we use flush. * If there are barriers, then we use flush.
*/ */
info->feature_flush = 0; if (!err && barrier) {
if (!err && barrier)
info->feature_flush = REQ_FLUSH | REQ_FUA; info->feature_flush = REQ_FLUSH | REQ_FUA;
info->flush_op = BLKIF_OP_WRITE_BARRIER;
}
/*
* And if there is "feature-flush-cache" use that above
* barriers.
*/
err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
"feature-flush-cache", "%d", &flush,
NULL);
if (!err && flush) {
info->feature_flush = REQ_FLUSH;
info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
}
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) { if (err) {
xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
......
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
/* Although the Linux source code makes a difference between /* Although the Linux source code makes a difference between
generic endianness and the bitfields' endianness, there is no generic endianness and the bitfields' endianness, there is no
architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness architecture as of Linux-2.6.24-rc4 where the bitfields' endianness
does not match the generic endianness. */ does not match the generic endianness. */
#if __BYTE_ORDER == __LITTLE_ENDIAN #if __BYTE_ORDER == __LITTLE_ENDIAN
...@@ -53,7 +53,7 @@ ...@@ -53,7 +53,7 @@
extern const char *drbd_buildtag(void); extern const char *drbd_buildtag(void);
#define REL_VERSION "8.3.10" #define REL_VERSION "8.3.11"
#define API_VERSION 88 #define API_VERSION 88
#define PRO_VERSION_MIN 86 #define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 96 #define PRO_VERSION_MAX 96
...@@ -195,7 +195,7 @@ enum drbd_conns { ...@@ -195,7 +195,7 @@ enum drbd_conns {
C_WF_REPORT_PARAMS, /* we have a socket */ C_WF_REPORT_PARAMS, /* we have a socket */
C_CONNECTED, /* we have introduced each other */ C_CONNECTED, /* we have introduced each other */
C_STARTING_SYNC_S, /* starting full sync by admin request. */ C_STARTING_SYNC_S, /* starting full sync by admin request. */
C_STARTING_SYNC_T, /* stariing full sync by admin request. */ C_STARTING_SYNC_T, /* starting full sync by admin request. */
C_WF_BITMAP_S, C_WF_BITMAP_S,
C_WF_BITMAP_T, C_WF_BITMAP_T,
C_WF_SYNC_UUID, C_WF_SYNC_UUID,
...@@ -236,7 +236,7 @@ union drbd_state { ...@@ -236,7 +236,7 @@ union drbd_state {
* pointed out by Maxim Uvarov q<muvarov@ru.mvista.com> * pointed out by Maxim Uvarov q<muvarov@ru.mvista.com>
* even though we transmit as "cpu_to_be32(state)", * even though we transmit as "cpu_to_be32(state)",
* the offsets of the bitfields still need to be swapped * the offsets of the bitfields still need to be swapped
* on different endianess. * on different endianness.
*/ */
struct { struct {
#if defined(__LITTLE_ENDIAN_BITFIELD) #if defined(__LITTLE_ENDIAN_BITFIELD)
...@@ -266,7 +266,7 @@ union drbd_state { ...@@ -266,7 +266,7 @@ union drbd_state {
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else #else
# error "this endianess is not supported" # error "this endianness is not supported"
#endif #endif
}; };
unsigned int i; unsigned int i;
......
...@@ -30,7 +30,7 @@ enum packet_types { ...@@ -30,7 +30,7 @@ enum packet_types {
int tag_and_len ## member; int tag_and_len ## member;
#include "linux/drbd_nl.h" #include "linux/drbd_nl.h"
/* declate tag-list-sizes */ /* declare tag-list-sizes */
static const int tag_list_sizes[] = { static const int tag_list_sizes[] = {
#define NL_PACKET(name, number, fields) 2 fields , #define NL_PACKET(name, number, fields) 2 fields ,
#define NL_INTEGER(pn, pr, member) + 4 + 4 #define NL_INTEGER(pn, pr, member) + 4 + 4
......
...@@ -139,9 +139,9 @@ write intent log information, three of which are mentioned here. ...@@ -139,9 +139,9 @@ write intent log information, three of which are mentioned here.
* .list is on one of three lists: * .list is on one of three lists:
* in_use: currently in use (refcnt > 0, lc_number != LC_FREE) * in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
* lru: unused but ready to be reused or recycled * lru: unused but ready to be reused or recycled
* (ts_refcnt == 0, lc_number != LC_FREE), * (lc_refcnt == 0, lc_number != LC_FREE),
* free: unused but ready to be recycled * free: unused but ready to be recycled
* (ts_refcnt == 0, lc_number == LC_FREE), * (lc_refcnt == 0, lc_number == LC_FREE),
* *
* an element is said to be "in the active set", * an element is said to be "in the active set",
* if either on "in_use" or "lru", i.e. lc_number != LC_FREE. * if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
...@@ -160,8 +160,8 @@ struct lc_element { ...@@ -160,8 +160,8 @@ struct lc_element {
struct hlist_node colision; struct hlist_node colision;
struct list_head list; /* LRU list or free list */ struct list_head list; /* LRU list or free list */
unsigned refcnt; unsigned refcnt;
/* back "pointer" into ts_cache->element[index], /* back "pointer" into lc_cache->element[index],
* for paranoia, and for "ts_element_to_index" */ * for paranoia, and for "lc_element_to_index" */
unsigned lc_index; unsigned lc_index;
/* if we want to track a larger set of objects, /* if we want to track a larger set of objects,
* it needs to become arch independent u64 */ * it needs to become arch independent u64 */
...@@ -190,8 +190,8 @@ struct lru_cache { ...@@ -190,8 +190,8 @@ struct lru_cache {
/* Arbitrary limit on maximum tracked objects. Practical limit is much /* Arbitrary limit on maximum tracked objects. Practical limit is much
* lower due to allocation failures, probably. For typical use cases, * lower due to allocation failures, probably. For typical use cases,
* nr_elements should be a few thousand at most. * nr_elements should be a few thousand at most.
* This also limits the maximum value of ts_element.ts_index, allowing the * This also limits the maximum value of lc_element.lc_index, allowing the
* 8 high bits of .ts_index to be overloaded with flags in the future. */ * 8 high bits of .lc_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE (1<<24) #define LC_MAX_ACTIVE (1<<24)
/* statistics */ /* statistics */
......
...@@ -44,6 +44,19 @@ typedef uint64_t blkif_sector_t; ...@@ -44,6 +44,19 @@ typedef uint64_t blkif_sector_t;
*/ */
#define BLKIF_OP_WRITE_BARRIER 2 #define BLKIF_OP_WRITE_BARRIER 2
/*
* Recognised if "feature-flush-cache" is present in backend xenbus
* info. A flush will ask the underlying storage hardware to flush its
* non-volatile caches as appropriate. The "feature-flush-cache" node
* contains a boolean indicating whether flush requests are likely to
* succeed or fail. Either way, a flush request may fail at any time
* with BLKIF_RSP_EOPNOTSUPP if it is unsupported by the underlying
* block-device hardware. The boolean simply indicates whether or not it
* is worthwhile for the frontend to attempt flushes. If a backend does
* not recognise BLKIF_OP_FLUSH_DISKCACHE, it should *not* create the
* "feature-flush-cache" node!
*/
#define BLKIF_OP_FLUSH_DISKCACHE 3
/* /*
* Maximum scatter/gather segments per request. * Maximum scatter/gather segments per request.
* This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment