Commit c80ddb52 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'md-3.5' of git://neil.brown.name/md

Pull md updates from NeilBrown:
 "It's been a busy cycle for md - lots of fun stuff here..  if you like
  this kind of thing :-)

  Main features:
   - RAID10 arrays can be reshaped - adding and removing devices and
     changing chunks (not 'far' array though)
   - allow RAID5 arrays to be reshaped with a backup file (not tested
     yet, but the principle works fine for RAID10).
   - arrays can be reshaped while a bitmap is present - you no longer
     need to remove it first
   - SSSE3 support for RAID6 syndrome calculations

  and of course a number of minor fixes etc."

* tag 'md-3.5' of git://neil.brown.name/md: (56 commits)
  md/bitmap: record the space available for the bitmap in the superblock.
  md/raid10: Remove extras after reshape to smaller number of devices.
  md/raid5: improve removal of extra devices after reshape.
  md: check the return of mddev_find()
  MD RAID1: Further conditionalize 'fullsync'
  DM RAID: Use md_error() in place of simply setting Faulty bit
  DM RAID: Record and handle missing devices
  DM RAID: Set recovery flags on resume
  md/raid5: Allow reshape while a bitmap is present.
  md/raid10: resize bitmap when required during reshape.
  md: allow array to be resized while bitmap is present.
  md/bitmap: make sure reshape request are reflected in superblock.
  md/bitmap: add bitmap_resize function to allow bitmap resizing.
  md/bitmap: use DIV_ROUND_UP instead of open-code
  md/bitmap: create a 'struct bitmap_counts' substructure of 'struct bitmap'
  md/bitmap: make bitmap bitops atomic.
  md/bitmap: make _page_attr bitops atomic.
  md/bitmap: merge bitmap_file_unmap and bitmap_file_put.
  md/bitmap: remove async freeing of bitmap file.
  md/bitmap: convert some spin_lock_irqsave to spin_lock_irq
  ...
parents 2c13bc0f 1dff2b87
...@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI ...@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions? # does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS := -m elf_$(UTS_MACHINE)
......
...@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = { ...@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
.do_5 = xor_sse_5, .do_5 = xor_sse_5,
}; };
/* Also try the AVX routines */
#include "xor_avx.h"
/* Also try the generic routines. */ /* Also try the generic routines. */
#include <asm-generic/xor.h> #include <asm-generic/xor.h>
...@@ -871,6 +874,7 @@ do { \ ...@@ -871,6 +874,7 @@ do { \
xor_speed(&xor_block_8regs_p); \ xor_speed(&xor_block_8regs_p); \
xor_speed(&xor_block_32regs); \ xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_32regs_p); \ xor_speed(&xor_block_32regs_p); \
AVX_XOR_SPEED; \
if (cpu_has_xmm) \ if (cpu_has_xmm) \
xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_pIII_sse); \
if (cpu_has_mmx) { \ if (cpu_has_mmx) { \
...@@ -883,6 +887,6 @@ do { \ ...@@ -883,6 +887,6 @@ do { \
We may also be able to load into the L1 only depending on how the cpu We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */ deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \ #define XOR_SELECT_TEMPLATE(FASTEST) \
(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
#endif /* _ASM_X86_XOR_32_H */ #endif /* _ASM_X86_XOR_32_H */
...@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = { ...@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
.do_5 = xor_sse_5, .do_5 = xor_sse_5,
}; };
/* Also try the AVX routines */
#include "xor_avx.h"
#undef XOR_TRY_TEMPLATES #undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \ #define XOR_TRY_TEMPLATES \
do { \ do { \
AVX_XOR_SPEED; \
xor_speed(&xor_block_sse); \ xor_speed(&xor_block_sse); \
} while (0) } while (0)
/* We force the use of the SSE xor block because it can write around L2. /* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */ deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) #define XOR_SELECT_TEMPLATE(FASTEST) \
AVX_SELECT(&xor_block_sse)
#endif /* _ASM_X86_XOR_64_H */ #endif /* _ASM_X86_XOR_64_H */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H
/*
* Optimized RAID-5 checksumming functions for AVX
*
* Copyright (C) 2012 Intel Corporation
* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#ifdef CONFIG_AS_AVX
#include <linux/compiler.h>
#include <asm/i387.h>
/* Shorthand for 32-byte alignment; 32 bytes is also the slot size used per register below. */
#define ALIGN32 __aligned(32)
/* Number of YMM registers (ymm0..ymm3) that the save/restore macros preserve. */
#define YMM_SAVED_REGS 4
/*
 * YMMS_SAVE - prologue for the AVX XOR routines.
 *
 * Expects two locals in the expanding function: `cr0` (assigned here) and
 * `ymm_save`, a byte buffer indexed at offsets 0, 32, 64 and 96.
 * Steps, in order: disable preemption, remember the current CR0 value,
 * call clts(), then store ymm0..ymm3 into ymm_save with vmovaps.
 * NOTE(review): clts() is presumably needed so the AVX instructions do not
 * trap while CR0.TS is set - confirm against the FPU handling of this kernel.
 * NOTE(review): vmovaps is the aligned move form, so ymm_save is expected to
 * be 32-byte aligned (the callers declare it with ALIGN32).
 *
 * The expansion already ends in ';' (after `while (0)`), which is why the
 * call sites in this file invoke the macro without a trailing semicolon.
 */
#define YMMS_SAVE \
do { \
preempt_disable(); \
cr0 = read_cr0(); \
clts(); \
asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);
/*
 * YMMS_RESTORE - epilogue matching YMMS_SAVE.
 *
 * Issues an sfence, reloads ymm3..ymm0 from ymm_save (reverse order of the
 * save), writes the remembered `cr0` value back with write_cr0() and
 * re-enables preemption. Uses the same `cr0` / `ymm_save` locals as
 * YMMS_SAVE, and likewise carries its own trailing ';'.
 */
#define YMMS_RESTORE \
do { \
asm volatile("sfence" : : : "memory"); \
asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
write_cr0(cr0); \
preempt_enable(); \
} while (0);
/*
 * BLOCK4(i) - expand four consecutive BLOCK(offset, reg) steps.
 *
 * The byte offsets are 32*i, 32*(i+1), 32*(i+2) and 32*(i+3), paired with
 * register numbers 0..3. BLOCK itself is (re)defined inside each
 * xor_avx_N() function before BLOCK16() is used.
 * `i` is not parenthesised in the expansion; every use in this file passes a
 * plain integer literal.
 */
#define BLOCK4(i) \
BLOCK(32 * i, 0) \
BLOCK(32 * (i + 1), 1) \
BLOCK(32 * (i + 2), 2) \
BLOCK(32 * (i + 3), 3)
/*
 * BLOCK16() - sixteen BLOCK steps at byte offsets 0, 32, ..., 480, i.e. one
 * 512-byte stretch of the buffers, cycling through ymm0..ymm3.
 */
#define BLOCK16() \
BLOCK4(0) \
BLOCK4(4) \
BLOCK4(8) \
BLOCK4(12)
/*
 * xor_avx_2 - XOR the buffer at p1 into the buffer at p0 (p0 ^= p1).
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (the loop runs bytes >> 9 times), any remainder is ignored
 * @p0:    destination and first source; overwritten with the result
 * @p1:    second source
 *
 * NOTE(review): the data is accessed with vmovdqa, the aligned move form, so
 * both buffers are presumably required to be 32-byte aligned - confirm with
 * the callers.
 */
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
unsigned long cr0, lines = bytes >> 9;
char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
/* Saves ymm0..ymm3 into ymm_save and sets cr0; the macro supplies its own ';'. */
YMMS_SAVE
while (lines--) {
/*
 * BLOCK(i, reg): i is a byte offset, converted to an unsigned long index by
 * dividing by sizeof(*pN). Load 32 bytes of p1 into ymm<reg>, XOR in the
 * matching 32 bytes of p0, and store the result back to p0.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p0[i / sizeof(*p0)])); \
asm volatile("vmovdqa %%ymm" #reg ", %0" : \
"=m" (p0[i / sizeof(*p0)])); \
} while (0);
BLOCK16()
/* Advance both pointers by the 512 bytes just processed. */
p0 = (unsigned long *)((uintptr_t)p0 + 512);
p1 = (unsigned long *)((uintptr_t)p1 + 512);
}
/* Reloads ymm0..ymm3, restores CR0 and re-enables preemption. */
YMMS_RESTORE
}
/*
 * xor_avx_3 - XOR the buffers at p1 and p2 into p0 (p0 ^= p1 ^ p2).
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (the loop runs bytes >> 9 times), any remainder is ignored
 * @p0:    destination and first source; overwritten with the result
 * @p1:    second source
 * @p2:    third source
 *
 * NOTE(review): the data is accessed with vmovdqa, the aligned move form, so
 * the buffers are presumably required to be 32-byte aligned - confirm with
 * the callers.
 */
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
unsigned long *p2)
{
unsigned long cr0, lines = bytes >> 9;
char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
/* Saves ymm0..ymm3 into ymm_save and sets cr0; the macro supplies its own ';'. */
YMMS_SAVE
while (lines--) {
/*
 * BLOCK(i, reg): i is a byte offset. Load 32 bytes of p2 into ymm<reg>,
 * XOR in the matching 32 bytes of p1 and then p0, and store the result
 * back to p0.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p1[i / sizeof(*p1)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p0[i / sizeof(*p0)])); \
asm volatile("vmovdqa %%ymm" #reg ", %0" : \
"=m" (p0[i / sizeof(*p0)])); \
} while (0);
BLOCK16()
/* Advance every pointer by the 512 bytes just processed. */
p0 = (unsigned long *)((uintptr_t)p0 + 512);
p1 = (unsigned long *)((uintptr_t)p1 + 512);
p2 = (unsigned long *)((uintptr_t)p2 + 512);
}
/* Reloads ymm0..ymm3, restores CR0 and re-enables preemption. */
YMMS_RESTORE
}
/*
 * xor_avx_4 - XOR the buffers at p1, p2 and p3 into p0
 * (p0 ^= p1 ^ p2 ^ p3).
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (the loop runs bytes >> 9 times), any remainder is ignored
 * @p0:    destination and first source; overwritten with the result
 * @p1:    second source
 * @p2:    third source
 * @p3:    fourth source
 *
 * NOTE(review): the data is accessed with vmovdqa, the aligned move form, so
 * the buffers are presumably required to be 32-byte aligned - confirm with
 * the callers.
 */
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
unsigned long *p2, unsigned long *p3)
{
unsigned long cr0, lines = bytes >> 9;
char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
/* Saves ymm0..ymm3 into ymm_save and sets cr0; the macro supplies its own ';'. */
YMMS_SAVE
while (lines--) {
/*
 * BLOCK(i, reg): i is a byte offset. Load 32 bytes of p3 into ymm<reg>,
 * XOR in the matching 32 bytes of p2, p1 and then p0, and store the result
 * back to p0.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p2[i / sizeof(*p2)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p1[i / sizeof(*p1)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p0[i / sizeof(*p0)])); \
asm volatile("vmovdqa %%ymm" #reg ", %0" : \
"=m" (p0[i / sizeof(*p0)])); \
} while (0);
/* Unlike the sibling routines this use is followed by ';' - an extra empty statement. */
BLOCK16();
/* Advance every pointer by the 512 bytes just processed. */
p0 = (unsigned long *)((uintptr_t)p0 + 512);
p1 = (unsigned long *)((uintptr_t)p1 + 512);
p2 = (unsigned long *)((uintptr_t)p2 + 512);
p3 = (unsigned long *)((uintptr_t)p3 + 512);
}
/* Reloads ymm0..ymm3, restores CR0 and re-enables preemption. */
YMMS_RESTORE
}
/*
 * xor_avx_5 - XOR the buffers at p1, p2, p3 and p4 into p0
 * (p0 ^= p1 ^ p2 ^ p3 ^ p4).
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (the loop runs bytes >> 9 times), any remainder is ignored
 * @p0:    destination and first source; overwritten with the result
 * @p1:    second source
 * @p2:    third source
 * @p3:    fourth source
 * @p4:    fifth source
 *
 * NOTE(review): the data is accessed with vmovdqa, the aligned move form, so
 * the buffers are presumably required to be 32-byte aligned - confirm with
 * the callers.
 */
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
unsigned long cr0, lines = bytes >> 9;
char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
/* Saves ymm0..ymm3 into ymm_save and sets cr0; the macro supplies its own ';'. */
YMMS_SAVE
while (lines--) {
/*
 * BLOCK(i, reg): i is a byte offset. Load 32 bytes of p4 into ymm<reg>,
 * XOR in the matching 32 bytes of p3, p2, p1 and then p0, and store the
 * result back to p0.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p3[i / sizeof(*p3)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p2[i / sizeof(*p2)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p1[i / sizeof(*p1)])); \
asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
"m" (p0[i / sizeof(*p0)])); \
asm volatile("vmovdqa %%ymm" #reg ", %0" : \
"=m" (p0[i / sizeof(*p0)])); \
} while (0);
BLOCK16()
/* Advance every pointer by the 512 bytes just processed. */
p0 = (unsigned long *)((uintptr_t)p0 + 512);
p1 = (unsigned long *)((uintptr_t)p1 + 512);
p2 = (unsigned long *)((uintptr_t)p2 + 512);
p3 = (unsigned long *)((uintptr_t)p3 + 512);
p4 = (unsigned long *)((uintptr_t)p4 + 512);
}
/* Reloads ymm0..ymm3, restores CR0 and re-enables preemption. */
YMMS_RESTORE
}
/*
 * Template describing the AVX implementation: a display name plus the
 * 2-, 3-, 4- and 5-buffer XOR entry points defined above. It is handed to
 * xor_speed() by AVX_XOR_SPEED and returned by AVX_SELECT.
 */
static struct xor_block_template xor_block_avx = {
.name = "avx",
.do_2 = xor_avx_2,
.do_3 = xor_avx_3,
.do_4 = xor_avx_4,
.do_5 = xor_avx_5,
};
/*
 * AVX_XOR_SPEED - statement that passes the AVX template to xor_speed(),
 * but only when cpu_has_avx is true at run time.
 */
#define AVX_XOR_SPEED \
do { \
if (cpu_has_avx) \
xor_speed(&xor_block_avx); \
} while (0)
/*
 * AVX_SELECT(FASTEST) - expression yielding &xor_block_avx when cpu_has_avx
 * is true, otherwise the FASTEST argument unchanged.
 */
#define AVX_SELECT(FASTEST) \
(cpu_has_avx ? &xor_block_avx : FASTEST)
#else
/*
 * Fallbacks when CONFIG_AS_AVX is not defined: AVX_XOR_SPEED does nothing and
 * AVX_SELECT passes its argument through.
 * NOTE(review): the empty form is a bare `{}` rather than `do { } while (0)`,
 * so `AVX_XOR_SPEED;` expands to a block followed by an empty statement; that
 * is harmless inside a do/while body but would break an unbraced if/else.
 */
#define AVX_XOR_SPEED {}
#define AVX_SELECT(FASTEST) (FASTEST)
#endif
#endif
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/raid/xor.h> #include <linux/raid/xor.h>
#include <linux/jiffies.h> #include <linux/jiffies.h>
#include <linux/preempt.h>
#include <asm/xor.h> #include <asm/xor.h>
/* The xor routines to use. */ /* The xor routines to use. */
...@@ -63,12 +64,14 @@ static void ...@@ -63,12 +64,14 @@ static void
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{ {
int speed; int speed;
unsigned long now; unsigned long now, j;
int i, count, max; int i, count, max;
tmpl->next = template_list; tmpl->next = template_list;
template_list = tmpl; template_list = tmpl;
preempt_disable();
/* /*
* Count the number of XORs done during a whole jiffy, and use * Count the number of XORs done during a whole jiffy, and use
* this to calculate the speed of checksumming. We use a 2-page * this to calculate the speed of checksumming. We use a 2-page
...@@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) ...@@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
*/ */
max = 0; max = 0;
for (i = 0; i < 5; i++) { for (i = 0; i < 5; i++) {
now = jiffies; j = jiffies;
count = 0; count = 0;
while (jiffies == now) { while ((now = jiffies) == j)
cpu_relax();
while (time_before(jiffies, now + 1)) {
mb(); /* prevent loop optimzation */ mb(); /* prevent loop optimzation */
tmpl->do_2(BENCH_SIZE, b1, b2); tmpl->do_2(BENCH_SIZE, b1, b2);
mb(); mb();
...@@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) ...@@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
max = count; max = count;
} }
preempt_enable();
speed = max * (HZ * BENCH_SIZE / 1024); speed = max * (HZ * BENCH_SIZE / 1024);
tmpl->speed = speed; tmpl->speed = speed;
......
...@@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap) ...@@ -45,7 +45,7 @@ static inline char *bmname(struct bitmap *bitmap)
* if we find our page, we increment the page's refcount so that it stays * if we find our page, we increment the page's refcount so that it stays
* allocated while we're using it * allocated while we're using it
*/ */
static int bitmap_checkpage(struct bitmap *bitmap, static int bitmap_checkpage(struct bitmap_counts *bitmap,
unsigned long page, int create) unsigned long page, int create)
__releases(bitmap->lock) __releases(bitmap->lock)
__acquires(bitmap->lock) __acquires(bitmap->lock)
...@@ -76,8 +76,7 @@ __acquires(bitmap->lock) ...@@ -76,8 +76,7 @@ __acquires(bitmap->lock)
spin_lock_irq(&bitmap->lock); spin_lock_irq(&bitmap->lock);
if (mappage == NULL) { if (mappage == NULL) {
pr_debug("%s: bitmap map page allocation failed, hijacking\n", pr_debug("md/bitmap: map page allocation failed, hijacking\n");
bmname(bitmap));
/* failed - set the hijacked flag so that we can use the /* failed - set the hijacked flag so that we can use the
* pointer as a counter */ * pointer as a counter */
if (!bitmap->bp[page].map) if (!bitmap->bp[page].map)
...@@ -100,7 +99,7 @@ __acquires(bitmap->lock) ...@@ -100,7 +99,7 @@ __acquires(bitmap->lock)
/* if page is completely empty, put it back on the free list, or dealloc it */ /* if page is completely empty, put it back on the free list, or dealloc it */
/* if page was hijacked, unmark the flag so it might get alloced next time */ /* if page was hijacked, unmark the flag so it might get alloced next time */
/* Note: lock should be held when calling this */ /* Note: lock should be held when calling this */
static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page)
{ {
char *ptr; char *ptr;
...@@ -130,22 +129,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) ...@@ -130,22 +129,14 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
*/ */
/* IO operations when bitmap is stored near all superblocks */ /* IO operations when bitmap is stored near all superblocks */
static struct page *read_sb_page(struct mddev *mddev, loff_t offset, static int read_sb_page(struct mddev *mddev, loff_t offset,
struct page *page, struct page *page,
unsigned long index, int size) unsigned long index, int size)
{ {
/* choose a good rdev and read the page from there */ /* choose a good rdev and read the page from there */
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t target; sector_t target;
int did_alloc = 0;
if (!page) {
page = alloc_page(GFP_KERNEL);
if (!page)
return ERR_PTR(-ENOMEM);
did_alloc = 1;
}
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (! test_bit(In_sync, &rdev->flags) if (! test_bit(In_sync, &rdev->flags)
...@@ -158,15 +149,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, ...@@ -158,15 +149,10 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
roundup(size, bdev_logical_block_size(rdev->bdev)), roundup(size, bdev_logical_block_size(rdev->bdev)),
page, READ, true)) { page, READ, true)) {
page->index = index; page->index = index;
attach_page_buffers(page, NULL); /* so that free_buffer will return 0;
* quietly no-op */
return page;
} }
} }
if (did_alloc) return -EIO;
put_page(page);
return ERR_PTR(-EIO);
} }
static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev)
...@@ -208,6 +194,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -208,6 +194,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
struct md_rdev *rdev = NULL; struct md_rdev *rdev = NULL;
struct block_device *bdev; struct block_device *bdev;
struct mddev *mddev = bitmap->mddev; struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
int size = PAGE_SIZE; int size = PAGE_SIZE;
...@@ -215,9 +202,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -215,9 +202,13 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
if (page->index == bitmap->file_pages-1) if (page->index == store->file_pages-1) {
size = roundup(bitmap->last_page_size, int last_page_size = store->bytes & (PAGE_SIZE-1);
if (last_page_size == 0)
last_page_size = PAGE_SIZE;
size = roundup(last_page_size,
bdev_logical_block_size(bdev)); bdev_logical_block_size(bdev));
}
/* Just make sure we aren't corrupting data or /* Just make sure we aren't corrupting data or
* metadata * metadata
*/ */
...@@ -276,10 +267,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -276,10 +267,10 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
{ {
struct buffer_head *bh; struct buffer_head *bh;
if (bitmap->file == NULL) { if (bitmap->storage.file == NULL) {
switch (write_sb_page(bitmap, page, wait)) { switch (write_sb_page(bitmap, page, wait)) {
case -EINVAL: case -EINVAL:
bitmap->flags |= BITMAP_WRITE_ERROR; set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
} }
} else { } else {
...@@ -297,20 +288,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait) ...@@ -297,20 +288,16 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
wait_event(bitmap->write_wait, wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0); atomic_read(&bitmap->pending_writes)==0);
} }
if (bitmap->flags & BITMAP_WRITE_ERROR) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
bitmap_file_kick(bitmap); bitmap_file_kick(bitmap);
} }
static void end_bitmap_write(struct buffer_head *bh, int uptodate) static void end_bitmap_write(struct buffer_head *bh, int uptodate)
{ {
struct bitmap *bitmap = bh->b_private; struct bitmap *bitmap = bh->b_private;
unsigned long flags;
if (!uptodate) { if (!uptodate)
spin_lock_irqsave(&bitmap->lock, flags); set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
bitmap->flags |= BITMAP_WRITE_ERROR;
spin_unlock_irqrestore(&bitmap->lock, flags);
}
if (atomic_dec_and_test(&bitmap->pending_writes)) if (atomic_dec_and_test(&bitmap->pending_writes))
wake_up(&bitmap->write_wait); wake_up(&bitmap->write_wait);
} }
...@@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page) ...@@ -325,8 +312,12 @@ __clear_page_buffers(struct page *page)
} }
static void free_buffers(struct page *page) static void free_buffers(struct page *page)
{ {
struct buffer_head *bh = page_buffers(page); struct buffer_head *bh;
if (!PagePrivate(page))
return;
bh = page_buffers(page);
while (bh) { while (bh) {
struct buffer_head *next = bh->b_this_page; struct buffer_head *next = bh->b_this_page;
free_buffer_head(bh); free_buffer_head(bh);
...@@ -343,11 +334,12 @@ static void free_buffers(struct page *page) ...@@ -343,11 +334,12 @@ static void free_buffers(struct page *page)
* This usage is similar to how swap files are handled, and allows us * This usage is similar to how swap files are handled, and allows us
* to write to a file with no concerns of memory allocation failing. * to write to a file with no concerns of memory allocation failing.
*/ */
static struct page *read_page(struct file *file, unsigned long index, static int read_page(struct file *file, unsigned long index,
struct bitmap *bitmap, struct bitmap *bitmap,
unsigned long count) unsigned long count,
struct page *page)
{ {
struct page *page = NULL; int ret = 0;
struct inode *inode = file->f_path.dentry->d_inode; struct inode *inode = file->f_path.dentry->d_inode;
struct buffer_head *bh; struct buffer_head *bh;
sector_t block; sector_t block;
...@@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index, ...@@ -355,16 +347,9 @@ static struct page *read_page(struct file *file, unsigned long index,
pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT); (unsigned long long)index << PAGE_SHIFT);
page = alloc_page(GFP_KERNEL);
if (!page)
page = ERR_PTR(-ENOMEM);
if (IS_ERR(page))
goto out;
bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0); bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
if (!bh) { if (!bh) {
put_page(page); ret = -ENOMEM;
page = ERR_PTR(-ENOMEM);
goto out; goto out;
} }
attach_page_buffers(page, bh); attach_page_buffers(page, bh);
...@@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index, ...@@ -376,8 +361,7 @@ static struct page *read_page(struct file *file, unsigned long index,
bh->b_blocknr = bmap(inode, block); bh->b_blocknr = bmap(inode, block);
if (bh->b_blocknr == 0) { if (bh->b_blocknr == 0) {
/* Cannot use this file! */ /* Cannot use this file! */
free_buffers(page); ret = -EINVAL;
page = ERR_PTR(-EINVAL);
goto out; goto out;
} }
bh->b_bdev = inode->i_sb->s_bdev; bh->b_bdev = inode->i_sb->s_bdev;
...@@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index, ...@@ -400,17 +384,15 @@ static struct page *read_page(struct file *file, unsigned long index,
wait_event(bitmap->write_wait, wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0); atomic_read(&bitmap->pending_writes)==0);
if (bitmap->flags & BITMAP_WRITE_ERROR) { if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
free_buffers(page); ret = -EIO;
page = ERR_PTR(-EIO);
}
out: out:
if (IS_ERR(page)) if (ret)
printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %ld\n", printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
(int)PAGE_SIZE, (int)PAGE_SIZE,
(unsigned long long)index << PAGE_SHIFT, (unsigned long long)index << PAGE_SHIFT,
PTR_ERR(page)); ret);
return page; return ret;
} }
/* /*
...@@ -426,9 +408,9 @@ void bitmap_update_sb(struct bitmap *bitmap) ...@@ -426,9 +408,9 @@ void bitmap_update_sb(struct bitmap *bitmap)
return; return;
if (bitmap->mddev->bitmap_info.external) if (bitmap->mddev->bitmap_info.external)
return; return;
if (!bitmap->sb_page) /* no superblock */ if (!bitmap->storage.sb_page) /* no superblock */
return; return;
sb = kmap_atomic(bitmap->sb_page); sb = kmap_atomic(bitmap->storage.sb_page);
sb->events = cpu_to_le64(bitmap->mddev->events); sb->events = cpu_to_le64(bitmap->mddev->events);
if (bitmap->mddev->events < bitmap->events_cleared) if (bitmap->mddev->events < bitmap->events_cleared)
/* rocking back to read-only */ /* rocking back to read-only */
...@@ -438,8 +420,13 @@ void bitmap_update_sb(struct bitmap *bitmap) ...@@ -438,8 +420,13 @@ void bitmap_update_sb(struct bitmap *bitmap)
/* Just in case these have been changed via sysfs: */ /* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
/* This might have been changed by a reshape */
sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_atomic(sb); kunmap_atomic(sb);
write_page(bitmap, bitmap->sb_page, 1); write_page(bitmap, bitmap->storage.sb_page, 1);
} }
/* print out the bitmap file superblock */ /* print out the bitmap file superblock */
...@@ -447,9 +434,9 @@ void bitmap_print_sb(struct bitmap *bitmap) ...@@ -447,9 +434,9 @@ void bitmap_print_sb(struct bitmap *bitmap)
{ {
bitmap_super_t *sb; bitmap_super_t *sb;
if (!bitmap || !bitmap->sb_page) if (!bitmap || !bitmap->storage.sb_page)
return; return;
sb = kmap_atomic(bitmap->sb_page); sb = kmap_atomic(bitmap->storage.sb_page);
printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
...@@ -488,15 +475,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) ...@@ -488,15 +475,15 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
unsigned long chunksize, daemon_sleep, write_behind; unsigned long chunksize, daemon_sleep, write_behind;
int err = -EINVAL; int err = -EINVAL;
bitmap->sb_page = alloc_page(GFP_KERNEL); bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
if (IS_ERR(bitmap->sb_page)) { if (IS_ERR(bitmap->storage.sb_page)) {
err = PTR_ERR(bitmap->sb_page); err = PTR_ERR(bitmap->storage.sb_page);
bitmap->sb_page = NULL; bitmap->storage.sb_page = NULL;
return err; return err;
} }
bitmap->sb_page->index = 0; bitmap->storage.sb_page->index = 0;
sb = kmap_atomic(bitmap->sb_page); sb = kmap_atomic(bitmap->storage.sb_page);
sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->magic = cpu_to_le32(BITMAP_MAGIC);
sb->version = cpu_to_le32(BITMAP_MAJOR_HI); sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
...@@ -534,8 +521,8 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) ...@@ -534,8 +521,8 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
memcpy(sb->uuid, bitmap->mddev->uuid, 16); memcpy(sb->uuid, bitmap->mddev->uuid, 16);
bitmap->flags |= BITMAP_STALE; set_bit(BITMAP_STALE, &bitmap->flags);
sb->state |= cpu_to_le32(BITMAP_STALE); sb->state = cpu_to_le32(bitmap->flags);
bitmap->events_cleared = bitmap->mddev->events; bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->mddev->events); sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
...@@ -551,31 +538,45 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -551,31 +538,45 @@ static int bitmap_read_sb(struct bitmap *bitmap)
bitmap_super_t *sb; bitmap_super_t *sb;
unsigned long chunksize, daemon_sleep, write_behind; unsigned long chunksize, daemon_sleep, write_behind;
unsigned long long events; unsigned long long events;
unsigned long sectors_reserved = 0;
int err = -EINVAL; int err = -EINVAL;
struct page *sb_page;
if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
chunksize = 128 * 1024 * 1024;
daemon_sleep = 5 * HZ;
write_behind = 0;
set_bit(BITMAP_STALE, &bitmap->flags);
err = 0;
goto out_no_sb;
}
/* page 0 is the superblock, read it... */ /* page 0 is the superblock, read it... */
if (bitmap->file) { sb_page = alloc_page(GFP_KERNEL);
loff_t isize = i_size_read(bitmap->file->f_mapping->host); if (!sb_page)
return -ENOMEM;
bitmap->storage.sb_page = sb_page;
if (bitmap->storage.file) {
loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); err = read_page(bitmap->storage.file, 0,
bitmap, bytes, sb_page);
} else { } else {
bitmap->sb_page = read_sb_page(bitmap->mddev, err = read_sb_page(bitmap->mddev,
bitmap->mddev->bitmap_info.offset, bitmap->mddev->bitmap_info.offset,
NULL, sb_page,
0, sizeof(bitmap_super_t)); 0, sizeof(bitmap_super_t));
} }
if (IS_ERR(bitmap->sb_page)) { if (err)
err = PTR_ERR(bitmap->sb_page);
bitmap->sb_page = NULL;
return err; return err;
}
sb = kmap_atomic(bitmap->sb_page); sb = kmap_atomic(sb_page);
chunksize = le32_to_cpu(sb->chunksize); chunksize = le32_to_cpu(sb->chunksize);
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind); write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
/* verify that the bitmap-specific fields are valid */ /* verify that the bitmap-specific fields are valid */
if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
...@@ -618,60 +619,32 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -618,60 +619,32 @@ static int bitmap_read_sb(struct bitmap *bitmap)
"-- forcing full recovery\n", "-- forcing full recovery\n",
bmname(bitmap), events, bmname(bitmap), events,
(unsigned long long) bitmap->mddev->events); (unsigned long long) bitmap->mddev->events);
sb->state |= cpu_to_le32(BITMAP_STALE); set_bit(BITMAP_STALE, &bitmap->flags);
} }
} }
/* assign fields using values from superblock */ /* assign fields using values from superblock */
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
bitmap->flags |= le32_to_cpu(sb->state); bitmap->flags |= le32_to_cpu(sb->state);
if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
bitmap->flags |= BITMAP_HOSTENDIAN; set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
bitmap->events_cleared = le64_to_cpu(sb->events_cleared); bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
if (bitmap->flags & BITMAP_STALE)
bitmap->events_cleared = bitmap->mddev->events;
err = 0; err = 0;
out: out:
kunmap_atomic(sb); kunmap_atomic(sb);
out_no_sb:
if (test_bit(BITMAP_STALE, &bitmap->flags))
bitmap->events_cleared = bitmap->mddev->events;
bitmap->mddev->bitmap_info.chunksize = chunksize;
bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
bitmap->mddev->bitmap_info.max_write_behind = write_behind;
if (bitmap->mddev->bitmap_info.space == 0 ||
bitmap->mddev->bitmap_info.space > sectors_reserved)
bitmap->mddev->bitmap_info.space = sectors_reserved;
if (err) if (err)
bitmap_print_sb(bitmap); bitmap_print_sb(bitmap);
return err; return err;
} }
enum bitmap_mask_op {
MASK_SET,
MASK_UNSET
};
/* record the state of the bitmap in the superblock. Return the old value */
static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
enum bitmap_mask_op op)
{
bitmap_super_t *sb;
int old;
if (!bitmap->sb_page) /* can't set the state */
return 0;
sb = kmap_atomic(bitmap->sb_page);
old = le32_to_cpu(sb->state) & bits;
switch (op) {
case MASK_SET:
sb->state |= cpu_to_le32(bits);
bitmap->flags |= bits;
break;
case MASK_UNSET:
sb->state &= cpu_to_le32(~bits);
bitmap->flags &= ~bits;
break;
default:
BUG();
}
kunmap_atomic(sb);
return old;
}
/* /*
* general bitmap file operations * general bitmap file operations
*/ */
...@@ -683,17 +656,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, ...@@ -683,17 +656,19 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
* file a page at a time. There's a superblock at the start of the file. * file a page at a time. There's a superblock at the start of the file.
*/ */
/* calculate the index of the page that contains this bit */ /* calculate the index of the page that contains this bit */
static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) static inline unsigned long file_page_index(struct bitmap_storage *store,
unsigned long chunk)
{ {
if (!bitmap->mddev->bitmap_info.external) if (store->sb_page)
chunk += sizeof(bitmap_super_t) << 3; chunk += sizeof(bitmap_super_t) << 3;
return chunk >> PAGE_BIT_SHIFT; return chunk >> PAGE_BIT_SHIFT;
} }
/* calculate the (bit) offset of this bit within a page */ /* calculate the (bit) offset of this bit within a page */
static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) static inline unsigned long file_page_offset(struct bitmap_storage *store,
unsigned long chunk)
{ {
if (!bitmap->mddev->bitmap_info.external) if (store->sb_page)
chunk += sizeof(bitmap_super_t) << 3; chunk += sizeof(bitmap_super_t) << 3;
return chunk & (PAGE_BITS - 1); return chunk & (PAGE_BITS - 1);
} }
...@@ -705,57 +680,86 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon ...@@ -705,57 +680,86 @@ static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned lon
* 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
* 0 or page 1 * 0 or page 1
*/ */
static inline struct page *filemap_get_page(struct bitmap *bitmap, static inline struct page *filemap_get_page(struct bitmap_storage *store,
unsigned long chunk) unsigned long chunk)
{ {
if (file_page_index(bitmap, chunk) >= bitmap->file_pages) if (file_page_index(store, chunk) >= store->file_pages)
return NULL; return NULL;
return bitmap->filemap[file_page_index(bitmap, chunk) return store->filemap[file_page_index(store, chunk)
- file_page_index(bitmap, 0)]; - file_page_index(store, 0)];
} }
static void bitmap_file_unmap(struct bitmap *bitmap) static int bitmap_storage_alloc(struct bitmap_storage *store,
unsigned long chunks, int with_super)
{
int pnum;
unsigned long num_pages;
unsigned long bytes;
bytes = DIV_ROUND_UP(chunks, 8);
if (with_super)
bytes += sizeof(bitmap_super_t);
num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
store->filemap = kmalloc(sizeof(struct page *)
* num_pages, GFP_KERNEL);
if (!store->filemap)
return -ENOMEM;
if (with_super && !store->sb_page) {
store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (store->sb_page == NULL)
return -ENOMEM;
store->sb_page->index = 0;
}
pnum = 0;
if (store->sb_page) {
store->filemap[0] = store->sb_page;
pnum = 1;
}
for ( ; pnum < num_pages; pnum++) {
store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
if (!store->filemap[pnum]) {
store->file_pages = pnum;
return -ENOMEM;
}
store->filemap[pnum]->index = pnum;
}
store->file_pages = pnum;
/* We need 4 bits per page, rounded up to a multiple
* of sizeof(unsigned long) */
store->filemap_attr = kzalloc(
roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
GFP_KERNEL);
if (!store->filemap_attr)
return -ENOMEM;
store->bytes = bytes;
return 0;
}
static void bitmap_file_unmap(struct bitmap_storage *store)
{ {
struct page **map, *sb_page; struct page **map, *sb_page;
unsigned long *attr;
int pages; int pages;
unsigned long flags; struct file *file;
spin_lock_irqsave(&bitmap->lock, flags); file = store->file;
map = bitmap->filemap; map = store->filemap;
bitmap->filemap = NULL; pages = store->file_pages;
attr = bitmap->filemap_attr; sb_page = store->sb_page;
bitmap->filemap_attr = NULL;
pages = bitmap->file_pages;
bitmap->file_pages = 0;
sb_page = bitmap->sb_page;
bitmap->sb_page = NULL;
spin_unlock_irqrestore(&bitmap->lock, flags);
while (pages--) while (pages--)
if (map[pages] != sb_page) /* 0 is sb_page, release it below */ if (map[pages] != sb_page) /* 0 is sb_page, release it below */
free_buffers(map[pages]); free_buffers(map[pages]);
kfree(map); kfree(map);
kfree(attr); kfree(store->filemap_attr);
if (sb_page) if (sb_page)
free_buffers(sb_page); free_buffers(sb_page);
}
static void bitmap_file_put(struct bitmap *bitmap)
{
struct file *file;
unsigned long flags;
spin_lock_irqsave(&bitmap->lock, flags);
file = bitmap->file;
bitmap->file = NULL;
spin_unlock_irqrestore(&bitmap->lock, flags);
if (file)
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
bitmap_file_unmap(bitmap);
if (file) { if (file) {
struct inode *inode = file->f_path.dentry->d_inode; struct inode *inode = file->f_path.dentry->d_inode;
...@@ -773,14 +777,14 @@ static void bitmap_file_kick(struct bitmap *bitmap) ...@@ -773,14 +777,14 @@ static void bitmap_file_kick(struct bitmap *bitmap)
{ {
char *path, *ptr = NULL; char *path, *ptr = NULL;
if (bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET) == 0) { if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
bitmap_update_sb(bitmap); bitmap_update_sb(bitmap);
if (bitmap->file) { if (bitmap->storage.file) {
path = kmalloc(PAGE_SIZE, GFP_KERNEL); path = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (path) if (path)
ptr = d_path(&bitmap->file->f_path, path, ptr = d_path(&bitmap->storage.file->f_path,
PAGE_SIZE); path, PAGE_SIZE);
printk(KERN_ALERT printk(KERN_ALERT
"%s: kicking failed bitmap file %s from array!\n", "%s: kicking failed bitmap file %s from array!\n",
...@@ -792,10 +796,6 @@ static void bitmap_file_kick(struct bitmap *bitmap) ...@@ -792,10 +796,6 @@ static void bitmap_file_kick(struct bitmap *bitmap)
"%s: disabling internal bitmap due to errors\n", "%s: disabling internal bitmap due to errors\n",
bmname(bitmap)); bmname(bitmap));
} }
bitmap_file_put(bitmap);
return;
} }
enum bitmap_page_attr { enum bitmap_page_attr {
...@@ -805,24 +805,30 @@ enum bitmap_page_attr { ...@@ -805,24 +805,30 @@ enum bitmap_page_attr {
BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */
}; };
static inline void set_page_attr(struct bitmap *bitmap, struct page *page, static inline void set_page_attr(struct bitmap *bitmap, int pnum,
enum bitmap_page_attr attr) enum bitmap_page_attr attr)
{ {
__set_bit((page->index<<2) + attr, bitmap->filemap_attr); set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
} }
static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, static inline void clear_page_attr(struct bitmap *bitmap, int pnum,
enum bitmap_page_attr attr) enum bitmap_page_attr attr)
{ {
__clear_bit((page->index<<2) + attr, bitmap->filemap_attr); clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
} }
static inline unsigned long test_page_attr(struct bitmap *bitmap, struct page *page, static inline int test_page_attr(struct bitmap *bitmap, int pnum,
enum bitmap_page_attr attr) enum bitmap_page_attr attr)
{ {
return test_bit((page->index<<2) + attr, bitmap->filemap_attr); return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr);
} }
/*
 * Test and clear attribute @attr of file page number @pnum.
 *
 * Each page owns a group of four bits in storage.filemap_attr, so the
 * page number is shifted left by two to reach the start of its group.
 * Returns whatever test_and_clear_bit() reports for that bit.
 */
static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum,
					   enum bitmap_page_attr attr)
{
	unsigned long *page_attrs = bitmap->storage.filemap_attr;

	return test_and_clear_bit((pnum << 2) + attr, page_attrs);
}
/* /*
* bitmap_file_set_bit -- called before performing a write to the md device * bitmap_file_set_bit -- called before performing a write to the md device
* to set (and eventually sync) a particular bit in the bitmap file * to set (and eventually sync) a particular bit in the bitmap file
...@@ -835,26 +841,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) ...@@ -835,26 +841,46 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long bit; unsigned long bit;
struct page *page; struct page *page;
void *kaddr; void *kaddr;
unsigned long chunk = block >> bitmap->chunkshift; unsigned long chunk = block >> bitmap->counts.chunkshift;
if (!bitmap->filemap) page = filemap_get_page(&bitmap->storage, chunk);
return;
page = filemap_get_page(bitmap, chunk);
if (!page) if (!page)
return; return;
bit = file_page_offset(bitmap, chunk); bit = file_page_offset(&bitmap->storage, chunk);
/* set the bit */ /* set the bit */
kaddr = kmap_atomic(page); kaddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
set_bit(bit, kaddr); set_bit(bit, kaddr);
else else
__set_bit_le(bit, kaddr); test_and_set_bit_le(bit, kaddr);
kunmap_atomic(kaddr); kunmap_atomic(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, page->index); pr_debug("set file bit %lu page %lu\n", bit, page->index);
/* record page number so it gets flushed to disk when unplug occurs */ /* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); set_page_attr(bitmap, page->index, BITMAP_PAGE_DIRTY);
}
static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
{
unsigned long bit;
struct page *page;
void *paddr;
unsigned long chunk = block >> bitmap->counts.chunkshift;
page = filemap_get_page(&bitmap->storage, chunk);
if (!page)
return;
bit = file_page_offset(&bitmap->storage, chunk);
paddr = kmap_atomic(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
clear_bit(bit, paddr);
else
test_and_clear_bit_le(bit, paddr);
kunmap_atomic(paddr);
if (!test_page_attr(bitmap, page->index, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, page->index, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
} }
/* this gets called when the md device is ready to unplug its underlying /* this gets called when the md device is ready to unplug its underlying
...@@ -862,42 +888,37 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) ...@@ -862,42 +888,37 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
* sync the dirty pages of the bitmap file to disk */ * sync the dirty pages of the bitmap file to disk */
void bitmap_unplug(struct bitmap *bitmap) void bitmap_unplug(struct bitmap *bitmap)
{ {
unsigned long i, flags; unsigned long i;
int dirty, need_write; int dirty, need_write;
struct page *page;
int wait = 0; int wait = 0;
if (!bitmap) if (!bitmap || !bitmap->storage.filemap ||
test_bit(BITMAP_STALE, &bitmap->flags))
return; return;
/* look at each page to see if there are any set bits that need to be /* look at each page to see if there are any set bits that need to be
* flushed out to disk */ * flushed out to disk */
for (i = 0; i < bitmap->file_pages; i++) { for (i = 0; i < bitmap->storage.file_pages; i++) {
spin_lock_irqsave(&bitmap->lock, flags); if (!bitmap->storage.filemap)
if (!bitmap->filemap) {
spin_unlock_irqrestore(&bitmap->lock, flags);
return; return;
dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
need_write = test_and_clear_page_attr(bitmap, i,
BITMAP_PAGE_NEEDWRITE);
if (dirty || need_write) {
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
write_page(bitmap, bitmap->storage.filemap[i], 0);
} }
page = bitmap->filemap[i];
dirty = test_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
need_write = test_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
if (dirty) if (dirty)
wait = 1; wait = 1;
spin_unlock_irqrestore(&bitmap->lock, flags);
if (dirty || need_write)
write_page(bitmap, page, 0);
} }
if (wait) { /* if any writes were performed, we need to wait on them */ if (wait) { /* if any writes were performed, we need to wait on them */
if (bitmap->file) if (bitmap->storage.file)
wait_event(bitmap->write_wait, wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0); atomic_read(&bitmap->pending_writes)==0);
else else
md_super_wait(bitmap->mddev); md_super_wait(bitmap->mddev);
} }
if (bitmap->flags & BITMAP_WRITE_ERROR) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
bitmap_file_kick(bitmap); bitmap_file_kick(bitmap);
} }
EXPORT_SYMBOL(bitmap_unplug); EXPORT_SYMBOL(bitmap_unplug);
...@@ -917,98 +938,77 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n ...@@ -917,98 +938,77 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{ {
unsigned long i, chunks, index, oldindex, bit; unsigned long i, chunks, index, oldindex, bit;
struct page *page = NULL, *oldpage = NULL; struct page *page = NULL;
unsigned long num_pages, bit_cnt = 0; unsigned long bit_cnt = 0;
struct file *file; struct file *file;
unsigned long bytes, offset; unsigned long offset;
int outofdate; int outofdate;
int ret = -ENOSPC; int ret = -ENOSPC;
void *paddr; void *paddr;
struct bitmap_storage *store = &bitmap->storage;
chunks = bitmap->chunks; chunks = bitmap->counts.chunks;
file = bitmap->file; file = store->file;
BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); if (!file && !bitmap->mddev->bitmap_info.offset) {
/* No permanent bitmap - fill with '1s'. */
store->filemap = NULL;
store->file_pages = 0;
for (i = 0; i < chunks ; i++) {
/* if the disk bit is set, set the memory bit */
int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift)
>= start);
bitmap_set_memory_bits(bitmap,
(sector_t)i << bitmap->counts.chunkshift,
needed);
}
return 0;
}
outofdate = bitmap->flags & BITMAP_STALE; outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
if (outofdate) if (outofdate)
printk(KERN_INFO "%s: bitmap file is out of date, doing full " printk(KERN_INFO "%s: bitmap file is out of date, doing full "
"recovery\n", bmname(bitmap)); "recovery\n", bmname(bitmap));
bytes = DIV_ROUND_UP(bitmap->chunks, 8); if (file && i_size_read(file->f_mapping->host) < store->bytes) {
if (!bitmap->mddev->bitmap_info.external)
bytes += sizeof(bitmap_super_t);
num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
if (file && i_size_read(file->f_mapping->host) < bytes) {
printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
bmname(bitmap), bmname(bitmap),
(unsigned long) i_size_read(file->f_mapping->host), (unsigned long) i_size_read(file->f_mapping->host),
bytes); store->bytes);
goto err; goto err;
} }
ret = -ENOMEM;
bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
if (!bitmap->filemap)
goto err;
/* We need 4 bits per page, rounded up to a multiple of sizeof(unsigned long) */
bitmap->filemap_attr = kzalloc(
roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)),
GFP_KERNEL);
if (!bitmap->filemap_attr)
goto err;
oldindex = ~0L; oldindex = ~0L;
offset = 0;
if (!bitmap->mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
for (i = 0; i < chunks; i++) { for (i = 0; i < chunks; i++) {
int b; int b;
index = file_page_index(bitmap, i); index = file_page_index(&bitmap->storage, i);
bit = file_page_offset(bitmap, i); bit = file_page_offset(&bitmap->storage, i);
if (index != oldindex) { /* this is a new page, read it in */ if (index != oldindex) { /* this is a new page, read it in */
int count; int count;
/* unmap the old page, we're done with it */ /* unmap the old page, we're done with it */
if (index == num_pages-1) if (index == store->file_pages-1)
count = bytes - index * PAGE_SIZE; count = store->bytes - index * PAGE_SIZE;
else else
count = PAGE_SIZE; count = PAGE_SIZE;
if (index == 0 && bitmap->sb_page) { page = store->filemap[index];
/* if (file)
* if we're here then the superblock page ret = read_page(file, index, bitmap,
* contains some bits (PAGE_SIZE != sizeof sb) count, page);
* we've already read it in, so just use it else
*/ ret = read_sb_page(
page = bitmap->sb_page; bitmap->mddev,
offset = sizeof(bitmap_super_t); bitmap->mddev->bitmap_info.offset,
if (!file) page,
page = read_sb_page( index, count);
bitmap->mddev,
bitmap->mddev->bitmap_info.offset, if (ret)
page,
index, count);
} else if (file) {
page = read_page(file, index, bitmap, count);
offset = 0;
} else {
page = read_sb_page(bitmap->mddev,
bitmap->mddev->bitmap_info.offset,
NULL,
index, count);
offset = 0;
}
if (IS_ERR(page)) { /* read error */
ret = PTR_ERR(page);
goto err; goto err;
}
oldindex = index; oldindex = index;
oldpage = page;
bitmap->filemap[bitmap->file_pages++] = page;
bitmap->last_page_size = count;
if (outofdate) { if (outofdate) {
/* /*
...@@ -1022,39 +1022,33 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) ...@@ -1022,39 +1022,33 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
write_page(bitmap, page, 1); write_page(bitmap, page, 1);
ret = -EIO; ret = -EIO;
if (bitmap->flags & BITMAP_WRITE_ERROR) if (test_bit(BITMAP_WRITE_ERROR,
&bitmap->flags))
goto err; goto err;
} }
} }
paddr = kmap_atomic(page); paddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
b = test_bit(bit, paddr); b = test_bit(bit, paddr);
else else
b = test_bit_le(bit, paddr); b = test_bit_le(bit, paddr);
kunmap_atomic(paddr); kunmap_atomic(paddr);
if (b) { if (b) {
/* if the disk bit is set, set the memory bit */ /* if the disk bit is set, set the memory bit */
int needed = ((sector_t)(i+1) << bitmap->chunkshift int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
>= start); >= start);
bitmap_set_memory_bits(bitmap, bitmap_set_memory_bits(bitmap,
(sector_t)i << bitmap->chunkshift, (sector_t)i << bitmap->counts.chunkshift,
needed); needed);
bit_cnt++; bit_cnt++;
} }
} offset = 0;
/* everything went OK */
ret = 0;
bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
if (bit_cnt) { /* Kick recovery if any bits were set */
set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
md_wakeup_thread(bitmap->mddev->thread);
} }
printk(KERN_INFO "%s: bitmap initialized from disk: " printk(KERN_INFO "%s: bitmap initialized from disk: "
"read %lu/%lu pages, set %lu of %lu bits\n", "read %lu pages, set %lu of %lu bits\n",
bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, chunks); bmname(bitmap), store->file_pages,
bit_cnt, chunks);
return 0; return 0;
...@@ -1071,22 +1065,38 @@ void bitmap_write_all(struct bitmap *bitmap) ...@@ -1071,22 +1065,38 @@ void bitmap_write_all(struct bitmap *bitmap)
*/ */
int i; int i;
spin_lock_irq(&bitmap->lock); if (!bitmap || !bitmap->storage.filemap)
for (i = 0; i < bitmap->file_pages; i++) return;
set_page_attr(bitmap, bitmap->filemap[i], if (bitmap->storage.file)
/* Only one copy, so nothing needed */
return;
for (i = 0; i < bitmap->storage.file_pages; i++)
set_page_attr(bitmap, i,
BITMAP_PAGE_NEEDWRITE); BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0; bitmap->allclean = 0;
spin_unlock_irq(&bitmap->lock);
} }
static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) static void bitmap_count_page(struct bitmap_counts *bitmap,
sector_t offset, int inc)
{ {
sector_t chunk = offset >> bitmap->chunkshift; sector_t chunk = offset >> bitmap->chunkshift;
unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
bitmap->bp[page].count += inc; bitmap->bp[page].count += inc;
bitmap_checkfree(bitmap, page); bitmap_checkfree(bitmap, page);
} }
static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
/*
 * Mark the counter page that covers sector @offset as pending.
 *
 * The sector is converted to a chunk number via chunkshift, and the chunk
 * number to a counter-page index via PAGE_COUNTER_SHIFT. The flag is only
 * written when it is not already set.
 */
static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
{
	sector_t chunk_nr = offset >> bitmap->chunkshift;
	unsigned long page_nr = chunk_nr >> PAGE_COUNTER_SHIFT;

	if (bitmap->bp[page_nr].pending)
		return;	/* already flagged */
	bitmap->bp[page_nr].pending = 1;
}
static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks, sector_t offset, sector_t *blocks,
int create); int create);
...@@ -1099,10 +1109,9 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1099,10 +1109,9 @@ void bitmap_daemon_work(struct mddev *mddev)
{ {
struct bitmap *bitmap; struct bitmap *bitmap;
unsigned long j; unsigned long j;
unsigned long flags; unsigned long nextpage;
struct page *page = NULL, *lastpage = NULL;
sector_t blocks; sector_t blocks;
void *paddr; struct bitmap_counts *counts;
/* Use a mutex to guard daemon_work against /* Use a mutex to guard daemon_work against
* bitmap_destroy. * bitmap_destroy.
...@@ -1124,112 +1133,90 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1124,112 +1133,90 @@ void bitmap_daemon_work(struct mddev *mddev)
} }
bitmap->allclean = 1; bitmap->allclean = 1;
spin_lock_irqsave(&bitmap->lock, flags); /* Any file-page which is PENDING now needs to be written.
for (j = 0; j < bitmap->chunks; j++) { * So set NEEDWRITE now, then after we make any last-minute changes
* we will write it.
*/
for (j = 0; j < bitmap->storage.file_pages; j++)
if (test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_PENDING))
set_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE);
if (bitmap->need_sync &&
mddev->bitmap_info.external == 0) {
/* Arrange for superblock update as well as
* other changes */
bitmap_super_t *sb;
bitmap->need_sync = 0;
if (bitmap->storage.filemap) {
sb = kmap_atomic(bitmap->storage.sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
kunmap_atomic(sb);
set_page_attr(bitmap, 0,
BITMAP_PAGE_NEEDWRITE);
}
}
/* Now look at the bitmap counters and if any are '2' or '1',
* decrement and handle accordingly.
*/
counts = &bitmap->counts;
spin_lock_irq(&counts->lock);
nextpage = 0;
for (j = 0; j < counts->chunks; j++) {
bitmap_counter_t *bmc; bitmap_counter_t *bmc;
if (!bitmap->filemap) sector_t block = (sector_t)j << counts->chunkshift;
/* error or shutdown */
break;
page = filemap_get_page(bitmap, j); if (j == nextpage) {
nextpage += PAGE_COUNTER_RATIO;
if (page != lastpage) { if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) {
/* skip this page unless it's marked as needing cleaning */ j |= PAGE_COUNTER_MASK;
if (!test_page_attr(bitmap, page, BITMAP_PAGE_PENDING)) {
int need_write = test_page_attr(bitmap, page,
BITMAP_PAGE_NEEDWRITE);
if (need_write)
clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
if (need_write)
write_page(bitmap, page, 0);
spin_lock_irqsave(&bitmap->lock, flags);
j |= (PAGE_BITS - 1);
continue; continue;
} }
counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
/* grab the new page, sync and release the old */
if (lastpage != NULL) {
if (test_page_attr(bitmap, lastpage,
BITMAP_PAGE_NEEDWRITE)) {
clear_page_attr(bitmap, lastpage,
BITMAP_PAGE_NEEDWRITE);
spin_unlock_irqrestore(&bitmap->lock, flags);
write_page(bitmap, lastpage, 0);
} else {
set_page_attr(bitmap, lastpage,
BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0;
spin_unlock_irqrestore(&bitmap->lock, flags);
}
} else
spin_unlock_irqrestore(&bitmap->lock, flags);
lastpage = page;
/* We are possibly going to clear some bits, so make
* sure that events_cleared is up-to-date.
*/
if (bitmap->need_sync &&
mddev->bitmap_info.external == 0) {
bitmap_super_t *sb;
bitmap->need_sync = 0;
sb = kmap_atomic(bitmap->sb_page);
sb->events_cleared =
cpu_to_le64(bitmap->events_cleared);
kunmap_atomic(sb);
write_page(bitmap, bitmap->sb_page, 1);
}
spin_lock_irqsave(&bitmap->lock, flags);
if (!bitmap->need_sync)
clear_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
else
bitmap->allclean = 0;
} }
bmc = bitmap_get_counter(bitmap, bmc = bitmap_get_counter(counts,
(sector_t)j << bitmap->chunkshift, block,
&blocks, 0); &blocks, 0);
if (!bmc)
if (!bmc) {
j |= PAGE_COUNTER_MASK; j |= PAGE_COUNTER_MASK;
else if (*bmc) { continue;
if (*bmc == 1 && !bitmap->need_sync) {
/* we can clear the bit */
*bmc = 0;
bitmap_count_page(bitmap,
(sector_t)j << bitmap->chunkshift,
-1);
/* clear the bit */
paddr = kmap_atomic(page);
if (bitmap->flags & BITMAP_HOSTENDIAN)
clear_bit(file_page_offset(bitmap, j),
paddr);
else
__clear_bit_le(
file_page_offset(bitmap,
j),
paddr);
kunmap_atomic(paddr);
} else if (*bmc <= 2) {
*bmc = 1; /* maybe clear the bit next time */
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
} }
} if (*bmc == 1 && !bitmap->need_sync) {
spin_unlock_irqrestore(&bitmap->lock, flags); /* We can clear the bit */
*bmc = 0;
/* now sync the final page */ bitmap_count_page(counts, block, -1);
if (lastpage != NULL) { bitmap_file_clear_bit(bitmap, block);
spin_lock_irqsave(&bitmap->lock, flags); } else if (*bmc && *bmc <= 2) {
if (test_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE)) { *bmc = 1;
clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); bitmap_set_pending(counts, block);
spin_unlock_irqrestore(&bitmap->lock, flags);
write_page(bitmap, lastpage, 0);
} else {
set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0; bitmap->allclean = 0;
spin_unlock_irqrestore(&bitmap->lock, flags); }
}
spin_unlock_irq(&counts->lock);
/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
* DIRTY pages need to be written by bitmap_unplug so it can wait
* for them.
* If we find any DIRTY page we stop there and let bitmap_unplug
* handle all the rest. This is important in the case where
* the first block holds the superblock and it has been updated.
* We mustn't write any other blocks before the superblock.
*/
for (j = 0;
j < bitmap->storage.file_pages
&& !test_bit(BITMAP_STALE, &bitmap->flags);
j++) {
if (test_page_attr(bitmap, j,
BITMAP_PAGE_DIRTY))
/* bitmap_unplug will handle the rest */
break;
if (test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) {
write_page(bitmap, bitmap->storage.filemap[j], 0);
} }
} }
...@@ -1240,7 +1227,7 @@ void bitmap_daemon_work(struct mddev *mddev) ...@@ -1240,7 +1227,7 @@ void bitmap_daemon_work(struct mddev *mddev)
mutex_unlock(&mddev->bitmap_info.mutex); mutex_unlock(&mddev->bitmap_info.mutex);
} }
static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks, sector_t offset, sector_t *blocks,
int create) int create)
__releases(bitmap->lock) __releases(bitmap->lock)
...@@ -1302,10 +1289,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect ...@@ -1302,10 +1289,10 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
sector_t blocks; sector_t blocks;
bitmap_counter_t *bmc; bitmap_counter_t *bmc;
spin_lock_irq(&bitmap->lock); spin_lock_irq(&bitmap->counts.lock);
bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
if (!bmc) { if (!bmc) {
spin_unlock_irq(&bitmap->lock); spin_unlock_irq(&bitmap->counts.lock);
return 0; return 0;
} }
...@@ -1317,7 +1304,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect ...@@ -1317,7 +1304,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
*/ */
prepare_to_wait(&bitmap->overflow_wait, &__wait, prepare_to_wait(&bitmap->overflow_wait, &__wait,
TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&bitmap->lock); spin_unlock_irq(&bitmap->counts.lock);
io_schedule(); io_schedule();
finish_wait(&bitmap->overflow_wait, &__wait); finish_wait(&bitmap->overflow_wait, &__wait);
continue; continue;
...@@ -1326,7 +1313,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect ...@@ -1326,7 +1313,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
switch (*bmc) { switch (*bmc) {
case 0: case 0:
bitmap_file_set_bit(bitmap, offset); bitmap_file_set_bit(bitmap, offset);
bitmap_count_page(bitmap, offset, 1); bitmap_count_page(&bitmap->counts, offset, 1);
/* fall through */ /* fall through */
case 1: case 1:
*bmc = 2; *bmc = 2;
...@@ -1334,7 +1321,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect ...@@ -1334,7 +1321,7 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect
(*bmc)++; (*bmc)++;
spin_unlock_irq(&bitmap->lock); spin_unlock_irq(&bitmap->counts.lock);
offset += blocks; offset += blocks;
if (sectors > blocks) if (sectors > blocks)
...@@ -1364,10 +1351,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto ...@@ -1364,10 +1351,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
unsigned long flags; unsigned long flags;
bitmap_counter_t *bmc; bitmap_counter_t *bmc;
spin_lock_irqsave(&bitmap->lock, flags); spin_lock_irqsave(&bitmap->counts.lock, flags);
bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
if (!bmc) { if (!bmc) {
spin_unlock_irqrestore(&bitmap->lock, flags); spin_unlock_irqrestore(&bitmap->counts.lock, flags);
return; return;
} }
...@@ -1386,14 +1373,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto ...@@ -1386,14 +1373,10 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
(*bmc)--; (*bmc)--;
if (*bmc <= 2) { if (*bmc <= 2) {
set_page_attr(bitmap, bitmap_set_pending(&bitmap->counts, offset);
filemap_get_page(
bitmap,
offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0; bitmap->allclean = 0;
} }
spin_unlock_irqrestore(&bitmap->lock, flags); spin_unlock_irqrestore(&bitmap->counts.lock, flags);
offset += blocks; offset += blocks;
if (sectors > blocks) if (sectors > blocks)
sectors -= blocks; sectors -= blocks;
...@@ -1412,8 +1395,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t ...@@ -1412,8 +1395,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
*blocks = 1024; *blocks = 1024;
return 1; /* always resync if no bitmap */ return 1; /* always resync if no bitmap */
} }
spin_lock_irq(&bitmap->lock); spin_lock_irq(&bitmap->counts.lock);
bmc = bitmap_get_counter(bitmap, offset, blocks, 0); bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
rv = 0; rv = 0;
if (bmc) { if (bmc) {
/* locked */ /* locked */
...@@ -1427,7 +1410,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t ...@@ -1427,7 +1410,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
} }
} }
} }
spin_unlock_irq(&bitmap->lock); spin_unlock_irq(&bitmap->counts.lock);
return rv; return rv;
} }
...@@ -1464,8 +1447,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i ...@@ -1464,8 +1447,8 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
*blocks = 1024; *blocks = 1024;
return; return;
} }
spin_lock_irqsave(&bitmap->lock, flags); spin_lock_irqsave(&bitmap->counts.lock, flags);
bmc = bitmap_get_counter(bitmap, offset, blocks, 0); bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
if (bmc == NULL) if (bmc == NULL)
goto unlock; goto unlock;
/* locked */ /* locked */
...@@ -1476,15 +1459,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i ...@@ -1476,15 +1459,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
*bmc |= NEEDED_MASK; *bmc |= NEEDED_MASK;
else { else {
if (*bmc <= 2) { if (*bmc <= 2) {
set_page_attr(bitmap, bitmap_set_pending(&bitmap->counts, offset);
filemap_get_page(bitmap, offset >> bitmap->chunkshift),
BITMAP_PAGE_PENDING);
bitmap->allclean = 0; bitmap->allclean = 0;
} }
} }
} }
unlock: unlock:
spin_unlock_irqrestore(&bitmap->lock, flags); spin_unlock_irqrestore(&bitmap->counts.lock, flags);
} }
EXPORT_SYMBOL(bitmap_end_sync); EXPORT_SYMBOL(bitmap_end_sync);
...@@ -1524,7 +1505,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) ...@@ -1524,7 +1505,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->mddev->curr_resync_completed = sector; bitmap->mddev->curr_resync_completed = sector;
set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
sector &= ~((1ULL << bitmap->chunkshift) - 1); sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
s = 0; s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) { while (s < sector && s < bitmap->mddev->resync_max_sectors) {
bitmap_end_sync(bitmap, s, &blocks, 0); bitmap_end_sync(bitmap, s, &blocks, 0);
...@@ -1538,27 +1519,25 @@ EXPORT_SYMBOL(bitmap_cond_end_sync); ...@@ -1538,27 +1519,25 @@ EXPORT_SYMBOL(bitmap_cond_end_sync);
static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
{ {
/* For each chunk covered by any of these sectors, set the /* For each chunk covered by any of these sectors, set the
* counter to 1 and set resync_needed. They should all * counter to 2 and possibly set resync_needed. They should all
* be 0 at this point * be 0 at this point
*/ */
sector_t secs; sector_t secs;
bitmap_counter_t *bmc; bitmap_counter_t *bmc;
spin_lock_irq(&bitmap->lock); spin_lock_irq(&bitmap->counts.lock);
bmc = bitmap_get_counter(bitmap, offset, &secs, 1); bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
if (!bmc) { if (!bmc) {
spin_unlock_irq(&bitmap->lock); spin_unlock_irq(&bitmap->counts.lock);
return; return;
} }
if (!*bmc) { if (!*bmc) {
struct page *page;
*bmc = 2 | (needed ? NEEDED_MASK : 0); *bmc = 2 | (needed ? NEEDED_MASK : 0);
bitmap_count_page(bitmap, offset, 1); bitmap_count_page(&bitmap->counts, offset, 1);
page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); bitmap_set_pending(&bitmap->counts, offset);
set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
bitmap->allclean = 0; bitmap->allclean = 0;
} }
spin_unlock_irq(&bitmap->lock); spin_unlock_irq(&bitmap->counts.lock);
} }
/* dirty the memory and file bits for bitmap chunks "s" to "e" */ /* dirty the memory and file bits for bitmap chunks "s" to "e" */
...@@ -1567,11 +1546,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) ...@@ -1567,11 +1546,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
unsigned long chunk; unsigned long chunk;
for (chunk = s; chunk <= e; chunk++) { for (chunk = s; chunk <= e; chunk++) {
sector_t sec = (sector_t)chunk << bitmap->chunkshift; sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
bitmap_set_memory_bits(bitmap, sec, 1); bitmap_set_memory_bits(bitmap, sec, 1);
spin_lock_irq(&bitmap->lock);
bitmap_file_set_bit(bitmap, sec); bitmap_file_set_bit(bitmap, sec);
spin_unlock_irq(&bitmap->lock);
if (sec < bitmap->mddev->recovery_cp) if (sec < bitmap->mddev->recovery_cp)
/* We are asserting that the array is dirty, /* We are asserting that the array is dirty,
* so move the recovery_cp address back so * so move the recovery_cp address back so
...@@ -1616,11 +1593,15 @@ static void bitmap_free(struct bitmap *bitmap) ...@@ -1616,11 +1593,15 @@ static void bitmap_free(struct bitmap *bitmap)
if (!bitmap) /* there was no bitmap */ if (!bitmap) /* there was no bitmap */
return; return;
/* release the bitmap file and kill the daemon */ /* Shouldn't be needed - but just in case.... */
bitmap_file_put(bitmap); wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes) == 0);
/* release the bitmap file */
bitmap_file_unmap(&bitmap->storage);
bp = bitmap->bp; bp = bitmap->counts.bp;
pages = bitmap->pages; pages = bitmap->counts.pages;
/* free all allocated memory */ /* free all allocated memory */
...@@ -1659,25 +1640,19 @@ int bitmap_create(struct mddev *mddev) ...@@ -1659,25 +1640,19 @@ int bitmap_create(struct mddev *mddev)
{ {
struct bitmap *bitmap; struct bitmap *bitmap;
sector_t blocks = mddev->resync_max_sectors; sector_t blocks = mddev->resync_max_sectors;
unsigned long chunks;
unsigned long pages;
struct file *file = mddev->bitmap_info.file; struct file *file = mddev->bitmap_info.file;
int err; int err;
struct sysfs_dirent *bm = NULL; struct sysfs_dirent *bm = NULL;
BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); BUILD_BUG_ON(sizeof(bitmap_super_t) != 256);
if (!file
&& !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */
return 0;
BUG_ON(file && mddev->bitmap_info.offset); BUG_ON(file && mddev->bitmap_info.offset);
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
if (!bitmap) if (!bitmap)
return -ENOMEM; return -ENOMEM;
spin_lock_init(&bitmap->lock); spin_lock_init(&bitmap->counts.lock);
atomic_set(&bitmap->pending_writes, 0); atomic_set(&bitmap->pending_writes, 0);
init_waitqueue_head(&bitmap->write_wait); init_waitqueue_head(&bitmap->write_wait);
init_waitqueue_head(&bitmap->overflow_wait); init_waitqueue_head(&bitmap->overflow_wait);
...@@ -1693,7 +1668,7 @@ int bitmap_create(struct mddev *mddev) ...@@ -1693,7 +1668,7 @@ int bitmap_create(struct mddev *mddev)
} else } else
bitmap->sysfs_can_clear = NULL; bitmap->sysfs_can_clear = NULL;
bitmap->file = file; bitmap->storage.file = file;
if (file) { if (file) {
get_file(file); get_file(file);
/* As future accesses to this file will use bmap, /* As future accesses to this file will use bmap,
...@@ -1724,32 +1699,15 @@ int bitmap_create(struct mddev *mddev) ...@@ -1724,32 +1699,15 @@ int bitmap_create(struct mddev *mddev)
goto error; goto error;
bitmap->daemon_lastrun = jiffies; bitmap->daemon_lastrun = jiffies;
bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
- BITMAP_BLOCK_SHIFT); if (err)
chunks = (blocks + (1 << bitmap->chunkshift) - 1) >>
bitmap->chunkshift;
pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
BUG_ON(!pages);
bitmap->chunks = chunks;
bitmap->pages = pages;
bitmap->missing_pages = pages;
bitmap->bp = kzalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
err = -ENOMEM;
if (!bitmap->bp)
goto error; goto error;
printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
pages, bmname(bitmap)); bitmap->counts.pages, bmname(bitmap));
mddev->bitmap = bitmap; mddev->bitmap = bitmap;
return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
return (bitmap->flags & BITMAP_WRITE_ERROR) ? -EIO : 0;
error: error:
bitmap_free(bitmap); bitmap_free(bitmap);
...@@ -1790,13 +1748,17 @@ int bitmap_load(struct mddev *mddev) ...@@ -1790,13 +1748,17 @@ int bitmap_load(struct mddev *mddev)
if (err) if (err)
goto out; goto out;
clear_bit(BITMAP_STALE, &bitmap->flags);
/* Kick recovery in case any bits were set */
set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; mddev->thread->timeout = mddev->bitmap_info.daemon_sleep;
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
bitmap_update_sb(bitmap); bitmap_update_sb(bitmap);
if (bitmap->flags & BITMAP_WRITE_ERROR) if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
err = -EIO; err = -EIO;
out: out:
return err; return err;
...@@ -1806,30 +1768,194 @@ EXPORT_SYMBOL_GPL(bitmap_load); ...@@ -1806,30 +1768,194 @@ EXPORT_SYMBOL_GPL(bitmap_load);
void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
{ {
unsigned long chunk_kb; unsigned long chunk_kb;
unsigned long flags; struct bitmap_counts *counts;
if (!bitmap) if (!bitmap)
return; return;
spin_lock_irqsave(&bitmap->lock, flags); counts = &bitmap->counts;
chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
"%lu%s chunk", "%lu%s chunk",
bitmap->pages - bitmap->missing_pages, counts->pages - counts->missing_pages,
bitmap->pages, counts->pages,
(bitmap->pages - bitmap->missing_pages) (counts->pages - counts->missing_pages)
<< (PAGE_SHIFT - 10), << (PAGE_SHIFT - 10),
chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
chunk_kb ? "KB" : "B"); chunk_kb ? "KB" : "B");
if (bitmap->file) { if (bitmap->storage.file) {
seq_printf(seq, ", file: "); seq_printf(seq, ", file: ");
seq_path(seq, &bitmap->file->f_path, " \t\n"); seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
} }
seq_printf(seq, "\n"); seq_printf(seq, "\n");
spin_unlock_irqrestore(&bitmap->lock, flags);
} }
/*
 * bitmap_resize - build new bitmap storage/counters and switch @bitmap to them
 * @bitmap:    the bitmap to set up or resize
 * @blocks:    size of the area the bitmap must cover (bitmap_create() passes
 *             mddev->resync_max_sectors)
 * @chunksize: chunk size to use, or 0 to have a suitable one chosen here
 * @init:      non-zero when setting up the initial bitmap; zero when resizing
 *             a live array, in which case the personality is quiesced around
 *             the switch-over
 *
 * Returns 0 on success, the error from bitmap_storage_alloc(), or -ENOMEM
 * if the new counter page array cannot be allocated.  On failure the
 * existing bitmap is left untouched.
 */
int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
		  int chunksize, int init)
{
	/* If chunksize is 0, choose an appropriate chunk size.
	 * Then possibly allocate new storage space.
	 * Then quiesce, copy bits, replace bitmap, and re-start
	 *
	 * This function is called both to set up the initial bitmap
	 * and to resize the bitmap while the array is active.
	 * If this happens as a result of the array being resized,
	 * chunksize will be zero, and we need to choose a suitable
	 * chunksize, otherwise we use what we are given.
	 */
	struct bitmap_storage store;
	struct bitmap_counts old_counts;
	unsigned long chunks;
	unsigned long k;
	sector_t block;
	sector_t old_blocks, new_blocks;
	int chunkshift;
	int ret = 0;
	long pages;
	struct bitmap_page *new_bp;

	if (chunksize == 0) {
		/* If there is enough space, leave the chunk size unchanged,
		 * else increase by factor of two until there is enough space.
		 */
		long bytes;
		long space = bitmap->mddev->bitmap_info.space;

		if (space == 0) {
			/* We don't know how much space there is, so limit
			 * to current size - in sectors.
			 */
			bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8);
			if (!bitmap->mddev->bitmap_info.external)
				bytes += sizeof(bitmap_super_t);
			space = DIV_ROUND_UP(bytes, 512);
			/* remember the limit so later resizes use it too */
			bitmap->mddev->bitmap_info.space = space;
		}
		/* pre-decrement so the first pass tries the current shift */
		chunkshift = bitmap->counts.chunkshift;
		chunkshift--;
		do {
			/* 'chunkshift' is shift from block size to chunk size */
			chunkshift++;
			chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);
			bytes = DIV_ROUND_UP(chunks, 8);
			if (!bitmap->mddev->bitmap_info.external)
				bytes += sizeof(bitmap_super_t);
		} while (bytes > (space << 9));
	} else
		chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;

	chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift);

	/* New storage is only needed when the bitmap is kept on disk
	 * (at an offset on the devices or in a file).
	 */
	memset(&store, 0, sizeof(store));
	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
		ret = bitmap_storage_alloc(&store, chunks,
					   !bitmap->mddev->bitmap_info.external);
	if (ret) {
		/* Release whatever was set up before the failure, exactly
		 * as the allocation failure path below does.
		 */
		bitmap_file_unmap(&store);
		goto err;
	}

	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);

	new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL);
	ret = -ENOMEM;
	if (!new_bp) {
		bitmap_file_unmap(&store);
		goto err;
	}

	/* Nothing can fail from here on; stop IO before switching over. */
	if (!init)
		bitmap->mddev->pers->quiesce(bitmap->mddev, 1);

	/* Hand the backing file over to the new storage so that unmapping
	 * the old storage below does not see it.
	 */
	store.file = bitmap->storage.file;
	bitmap->storage.file = NULL;

	/* carry the existing superblock contents over to the new page */
	if (store.sb_page && bitmap->storage.sb_page)
		memcpy(page_address(store.sb_page),
		       page_address(bitmap->storage.sb_page),
		       sizeof(bitmap_super_t));
	bitmap_file_unmap(&bitmap->storage);
	bitmap->storage = store;

	/* Keep a copy of the old counters to read from while the new,
	 * empty, counter set is installed.
	 */
	old_counts = bitmap->counts;
	bitmap->counts.bp = new_bp;
	bitmap->counts.pages = pages;
	bitmap->counts.missing_pages = pages;
	bitmap->counts.chunkshift = chunkshift;
	bitmap->counts.chunks = chunks;
	bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
						     BITMAP_BLOCK_SHIFT);

	/* Only the range covered by both old and new bitmaps is copied. */
	blocks = min(old_counts.chunks << old_counts.chunkshift,
		     chunks << chunkshift);

	spin_lock_irq(&bitmap->counts.lock);
	for (block = 0; block < blocks; ) {
		bitmap_counter_t *bmc_old, *bmc_new;
		int set;

		bmc_old = bitmap_get_counter(&old_counts, block,
					     &old_blocks, 0);
		set = bmc_old && NEEDED(*bmc_old);

		if (set) {
			bmc_new = bitmap_get_counter(&bitmap->counts, block,
						     &new_blocks, 1);
			/* The lookup can fail; skip the counter rather than
			 * dereference NULL, as the tail loop below does.
			 */
			if (bmc_new) {
				if (*bmc_new == 0) {
					/* need to set on-disk bits too. */
					sector_t end = block + new_blocks;
					sector_t start = block >> chunkshift;

					start <<= chunkshift;
					/* NOTE(review): the loop advances
					 * 'start' but passes 'block' to
					 * bitmap_file_set_bit(); looks like
					 * 'start' was intended, but 'end' may
					 * need clamping first - confirm.
					 */
					while (start < end) {
						bitmap_file_set_bit(bitmap, block);
						start += 1 << chunkshift;
					}
					*bmc_new = 2;
					bitmap_count_page(&bitmap->counts,
							  block, 1);
					bitmap_set_pending(&bitmap->counts,
							   block);
				}
				*bmc_new |= NEEDED_MASK;
			}
			/* Advance by the smaller step.  Like the tail loop
			 * below, this assumes new_blocks is filled in even
			 * when no counter is returned.
			 */
			if (new_blocks < old_blocks)
				old_blocks = new_blocks;
		}
		block += old_blocks;
	}

	if (!init) {
		int i;

		while (block < (chunks << chunkshift)) {
			bitmap_counter_t *bmc;

			bmc = bitmap_get_counter(&bitmap->counts, block,
						 &new_blocks, 1);
			if (bmc) {
				/* new space.  It needs to be resynced, so
				 * we set NEEDED_MASK.
				 */
				if (*bmc == 0) {
					*bmc = NEEDED_MASK | 2;
					bitmap_count_page(&bitmap->counts,
							  block, 1);
					bitmap_set_pending(&bitmap->counts,
							   block);
				}
			}
			block += new_blocks;
		}
		/* mark every page of the new storage dirty */
		for (i = 0; i < bitmap->storage.file_pages; i++)
			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
	}
	spin_unlock_irq(&bitmap->counts.lock);

	/* The old counters have been folded into the new set and nothing
	 * refers to them any more, so free them instead of leaking them.
	 * A hijacked entry has no separately allocated counter page.
	 */
	if (old_counts.bp) {
		for (k = 0; k < old_counts.pages; k++)
			if (!old_counts.bp[k].hijacked)
				kfree(old_counts.bp[k].map);
		kfree(old_counts.bp);
	}

	if (!init) {
		bitmap_unplug(bitmap);
		bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
	}
	ret = 0;
err:
	return ret;
}
EXPORT_SYMBOL_GPL(bitmap_resize);
static ssize_t static ssize_t
location_show(struct mddev *mddev, char *page) location_show(struct mddev *mddev, char *page)
{ {
...@@ -1923,6 +2049,43 @@ location_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -1923,6 +2049,43 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
static struct md_sysfs_entry bitmap_location = static struct md_sysfs_entry bitmap_location =
__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); __ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store);
/* 'bitmap/space' is the space available at 'location' for the
* bitmap. This allows the kernel to know when it is safe to
* resize the bitmap to match a resized array.
*/
/* sysfs 'show' handler: print mddev->bitmap_info.space followed by a newline
 * into @page and return the number of characters written.
 */
static ssize_t
space_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%lu\n", mddev->bitmap_info.space);
}
/* sysfs 'store' handler: parse a decimal sector count from @buf and record
 * it as the space available for the bitmap.
 *
 * Returns @len on success, the kstrtoul() error for unparsable input,
 * -EINVAL for zero, or -EFBIG when an existing bitmap would not fit.
 */
static ssize_t
space_store(struct mddev *mddev, const char *buf, size_t len)
{
	unsigned long nr_sectors;
	int err = kstrtoul(buf, 10, &nr_sectors);

	if (err)
		return err;

	if (!nr_sectors)
		return -EINVAL;

	if (mddev->bitmap) {
		/* sectors occupied by the current bitmap, rounded up */
		unsigned long needed =
			(mddev->bitmap->storage.bytes + 511) >> 9;

		if (nr_sectors < needed)
			return -EFBIG; /* Bitmap is too big for this small space */
	}

	/* No upper bound is enforced here - user-space is expected to
	 * supply a sensible value.
	 */
	mddev->bitmap_info.space = nr_sectors;
	return len;
}
/* sysfs attribute "space": readable by everyone, writable by the owner;
 * reads go through space_show(), writes through space_store().
 */
static struct md_sysfs_entry bitmap_space =
__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store);
static ssize_t static ssize_t
timeout_show(struct mddev *mddev, char *page) timeout_show(struct mddev *mddev, char *page)
{ {
...@@ -2098,6 +2261,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, ...@@ -2098,6 +2261,7 @@ __ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
static struct attribute *md_bitmap_attrs[] = { static struct attribute *md_bitmap_attrs[] = {
&bitmap_location.attr, &bitmap_location.attr,
&bitmap_space.attr,
&bitmap_timeout.attr, &bitmap_timeout.attr,
&bitmap_backlog.attr, &bitmap_backlog.attr,
&bitmap_chunksize.attr, &bitmap_chunksize.attr,
......
...@@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t; ...@@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t;
/* use these for bitmap->flags and bitmap->sb->state bit-fields */ /* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state { enum bitmap_state {
BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */ BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
BITMAP_HOSTENDIAN = 0x8000, BITMAP_HOSTENDIAN =15,
}; };
/* the superblock at the front of the bitmap file -- little endian */ /* the superblock at the front of the bitmap file -- little endian */
...@@ -128,8 +128,10 @@ typedef struct bitmap_super_s { ...@@ -128,8 +128,10 @@ typedef struct bitmap_super_s {
__le32 chunksize; /* 52 the bitmap chunk size in bytes */ __le32 chunksize; /* 52 the bitmap chunk size in bytes */
__le32 daemon_sleep; /* 56 seconds between disk flushes */ __le32 daemon_sleep; /* 56 seconds between disk flushes */
__le32 write_behind; /* 60 number of outstanding write-behind writes */ __le32 write_behind; /* 60 number of outstanding write-behind writes */
__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
* reserved for the bitmap. */
__u8 pad[256 - 64]; /* set to zero */ __u8 pad[256 - 68]; /* set to zero */
} bitmap_super_t; } bitmap_super_t;
/* notes: /* notes:
...@@ -159,36 +161,49 @@ struct bitmap_page { ...@@ -159,36 +161,49 @@ struct bitmap_page {
* pointer and use it as two counters itself * pointer and use it as two counters itself
*/ */
unsigned int hijacked:1; unsigned int hijacked:1;
/*
* If any counter in this page is '1' or '2' - and so could be
* cleared then that page is marked as 'pending'
*/
unsigned int pending:1;
/* /*
* count of dirty bits on the page * count of dirty bits on the page
*/ */
unsigned int count:31; unsigned int count:30;
}; };
/* the main bitmap structure - one per mddev */ /* the main bitmap structure - one per mddev */
struct bitmap { struct bitmap {
struct bitmap_page *bp;
unsigned long pages; /* total number of pages in the bitmap */
unsigned long missing_pages; /* number of pages not yet allocated */
struct mddev *mddev; /* the md device that the bitmap is for */ struct bitmap_counts {
spinlock_t lock;
struct bitmap_page *bp;
unsigned long pages; /* total number of pages
* in the bitmap */
unsigned long missing_pages; /* number of pages
* not yet allocated */
unsigned long chunkshift; /* chunksize = 2^chunkshift
* (for bitops) */
unsigned long chunks; /* Total number of data
* chunks for the array */
} counts;
/* bitmap chunksize -- how much data does each bit represent? */ struct mddev *mddev; /* the md device that the bitmap is for */
unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
unsigned long chunks; /* total number of data chunks for the array */
__u64 events_cleared; __u64 events_cleared;
int need_sync; int need_sync;
/* bitmap spinlock */ struct bitmap_storage {
spinlock_t lock; struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap
struct file *file; /* backing disk file */ * file superblock */
struct page *sb_page; /* cached copy of the bitmap file superblock */ struct page **filemap; /* list of cache pages for
struct page **filemap; /* list of cache pages for the file */ * the file */
unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ unsigned long *filemap_attr; /* attributes associated
unsigned long file_pages; /* number of pages in the file */ * w/ filemap pages */
int last_page_size; /* bytes in the last page */ unsigned long file_pages; /* number of pages in the file*/
unsigned long bytes; /* total bytes in the bitmap */
} storage;
unsigned long flags; unsigned long flags;
...@@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); ...@@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
void bitmap_unplug(struct bitmap *bitmap); void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev); void bitmap_daemon_work(struct mddev *mddev);
int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
#endif #endif
#endif #endif
...@@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs) ...@@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs)
for (i = 0; i < rs->md.raid_disks; i++) { for (i = 0; i < rs->md.raid_disks; i++) {
if (rs->dev[i].meta_dev) if (rs->dev[i].meta_dev)
dm_put_device(rs->ti, rs->dev[i].meta_dev); dm_put_device(rs->ti, rs->dev[i].meta_dev);
if (rs->dev[i].rdev.sb_page) md_rdev_clear(&rs->dev[i].rdev);
put_page(rs->dev[i].rdev.sb_page);
rs->dev[i].rdev.sb_page = NULL;
rs->dev[i].rdev.sb_loaded = 0;
if (rs->dev[i].data_dev) if (rs->dev[i].data_dev)
dm_put_device(rs->ti, rs->dev[i].data_dev); dm_put_device(rs->ti, rs->dev[i].data_dev);
} }
...@@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size) ...@@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
DMERR("Failed to read superblock of device at position %d", DMERR("Failed to read superblock of device at position %d",
rdev->raid_disk); rdev->raid_disk);
set_bit(Faulty, &rdev->flags); md_error(rdev->mddev, rdev);
return -EINVAL; return -EINVAL;
} }
...@@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size) ...@@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
static void super_sync(struct mddev *mddev, struct md_rdev *rdev) static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct md_rdev *r; int i;
uint64_t failed_devices; uint64_t failed_devices;
struct dm_raid_superblock *sb; struct dm_raid_superblock *sb;
struct raid_set *rs = container_of(mddev, struct raid_set, md);
sb = page_address(rdev->sb_page); sb = page_address(rdev->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices); failed_devices = le64_to_cpu(sb->failed_devices);
rdev_for_each(r, mddev) for (i = 0; i < mddev->raid_disks; i++)
if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) if (!rs->dev[i].data_dev ||
failed_devices |= (1ULL << r->raid_disk); test_bit(Faulty, &(rs->dev[i].rdev.flags)))
failed_devices |= (1ULL << i);
memset(sb, 0, sizeof(*sb)); memset(sb, 0, sizeof(*sb));
...@@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti) ...@@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti)
{ {
struct raid_set *rs = ti->private; struct raid_set *rs = ti->private;
set_bit(MD_CHANGE_DEVS, &rs->md.flags);
if (!rs->bitmap_loaded) { if (!rs->bitmap_loaded) {
bitmap_load(&rs->md); bitmap_load(&rs->md);
rs->bitmap_loaded = 1; rs->bitmap_loaded = 1;
} else }
md_wakeup_thread(rs->md.thread);
clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
mddev_resume(&rs->md); mddev_resume(&rs->md);
} }
......
...@@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev) ...@@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev)
wake_up(&mddev->sb_wait); wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0); mddev->pers->quiesce(mddev, 0);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
} }
...@@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws) ...@@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws)
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
rcu_read_unlock(); rcu_read_unlock();
bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
bi->bi_end_io = md_end_flush; bi->bi_end_io = md_end_flush;
bi->bi_private = rdev; bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev; bi->bi_bdev = rdev->bdev;
...@@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev) ...@@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev)
init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait); init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
mddev->reshape_backwards = 0;
mddev->resync_min = 0; mddev->resync_min = 0;
mddev->resync_max = MaxSector; mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE; mddev->level = LEVEL_NONE;
...@@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev) ...@@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev)
return 0; return 0;
} }
static void free_disk_sb(struct md_rdev * rdev) void md_rdev_clear(struct md_rdev *rdev)
{ {
if (rdev->sb_page) { if (rdev->sb_page) {
put_page(rdev->sb_page); put_page(rdev->sb_page);
...@@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev) ...@@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev)
put_page(rdev->bb_page); put_page(rdev->bb_page);
rdev->bb_page = NULL; rdev->bb_page = NULL;
} }
kfree(rdev->badblocks.page);
rdev->badblocks.page = NULL;
} }
EXPORT_SYMBOL_GPL(md_rdev_clear);
static void super_written(struct bio *bio, int error) static void super_written(struct bio *bio, int error)
{ {
...@@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, ...@@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
rdev->meta_bdev : rdev->bdev; rdev->meta_bdev : rdev->bdev;
if (metadata_op) if (metadata_op)
bio->bi_sector = sector + rdev->sb_start; bio->bi_sector = sector + rdev->sb_start;
else if (rdev->mddev->reshape_position != MaxSector &&
(rdev->mddev->reshape_backwards ==
(sector >= rdev->mddev->reshape_position)))
bio->bi_sector = sector + rdev->new_data_offset;
else else
bio->bi_sector = sector + rdev->data_offset; bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0); bio_add_page(bio, page, size, 0);
...@@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) ...@@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
struct super_type { struct super_type {
char *name; char *name;
struct module *owner; struct module *owner;
int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev, int (*load_super)(struct md_rdev *rdev,
struct md_rdev *refdev,
int minor_version); int minor_version);
int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev); int (*validate_super)(struct mddev *mddev,
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_rdev *rdev);
void (*sync_super)(struct mddev *mddev,
struct md_rdev *rdev);
unsigned long long (*rdev_size_change)(struct md_rdev *rdev, unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
sector_t num_sectors); sector_t num_sectors);
int (*allow_new_offset)(struct md_rdev *rdev,
unsigned long long new_offset);
}; };
/* /*
...@@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor ...@@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
rdev->preferred_minor = sb->md_minor; rdev->preferred_minor = sb->md_minor;
rdev->data_offset = 0; rdev->data_offset = 0;
rdev->new_data_offset = 0;
rdev->sb_size = MD_SB_BYTES; rdev->sb_size = MD_SB_BYTES;
rdev->badblocks.shift = -1; rdev->badblocks.shift = -1;
...@@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->dev_sectors = ((sector_t)sb->size) * 2; mddev->dev_sectors = ((sector_t)sb->size) * 2;
mddev->events = ev1; mddev->events = ev1;
mddev->bitmap_info.offset = 0; mddev->bitmap_info.offset = 0;
mddev->bitmap_info.space = 0;
/* bitmap can use 60 K after the 4K superblocks */
mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
mddev->reshape_backwards = 0;
if (mddev->minor_version >= 91) { if (mddev->minor_version >= 91) {
mddev->reshape_position = sb->reshape_position; mddev->reshape_position = sb->reshape_position;
...@@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_level = sb->new_level; mddev->new_level = sb->new_level;
mddev->new_layout = sb->new_layout; mddev->new_layout = sb->new_layout;
mddev->new_chunk_sectors = sb->new_chunk >> 9; mddev->new_chunk_sectors = sb->new_chunk >> 9;
if (mddev->delta_disks < 0)
mddev->reshape_backwards = 1;
} else { } else {
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
mddev->delta_disks = 0; mddev->delta_disks = 0;
...@@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->max_disks = MD_SB_DISKS; mddev->max_disks = MD_SB_DISKS;
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
mddev->bitmap_info.file == NULL) mddev->bitmap_info.file == NULL) {
mddev->bitmap_info.offset = mddev->bitmap_info.offset =
mddev->bitmap_info.default_offset; mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.space;
}
} else if (mddev->pers == NULL) { } else if (mddev->pers == NULL) {
/* Insist on good event counter while assembling, except /* Insist on good event counter while assembling, except
...@@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) ...@@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
return num_sectors; return num_sectors;
} }
/* Decide whether @new_offset may be used as the new data offset for @rdev
 * with v0.90 metadata.  Returns 1 if allowed, 0 otherwise.
 */
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* v0.90 cannot change to a non-zero offset, so only 0 is accepted */
	if (new_offset != 0)
		return 0;
	return 1;
}
/* /*
* version 1 superblock * version 1 superblock
...@@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ...@@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
struct mdp_superblock_1 *sb; struct mdp_superblock_1 *sb;
int ret; int ret;
sector_t sb_start; sector_t sb_start;
sector_t sectors;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
int bmask; int bmask;
...@@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ...@@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
bdevname(rdev->bdev,b)); bdevname(rdev->bdev,b));
return -EINVAL; return -EINVAL;
} }
if (sb->pad0 ||
sb->pad3[0] ||
memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
/* Some padding is non-zero, might be a new feature */
return -EINVAL;
rdev->preferred_minor = 0xffff; rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset); rdev->data_offset = le64_to_cpu(sb->data_offset);
rdev->new_data_offset = rdev->data_offset;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
(le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
...@@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ...@@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
if (minor_version if (minor_version
&& rdev->data_offset < sb_start + (rdev->sb_size/512)) && rdev->data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL; return -EINVAL;
if (minor_version
&& rdev->new_data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL;
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
rdev->desc_nr = -1; rdev->desc_nr = -1;
...@@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ...@@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
else else
ret = 0; ret = 0;
} }
if (minor_version) if (minor_version) {
rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
le64_to_cpu(sb->data_offset); sectors -= rdev->data_offset;
else } else
rdev->sectors = rdev->sb_start; sectors = rdev->sb_start;
if (rdev->sectors < le64_to_cpu(sb->data_size)) if (sectors < le64_to_cpu(sb->data_size))
return -EINVAL; return -EINVAL;
rdev->sectors = le64_to_cpu(sb->data_size); rdev->sectors = le64_to_cpu(sb->data_size);
if (le64_to_cpu(sb->size) > rdev->sectors)
return -EINVAL;
return ret; return ret;
} }
...@@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->dev_sectors = le64_to_cpu(sb->size); mddev->dev_sectors = le64_to_cpu(sb->size);
mddev->events = ev1; mddev->events = ev1;
mddev->bitmap_info.offset = 0; mddev->bitmap_info.offset = 0;
mddev->bitmap_info.space = 0;
/* Default location for bitmap is 1K after superblock
* using 3K - total of 4K
*/
mddev->bitmap_info.default_offset = 1024 >> 9; mddev->bitmap_info.default_offset = 1024 >> 9;
mddev->bitmap_info.default_space = (4096-1024) >> 9;
mddev->reshape_backwards = 0;
mddev->recovery_cp = le64_to_cpu(sb->resync_offset); mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
memcpy(mddev->uuid, sb->set_uuid, 16); memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2; mddev->max_disks = (4096-256)/2;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
mddev->bitmap_info.file == NULL ) mddev->bitmap_info.file == NULL) {
mddev->bitmap_info.offset = mddev->bitmap_info.offset =
(__s32)le32_to_cpu(sb->bitmap_offset); (__s32)le32_to_cpu(sb->bitmap_offset);
/* Metadata doesn't record how much space is available.
* For 1.0, we assume we can use up to the superblock
* if before, else to 4K beyond superblock.
* For others, assume no change is possible.
*/
if (mddev->minor_version > 0)
mddev->bitmap_info.space = 0;
else if (mddev->bitmap_info.offset > 0)
mddev->bitmap_info.space =
8 - mddev->bitmap_info.offset;
else
mddev->bitmap_info.space =
-mddev->bitmap_info.offset;
}
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
mddev->reshape_position = le64_to_cpu(sb->reshape_position); mddev->reshape_position = le64_to_cpu(sb->reshape_position);
...@@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_level = le32_to_cpu(sb->new_level);
mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_layout = le32_to_cpu(sb->new_layout);
mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
if (mddev->delta_disks < 0 ||
(mddev->delta_disks == 0 &&
(le32_to_cpu(sb->feature_map)
& MD_FEATURE_RESHAPE_BACKWARDS)))
mddev->reshape_backwards = 1;
} else { } else {
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
mddev->delta_disks = 0; mddev->delta_disks = 0;
...@@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = 0; sb->feature_map = 0;
sb->pad0 = 0; sb->pad0 = 0;
sb->recovery_offset = cpu_to_le64(0); sb->recovery_offset = cpu_to_le64(0);
memset(sb->pad1, 0, sizeof(sb->pad1));
memset(sb->pad3, 0, sizeof(sb->pad3)); memset(sb->pad3, 0, sizeof(sb->pad3));
sb->utime = cpu_to_le64((__u64)mddev->utime); sb->utime = cpu_to_le64((__u64)mddev->utime);
...@@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->devflags |= WriteMostly1; sb->devflags |= WriteMostly1;
else else
sb->devflags &= ~WriteMostly1; sb->devflags &= ~WriteMostly1;
sb->data_offset = cpu_to_le64(rdev->data_offset);
sb->data_size = cpu_to_le64(rdev->sectors);
if (mddev->bitmap && mddev->bitmap_info.file == NULL) { if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
...@@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->delta_disks = cpu_to_le32(mddev->delta_disks); sb->delta_disks = cpu_to_le32(mddev->delta_disks);
sb->new_level = cpu_to_le32(mddev->new_level); sb->new_level = cpu_to_le32(mddev->new_level);
sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
if (mddev->delta_disks == 0 &&
mddev->reshape_backwards)
sb->feature_map
|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
if (rdev->new_data_offset != rdev->data_offset) {
sb->feature_map
|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
- rdev->data_offset));
}
} }
if (rdev->badblocks.count == 0) if (rdev->badblocks.count == 0)
...@@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) ...@@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
sector_t max_sectors; sector_t max_sectors;
if (num_sectors && num_sectors < rdev->mddev->dev_sectors) if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
return 0; /* component must fit device */ return 0; /* component must fit device */
if (rdev->data_offset != rdev->new_data_offset)
return 0; /* too confusing */
if (rdev->sb_start < rdev->data_offset) { if (rdev->sb_start < rdev->data_offset) {
/* minor versions 1 and 2; superblock before data */ /* minor versions 1 and 2; superblock before data */
max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
...@@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) ...@@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
rdev->sb_page); rdev->sb_page);
md_super_wait(rdev->mddev); md_super_wait(rdev->mddev);
return num_sectors; return num_sectors;
}
static int
super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
/* All necessary checks on new >= old have been done */
struct bitmap *bitmap;
if (new_offset >= rdev->data_offset)
return 1;
/* with 1.0 metadata, there is no metadata to tread on
* so we can always move back */
if (rdev->mddev->minor_version == 0)
return 1;
/* otherwise we must be sure not to step on
* any metadata, so stay:
* 36K beyond start of superblock
* beyond end of badblocks
* beyond write-intent bitmap
*/
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
bitmap = rdev->mddev->bitmap;
if (bitmap && !rdev->mddev->bitmap_info.file &&
rdev->sb_start + rdev->mddev->bitmap_info.offset +
bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
return 0;
if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
return 0;
return 1;
} }
static struct super_type super_types[] = { static struct super_type super_types[] = {
...@@ -1894,6 +2006,7 @@ static struct super_type super_types[] = { ...@@ -1894,6 +2006,7 @@ static struct super_type super_types[] = {
.validate_super = super_90_validate, .validate_super = super_90_validate,
.sync_super = super_90_sync, .sync_super = super_90_sync,
.rdev_size_change = super_90_rdev_size_change, .rdev_size_change = super_90_rdev_size_change,
.allow_new_offset = super_90_allow_new_offset,
}, },
[1] = { [1] = {
.name = "md-1", .name = "md-1",
...@@ -1902,6 +2015,7 @@ static struct super_type super_types[] = { ...@@ -1902,6 +2015,7 @@ static struct super_type super_types[] = {
.validate_super = super_1_validate, .validate_super = super_1_validate,
.sync_super = super_1_sync, .sync_super = super_1_sync,
.rdev_size_change = super_1_rdev_size_change, .rdev_size_change = super_1_rdev_size_change,
.allow_new_offset = super_1_allow_new_offset,
}, },
}; };
...@@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev) ...@@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
sysfs_remove_link(&rdev->kobj, "block"); sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_state);
rdev->sysfs_state = NULL; rdev->sysfs_state = NULL;
kfree(rdev->badblocks.page);
rdev->badblocks.count = 0; rdev->badblocks.count = 0;
rdev->badblocks.page = NULL;
/* We need to delay this, otherwise we can deadlock when /* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need * writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage. * to delay it due to rcu usage.
...@@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev) ...@@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev)
bdevname(rdev->bdev,b)); bdevname(rdev->bdev,b));
if (rdev->mddev) if (rdev->mddev)
MD_BUG(); MD_BUG();
free_disk_sb(rdev); md_rdev_clear(rdev);
#ifndef MODULE #ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags)) if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev); md_autodetect_dev(rdev->bdev->bd_dev);
...@@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page) ...@@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page)
static ssize_t static ssize_t
offset_store(struct md_rdev *rdev, const char *buf, size_t len) offset_store(struct md_rdev *rdev, const char *buf, size_t len)
{ {
char *e; unsigned long long offset;
unsigned long long offset = simple_strtoull(buf, &e, 10); if (strict_strtoull(buf, 10, &offset) < 0)
if (e==buf || (*e && *e != '\n'))
return -EINVAL; return -EINVAL;
if (rdev->mddev->pers && rdev->raid_disk >= 0) if (rdev->mddev->pers && rdev->raid_disk >= 0)
return -EBUSY; return -EBUSY;
...@@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
static struct rdev_sysfs_entry rdev_offset = static struct rdev_sysfs_entry rdev_offset =
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
/*
 * sysfs 'show' handler for the per-device new_offset attribute:
 * prints rdev->new_data_offset as an unsigned decimal followed by
 * a newline and returns the number of characters written.
 */
static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
{
	unsigned long long pending_offset = rdev->new_data_offset;

	return sprintf(page, "%llu\n", pending_offset);
}
/*
 * sysfs 'store' handler for the per-device new_offset attribute.
 *
 * Parses a decimal sector offset from @buf and, if acceptable, records
 * it in rdev->new_data_offset, adjusting mddev->reshape_backwards to
 * match the direction of the move.
 *
 * Returns @len on success, or:
 *   -EINVAL  the input is not a valid number, or the direction of the
 *            move disagrees with the current reshape direction;
 *   -EBUSY   a sync thread is present;
 *   -E2BIG   the move would not leave room for the array data, or the
 *            metadata handler refuses the new offset.
 */
static ssize_t new_offset_store(struct md_rdev *rdev,
				const char *buf, size_t len)
{
	struct mddev *mddev = rdev->mddev;
	unsigned long long new_offset;
	int moving_up, moving_down;

	if (strict_strtoull(buf, 10, &new_offset) < 0)
		return -EINVAL;

	if (mddev->sync_thread)
		return -EBUSY;

	/* Neither flag set means new == old: a reset, always permitted. */
	moving_up = new_offset > rdev->data_offset;
	moving_down = new_offset < rdev->data_offset;

	/* Must not push the array size beyond rdev->sectors.
	 * The metadata handler worries about other space details.
	 */
	if (moving_up &&
	    new_offset - rdev->data_offset
	    + mddev->dev_sectors > rdev->sectors)
		return -E2BIG;

	/* Decreasing the offset is inconsistent with a backwards
	 * reshape.
	 */
	if (moving_down && mddev->reshape_backwards)
		return -EINVAL;

	/* Increasing the offset is inconsistent with a forwards
	 * reshape; reshape_direction should be set to 'backwards'
	 * first.
	 */
	if (moving_up && !mddev->reshape_backwards)
		return -EINVAL;

	if (mddev->pers && mddev->persistent &&
	    !super_types[mddev->major_version]
	    .allow_new_offset(rdev, new_offset))
		return -E2BIG;

	rdev->new_data_offset = new_offset;

	/* Keep the recorded reshape direction in step with the move. */
	if (new_offset > rdev->data_offset)
		mddev->reshape_backwards = 1;
	else if (new_offset < rdev->data_offset)
		mddev->reshape_backwards = 0;

	return len;
}
/* Per-device sysfs attribute "new_offset": readable by everyone
 * (S_IRUGO), writable by the owner (S_IWUSR); backed by
 * new_offset_show() and new_offset_store().
 */
static struct rdev_sysfs_entry rdev_new_offset =
__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
static ssize_t static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page) rdev_size_show(struct md_rdev *rdev, char *page)
{ {
...@@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
if (strict_blocks_to_sectors(buf, &sectors) < 0) if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL; return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset)
return -EINVAL; /* too confusing */
if (my_mddev->pers && rdev->raid_disk >= 0) { if (my_mddev->pers && rdev->raid_disk >= 0) {
if (my_mddev->persistent) { if (my_mddev->persistent) {
sectors = super_types[my_mddev->major_version]. sectors = super_types[my_mddev->major_version].
...@@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = { ...@@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_errors.attr, &rdev_errors.attr,
&rdev_slot.attr, &rdev_slot.attr,
&rdev_offset.attr, &rdev_offset.attr,
&rdev_new_offset.attr,
&rdev_size.attr, &rdev_size.attr,
&rdev_recovery_start.attr, &rdev_recovery_start.attr,
&rdev_bad_blocks.attr, &rdev_bad_blocks.attr,
...@@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev) ...@@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev)
rdev->raid_disk = -1; rdev->raid_disk = -1;
rdev->flags = 0; rdev->flags = 0;
rdev->data_offset = 0; rdev->data_offset = 0;
rdev->new_data_offset = 0;
rdev->sb_events = 0; rdev->sb_events = 0;
rdev->last_read_error.tv_sec = 0; rdev->last_read_error.tv_sec = 0;
rdev->last_read_error.tv_nsec = 0; rdev->last_read_error.tv_nsec = 0;
...@@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe ...@@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
abort_free: abort_free:
if (rdev->bdev) if (rdev->bdev)
unlock_rdev(rdev); unlock_rdev(rdev);
free_disk_sb(rdev); md_rdev_clear(rdev);
kfree(rdev->badblocks.page);
kfree(rdev); kfree(rdev);
return ERR_PTR(err); return ERR_PTR(err);
} }
...@@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->raid_disks -= mddev->delta_disks; mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
module_put(pers->owner); module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s would not accept array\n", printk(KERN_WARNING "md: %s: %s would not accept array\n",
mdname(mddev), clevel); mdname(mddev), clevel);
...@@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->layout = mddev->new_layout; mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors; mddev->chunk_sectors = mddev->new_chunk_sectors;
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
mddev->degraded = 0; mddev->degraded = 0;
if (mddev->pers->sync_request == NULL) { if (mddev->pers->sync_request == NULL) {
/* this is now an array without redundancy, so /* this is now an array without redundancy, so
...@@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
del_timer_sync(&mddev->safemode_timer); del_timer_sync(&mddev->safemode_timer);
} }
pers->run(mddev); pers->run(mddev);
mddev_resume(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); mddev_resume(mddev);
md_wakeup_thread(mddev->thread);
sysfs_notify(&mddev->kobj, NULL, "level"); sysfs_notify(&mddev->kobj, NULL, "level");
md_new_event(mddev); md_new_event(mddev);
return rv; return rv;
...@@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) if (mddev->pers)
rv = update_raid_disks(mddev, n); rv = update_raid_disks(mddev, n);
else if (mddev->reshape_position != MaxSector) { else if (mddev->reshape_position != MaxSector) {
struct md_rdev *rdev;
int olddisks = mddev->raid_disks - mddev->delta_disks; int olddisks = mddev->raid_disks - mddev->delta_disks;
rdev_for_each(rdev, mddev) {
if (olddisks < n &&
rdev->data_offset < rdev->new_data_offset)
return -EINVAL;
if (olddisks > n &&
rdev->data_offset > rdev->new_data_offset)
return -EINVAL;
}
mddev->delta_disks = n - olddisks; mddev->delta_disks = n - olddisks;
mddev->raid_disks = n; mddev->raid_disks = n;
mddev->reshape_backwards = (mddev->delta_disks < 0);
} else } else
mddev->raid_disks = n; mddev->raid_disks = n;
return rv ? rv : len; return rv ? rv : len;
...@@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page) ...@@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page)
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n"); return sprintf(page, "none\n");
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors; max_sectors = mddev->resync_max_sectors;
else else
max_sectors = mddev->dev_sectors; max_sectors = mddev->dev_sectors;
...@@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page) ...@@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page)
static ssize_t static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len) reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{ {
struct md_rdev *rdev;
char *e; char *e;
unsigned long long new = simple_strtoull(buf, &e, 10); unsigned long long new = simple_strtoull(buf, &e, 10);
if (mddev->pers) if (mddev->pers)
...@@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
return -EINVAL; return -EINVAL;
mddev->reshape_position = new; mddev->reshape_position = new;
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
mddev->new_level = mddev->level; mddev->new_level = mddev->level;
mddev->new_layout = mddev->layout; mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_chunk_sectors = mddev->chunk_sectors;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
return len; return len;
} }
...@@ -4446,6 +4633,42 @@ static struct md_sysfs_entry md_reshape_position = ...@@ -4446,6 +4633,42 @@ static struct md_sysfs_entry md_reshape_position =
__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
reshape_position_store); reshape_position_store);
/*
 * sysfs 'show' handler for the array-level reshape_direction
 * attribute: prints "backwards" when mddev->reshape_backwards is
 * non-zero and "forwards" otherwise, followed by a newline.
 */
static ssize_t
reshape_direction_show(struct mddev *mddev, char *page)
{
	const char *direction;

	if (mddev->reshape_backwards)
		direction = "backwards";
	else
		direction = "forwards";

	return sprintf(page, "%s\n", direction);
}
/*
 * sysfs 'store' handler for the array-level reshape_direction
 * attribute.  Accepts "forwards" or "backwards" (as matched by
 * cmd_match()).
 *
 * Returns @len on success (including when the requested direction is
 * already in effect), or:
 *   -EINVAL  the input is neither keyword, or the array uses
 *            persistent metadata with major_version 0;
 *   -EBUSY   delta_disks is non-zero, so the direction may not change.
 */
static ssize_t
reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
{
	int want_backwards;

	if (cmd_match(buf, "forwards"))
		want_backwards = 0;
	else if (cmd_match(buf, "backwards"))
		want_backwards = 1;
	else
		return -EINVAL;

	if (mddev->reshape_backwards != want_backwards) {
		/* An actual change is requested: check it is allowed. */
		if (mddev->delta_disks)
			return -EBUSY;

		if (mddev->persistent &&
		    mddev->major_version == 0)
			return -EINVAL;

		mddev->reshape_backwards = want_backwards;
	}

	return len;
}
/* Array-level sysfs attribute "reshape_direction": readable by
 * everyone (S_IRUGO), writable by the owner (S_IWUSR); backed by
 * reshape_direction_show() and reshape_direction_store().
 */
static struct md_sysfs_entry md_reshape_direction =
__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
reshape_direction_store);
static ssize_t static ssize_t
array_size_show(struct mddev *mddev, char *page) array_size_show(struct mddev *mddev, char *page)
{ {
...@@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = { ...@@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = {
&md_safe_delay.attr, &md_safe_delay.attr,
&md_array_state.attr, &md_array_state.attr,
&md_reshape_position.attr, &md_reshape_position.attr,
&md_reshape_direction.attr,
&md_array_size.attr, &md_array_size.attr,
&max_corr_read_errors.attr, &max_corr_read_errors.attr,
NULL, NULL,
...@@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev) ...@@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev)
err = -EINVAL; err = -EINVAL;
mddev->pers->stop(mddev); mddev->pers->stop(mddev);
} }
if (err == 0 && mddev->pers->sync_request) { if (err == 0 && mddev->pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
err = bitmap_create(mddev); err = bitmap_create(mddev);
if (err) { if (err) {
printk(KERN_ERR "%s: failed to create bitmap (%d)\n", printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
...@@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev) ...@@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev)
mddev->events = 0; mddev->events = 0;
mddev->can_decrease_events = 0; mddev->can_decrease_events = 0;
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
mddev->new_level = LEVEL_NONE; mddev->new_level = LEVEL_NONE;
mddev->new_layout = 0; mddev->new_layout = 0;
mddev->new_chunk_sectors = 0; mddev->new_chunk_sectors = 0;
...@@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev) ...@@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev)
mddev->merge_check_needed = 0; mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0; mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_offset = 0;
mddev->bitmap_info.default_space = 0;
mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.chunksize = 0;
mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.daemon_sleep = 0;
mddev->bitmap_info.max_write_behind = 0; mddev->bitmap_info.max_write_behind = 0;
...@@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) ...@@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
goto out; goto out;
/* bitmap disabled, zero the first byte and copy out */ /* bitmap disabled, zero the first byte and copy out */
if (!mddev->bitmap || !mddev->bitmap->file) { if (!mddev->bitmap || !mddev->bitmap->storage.file) {
file->pathname[0] = '\0'; file->pathname[0] = '\0';
goto copy_out; goto copy_out;
} }
...@@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg) ...@@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
if (!buf) if (!buf)
goto out; goto out;
ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); ptr = d_path(&mddev->bitmap->storage.file->f_path,
buf, sizeof(file->pathname));
if (IS_ERR(ptr)) if (IS_ERR(ptr))
goto out; goto out;
...@@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) ...@@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
mddev->bitmap_info.offset = 0; mddev->bitmap_info.offset = 0;
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
...@@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) ...@@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
mddev->new_chunk_sectors = mddev->chunk_sectors; mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->new_layout = mddev->layout; mddev->new_layout = mddev->layout;
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
return 0; return 0;
} }
...@@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) ...@@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
*/ */
if (mddev->sync_thread) if (mddev->sync_thread)
return -EBUSY; return -EBUSY;
if (mddev->bitmap)
/* Sorry, cannot grow a bitmap yet, just remove it,
* grow, and re-add.
*/
return -EBUSY;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors; sector_t avail = rdev->sectors;
...@@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) ...@@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
static int update_raid_disks(struct mddev *mddev, int raid_disks) static int update_raid_disks(struct mddev *mddev, int raid_disks)
{ {
int rv; int rv;
struct md_rdev *rdev;
/* change the number of raid disks */ /* change the number of raid disks */
if (mddev->pers->check_reshape == NULL) if (mddev->pers->check_reshape == NULL)
return -EINVAL; return -EINVAL;
...@@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) ...@@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
return -EINVAL; return -EINVAL;
if (mddev->sync_thread || mddev->reshape_position != MaxSector) if (mddev->sync_thread || mddev->reshape_position != MaxSector)
return -EBUSY; return -EBUSY;
rdev_for_each(rdev, mddev) {
if (mddev->raid_disks < raid_disks &&
rdev->data_offset < rdev->new_data_offset)
return -EINVAL;
if (mddev->raid_disks > raid_disks &&
rdev->data_offset > rdev->new_data_offset)
return -EINVAL;
}
mddev->delta_disks = raid_disks - mddev->raid_disks; mddev->delta_disks = raid_disks - mddev->raid_disks;
if (mddev->delta_disks < 0)
mddev->reshape_backwards = 1;
else if (mddev->delta_disks > 0)
mddev->reshape_backwards = 0;
rv = mddev->pers->check_reshape(mddev); rv = mddev->pers->check_reshape(mddev);
if (rv < 0) if (rv < 0) {
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
}
return rv; return rv;
} }
...@@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) ...@@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return -EINVAL; return -EINVAL;
mddev->bitmap_info.offset = mddev->bitmap_info.offset =
mddev->bitmap_info.default_offset; mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 1);
rv = bitmap_create(mddev); rv = bitmap_create(mddev);
if (!rv) if (!rv)
...@@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) ...@@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
/* remove the bitmap */ /* remove the bitmap */
if (!mddev->bitmap) if (!mddev->bitmap)
return -ENOENT; return -ENOENT;
if (mddev->bitmap->file) if (mddev->bitmap->storage.file)
return -EINVAL; return -EINVAL;
mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 1);
bitmap_destroy(mddev); bitmap_destroy(mddev);
...@@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode) ...@@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode)
struct mddev *mddev = mddev_find(bdev->bd_dev); struct mddev *mddev = mddev_find(bdev->bd_dev);
int err; int err;
if (!mddev)
return -ENODEV;
if (mddev->gendisk != bdev->bd_disk) { if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this /* we are racing with mddev_put which is discarding this
* bd_disk. * bd_disk.
...@@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) ...@@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors; max_sectors = mddev->resync_max_sectors;
else else
max_sectors = mddev->dev_sectors; max_sectors = mddev->dev_sectors;
...@@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev) ...@@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev)
j = mddev->recovery_cp; j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->dev_sectors; max_sectors = mddev->resync_max_sectors;
else { else {
/* recovery follows the physical size of devices */ /* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors; max_sectors = mddev->dev_sectors;
...@@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev) ...@@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock; goto unlock;
if (mddev->pers->sync_request) { if (mddev->pers->sync_request) {
if (spares && mddev->bitmap && ! mddev->bitmap->file) { if (spares) {
/* We are adding a device or devices to an array /* We are adding a device or devices to an array
* which has the bitmap stored on all devices. * which has the bitmap stored on all devices.
* So make sure all bitmap pages get written * So make sure all bitmap pages get written
...@@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) ...@@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
} }
EXPORT_SYMBOL(md_wait_for_blocked_rdev); EXPORT_SYMBOL(md_wait_for_blocked_rdev);
/*
 * Called by the personality module when a reshape completes.
 *
 * For every device in the array, make new_data_offset the current
 * data_offset and adjust rdev->sectors by the distance moved: the
 * usable size grows when the data start moved towards the beginning
 * of the device and shrinks when it moved away from it.
 */
void md_finish_reshape(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		sector_t old_start = rdev->data_offset;
		sector_t new_start = rdev->new_data_offset;

		if (old_start > new_start)
			rdev->sectors += old_start - new_start;
		else
			rdev->sectors -= new_start - old_start;

		rdev->data_offset = new_start;
	}
}
EXPORT_SYMBOL(md_finish_reshape);
/* Bad block management. /* Bad block management.
* We can record which blocks on each device are 'bad' and so just * We can record which blocks on each device are 'bad' and so just
...@@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, ...@@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
} }
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int acknowledged) int is_new)
{ {
int rv = md_set_badblocks(&rdev->badblocks, int rv;
s + rdev->data_offset, sectors, acknowledged); if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
rv = md_set_badblocks(&rdev->badblocks,
s, sectors, 0);
if (rv) { if (rv) {
/* Make sure they get written out promptly */ /* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state); sysfs_notify_dirent_safe(rdev->sysfs_state);
...@@ -8003,11 +8271,15 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) ...@@ -8003,11 +8271,15 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
return rv; return rv;
} }
int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors) int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new)
{ {
if (is_new)
s += rdev->new_data_offset;
else
s += rdev->data_offset;
return md_clear_badblocks(&rdev->badblocks, return md_clear_badblocks(&rdev->badblocks,
s + rdev->data_offset, s, sectors);
sectors);
} }
EXPORT_SYMBOL_GPL(rdev_clear_badblocks); EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
......
...@@ -55,6 +55,7 @@ struct md_rdev { ...@@ -55,6 +55,7 @@ struct md_rdev {
int sb_loaded; int sb_loaded;
__u64 sb_events; __u64 sb_events;
sector_t data_offset; /* start of data in array */ sector_t data_offset; /* start of data in array */
sector_t new_data_offset;/* only relevant while reshaping */
sector_t sb_start; /* offset of the super block (in 512byte sectors) */ sector_t sb_start; /* offset of the super block (in 512byte sectors) */
int sb_size; /* bytes in the superblock */ int sb_size; /* bytes in the superblock */
int preferred_minor; /* autorun support */ int preferred_minor; /* autorun support */
...@@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, ...@@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
return 0; return 0;
} }
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int acknowledged); int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern void md_ack_all_badblocks(struct badblocks *bb); extern void md_ack_all_badblocks(struct badblocks *bb);
struct mddev { struct mddev {
...@@ -262,6 +264,7 @@ struct mddev { ...@@ -262,6 +264,7 @@ struct mddev {
sector_t reshape_position; sector_t reshape_position;
int delta_disks, new_level, new_layout; int delta_disks, new_level, new_layout;
int new_chunk_sectors; int new_chunk_sectors;
int reshape_backwards;
atomic_t plug_cnt; /* If device is expecting atomic_t plug_cnt; /* If device is expecting
* more bios soon. * more bios soon.
...@@ -390,10 +393,13 @@ struct mddev { ...@@ -390,10 +393,13 @@ struct mddev {
* For external metadata, offset * For external metadata, offset
* from start of device. * from start of device.
*/ */
unsigned long space; /* space available at this offset */
loff_t default_offset; /* this is the offset to use when loff_t default_offset; /* this is the offset to use when
* hot-adding a bitmap. It should * hot-adding a bitmap. It should
* eventually be settable by sysfs. * eventually be settable by sysfs.
*/ */
unsigned long default_space; /* space available at
* default offset */
struct mutex mutex; struct mutex mutex;
unsigned long chunksize; unsigned long chunksize;
unsigned long daemon_sleep; /* how many jiffies between updates? */ unsigned long daemon_sleep; /* how many jiffies between updates? */
...@@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi); ...@@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev); extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
extern int mddev_congested(struct mddev *mddev, int bits); extern int mddev_congested(struct mddev *mddev, int bits);
extern void md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_flush_request(struct mddev *mddev, struct bio *bio);
...@@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev); ...@@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev);
extern void md_stop(struct mddev *mddev); extern void md_stop(struct mddev *mddev);
extern void md_stop_writes(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev); extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
extern void mddev_suspend(struct mddev *mddev); extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev);
......
...@@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk, ...@@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
rdev = conf->mirrors[d].rdev; rdev = conf->mirrors[d].rdev;
if (rdev && if (rdev &&
test_bit(In_sync, &rdev->flags) && (test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) &&
is_badblock(rdev, sect, s, is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0 && &first_bad, &bad_sectors) == 0 &&
sync_page_io(rdev, sect, s<<9, sync_page_io(rdev, sect, s<<9,
...@@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio ...@@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
continue; continue;
if (test_bit(BIO_UPTODATE, &bio->bi_flags) && if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
test_bit(R1BIO_MadeGood, &r1_bio->state)) { test_bit(R1BIO_MadeGood, &r1_bio->state)) {
rdev_clear_badblocks(rdev, r1_bio->sector, s); rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
} }
if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
test_bit(R1BIO_WriteError, &r1_bio->state)) { test_bit(R1BIO_WriteError, &r1_bio->state)) {
...@@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) ...@@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
struct md_rdev *rdev = conf->mirrors[m].rdev; struct md_rdev *rdev = conf->mirrors[m].rdev;
rdev_clear_badblocks(rdev, rdev_clear_badblocks(rdev,
r1_bio->sector, r1_bio->sector,
r1_bio->sectors); r1_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} else if (r1_bio->bios[m] != NULL) { } else if (r1_bio->bios[m] != NULL) {
/* This drive got a write error. We need to /* This drive got a write error. We need to
...@@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) ...@@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!disk->rdev || if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) { !test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0; disk->head_position = 0;
if (disk->rdev) if (disk->rdev &&
(disk->rdev->saved_raid_disk < 0))
conf->fullsync = 1; conf->fullsync = 1;
} else if (conf->last_used < 0) } else if (conf->last_used < 0)
/* /*
...@@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) ...@@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems * any io in the removed space completes, but it hardly seems
* worth it. * worth it.
*/ */
md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0)); sector_t newsize = raid1_size(mddev, sectors, 0);
if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL; return -EINVAL;
if (mddev->bitmap) {
int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
if (ret)
return ret;
}
md_set_array_sectors(mddev, newsize);
set_capacity(mddev->gendisk, mddev->array_sectors); set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk); revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && if (sectors > mddev->dev_sectors &&
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/kthread.h>
#include "md.h" #include "md.h"
#include "raid10.h" #include "raid10.h"
#include "raid0.h" #include "raid0.h"
...@@ -68,6 +69,11 @@ static int max_queued_requests = 1024; ...@@ -68,6 +69,11 @@ static int max_queued_requests = 1024;
static void allow_barrier(struct r10conf *conf); static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf);
static int enough(struct r10conf *conf, int ignore); static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio, int error);
static void end_reshape(struct r10conf *conf);
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{ {
...@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -112,7 +118,8 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
if (!r10_bio) if (!r10_bio)
return NULL; return NULL;
if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
nalloc = conf->copies; /* resync */ nalloc = conf->copies; /* resync */
else else
nalloc = 2; /* recovery */ nalloc = 2; /* recovery */
...@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -140,9 +147,10 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
struct bio *rbio = r10_bio->devs[j].repl_bio; struct bio *rbio = r10_bio->devs[j].repl_bio;
bio = r10_bio->devs[j].bio; bio = r10_bio->devs[j].bio;
for (i = 0; i < RESYNC_PAGES; i++) { for (i = 0; i < RESYNC_PAGES; i++) {
if (j == 1 && !test_bit(MD_RECOVERY_SYNC, if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
&conf->mddev->recovery)) { &conf->mddev->recovery)) {
/* we can share bv_page's during recovery */ /* we can share bv_page's during recovery
* and reshape */
struct bio *rbio = r10_bio->devs[0].bio; struct bio *rbio = r10_bio->devs[0].bio;
page = rbio->bi_io_vec[i].bv_page; page = rbio->bi_io_vec[i].bv_page;
get_page(page); get_page(page);
...@@ -165,10 +173,11 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) ...@@ -165,10 +173,11 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
while (j--) while (j--)
for (i = 0; i < RESYNC_PAGES ; i++) for (i = 0; i < RESYNC_PAGES ; i++)
safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
j = -1; j = 0;
out_free_bio: out_free_bio:
while (++j < nalloc) { for ( ; j < nalloc; j++) {
bio_put(r10_bio->devs[j].bio); if (r10_bio->devs[j].bio)
bio_put(r10_bio->devs[j].bio);
if (r10_bio->devs[j].repl_bio) if (r10_bio->devs[j].repl_bio)
bio_put(r10_bio->devs[j].repl_bio); bio_put(r10_bio->devs[j].repl_bio);
} }
...@@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error) ...@@ -504,79 +513,96 @@ static void raid10_end_write_request(struct bio *bio, int error)
* sector offset to a virtual address * sector offset to a virtual address
*/ */
static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{ {
int n,f; int n,f;
sector_t sector; sector_t sector;
sector_t chunk; sector_t chunk;
sector_t stripe; sector_t stripe;
int dev; int dev;
int slot = 0; int slot = 0;
/* now calculate first sector/dev */ /* now calculate first sector/dev */
chunk = r10bio->sector >> conf->chunk_shift; chunk = r10bio->sector >> geo->chunk_shift;
sector = r10bio->sector & conf->chunk_mask; sector = r10bio->sector & geo->chunk_mask;
chunk *= conf->near_copies; chunk *= geo->near_copies;
stripe = chunk; stripe = chunk;
dev = sector_div(stripe, conf->raid_disks); dev = sector_div(stripe, geo->raid_disks);
if (conf->far_offset) if (geo->far_offset)
stripe *= conf->far_copies; stripe *= geo->far_copies;
sector += stripe << conf->chunk_shift; sector += stripe << geo->chunk_shift;
/* and calculate all the others */ /* and calculate all the others */
for (n=0; n < conf->near_copies; n++) { for (n = 0; n < geo->near_copies; n++) {
int d = dev; int d = dev;
sector_t s = sector; sector_t s = sector;
r10bio->devs[slot].addr = sector; r10bio->devs[slot].addr = sector;
r10bio->devs[slot].devnum = d; r10bio->devs[slot].devnum = d;
slot++; slot++;
for (f = 1; f < conf->far_copies; f++) { for (f = 1; f < geo->far_copies; f++) {
d += conf->near_copies; d += geo->near_copies;
if (d >= conf->raid_disks) if (d >= geo->raid_disks)
d -= conf->raid_disks; d -= geo->raid_disks;
s += conf->stride; s += geo->stride;
r10bio->devs[slot].devnum = d; r10bio->devs[slot].devnum = d;
r10bio->devs[slot].addr = s; r10bio->devs[slot].addr = s;
slot++; slot++;
} }
dev++; dev++;
if (dev >= conf->raid_disks) { if (dev >= geo->raid_disks) {
dev = 0; dev = 0;
sector += (conf->chunk_mask + 1); sector += (geo->chunk_mask + 1);
} }
} }
BUG_ON(slot != conf->copies); }
static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{
struct geom *geo = &conf->geo;
if (conf->reshape_progress != MaxSector &&
((r10bio->sector >= conf->reshape_progress) !=
conf->mddev->reshape_backwards)) {
set_bit(R10BIO_Previous, &r10bio->state);
geo = &conf->prev;
} else
clear_bit(R10BIO_Previous, &r10bio->state);
__raid10_find_phys(geo, r10bio);
} }
static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{ {
sector_t offset, chunk, vchunk; sector_t offset, chunk, vchunk;
/* Never use conf->prev as this is only called during resync
* or recovery, so reshape isn't happening
*/
struct geom *geo = &conf->geo;
offset = sector & conf->chunk_mask; offset = sector & geo->chunk_mask;
if (conf->far_offset) { if (geo->far_offset) {
int fc; int fc;
chunk = sector >> conf->chunk_shift; chunk = sector >> geo->chunk_shift;
fc = sector_div(chunk, conf->far_copies); fc = sector_div(chunk, geo->far_copies);
dev -= fc * conf->near_copies; dev -= fc * geo->near_copies;
if (dev < 0) if (dev < 0)
dev += conf->raid_disks; dev += geo->raid_disks;
} else { } else {
while (sector >= conf->stride) { while (sector >= geo->stride) {
sector -= conf->stride; sector -= geo->stride;
if (dev < conf->near_copies) if (dev < geo->near_copies)
dev += conf->raid_disks - conf->near_copies; dev += geo->raid_disks - geo->near_copies;
else else
dev -= conf->near_copies; dev -= geo->near_copies;
} }
chunk = sector >> conf->chunk_shift; chunk = sector >> geo->chunk_shift;
} }
vchunk = chunk * conf->raid_disks + dev; vchunk = chunk * geo->raid_disks + dev;
sector_div(vchunk, conf->near_copies); sector_div(vchunk, geo->near_copies);
return (vchunk << conf->chunk_shift) + offset; return (vchunk << geo->chunk_shift) + offset;
} }
/** /**
...@@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q, ...@@ -597,10 +623,17 @@ static int raid10_mergeable_bvec(struct request_queue *q,
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max; int max;
unsigned int chunk_sectors = mddev->chunk_sectors; unsigned int chunk_sectors;
unsigned int bio_sectors = bvm->bi_size >> 9; unsigned int bio_sectors = bvm->bi_size >> 9;
struct geom *geo = &conf->geo;
chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
if (conf->reshape_progress != MaxSector &&
((sector >= conf->reshape_progress) !=
conf->mddev->reshape_backwards))
geo = &conf->prev;
if (conf->near_copies < conf->raid_disks) { if (geo->near_copies < geo->raid_disks) {
max = (chunk_sectors - ((sector & (chunk_sectors - 1)) max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ bio_sectors)) << 9; + bio_sectors)) << 9;
if (max < 0) if (max < 0)
...@@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q, ...@@ -614,6 +647,12 @@ static int raid10_mergeable_bvec(struct request_queue *q,
if (mddev->merge_check_needed) { if (mddev->merge_check_needed) {
struct r10bio r10_bio; struct r10bio r10_bio;
int s; int s;
if (conf->reshape_progress != MaxSector) {
/* Cannot give any guidance during reshape */
if (max <= biovec->bv_len && bio_sectors == 0)
return biovec->bv_len;
return 0;
}
r10_bio.sector = sector; r10_bio.sector = sector;
raid10_find_phys(conf, &r10_bio); raid10_find_phys(conf, &r10_bio);
rcu_read_lock(); rcu_read_lock();
...@@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, ...@@ -681,6 +720,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
struct md_rdev *rdev, *best_rdev; struct md_rdev *rdev, *best_rdev;
int do_balance; int do_balance;
int best_slot; int best_slot;
struct geom *geo = &conf->geo;
raid10_find_phys(conf, r10_bio); raid10_find_phys(conf, r10_bio);
rcu_read_lock(); rcu_read_lock();
...@@ -761,11 +801,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, ...@@ -761,11 +801,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
* sequential read speed for 'far copies' arrays. So only * sequential read speed for 'far copies' arrays. So only
* keep it for 'near' arrays, and review those later. * keep it for 'near' arrays, and review those later.
*/ */
if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
break; break;
/* for far > 1 always use the lowest address */ /* for far > 1 always use the lowest address */
if (conf->far_copies > 1) if (geo->far_copies > 1)
new_distance = r10_bio->devs[slot].addr; new_distance = r10_bio->devs[slot].addr;
else else
new_distance = abs(r10_bio->devs[slot].addr - new_distance = abs(r10_bio->devs[slot].addr -
...@@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits) ...@@ -812,7 +852,10 @@ static int raid10_congested(void *data, int bits)
if (mddev_congested(mddev, bits)) if (mddev_congested(mddev, bits))
return 1; return 1;
rcu_read_lock(); rcu_read_lock();
for (i = 0; i < conf->raid_disks && ret == 0; i++) { for (i = 0;
(i < conf->geo.raid_disks || i < conf->prev.raid_disks)
&& ret == 0;
i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) { if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q = bdev_get_queue(rdev->bdev); struct request_queue *q = bdev_get_queue(rdev->bdev);
...@@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf) ...@@ -973,13 +1016,24 @@ static void unfreeze_array(struct r10conf *conf)
spin_unlock_irq(&conf->resync_lock); spin_unlock_irq(&conf->resync_lock);
} }
/*
 * Select which per-device data offset applies to a request.
 *
 * rdev->new_data_offset is used only when the array's recovery state has
 * MD_RECOVERY_RESHAPE set and the request is not flagged R10BIO_Previous
 * (the flag raid10_find_phys() sets when it maps a request with the
 * previous geometry).  In every other case the current rdev->data_offset
 * is returned.
 */
static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	int use_new_offset =
		test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) &&
		!test_bit(R10BIO_Previous, &r10_bio->state);

	return use_new_offset ? rdev->new_data_offset : rdev->data_offset;
}
static void make_request(struct mddev *mddev, struct bio * bio) static void make_request(struct mddev *mddev, struct bio * bio)
{ {
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
struct r10bio *r10_bio; struct r10bio *r10_bio;
struct bio *read_bio; struct bio *read_bio;
int i; int i;
int chunk_sects = conf->chunk_mask + 1; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
const int rw = bio_data_dir(bio); const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_fua = (bio->bi_rw & REQ_FUA); const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
...@@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -988,6 +1042,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
int plugged; int plugged;
int sectors_handled; int sectors_handled;
int max_sectors; int max_sectors;
int sectors;
if (unlikely(bio->bi_rw & REQ_FLUSH)) { if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_flush_request(mddev, bio); md_flush_request(mddev, bio);
...@@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -997,9 +1052,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
/* If this request crosses a chunk boundary, we need to /* If this request crosses a chunk boundary, we need to
* split it. This will only happen for 1 PAGE (or less) requests. * split it. This will only happen for 1 PAGE (or less) requests.
*/ */
if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9) if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
> chunk_sects && > chunk_sects
conf->near_copies < conf->raid_disks)) { && (conf->geo.near_copies < conf->geo.raid_disks
|| conf->prev.near_copies < conf->prev.raid_disks))) {
struct bio_pair *bp; struct bio_pair *bp;
/* Sanity check -- queue functions should prevent this happening */ /* Sanity check -- queue functions should prevent this happening */
if (bio->bi_vcnt != 1 || if (bio->bi_vcnt != 1 ||
...@@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1051,10 +1107,41 @@ static void make_request(struct mddev *mddev, struct bio * bio)
*/ */
wait_barrier(conf); wait_barrier(conf);
sectors = bio->bi_size >> 9;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_sector < conf->reshape_progress &&
bio->bi_sector + sectors > conf->reshape_progress) {
/* IO spans the reshape position. Need to wait for
* reshape to pass
*/
allow_barrier(conf);
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_sector ||
conf->reshape_progress >= bio->bi_sector + sectors);
wait_barrier(conf);
}
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio_data_dir(bio) == WRITE &&
(mddev->reshape_backwards
? (bio->bi_sector < conf->reshape_safe &&
bio->bi_sector + sectors > conf->reshape_progress)
: (bio->bi_sector + sectors > conf->reshape_safe &&
bio->bi_sector < conf->reshape_progress))) {
/* Need to update reshape_position in metadata */
mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
conf->reshape_safe = mddev->reshape_position;
}
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio; r10_bio->master_bio = bio;
r10_bio->sectors = bio->bi_size >> 9; r10_bio->sectors = sectors;
r10_bio->mddev = mddev; r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_sector; r10_bio->sector = bio->bi_sector;
...@@ -1093,7 +1180,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1093,7 +1180,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
r10_bio->devs[slot].rdev = rdev; r10_bio->devs[slot].rdev = rdev;
read_bio->bi_sector = r10_bio->devs[slot].addr + read_bio->bi_sector = r10_bio->devs[slot].addr +
rdev->data_offset; choose_data_offset(r10_bio, rdev);
read_bio->bi_bdev = rdev->bdev; read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request; read_bio->bi_end_io = raid10_end_read_request;
read_bio->bi_rw = READ | do_sync; read_bio->bi_rw = READ | do_sync;
...@@ -1297,7 +1384,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1297,7 +1384,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
r10_bio->devs[i].bio = mbio; r10_bio->devs[i].bio = mbio;
mbio->bi_sector = (r10_bio->devs[i].addr+ mbio->bi_sector = (r10_bio->devs[i].addr+
conf->mirrors[d].rdev->data_offset); choose_data_offset(r10_bio,
conf->mirrors[d].rdev));
mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request; mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua; mbio->bi_rw = WRITE | do_sync | do_fua;
...@@ -1321,8 +1409,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) ...@@ -1321,8 +1409,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
* so it cannot disappear, so the replacement cannot * so it cannot disappear, so the replacement cannot
* become NULL here * become NULL here
*/ */
mbio->bi_sector = (r10_bio->devs[i].addr+ mbio->bi_sector = (r10_bio->devs[i].addr +
conf->mirrors[d].replacement->data_offset); choose_data_offset(
r10_bio,
conf->mirrors[d].replacement));
mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
mbio->bi_end_io = raid10_end_write_request; mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua; mbio->bi_rw = WRITE | do_sync | do_fua;
...@@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev) ...@@ -1368,19 +1458,19 @@ static void status(struct seq_file *seq, struct mddev *mddev)
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
int i; int i;
if (conf->near_copies < conf->raid_disks) if (conf->geo.near_copies < conf->geo.raid_disks)
seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
if (conf->near_copies > 1) if (conf->geo.near_copies > 1)
seq_printf(seq, " %d near-copies", conf->near_copies); seq_printf(seq, " %d near-copies", conf->geo.near_copies);
if (conf->far_copies > 1) { if (conf->geo.far_copies > 1) {
if (conf->far_offset) if (conf->geo.far_offset)
seq_printf(seq, " %d offset-copies", conf->far_copies); seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
else else
seq_printf(seq, " %d far-copies", conf->far_copies); seq_printf(seq, " %d far-copies", conf->geo.far_copies);
} }
seq_printf(seq, " [%d/%d] [", conf->raid_disks, seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
conf->raid_disks - mddev->degraded); conf->geo.raid_disks - mddev->degraded);
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->geo.raid_disks; i++)
seq_printf(seq, "%s", seq_printf(seq, "%s",
conf->mirrors[i].rdev && conf->mirrors[i].rdev &&
test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
...@@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) ...@@ -1392,7 +1482,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
* Don't consider the device numbered 'ignore' * Don't consider the device numbered 'ignore'
* as we might be about to remove it. * as we might be about to remove it.
*/ */
static int enough(struct r10conf *conf, int ignore) static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
{ {
int first = 0; int first = 0;
...@@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore) ...@@ -1403,7 +1493,7 @@ static int enough(struct r10conf *conf, int ignore)
if (conf->mirrors[first].rdev && if (conf->mirrors[first].rdev &&
first != ignore) first != ignore)
cnt++; cnt++;
first = (first+1) % conf->raid_disks; first = (first+1) % geo->raid_disks;
} }
if (cnt == 0) if (cnt == 0)
return 0; return 0;
...@@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore) ...@@ -1411,6 +1501,12 @@ static int enough(struct r10conf *conf, int ignore)
return 1; return 1;
} }
/*
 * Run the _enough() check against both geometries held in conf: the
 * current one (conf->geo) and the previous one (conf->prev).
 *
 * 'ignore' is handed to each check unchanged; it names a device number
 * that _enough() should leave out of consideration.
 *
 * Returns 1 only when both checks succeed, 0 otherwise.  The previous
 * geometry is not examined if the current one already fails.
 */
static int enough(struct r10conf *conf, int ignore)
{
	if (!_enough(conf, &conf->geo, ignore))
		return 0;
	if (!_enough(conf, &conf->prev, ignore))
		return 0;
	return 1;
}
static void error(struct mddev *mddev, struct md_rdev *rdev) static void error(struct mddev *mddev, struct md_rdev *rdev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
...@@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1445,7 +1541,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
"md/raid10:%s: Disk failure on %s, disabling device.\n" "md/raid10:%s: Disk failure on %s, disabling device.\n"
"md/raid10:%s: Operation continuing on %d devices.\n", "md/raid10:%s: Operation continuing on %d devices.\n",
mdname(mddev), bdevname(rdev->bdev, b), mdname(mddev), bdevname(rdev->bdev, b),
mdname(mddev), conf->raid_disks - mddev->degraded); mdname(mddev), conf->geo.raid_disks - mddev->degraded);
} }
static void print_conf(struct r10conf *conf) static void print_conf(struct r10conf *conf)
...@@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf) ...@@ -1458,10 +1554,10 @@ static void print_conf(struct r10conf *conf)
printk(KERN_DEBUG "(!conf)\n"); printk(KERN_DEBUG "(!conf)\n");
return; return;
} }
printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
conf->raid_disks); conf->geo.raid_disks);
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->geo.raid_disks; i++) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
tmp = conf->mirrors + i; tmp = conf->mirrors + i;
if (tmp->rdev) if (tmp->rdev)
...@@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev) ...@@ -1493,7 +1589,7 @@ static int raid10_spare_active(struct mddev *mddev)
* Find all non-in_sync disks within the RAID10 configuration * Find all non-in_sync disks within the RAID10 configuration
* and mark them in_sync * and mark them in_sync
*/ */
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->geo.raid_disks; i++) {
tmp = conf->mirrors + i; tmp = conf->mirrors + i;
if (tmp->replacement if (tmp->replacement
&& tmp->replacement->recovery_offset == MaxSector && tmp->replacement->recovery_offset == MaxSector
...@@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1535,7 +1631,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int err = -EEXIST; int err = -EEXIST;
int mirror; int mirror;
int first = 0; int first = 0;
int last = conf->raid_disks - 1; int last = conf->geo.raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev); struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_cp < MaxSector) if (mddev->recovery_cp < MaxSector)
...@@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1543,7 +1639,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* very different from resync * very different from resync
*/ */
return -EBUSY; return -EBUSY;
if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
return -EINVAL; return -EINVAL;
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
...@@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1635,6 +1731,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (!test_bit(Faulty, &rdev->flags) && if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != p->recovery_disabled && mddev->recovery_disabled != p->recovery_disabled &&
(!p->replacement || p->replacement == rdev) && (!p->replacement || p->replacement == rdev) &&
number < conf->geo.raid_disks &&
enough(conf, -1)) { enough(conf, -1)) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
...@@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error) ...@@ -1676,7 +1773,11 @@ static void end_sync_read(struct bio *bio, int error)
struct r10conf *conf = r10_bio->mddev->private; struct r10conf *conf = r10_bio->mddev->private;
int d; int d;
d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); if (bio == r10_bio->master_bio) {
/* this is a reshape read */
d = r10_bio->read_slot; /* really the read dev */
} else
d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
if (test_bit(BIO_UPTODATE, &bio->bi_flags)) if (test_bit(BIO_UPTODATE, &bio->bi_flags))
set_bit(R10BIO_Uptodate, &r10_bio->state); set_bit(R10BIO_Uptodate, &r10_bio->state);
...@@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 ...@@ -2218,7 +2319,9 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
" (%d sectors at %llu on %s)\n", " (%d sectors at %llu on %s)\n",
mdname(mddev), s, mdname(mddev), s,
(unsigned long long)( (unsigned long long)(
sect + rdev->data_offset), sect +
choose_data_offset(r10_bio,
rdev)),
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing " printk(KERN_NOTICE "md/raid10:%s: %s: failing "
"drive\n", "drive\n",
...@@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 ...@@ -2256,7 +2359,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
" (%d sectors at %llu on %s)\n", " (%d sectors at %llu on %s)\n",
mdname(mddev), s, mdname(mddev), s,
(unsigned long long)( (unsigned long long)(
sect + rdev->data_offset), sect +
choose_data_offset(r10_bio, rdev)),
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
printk(KERN_NOTICE "md/raid10:%s: %s: failing " printk(KERN_NOTICE "md/raid10:%s: %s: failing "
"drive\n", "drive\n",
...@@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 ...@@ -2269,7 +2373,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
" (%d sectors at %llu on %s)\n", " (%d sectors at %llu on %s)\n",
mdname(mddev), s, mdname(mddev), s,
(unsigned long long)( (unsigned long long)(
sect + rdev->data_offset), sect +
choose_data_offset(r10_bio, rdev)),
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
atomic_add(s, &rdev->corrected_errors); atomic_add(s, &rdev->corrected_errors);
} }
...@@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) ...@@ -2343,7 +2448,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(wbio, sector - bio->bi_sector, sectors); md_trim_bio(wbio, sector - bio->bi_sector, sectors);
wbio->bi_sector = (r10_bio->devs[i].addr+ wbio->bi_sector = (r10_bio->devs[i].addr+
rdev->data_offset+ choose_data_offset(r10_bio, rdev) +
(sector - r10_bio->sector)); (sector - r10_bio->sector));
wbio->bi_bdev = rdev->bdev; wbio->bi_bdev = rdev->bdev;
if (submit_bio_wait(WRITE, wbio) == 0) if (submit_bio_wait(WRITE, wbio) == 0)
...@@ -2420,7 +2525,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) ...@@ -2420,7 +2525,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].bio = bio;
r10_bio->devs[slot].rdev = rdev; r10_bio->devs[slot].rdev = rdev;
bio->bi_sector = r10_bio->devs[slot].addr bio->bi_sector = r10_bio->devs[slot].addr
+ rdev->data_offset; + choose_data_offset(r10_bio, rdev);
bio->bi_bdev = rdev->bdev; bio->bi_bdev = rdev->bdev;
bio->bi_rw = READ | do_sync; bio->bi_rw = READ | do_sync;
bio->bi_private = r10_bio; bio->bi_private = r10_bio;
...@@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2480,7 +2585,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_clear_badblocks( rdev_clear_badblocks(
rdev, rdev,
r10_bio->devs[m].addr, r10_bio->devs[m].addr,
r10_bio->sectors); r10_bio->sectors, 0);
} else { } else {
if (!rdev_set_badblocks( if (!rdev_set_badblocks(
rdev, rdev,
...@@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2496,7 +2601,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_clear_badblocks( rdev_clear_badblocks(
rdev, rdev,
r10_bio->devs[m].addr, r10_bio->devs[m].addr,
r10_bio->sectors); r10_bio->sectors, 0);
} else { } else {
if (!rdev_set_badblocks( if (!rdev_set_badblocks(
rdev, rdev,
...@@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2515,7 +2620,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_clear_badblocks( rdev_clear_badblocks(
rdev, rdev,
r10_bio->devs[m].addr, r10_bio->devs[m].addr,
r10_bio->sectors); r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} else if (bio != NULL && } else if (bio != NULL &&
!test_bit(BIO_UPTODATE, &bio->bi_flags)) { !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
...@@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2532,7 +2637,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
rdev_clear_badblocks( rdev_clear_badblocks(
rdev, rdev,
r10_bio->devs[m].addr, r10_bio->devs[m].addr,
r10_bio->sectors); r10_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
} }
...@@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev) ...@@ -2573,6 +2678,8 @@ static void raid10d(struct mddev *mddev)
if (test_bit(R10BIO_MadeGood, &r10_bio->state) || if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
test_bit(R10BIO_WriteError, &r10_bio->state)) test_bit(R10BIO_WriteError, &r10_bio->state))
handle_write_completed(conf, r10_bio); handle_write_completed(conf, r10_bio);
else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
reshape_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_IsSync, &r10_bio->state)) else if (test_bit(R10BIO_IsSync, &r10_bio->state))
sync_request_write(mddev, r10_bio); sync_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
...@@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf) ...@@ -2603,7 +2710,7 @@ static int init_resync(struct r10conf *conf)
buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
BUG_ON(conf->r10buf_pool); BUG_ON(conf->r10buf_pool);
conf->have_replacement = 0; conf->have_replacement = 0;
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->geo.raid_disks; i++)
if (conf->mirrors[i].replacement) if (conf->mirrors[i].replacement)
conf->have_replacement = 1; conf->have_replacement = 1;
conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
...@@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2657,6 +2764,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t sync_blocks; sector_t sync_blocks;
sector_t sectors_skipped = 0; sector_t sectors_skipped = 0;
int chunks_skipped = 0; int chunks_skipped = 0;
sector_t chunk_mask = conf->geo.chunk_mask;
if (!conf->r10buf_pool) if (!conf->r10buf_pool)
if (init_resync(conf)) if (init_resync(conf))
...@@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2664,7 +2772,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
skipped: skipped:
max_sector = mddev->dev_sectors; max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors; max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) { if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the /* If we aborted, we need to abort the
...@@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2676,11 +2785,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
* we need to convert that to several * we need to convert that to several
* virtual addresses. * virtual addresses.
*/ */
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
return 0;
}
if (mddev->curr_resync < max_sector) { /* aborted */ if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
bitmap_end_sync(mddev->bitmap, mddev->curr_resync, bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1); &sync_blocks, 1);
else for (i=0; i<conf->raid_disks; i++) { else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect = sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i); raid10_find_virt(conf, mddev->curr_resync, i);
bitmap_end_sync(mddev->bitmap, sect, bitmap_end_sync(mddev->bitmap, sect,
...@@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2694,7 +2808,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* Completed a full sync so the replacements /* Completed a full sync so the replacements
* are now fully recovered. * are now fully recovered.
*/ */
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->geo.raid_disks; i++)
if (conf->mirrors[i].replacement) if (conf->mirrors[i].replacement)
conf->mirrors[i].replacement conf->mirrors[i].replacement
->recovery_offset ->recovery_offset
...@@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2707,7 +2821,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*skipped = 1; *skipped = 1;
return sectors_skipped; return sectors_skipped;
} }
if (chunks_skipped >= conf->raid_disks) {
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
if (chunks_skipped >= conf->geo.raid_disks) {
/* if there has been nothing to do on any drive, /* if there has been nothing to do on any drive,
* then there is nothing to do at all.. * then there is nothing to do at all..
*/ */
...@@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2721,9 +2839,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* make sure whole request will fit in a chunk - if chunks /* make sure whole request will fit in a chunk - if chunks
* are meaningful * are meaningful
*/ */
if (conf->near_copies < conf->raid_disks && if (conf->geo.near_copies < conf->geo.raid_disks &&
max_sector > (sector_nr | conf->chunk_mask)) max_sector > (sector_nr | chunk_mask))
max_sector = (sector_nr | conf->chunk_mask) + 1; max_sector = (sector_nr | chunk_mask) + 1;
/* /*
* If there is non-resync activity waiting for us then * If there is non-resync activity waiting for us then
* put in a delay to throttle resync. * put in a delay to throttle resync.
...@@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2752,7 +2870,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
int j; int j;
r10_bio = NULL; r10_bio = NULL;
for (i=0 ; i<conf->raid_disks; i++) { for (i = 0 ; i < conf->geo.raid_disks; i++) {
int still_degraded; int still_degraded;
struct r10bio *rb2; struct r10bio *rb2;
sector_t sect; sector_t sect;
...@@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2806,7 +2924,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* Need to check if the array will still be /* Need to check if the array will still be
* degraded * degraded
*/ */
for (j=0; j<conf->raid_disks; j++) for (j = 0; j < conf->geo.raid_disks; j++)
if (conf->mirrors[j].rdev == NULL || if (conf->mirrors[j].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
still_degraded = 1; still_degraded = 1;
...@@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -2984,9 +3102,9 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sector = sector_nr; r10_bio->sector = sector_nr;
set_bit(R10BIO_IsSync, &r10_bio->state); set_bit(R10BIO_IsSync, &r10_bio->state);
raid10_find_phys(conf, r10_bio); raid10_find_phys(conf, r10_bio);
r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1; r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
for (i=0; i<conf->copies; i++) { for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum; int d = r10_bio->devs[i].devnum;
sector_t first_bad, sector; sector_t first_bad, sector;
int bad_sectors; int bad_sectors;
...@@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) ...@@ -3152,16 +3270,17 @@ raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
if (!raid_disks) if (!raid_disks)
raid_disks = conf->raid_disks; raid_disks = min(conf->geo.raid_disks,
conf->prev.raid_disks);
if (!sectors) if (!sectors)
sectors = conf->dev_sectors; sectors = conf->dev_sectors;
size = sectors >> conf->chunk_shift; size = sectors >> conf->geo.chunk_shift;
sector_div(size, conf->far_copies); sector_div(size, conf->geo.far_copies);
size = size * raid_disks; size = size * raid_disks;
sector_div(size, conf->near_copies); sector_div(size, conf->geo.near_copies);
return size << conf->chunk_shift; return size << conf->geo.chunk_shift;
} }
static void calc_sectors(struct r10conf *conf, sector_t size) static void calc_sectors(struct r10conf *conf, sector_t size)
...@@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size) ...@@ -3171,10 +3290,10 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
* conf->stride * conf->stride
*/ */
size = size >> conf->chunk_shift; size = size >> conf->geo.chunk_shift;
sector_div(size, conf->far_copies); sector_div(size, conf->geo.far_copies);
size = size * conf->raid_disks; size = size * conf->geo.raid_disks;
sector_div(size, conf->near_copies); sector_div(size, conf->geo.near_copies);
/* 'size' is now the number of chunks in the array */ /* 'size' is now the number of chunks in the array */
/* calculate "used chunks per device" */ /* calculate "used chunks per device" */
size = size * conf->copies; size = size * conf->copies;
...@@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size) ...@@ -3182,38 +3301,76 @@ static void calc_sectors(struct r10conf *conf, sector_t size)
/* We need to round up when dividing by raid_disks to /* We need to round up when dividing by raid_disks to
* get the stride size. * get the stride size.
*/ */
size = DIV_ROUND_UP_SECTOR_T(size, conf->raid_disks); size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
conf->dev_sectors = size << conf->chunk_shift; conf->dev_sectors = size << conf->geo.chunk_shift;
if (conf->far_offset) if (conf->geo.far_offset)
conf->stride = 1 << conf->chunk_shift; conf->geo.stride = 1 << conf->geo.chunk_shift;
else { else {
sector_div(size, conf->far_copies); sector_div(size, conf->geo.far_copies);
conf->stride = size << conf->chunk_shift; conf->geo.stride = size << conf->geo.chunk_shift;
} }
} }
/* Selects which of the geometries held in an mddev setup_geo() describes. */
enum geo_type {
	geo_new,	/* 'new' layout and chunk, raid_disks as it stands */
	geo_old,	/* current layout and chunk, delta_disks subtracted */
	geo_start,	/* 'new' layout and chunk, delta_disks added */
};

/*
 * Fill in @geo from the layout, chunk size and device count that @mddev
 * holds for the geometry selected by @new.
 *
 * Returns near_copies * far_copies on success, -1 if the layout has any
 * bit set above bit 16, or -2 if the chunk is smaller than a page or is
 * not a power of two.  @geo is not written when an error is returned.
 */
static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
{
	int layout, chunk, disks;
	int near, far, offset;

	if (new == geo_old) {
		layout = mddev->layout;
		chunk = mddev->chunk_sectors;
		disks = mddev->raid_disks - mddev->delta_disks;
	} else if (new == geo_new) {
		layout = mddev->new_layout;
		chunk = mddev->new_chunk_sectors;
		disks = mddev->raid_disks;
	} else {
		/* geo_start (and anything unexpected): new geometry when
		 * starting a reshape - raid_disks not updated yet.
		 */
		layout = mddev->new_layout;
		chunk = mddev->new_chunk_sectors;
		disks = mddev->raid_disks + mddev->delta_disks;
	}

	/* only bits 0-16 of the layout word are understood */
	if (layout >> 17)
		return -1;
	if (!is_power_of_2(chunk) || chunk < (PAGE_SIZE >> 9))
		return -2;

	/* bits 0-7: near copies, bits 8-15: far copies, bit 16: offset flag */
	near = layout & 255;
	far = (layout >> 8) & 255;
	offset = layout & (1<<16);

	geo->raid_disks = disks;
	geo->near_copies = near;
	geo->far_copies = far;
	geo->far_offset = offset;
	geo->chunk_mask = chunk - 1;
	geo->chunk_shift = ffz(~chunk);

	return near * far;
}
static struct r10conf *setup_conf(struct mddev *mddev) static struct r10conf *setup_conf(struct mddev *mddev)
{ {
struct r10conf *conf = NULL; struct r10conf *conf = NULL;
int nc, fc, fo;
int err = -EINVAL; int err = -EINVAL;
struct geom geo;
int copies;
copies = setup_geo(&geo, mddev, geo_new);
if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) || if (copies == -2) {
!is_power_of_2(mddev->new_chunk_sectors)) {
printk(KERN_ERR "md/raid10:%s: chunk size must be " printk(KERN_ERR "md/raid10:%s: chunk size must be "
"at least PAGE_SIZE(%ld) and be a power of 2.\n", "at least PAGE_SIZE(%ld) and be a power of 2.\n",
mdname(mddev), PAGE_SIZE); mdname(mddev), PAGE_SIZE);
goto out; goto out;
} }
nc = mddev->new_layout & 255; if (copies < 2 || copies > mddev->raid_disks) {
fc = (mddev->new_layout >> 8) & 255;
fo = mddev->new_layout & (1<<16);
if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
(mddev->new_layout >> 17)) {
printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
mdname(mddev), mddev->new_layout); mdname(mddev), mddev->new_layout);
goto out; goto out;
...@@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) ...@@ -3224,7 +3381,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf) if (!conf)
goto out; goto out;
conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, /* FIXME calc properly */
conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
max(0,mddev->delta_disks)),
GFP_KERNEL); GFP_KERNEL);
if (!conf->mirrors) if (!conf->mirrors)
goto out; goto out;
...@@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev) ...@@ -3233,22 +3392,29 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->tmppage) if (!conf->tmppage)
goto out; goto out;
conf->geo = geo;
conf->raid_disks = mddev->raid_disks; conf->copies = copies;
conf->near_copies = nc;
conf->far_copies = fc;
conf->copies = nc*fc;
conf->far_offset = fo;
conf->chunk_mask = mddev->new_chunk_sectors - 1;
conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
r10bio_pool_free, conf); r10bio_pool_free, conf);
if (!conf->r10bio_pool) if (!conf->r10bio_pool)
goto out; goto out;
calc_sectors(conf, mddev->dev_sectors); calc_sectors(conf, mddev->dev_sectors);
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
conf->reshape_progress = MaxSector;
} else {
if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
err = -EINVAL;
goto out;
}
conf->reshape_progress = mddev->reshape_position;
if (conf->prev.far_offset)
conf->prev.stride = 1 << conf->prev.chunk_shift;
else
/* far_copies must be 1 */
conf->prev.stride = conf->dev_sectors;
}
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->retry_list);
...@@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev) ...@@ -3263,8 +3429,9 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return conf; return conf;
out: out:
printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", if (err == -ENOMEM)
mdname(mddev)); printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
mdname(mddev));
if (conf) { if (conf) {
if (conf->r10bio_pool) if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool); mempool_destroy(conf->r10bio_pool);
...@@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev) ...@@ -3282,12 +3449,8 @@ static int run(struct mddev *mddev)
struct mirror_info *disk; struct mirror_info *disk;
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t size; sector_t size;
sector_t min_offset_diff = 0;
/* int first = 1;
* copy the already verified devices into our private RAID10
* bookkeeping area. [whatever we allocate in run(),
* should be freed in stop()]
*/
if (mddev->private == NULL) { if (mddev->private == NULL) {
conf = setup_conf(mddev); conf = setup_conf(mddev);
...@@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev) ...@@ -3304,17 +3467,20 @@ static int run(struct mddev *mddev)
chunk_size = mddev->chunk_sectors << 9; chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_min(mddev->queue, chunk_size);
if (conf->raid_disks % conf->near_copies) if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks); blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
else else
blk_queue_io_opt(mddev->queue, chunk_size * blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks / conf->near_copies)); (conf->geo.raid_disks / conf->geo.near_copies));
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
long long diff;
disk_idx = rdev->raid_disk; disk_idx = rdev->raid_disk;
if (disk_idx >= conf->raid_disks if (disk_idx < 0)
|| disk_idx < 0) continue;
if (disk_idx >= conf->geo.raid_disks &&
disk_idx >= conf->prev.raid_disks)
continue; continue;
disk = conf->mirrors + disk_idx; disk = conf->mirrors + disk_idx;
...@@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev) ...@@ -3327,12 +3493,20 @@ static int run(struct mddev *mddev)
goto out_free_conf; goto out_free_conf;
disk->rdev = rdev; disk->rdev = rdev;
} }
diff = (rdev->new_data_offset - rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
if (diff < 0)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
disk->head_position = 0; disk->head_position = 0;
} }
/* need to check that every block has at least one working mirror */ /* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) { if (!enough(conf, -1)) {
printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
...@@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev) ...@@ -3340,8 +3514,21 @@ static int run(struct mddev *mddev)
goto out_free_conf; goto out_free_conf;
} }
if (conf->reshape_progress != MaxSector) {
/* must ensure that shape change is supported */
if (conf->geo.far_copies != 1 &&
conf->geo.far_offset == 0)
goto out_free_conf;
if (conf->prev.far_copies != 1 &&
conf->geo.far_offset == 0)
goto out_free_conf;
}
mddev->degraded = 0; mddev->degraded = 0;
for (i = 0; i < conf->raid_disks; i++) { for (i = 0;
i < conf->geo.raid_disks
|| i < conf->prev.raid_disks;
i++) {
disk = conf->mirrors + i; disk = conf->mirrors + i;
...@@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev) ...@@ -3368,8 +3555,8 @@ static int run(struct mddev *mddev)
mdname(mddev)); mdname(mddev));
printk(KERN_INFO printk(KERN_INFO
"md/raid10:%s: active with %d out of %d devices\n", "md/raid10:%s: active with %d out of %d devices\n",
mdname(mddev), conf->raid_disks - mddev->degraded, mdname(mddev), conf->geo.raid_disks - mddev->degraded,
conf->raid_disks); conf->geo.raid_disks);
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
...@@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev) ...@@ -3386,11 +3573,11 @@ static int run(struct mddev *mddev)
* maybe... * maybe...
*/ */
{ {
int stripe = conf->raid_disks * int stripe = conf->geo.raid_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE); ((mddev->chunk_sectors << 9) / PAGE_SIZE);
stripe /= conf->near_copies; stripe /= conf->geo.near_copies;
if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2* stripe; mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
} }
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
...@@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev) ...@@ -3398,6 +3585,30 @@ static int run(struct mddev *mddev)
if (md_integrity_register(mddev)) if (md_integrity_register(mddev))
goto out_free_conf; goto out_free_conf;
if (conf->reshape_progress != MaxSector) {
unsigned long before_length, after_length;
before_length = ((1 << conf->prev.chunk_shift) *
conf->prev.far_copies);
after_length = ((1 << conf->geo.chunk_shift) *
conf->geo.far_copies);
if (max(before_length, after_length) > min_offset_diff) {
/* This cannot work */
printk("md/raid10: offset difference not enough to continue reshape\n");
goto out_free_conf;
}
conf->offset_diff = min_offset_diff;
conf->reshape_safe = conf->reshape_progress;
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
}
return 0; return 0;
out_free_conf: out_free_conf:
...@@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) ...@@ -3460,14 +3671,23 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
sector_t oldsize, size; sector_t oldsize, size;
if (conf->far_copies > 1 && !conf->far_offset) if (mddev->reshape_position != MaxSector)
return -EBUSY;
if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
return -EINVAL; return -EINVAL;
oldsize = raid10_size(mddev, 0, 0); oldsize = raid10_size(mddev, 0, 0);
size = raid10_size(mddev, sectors, 0); size = raid10_size(mddev, sectors, 0);
md_set_array_sectors(mddev, size); if (mddev->external_size &&
if (mddev->array_sectors > size) mddev->array_sectors > size)
return -EINVAL; return -EINVAL;
if (mddev->bitmap) {
int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
if (ret)
return ret;
}
md_set_array_sectors(mddev, size);
set_capacity(mddev->gendisk, mddev->array_sectors); set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk); revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && if (sectors > mddev->dev_sectors &&
...@@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev) ...@@ -3534,6 +3754,758 @@ static void *raid10_takeover(struct mddev *mddev)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
/*
 * Validate a pending reshape request for @mddev and pre-allocate the
 * enlarged 'mirrors' array when the number of devices is to grow.
 *
 * Returns 0 if the request is acceptable, -EINVAL if it is not, or
 * -ENOMEM if the new 'mirrors' array could not be allocated.
 */
static int raid10_check_reshape(struct mddev *mddev)
{
	/* Called when there is a request to change
	 * - layout (to ->new_layout)
	 * - chunk size (to ->new_chunk_sectors)
	 * - raid_disks (by delta_disks)
	 * or when trying to restart a reshape that was ongoing.
	 *
	 * We need to validate the request and possibly allocate
	 * space if that might be an issue later.
	 *
	 * Currently we reject any reshape of a 'far' mode array,
	 * allow chunk size to change if new is generally acceptable,
	 * allow raid_disks to increase, and allow
	 * a switch between 'near' mode and 'offset' mode.
	 */
	struct r10conf *conf = mddev->private;
	struct geom geo;

	/* Cannot reshape an array that is currently in 'far' mode */
	if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
		return -EINVAL;

	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* mustn't change number of copies */
		return -EINVAL;
	if (geo.far_copies > 1 && !geo.far_offset)
		/* Cannot switch to 'far' mode */
		return -EINVAL;

	if (mddev->array_sectors & geo.chunk_mask)
		/* not factor of array size */
		return -EINVAL;

	if (!enough(conf, -1))
		return -EINVAL;

	/* Drop any list left over from an earlier check */
	kfree(conf->mirrors_new);
	conf->mirrors_new = NULL;
	if (mddev->delta_disks > 0) {
		/* allocate new 'mirrors' list; kcalloc() checks the
		 * count * size multiplication for overflow, which an
		 * open-coded product passed to kzalloc() does not.
		 */
		conf->mirrors_new = kcalloc(mddev->raid_disks +
					    mddev->delta_disks,
					    sizeof(struct mirror_info),
					    GFP_KERNEL);
		if (!conf->mirrors_new)
			return -ENOMEM;
	}
	return 0;
}
/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 *
 * calc_degraded() counts the degraded slots of the 'prev' geometry and,
 * when the device count differs, of the 'geo' geometry as well, and
 * returns the larger of the two counts.
 */
static int calc_degraded(struct r10conf *conf)
{
	int before = 0;
	int after = 0;
	int disk;

	/* 'prev' section first */
	rcu_read_lock();
	for (disk = 0; disk < conf->prev.raid_disks; disk++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);

		/* A missing or Faulty device is degraded.  A device that
		 * is not In_sync is counted too: when we can reduce the
		 * number of devices in an array, this might not contribute
		 * to 'degraded'.  It does now.
		 */
		if (!rdev ||
		    test_bit(Faulty, &rdev->flags) ||
		    !test_bit(In_sync, &rdev->flags))
			before++;
	}
	rcu_read_unlock();

	/* Same device count before and after: one pass is enough */
	if (conf->geo.raid_disks == conf->prev.raid_disks)
		return before;

	rcu_read_lock();
	for (disk = 0; disk < conf->geo.raid_disks; disk++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);

		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			after++;
			continue;
		}
		if (test_bit(In_sync, &rdev->flags))
			continue;
		/* If reshape is increasing the number of devices,
		 * this section has already been recovered, so
		 * it doesn't contribute to degraded.
		 * else it does.
		 */
		if (conf->geo.raid_disks <= conf->prev.raid_disks)
			after++;
	}
	rcu_read_unlock();

	return after > before ? after : before;
}
static int raid10_start_reshape(struct mddev *mddev)
{
/* A 'reshape' has been requested. This commits
* the various 'new' fields and sets MD_RECOVERY_RESHAPE
* This also checks if there are enough spares and adds them
* to the array.
* We currently require enough spares to make the final
* array non-degraded. We also require that the difference
* between old and new data_offset - on each device - is
* enough that we never risk over-writing.
*/
unsigned long before_length, after_length;
sector_t min_offset_diff = 0;
int first = 1;
struct geom new;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev;
int spares = 0;
int ret;
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (setup_geo(&new, mddev, geo_start) != conf->copies)
return -EINVAL;
before_length = ((1 << conf->prev.chunk_shift) *
conf->prev.far_copies);
after_length = ((1 << conf->geo.chunk_shift) *
conf->geo.far_copies);
rdev_for_each(rdev, mddev) {
if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk >= 0) {
long long diff = (rdev->new_data_offset
- rdev->data_offset);
if (!mddev->reshape_backwards)
diff = -diff;
if (diff < 0)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
}
}
if (max(before_length, after_length) > min_offset_diff)
return -EINVAL;
if (spares < mddev->delta_disks)
return -EINVAL;
conf->offset_diff = min_offset_diff;
spin_lock_irq(&conf->device_lock);
if (conf->mirrors_new) {
memcpy(conf->mirrors_new, conf->mirrors,
sizeof(struct mirror_info)*conf->prev.raid_disks);
smp_mb();
kfree(conf->mirrors_old); /* FIXME and elsewhere */
conf->mirrors_old = conf->mirrors;
conf->mirrors = conf->mirrors_new;
conf->mirrors_new = NULL;
}
setup_geo(&conf->geo, mddev, geo_start);
smp_mb();
if (mddev->reshape_backwards) {
sector_t size = raid10_size(mddev, 0, 0);
if (size < mddev->array_sectors) {
spin_unlock_irq(&conf->device_lock);
printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
mdname(mddev));
return -EINVAL;
}
mddev->resync_max_sectors = size;
conf->reshape_progress = size;
} else
conf->reshape_progress = 0;
spin_unlock_irq(&conf->device_lock);
if (mddev->delta_disks && mddev->bitmap) {
ret = bitmap_resize(mddev->bitmap,
raid10_size(mddev, 0,
conf->geo.raid_disks),
0, 0);
if (ret)
goto abort;
}
if (mddev->delta_disks > 0) {
rdev_for_each(rdev, mddev)
if (rdev->raid_disk < 0 &&
!test_bit(Faulty, &rdev->flags)) {
if (raid10_add_disk(mddev, rdev) == 0) {
if (rdev->raid_disk >=
conf->prev.raid_disks)
set_bit(In_sync, &rdev->flags);
else
rdev->recovery_offset = 0;
if (sysfs_link_rdev(mddev, rdev))
/* Failure here is OK */;
}
} else if (rdev->raid_disk >= conf->prev.raid_disks
&& !test_bit(Faulty, &rdev->flags)) {
/* This is a spare that was manually added */
set_bit(In_sync, &rdev->flags);
}
}
/* When a reshape changes the number of devices,
* ->degraded is measured against the larger of the
* pre and post numbers.
*/
spin_lock_irq(&conf->device_lock);
mddev->degraded = calc_degraded(conf);
spin_unlock_irq(&conf->device_lock);
mddev->raid_disks = conf->geo.raid_disks;
mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
mddev->sync_thread = md_register_thread(md_do_sync, mddev,
"reshape");
if (!mddev->sync_thread) {
ret = -EAGAIN;
goto abort;
}
conf->reshape_checkpoint = jiffies;
md_wakeup_thread(mddev->sync_thread);
md_new_event(mddev);
return 0;
abort:
mddev->recovery = 0;
spin_lock_irq(&conf->device_lock);
conf->geo = conf->prev;
mddev->raid_disks = conf->geo.raid_disks;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
smp_wmb();
conf->reshape_progress = MaxSector;
mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock);
return ret;
}
/* For the chunk that contains array-address 's', work out the last
 * device-address that could hold any block of that chunk under
 * geometry 'geo', and return the address following it.
 * The value returned is chunk-aligned and lies after any data
 * belonging to the chunk containing 's'.
 */
static sector_t last_dev_address(sector_t s, struct geom *geo)
{
	sector_t chunks;

	/* Step to the start of the next chunk, then convert to a chunk count */
	chunks = ((s | geo->chunk_mask) + 1) >> geo->chunk_shift;
	chunks *= geo->near_copies;
	/* Spread over the devices, rounding up */
	chunks = DIV_ROUND_UP_SECTOR_T(chunks, geo->raid_disks);
	chunks *= geo->far_copies;
	/* Back from chunks to sectors */
	return chunks << geo->chunk_shift;
}
/* For the chunk that contains array-address 's', work out the first
 * device-address that could hold any block of that chunk under
 * geometry 'geo'.  The result is also the start of a chunk.
 */
static sector_t first_dev_address(sector_t s, struct geom *geo)
{
	sector_t chunks = s >> geo->chunk_shift;

	chunks *= geo->near_copies;
	/* Return value is ignored; 'chunks' itself is used afterwards */
	sector_div(chunks, geo->raid_disks);
	chunks *= geo->far_copies;
	/* Back from chunks to sectors */
	return chunks << geo->chunk_shift;
}
/* Issue the reads for the next piece of a reshape.
 *
 * Returns the number of sectors handled by this call.  When called with
 * sector_nr == 0 on a reshape that is already part-way through, it sets
 * *skipped to 1 and returns the number of sectors to skip instead.
 * If no device can be read, MD_RECOVERY_INTR is set and the number of
 * sectors scheduled so far is returned.
 */
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped)
{
	/* We simply copy at most one chunk (smallest of old and new)
	 * at a time, possibly less if that exceeds RESYNC_PAGES,
	 * or we hit a bad block or something.
	 * This might mean we pause for normal IO in the middle of
	 * a chunk, but that is not a problem as mddev->reshape_position
	 * can record any location.
	 *
	 * If we will want to write to a location that isn't
	 * yet recorded as 'safe' (i.e. in metadata on disk) then
	 * we need to flush all reshape requests and update the metadata.
	 *
	 * When reshaping forwards (e.g. to more devices), we interpret
	 * 'safe' as the earliest block which might not have been copied
	 * down yet.  We divide this by previous stripe size and multiply
	 * by previous stripe length to get lowest device offset that we
	 * cannot write to yet.
	 * We interpret 'sector_nr' as an address that we want to write to.
	 * From this we use last_dev_address() to find where we might
	 * write to, and first_dev_address() on the 'safe' position.
	 * If this 'next' write position is after the 'safe' position,
	 * we must update the metadata to increase the 'safe' position.
	 *
	 * When reshaping backwards, we round in the opposite direction
	 * and perform the reverse test:  next write position must not be
	 * less than current safe position.
	 *
	 * In all this the minimum difference in data offsets
	 * (conf->offset_diff - always positive) allows a bit of slack,
	 * so next can be after 'safe', but not by more than offset_diff
	 *
	 * We need to prepare all the bios here before we start any IO
	 * to ensure the size we choose is acceptable to all devices.
	 * This means one for each copy for write-out and an extra one for
	 * read-in.
	 * We store the read-in bio in ->master_bio and the others in
	 * ->devs[x].bio and ->devs[x].repl_bio.
	 */
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	sector_t next, safe, last;
	int max_sectors;
	int nr_sectors;
	int s;
	struct md_rdev *rdev;
	int need_flush = 0;
	struct bio *blist;
	struct bio *bio, *read_bio;
	int sectors_done = 0;

	if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
		if (mddev->reshape_backwards &&
		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
			sector_nr = (raid10_size(mddev, 0, 0)
				     - conf->reshape_progress);
		} else if (!mddev->reshape_backwards &&
			   conf->reshape_progress > 0)
			sector_nr = conf->reshape_progress;
		if (sector_nr) {
			mddev->curr_resync_completed = sector_nr;
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
			*skipped = 1;
			return sector_nr;
		}
	}

	/* We don't use sector_nr to track where we are up to
	 * as that doesn't work well for ->reshape_backwards.
	 * So just use ->reshape_progress.
	 */
	if (mddev->reshape_backwards) {
		/* 'next' is the earliest device address that we might
		 * write to for this chunk in the new layout
		 */
		next = first_dev_address(conf->reshape_progress - 1,
					 &conf->geo);

		/* 'safe' is the last device address that we might read from
		 * in the old layout after a restart
		 */
		safe = last_dev_address(conf->reshape_safe - 1,
					&conf->prev);

		if (next + conf->offset_diff < safe)
			need_flush = 1;

		last = conf->reshape_progress - 1;
		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
					       & conf->prev.chunk_mask);
		if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
			sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
	} else {
		/* 'next' is after the last device address that we
		 * might write to for this chunk in the new layout
		 */
		next = last_dev_address(conf->reshape_progress, &conf->geo);

		/* 'safe' is the earliest device address that we might
		 * read from in the old layout after a restart
		 */
		safe = first_dev_address(conf->reshape_safe, &conf->prev);

		/* Need to update metadata if 'next' might be beyond 'safe'
		 * as that would possibly corrupt data
		 */
		if (next > safe + conf->offset_diff)
			need_flush = 1;

		sector_nr = conf->reshape_progress;
		last  = sector_nr | (conf->geo.chunk_mask
				     & conf->prev.chunk_mask);

		if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
			last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
	}

	if (need_flush ||
	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in metadata */
		wait_barrier(conf);
		mddev->reshape_position = conf->reshape_progress;
		if (mddev->reshape_backwards)
			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
				- conf->reshape_progress;
		else
			mddev->curr_resync_completed = conf->reshape_progress;
		conf->reshape_checkpoint = jiffies;
		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait, mddev->flags == 0 ||
			   kthread_should_stop());
		conf->reshape_safe = mddev->reshape_position;
		allow_barrier(conf);
	}

read_more:
	/* Now schedule reads for blocks from sector_nr to last */
	r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
	raise_barrier(conf, sectors_done != 0);
	atomic_set(&r10_bio->remaining, 0);
	r10_bio->mddev = mddev;
	r10_bio->sector = sector_nr;
	set_bit(R10BIO_IsReshape, &r10_bio->state);
	r10_bio->sectors = last - sector_nr + 1;
	rdev = read_balance(conf, r10_bio, &max_sectors);
	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));

	if (!rdev) {
		/* Cannot read from here, so need to record bad blocks
		 * on all the target devices.
		 */
		// FIXME
		/* NOTE(review): this return does not release r10_bio or
		 * undo the raise_barrier() above; nothing visible in this
		 * function does so - confirm whether that is intended.
		 */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return sectors_done;
	}

	read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);

	read_bio->bi_bdev = rdev->bdev;
	read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
			       + rdev->data_offset);
	read_bio->bi_private = r10_bio;
	read_bio->bi_end_io = end_sync_read;
	read_bio->bi_rw = READ;
	read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
	read_bio->bi_flags |= 1 << BIO_UPTODATE;
	read_bio->bi_vcnt = 0;
	read_bio->bi_idx = 0;
	read_bio->bi_size = 0;
	r10_bio->master_bio = read_bio;
	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

	/* Now find the locations in the new layout */
	__raid10_find_phys(&conf->geo, r10_bio);

	/* Chain the read bio and every usable write bio onto 'blist'
	 * so that pages can be added to all of them in lock-step.
	 */
	blist = read_bio;
	read_bio->bi_next = NULL;

	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev2;
		if (s&1) {
			rdev2 = conf->mirrors[d].replacement;
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev2 = conf->mirrors[d].rdev;
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
			continue;
		b->bi_bdev = rdev2->bdev;
		b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
		b->bi_private = r10_bio;
		b->bi_end_io = end_reshape_write;
		b->bi_rw = WRITE;
		b->bi_flags &= ~(BIO_POOL_MASK - 1);
		b->bi_flags |= 1 << BIO_UPTODATE;
		b->bi_next = blist;
		b->bi_vcnt = 0;
		b->bi_idx = 0;
		b->bi_size = 0;
		blist = b;
	}

	/* Now add as many pages as possible to all of these bios. */

	nr_sectors = 0;
	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
		struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
		int len = (max_sectors - s) << 9;
		if (len > PAGE_SIZE)
			len = PAGE_SIZE;
		for (bio = blist; bio ; bio = bio->bi_next) {
			struct bio *bio2;
			if (bio_add_page(bio, page, len, 0))
				continue;

			/* Didn't fit, must stop */
			for (bio2 = blist;
			     bio2 && bio2 != bio;
			     bio2 = bio2->bi_next) {
				/* Remove last page from this bio */
				bio2->bi_vcnt--;
				bio2->bi_size -= len;
				bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
			}
			goto bio_full;
		}
		/* This page went into every bio: account for it */
		sector_nr += len >> 9;
		nr_sectors += len >> 9;
	}
bio_full:
	r10_bio->sectors = nr_sectors;

	/* Now submit the read */
	md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
	atomic_inc(&r10_bio->remaining);
	read_bio->bi_next = NULL;
	generic_make_request(read_bio);
	/* sector_nr was already advanced page by page in the loop above;
	 * adding nr_sectors again here would skip part of the range on
	 * the next pass through read_more.
	 */
	sectors_done += nr_sectors;
	if (sector_nr <= last)
		goto read_more;

	/* Now that we have done the whole section we can
	 * update reshape_progress
	 */
	if (mddev->reshape_backwards)
		conf->reshape_progress -= sectors_done;
	else
		conf->reshape_progress += sectors_done;

	return sectors_done;
}
static void end_reshape_request(struct r10bio *r10_bio);
static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10_bio);
/* Called once the reshape read for 'r10_bio' has completed: submit the
 * writes to every usable target device (and replacement) in the new
 * layout.  If the read failed, synchronous one-page reads from elsewhere
 * are tried first; if those fail too the request is reported as
 * unsuccessful via md_done_sync() and nothing is written.
 */
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	struct r10conf *conf = mddev->private;
	int copy;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state) &&
	    handle_reshape_read_error(mddev, r10_bio) < 0) {
		/* Reshape has been aborted */
		md_done_sync(mddev, r10_bio->sectors, 0);
		return;
	}

	/* The data is in the pages now.  Start 'remaining' at one so the
	 * final end_reshape_request() below drops our own reference.
	 */
	atomic_set(&r10_bio->remaining, 1);

	for (copy = 0; copy < conf->copies; copy++) {
		int disk = r10_bio->devs[copy].devnum;
		int use_repl;

		/* main device first, then its replacement */
		for (use_repl = 0; use_repl < 2; use_repl++) {
			struct md_rdev *target;
			struct bio *wbio;

			if (use_repl) {
				target = conf->mirrors[disk].replacement;
				wbio = r10_bio->devs[copy].repl_bio;
			} else {
				target = conf->mirrors[disk].rdev;
				wbio = r10_bio->devs[copy].bio;
			}
			if (!target || test_bit(Faulty, &target->flags))
				continue;

			atomic_inc(&target->nr_pending);
			md_sync_acct(wbio->bi_bdev, r10_bio->sectors);
			atomic_inc(&r10_bio->remaining);
			wbio->bi_next = NULL;
			generic_make_request(wbio);
		}
	}

	end_reshape_request(r10_bio);
}
static void end_reshape(struct r10conf *conf)
{
if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
return;
spin_lock_irq(&conf->device_lock);
conf->prev = conf->geo;
md_finish_reshape(conf->mddev);
smp_wmb();
conf->reshape_progress = MaxSector;
spin_unlock_irq(&conf->device_lock);
/* read-ahead size must cover two whole stripes, which is
* 2 * (datadisks) * chunksize where 'n' is the number of raid devices
*/
if (conf->mddev->queue) {
int stripe = conf->geo.raid_disks *
((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
stripe /= conf->geo.near_copies;
if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
}
conf->fullsync = 0;
}
static int handle_reshape_read_error(struct mddev *mddev,
struct r10bio *r10_bio)
{
/* Use sync reads to get the blocks from somewhere else */
int sectors = r10_bio->sectors;
struct r10bio r10b;
struct r10conf *conf = mddev->private;
int slot = 0;
int idx = 0;
struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
r10b.sector = r10_bio->sector;
__raid10_find_phys(&conf->prev, &r10b);
while (sectors) {
int s = sectors;
int success = 0;
int first_slot = slot;
if (s > (PAGE_SIZE >> 9))
s = PAGE_SIZE >> 9;
while (!success) {
int d = r10b.devs[slot].devnum;
struct md_rdev *rdev = conf->mirrors[d].rdev;
sector_t addr;
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
goto failed;
addr = r10b.devs[slot].addr + idx * PAGE_SIZE;
success = sync_page_io(rdev,
addr,
s << 9,
bvec[idx].bv_page,
READ, false);
if (success)
break;
failed:
slot++;
if (slot >= conf->copies)
slot = 0;
if (slot == first_slot)
break;
}
if (!success) {
/* couldn't read this block, must give up */
set_bit(MD_RECOVERY_INTR,
&mddev->recovery);
return -EIO;
}
sectors -= s;
idx++;
}
return 0;
}
/* Completion handler for one reshape write bio.  Reports a failed write
 * through md_error(), drops the pending count taken on the target device
 * and releases this bio's share of the owning r10_bio.
 */
static void end_reshape_write(struct bio *bio, int error)
{
	struct r10bio *r10_bio = bio->bi_private;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int write_ok = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct md_rdev *rdev;
	int slot;
	int repl;
	int d;

	/* Which device was this bio aimed at? */
	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	rdev = repl ? conf->mirrors[d].replacement : NULL;
	if (!rdev) {
		/* not a replacement write, or no replacement is present:
		 * use the main device
		 */
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (!write_ok) {
		/* FIXME should record badblock */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}
/* Drop one reference on 'r10_bio'.  When the last one goes, report the
 * sectors as done and release the read bio and the r10_bio buffer.
 */
static void end_reshape_request(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
		bio_put(r10_bio->master_bio);
		put_buf(r10_bio);
	}
}
/* Commit the outcome of a completed reshape to 'mddev'.  When devices
 * were added the array size is updated; otherwise the slots beyond the
 * new device count lose their In_sync flag.  Does nothing if the reshape
 * was interrupted (MD_RECOVERY_INTR set).
 */
static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks <= 0) {
		/* Slots from the new device count up to the old one are
		 * no longer part of the array.
		 */
		int d = conf->geo.raid_disks;
		int end = conf->geo.raid_disks - mddev->delta_disks;

		while (d < end) {
			struct md_rdev *rdev = conf->mirrors[d].rdev;

			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = conf->mirrors[d].replacement;
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			d++;
		}
	} else {
		/* Devices were added: publish the new array size */
		sector_t size = raid10_size(mddev, 0, 0);

		md_set_array_sectors(mddev, size);
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = size;
		set_capacity(mddev->gendisk, mddev->array_sectors);
		revalidate_disk(mddev->gendisk);
	}

	/* The 'new' settings become current; no reshape is pending */
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}
static struct md_personality raid10_personality = static struct md_personality raid10_personality =
{ {
.name = "raid10", .name = "raid10",
...@@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality = ...@@ -3552,6 +4524,9 @@ static struct md_personality raid10_personality =
.size = raid10_size, .size = raid10_size,
.resize = raid10_resize, .resize = raid10_resize,
.takeover = raid10_takeover, .takeover = raid10_takeover,
.check_reshape = raid10_check_reshape,
.start_reshape = raid10_start_reshape,
.finish_reshape = raid10_finish_reshape,
}; };
static int __init raid_init(void) static int __init raid_init(void)
......
...@@ -14,32 +14,38 @@ struct mirror_info { ...@@ -14,32 +14,38 @@ struct mirror_info {
struct r10conf { struct r10conf {
struct mddev *mddev; struct mddev *mddev;
struct mirror_info *mirrors; struct mirror_info *mirrors;
int raid_disks; struct mirror_info *mirrors_new, *mirrors_old;
spinlock_t device_lock; spinlock_t device_lock;
/* geometry */ /* geometry */
int near_copies; /* number of copies laid out struct geom {
int raid_disks;
int near_copies; /* number of copies laid out
* raid0 style */ * raid0 style */
int far_copies; /* number of copies laid out int far_copies; /* number of copies laid out
* at large strides across drives * at large strides across drives
*/ */
int far_offset; /* far_copies are offset by 1 int far_offset; /* far_copies are offset by 1
* stripe instead of many * stripe instead of many
*/ */
int copies; /* near_copies * far_copies. sector_t stride; /* distance between far copies.
* must be <= raid_disks
*/
sector_t stride; /* distance between far copies.
* This is size / far_copies unless * This is size / far_copies unless
* far_offset, in which case it is * far_offset, in which case it is
* 1 stripe. * 1 stripe.
*/ */
int chunk_shift; /* shift from chunks to sectors */
sector_t chunk_mask;
} prev, geo;
int copies; /* near_copies * far_copies.
* must be <= raid_disks
*/
sector_t dev_sectors; /* temp copy of sector_t dev_sectors; /* temp copy of
* mddev->dev_sectors */ * mddev->dev_sectors */
sector_t reshape_progress;
int chunk_shift; /* shift from chunks to sectors */ sector_t reshape_safe;
sector_t chunk_mask; unsigned long reshape_checkpoint;
sector_t offset_diff;
struct list_head retry_list; struct list_head retry_list;
/* queue pending writes and submit them on unplug */ /* queue pending writes and submit them on unplug */
...@@ -136,6 +142,7 @@ enum r10bio_state { ...@@ -136,6 +142,7 @@ enum r10bio_state {
R10BIO_Uptodate, R10BIO_Uptodate,
R10BIO_IsSync, R10BIO_IsSync,
R10BIO_IsRecover, R10BIO_IsRecover,
R10BIO_IsReshape,
R10BIO_Degraded, R10BIO_Degraded,
/* Set ReadError on bios that experience a read error /* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them. * so that raid10d knows what to do with them.
...@@ -146,5 +153,10 @@ enum r10bio_state { ...@@ -146,5 +153,10 @@ enum r10bio_state {
*/ */
R10BIO_MadeGood, R10BIO_MadeGood,
R10BIO_WriteError, R10BIO_WriteError,
/* During a reshape we might be performing IO on the
* 'previous' part of the array, in which case this
* flag is set
*/
R10BIO_Previous,
}; };
#endif #endif
...@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector, ...@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
return sh; return sh;
} }
/* Determine if 'data_offset' or 'new_data_offset' should be used
* in this stripe_head.
*/
static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
{
sector_t progress = conf->reshape_progress;
/* Need a memory barrier to make sure we see the value
* of conf->generation, or ->data_offset that was set before
* reshape_progress was updated.
*/
smp_rmb();
if (progress == MaxSector)
return 0;
if (sh->generation == conf->generation - 1)
return 0;
/* We are in a reshape, and this is a new-generation stripe,
* so use new_data_offset.
*/
return 1;
}
static void static void
raid5_end_read_request(struct bio *bi, int error); raid5_end_read_request(struct bio *bi, int error);
static void static void
...@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
replace_only = 1; replace_only = 1;
} else } else
continue; continue;
if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
rw |= REQ_SYNC;
bi = &sh->dev[i].req; bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */ rbi = &sh->dev[i].rreq; /* For writing to replacement */
...@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
__func__, (unsigned long long)sh->sector, __func__, (unsigned long long)sh->sector,
bi->bi_rw, i); bi->bi_rw, i);
atomic_inc(&sh->count); atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset; if (use_new_offset(conf, sh))
bi->bi_sector = (sh->sector
+ rdev->new_data_offset);
else
bi->bi_sector = (sh->sector
+ rdev->data_offset);
bi->bi_flags = 1 << BIO_UPTODATE; bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_idx = 0; bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
...@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
__func__, (unsigned long long)sh->sector, __func__, (unsigned long long)sh->sector,
rbi->bi_rw, i); rbi->bi_rw, i);
atomic_inc(&sh->count); atomic_inc(&sh->count);
rbi->bi_sector = sh->sector + rrdev->data_offset; if (use_new_offset(conf, sh))
rbi->bi_sector = (sh->sector
+ rrdev->new_data_offset);
else
rbi->bi_sector = (sh->sector
+ rrdev->data_offset);
rbi->bi_flags = 1 << BIO_UPTODATE; rbi->bi_flags = 1 << BIO_UPTODATE;
rbi->bi_idx = 0; rbi->bi_idx = 0;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
...@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) ...@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
dev->sector + STRIPE_SECTORS) { dev->sector + STRIPE_SECTORS) {
if (wbi->bi_rw & REQ_FUA) if (wbi->bi_rw & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags); set_bit(R5_WantFUA, &dev->flags);
if (wbi->bi_rw & REQ_SYNC)
set_bit(R5_SyncIO, &dev->flags);
tx = async_copy_data(1, wbi, dev->page, tx = async_copy_data(1, wbi, dev->page,
dev->sector, tx); dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector); wbi = r5_next_bio(wbi, dev->sector);
...@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) ...@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
int pd_idx = sh->pd_idx; int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx; int qd_idx = sh->qd_idx;
int i; int i;
bool fua = false; bool fua = false, sync = false;
pr_debug("%s: stripe %llu\n", __func__, pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
for (i = disks; i--; ) for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
}
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
...@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref) ...@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
set_bit(R5_UPTODATE, &dev->flags); set_bit(R5_UPTODATE, &dev->flags);
if (fua) if (fua)
set_bit(R5_WantFUA, &dev->flags); set_bit(R5_WantFUA, &dev->flags);
if (sync)
set_bit(R5_SyncIO, &dev->flags);
} }
} }
...@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL; struct md_rdev *rdev = NULL;
sector_t s;
for (i=0 ; i<disks; i++) for (i=0 ; i<disks; i++)
if (bi == &sh->dev[i].req) if (bi == &sh->dev[i].req)
...@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (!rdev) if (!rdev)
rdev = conf->disks[i].rdev; rdev = conf->disks[i].rdev;
if (use_new_offset(conf, sh))
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
if (uptodate) { if (uptodate) {
set_bit(R5_UPTODATE, &sh->dev[i].flags); set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) { if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
...@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error corrected" "md/raid:%s: read error corrected"
" (%lu sectors at %llu on %s)\n", " (%lu sectors at %llu on %s)\n",
mdname(conf->mddev), STRIPE_SECTORS, mdname(conf->mddev), STRIPE_SECTORS,
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReadError, &sh->dev[i].flags);
...@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error on replacement device " "md/raid:%s: read error on replacement device "
"(sector %llu on %s).\n", "(sector %llu on %s).\n",
mdname(conf->mddev), mdname(conf->mddev),
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdn); bdn);
else if (conf->mddev->degraded >= conf->max_degraded) else if (conf->mddev->degraded >= conf->max_degraded)
printk_ratelimited( printk_ratelimited(
...@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error not correctable " "md/raid:%s: read error not correctable "
"(sector %llu on %s).\n", "(sector %llu on %s).\n",
mdname(conf->mddev), mdname(conf->mddev),
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdn); bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */ /* Oh, no!!! */
...@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error) ...@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error NOT corrected!! " "md/raid:%s: read error NOT corrected!! "
"(sector %llu on %s).\n", "(sector %llu on %s).\n",
mdname(conf->mddev), mdname(conf->mddev),
(unsigned long long)(sh->sector (unsigned long long)s,
+ rdev->data_offset),
bdn); bdn);
else if (atomic_read(&rdev->read_errors) else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes) > conf->max_nr_stripes)
...@@ -3561,7 +3600,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3561,7 +3600,7 @@ static void handle_stripe(struct stripe_head *sh)
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev; rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector, rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS); STRIPE_SECTORS, 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
...@@ -3570,7 +3609,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -3570,7 +3609,7 @@ static void handle_stripe(struct stripe_head *sh)
/* rdev have been moved down */ /* rdev have been moved down */
rdev = conf->disks[i].rdev; rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector, rdev_clear_badblocks(rdev, sh->sector,
STRIPE_SECTORS); STRIPE_SECTORS, 0);
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
} }
} }
...@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) ...@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
raid_bio->bi_next = (void*)rdev; raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev; align_bi->bi_bdev = rdev->bdev;
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
/* No reshape active, so we can trust rdev->data_offset */
align_bi->bi_sector += rdev->data_offset; align_bi->bi_sector += rdev->data_offset;
if (!bio_fits_rdev(align_bi) || if (!bio_fits_rdev(align_bi) ||
...@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
plugged = mddev_check_plugged(mddev); plugged = mddev_check_plugged(mddev);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w); DEFINE_WAIT(w);
int disks, data_disks;
int previous; int previous;
retry: retry:
previous = 0; previous = 0;
disks = conf->raid_disks;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
if (unlikely(conf->reshape_progress != MaxSector)) { if (unlikely(conf->reshape_progress != MaxSector)) {
/* spinlock is needed as reshape_progress may be /* spinlock is needed as reshape_progress may be
...@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* to check again. * to check again.
*/ */
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (mddev->delta_disks < 0 if (mddev->reshape_backwards
? logical_sector < conf->reshape_progress ? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) { : logical_sector >= conf->reshape_progress) {
disks = conf->previous_raid_disks;
previous = 1; previous = 1;
} else { } else {
if (mddev->delta_disks < 0 if (mddev->reshape_backwards
? logical_sector < conf->reshape_safe ? logical_sector < conf->reshape_safe
: logical_sector >= conf->reshape_safe) { : logical_sector >= conf->reshape_safe) {
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
...@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
} }
data_disks = disks - conf->max_degraded;
new_sector = raid5_compute_sector(conf, logical_sector, new_sector = raid5_compute_sector(conf, logical_sector,
previous, previous,
...@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
*/ */
int must_retry = 0; int must_retry = 0;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (mddev->delta_disks < 0 if (mddev->reshape_backwards
? logical_sector >= conf->reshape_progress ? logical_sector >= conf->reshape_progress
: logical_sector < conf->reshape_progress) : logical_sector < conf->reshape_progress)
/* mismatch, need to try again */ /* mismatch, need to try again */
...@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (sector_nr == 0) { if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */ /* If restarting in the middle, skip the initial sectors */
if (mddev->delta_disks < 0 && if (mddev->reshape_backwards &&
conf->reshape_progress < raid5_size(mddev, 0, 0)) { conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0) sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress; - conf->reshape_progress;
} else if (mddev->delta_disks >= 0 && } else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0) conf->reshape_progress > 0)
sector_nr = conf->reshape_progress; sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks); sector_div(sector_nr, new_data_disks);
...@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
else else
reshape_sectors = mddev->chunk_sectors; reshape_sectors = mddev->chunk_sectors;
/* we update the metadata when there is more than 3Meg /* We update the metadata at least every 10 seconds, or when
* in the block range (that is rather arbitrary, should * the data about to be copied would over-write the source of
* probably be time based) or when the data about to be * the data at the front of the range. i.e. one new_stripe
* copied would over-write the source of the data at * along from reshape_progress new_maps to after where
* the front of the range. * reshape_safe old_maps to
* i.e. one new_stripe along from reshape_progress new_maps
* to after where reshape_safe old_maps to
*/ */
writepos = conf->reshape_progress; writepos = conf->reshape_progress;
sector_div(writepos, new_data_disks); sector_div(writepos, new_data_disks);
...@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_div(readpos, data_disks); sector_div(readpos, data_disks);
safepos = conf->reshape_safe; safepos = conf->reshape_safe;
sector_div(safepos, data_disks); sector_div(safepos, data_disks);
if (mddev->delta_disks < 0) { if (mddev->reshape_backwards) {
writepos -= min_t(sector_t, reshape_sectors, writepos); writepos -= min_t(sector_t, reshape_sectors, writepos);
readpos += reshape_sectors; readpos += reshape_sectors;
safepos += reshape_sectors; safepos += reshape_sectors;
...@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos -= min_t(sector_t, reshape_sectors, safepos); safepos -= min_t(sector_t, reshape_sectors, safepos);
} }
/* Having calculated the 'writepos' possibly use it
* to set 'stripe_addr' which is where we will write to.
*/
if (mddev->reshape_backwards) {
BUG_ON(conf->reshape_progress == 0);
stripe_addr = writepos;
BUG_ON((mddev->dev_sectors &
~((sector_t)reshape_sectors - 1))
- reshape_sectors - stripe_addr
!= sector_nr);
} else {
BUG_ON(writepos != sector_nr + reshape_sectors);
stripe_addr = sector_nr;
}
/* 'writepos' is the most advanced device address we might write. /* 'writepos' is the most advanced device address we might write.
* 'readpos' is the least advanced device address we might read. * 'readpos' is the least advanced device address we might read.
* 'safepos' is the least address recorded in the metadata as having * 'safepos' is the least address recorded in the metadata as having
* been reshaped. * been reshaped.
* If 'readpos' is behind 'writepos', then there is no way that we can * If there is a min_offset_diff, these are adjusted either by
* increasing the safepos/readpos if diff is negative, or
* increasing writepos if diff is positive.
* If 'readpos' is then behind 'writepos', there is no way that we can
* ensure safety in the face of a crash - that must be done by userspace * ensure safety in the face of a crash - that must be done by userspace
* making a backup of the data. So in that case there is no particular * making a backup of the data. So in that case there is no particular
* rush to update metadata. * rush to update metadata.
...@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* Maybe that number should be configurable, but I'm not sure it is * Maybe that number should be configurable, but I'm not sure it is
* worth it.... maybe it could be a multiple of safemode_delay??? * worth it.... maybe it could be a multiple of safemode_delay???
*/ */
if ((mddev->delta_disks < 0 if (conf->min_offset_diff < 0) {
safepos += -conf->min_offset_diff;
readpos += -conf->min_offset_diff;
} else
writepos += conf->min_offset_diff;
if ((mddev->reshape_backwards
? (safepos > writepos && readpos < writepos) ? (safepos > writepos && readpos < writepos)
: (safepos < writepos && readpos > writepos)) || : (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
...@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sysfs_notify(&mddev->kobj, NULL, "sync_completed"); sysfs_notify(&mddev->kobj, NULL, "sync_completed");
} }
if (mddev->delta_disks < 0) {
BUG_ON(conf->reshape_progress == 0);
stripe_addr = writepos;
BUG_ON((mddev->dev_sectors &
~((sector_t)reshape_sectors - 1))
- reshape_sectors - stripe_addr
!= sector_nr);
} else {
BUG_ON(writepos != sector_nr + reshape_sectors);
stripe_addr = sector_nr;
}
INIT_LIST_HEAD(&stripes); INIT_LIST_HEAD(&stripes);
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j; int j;
...@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
list_add(&sh->lru, &stripes); list_add(&sh->lru, &stripes);
} }
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (mddev->delta_disks < 0) if (mddev->reshape_backwards)
conf->reshape_progress -= reshape_sectors * new_data_disks; conf->reshape_progress -= reshape_sectors * new_data_disks;
else else
conf->reshape_progress += reshape_sectors * new_data_disks; conf->reshape_progress += reshape_sectors * new_data_disks;
...@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev) ...@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev)
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t reshape_offset = 0; sector_t reshape_offset = 0;
int i; int i;
long long min_offset_diff = 0;
int first = 1;
if (mddev->recovery_cp != MaxSector) if (mddev->recovery_cp != MaxSector)
printk(KERN_NOTICE "md/raid:%s: not clean" printk(KERN_NOTICE "md/raid:%s: not clean"
" -- starting background reconstruction\n", " -- starting background reconstruction\n",
mdname(mddev)); mdname(mddev));
rdev_for_each(rdev, mddev) {
long long diff;
if (rdev->raid_disk < 0)
continue;
diff = (rdev->new_data_offset - rdev->data_offset);
if (first) {
min_offset_diff = diff;
first = 0;
} else if (mddev->reshape_backwards &&
diff < min_offset_diff)
min_offset_diff = diff;
else if (!mddev->reshape_backwards &&
diff > min_offset_diff)
min_offset_diff = diff;
}
if (mddev->reshape_position != MaxSector) { if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape. /* Check that we can continue the reshape.
* Currently only disks can change, it must * Difficulties arise if the stripe we would write to
* increase, and we must be past the point where * next is at or after the stripe we would read from next.
* a stripe over-writes itself * For a reshape that changes the number of devices, this
* is only possible for a very short time, and mdadm makes
* sure that time appears to have passed before assembling
* the array. So we fail if that time hasn't passed.
* For a reshape that keeps the number of devices the same
* mdadm must be monitoring the reshape and keeping the
* critical areas read-only and backed up. It will start
* the array in read-only mode, so we check for that.
*/ */
sector_t here_new, here_old; sector_t here_new, here_old;
int old_disks; int old_disks;
...@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev) ...@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev)
/* here_old is the first stripe that we might need to read /* here_old is the first stripe that we might need to read
* from */ * from */
if (mddev->delta_disks == 0) { if (mddev->delta_disks == 0) {
if ((here_new * mddev->new_chunk_sectors !=
here_old * mddev->chunk_sectors)) {
printk(KERN_ERR "md/raid:%s: reshape position is"
" confused - aborting\n", mdname(mddev));
return -EINVAL;
}
/* We cannot be sure it is safe to start an in-place /* We cannot be sure it is safe to start an in-place
* reshape. It is only safe if user-space if monitoring * reshape. It is only safe if user-space is monitoring
* and taking constant backups. * and taking constant backups.
* mdadm always starts a situation like this in * mdadm always starts a situation like this in
* readonly mode so it can take control before * readonly mode so it can take control before
* allowing any writes. So just check for that. * allowing any writes. So just check for that.
*/ */
if ((here_new * mddev->new_chunk_sectors != if (abs(min_offset_diff) >= mddev->chunk_sectors &&
here_old * mddev->chunk_sectors) || abs(min_offset_diff) >= mddev->new_chunk_sectors)
mddev->ro == 0) { /* not really in-place - so OK */;
printk(KERN_ERR "md/raid:%s: in-place reshape must be started" else if (mddev->ro == 0) {
" in read-only mode - aborting\n", printk(KERN_ERR "md/raid:%s: in-place reshape "
"must be started in read-only mode "
"- aborting\n",
mdname(mddev)); mdname(mddev));
return -EINVAL; return -EINVAL;
} }
} else if (mddev->delta_disks < 0 } else if (mddev->reshape_backwards
? (here_new * mddev->new_chunk_sectors <= ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
here_old * mddev->chunk_sectors) here_old * mddev->chunk_sectors)
: (here_new * mddev->new_chunk_sectors >= : (here_new * mddev->new_chunk_sectors >=
here_old * mddev->chunk_sectors)) { here_old * mddev->chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */ /* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "md/raid:%s: reshape_position too early for " printk(KERN_ERR "md/raid:%s: reshape_position too early for "
"auto-recovery - aborting.\n", "auto-recovery - aborting.\n",
...@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev) ...@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf)) if (IS_ERR(conf))
return PTR_ERR(conf); return PTR_ERR(conf);
conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread; mddev->thread = conf->thread;
conf->thread = NULL; conf->thread = NULL;
mddev->private = conf; mddev->private = conf;
...@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev) ...@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size * blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded)); (conf->raid_disks - conf->max_degraded));
rdev_for_each(rdev, mddev) rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev, disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9); rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
}
} }
return 0; return 0;
...@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) ...@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems * any io in the removed space completes, but it hardly seems
* worth it. * worth it.
*/ */
sector_t newsize;
sectors &= ~((sector_t)mddev->chunk_sectors - 1); sectors &= ~((sector_t)mddev->chunk_sectors - 1);
md_set_array_sectors(mddev, raid5_size(mddev, sectors, newsize = raid5_size(mddev, sectors, mddev->raid_disks);
mddev->raid_disks)); if (mddev->external_size &&
if (mddev->array_sectors > mddev->array_sectors > newsize)
raid5_size(mddev, sectors, mddev->raid_disks))
return -EINVAL; return -EINVAL;
if (mddev->bitmap) {
int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
if (ret)
return ret;
}
md_set_array_sectors(mddev, newsize);
set_capacity(mddev->gendisk, mddev->array_sectors); set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk); revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && if (sectors > mddev->dev_sectors &&
...@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev) ...@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev)
mddev->new_layout == mddev->layout && mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors) mddev->new_chunk_sectors == mddev->chunk_sectors)
return 0; /* nothing to do */ return 0; /* nothing to do */
if (mddev->bitmap)
/* Cannot grow a bitmap yet */
return -EBUSY;
if (has_failed(conf)) if (has_failed(conf))
return -EINVAL; return -EINVAL;
if (mddev->delta_disks < 0) { if (mddev->delta_disks < 0) {
...@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5505,10 +5593,14 @@ static int raid5_start_reshape(struct mddev *mddev)
if (!check_stripe_cache(mddev)) if (!check_stripe_cache(mddev))
return -ENOSPC; return -ENOSPC;
rdev_for_each(rdev, mddev) if (has_failed(conf))
return -EINVAL;
rdev_for_each(rdev, mddev) {
if (!test_bit(In_sync, &rdev->flags) if (!test_bit(In_sync, &rdev->flags)
&& !test_bit(Faulty, &rdev->flags)) && !test_bit(Faulty, &rdev->flags))
spares++; spares++;
}
if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
/* Not enough devices even to make a degraded array /* Not enough devices even to make a degraded array
...@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5535,12 +5627,16 @@ static int raid5_start_reshape(struct mddev *mddev)
conf->chunk_sectors = mddev->new_chunk_sectors; conf->chunk_sectors = mddev->new_chunk_sectors;
conf->prev_algo = conf->algorithm; conf->prev_algo = conf->algorithm;
conf->algorithm = mddev->new_layout; conf->algorithm = mddev->new_layout;
if (mddev->delta_disks < 0) conf->generation++;
/* Code that selects data_offset needs to see the generation update
* if reshape_progress has been set - so a memory barrier needed.
*/
smp_mb();
if (mddev->reshape_backwards)
conf->reshape_progress = raid5_size(mddev, 0, 0); conf->reshape_progress = raid5_size(mddev, 0, 0);
else else
conf->reshape_progress = 0; conf->reshape_progress = 0;
conf->reshape_safe = conf->reshape_progress; conf->reshape_safe = conf->reshape_progress;
conf->generation++;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
/* Add some new drives, as many as will fit. /* Add some new drives, as many as will fit.
...@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev) ...@@ -5592,6 +5688,9 @@ static int raid5_start_reshape(struct mddev *mddev)
mddev->recovery = 0; mddev->recovery = 0;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
rdev_for_each(rdev, mddev)
rdev->new_data_offset = rdev->data_offset;
smp_wmb();
conf->reshape_progress = MaxSector; conf->reshape_progress = MaxSector;
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
...@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf) ...@@ -5610,9 +5709,13 @@ static void end_reshape(struct r5conf *conf)
{ {
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks; conf->previous_raid_disks = conf->raid_disks;
rdev_for_each(rdev, conf->mddev)
rdev->data_offset = rdev->new_data_offset;
smp_wmb();
conf->reshape_progress = MaxSector; conf->reshape_progress = MaxSector;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap); wake_up(&conf->wait_for_overlap);
...@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev) ...@@ -5652,17 +5755,18 @@ static void raid5_finish_reshape(struct mddev *mddev)
d < conf->raid_disks - mddev->delta_disks; d < conf->raid_disks - mddev->delta_disks;
d++) { d++) {
struct md_rdev *rdev = conf->disks[d].rdev; struct md_rdev *rdev = conf->disks[d].rdev;
if (rdev && if (rdev)
raid5_remove_disk(mddev, rdev) == 0) { clear_bit(In_sync, &rdev->flags);
sysfs_unlink_rdev(mddev, rdev); rdev = conf->disks[d].replacement;
rdev->raid_disk = -1; if (rdev)
} clear_bit(In_sync, &rdev->flags);
} }
} }
mddev->layout = conf->algorithm; mddev->layout = conf->algorithm;
mddev->chunk_sectors = conf->chunk_sectors; mddev->chunk_sectors = conf->chunk_sectors;
mddev->reshape_position = MaxSector; mddev->reshape_position = MaxSector;
mddev->delta_disks = 0; mddev->delta_disks = 0;
mddev->reshape_backwards = 0;
} }
} }
......
...@@ -285,6 +285,7 @@ enum r5dev_flags { ...@@ -285,6 +285,7 @@ enum r5dev_flags {
*/ */
R5_Wantdrain, /* dev->towrite needs to be drained */ R5_Wantdrain, /* dev->towrite needs to be drained */
R5_WantFUA, /* Write should be FUA */ R5_WantFUA, /* Write should be FUA */
R5_SyncIO, /* The IO is sync */
R5_WriteError, /* got a write error - need to record it */ R5_WriteError, /* got a write error - need to record it */
R5_MadeGood, /* A bad block has been fixed by writing to it */ R5_MadeGood, /* A bad block has been fixed by writing to it */
R5_ReadRepl, /* Will/did read from replacement rather than orig */ R5_ReadRepl, /* Will/did read from replacement rather than orig */
...@@ -385,6 +386,12 @@ struct r5conf { ...@@ -385,6 +386,12 @@ struct r5conf {
short generation; /* increments with every reshape */ short generation; /* increments with every reshape */
unsigned long reshape_checkpoint; /* Time we last updated unsigned long reshape_checkpoint; /* Time we last updated
* metadata */ * metadata */
long long min_offset_diff; /* minimum difference between
* data_offset and
* new_data_offset across all
* devices. May be negative,
* but is closest to zero.
*/
struct list_head handle_list; /* stripes needing handling */ struct list_head handle_list; /* stripes needing handling */
struct list_head hold_list; /* preread ready stripes */ struct list_head hold_list; /* preread ready stripes */
......
...@@ -233,7 +233,10 @@ struct mdp_superblock_1 { ...@@ -233,7 +233,10 @@ struct mdp_superblock_1 {
__le32 delta_disks; /* change in number of raid_disks */ __le32 delta_disks; /* change in number of raid_disks */
__le32 new_layout; /* new layout */ __le32 new_layout; /* new layout */
__le32 new_chunk; /* new chunk size (512byte sectors) */ __le32 new_chunk; /* new chunk size (512byte sectors) */
__u8 pad1[128-124]; /* set to 0 when written */ __le32 new_offset; /* signed number to add to data_offset in new
* layout. 0 == no-change. This can be
* different on each device in the array.
*/
/* constant this-device information - 64 bytes */ /* constant this-device information - 64 bytes */
__le64 data_offset; /* sector start of data, often 0 */ __le64 data_offset; /* sector start of data, often 0 */
...@@ -281,10 +284,18 @@ struct mdp_superblock_1 { ...@@ -281,10 +284,18 @@ struct mdp_superblock_1 {
* active device with same 'role'. * active device with same 'role'.
* 'recovery_offset' is also set. * 'recovery_offset' is also set.
*/ */
#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number
* of devices, but is going
* backwards anyway.
*/
#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \ |MD_FEATURE_RESHAPE_ACTIVE \
|MD_FEATURE_BAD_BLOCKS \ |MD_FEATURE_BAD_BLOCKS \
|MD_FEATURE_REPLACEMENT) |MD_FEATURE_REPLACEMENT \
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
)
#endif #endif
...@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2; ...@@ -99,8 +99,20 @@ extern const struct raid6_calls raid6_altivec2;
extern const struct raid6_calls raid6_altivec4; extern const struct raid6_calls raid6_altivec4;
extern const struct raid6_calls raid6_altivec8; extern const struct raid6_calls raid6_altivec8;
/*
 * Describes one implementation of the RAID-6 recovery routines.
 * Pointers to these are collected in raid6_recov_algos[]; the selection
 * code picks a usable entry, preferring a higher 'priority'.
 */
struct raid6_recov_calls {
	/* Recover two failed data blocks, 'faila' and 'failb'. */
	void (*data2)(int disks, size_t bytes, int faila, int failb,
		      void **ptrs);
	/* Recover one failed data block, 'faila', plus the P block. */
	void (*datap)(int disks, size_t bytes, int faila, void **ptrs);
	/*
	 * Non-zero return means this implementation may be used.
	 * A NULL pointer is treated as "always usable".
	 */
	int (*valid)(void);
	/* Name reported when this implementation is selected. */
	const char *name;
	/* Larger values win over smaller ones during selection. */
	int priority;
};
extern const struct raid6_recov_calls raid6_recov_intx1;
extern const struct raid6_recov_calls raid6_recov_ssse3;
/* Algorithm list */ /* Algorithm list */
extern const struct raid6_calls * const raid6_algos[]; extern const struct raid6_calls * const raid6_algos[];
extern const struct raid6_recov_calls *const raid6_recov_algos[];
int raid6_select_algo(void); int raid6_select_algo(void);
/* Return values from chk_syndrome */ /* Return values from chk_syndrome */
...@@ -111,14 +123,16 @@ int raid6_select_algo(void); ...@@ -111,14 +123,16 @@ int raid6_select_algo(void);
/* Galois field tables */ /* Galois field tables */
extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256))); extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
extern const u8 raid6_vgfmul[256][32] __attribute__((aligned(256)));
extern const u8 raid6_gfexp[256] __attribute__((aligned(256))); extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
extern const u8 raid6_gfinv[256] __attribute__((aligned(256))); extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
extern const u8 raid6_gfexi[256] __attribute__((aligned(256))); extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
/* Recovery routines */ /* Recovery routines */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, extern void (*raid6_2data_recov)(int disks, size_t bytes, int faila, int failb,
void **ptrs); void **ptrs);
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs); extern void (*raid6_datap_recov)(int disks, size_t bytes, int faila,
void **ptrs);
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
void **ptrs); void **ptrs);
......
obj-$(CONFIG_RAID6_PQ) += raid6_pq.o obj-$(CONFIG_RAID6_PQ) += raid6_pq.o
raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ raid6_pq-y += algos.o recov.o recov_ssse3.o tables.o int1.o int2.o int4.o \
int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \ int8.o int16.o int32.o altivec1.o altivec2.o altivec4.o \
altivec8.o mmx.o sse1.o sse2.o altivec8.o mmx.o sse1.o sse2.o
hostprogs-y += mktables hostprogs-y += mktables
......
...@@ -17,11 +17,11 @@ ...@@ -17,11 +17,11 @@
*/ */
#include <linux/raid/pq.h> #include <linux/raid/pq.h>
#include <linux/module.h>
#ifndef __KERNEL__ #ifndef __KERNEL__
#include <sys/mman.h> #include <sys/mman.h>
#include <stdio.h> #include <stdio.h>
#else #else
#include <linux/module.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#if !RAID6_USE_EMPTY_ZERO_PAGE #if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */ /* In .bss so it's zeroed */
...@@ -34,10 +34,6 @@ struct raid6_calls raid6_call; ...@@ -34,10 +34,6 @@ struct raid6_calls raid6_call;
EXPORT_SYMBOL_GPL(raid6_call); EXPORT_SYMBOL_GPL(raid6_call);
const struct raid6_calls * const raid6_algos[] = { const struct raid6_calls * const raid6_algos[] = {
&raid6_intx1,
&raid6_intx2,
&raid6_intx4,
&raid6_intx8,
#if defined(__ia64__) #if defined(__ia64__)
&raid6_intx16, &raid6_intx16,
&raid6_intx32, &raid6_intx32,
...@@ -61,6 +57,24 @@ const struct raid6_calls * const raid6_algos[] = { ...@@ -61,6 +57,24 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_altivec4, &raid6_altivec4,
&raid6_altivec8, &raid6_altivec8,
#endif #endif
&raid6_intx1,
&raid6_intx2,
&raid6_intx4,
&raid6_intx8,
NULL
};
/*
 * Recovery entry points.  These are function pointers, not functions:
 * raid6_choose_recov() assigns them from the 'data2' and 'datap' members
 * of the recovery implementation it selects.
 */
void (*raid6_2data_recov)(int, size_t, int, int, void **);
EXPORT_SYMBOL_GPL(raid6_2data_recov);
void (*raid6_datap_recov)(int, size_t, int, void **);
EXPORT_SYMBOL_GPL(raid6_datap_recov);
const struct raid6_recov_calls *const raid6_recov_algos[] = {
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
&raid6_recov_ssse3,
#endif
&raid6_recov_intx1,
NULL NULL
}; };
...@@ -72,59 +86,55 @@ const struct raid6_calls * const raid6_algos[] = { ...@@ -72,59 +86,55 @@ const struct raid6_calls * const raid6_algos[] = {
#define time_before(x, y) ((x) < (y)) #define time_before(x, y) ((x) < (y))
#endif #endif
/* Try to pick the best algorithm */ static inline const struct raid6_recov_calls *raid6_choose_recov(void)
/* This code uses the gfmul table as convenient data set to abuse */
int __init raid6_select_algo(void)
{ {
const struct raid6_calls * const * algo; const struct raid6_recov_calls *const *algo;
const struct raid6_calls * best; const struct raid6_recov_calls *best;
char *syndromes;
void *dptrs[(65536/PAGE_SIZE)+2];
int i, disks;
unsigned long perf, bestperf;
int bestprefer;
unsigned long j0, j1;
disks = (65536/PAGE_SIZE)+2; for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
for ( i = 0 ; i < disks-2 ; i++ ) { if (!best || (*algo)->priority > best->priority)
dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; if (!(*algo)->valid || (*algo)->valid())
} best = *algo;
/* Normal code - use a 2-page allocation to avoid D$ conflict */ if (best) {
syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); raid6_2data_recov = best->data2;
raid6_datap_recov = best->datap;
if ( !syndromes ) { printk("raid6: using %s recovery algorithm\n", best->name);
printk("raid6: Yikes! No memory available.\n"); } else
return -ENOMEM; printk("raid6: Yikes! No recovery algorithm found!\n");
}
dptrs[disks-2] = syndromes; return best;
dptrs[disks-1] = syndromes + PAGE_SIZE; }
static inline const struct raid6_calls *raid6_choose_gen(
void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks)
{
unsigned long perf, bestperf, j0, j1;
const struct raid6_calls *const *algo;
const struct raid6_calls *best;
bestperf = 0; bestprefer = 0; best = NULL; for (bestperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
if (!best || (*algo)->prefer >= best->prefer) {
if ((*algo)->valid && !(*algo)->valid())
continue;
for ( algo = raid6_algos ; *algo ; algo++ ) {
if ( !(*algo)->valid || (*algo)->valid() ) {
perf = 0; perf = 0;
preempt_disable(); preempt_disable();
j0 = jiffies; j0 = jiffies;
while ( (j1 = jiffies) == j0 ) while ((j1 = jiffies) == j0)
cpu_relax(); cpu_relax();
while (time_before(jiffies, while (time_before(jiffies,
j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
(*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
perf++; perf++;
} }
preempt_enable(); preempt_enable();
if ( (*algo)->prefer > bestprefer || if (perf > bestperf) {
((*algo)->prefer == bestprefer &&
perf > bestperf) ) {
best = *algo;
bestprefer = best->prefer;
bestperf = perf; bestperf = perf;
best = *algo;
} }
printk("raid6: %-8s %5ld MB/s\n", (*algo)->name, printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
(perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
...@@ -139,9 +149,46 @@ int __init raid6_select_algo(void) ...@@ -139,9 +149,46 @@ int __init raid6_select_algo(void)
} else } else
printk("raid6: Yikes! No algorithm found!\n"); printk("raid6: Yikes! No algorithm found!\n");
return best;
}
/* Try to pick the best algorithm */
/* This code uses the gfmul table as convenient data set to abuse */
int __init raid6_select_algo(void)
{
const int disks = (65536/PAGE_SIZE)+2;
const struct raid6_calls *gen_best;
const struct raid6_recov_calls *rec_best;
char *syndromes;
void *dptrs[(65536/PAGE_SIZE)+2];
int i;
for (i = 0; i < disks-2; i++)
dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
/* Normal code - use a 2-page allocation to avoid D$ conflict */
syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
if (!syndromes) {
printk("raid6: Yikes! No memory available.\n");
return -ENOMEM;
}
dptrs[disks-2] = syndromes;
dptrs[disks-1] = syndromes + PAGE_SIZE;
/* select raid gen_syndrome function */
gen_best = raid6_choose_gen(&dptrs, disks);
/* select raid recover functions */
rec_best = raid6_choose_recov();
free_pages((unsigned long)syndromes, 1); free_pages((unsigned long)syndromes, 1);
return best ? 0 : -EINVAL; return gen_best && rec_best ? 0 : -EINVAL;
} }
static void raid6_exit(void) static void raid6_exit(void)
......
...@@ -81,6 +81,31 @@ int main(int argc, char *argv[]) ...@@ -81,6 +81,31 @@ int main(int argc, char *argv[])
printf("EXPORT_SYMBOL(raid6_gfmul);\n"); printf("EXPORT_SYMBOL(raid6_gfmul);\n");
printf("#endif\n"); printf("#endif\n");
/* Compute vector multiplication table */
printf("\nconst u8 __attribute__((aligned(256)))\n"
"raid6_vgfmul[256][32] =\n"
"{\n");
for (i = 0; i < 256; i++) {
printf("\t{\n");
for (j = 0; j < 16; j += 8) {
printf("\t\t");
for (k = 0; k < 8; k++)
printf("0x%02x,%c", gfmul(i, j + k),
(k == 7) ? '\n' : ' ');
}
for (j = 0; j < 16; j += 8) {
printf("\t\t");
for (k = 0; k < 8; k++)
printf("0x%02x,%c", gfmul(i, (j + k) << 4),
(k == 7) ? '\n' : ' ');
}
printf("\t},\n");
}
printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_vgfmul);\n");
printf("#endif\n");
/* Compute power-of-2 table (exponent) */ /* Compute power-of-2 table (exponent) */
v = 1; v = 1;
printf("\nconst u8 __attribute__((aligned(256)))\n" printf("\nconst u8 __attribute__((aligned(256)))\n"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include <linux/raid/pq.h> #include <linux/raid/pq.h>
/* Recover two failed data blocks. */ /* Recover two failed data blocks. */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, int failb,
void **ptrs) void **ptrs)
{ {
u8 *p, *q, *dp, *dq; u8 *p, *q, *dp, *dq;
...@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, ...@@ -64,10 +64,9 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
p++; q++; p++; q++;
} }
} }
EXPORT_SYMBOL_GPL(raid6_2data_recov);
/* Recover failure of one data block plus the P block */ /* Recover failure of one data block plus the P block */
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, void **ptrs)
{ {
u8 *p, *q, *dq; u8 *p, *q, *dq;
const u8 *qmul; /* Q multiplier table */ const u8 *qmul; /* Q multiplier table */
...@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs) ...@@ -96,7 +95,15 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
q++; dq++; q++; dq++;
} }
} }
EXPORT_SYMBOL_GPL(raid6_datap_recov);
/*
 * Recovery-method descriptor for the two plain-C routines defined above
 * (raid6_2data_recov_intx1 and raid6_datap_recov_intx1).
 */
const struct raid6_recov_calls raid6_recov_intx1 = {
.data2 = raid6_2data_recov_intx1,
.datap = raid6_datap_recov_intx1,
/* No validity probe is supplied for this method. */
.valid = NULL,
.name = "intx1",
/* NOTE(review): presumably the lowest rank, i.e. the fallback - confirm
 * against the code that consumes .priority. */
.priority = 0,
};
#ifndef __KERNEL__ #ifndef __KERNEL__
/* Testing only */ /* Testing only */
......
/*
* Copyright (C) 2012 Intel Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)
#include <linux/raid/pq.h>
#include "x86.h"
/*
 * Tell whether boot_cpu_has() reports all three feature bits this file
 * depends on: XMM, XMM2 and SSSE3.  Returns 1 when every bit is present,
 * 0 otherwise.  The checks run in the same order as listed and stop at the
 * first missing feature.
 */
static int raid6_has_ssse3(void)
{
	if (!boot_cpu_has(X86_FEATURE_XMM))
		return 0;

	if (!boot_cpu_has(X86_FEATURE_XMM2))
		return 0;

	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	return 1;
}
/*
 * Recover two failed data blocks using SSSE3 pshufb table lookups.
 *
 * disks - number of entries in ptrs; ptrs[disks-2] and ptrs[disks-1] are
 *         read as the P and Q blocks.
 * bytes - length of every block.  The loop below consumes 32 bytes per
 *         iteration on CONFIG_X86_64 and 16 bytes otherwise, and only stops
 *         when the count reaches exactly zero, so bytes is presumably a
 *         multiple of that step - TODO confirm with callers.
 * faila, failb - indices in ptrs of the two blocks to rebuild.
 *         NOTE(review): raid6_gfexi[] is indexed with failb-faila, which
 *         looks like it assumes faila < failb - confirm.
 * ptrs  - block pointer table.  It is modified temporarily while the
 *         syndrome is regenerated and restored before the main loop.
 *
 * On return the buffers behind ptrs[faila] and ptrs[failb] hold the values
 * computed by the loop; P and Q are only read.
 *
 * NOTE(review): all memory operands use movdqa, the aligned 128-bit move,
 * so the block buffers and multiplier tables are presumably 16-byte
 * aligned.  The asm statements declare no clobbers and pass values between
 * each other in xmm registers, so this relies on the compiler not touching
 * xmm registers in between - confirm build flags.
 */
void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, int failb,
void **ptrs)
{
u8 *p, *q, *dp, *dq;
const u8 *pbmul; /* P multiplier table for B data */
const u8 *qmul; /* Q multiplier table (for both) */
/* Per-byte low-nibble mask; loaded into xmm7 before the loop */
static const u8 __aligned(16) x0f[16] = {
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
p = (u8 *)ptrs[disks-2];
q = (u8 *)ptrs[disks-1];
/* Compute syndrome with zero for the missing data pages
Use the dead data pages as temporary storage for
delta p and delta q */
dp = (u8 *)ptrs[faila];
ptrs[faila] = (void *)raid6_empty_zero_page;
ptrs[disks-2] = dp;
dq = (u8 *)ptrs[failb];
ptrs[failb] = (void *)raid6_empty_zero_page;
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
/* Restore pointer table */
ptrs[faila] = dp;
ptrs[failb] = dq;
ptrs[disks-2] = p;
ptrs[disks-1] = q;
/* Now, pick the proper data tables */
/* Each raid6_vgfmul row is 32 bytes: bytes 0..15 are looked up with the
 * low nibble of the input, bytes 16..31 with the high nibble; the two
 * lookups are XORed together to form the product. */
pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
raid6_gfexp[failb]]];
kernel_fpu_begin();
/* xmm7 = 0x0f in every byte (nibble mask), constant for the whole loop */
asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0]));
#ifdef CONFIG_X86_64
/* Loop-invariant table halves cached in registers:
 * xmm6 = qmul low half, xmm14/xmm15 = pbmul low/high halves */
asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0]));
asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0]));
asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16]));
#endif
/* Now do it... */
while (bytes) {
#ifdef CONFIG_X86_64
/* Two 16-byte lanes per iteration: registers 0-5 handle bytes 0..15,
 * registers 8-13 handle bytes 16..31. */
/* xmm6, xmm14, xmm15 */
asm volatile("movdqa %0,%%xmm1" : : "m" (q[0]));
asm volatile("movdqa %0,%%xmm9" : : "m" (q[16]));
asm volatile("movdqa %0,%%xmm0" : : "m" (p[0]));
asm volatile("movdqa %0,%%xmm8" : : "m" (p[16]));
asm volatile("pxor %0,%%xmm1" : : "m" (dq[0]));
asm volatile("pxor %0,%%xmm9" : : "m" (dq[16]));
asm volatile("pxor %0,%%xmm0" : : "m" (dp[0]));
asm volatile("pxor %0,%%xmm8" : : "m" (dp[16]));
/* xmm1/9 = q ^ dq, xmm0/8 = px = p ^ dp */
/* Stage the qmul table halves: low half from xmm6, high half from memory */
asm volatile("movdqa %xmm6,%xmm4");
asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
asm volatile("movdqa %xmm6,%xmm12");
asm volatile("movdqa %xmm5,%xmm13");
asm volatile("movdqa %xmm1,%xmm3");
asm volatile("movdqa %xmm9,%xmm11");
asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */
asm volatile("movdqa %xmm8,%xmm10");
/* Split q ^ dq into nibbles: the 16-bit shift drags in neighbouring
 * bits, which the following pand with xmm7 clears again. */
asm volatile("psraw $4,%xmm1");
asm volatile("psraw $4,%xmm9");
asm volatile("pand %xmm7,%xmm3");
asm volatile("pand %xmm7,%xmm11");
asm volatile("pand %xmm7,%xmm1");
asm volatile("pand %xmm7,%xmm9");
/* Table lookups: low nibbles index the low half, high nibbles the
 * high half, results are XORed together. */
asm volatile("pshufb %xmm3,%xmm4");
asm volatile("pshufb %xmm11,%xmm12");
asm volatile("pshufb %xmm1,%xmm5");
asm volatile("pshufb %xmm9,%xmm13");
asm volatile("pxor %xmm4,%xmm5");
asm volatile("pxor %xmm12,%xmm13");
/* xmm5/13 = qx */
/* Same nibble-split lookup, now of px through the pbmul table */
asm volatile("movdqa %xmm14,%xmm4");
asm volatile("movdqa %xmm15,%xmm1");
asm volatile("movdqa %xmm14,%xmm12");
asm volatile("movdqa %xmm15,%xmm9");
asm volatile("movdqa %xmm2,%xmm3");
asm volatile("movdqa %xmm10,%xmm11");
asm volatile("psraw $4,%xmm2");
asm volatile("psraw $4,%xmm10");
asm volatile("pand %xmm7,%xmm3");
asm volatile("pand %xmm7,%xmm11");
asm volatile("pand %xmm7,%xmm2");
asm volatile("pand %xmm7,%xmm10");
asm volatile("pshufb %xmm3,%xmm4");
asm volatile("pshufb %xmm11,%xmm12");
asm volatile("pshufb %xmm2,%xmm1");
asm volatile("pshufb %xmm10,%xmm9");
asm volatile("pxor %xmm4,%xmm1");
asm volatile("pxor %xmm12,%xmm9");
/* xmm1/9 = pbmul[px] */
asm volatile("pxor %xmm5,%xmm1");
asm volatile("pxor %xmm13,%xmm9");
/* xmm1/9 = db = DQ */
asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0]));
asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16]));
/* xmm0/8 = px ^ db, stored through dp */
asm volatile("pxor %xmm1,%xmm0");
asm volatile("pxor %xmm9,%xmm8");
asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0]));
asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16]));
bytes -= 32;
p += 32;
q += 32;
dp += 32;
dq += 32;
#else
/* 32-bit build: one 16-byte lane per iteration, tables reloaded from
 * memory each time (only xmm0-xmm7 are used here). */
asm volatile("movdqa %0,%%xmm1" : : "m" (*q));
asm volatile("movdqa %0,%%xmm0" : : "m" (*p));
asm volatile("pxor %0,%%xmm1" : : "m" (*dq));
asm volatile("pxor %0,%%xmm0" : : "m" (*dp));
/* xmm1 = dq ^ q
 * xmm0 = dp ^ p
 */
asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0]));
asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
/* Nibble split of xmm1, then lookup in the two qmul halves */
asm volatile("movdqa %xmm1,%xmm3");
asm volatile("psraw $4,%xmm1");
asm volatile("pand %xmm7,%xmm3");
asm volatile("pand %xmm7,%xmm1");
asm volatile("pshufb %xmm3,%xmm4");
asm volatile("pshufb %xmm1,%xmm5");
asm volatile("pxor %xmm4,%xmm5");
asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */
/* xmm5 = qx */
asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0]));
asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16]));
/* Nibble split of px, then lookup in the two pbmul halves */
asm volatile("movdqa %xmm2,%xmm3");
asm volatile("psraw $4,%xmm2");
asm volatile("pand %xmm7,%xmm3");
asm volatile("pand %xmm7,%xmm2");
asm volatile("pshufb %xmm3,%xmm4");
asm volatile("pshufb %xmm2,%xmm1");
asm volatile("pxor %xmm4,%xmm1");
/* xmm1 = pbmul[px] */
asm volatile("pxor %xmm5,%xmm1");
/* xmm1 = db = DQ */
asm volatile("movdqa %%xmm1,%0" : "=m" (*dq));
/* xmm0 = px ^ db, stored through dp */
asm volatile("pxor %xmm1,%xmm0");
asm volatile("movdqa %%xmm0,%0" : "=m" (*dp));
bytes -= 16;
p += 16;
q += 16;
dp += 16;
dq += 16;
#endif
}
kernel_fpu_end();
}
/*
 * Recover one failed data block together with the P block, using SSSE3
 * pshufb table lookups.
 *
 * disks - number of entries in ptrs; ptrs[disks-2] and ptrs[disks-1] are
 *         read as the P and Q blocks.
 * bytes - length of every block.  The loop consumes 32 bytes per iteration
 *         on CONFIG_X86_64 and 16 bytes otherwise, and only stops when the
 *         count reaches exactly zero, so bytes is presumably a multiple of
 *         that step - TODO confirm with callers.
 * faila - index in ptrs of the data block to rebuild.
 * ptrs  - block pointer table.  It is modified temporarily while the
 *         syndrome is regenerated and restored before the main loop.
 *
 * On return the buffer behind ptrs[faila] holds qmul[q ^ dq] and the P
 * buffer has been XORed in place with that same value; Q is only read.
 *
 * NOTE(review): all memory operands use movdqa, the aligned 128-bit move,
 * so the block buffers and multiplier table are presumably 16-byte aligned.
 * The asm statements declare no clobbers and pass values between each other
 * in xmm registers, so this relies on the compiler not touching xmm
 * registers in between - confirm build flags.
 */
void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, void **ptrs)
{
u8 *p, *q, *dq;
const u8 *qmul; /* Q multiplier table */
/* Per-byte low-nibble mask; loaded into xmm7 before the loop */
static const u8 __aligned(16) x0f[16] = {
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
p = (u8 *)ptrs[disks-2];
q = (u8 *)ptrs[disks-1];
/* Compute syndrome with zero for the missing data page
Use the dead data page as temporary storage for delta q */
/* ptrs[disks-2] is left pointing at p for this call.
 * NOTE(review): presumably gen_syndrome therefore rewrites p with parity
 * computed as if block faila were all zero - confirm. */
dq = (u8 *)ptrs[faila];
ptrs[faila] = (void *)raid6_empty_zero_page;
ptrs[disks-1] = dq;
raid6_call.gen_syndrome(disks, bytes, ptrs);
/* Restore pointer table */
ptrs[faila] = dq;
ptrs[disks-1] = q;
/* Now, pick the proper data tables */
/* The raid6_vgfmul row is 32 bytes: bytes 0..15 are looked up with the
 * low nibble of the input, bytes 16..31 with the high nibble; the two
 * lookups are XORed together to form the product. */
qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
kernel_fpu_begin();
/* xmm7 = 0x0f in every byte (nibble mask), constant for the whole loop */
asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0]));
while (bytes) {
#ifdef CONFIG_X86_64
/* Two 16-byte lanes per iteration, interleaved: bytes 0..15 go through
 * xmm0-3/xmm6, bytes 16..31 through xmm4/xmm8/xmm10-12. */
asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16]));
asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
/* xmm3 = q[0] ^ dq[0] */
asm volatile("pxor %0, %%xmm4" : : "m" (q[16]));
asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
/* xmm4 = q[16] ^ dq[16] */
asm volatile("movdqa %xmm3, %xmm6");
asm volatile("movdqa %xmm4, %xmm8");
/* xmm4 = xmm8 = q[16] ^ dq[16] */
/* Nibble split of lane 0: the 16-bit shift drags in neighbouring bits,
 * which the following pand with xmm7 clears again. */
asm volatile("psraw $4, %xmm3");
asm volatile("pand %xmm7, %xmm6");
asm volatile("pand %xmm7, %xmm3");
/* Low nibbles index the low table half, high nibbles the high half */
asm volatile("pshufb %xmm6, %xmm0");
asm volatile("pshufb %xmm3, %xmm1");
asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0]));
asm volatile("pxor %xmm0, %xmm1");
asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16]));
/* xmm1 = qmul[q[0] ^ dq[0]] */
/* Same nibble split and lookup for lane 1 */
asm volatile("psraw $4, %xmm4");
asm volatile("pand %xmm7, %xmm8");
asm volatile("pand %xmm7, %xmm4");
asm volatile("pshufb %xmm8, %xmm10");
asm volatile("pshufb %xmm4, %xmm11");
asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
asm volatile("pxor %xmm10, %xmm11");
asm volatile("movdqa %0, %%xmm12" : : "m" (p[16]));
/* xmm11 = qmul[q[16] ^ dq[16]] */
asm volatile("pxor %xmm1, %xmm2");
/* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */
asm volatile("pxor %xmm11, %xmm12");
/* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */
/* Store the looked-up value through dq and the updated parity through p */
asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16]));
asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
asm volatile("movdqa %%xmm12, %0" : "=m" (p[16]));
bytes -= 32;
p += 32;
q += 32;
dq += 32;
#else
/* 32-bit build: one 16-byte lane per iteration (only xmm0-xmm7 used) */
asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
/* xmm3 = *q ^ *dq */
asm volatile("movdqa %xmm3, %xmm6");
asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
/* Nibble split, then lookup in the two qmul halves */
asm volatile("psraw $4, %xmm3");
asm volatile("pand %xmm7, %xmm6");
asm volatile("pand %xmm7, %xmm3");
asm volatile("pshufb %xmm6, %xmm0");
asm volatile("pshufb %xmm3, %xmm1");
asm volatile("pxor %xmm0, %xmm1");
/* xmm1 = qmul[*q ^ *dq] */
asm volatile("pxor %xmm1, %xmm2");
/* xmm2 = *p ^ qmul[*q ^ *dq] */
asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0]));
asm volatile("movdqa %%xmm2, %0" : "=m" (p[0]));
bytes -= 16;
p += 16;
q += 16;
dq += 16;
#endif
}
kernel_fpu_end();
}
/*
 * Recovery-method descriptor tying together the SSSE3 routines in this
 * file and the CPU-feature probe that guards them.
 */
const struct raid6_recov_calls raid6_recov_ssse3 = {
	/* The 64-bit loops handle 32 bytes per pass, the 32-bit loops 16. */
#ifdef CONFIG_X86_64
	.name = "ssse3x2",
#else
	.name = "ssse3x1",
#endif
	.priority = 1,
	.valid = raid6_has_ssse3,
	.data2 = raid6_2data_recov_ssse3,
	.datap = raid6_datap_recov_ssse3,
};
#endif
...@@ -23,7 +23,7 @@ RANLIB = ranlib ...@@ -23,7 +23,7 @@ RANLIB = ranlib
all: raid6.a raid6test all: raid6.a raid6test
raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \ raid6.a: int1.o int2.o int4.o int8.o int16.o int32.o mmx.o sse1.o sse2.o \
altivec1.o altivec2.o altivec4.o altivec8.o recov.o algos.o \ altivec1.o altivec2.o altivec4.o altivec8.o recov.o recov_ssse3.o algos.o \
tables.o tables.o
rm -f $@ rm -f $@
$(AR) cq $@ $^ $(AR) cq $@ $^
......
...@@ -90,25 +90,35 @@ static int test_disks(int i, int j) ...@@ -90,25 +90,35 @@ static int test_disks(int i, int j)
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
const struct raid6_calls *const *algo; const struct raid6_calls *const *algo;
const struct raid6_recov_calls *const *ra;
int i, j; int i, j;
int err = 0; int err = 0;
makedata(); makedata();
for (algo = raid6_algos; *algo; algo++) { for (ra = raid6_recov_algos; *ra; ra++) {
if (!(*algo)->valid || (*algo)->valid()) { if ((*ra)->valid && !(*ra)->valid())
raid6_call = **algo; continue;
raid6_2data_recov = (*ra)->data2;
raid6_datap_recov = (*ra)->datap;
/* Nuke syndromes */ printf("using recovery %s\n", (*ra)->name);
memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
/* Generate assumed good syndrome */ for (algo = raid6_algos; *algo; algo++) {
raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, if (!(*algo)->valid || (*algo)->valid()) {
(void **)&dataptrs); raid6_call = **algo;
for (i = 0; i < NDISKS-1; i++) /* Nuke syndromes */
for (j = i+1; j < NDISKS; j++) memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
err += test_disks(i, j);
/* Generate assumed good syndrome */
raid6_call.gen_syndrome(NDISKS, PAGE_SIZE,
(void **)&dataptrs);
for (i = 0; i < NDISKS-1; i++)
for (j = i+1; j < NDISKS; j++)
err += test_disks(i, j);
}
} }
printf("\n"); printf("\n");
} }
......
...@@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void) ...@@ -35,24 +35,29 @@ static inline void kernel_fpu_end(void)
{ {
} }
#define __aligned(x) __attribute__((aligned(x)))
#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ #define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions #define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions
* (fast save and restore) */ * (fast save and restore) */
#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ #define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */
#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ #define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
/* Should work well enough on modern CPUs for testing */ /* Should work well enough on modern CPUs for testing */
static inline int boot_cpu_has(int flag) static inline int boot_cpu_has(int flag)
{ {
u32 eax = (flag >> 5) ? 0x80000001 : 1; u32 eax = (flag & 0x20) ? 0x80000001 : 1;
u32 edx; u32 ecx, edx;
asm volatile("cpuid" asm volatile("cpuid"
: "+a" (eax), "=d" (edx) : "+a" (eax), "=d" (edx), "=c" (ecx)
: : "ecx", "ebx"); : : "ebx");
return (edx >> (flag & 31)) & 1; return ((flag & 0x80 ? ecx : edx) >> (flag & 31)) & 1;
} }
#endif /* ndef __KERNEL__ */ #endif /* ndef __KERNEL__ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment