Commit 223cdea4 authored by Linus Torvalds

Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (53 commits)
  md/raid5 revise rules for when to update metadata during reshape
  md/raid5: minor code cleanups in make_request.
  md: remove CONFIG_MD_RAID_RESHAPE config option.
  md/raid5: be more careful about write ordering when reshaping.
  md: don't display meaningless values in sysfs files resync_start and sync_speed
  md/raid5: allow layout and chunksize to be changed on active array.
  md/raid5: reshape using largest of old and new chunk size
  md/raid5: prepare for allowing reshape to change layout
  md/raid5: prepare for allowing reshape to change chunksize.
  md/raid5: clearly differentiate 'before' and 'after' stripes during reshape.
  Documentation/md.txt update
  md: allow number of drives in raid5 to be reduced
  md/raid5: change reshape-progress measurement to cope with reshaping backwards.
  md: add explicit method to signal the end of a reshape.
  md/raid5: enhance raid5_size to work correctly with negative delta_disks
  md/raid5: drop qd_idx from r6_state
  md/raid6: move raid6 data processing to raid6_pq.ko
  md: raid5 run(): Fix max_degraded for raid level 4.
  md: 'array_size' sysfs attribute
  md: centralize ->array_sectors modifications
  ...
parents 31e6e2da c8f517c4
...@@ -164,15 +164,19 @@ All md devices contain:
   raid_disks
      a text file with a simple number indicating the number of devices
      in a fully functional array.  If this is not yet known, the file
-     will be empty.  If an array is being resized (not currently
-     possible) this will contain the larger of the old and new sizes.
-     Some raid level (RAID1) allow this value to be set while the
-     array is active.  This will reconfigure the array.  Otherwise
-     it can only be set while assembling an array.
+     will be empty.  If an array is being resized this will contain
+     the new number of devices.
+     Some raid levels allow this value to be set while the array is
+     active.  This will reconfigure the array.  Otherwise it can only
+     be set while assembling an array.
+     A change to this attribute will not be permitted if it would
+     reduce the size of the array.  To reduce the number of drives
+     in an e.g. raid5, the array size must first be reduced by
+     setting the 'array_size' attribute.

   chunk_size
-     This is the size if bytes for 'chunks' and is only relevant to
-     raid levels that involve striping (1,4,5,6,10). The address space
+     This is the size in bytes for 'chunks' and is only relevant to
+     raid levels that involve striping (0,4,5,6,10). The address space
      of the array is conceptually divided into chunks and consecutive
      chunks are striped onto neighbouring devices.
      The size should be at least PAGE_SIZE (4k) and should be a power
...@@ -183,6 +187,20 @@ All md devices contain:
      simply a number that is interpretted differently by different
      levels.  It can be written while assembling an array.

+  array_size
+     This can be used to artificially constrain the available space in
+     the array to be less than is actually available on the combined
+     devices.  Writing a number (in Kilobytes) which is less than
+     the available size will set the size.  Any reconfiguration of the
+     array (e.g. adding devices) will not cause the size to change.
+     Writing the word 'default' will cause the effective size of the
+     array to be whatever size is actually available based on
+     'level', 'chunk_size' and 'component_size'.
+
+     This can be used to reduce the size of the array before reducing
+     the number of devices in a raid4/5/6, or to support external
+     metadata formats which mandate such clipping.
+
   reshape_position
      This is either "none" or a sector number within the devices of
      the array where "reshape" is up to.  If this is set, the three
...@@ -207,6 +225,11 @@ All md devices contain:
      about the array.  It can be 0.90 (traditional format), 1.0, 1.1,
      1.2 (newer format in varying locations) or "none" indicating that
      the kernel isn't managing metadata at all.
+     Alternately it can be "external:" followed by a string which
+     is set by user-space.  This indicates that metadata is managed
+     by a user-space program.  Any device failure or other event that
+     requires a metadata update will cause array activity to be
+     suspended until the event is acknowledged.

   resync_start
      The point at which resync should start.  If no resync is needed,
...
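[Editor's example] The ordering constraint documented above is worth making concrete: to shrink a raid5, 'array_size' must be clipped before 'raid_disks' is lowered. A minimal user-space sketch of that sequence; the /sys/block/md0 path and both numbers are illustrative assumptions, not values from this commit:

/* A sketch, not from this merge: shrink a raid5 by first clipping
 * 'array_size' (a count in KiB) and only then reducing 'raid_disks'.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int sysfs_write(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0)
                return -1;
        n = write(fd, val, strlen(val));        /* sysfs takes one short write */
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        /* 1: clip the array to the capacity it will have with 4 drives */
        if (sysfs_write("/sys/block/md0/md/array_size", "2929686528"))
                perror("array_size");
        /* 2: only now will the kernel accept the smaller drive count */
        if (sysfs_write("/sys/block/md0/md/raid_disks", "4"))
                perror("raid_disks");
        return 0;
}

The first write only constrains the exported capacity; any data beyond it must already be unused (e.g. the filesystem shrunk first), since the following reshape will not preserve it.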
...@@ -18,8 +18,8 @@
 #define BH_TRACE 0
 #include <linux/module.h>
-#include <linux/raid/md.h>
 #include <linux/raid/xor.h>
+#include <linux/jiffies.h>
 #include <asm/xor.h>

 /* The xor routines to use.  */
...
...@@ -121,6 +121,7 @@ config MD_RAID10
 config MD_RAID456
 	tristate "RAID-4/RAID-5/RAID-6 mode"
 	depends on BLK_DEV_MD
+	select MD_RAID6_PQ
 	select ASYNC_MEMCPY
 	select ASYNC_XOR
 	---help---
...@@ -151,34 +152,8 @@ config MD_RAID456

 	  If unsure, say Y.

-config MD_RAID5_RESHAPE
-	bool "Support adding drives to a raid-5 array"
-	depends on MD_RAID456
-	default y
-	---help---
-	  A RAID-5 set can be expanded by adding extra drives. This
-	  requires "restriping" the array which means (almost) every
-	  block must be written to a different place.
-
-	  This option allows such restriping to be done while the array
-	  is online.
-
-	  You will need mdadm version 2.4.1 or later to use this
-	  feature safely.  During the early stage of reshape there is
-	  a critical section where live data is being over-written.  A
-	  crash during this time needs extra care for recovery.  The
-	  newer mdadm takes a copy of the data in the critical section
-	  and will restore it, if necessary, after a crash.
-
-	  The mdadm usage is e.g.
-	       mdadm --grow /dev/md1 --raid-disks=6
-	  to grow '/dev/md1' to having 6 disks.
-
-	  Note: The array can only be expanded, not contracted.
-	  There should be enough spares already present to make the new
-	  array workable.
-
-	  If unsure, say Y.
+config MD_RAID6_PQ
+	tristate

 config MD_MULTIPATH
 	tristate "Multipath I/O support"
...
...@@ -2,20 +2,21 @@
 #
 # Makefile for the kernel software RAID and LVM drivers.
 #

-dm-mod-objs	:= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
+dm-mod-y	+= dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 		   dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o
-dm-multipath-objs := dm-path-selector.o dm-mpath.o
-dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
+dm-multipath-y	+= dm-path-selector.o dm-mpath.o
+dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
-dm-mirror-objs	:= dm-raid1.o
-md-mod-objs	:= md.o bitmap.o
-raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
+dm-mirror-y	+= dm-raid1.o
+md-mod-y	+= md.o bitmap.o
+raid456-y	+= raid5.o
+raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
 		   raid6int1.o raid6int2.o raid6int4.o \
 		   raid6int8.o raid6int16.o raid6int32.o \
 		   raid6altivec1.o raid6altivec2.o raid6altivec4.o \
 		   raid6altivec8.o \
 		   raid6mmx.o raid6sse1.o raid6sse2.o
-hostprogs-y	:= mktables
+hostprogs-y	+= mktables

 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise
...@@ -26,6 +27,7 @@ obj-$(CONFIG_MD_LINEAR)	+= linear.o
 obj-$(CONFIG_MD_RAID0)	+= raid0.o
 obj-$(CONFIG_MD_RAID1)	+= raid1.o
 obj-$(CONFIG_MD_RAID10)	+= raid10.o
+obj-$(CONFIG_MD_RAID6_PQ)	+= raid6_pq.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)	+= faulty.o
...
...@@ -16,6 +16,7 @@
  * wait if count gets too high, wake when it drops to half.
  */

+#include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
...@@ -26,8 +27,8 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include "md.h"
+#include "bitmap.h"

 /* debug macros */
...@@ -111,9 +112,10 @@ static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int creat
 	unsigned char *mappage;

 	if (page >= bitmap->pages) {
-		printk(KERN_ALERT
-			"%s: invalid bitmap page request: %lu (> %lu)\n",
-			bmname(bitmap), page, bitmap->pages-1);
+		/* This can happen if bitmap_start_sync goes beyond
+		 * End-of-device while looking for a whole page.
+		 * It is harmless.
+		 */
 		return -EINVAL;
 	}
...@@ -265,7 +267,6 @@ static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 	list_for_each_continue_rcu(pos, &mddev->disks) {
 		rdev = list_entry(pos, mdk_rdev_t, same_set);
 		if (rdev->raid_disk >= 0 &&
-		    test_bit(In_sync, &rdev->flags) &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* this is a usable devices */
 			atomic_inc(&rdev->nr_pending);
...@@ -297,7 +298,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 		    + size/512 > 0)
 			/* bitmap runs in to metadata */
 			goto bad_alignment;
-		if (rdev->data_offset + mddev->size*2
+		if (rdev->data_offset + mddev->dev_sectors
 		    > rdev->sb_start + bitmap->offset)
 			/* data runs in to bitmap */
 			goto bad_alignment;
...@@ -570,7 +571,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
 		 le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
 		reason = "unrecognized superblock version";
-	else if (chunksize < PAGE_SIZE)
+	else if (chunksize < 512)
 		reason = "bitmap chunksize too small";
 	else if ((1 << ffz(~chunksize)) != chunksize)
 		reason = "bitmap chunksize not a power of 2";
...@@ -1306,6 +1307,9 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
+	if (bitmap->mddev->degraded)
+		/* Never clear bits or update events_cleared when degraded */
+		success = 0;

 	while (sectors) {
 		int blocks;
...@@ -1345,7 +1349,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 	}
 }

-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
 		      int degraded)
 {
 	bitmap_counter_t *bmc;
...@@ -1374,6 +1378,29 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
 	return rv;
 }

+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+		      int degraded)
+{
+	/* bitmap_start_sync must always report on multiples of whole
+	 * pages, otherwise resync (which is very PAGE_SIZE based) will
+	 * get confused.
+	 * So call __bitmap_start_sync repeatedly (if needed) until
+	 * At least PAGE_SIZE>>9 blocks are covered.
+	 * Return the 'or' of the result.
+	 */
+	int rv = 0;
+	int blocks1;
+
+	*blocks = 0;
+	while (*blocks < (PAGE_SIZE>>9)) {
+		rv |= __bitmap_start_sync(bitmap, offset,
+					  &blocks1, degraded);
+		offset += blocks1;
+		*blocks += blocks1;
+	}
+	return rv;
+}
+
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
 {
 	bitmap_counter_t *bmc;
...@@ -1443,6 +1470,8 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
 	wait_event(bitmap->mddev->recovery_wait,
 		   atomic_read(&bitmap->mddev->recovery_active) == 0);

+	bitmap->mddev->curr_resync_completed = bitmap->mddev->curr_resync;
+	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
 	sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
...
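[Editor's example] The new bitmap_start_sync() wrapper exists because resync works in PAGE_SIZE units while individual bitmap counters can cover smaller ranges. Its contract is easy to model in user space; in this sketch fake_start_sync stands in for __bitmap_start_sync, and its return values are invented for illustration:

/* User-space model of the page-rounding loop above: keep calling the
 * low-level helper until at least PAGE_SIZE>>9 sectors (8 with 4k
 * pages) are covered, OR-ing the per-chunk "needs sync" results.
 */
#include <stdio.h>

#define PAGE_SECTORS 8                 /* PAGE_SIZE >> 9 for 4k pages */

static int fake_start_sync(long long offset, int *blocks)
{
        *blocks = 3;                   /* pretend each counter spans 3 sectors */
        return offset % 2;             /* pretend odd chunks need syncing */
}

int main(void)
{
        long long offset = 0;
        int blocks = 0, blocks1, rv = 0;

        while (blocks < PAGE_SECTORS) {
                rv |= fake_start_sync(offset, &blocks1);
                offset += blocks1;
                blocks += blocks1;
        }
        /* covers 9 sectors here: always a full page or more, never less */
        printf("covered %d sectors, needed=%d\n", blocks, rv);
        return 0;
}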
...@@ -62,7 +62,10 @@
 #define	ModeShift	5

 #define MaxFault	50
-#include <linux/raid/md.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"

 static void faulty_fail(struct bio *bio, int error)
...@@ -280,6 +283,17 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
 	return 0;
 }

+static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	WARN_ONCE(raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	if (sectors == 0)
+		return mddev->dev_sectors;
+
+	return sectors;
+}
+
 static int run(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
...@@ -298,7 +312,7 @@ static int run(mddev_t *mddev)
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		conf->rdev = rdev;

-	mddev->array_sectors = mddev->size * 2;
+	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;

 	reconfig(mddev, mddev->layout, -1);
...@@ -325,6 +339,7 @@ static struct mdk_personality faulty_personality =
 	.stop		= stop,
 	.status		= status,
 	.reconfig	= reconfig,
+	.size		= faulty_size,
 };

 static int __init raid_init(void)
...
...@@ -16,7 +16,11 @@
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

-#include <linux/raid/linear.h>
+#include <linux/blkdev.h>
+#include <linux/raid/md_u.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "linear.h"

 /*
  * find which device holds a particular offset
...@@ -97,6 +101,16 @@ static int linear_congested(void *data, int bits)
 	return ret;
 }

+static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	linear_conf_t *conf = mddev_to_conf(mddev);
+
+	WARN_ONCE(sectors || raid_disks,
+		  "%s does not support generic reshape\n", __func__);
+
+	return conf->array_sectors;
+}
+
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
...@@ -135,8 +149,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);

-		disk->num_sectors = rdev->size * 2;
-		conf->array_sectors += rdev->size * 2;
+		disk->num_sectors = rdev->sectors;
+		conf->array_sectors += rdev->sectors;

 		cnt++;
 	}
...@@ -249,7 +263,7 @@ static int linear_run (mddev_t *mddev)
 	if (!conf)
 		return 1;
 	mddev->private = conf;
-	mddev->array_sectors = conf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));

 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
 	mddev->queue->unplug_fn = linear_unplug;
...@@ -283,7 +297,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	newconf->prev = mddev_to_conf(mddev);
 	mddev->private = newconf;
 	mddev->raid_disks++;
-	mddev->array_sectors = newconf->array_sectors;
+	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
 }
...@@ -381,6 +395,7 @@ static struct mdk_personality linear_personality =
 	.stop		= linear_stop,
 	.status		= linear_status,
 	.hot_add_disk	= linear_add,
+	.size		= linear_size,
 };

 static int __init linear_init (void)
...
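[Editor's example] faulty and linear above are the first converts to the new mdk_personality ->size() method: sectors == 0 asks for the default capacity of the current geometry, a non-zero raid_disks lets a reshape probe a different disk count, and the result goes through md_set_array_sectors() instead of being written into ->array_sectors directly. A toy user-space model of that calling convention; toy_raid5_size and all numbers are assumptions for illustration, not code from this merge:

/* Toy model of the ->size(mddev, sectors, raid_disks) convention:
 * 0 for either argument means "use the current value".
 */
#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t dev_sectors = 1953525168;   /* usable sectors per device */
static int nr_disks = 5;                    /* current raid5 member count */

static sector_t toy_raid5_size(sector_t sectors, int raid_disks)
{
        int data_disks = (raid_disks ? raid_disks : nr_disks) - 1;

        if (sectors == 0)                   /* 0 == "per-device default" */
                sectors = dev_sectors;
        return sectors * data_disks;        /* raid5 spends one disk on parity */
}

int main(void)
{
        printf("default:  %llu sectors\n", toy_raid5_size(0, 0));
        printf("6 drives: %llu sectors\n", toy_raid5_size(0, 6));
        return 0;
}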
 #ifndef _LINEAR_H
 #define _LINEAR_H

-#include <linux/raid/md.h>

 struct dev_info {
 	mdk_rdev_t	*rdev;
 	sector_t	num_sectors;
...
...@@ -33,9 +33,9 @@
 */

 #include <linux/kthread.h>
-#include <linux/raid/md.h>
-#include <linux/raid/bitmap.h>
+#include <linux/blkdev.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
 #include <linux/poll.h>
 #include <linux/ctype.h>
...@@ -45,11 +45,10 @@
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/delay.h>
-#include <linux/raid/md_p.h>
-
-#define MAJOR_NR MD_MAJOR
-
-/* 63 partitions with the alternate major number (mdp) */
-#define MdpMinorShift 6
+#include <linux/raid/md_u.h>
+#include "md.h"
+#include "bitmap.h"

 #define DEBUG 0
 #define dprintk(x...) ((void)(DEBUG && printk(x)))
...@@ -202,12 +201,68 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
 		)

-static int md_fail_request(struct request_queue *q, struct bio *bio)
+/* Rather than calling directly into the personality make_request function,
+ * IO requests come here first so that we can check if the device is
+ * being suspended pending a reconfiguration.
+ * We hold a refcount over the call to ->make_request.  By the time that
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
+static int md_make_request(struct request_queue *q, struct bio *bio)
 {
+	mddev_t *mddev = q->queuedata;
+	int rv;
+
+	if (mddev == NULL || mddev->pers == NULL) {
 		bio_io_error(bio);
 		return 0;
+	}
+	rcu_read_lock();
+	if (mddev->suspended) {
+		DEFINE_WAIT(__wait);
+		for (;;) {
+			prepare_to_wait(&mddev->sb_wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+			if (!mddev->suspended)
+				break;
+			rcu_read_unlock();
+			schedule();
+			rcu_read_lock();
+		}
+		finish_wait(&mddev->sb_wait, &__wait);
+	}
+	atomic_inc(&mddev->active_io);
+	rcu_read_unlock();
+	rv = mddev->pers->make_request(q, bio);
+	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+		wake_up(&mddev->sb_wait);
+
+	return rv;
+}
+
+static void mddev_suspend(mddev_t *mddev)
+{
+	BUG_ON(mddev->suspended);
+	mddev->suspended = 1;
+	synchronize_rcu();
+	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
+	mddev->pers->quiesce(mddev, 1);
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
+	/* we now know that no code is executing in the personality module,
+	 * except possibly the tail end of a ->bi_end_io function, but that
+	 * is certain to complete before the module has a chance to get
+	 * unloaded
+	 */
+}
+
+static void mddev_resume(mddev_t *mddev)
+{
+	mddev->suspended = 0;
+	wake_up(&mddev->sb_wait);
+	mddev->pers->quiesce(mddev, 0);
 }

 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
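[Editor's example] The pairing above is a classic quiesce gate: a request either sees !suspended and takes a reference, or sleeps until resume; suspend flips the flag, lets an RCU grace period flush any md_make_request calls already past the check, then waits for active_io to drain. A self-contained user-space model of the same handshake, with a mutex and condvar standing in for RCU plus sb_wait (all names here are illustrative, not kernel API):

/* Sketch of the suspend gate: io_enter/io_exit bracket each request,
 * suspend() drains in-flight work, resume() reopens the gate.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int suspended;
static int active_io;

static void io_enter(void)
{
        pthread_mutex_lock(&lock);
        while (suspended)               /* like sleeping on sb_wait */
                pthread_cond_wait(&wake, &lock);
        active_io++;                    /* like atomic_inc(&active_io) */
        pthread_mutex_unlock(&lock);
}

static void io_exit(void)
{
        pthread_mutex_lock(&lock);
        if (--active_io == 0 && suspended)
                pthread_cond_broadcast(&wake);
        pthread_mutex_unlock(&lock);
}

static void suspend(void)
{
        pthread_mutex_lock(&lock);
        suspended = 1;
        while (active_io)               /* drain in-flight requests */
                pthread_cond_wait(&wake, &lock);
        pthread_mutex_unlock(&lock);
}

static void resume(void)
{
        pthread_mutex_lock(&lock);
        suspended = 0;
        pthread_cond_broadcast(&wake);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        io_enter(); io_exit();          /* a request completes... */
        suspend();  resume();           /* ...so suspend doesn't block */
        printf("ok\n");
        return 0;
}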
...@@ -310,6 +365,7 @@ static mddev_t * mddev_find(dev_t unit)
 	init_timer(&new->safemode_timer);
 	atomic_set(&new->active, 1);
 	atomic_set(&new->openers, 0);
+	atomic_set(&new->active_io, 0);
 	spin_lock_init(&new->write_lock);
 	init_waitqueue_head(&new->sb_wait);
 	init_waitqueue_head(&new->recovery_wait);
...@@ -326,6 +382,11 @@ static inline int mddev_lock(mddev_t * mddev)
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
 }

+static inline int mddev_is_locked(mddev_t *mddev)
+{
+	return mutex_is_locked(&mddev->reconfig_mutex);
+}
+
 static inline int mddev_trylock(mddev_t * mddev)
 {
 	return mutex_trylock(&mddev->reconfig_mutex);
...@@ -409,7 +470,7 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 		rdev->sb_loaded = 0;
 		rdev->sb_page = NULL;
 		rdev->sb_start = 0;
-		rdev->size = 0;
+		rdev->sectors = 0;
 	}
 }
...@@ -775,9 +836,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else
 			ret = 0;
 	}
-	rdev->size = calc_num_sectors(rdev, sb->chunk_size) / 2;
+	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);

-	if (rdev->size < sb->size && sb->level > 1)
+	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
...@@ -812,7 +873,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->size = sb->size;
+		mddev->dev_sectors = sb->size * 2;
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
...@@ -926,7 +987,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->ctime = mddev->ctime;
 	sb->level = mddev->level;
-	sb->size = mddev->size;
+	sb->size = mddev->dev_sectors / 2;
 	sb->raid_disks = mddev->raid_disks;
 	sb->md_minor = mddev->md_minor;
 	sb->not_persistent = 0;
...@@ -1024,7 +1085,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 static unsigned long long
 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->mddev->bitmap_offset)
 		return 0; /* can't move bitmap */
...@@ -1180,16 +1241,17 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		ret = 0;
 	}
 	if (minor_version)
-		rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			le64_to_cpu(sb->data_offset);
 	else
-		rdev->size = rdev->sb_start / 2;
-	if (rdev->size < le64_to_cpu(sb->data_size)/2)
+		rdev->sectors = rdev->sb_start;
+	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
-	rdev->size = le64_to_cpu(sb->data_size)/2;
+	rdev->sectors = le64_to_cpu(sb->data_size);
 	if (le32_to_cpu(sb->chunksize))
-		rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);

-	if (le64_to_cpu(sb->size) > rdev->size*2)
+	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
 }
...@@ -1216,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
-		mddev->size = le64_to_cpu(sb->size)/2;
+		mddev->dev_sectors = le64_to_cpu(sb->size);
 		mddev->events = ev1;
 		mddev->bitmap_offset = 0;
 		mddev->default_bitmap_offset = 1024 >> 9;
...@@ -1312,7 +1374,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
-	sb->size = cpu_to_le64(mddev->size<<1);
+	sb->size = cpu_to_le64(mddev->dev_sectors);

 	if (mddev->bitmap && mddev->bitmap_file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
...@@ -1320,10 +1382,15 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	}

 	if (rdev->raid_disk >= 0 &&
-	    !test_bit(In_sync, &rdev->flags) &&
-	    rdev->recovery_offset > 0) {
-		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
-		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	    !test_bit(In_sync, &rdev->flags)) {
+		if (mddev->curr_resync_completed > rdev->recovery_offset)
+			rdev->recovery_offset = mddev->curr_resync_completed;
+		if (rdev->recovery_offset > 0) {
+			sb->feature_map |=
+				cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+			sb->recovery_offset =
+				cpu_to_le64(rdev->recovery_offset);
+		}
 	}

 	if (mddev->reshape_position != MaxSector) {
...@@ -1365,7 +1432,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 {
 	struct mdp_superblock_1 *sb;
 	sector_t max_sectors;
-	if (num_sectors && num_sectors < rdev->mddev->size * 2)
+	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
...@@ -1381,7 +1448,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		sector_t sb_start;
 		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_sectors = rdev->size * 2 + sb_start - rdev->sb_start;
+		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
 		rdev->sb_start = sb_start;
...@@ -1433,6 +1500,38 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
 static LIST_HEAD(pending_raid_disks);

+static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	struct mdk_personality *pers = mddev->pers;
+	struct gendisk *disk = mddev->gendisk;
+	struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
+	struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+
+	/* Data integrity passthrough not supported on RAID 4, 5 and 6 */
+	if (pers && pers->level >= 4 && pers->level <= 6)
+		return;
+
+	/* If rdev is integrity capable, register profile for mddev */
+	if (!bi_mddev && bi_rdev) {
+		if (blk_integrity_register(disk, bi_rdev))
+			printk(KERN_ERR "%s: %s Could not register integrity!\n",
+			       __func__, disk->disk_name);
+		else
+			printk(KERN_NOTICE "Enabling data integrity on %s\n",
+			       disk->disk_name);
+		return;
+	}
+
+	/* Check that mddev and rdev have matching profiles */
+	if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
+		printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
+		       disk->disk_name, rdev->bdev->bd_disk->disk_name);
+		printk(KERN_NOTICE "Disabling data integrity on %s\n",
+		       disk->disk_name);
+		blk_integrity_unregister(disk);
+	}
+}
+
 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 {
 	char b[BDEVNAME_SIZE];
...@@ -1449,8 +1548,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	if (find_rdev(mddev, rdev->bdev->bd_dev))
 		return -EEXIST;

-	/* make sure rdev->size exceeds mddev->size */
-	if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+	/* make sure rdev->sectors exceeds mddev->dev_sectors */
+	if (rdev->sectors && (mddev->dev_sectors == 0 ||
+			rdev->sectors < mddev->dev_sectors)) {
 		if (mddev->pers) {
 			/* Cannot change size, so fail
 			 * If mddev->level <= 0, then we don't care
...@@ -1459,7 +1559,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 			if (mddev->level > 0)
 				return -ENOSPC;
 		} else
-			mddev->size = rdev->size;
+			mddev->dev_sectors = rdev->sectors;
 	}

 	/* Verify rdev->desc_nr is unique.
...@@ -1503,6 +1603,8 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 	/* May as well allow recovery to be retried once */
 	mddev->recovery_disabled = 0;
+
+	md_integrity_check(rdev, mddev);
 	return 0;

  fail:
...@@ -1713,8 +1815,8 @@ static void print_sb_1(struct mdp_superblock_1 *sb)
 static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
-		bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+	printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
+		bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
 		test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
 		rdev->desc_nr);
 	if (rdev->sb_loaded) {
...@@ -2153,7 +2255,7 @@ offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		return -EINVAL;
 	if (rdev->mddev->pers && rdev->raid_disk >= 0)
 		return -EBUSY;
-	if (rdev->size && rdev->mddev->external)
+	if (rdev->sectors && rdev->mddev->external)
 		/* Must set offset before size, so overlap checks
 		 * can be sane */
 		return -EBUSY;
...@@ -2167,7 +2269,7 @@ __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
 static ssize_t
 rdev_size_show(mdk_rdev_t *rdev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
 }

 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
...@@ -2180,34 +2282,52 @@ static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
 	return 1;
 }

+static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
+{
+	unsigned long long blocks;
+	sector_t new;
+
+	if (strict_strtoull(buf, 10, &blocks) < 0)
+		return -EINVAL;
+
+	if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
+		return -EINVAL; /* sector conversion overflow */
+
+	new = blocks * 2;
+	if (new != blocks * 2)
+		return -EINVAL; /* unsigned long long to sector_t overflow */
+
+	*sectors = new;
+	return 0;
+}
+
 static ssize_t
 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 {
-	unsigned long long size;
-	unsigned long long oldsize = rdev->size;
 	mddev_t *my_mddev = rdev->mddev;
+	sector_t oldsectors = rdev->sectors;
+	sector_t sectors;

-	if (strict_strtoull(buf, 10, &size) < 0)
+	if (strict_blocks_to_sectors(buf, &sectors) < 0)
 		return -EINVAL;
 	if (my_mddev->pers && rdev->raid_disk >= 0) {
 		if (my_mddev->persistent) {
-			size = super_types[my_mddev->major_version].
-				rdev_size_change(rdev, size * 2);
-			if (!size)
+			sectors = super_types[my_mddev->major_version].
+				rdev_size_change(rdev, sectors);
+			if (!sectors)
 				return -EBUSY;
-		} else if (!size) {
-			size = (rdev->bdev->bd_inode->i_size >> 10);
-			size -= rdev->data_offset/2;
-		}
+		} else if (!sectors)
+			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+				rdev->data_offset;
 	}
-	if (size < my_mddev->size)
+	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */

-	rdev->size = size;
-	if (size > oldsize && my_mddev->external) {
+	rdev->sectors = sectors;
+	if (sectors > oldsectors && my_mddev->external) {
 		/* need to check that all other rdevs with the same ->bdev
 		 * do not overlap.  We need to unlock the mddev to avoid
-		 * a deadlock.  We have already changed rdev->size, and if
+		 * a deadlock.  We have already changed rdev->sectors, and if
 		 * we have to change it back, we will have the lock again.
 		 */
 		mddev_t *mddev;
...@@ -2223,9 +2343,9 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (test_bit(AllReserved, &rdev2->flags) ||
 			    (rdev->bdev == rdev2->bdev &&
 			     rdev != rdev2 &&
-			     overlaps(rdev->data_offset, rdev->size * 2,
+			     overlaps(rdev->data_offset, rdev->sectors,
 				      rdev2->data_offset,
-				      rdev2->size * 2))) {
+				      rdev2->sectors))) {
 				overlap = 1;
 				break;
 			}
...@@ -2239,11 +2359,11 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		if (overlap) {
 			/* Someone else could have slipped in a size
 			 * change here, but doing so is just silly.
-			 * We put oldsize back because we *know* it is
+			 * We put oldsectors back because we *know* it is
 			 * safe, and trust userspace not to race with
 			 * itself
 			 */
-			rdev->size = oldsize;
+			rdev->sectors = oldsectors;
 			return -EBUSY;
 		}
 	}
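[Editor's example] Both guards in strict_blocks_to_sectors() are overflow checks for the KiB-to-sector doubling: the top bit of the 64-bit block count must be clear (otherwise blocks * 2 wraps), and on builds where sector_t is 32 bits the doubled value must survive the round-trip. A user-space model; sector_t is deliberately forced to 32 bits here so the second guard can actually fire:

/* Model of strict_blocks_to_sectors(): 1K blocks -> 512-byte sectors. */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t sector_t;      /* assume a 32-bit sector_t build */

static int blocks_to_sectors(unsigned long long blocks, sector_t *sectors)
{
        sector_t new;

        if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
                return -1;      /* blocks * 2 would overflow 64 bits */

        new = blocks * 2;
        if (new != blocks * 2)
                return -1;      /* doubled value doesn't fit sector_t */

        *sectors = new;
        return 0;
}

int main(void)
{
        sector_t s;

        printf("%d\n", blocks_to_sectors(1024, &s));       /* 0, s == 2048 */
        printf("%d\n", blocks_to_sectors(3ULL << 31, &s)); /* -1, > 32 bits */
        printf("%d\n", blocks_to_sectors(~0ULL, &s));      /* -1, top bit set */
        return 0;
}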
...@@ -2547,9 +2667,12 @@ level_show(mddev_t *mddev, char *page)
 static ssize_t
 level_store(mddev_t *mddev, const char *buf, size_t len)
 {
+	char level[16];
 	ssize_t rv = len;
-	if (mddev->pers)
-		return -EBUSY;
+	struct mdk_personality *pers;
+	void *priv;
+
+	if (mddev->pers == NULL) {
 	if (len == 0)
 		return 0;
 	if (len >= sizeof(mddev->clevel))
...@@ -2560,6 +2683,86 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->clevel[len] = 0;
 	mddev->level = LEVEL_NONE;
 	return rv;
+	}
+
+	/* request to change the personality.  Need to ensure:
+	 *  - array is not engaged in resync/recovery/reshape
+	 *  - old personality can be suspended
+	 *  - new personality will access other array.
+	 */
+	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
+		return -EBUSY;
+
+	if (!mddev->pers->quiesce) {
+		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+		       mdname(mddev), mddev->pers->name);
+		return -EINVAL;
+	}
+
+	/* Now find the new personality */
+	if (len == 0 || len >= sizeof(level))
+		return -EINVAL;
+	strncpy(level, buf, len);
+	if (level[len-1] == '\n')
+		len--;
+	level[len] = 0;
+
+	request_module("md-%s", level);
+	spin_lock(&pers_lock);
+	pers = find_pers(LEVEL_NONE, level);
+	if (!pers || !try_module_get(pers->owner)) {
+		spin_unlock(&pers_lock);
+		printk(KERN_WARNING "md: personality %s not loaded\n", level);
+		return -EINVAL;
+	}
+	spin_unlock(&pers_lock);
+
+	if (pers == mddev->pers) {
+		/* Nothing to do! */
+		module_put(pers->owner);
+		return rv;
+	}
+	if (!pers->takeover) {
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+		       mdname(mddev), level);
+		return -EINVAL;
+	}
+
+	/* ->takeover must set new_* and/or delta_disks
+	 * if it succeeds, and may set them when it fails.
+	 */
+	priv = pers->takeover(mddev);
+	if (IS_ERR(priv)) {
+		mddev->new_level = mddev->level;
+		mddev->new_layout = mddev->layout;
+		mddev->new_chunk = mddev->chunk_size;
+		mddev->raid_disks -= mddev->delta_disks;
+		mddev->delta_disks = 0;
+		module_put(pers->owner);
+		printk(KERN_WARNING "md: %s: %s would not accept array\n",
+		       mdname(mddev), level);
+		return PTR_ERR(priv);
+	}
+
+	/* Looks like we have a winner */
+	mddev_suspend(mddev);
+	mddev->pers->stop(mddev);
+	module_put(mddev->pers->owner);
+	mddev->pers = pers;
+	mddev->private = priv;
+	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+	mddev->level = mddev->new_level;
+	mddev->layout = mddev->new_layout;
+	mddev->chunk_size = mddev->new_chunk;
+	mddev->delta_disks = 0;
+	pers->run(mddev);
+	mddev_resume(mddev);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_wakeup_thread(mddev->thread);
+	return rv;
 }

 static struct md_sysfs_entry md_level =
...@@ -2586,12 +2789,18 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;

-	if (mddev->pers)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
 			return -EBUSY;
-	if (mddev->reshape_position != MaxSector)
+		err = mddev->pers->reconfig(mddev, n, -1);
+		if (err)
+			return err;
+	} else {
 		mddev->new_layout = n;
-	else
-		mddev->layout = n;
+		if (mddev->reshape_position == MaxSector)
+			mddev->layout = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_layout =
...@@ -2648,19 +2857,24 @@ chunk_size_show(mddev_t *mddev, char *page)
 static ssize_t
 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 {
-	/* can only set chunk_size if array is not yet active */
 	char *e;
 	unsigned long n = simple_strtoul(buf, &e, 10);

 	if (!*buf || (*e && *e != '\n'))
 		return -EINVAL;

-	if (mddev->pers)
+	if (mddev->pers) {
+		int err;
+		if (mddev->pers->reconfig == NULL)
 			return -EBUSY;
-	else if (mddev->reshape_position != MaxSector)
+		err = mddev->pers->reconfig(mddev, -1, n);
+		if (err)
+			return err;
+	} else {
 		mddev->new_chunk = n;
-	else
-		mddev->chunk_size = n;
+		if (mddev->reshape_position == MaxSector)
+			mddev->chunk_size = n;
+	}
 	return len;
 }
 static struct md_sysfs_entry md_chunk_size =
...@@ -2669,6 +2883,8 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
 static ssize_t
 resync_start_show(mddev_t *mddev, char *page)
 {
+	if (mddev->recovery_cp == MaxSector)
+		return sprintf(page, "none\n");
 	return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
 }
...@@ -2766,7 +2982,7 @@ array_state_show(mddev_t *mddev, char *page)
 	else {
 		if (list_empty(&mddev->disks) &&
 		    mddev->raid_disks == 0 &&
-		    mddev->size == 0)
+		    mddev->dev_sectors == 0)
 			st = clear;
 		else
 			st = inactive;
...@@ -2973,7 +3189,8 @@ __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
 static ssize_t
 size_show(mddev_t *mddev, char *page)
 {
-	return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+	return sprintf(page, "%llu\n",
+		(unsigned long long)mddev->dev_sectors / 2);
 }

 static int update_size(mddev_t *mddev, sector_t num_sectors);
...@@ -2985,20 +3202,18 @@ size_store(mddev_t *mddev, const char *buf, size_t len)
 	 * not increase it (except from 0).
 	 * If array is active, we can try an on-line resize
 	 */
-	char *e;
-	int err = 0;
-	unsigned long long size = simple_strtoull(buf, &e, 10);
-	if (!*buf || *buf == '\n' ||
-	    (*e && *e != '\n'))
-		return -EINVAL;
+	sector_t sectors;
+	int err = strict_blocks_to_sectors(buf, &sectors);

+	if (err < 0)
+		return err;
 	if (mddev->pers) {
-		err = update_size(mddev, size * 2);
+		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
 	} else {
-		if (mddev->size == 0 ||
-		    mddev->size > size)
-			mddev->size = size;
+		if (mddev->dev_sectors == 0 ||
+		    mddev->dev_sectors > sectors)
+			mddev->dev_sectors = sectors;
 		else
 			err = -ENOSPC;
 	}
...@@ -3251,6 +3466,8 @@ static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
 {
 	unsigned long resync, dt, db;
+	if (mddev->curr_resync == 0)
+		return sprintf(page, "none\n");
 	resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
 	dt = (jiffies - mddev->resync_mark) / HZ;
 	if (!dt) dt++;
...@@ -3263,15 +3480,15 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
 static ssize_t
 sync_completed_show(mddev_t *mddev, char *page)
 {
-	unsigned long max_blocks, resync;
+	unsigned long max_sectors, resync;

 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
-		max_blocks = mddev->resync_max_sectors;
+		max_sectors = mddev->resync_max_sectors;
 	else
-		max_blocks = mddev->size << 1;
+		max_sectors = mddev->dev_sectors;

 	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
-	return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+	return sprintf(page, "%lu / %lu\n", resync, max_sectors);
 }

 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
...@@ -3431,6 +3648,57 @@ static struct md_sysfs_entry md_reshape_position =
 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
        reshape_position_store);

+static ssize_t
+array_size_show(mddev_t *mddev, char *page)
+{
+	if (mddev->external_size)
+		return sprintf(page, "%llu\n",
+			       (unsigned long long)mddev->array_sectors/2);
+	else
+		return sprintf(page, "default\n");
+}
+
+static ssize_t
+array_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	sector_t sectors;
+
+	if (strncmp(buf, "default", 7) == 0) {
+		if (mddev->pers)
+			sectors = mddev->pers->size(mddev, 0, 0);
+		else
+			sectors = mddev->array_sectors;
+
+		mddev->external_size = 0;
+	} else {
+		if (strict_blocks_to_sectors(buf, &sectors) < 0)
+			return -EINVAL;
+		if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
+			return -EINVAL;
+
+		mddev->external_size = 1;
+	}
+
+	mddev->array_sectors = sectors;
+	set_capacity(mddev->gendisk, mddev->array_sectors);
+	if (mddev->pers) {
+		struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
+
+		if (bdev) {
+			mutex_lock(&bdev->bd_inode->i_mutex);
+			i_size_write(bdev->bd_inode,
+				     (loff_t)mddev->array_sectors << 9);
+			mutex_unlock(&bdev->bd_inode->i_mutex);
+			bdput(bdev);
+		}
+	}
+
+	return len;
+}
+
+static struct md_sysfs_entry md_array_size =
+__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
+       array_size_store);

 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
...@@ -3444,6 +3712,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_safe_delay.attr,
 	&md_array_state.attr,
 	&md_reshape_position.attr,
+	&md_array_size.attr,
 	NULL,
 };
...@@ -3602,10 +3871,12 @@ static int md_alloc(dev_t dev, char *name)
 		mddev_put(mddev);
 		return -ENOMEM;
 	}
+	mddev->queue->queuedata = mddev;
+
 	/* Can be unlocked because the queue is new: no concurrency */
 	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);

-	blk_queue_make_request(mddev->queue, md_fail_request);
+	blk_queue_make_request(mddev->queue, md_make_request);

 	disk = alloc_disk(1 << shift);
 	if (!disk) {
...@@ -3731,13 +4002,13 @@ static int do_md_run(mddev_t * mddev)
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (test_bit(Faulty, &rdev->flags))
 				continue;
-			if (rdev->size < chunk_size / 1024) {
+			if (rdev->sectors < chunk_size / 512) {
 				printk(KERN_WARNING
 					"md: Dev %s smaller than chunk_size:"
-					" %lluk < %dk\n",
+					" %llu < %d\n",
 					bdevname(rdev->bdev,b),
-					(unsigned long long)rdev->size,
-					chunk_size / 1024);
+					(unsigned long long)rdev->sectors,
+					chunk_size / 512);
 				return -EINVAL;
 			}
 		}
...@@ -3761,11 +4032,11 @@ static int do_md_run(mddev_t * mddev)

 		/* perform some consistency tests on the device.
 		 * We don't want the data to overlap the metadata,
-		 * Internal Bitmap issues has handled elsewhere.
+		 * Internal Bitmap issues have been handled elsewhere.
 		 */
 		if (rdev->data_offset < rdev->sb_start) {
-			if (mddev->size &&
-			    rdev->data_offset + mddev->size*2
+			if (mddev->dev_sectors &&
+			    rdev->data_offset + mddev->dev_sectors
 			    > rdev->sb_start) {
 				printk("md: %s: data overlaps metadata\n",
 				       mdname(mddev));
...@@ -3801,9 +4072,16 @@ static int do_md_run(mddev_t * mddev)
 	}
 	mddev->pers = pers;
 	spin_unlock(&pers_lock);
+	if (mddev->level != pers->level) {
 		mddev->level = pers->level;
+		mddev->new_level = pers->level;
+	}
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));

+	if (pers->level >= 4 && pers->level <= 6)
+		/* Cannot support integrity (yet) */
+		blk_integrity_unregister(mddev->gendisk);
+
 	if (mddev->reshape_position != MaxSector &&
 	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
...@@ -3843,7 +4121,9 @@ static int do_md_run(mddev_t * mddev)
 	}

 	mddev->recovery = 0;
-	mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+	/* may be over-ridden by personality */
+	mddev->resync_max_sectors = mddev->dev_sectors;
+
 	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
...@@ -3853,7 +4133,17 @@ static int do_md_run(mddev_t * mddev)
 	err = mddev->pers->run(mddev);
 	if (err)
 		printk(KERN_ERR "md: pers->run() failed ...\n");
-	else if (mddev->pers->sync_request) {
+	else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
+		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
+			  " but 'external_size' not in effect?\n", __func__);
+		printk(KERN_ERR
+		       "md: invalid array_size %llu > default size %llu\n",
+		       (unsigned long long)mddev->array_sectors / 2,
+		       (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
+		err = -EINVAL;
+		mddev->pers->stop(mddev);
+	}
+	if (err == 0 && mddev->pers->sync_request) {
 		err = bitmap_create(mddev);
 		if (err) {
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
...@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev) ...@@ -3899,16 +4189,6 @@ static int do_md_run(mddev_t * mddev)
set_capacity(disk, mddev->array_sectors); set_capacity(disk, mddev->array_sectors);
/* If we call blk_queue_make_request here, it will
* re-initialise max_sectors etc which may have been
* refined inside -> run. So just set the bits we need to set.
* Most initialisation happended when we called
* blk_queue_make_request(..., md_fail_request)
* earlier.
*/
mddev->queue->queuedata = mddev;
mddev->queue->make_request_fn = mddev->pers->make_request;
/* If there is a partially-recovered drive we need to /* If there is a partially-recovered drive we need to
* start recovery here. If we leave it to md_check_recovery, * start recovery here. If we leave it to md_check_recovery,
* it will remove the drives and not do the right thing * it will remove the drives and not do the right thing
...@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) ...@@ -4038,7 +4318,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
md_super_wait(mddev); md_super_wait(mddev);
if (mddev->ro) if (mddev->ro)
set_disk_ro(disk, 0); set_disk_ro(disk, 0);
blk_queue_make_request(mddev->queue, md_fail_request);
mddev->pers->stop(mddev); mddev->pers->stop(mddev);
mddev->queue->merge_bvec_fn = NULL; mddev->queue->merge_bvec_fn = NULL;
mddev->queue->unplug_fn = NULL; mddev->queue->unplug_fn = NULL;
...@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) ...@@ -4095,7 +4375,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
export_array(mddev); export_array(mddev);
mddev->array_sectors = 0; mddev->array_sectors = 0;
mddev->size = 0; mddev->external_size = 0;
mddev->dev_sectors = 0;
mddev->raid_disks = 0; mddev->raid_disks = 0;
mddev->recovery_cp = 0; mddev->recovery_cp = 0;
mddev->resync_min = 0; mddev->resync_min = 0;
...@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) ...@@ -4135,6 +4416,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
printk(KERN_INFO "md: %s switched to read-only mode.\n", printk(KERN_INFO "md: %s switched to read-only mode.\n",
mdname(mddev)); mdname(mddev));
err = 0; err = 0;
blk_integrity_unregister(disk);
md_new_event(mddev); md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state); sysfs_notify_dirent(mddev->sysfs_state);
out: out:
...@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg) ...@@ -4300,8 +4582,8 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
info.patch_version = MD_PATCHLEVEL_VERSION; info.patch_version = MD_PATCHLEVEL_VERSION;
info.ctime = mddev->ctime; info.ctime = mddev->ctime;
info.level = mddev->level; info.level = mddev->level;
info.size = mddev->size; info.size = mddev->dev_sectors / 2;
if (info.size != mddev->size) /* overflow */ if (info.size != mddev->dev_sectors / 2) /* overflow */
info.size = -1; info.size = -1;
info.nr_disks = nr; info.nr_disks = nr;
info.raid_disks = mddev->raid_disks; info.raid_disks = mddev->raid_disks;
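The info.size field handed back to userspace is a 32-bit count of KB, so a large array cannot be represented directly; the comparison above detects the truncation and reports -1 instead. A standalone sketch of that guard (types and values are illustrative only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev_sectors = 5368709120ULL;	/* a ~2.5 TiB component device */
	uint64_t kb = dev_sectors / 2;		/* 2684354560 KB, above INT_MAX */
	int size = (int)kb;			/* truncated (implementation-defined) */

	if ((uint64_t)size != kb)		/* overflow, as in the kernel check */
		size = -1;

	printf("reported size: %d KB\n", size);
	return 0;
}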
...@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -4480,6 +4762,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
clear_bit(In_sync, &rdev->flags); /* just to be sure */ clear_bit(In_sync, &rdev->flags); /* just to be sure */
if (info->state & (1<<MD_DISK_WRITEMOSTLY)) if (info->state & (1<<MD_DISK_WRITEMOSTLY))
set_bit(WriteMostly, &rdev->flags); set_bit(WriteMostly, &rdev->flags);
else
clear_bit(WriteMostly, &rdev->flags);
rdev->raid_disk = -1; rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
...@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) ...@@ -4543,7 +4827,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
} else } else
rdev->sb_start = calc_dev_sboffset(rdev->bdev); rdev->sb_start = calc_dev_sboffset(rdev->bdev);
rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) { if (err) {
...@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) ...@@ -4613,7 +4897,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
else else
rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
rdev->size = calc_num_sectors(rdev, mddev->chunk_size) / 2; rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
printk(KERN_WARNING printk(KERN_WARNING
...@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) ...@@ -4749,7 +5033,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
mddev->level = info->level; mddev->level = info->level;
mddev->clevel[0] = 0; mddev->clevel[0] = 0;
mddev->size = info->size; mddev->dev_sectors = 2 * (sector_t)info->size;
mddev->raid_disks = info->raid_disks; mddev->raid_disks = info->raid_disks;
/* don't set md_minor, it is determined by which /dev/md* was /* don't set md_minor, it is determined by which /dev/md* was
* opened * opened
...@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) ...@@ -4788,6 +5072,17 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
return 0; return 0;
} }
void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
{
WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
if (mddev->external_size)
return;
mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);
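md_set_array_sectors() becomes a no-op once userspace has imposed an explicit size, so a personality's default can no longer silently override it. A minimal userspace model of this gating, with names mirroring the kernel fields but purely illustrative:

#include <stdio.h>
#include <stdint.h>

struct mddev_model {
	uint64_t array_sectors;
	int external_size;	/* size managed externally */
};

static void set_array_sectors_model(struct mddev_model *m, uint64_t sectors)
{
	if (m->external_size)
		return;		/* a user-imposed size wins */
	m->array_sectors = sectors;
}

int main(void)
{
	struct mddev_model m = { 0, 0 };

	set_array_sectors_model(&m, 1000);	/* personality default: applied */
	m.external_size = 1;			/* user writes 'array_size' */
	m.array_sectors = 800;
	set_array_sectors_model(&m, 1000);	/* personality default: ignored */

	printf("array_sectors = %llu\n", (unsigned long long)m.array_sectors);
	return 0;				/* prints 800 */
}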
static int update_size(mddev_t *mddev, sector_t num_sectors) static int update_size(mddev_t *mddev, sector_t num_sectors)
{ {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) ...@@ -4814,8 +5109,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
*/ */
return -EBUSY; return -EBUSY;
list_for_each_entry(rdev, &mddev->disks, same_set) { list_for_each_entry(rdev, &mddev->disks, same_set) {
sector_t avail; sector_t avail = rdev->sectors;
avail = rdev->size * 2;
if (fit && (num_sectors == 0 || num_sectors > avail)) if (fit && (num_sectors == 0 || num_sectors > avail))
num_sectors = avail; num_sectors = avail;
...@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) ...@@ -4887,12 +5181,18 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
) )
return -EINVAL; return -EINVAL;
/* Check there is only one change */ /* Check there is only one change */
if (info->size >= 0 && mddev->size != info->size) cnt++; if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
if (mddev->raid_disks != info->raid_disks) cnt++; cnt++;
if (mddev->layout != info->layout) cnt++; if (mddev->raid_disks != info->raid_disks)
if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++; cnt++;
if (cnt == 0) return 0; if (mddev->layout != info->layout)
if (cnt > 1) return -EINVAL; cnt++;
if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
cnt++;
if (cnt == 0)
return 0;
if (cnt > 1)
return -EINVAL;
if (mddev->layout != info->layout) { if (mddev->layout != info->layout) {
/* Change layout /* Change layout
...@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) ...@@ -4904,7 +5204,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
else else
return mddev->pers->reconfig(mddev, info->layout, -1); return mddev->pers->reconfig(mddev, info->layout, -1);
} }
if (info->size >= 0 && mddev->size != info->size) if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2); rv = update_size(mddev, (sector_t)info->size * 2);
if (mddev->raid_disks != info->raid_disks) if (mddev->raid_disks != info->raid_disks)
...@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, ...@@ -5331,6 +5631,8 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
void md_unregister_thread(mdk_thread_t *thread) void md_unregister_thread(mdk_thread_t *thread)
{ {
if (!thread)
return;
dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
kthread_stop(thread->tsk); kthread_stop(thread->tsk);
...@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev) ...@@ -5404,7 +5706,7 @@ static void status_resync(struct seq_file *seq, mddev_t * mddev)
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_blocks = mddev->resync_max_sectors >> 1; max_blocks = mddev->resync_max_sectors >> 1;
else else
max_blocks = mddev->size; max_blocks = mddev->dev_sectors / 2;
/* /*
* Should not happen. * Should not happen.
...@@ -5537,7 +5839,7 @@ struct mdstat_info { ...@@ -5537,7 +5839,7 @@ struct mdstat_info {
static int md_seq_show(struct seq_file *seq, void *v) static int md_seq_show(struct seq_file *seq, void *v)
{ {
mddev_t *mddev = v; mddev_t *mddev = v;
sector_t size; sector_t sectors;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
struct mdstat_info *mi = seq->private; struct mdstat_info *mi = seq->private;
struct bitmap *bitmap; struct bitmap *bitmap;
...@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -5573,7 +5875,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, " %s", mddev->pers->name); seq_printf(seq, " %s", mddev->pers->name);
} }
size = 0; sectors = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) { list_for_each_entry(rdev, &mddev->disks, same_set) {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]", seq_printf(seq, " %s[%d]",
...@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -5585,7 +5887,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
continue; continue;
} else if (rdev->raid_disk < 0) } else if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */ seq_printf(seq, "(S)"); /* spare */
size += rdev->size; sectors += rdev->sectors;
} }
if (!list_empty(&mddev->disks)) { if (!list_empty(&mddev->disks)) {
...@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -5595,7 +5897,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
mddev->array_sectors / 2); mddev->array_sectors / 2);
else else
seq_printf(seq, "\n %llu blocks", seq_printf(seq, "\n %llu blocks",
(unsigned long long)size); (unsigned long long)sectors / 2);
} }
if (mddev->persistent) { if (mddev->persistent) {
if (mddev->major_version != 0 || if (mddev->major_version != 0 ||
...@@ -5722,18 +6024,18 @@ int unregister_md_personality(struct mdk_personality *p) ...@@ -5722,18 +6024,18 @@ int unregister_md_personality(struct mdk_personality *p)
return 0; return 0;
} }
static int is_mddev_idle(mddev_t *mddev) static int is_mddev_idle(mddev_t *mddev, int init)
{ {
mdk_rdev_t * rdev; mdk_rdev_t * rdev;
int idle; int idle;
long curr_events; int curr_events;
idle = 1; idle = 1;
rcu_read_lock(); rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) { rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
curr_events = part_stat_read(&disk->part0, sectors[0]) + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
part_stat_read(&disk->part0, sectors[1]) - (int)part_stat_read(&disk->part0, sectors[1]) -
atomic_read(&disk->sync_io); atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats /* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and * as sync_io is counted when a request starts, and
...@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev) ...@@ -5757,7 +6059,7 @@ static int is_mddev_idle(mddev_t *mddev)
* always make curr_events less than last_events. * always make curr_events less than last_events.
* *
*/ */
if (curr_events - rdev->last_events > 4096) { if (init || curr_events - rdev->last_events > 64) {
rdev->last_events = curr_events; rdev->last_events = curr_events;
idle = 0; idle = 0;
} }
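The event counters here are now deliberately plain 'int': the block-layer per-disk sector counts and sync_io are 32-bit and wrap, and modular 32-bit subtraction keeps the delta small and meaningful across the wrap. A small demonstration of that property (values are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned int last = 0x7ffffff0u;	/* counter just below 2^31 */
	unsigned int curr = last + 0x64u;	/* 100 more events; crosses the
						 * signed 32-bit boundary */
	int delta = (int)(curr - last);		/* modular subtraction: still 100 */

	printf("delta = %d (idle threshold is 64)\n", delta);
	return 0;
}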
...@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev) ...@@ -5980,10 +6282,10 @@ void md_do_sync(mddev_t *mddev)
j = mddev->recovery_cp; j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->size << 1; max_sectors = mddev->dev_sectors;
else { else {
/* recovery follows the physical size of devices */ /* recovery follows the physical size of devices */
max_sectors = mddev->size << 1; max_sectors = mddev->dev_sectors;
j = MaxSector; j = MaxSector;
list_for_each_entry(rdev, &mddev->disks, same_set) list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
...@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev) ...@@ -6000,7 +6302,7 @@ void md_do_sync(mddev_t *mddev)
"(but not more than %d KB/sec) for %s.\n", "(but not more than %d KB/sec) for %s.\n",
speed_max(mddev), desc); speed_max(mddev), desc);
is_mddev_idle(mddev); /* this also initializes IO event counters */ is_mddev_idle(mddev, 1); /* this initializes IO event counters */
io_sectors = 0; io_sectors = 0;
for (m = 0; m < SYNC_MARKS; m++) { for (m = 0; m < SYNC_MARKS; m++) {
...@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev) ...@@ -6040,6 +6342,18 @@ void md_do_sync(mddev_t *mddev)
} }
if (kthread_should_stop()) if (kthread_should_stop())
goto interrupted; goto interrupted;
if (mddev->curr_resync > mddev->curr_resync_completed &&
(mddev->curr_resync - mddev->curr_resync_completed)
> (max_sectors >> 4)) {
/* time to update curr_resync_completed */
blk_unplug(mddev->queue);
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed =
mddev->curr_resync;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
}
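The block above checkpoints resync progress: curr_resync_completed only advances once all in-flight requests have drained, and at most every max_sectors/16. A rough userspace model of that cadence (extent and step sizes are made up; the queue drain is only noted in a comment):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t max_sectors = 1 << 20;		/* hypothetical resync extent */
	uint64_t step = max_sectors >> 4;	/* checkpoint granularity */
	uint64_t curr_resync = 0, completed = 0;

	while (curr_resync < max_sectors) {
		curr_resync += 4096;		/* pretend sync_request() advanced */
		if (curr_resync - completed > step) {
			/* the kernel first waits for recovery_active == 0,
			 * then marks the superblock for update */
			completed = curr_resync;
			printf("checkpoint at %llu\n",
			       (unsigned long long)completed);
		}
	}
	return 0;
}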
sectors = mddev->pers->sync_request(mddev, j, &skipped, sectors = mddev->pers->sync_request(mddev, j, &skipped,
currspeed < speed_min(mddev)); currspeed < speed_min(mddev));
if (sectors == 0) { if (sectors == 0) {
...@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev) ...@@ -6102,7 +6416,7 @@ void md_do_sync(mddev_t *mddev)
if (currspeed > speed_min(mddev)) { if (currspeed > speed_min(mddev)) {
if ((currspeed > speed_max(mddev)) || if ((currspeed > speed_max(mddev)) ||
!is_mddev_idle(mddev)) { !is_mddev_idle(mddev, 0)) {
msleep(500); msleep(500);
goto repeat; goto repeat;
} }
...@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev) ...@@ -6173,6 +6487,8 @@ static int remove_and_add_spares(mddev_t *mddev)
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
int spares = 0; int spares = 0;
mddev->curr_resync_completed = 0;
list_for_each_entry(rdev, &mddev->disks, same_set) list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) && !test_bit(Blocked, &rdev->flags) &&
...@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev) ...@@ -6327,6 +6643,9 @@ void md_check_recovery(mddev_t *mddev)
sysfs_notify(&mddev->kobj, NULL, sysfs_notify(&mddev->kobj, NULL,
"degraded"); "degraded");
} }
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
/* if array is no-longer degraded, then any saved_raid_disk /* if array is no-longer degraded, then any saved_raid_disk
...@@ -6470,13 +6789,13 @@ static void md_geninit(void) ...@@ -6470,13 +6789,13 @@ static void md_geninit(void)
static int __init md_init(void) static int __init md_init(void)
{ {
if (register_blkdev(MAJOR_NR, "md")) if (register_blkdev(MD_MAJOR, "md"))
return -1; return -1;
if ((mdp_major=register_blkdev(0, "mdp"))<=0) { if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
unregister_blkdev(MAJOR_NR, "md"); unregister_blkdev(MD_MAJOR, "md");
return -1; return -1;
} }
blk_register_region(MKDEV(MAJOR_NR, 0), 1UL<<MINORBITS, THIS_MODULE, blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL); md_probe, NULL, NULL);
blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL); md_probe, NULL, NULL);
...@@ -6562,10 +6881,10 @@ static __exit void md_exit(void) ...@@ -6562,10 +6881,10 @@ static __exit void md_exit(void)
mddev_t *mddev; mddev_t *mddev;
struct list_head *tmp; struct list_head *tmp;
blk_unregister_region(MKDEV(MAJOR_NR,0), 1U << MINORBITS); blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
unregister_blkdev(MAJOR_NR,"md"); unregister_blkdev(MD_MAJOR,"md");
unregister_blkdev(mdp_major, "mdp"); unregister_blkdev(mdp_major, "mdp");
unregister_reboot_notifier(&md_notifier); unregister_reboot_notifier(&md_notifier);
unregister_sysctl_table(raid_table_header); unregister_sysctl_table(raid_table_header);
......
...@@ -15,21 +15,8 @@ ...@@ -15,21 +15,8 @@
#ifndef _MD_K_H #ifndef _MD_K_H
#define _MD_K_H #define _MD_K_H
/* and dm-bio-list.h is not under include/linux because.... ??? */
#include "../../../drivers/md/dm-bio-list.h"
#ifdef CONFIG_BLOCK #ifdef CONFIG_BLOCK
#define LEVEL_MULTIPATH (-4)
#define LEVEL_LINEAR (-1)
#define LEVEL_FAULTY (-5)
/* we need a value for 'no level specified' and 0
* means 'raid0', so we need something else. This is
* for internal use only
*/
#define LEVEL_NONE (-1000000)
#define MaxSector (~(sector_t)0) #define MaxSector (~(sector_t)0)
typedef struct mddev_s mddev_t; typedef struct mddev_s mddev_t;
...@@ -49,9 +36,9 @@ struct mdk_rdev_s ...@@ -49,9 +36,9 @@ struct mdk_rdev_s
{ {
struct list_head same_set; /* RAID devices within the same set */ struct list_head same_set; /* RAID devices within the same set */
sector_t size; /* Device size (in blocks) */ sector_t sectors; /* Device size (in 512bytes sectors) */
mddev_t *mddev; /* RAID array if running */ mddev_t *mddev; /* RAID array if running */
long last_events; /* IO event timestamp */ int last_events; /* IO event timestamp */
struct block_device *bdev; /* block device handle */ struct block_device *bdev; /* block device handle */
...@@ -132,6 +119,8 @@ struct mddev_s ...@@ -132,6 +119,8 @@ struct mddev_s
#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
#define MD_CHANGE_PENDING 2 /* superblock update in progress */ #define MD_CHANGE_PENDING 2 /* superblock update in progress */
int suspended;
atomic_t active_io;
int ro; int ro;
struct gendisk *gendisk; struct gendisk *gendisk;
...@@ -155,8 +144,11 @@ struct mddev_s ...@@ -155,8 +144,11 @@ struct mddev_s
char clevel[16]; char clevel[16];
int raid_disks; int raid_disks;
int max_disks; int max_disks;
sector_t size; /* used size of component devices */ sector_t dev_sectors; /* used size of
* component devices */
sector_t array_sectors; /* exported array size */ sector_t array_sectors; /* exported array size */
int external_size; /* size managed
* externally */
__u64 events; __u64 events;
char uuid[16]; char uuid[16];
...@@ -172,6 +164,13 @@ struct mddev_s ...@@ -172,6 +164,13 @@ struct mddev_s
struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
sector_t curr_resync; /* last block scheduled */ sector_t curr_resync; /* last block scheduled */
/* As resync requests can complete out of order, we cannot easily track
* how much resync has been completed. So we occasionally pause until
* everything completes, then set curr_resync_completed to curr_resync.
* As such it may be well behind the real resync mark, but it is a value
* we are certain of.
*/
sector_t curr_resync_completed;
unsigned long resync_mark; /* a recent timestamp */ unsigned long resync_mark; /* a recent timestamp */
sector_t resync_mark_cnt;/* blocks written at resync_mark */ sector_t resync_mark_cnt;/* blocks written at resync_mark */
sector_t curr_mark_cnt; /* blocks scheduled now */ sector_t curr_mark_cnt; /* blocks scheduled now */
...@@ -315,8 +314,10 @@ struct mdk_personality ...@@ -315,8 +314,10 @@ struct mdk_personality
int (*spare_active) (mddev_t *mddev); int (*spare_active) (mddev_t *mddev);
sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster);
int (*resize) (mddev_t *mddev, sector_t sectors); int (*resize) (mddev_t *mddev, sector_t sectors);
sector_t (*size) (mddev_t *mddev, sector_t sectors, int raid_disks);
int (*check_reshape) (mddev_t *mddev); int (*check_reshape) (mddev_t *mddev);
int (*start_reshape) (mddev_t *mddev); int (*start_reshape) (mddev_t *mddev);
void (*finish_reshape) (mddev_t *mddev);
int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
/* quiesce moves between quiescence states /* quiesce moves between quiescence states
* 0 - fully active * 0 - fully active
...@@ -324,6 +325,16 @@ struct mdk_personality ...@@ -324,6 +325,16 @@ struct mdk_personality
* others - reserved * others - reserved
*/ */
void (*quiesce) (mddev_t *mddev, int state); void (*quiesce) (mddev_t *mddev, int state);
/* takeover is used to transition an array from one
* personality to another. The new personality must be able
* to handle the data in the current layout.
* e.g. 2-drive raid1 -> 2-drive raid5
* n-drive raid5 -> degraded (n+1)-drive raid6 with special layout
* If the takeover succeeds, a new 'private' structure is returned.
* This needs to be installed and then ->run used to activate the
* array.
*/
void *(*takeover) (mddev_t *mddev);
}; };
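A hedged sketch of what a ->takeover implementation could look like, modelled in plain userspace C rather than the kernel API. The raid1 -> raid5 case is the 2-drive example from the comment above; every structure and function name here is invented:

#include <stdio.h>
#include <stdlib.h>

struct array_model { int level; int raid_disks; };
struct r5_private { int raid_disks; int layout; };

/* invented name and shape; returns a new private struct or NULL */
static void *raid5_takeover_model(struct array_model *a)
{
	if (a->level == 1 && a->raid_disks == 2) {
		struct r5_private *p = malloc(sizeof(*p));

		if (!p)
			return NULL;
		p->raid_disks = 2;
		p->layout = 0;	/* placeholder layout value */
		return p;	/* caller installs this, then calls ->run */
	}
	return NULL;		/* the kernel would return an ERR_PTR */
}

int main(void)
{
	struct array_model a = { 1, 2 };
	void *priv = raid5_takeover_model(&a);

	printf("takeover %s\n", priv ? "accepted" : "refused");
	free(priv);
	return 0;
}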
...@@ -400,3 +411,26 @@ static inline void safe_put_page(struct page *p) ...@@ -400,3 +411,26 @@ static inline void safe_put_page(struct page *p)
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLOCK */
#endif #endif
extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
...@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) ...@@ -59,7 +59,7 @@ int main(int argc, char *argv[])
uint8_t v; uint8_t v;
uint8_t exptbl[256], invtbl[256]; uint8_t exptbl[256], invtbl[256];
printf("#include \"raid6.h\"\n"); printf("#include <linux/raid/pq.h>\n");
/* Compute multiplication table */ /* Compute multiplication table */
printf("\nconst u8 __attribute__((aligned(256)))\n" printf("\nconst u8 __attribute__((aligned(256)))\n"
...@@ -76,6 +76,9 @@ int main(int argc, char *argv[]) ...@@ -76,6 +76,9 @@ int main(int argc, char *argv[])
printf("\t},\n"); printf("\t},\n");
} }
printf("};\n"); printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfmul);\n");
printf("#endif\n");
/* Compute power-of-2 table (exponent) */ /* Compute power-of-2 table (exponent) */
v = 1; v = 1;
...@@ -92,6 +95,9 @@ int main(int argc, char *argv[]) ...@@ -92,6 +95,9 @@ int main(int argc, char *argv[])
} }
} }
printf("};\n"); printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfexp);\n");
printf("#endif\n");
/* Compute inverse table x^-1 == x^254 */ /* Compute inverse table x^-1 == x^254 */
printf("\nconst u8 __attribute__((aligned(256)))\n" printf("\nconst u8 __attribute__((aligned(256)))\n"
...@@ -104,6 +110,9 @@ int main(int argc, char *argv[]) ...@@ -104,6 +110,9 @@ int main(int argc, char *argv[])
} }
} }
printf("};\n"); printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfinv);\n");
printf("#endif\n");
/* Compute inv(2^x + 1) (exponent-xor-inverse) table */ /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
printf("\nconst u8 __attribute__((aligned(256)))\n" printf("\nconst u8 __attribute__((aligned(256)))\n"
...@@ -115,6 +124,9 @@ int main(int argc, char *argv[]) ...@@ -115,6 +124,9 @@ int main(int argc, char *argv[])
(j == 7) ? '\n' : ' '); (j == 7) ? '\n' : ' ');
} }
printf("};\n"); printf("};\n");
printf("#ifdef __KERNEL__\n");
printf("EXPORT_SYMBOL(raid6_gfexi);\n");
printf("#endif\n");
return 0; return 0;
} }
...@@ -19,7 +19,11 @@ ...@@ -19,7 +19,11 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ */
#include <linux/raid/multipath.h> #include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include <linux/seq_file.h>
#include "md.h"
#include "multipath.h"
#define MAX_WORK_PER_DISK 128 #define MAX_WORK_PER_DISK 128
...@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev) ...@@ -402,6 +406,14 @@ static void multipathd (mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags); spin_unlock_irqrestore(&conf->device_lock, flags);
} }
static sector_t multipath_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
return mddev->dev_sectors;
}
static int multipath_run (mddev_t *mddev) static int multipath_run (mddev_t *mddev)
{ {
multipath_conf_t *conf; multipath_conf_t *conf;
...@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev) ...@@ -498,7 +510,7 @@ static int multipath_run (mddev_t *mddev)
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
mddev->array_sectors = mddev->size * 2; md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
mddev->queue->unplug_fn = multipath_unplug; mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_fn = multipath_congested;
...@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality = ...@@ -543,6 +555,7 @@ static struct mdk_personality multipath_personality =
.error_handler = multipath_error, .error_handler = multipath_error,
.hot_add_disk = multipath_add_disk, .hot_add_disk = multipath_add_disk,
.hot_remove_disk= multipath_remove_disk, .hot_remove_disk= multipath_remove_disk,
.size = multipath_size,
}; };
static int __init multipath_init (void) static int __init multipath_init (void)
......
#ifndef _MULTIPATH_H #ifndef _MULTIPATH_H
#define _MULTIPATH_H #define _MULTIPATH_H
#include <linux/raid/md.h>
struct multipath_info { struct multipath_info {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
}; };
......
...@@ -18,7 +18,10 @@ ...@@ -18,7 +18,10 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ */
#include <linux/raid/raid0.h> #include <linux/blkdev.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid0.h"
static void raid0_unplug(struct request_queue *q) static void raid0_unplug(struct request_queue *q)
{ {
...@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev) ...@@ -73,16 +76,15 @@ static int create_strip_zones (mddev_t *mddev)
list_for_each_entry(rdev2, &mddev->disks, same_set) { list_for_each_entry(rdev2, &mddev->disks, same_set) {
printk(KERN_INFO "raid0: comparing %s(%llu)", printk(KERN_INFO "raid0: comparing %s(%llu)",
bdevname(rdev1->bdev,b), bdevname(rdev1->bdev,b),
(unsigned long long)rdev1->size); (unsigned long long)rdev1->sectors);
printk(KERN_INFO " with %s(%llu)\n", printk(KERN_INFO " with %s(%llu)\n",
bdevname(rdev2->bdev,b), bdevname(rdev2->bdev,b),
(unsigned long long)rdev2->size); (unsigned long long)rdev2->sectors);
if (rdev2 == rdev1) { if (rdev2 == rdev1) {
printk(KERN_INFO "raid0: END\n"); printk(KERN_INFO "raid0: END\n");
break; break;
} }
if (rdev2->size == rdev1->size) if (rdev2->sectors == rdev1->sectors) {
{
/* /*
* Not unique, don't count it as a new * Not unique, don't count it as a new
* group * group
...@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev) ...@@ -145,7 +147,7 @@ static int create_strip_zones (mddev_t *mddev)
mddev->queue->max_sectors > (PAGE_SIZE>>9)) mddev->queue->max_sectors > (PAGE_SIZE>>9))
blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
if (!smallest || (rdev1->size <smallest->size)) if (!smallest || (rdev1->sectors < smallest->sectors))
smallest = rdev1; smallest = rdev1;
cnt++; cnt++;
} }
...@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev) ...@@ -155,10 +157,10 @@ static int create_strip_zones (mddev_t *mddev)
goto abort; goto abort;
} }
zone->nb_dev = cnt; zone->nb_dev = cnt;
zone->sectors = smallest->size * cnt * 2; zone->sectors = smallest->sectors * cnt;
zone->zone_start = 0; zone->zone_start = 0;
current_start = smallest->size * 2; current_start = smallest->sectors;
curr_zone_start = zone->sectors; curr_zone_start = zone->sectors;
/* now do the other zones */ /* now do the other zones */
...@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev) ...@@ -177,29 +179,29 @@ static int create_strip_zones (mddev_t *mddev)
rdev = conf->strip_zone[0].dev[j]; rdev = conf->strip_zone[0].dev[j];
printk(KERN_INFO "raid0: checking %s ...", printk(KERN_INFO "raid0: checking %s ...",
bdevname(rdev->bdev, b)); bdevname(rdev->bdev, b));
if (rdev->size > current_start / 2) { if (rdev->sectors <= current_start) {
printk(KERN_INFO " contained as device %d\n", printk(KERN_INFO " nope.\n");
c); continue;
}
printk(KERN_INFO " contained as device %d\n", c);
zone->dev[c] = rdev; zone->dev[c] = rdev;
c++; c++;
if (!smallest || (rdev->size <smallest->size)) { if (!smallest || rdev->sectors < smallest->sectors) {
smallest = rdev; smallest = rdev;
printk(KERN_INFO " (%llu) is smallest!.\n", printk(KERN_INFO " (%llu) is smallest!.\n",
(unsigned long long)rdev->size); (unsigned long long)rdev->sectors);
} }
} else
printk(KERN_INFO " nope.\n");
} }
zone->nb_dev = c; zone->nb_dev = c;
zone->sectors = (smallest->size * 2 - current_start) * c; zone->sectors = (smallest->sectors - current_start) * c;
printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
zone->nb_dev, (unsigned long long)zone->sectors); zone->nb_dev, (unsigned long long)zone->sectors);
zone->zone_start = curr_zone_start; zone->zone_start = curr_zone_start;
curr_zone_start += zone->sectors; curr_zone_start += zone->sectors;
current_start = smallest->size * 2; current_start = smallest->sectors;
printk(KERN_INFO "raid0: current zone start: %llu\n", printk(KERN_INFO "raid0: current zone start: %llu\n",
(unsigned long long)current_start); (unsigned long long)current_start);
} }
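The zone construction above can be followed with a worked example: zone 0 spans every device up to the smallest one, and each later zone spans only the devices that extend past the previous zone's end. A standalone sketch with three hypothetical devices of 100, 200 and 200 sectors:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t dev[] = { 100, 200, 200 };
	int ndev = 3;
	uint64_t current_start = 0, zone_start = 0;
	int zone = 0;

	for (;;) {
		uint64_t smallest = 0;
		int cnt = 0;

		for (int i = 0; i < ndev; i++) {
			if (dev[i] <= current_start)
				continue;	/* contained in earlier zones */
			cnt++;
			if (!smallest || dev[i] < smallest)
				smallest = dev[i];
		}
		if (!cnt)
			break;

		uint64_t sectors = (smallest - current_start) * cnt;
		printf("zone %d: %d devices, start %llu, %llu sectors\n",
		       zone++, cnt, (unsigned long long)zone_start,
		       (unsigned long long)sectors);
		zone_start += sectors;
		current_start = smallest;
	}
	return 0;	/* zone 0: 3 devs, 300 sectors; zone 1: 2 devs, 200 */
}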
...@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q, ...@@ -261,12 +263,25 @@ static int raid0_mergeable_bvec(struct request_queue *q,
return max; return max;
} }
static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
sector_t array_sectors = 0;
mdk_rdev_t *rdev;
WARN_ONCE(sectors || raid_disks,
"%s does not support generic reshape\n", __func__);
list_for_each_entry(rdev, &mddev->disks, same_set)
array_sectors += rdev->sectors;
return array_sectors;
}
static int raid0_run (mddev_t *mddev) static int raid0_run (mddev_t *mddev)
{ {
unsigned cur=0, i=0, nb_zone; unsigned cur=0, i=0, nb_zone;
s64 sectors; s64 sectors;
raid0_conf_t *conf; raid0_conf_t *conf;
mdk_rdev_t *rdev;
if (mddev->chunk_size == 0) { if (mddev->chunk_size == 0) {
printk(KERN_ERR "md/raid0: non-zero chunk size required.\n"); printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
...@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev) ...@@ -291,16 +306,14 @@ static int raid0_run (mddev_t *mddev)
goto out_free_conf; goto out_free_conf;
/* calculate array device size */ /* calculate array device size */
mddev->array_sectors = 0; md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
list_for_each_entry(rdev, &mddev->disks, same_set)
mddev->array_sectors += rdev->size * 2;
printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
(unsigned long long)mddev->array_sectors); (unsigned long long)mddev->array_sectors);
printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n", printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
(unsigned long long)conf->spacing); (unsigned long long)conf->spacing);
{ {
sector_t s = mddev->array_sectors; sector_t s = raid0_size(mddev, 0, 0);
sector_t space = conf->spacing; sector_t space = conf->spacing;
int round; int round;
conf->sector_shift = 0; conf->sector_shift = 0;
...@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality= ...@@ -509,6 +522,7 @@ static struct mdk_personality raid0_personality=
.run = raid0_run, .run = raid0_run,
.stop = raid0_stop, .stop = raid0_stop,
.status = raid0_status, .status = raid0_status,
.size = raid0_size,
}; };
static int __init raid0_init (void) static int __init raid0_init (void)
......
#ifndef _RAID0_H #ifndef _RAID0_H
#define _RAID0_H #define _RAID0_H
#include <linux/raid/md.h>
struct strip_zone struct strip_zone
{ {
sector_t zone_start; /* Zone offset in md_dev (in sectors) */ sector_t zone_start; /* Zone offset in md_dev (in sectors) */
......
...@@ -31,10 +31,13 @@ ...@@ -31,10 +31,13 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ */
#include "dm-bio-list.h"
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/raid/raid1.h> #include <linux/blkdev.h>
#include <linux/raid/bitmap.h> #include <linux/seq_file.h>
#include "md.h"
#include "dm-bio-list.h"
#include "raid1.h"
#include "bitmap.h"
#define DEBUG 0 #define DEBUG 0
#if DEBUG #if DEBUG
...@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1723,7 +1726,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0; return 0;
} }
max_sector = mddev->size << 1; max_sector = mddev->dev_sectors;
if (sector_nr >= max_sector) { if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the /* If we aborted, we need to abort the
* sync on the 'current' bitmap chunk (there will * sync on the 'current' bitmap chunk (there will
...@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1919,6 +1922,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return nr_sectors; return nr_sectors;
} }
static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
if (sectors)
return sectors;
return mddev->dev_sectors;
}
static int run(mddev_t *mddev) static int run(mddev_t *mddev)
{ {
conf_t *conf; conf_t *conf;
...@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev) ...@@ -2048,7 +2059,7 @@ static int run(mddev_t *mddev)
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
mddev->array_sectors = mddev->size * 2; md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
mddev->queue->unplug_fn = raid1_unplug; mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_fn = raid1_congested;
...@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev) ...@@ -2089,6 +2100,9 @@ static int stop(mddev_t *mddev)
/* need to kick something here to make sure I/O goes? */ /* need to kick something here to make sure I/O goes? */
} }
raise_barrier(conf);
lower_barrier(conf);
md_unregister_thread(mddev->thread); md_unregister_thread(mddev->thread);
mddev->thread = NULL; mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
...@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) ...@@ -2110,15 +2124,17 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems * any io in the removed space completes, but it hardly seems
* worth it. * worth it.
*/ */
mddev->array_sectors = sectors; md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors); set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1; mddev->changed = 1;
if (mddev->array_sectors / 2 > mddev->size && if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) { mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->size << 1; mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
mddev->size = mddev->array_sectors / 2; mddev->dev_sectors = sectors;
mddev->resync_max_sectors = sectors; mddev->resync_max_sectors = sectors;
return 0; return 0;
} }
...@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality = ...@@ -2264,6 +2280,7 @@ static struct mdk_personality raid1_personality =
.spare_active = raid1_spare_active, .spare_active = raid1_spare_active,
.sync_request = sync_request, .sync_request = sync_request,
.resize = raid1_resize, .resize = raid1_resize,
.size = raid1_size,
.check_reshape = raid1_reshape, .check_reshape = raid1_reshape,
.quiesce = raid1_quiesce, .quiesce = raid1_quiesce,
}; };
......
#ifndef _RAID1_H #ifndef _RAID1_H
#define _RAID1_H #define _RAID1_H
#include <linux/raid/md.h>
typedef struct mirror_info mirror_info_t; typedef struct mirror_info mirror_info_t;
struct mirror_info { struct mirror_info {
......
...@@ -18,10 +18,13 @@ ...@@ -18,10 +18,13 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/ */
#include "dm-bio-list.h"
#include <linux/delay.h> #include <linux/delay.h>
#include <linux/raid/raid10.h> #include <linux/blkdev.h>
#include <linux/raid/bitmap.h> #include <linux/seq_file.h>
#include "md.h"
#include "dm-bio-list.h"
#include "raid10.h"
#include "bitmap.h"
/* /*
* RAID10 provides a combination of RAID0 and RAID1 functionality. * RAID10 provides a combination of RAID0 and RAID1 functionality.
...@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -1695,7 +1698,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
return 0; return 0;
skipped: skipped:
max_sector = mddev->size << 1; max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_sector = mddev->resync_max_sectors; max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) { if (sector_nr >= max_sector) {
...@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i ...@@ -2020,6 +2023,25 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
goto skipped; goto skipped;
} }
static sector_t
raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
sector_t size;
conf_t *conf = mddev_to_conf(mddev);
if (!raid_disks)
raid_disks = mddev->raid_disks;
if (!sectors)
sectors = mddev->dev_sectors;
size = sectors >> conf->chunk_shift;
sector_div(size, conf->far_copies);
size = size * raid_disks;
sector_div(size, conf->near_copies);
return size << conf->chunk_shift;
}
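raid10_size() reduces to: whole chunks per device, divided by far_copies, multiplied by raid_disks, divided by near_copies, rounding down at each step as sector_div() does. A userspace rendering with plain division standing in for sector_div(), using made-up geometry:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int chunk_shift = 7;		/* 64k chunks (128 sectors) */
	int raid_disks = 4, near_copies = 2, far_copies = 1;
	uint64_t dev_sectors = 1000000;

	uint64_t size = dev_sectors >> chunk_shift;	/* whole chunks/device */
	size /= far_copies;
	size *= raid_disks;
	size /= near_copies;

	printf("array size: %llu sectors\n",
	       (unsigned long long)(size << chunk_shift));
	return 0;	/* ~2x one device, as expected for 2 copies on 4 disks */
}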
static int run(mddev_t *mddev) static int run(mddev_t *mddev)
{ {
conf_t *conf; conf_t *conf;
...@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev) ...@@ -2076,7 +2098,7 @@ static int run(mddev_t *mddev)
conf->far_offset = fo; conf->far_offset = fo;
conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1; conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
conf->chunk_shift = ffz(~mddev->chunk_size) - 9; conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
size = mddev->size >> (conf->chunk_shift-1); size = mddev->dev_sectors >> conf->chunk_shift;
sector_div(size, fc); sector_div(size, fc);
size = size * conf->raid_disks; size = size * conf->raid_disks;
sector_div(size, nc); sector_div(size, nc);
...@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev) ...@@ -2089,7 +2111,7 @@ static int run(mddev_t *mddev)
*/ */
stride += conf->raid_disks - 1; stride += conf->raid_disks - 1;
sector_div(stride, conf->raid_disks); sector_div(stride, conf->raid_disks);
mddev->size = stride << (conf->chunk_shift-1); mddev->dev_sectors = stride << conf->chunk_shift;
if (fo) if (fo)
stride = 1; stride = 1;
...@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev) ...@@ -2171,8 +2193,8 @@ static int run(mddev_t *mddev)
/* /*
* Ok, everything is just fine now * Ok, everything is just fine now
*/ */
mddev->array_sectors = size << conf->chunk_shift; md_set_array_sectors(mddev, raid10_size(mddev, 0, 0));
mddev->resync_max_sectors = size << conf->chunk_shift; mddev->resync_max_sectors = raid10_size(mddev, 0, 0);
mddev->queue->unplug_fn = raid10_unplug; mddev->queue->unplug_fn = raid10_unplug;
mddev->queue->backing_dev_info.congested_fn = raid10_congested; mddev->queue->backing_dev_info.congested_fn = raid10_congested;
...@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev) ...@@ -2208,6 +2230,9 @@ static int stop(mddev_t *mddev)
{ {
conf_t *conf = mddev_to_conf(mddev); conf_t *conf = mddev_to_conf(mddev);
raise_barrier(conf, 0);
lower_barrier(conf);
md_unregister_thread(mddev->thread); md_unregister_thread(mddev->thread);
mddev->thread = NULL; mddev->thread = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
...@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality = ...@@ -2255,6 +2280,7 @@ static struct mdk_personality raid10_personality =
.spare_active = raid10_spare_active, .spare_active = raid10_spare_active,
.sync_request = sync_request, .sync_request = sync_request,
.quiesce = raid10_quiesce, .quiesce = raid10_quiesce,
.size = raid10_size,
}; };
static int __init raid_init(void) static int __init raid_init(void)
......
#ifndef _RAID10_H #ifndef _RAID10_H
#define _RAID10_H #define _RAID10_H
#include <linux/raid/md.h>
typedef struct mirror_info mirror_info_t; typedef struct mirror_info mirror_info_t;
struct mirror_info { struct mirror_info {
......
...@@ -43,11 +43,14 @@ ...@@ -43,11 +43,14 @@
* miss any bits. * miss any bits.
*/ */
#include <linux/blkdev.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include "raid6.h" #include <linux/raid/pq.h>
#include <linux/raid/bitmap.h>
#include <linux/async_tx.h> #include <linux/async_tx.h>
#include <linux/seq_file.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
/* /*
* Stripe cache * Stripe cache
...@@ -91,11 +94,6 @@ ...@@ -91,11 +94,6 @@
#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
#endif
/* /*
* We maintain a biased count of active stripes in the bottom 16 bits of * We maintain a biased count of active stripes in the bottom 16 bits of
* bi_phys_segments, and a count of processed stripes in the upper 16 bits * bi_phys_segments, and a count of processed stripes in the upper 16 bits
...@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) ...@@ -130,12 +128,42 @@ static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16); bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
} }
/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
if (sh->ddf_layout)
/* ddf always starts from the first device */
return 0;
/* md starts just after Q block */
if (sh->qd_idx == sh->disks - 1)
return 0;
else
return sh->qd_idx + 1;
}
static inline int raid6_next_disk(int disk, int raid_disks) static inline int raid6_next_disk(int disk, int raid_disks)
{ {
disk++; disk++;
return (disk < raid_disks) ? disk : 0; return (disk < raid_disks) ? disk : 0;
} }
/* When walking through the disks in a raid5, starting at raid6_d0,
* we need to map each disk to a 'slot', where the data disks are slot
* 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
* is raid_disks-1. This helper does that mapping.
*/
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
int *count, int syndrome_disks)
{
int slot;
if (idx == sh->pd_idx)
return syndrome_disks;
if (idx == sh->qd_idx)
return syndrome_disks + 1;
slot = (*count)++;
return slot;
}
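Combining raid6_d0() and raid6_idx_to_slot(), a stripe walk maps the rotated device indices onto stable syndrome slots: data in 0 .. disks-3, P at disks-2, Q at disks-1. A standalone illustration for a hypothetical 5-device stripe with P on device 3 and Q on device 4 (non-DDF case):

#include <stdio.h>

int main(void)
{
	int disks = 5, pd_idx = 3, qd_idx = 4;
	int syndrome_disks = disks - 2;	/* data slots 0 .. disks-3 */
	int count = 0;

	/* d0: md data starts just after the Q block (0 if Q is last) */
	int d0 = (qd_idx == disks - 1) ? 0 : qd_idx + 1;

	for (int i = d0, n = 0; n < disks; n++, i = (i + 1) % disks) {
		int slot;

		if (i == pd_idx)
			slot = syndrome_disks;		/* P slot */
		else if (i == qd_idx)
			slot = syndrome_disks + 1;	/* Q slot */
		else
			slot = count++;			/* next data slot */
		printf("device %d -> slot %d\n", i, slot);
	}
	return 0;
}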
static void return_io(struct bio *return_bi) static void return_io(struct bio *return_bi)
{ {
struct bio *bi = return_bi; struct bio *bi = return_bi;
...@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) ...@@ -193,6 +221,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
} }
} }
} }
static void release_stripe(struct stripe_head *sh) static void release_stripe(struct stripe_head *sh)
{ {
raid5_conf_t *conf = sh->raid_conf; raid5_conf_t *conf = sh->raid_conf;
...@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num) ...@@ -270,9 +299,11 @@ static int grow_buffers(struct stripe_head *sh, int num)
return 0; return 0;
} }
static void raid5_build_block(struct stripe_head *sh, int i); static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
struct stripe_head *sh);
static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int disks) static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{ {
raid5_conf_t *conf = sh->raid_conf; raid5_conf_t *conf = sh->raid_conf;
int i; int i;
...@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int ...@@ -287,11 +318,12 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
remove_hash(sh); remove_hash(sh);
sh->generation = conf->generation - previous;
sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
sh->sector = sector; sh->sector = sector;
sh->pd_idx = pd_idx; stripe_set_idx(sector, conf, previous, sh);
sh->state = 0; sh->state = 0;
sh->disks = disks;
for (i = sh->disks; i--; ) { for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
...@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int ...@@ -305,12 +337,13 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int
BUG(); BUG();
} }
dev->flags = 0; dev->flags = 0;
raid5_build_block(sh, i); raid5_build_block(sh, i, previous);
} }
insert_hash(conf, sh); insert_hash(conf, sh);
} }
static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, int disks) static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
short generation)
{ {
struct stripe_head *sh; struct stripe_head *sh;
struct hlist_node *hn; struct hlist_node *hn;
...@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in ...@@ -318,7 +351,7 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
CHECK_DEVLOCK(); CHECK_DEVLOCK();
pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
if (sh->sector == sector && sh->disks == disks) if (sh->sector == sector && sh->generation == generation)
return sh; return sh;
pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
return NULL; return NULL;
...@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in ...@@ -327,8 +360,9 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, in
static void unplug_slaves(mddev_t *mddev); static void unplug_slaves(mddev_t *mddev);
static void raid5_unplug_device(struct request_queue *q); static void raid5_unplug_device(struct request_queue *q);
static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector, int disks, static struct stripe_head *
int pd_idx, int noblock) get_active_stripe(raid5_conf_t *conf, sector_t sector,
int previous, int noblock)
{ {
struct stripe_head *sh; struct stripe_head *sh;
...@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ...@@ -340,7 +374,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
wait_event_lock_irq(conf->wait_for_stripe, wait_event_lock_irq(conf->wait_for_stripe,
conf->quiesce == 0, conf->quiesce == 0,
conf->device_lock, /* nothing */); conf->device_lock, /* nothing */);
sh = __find_stripe(conf, sector, disks); sh = __find_stripe(conf, sector, conf->generation - previous);
if (!sh) { if (!sh) {
if (!conf->inactive_blocked) if (!conf->inactive_blocked)
sh = get_free_stripe(conf); sh = get_free_stripe(conf);
...@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector ...@@ -358,10 +392,11 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
); );
conf->inactive_blocked = 0; conf->inactive_blocked = 0;
} else } else
init_stripe(sh, sector, pd_idx, disks); init_stripe(sh, sector, previous);
} else { } else {
if (atomic_read(&sh->count)) { if (atomic_read(&sh->count)) {
BUG_ON(!list_empty(&sh->lru)); BUG_ON(!list_empty(&sh->lru)
&& !test_bit(STRIPE_EXPANDING, &sh->state));
} else { } else {
if (!test_bit(STRIPE_HANDLE, &sh->state)) if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes); atomic_inc(&conf->active_stripes);
...@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num) ...@@ -895,8 +930,10 @@ static int grow_stripes(raid5_conf_t *conf, int num)
struct kmem_cache *sc; struct kmem_cache *sc;
int devs = conf->raid_disks; int devs = conf->raid_disks;
sprintf(conf->cache_name[0], "raid5-%s", mdname(conf->mddev)); sprintf(conf->cache_name[0],
sprintf(conf->cache_name[1], "raid5-%s-alt", mdname(conf->mddev)); "raid%d-%s", conf->level, mdname(conf->mddev));
sprintf(conf->cache_name[1],
"raid%d-%s-alt", conf->level, mdname(conf->mddev));
conf->active_name = 0; conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name], sc = kmem_cache_create(conf->cache_name[conf->active_name],
sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
...@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num) ...@@ -911,7 +948,6 @@ static int grow_stripes(raid5_conf_t *conf, int num)
return 0; return 0;
} }
#ifdef CONFIG_MD_RAID5_RESHAPE
static int resize_stripes(raid5_conf_t *conf, int newsize) static int resize_stripes(raid5_conf_t *conf, int newsize)
{ {
/* Make all the stripes able to hold 'newsize' devices. /* Make all the stripes able to hold 'newsize' devices.
...@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) ...@@ -1036,7 +1072,6 @@ static int resize_stripes(raid5_conf_t *conf, int newsize)
conf->pool_size = newsize; conf->pool_size = newsize;
return err; return err;
} }
#endif
static int drop_one_stripe(raid5_conf_t *conf) static int drop_one_stripe(raid5_conf_t *conf)
{ {
...@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error) ...@@ -1176,9 +1211,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
} }
static sector_t compute_blocknr(struct stripe_head *sh, int i); static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
static void raid5_build_block(struct stripe_head *sh, int i) static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{ {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
...@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i) ...@@ -1194,7 +1229,7 @@ static void raid5_build_block(struct stripe_head *sh, int i)
dev->req.bi_private = sh; dev->req.bi_private = sh;
dev->flags = 0; dev->flags = 0;
dev->sector = compute_blocknr(sh, i); dev->sector = compute_blocknr(sh, i, previous);
} }
static void error(mddev_t *mddev, mdk_rdev_t *rdev) static void error(mddev_t *mddev, mdk_rdev_t *rdev)
...@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1227,15 +1262,23 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
* Input: a 'big' sector number, * Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them. * Output: index of the data and parity disk, and the sector # in them.
*/ */
-static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
-			unsigned int data_disks, unsigned int * dd_idx,
-			unsigned int * pd_idx, raid5_conf_t *conf)
+static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+				     int previous, int *dd_idx,
+				     struct stripe_head *sh)
 {
 	long stripe;
 	unsigned long chunk_number;
 	unsigned int chunk_offset;
+	int pd_idx, qd_idx;
+	int ddf_layout = 0;
 	sector_t new_sector;
-	int sectors_per_chunk = conf->chunk_size >> 9;
+	int algorithm = previous ? conf->prev_algo
+				 : conf->algorithm;
+	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+					 : (conf->chunk_size >> 9);
+	int raid_disks = previous ? conf->previous_raid_disks
+				  : conf->raid_disks;
+	int data_disks = raid_disks - conf->max_degraded;
 
 	/* First compute the information on this sector */
@@ -1259,68 +1302,170 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 	/*
 	 * Select the parity disk based on the user selected algorithm.
 	 */
+	pd_idx = qd_idx = ~0;
 	switch(conf->level) {
 	case 4:
-		*pd_idx = data_disks;
+		pd_idx = data_disks;
 		break;
 	case 5:
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = data_disks - stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			if (*dd_idx >= pd_idx)
 				(*dd_idx)++;
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = data_disks - stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = data_disks - stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			break;
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			(*dd_idx)++;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
 			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-				conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	case 6:
-		/**** FIX THIS ****/
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			if (*pd_idx == raid_disks-1)
-				(*dd_idx)++;	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_RIGHT_ASYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			if (*pd_idx == raid_disks-1)
-				(*dd_idx)++;	/* Q D D D P */
-			else if (*dd_idx >= *pd_idx)
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
 				(*dd_idx) += 2; /* D D P Q D */
 			break;
 		case ALGORITHM_LEFT_SYMMETRIC:
-			*pd_idx = raid_disks - 1 - (stripe % raid_disks);
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
 			break;
 		case ALGORITHM_RIGHT_SYMMETRIC:
-			*pd_idx = stripe % raid_disks;
-			*dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+			pd_idx = stripe % raid_disks;
+			qd_idx = (pd_idx + 1) % raid_disks;
+			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
+			break;
+		case ALGORITHM_PARITY_0:
+			pd_idx = 0;
+			qd_idx = 1;
+			(*dd_idx) += 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			pd_idx = data_disks;
+			qd_idx = data_disks + 1;
+			break;
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+			/* Exactly the same as RIGHT_ASYMMETRIC, but order
+			 * of blocks for computing Q is different.
+			 */
+			pd_idx = stripe % raid_disks;
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			ddf_layout = 1;
+			break;
+		case ALGORITHM_ROTATING_N_RESTART:
+			/* Same as left_asymmetric, but the first stripe is
+			 * D D D P Q  rather than
+			 * Q D D D P
+			 */
+			pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
+			qd_idx = pd_idx + 1;
+			if (pd_idx == raid_disks-1) {
+				(*dd_idx)++;	/* Q D D D P */
+				qd_idx = 0;
+			} else if (*dd_idx >= pd_idx)
+				(*dd_idx) += 2; /* D D P Q D */
+			ddf_layout = 1;
+			break;
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			/* Same as left_symmetric but Q is before P */
+			pd_idx = raid_disks - 1 - (stripe % raid_disks);
+			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
+			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
+			ddf_layout = 1;
+			break;
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+			/* RAID5 left_asymmetric, with Q on last device */
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			if (*dd_idx >= pd_idx)
+				(*dd_idx)++;
+			qd_idx = raid_disks - 1;
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+			pd_idx = data_disks - stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			pd_idx = stripe % (raid_disks-1);
+			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
+			qd_idx = raid_disks - 1;
+			break;
+		case ALGORITHM_PARITY_0_6:
+			pd_idx = 0;
+			(*dd_idx)++;
+			qd_idx = raid_disks - 1;
 			break;
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-				conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	}
 
+	if (sh) {
+		sh->pd_idx = pd_idx;
+		sh->qd_idx = qd_idx;
+		sh->ddf_layout = ddf_layout;
+	}
 	/*
 	 * Finally, compute the new sector number
 	 */
@@ -1329,17 +1474,21 @@ static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
 }
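To see what the RAID-5 branch of the switch above actually produces, here is a
small userspace sketch (not kernel code; the disk counts are illustrative) of
the ALGORITHM_LEFT_SYMMETRIC placement: parity rotates backwards one device per
stripe, and the data blocks wrap around starting just after it.

	/* Sketch of the left-symmetric placement rule, assuming a 5-disk RAID-5 */
	#include <stdio.h>

	int main(void)
	{
		int raid_disks = 5, data_disks = 4;

		for (long stripe = 0; stripe < 4; stripe++) {
			int pd_idx = data_disks - stripe % raid_disks;
			printf("stripe %ld: P on disk %d |", stripe, pd_idx);
			for (int logical = 0; logical < data_disks; logical++) {
				/* same rotation as the pd_idx/dd_idx lines above */
				int dd_idx = (pd_idx + 1 + logical) % raid_disks;
				printf(" D%d->disk%d", logical, dd_idx);
			}
			printf("\n");
		}
		return 0;
	}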
-static sector_t compute_blocknr(struct stripe_head *sh, int i)
+static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 {
 	raid5_conf_t *conf = sh->raid_conf;
 	int raid_disks = sh->disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
-	int sectors_per_chunk = conf->chunk_size >> 9;
+	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
+					 : (conf->chunk_size >> 9);
+	int algorithm = previous ? conf->prev_algo
+				 : conf->algorithm;
 	sector_t stripe;
 	int chunk_offset;
-	int chunk_number, dummy1, dummy2, dd_idx = i;
+	int chunk_number, dummy1, dd_idx = i;
 	sector_t r_sector;
+	struct stripe_head sh2;
 
 	chunk_offset = sector_div(new_sector, sectors_per_chunk);
@@ -1351,7 +1500,7 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	switch(conf->level) {
 	case 4: break;
 	case 5:
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 		case ALGORITHM_RIGHT_ASYMMETRIC:
 			if (i > sh->pd_idx)
@@ -1363,17 +1512,25 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 				i += raid_disks;
 			i -= (sh->pd_idx + 1);
 			break;
+		case ALGORITHM_PARITY_0:
+			i -= 1;
+			break;
+		case ALGORITHM_PARITY_N:
+			break;
 		default:
 			printk(KERN_ERR "raid5: unsupported algorithm %d\n",
-				conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	case 6:
-		if (i == raid6_next_disk(sh->pd_idx, raid_disks))
+		if (i == sh->qd_idx)
 			return 0; /* It is the Q disk */
-		switch (conf->algorithm) {
+		switch (algorithm) {
 		case ALGORITHM_LEFT_ASYMMETRIC:
 		case ALGORITHM_RIGHT_ASYMMETRIC:
+		case ALGORITHM_ROTATING_ZERO_RESTART:
+		case ALGORITHM_ROTATING_N_RESTART:
 			if (sh->pd_idx == raid_disks-1)
 				i--;	/* Q D D D P */
 			else if (i > sh->pd_idx)
@@ -1390,9 +1547,35 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 				i -= (sh->pd_idx + 2);
 			}
 			break;
+		case ALGORITHM_PARITY_0:
+			i -= 2;
+			break;
+		case ALGORITHM_PARITY_N:
+			break;
+		case ALGORITHM_ROTATING_N_CONTINUE:
+			if (sh->pd_idx == 0)
+				i--;	/* P D D D Q */
+			else if (i > sh->pd_idx)
+				i -= 2; /* D D Q P D */
+			break;
+		case ALGORITHM_LEFT_ASYMMETRIC_6:
+		case ALGORITHM_RIGHT_ASYMMETRIC_6:
+			if (i > sh->pd_idx)
+				i--;
+			break;
+		case ALGORITHM_LEFT_SYMMETRIC_6:
+		case ALGORITHM_RIGHT_SYMMETRIC_6:
+			if (i < sh->pd_idx)
+				i += data_disks + 1;
+			i -= (sh->pd_idx + 1);
+			break;
+		case ALGORITHM_PARITY_0_6:
+			i -= 1;
+			break;
 		default:
 			printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
-				conf->algorithm);
+			       algorithm);
+			BUG();
 		}
 		break;
 	}
@@ -1400,8 +1583,10 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i)
 	chunk_number = stripe * data_disks + i;
 	r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
 
-	check = raid5_compute_sector(r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
-	if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+	check = raid5_compute_sector(conf, r_sector,
+				     previous, &dummy1, &sh2);
+	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
+	    || sh2.qd_idx != sh->qd_idx) {
 		printk(KERN_ERR "compute_blocknr: map not correct\n");
 		return 0;
 	}
@@ -1468,14 +1653,16 @@ static void copy_data(int frombio, struct bio *bio,
 static void compute_parity6(struct stripe_head *sh, int method)
 {
-	raid6_conf_t *conf = sh->raid_conf;
-	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+	raid5_conf_t *conf = sh->raid_conf;
+	int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
 	struct bio *chosen;
 	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-	void *ptrs[disks];
+	void *ptrs[syndrome_disks+2];
 
-	qd_idx = raid6_next_disk(pd_idx, disks);
-	d0_idx = raid6_next_disk(qd_idx, disks);
+	pd_idx = sh->pd_idx;
+	qd_idx = sh->qd_idx;
+	d0_idx = raid6_d0(sh);
 
 	pr_debug("compute_parity, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -1513,24 +1700,29 @@ static void compute_parity6(struct stripe_head *sh, int method)
 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
 		}
 
-//	switch(method) {
-//	case RECONSTRUCT_WRITE:
-//	case CHECK_PARITY:
-//	case UPDATE_PARITY:
-	/* Note that unlike RAID-5, the ordering of the disks matters greatly. */
-	/* FIX: Is this ordering of drives even remotely optimal? */
+	/* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
+
+	for (i = 0; i < disks; i++)
+		ptrs[i] = (void *)raid6_empty_zero_page;
+
 	count = 0;
 	i = d0_idx;
 	do {
-		ptrs[count++] = page_address(sh->dev[i].page);
-		if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-			printk("block %d/%d not uptodate on parity calc\n", i,count);
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		ptrs[slot] = page_address(sh->dev[i].page);
+		if (slot < syndrome_disks &&
+		    !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
+			printk(KERN_ERR "block %d/%d not uptodate "
+			       "on parity calc\n", i, count);
+			BUG();
+		}
 		i = raid6_next_disk(i, disks);
-	} while ( i != d0_idx );
-//	break;
-//	}
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
 
-	raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
+	raid6_call.gen_syndrome(syndrome_disks+2, STRIPE_SIZE, ptrs);
 
 	switch(method) {
 	case RECONSTRUCT_WRITE:
@@ -1552,8 +1744,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 {
 	int i, count, disks = sh->disks;
 	void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
+	int qd_idx = sh->qd_idx;
 
 	pr_debug("compute_block_1, stripe %llu, idx %d\n",
 		(unsigned long long)sh->sector, dd_idx);
@@ -1589,63 +1780,65 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero)
 static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
 {
 	int i, count, disks = sh->disks;
-	int pd_idx = sh->pd_idx;
-	int qd_idx = raid6_next_disk(pd_idx, disks);
-	int d0_idx = raid6_next_disk(qd_idx, disks);
-	int faila, failb;
+	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
+	int d0_idx = raid6_d0(sh);
+	int faila = -1, failb = -1;
+	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
+	void *ptrs[syndrome_disks+2];
 
-	/* faila and failb are disk numbers relative to d0_idx */
-	/* pd_idx become disks-2 and qd_idx become disks-1 */
-	faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
-	failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
+	for (i = 0; i < disks ; i++)
+		ptrs[i] = (void *)raid6_empty_zero_page;
+	count = 0;
+	i = d0_idx;
+	do {
+		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+
+		ptrs[slot] = page_address(sh->dev[i].page);
+
+		if (i == dd_idx1)
+			faila = slot;
+		if (i == dd_idx2)
+			failb = slot;
+		i = raid6_next_disk(i, disks);
+	} while (i != d0_idx);
+	BUG_ON(count != syndrome_disks);
 
 	BUG_ON(faila == failb);
 	if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
 
 	pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
-		(unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
+		 (unsigned long long)sh->sector, dd_idx1, dd_idx2,
		 faila, failb);
 
-	if ( failb == disks-1 ) {
+	if (failb == syndrome_disks+1) {
 		/* Q disk is one of the missing disks */
-		if ( faila == disks-2 ) {
+		if (faila == syndrome_disks) {
 			/* Missing P+Q, just recompute */
 			compute_parity6(sh, UPDATE_PARITY);
 			return;
 		} else {
 			/* We're missing D+Q; recompute D from P */
-			compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
+			compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
+					     dd_idx2 : dd_idx1),
+					0);
 			compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
 			return;
 		}
 	}
 
-	/* We're missing D+P or D+D; build pointer table */
-	{
-		/**** FIX THIS: This could be very bad if disks is close to 256 ****/
-		void *ptrs[disks];
-
-		count = 0;
-		i = d0_idx;
-		do {
-			ptrs[count++] = page_address(sh->dev[i].page);
-			i = raid6_next_disk(i, disks);
-			if (i != dd_idx1 && i != dd_idx2 &&
-			    !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-				printk("compute_2 with missing block %d/%d\n", count, i);
-		} while ( i != d0_idx );
-
-		if ( failb == disks-2 ) {
-			/* We're missing D+P. */
-			raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
-		} else {
-			/* We're missing D+D. */
-			raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
-		}
-
-		/* Both the above update both missing blocks */
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
-		set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+	/* We're missing D+P or D+D; */
+	if (failb == syndrome_disks) {
+		/* We're missing D+P. */
+		raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE, faila, ptrs);
+	} else {
+		/* We're missing D+D. */
+		raid6_2data_recov(syndrome_disks+2, STRIPE_SIZE, faila, failb,
+				  ptrs);
 	}
+
+	/* Both the above update both missing blocks */
+	set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+	set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
 }
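Both hunks above now feed gen_syndrome() through a slot table instead of raw
rotational order. raid6_idx_to_slot() itself is not shown in this excerpt, so
the sketch below only illustrates the assumed idea: for the classic layouts the
data blocks are packed into slots 0..syndrome_disks-1 with P and Q pinned to
the last two slots, while for DDF layouts every rotated position keeps its own
slot and the P/Q positions are left pointing at raid6_empty_zero_page. The
helper name, signature, and exact semantics here are assumptions for
illustration, not the kernel function.

	/* Userspace sketch of the assumed device-index -> syndrome-slot mapping */
	#include <stdio.h>

	static int idx_to_slot(int idx, int pd_idx, int qd_idx,
			       int ddf_layout, int *count, int syndrome_disks)
	{
		int slot = *count;

		if (ddf_layout)
			(*count)++;		/* DDF: every device owns a slot */
		if (idx == pd_idx)
			return syndrome_disks;		/* P feeds slot N   */
		if (idx == qd_idx)
			return syndrome_disks + 1;	/* Q feeds slot N+1 */
		if (!ddf_layout)
			(*count)++;	/* classic: only data blocks consume slots */
		return slot;
	}

	int main(void)
	{
		int disks = 6, pd_idx = 4, qd_idx = 5;	/* assumed 6-disk RAID-6 stripe */
		int count = 0;

		for (int i = 0; i < disks; i++)
			printf("dev %d -> slot %d\n", i,
			       idx_to_slot(i, pd_idx, qd_idx, 0, &count, disks - 2));
		return 0;
	}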
 static void
@@ -1800,17 +1993,21 @@ static int page_is_zero(struct page *p)
 		memcmp(a, a+4, STRIPE_SIZE-4)==0);
 }
 
-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks)
+static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
+			    struct stripe_head *sh)
 {
-	int sectors_per_chunk = conf->chunk_size >> 9;
-	int pd_idx, dd_idx;
+	int sectors_per_chunk =
+		previous ? (conf->prev_chunk >> 9)
+			 : (conf->chunk_size >> 9);
+	int dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
+	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 
-	raid5_compute_sector(stripe * (disks - conf->max_degraded)
+	raid5_compute_sector(conf,
+			     stripe * (disks - conf->max_degraded)
 			     *sectors_per_chunk + chunk_offset,
-			     disks, disks - conf->max_degraded,
-			     &dd_idx, &pd_idx, conf);
-	return pd_idx;
+			     previous,
+			     &dd_idx, sh);
 }
 
 static void
@@ -2181,7 +2378,7 @@ static void handle_stripe_dirtying6(raid5_conf_t *conf,
 		struct r6_state *r6s, int disks)
 {
 	int rcw = 0, must_compute = 0, pd_idx = sh->pd_idx, i;
-	int qd_idx = r6s->qd_idx;
+	int qd_idx = sh->qd_idx;
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Would I have to read this buffer for reconstruct_write */
@@ -2371,7 +2568,7 @@ static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
 	int update_p = 0, update_q = 0;
 	struct r5dev *dev;
 	int pd_idx = sh->pd_idx;
-	int qd_idx = r6s->qd_idx;
+	int qd_idx = sh->qd_idx;
 
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -2467,17 +2664,14 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 	struct dma_async_tx_descriptor *tx = NULL;
 	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	for (i = 0; i < sh->disks; i++)
-		if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
-			int dd_idx, pd_idx, j;
+		if (i != sh->pd_idx && i != sh->qd_idx) {
+			int dd_idx, j;
 			struct stripe_head *sh2;
 
-			sector_t bn = compute_blocknr(sh, i);
-			sector_t s = raid5_compute_sector(bn, conf->raid_disks,
-						conf->raid_disks -
-						conf->max_degraded, &dd_idx,
-						&pd_idx, conf);
-			sh2 = get_active_stripe(conf, s, conf->raid_disks,
-						pd_idx, 1);
+			sector_t bn = compute_blocknr(sh, i, 1);
+			sector_t s = raid5_compute_sector(conf, bn, 0,
+							  &dd_idx, NULL);
+			sh2 = get_active_stripe(conf, s, 0, 1);
 			if (sh2 == NULL)
 				/* so far only the early blocks of this stripe
 				 * have been requested.  When later blocks
@@ -2500,8 +2694,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
 			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
 			for (j = 0; j < conf->raid_disks; j++)
 				if (j != sh2->pd_idx &&
-				    (!r6s || j != raid6_next_disk(sh2->pd_idx,
-								 sh2->disks)) &&
+				    (!r6s || j != sh2->qd_idx) &&
 				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
 					break;
 			if (j == conf->raid_disks) {
@@ -2750,6 +2943,23 @@ static bool handle_stripe5(struct stripe_head *sh)
 
 	/* Finish reconstruct operations initiated by the expansion process */
 	if (sh->reconstruct_state == reconstruct_state_result) {
+		struct stripe_head *sh2
+			= get_active_stripe(conf, sh->sector, 1, 1);
+		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+			/* sh cannot be written until sh2 has been read.
+			 * so arrange for sh to be delayed a little
+			 */
+			set_bit(STRIPE_DELAYED, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+					      &sh2->state))
+				atomic_inc(&conf->preread_active_stripes);
+			release_stripe(sh2);
+			goto unlock;
+		}
+		if (sh2)
+			release_stripe(sh2);
+
 		sh->reconstruct_state = reconstruct_state_idle;
 		clear_bit(STRIPE_EXPANDING, &sh->state);
 		for (i = conf->raid_disks; i--; ) {
@@ -2763,8 +2973,7 @@ static bool handle_stripe5(struct stripe_head *sh)
 	    !sh->reconstruct_state) {
 		/* Need to write out all blocks after computing parity */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-			conf->raid_disks);
+		stripe_set_idx(sh->sector, conf, 0, sh);
 		schedule_reconstruction5(sh, &s, 1, 1);
 	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
 		clear_bit(STRIPE_EXPAND_READY, &sh->state);
@@ -2796,20 +3005,19 @@ static bool handle_stripe5(struct stripe_head *sh)
 
 static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 {
-	raid6_conf_t *conf = sh->raid_conf;
+	raid5_conf_t *conf = sh->raid_conf;
 	int disks = sh->disks;
 	struct bio *return_bi = NULL;
-	int i, pd_idx = sh->pd_idx;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
 	struct stripe_head_state s;
 	struct r6_state r6s;
 	struct r5dev *dev, *pdev, *qdev;
 	mdk_rdev_t *blocked_rdev = NULL;
 
-	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
 	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
 		"pd_idx=%d, qd_idx=%d\n",
 	       (unsigned long long)sh->sector, sh->state,
-	       atomic_read(&sh->count), pd_idx, r6s.qd_idx);
+	       atomic_read(&sh->count), pd_idx, qd_idx);
 	memset(&s, 0, sizeof(s));
 
 	spin_lock(&sh->lock);
@@ -2920,9 +3128,9 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	pdev = &sh->dev[pd_idx];
 	r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
 		|| (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
-	qdev = &sh->dev[r6s.qd_idx];
-	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == r6s.qd_idx)
-		|| (s.failed >= 2 && r6s.failed_num[1] == r6s.qd_idx);
+	qdev = &sh->dev[qd_idx];
+	r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
+		|| (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
 
 	if ( s.written &&
 	     ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
@@ -2980,10 +3188,26 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
 	}
 
 	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+		struct stripe_head *sh2
+			= get_active_stripe(conf, sh->sector, 1, 1);
+		if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
+			/* sh cannot be written until sh2 has been read.
+			 * so arrange for sh to be delayed a little
+			 */
+			set_bit(STRIPE_DELAYED, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
+					      &sh2->state))
+				atomic_inc(&conf->preread_active_stripes);
+			release_stripe(sh2);
+			goto unlock;
+		}
+		if (sh2)
+			release_stripe(sh2);
+
 		/* Need to write out all blocks after computing P&Q */
 		sh->disks = conf->raid_disks;
-		sh->pd_idx = stripe_to_pdidx(sh->sector, conf,
-					     conf->raid_disks);
+		stripe_set_idx(sh->sector, conf, 0, sh);
 		compute_parity6(sh, RECONSTRUCT_WRITE);
 		for (i = conf->raid_disks ; i-- ;  ) {
 			set_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -3134,6 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 	if ((bvm->bi_rw & 1) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */
 
+	if (mddev->new_chunk < mddev->chunk_size)
+		chunk_sectors = mddev->new_chunk >> 9;
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0) max = 0;
 	if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3149,6 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
 	unsigned int chunk_sectors = mddev->chunk_size >> 9;
 	unsigned int bio_sectors = bio->bi_size >> 9;
 
+	if (mddev->new_chunk < mddev->chunk_size)
+		chunk_sectors = mddev->new_chunk >> 9;
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
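The two-line guard added in both hunks is easy to test in isolation: during a
reshape the old and new chunk sizes can differ, so a request only counts as
chunk-aligned if it fits inside the smaller of the two. A standalone sketch of
that boundary test (illustrative values, plain integer types in place of the
kernel's sector_t):

	#include <stdbool.h>
	#include <stdio.h>

	static bool fits_in_chunk(unsigned long long sector,
				  unsigned int bio_sectors,
				  unsigned int chunk_sectors,
				  unsigned int new_chunk_sectors)
	{
		if (new_chunk_sectors < chunk_sectors)
			chunk_sectors = new_chunk_sectors;	/* be conservative */
		return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors);
	}

	int main(void)
	{
		/* 128-sector (64k) old chunks, 64-sector (32k) new chunks */
		printf("%d\n", fits_in_chunk(48, 8, 128, 64));   /* 1: fits the 32k chunk */
		printf("%d\n", fits_in_chunk(40, 32, 128, 128)); /* 1: fits a 64k chunk */
		printf("%d\n", fits_in_chunk(40, 32, 128, 64));  /* 0: crosses a 32k boundary */
		return 0;
	}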
@@ -3255,9 +3483,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	const unsigned int raid_disks = conf->raid_disks;
-	const unsigned int data_disks = raid_disks - conf->max_degraded;
-	unsigned int dd_idx, pd_idx;
+	unsigned int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
@@ -3280,12 +3506,9 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 	/*
 	 *	compute position
 	 */
-	align_bi->bi_sector = raid5_compute_sector(raid_bio->bi_sector,
-					raid_disks,
-					data_disks,
-					&dd_idx,
-					&pd_idx,
-					conf);
+	align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
+						   0,
+						   &dd_idx, NULL);
 
 	rcu_read_lock();
 	rdev = rcu_dereference(conf->disks[dd_idx].rdev);
@@ -3377,7 +3600,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
-	unsigned int dd_idx, pd_idx;
+	int dd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
@@ -3410,26 +3633,31 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
 		DEFINE_WAIT(w);
 		int disks, data_disks;
+		int previous;
 
 	retry:
+		previous = 0;
+		disks = conf->raid_disks;
 		prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
-		if (likely(conf->expand_progress == MaxSector))
-			disks = conf->raid_disks;
-		else {
-			/* spinlock is needed as expand_progress may be
+		if (unlikely(conf->reshape_progress != MaxSector)) {
+			/* spinlock is needed as reshape_progress may be
 			 * 64bit on a 32bit platform, and so it might be
 			 * possible to see a half-updated value
-			 * Ofcourse expand_progress could change after
+			 * Of course reshape_progress could change after
 			 * the lock is dropped, so once we get a reference
 			 * to the stripe that we think it is, we will have
 			 * to check again.
 			 */
 			spin_lock_irq(&conf->device_lock);
-			disks = conf->raid_disks;
-			if (logical_sector >= conf->expand_progress)
+			if (mddev->delta_disks < 0
+			    ? logical_sector < conf->reshape_progress
+			    : logical_sector >= conf->reshape_progress) {
 				disks = conf->previous_raid_disks;
-			else {
-				if (logical_sector >= conf->expand_lo) {
+				previous = 1;
+			} else {
+				if (mddev->delta_disks < 0
+				    ? logical_sector < conf->reshape_safe
+				    : logical_sector >= conf->reshape_safe) {
 					spin_unlock_irq(&conf->device_lock);
 					schedule();
 					goto retry;
@@ -3439,15 +3667,17 @@ static int make_request(struct request_queue *q, struct bio * bi)
 		}
 		data_disks = disks - conf->max_degraded;
 
-		new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
-						  &dd_idx, &pd_idx, conf);
+		new_sector = raid5_compute_sector(conf, logical_sector,
+						  previous,
+						  &dd_idx, NULL);
 		pr_debug("raid5: make_request, sector %llu logical %llu\n",
 			(unsigned long long)new_sector,
 			(unsigned long long)logical_sector);
 
-		sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+		sh = get_active_stripe(conf, new_sector, previous,
+				       (bi->bi_rw&RWA_MASK));
 		if (sh) {
-			if (unlikely(conf->expand_progress != MaxSector)) {
+			if (unlikely(previous)) {
 				/* expansion might have moved on while waiting for a
 				 * stripe, so we must do the range check again.
 				 * Expansion could still move past after this
@@ -3458,8 +3688,9 @@ static int make_request(struct request_queue *q, struct bio * bi)
 				 */
 				int must_retry = 0;
 				spin_lock_irq(&conf->device_lock);
-				if (logical_sector <  conf->expand_progress &&
-				    disks == conf->previous_raid_disks)
+				if (mddev->delta_disks < 0
+				    ? logical_sector >= conf->reshape_progress
+				    : logical_sector < conf->reshape_progress)
 					/* mismatch, need to try again */
 					must_retry = 1;
 				spin_unlock_irq(&conf->device_lock);
@@ -3514,6 +3745,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
 	return 0;
 }
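The direction-dependent comparison that appears twice in make_request() above
reduces to one predicate: a shrinking array reshapes from the end of the device
backwards, so low sectors keep the old geometry, while a growing array reshapes
forwards, so high sectors keep it. A sketch with plain types standing in for
the kernel's sector_t:

	#include <stdbool.h>
	#include <stdio.h>

	static bool use_previous_geometry(long long logical_sector,
					  long long reshape_progress,
					  int delta_disks)
	{
		if (delta_disks < 0)	/* shrinking: reshape runs backwards */
			return logical_sector < reshape_progress;
		else			/* growing: reshape runs forwards */
			return logical_sector >= reshape_progress;
	}

	int main(void)
	{
		printf("%d\n", use_previous_geometry(1000, 4096, 1));  /* 0: already reshaped */
		printf("%d\n", use_previous_geometry(8192, 4096, 1));  /* 1: not reached yet */
		printf("%d\n", use_previous_geometry(1000, 4096, -1)); /* 1: low end still old */
		return 0;
	}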
+static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
+
 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
 {
 	/* reshaping is quite different to recovery/resync so it is
@@ -3527,61 +3760,118 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 */
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	int new_data_disks = conf->raid_disks - conf->max_degraded;
 	int i;
 	int dd_idx;
-	sector_t writepos, safepos, gap;
-
-	if (sector_nr == 0 &&
-	    conf->expand_progress != 0) {
-		/* restarting in the middle, skip the initial sectors */
-		sector_nr = conf->expand_progress;
+	sector_t writepos, readpos, safepos;
+	sector_t stripe_addr;
+	int reshape_sectors;
+	struct list_head stripes;
+
+	if (sector_nr == 0) {
+		/* If restarting in the middle, skip the initial sectors */
+		if (mddev->delta_disks < 0 &&
+		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
+			sector_nr = raid5_size(mddev, 0, 0)
+				- conf->reshape_progress;
+		} else if (mddev->delta_disks > 0 &&
+			   conf->reshape_progress > 0)
+			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
+		if (sector_nr) {
 			*skipped = 1;
 			return sector_nr;
 		}
+	}
 
+	/* We need to process a full chunk at a time.
+	 * If old and new chunk sizes differ, we need to process the
+	 * largest of these
+	 */
+	if (mddev->new_chunk > mddev->chunk_size)
+		reshape_sectors = mddev->new_chunk / 512;
+	else
+		reshape_sectors = mddev->chunk_size / 512;
+
 	/* we update the metadata when there is more than 3Meg
 	 * in the block range (that is rather arbitrary, should
 	 * probably be time based) or when the data about to be
 	 * copied would over-write the source of the data at
 	 * the front of the range.
-	 * i.e. one new_stripe forward from expand_progress new_maps
-	 * to after where expand_lo old_maps to
+	 * i.e. one new_stripe along from reshape_progress new_maps
+	 * to after where reshape_safe old_maps to
 	 */
-	writepos = conf->expand_progress +
-		conf->chunk_size/512*(new_data_disks);
+	writepos = conf->reshape_progress;
 	sector_div(writepos, new_data_disks);
-	safepos = conf->expand_lo;
+	readpos = conf->reshape_progress;
+	sector_div(readpos, data_disks);
+	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
-	gap = conf->expand_progress - conf->expand_lo;
+	if (mddev->delta_disks < 0) {
+		writepos -= reshape_sectors;
+		readpos += reshape_sectors;
+		safepos += reshape_sectors;
+	} else {
+		writepos += reshape_sectors;
+		readpos -= reshape_sectors;
+		safepos -= reshape_sectors;
+	}
 
-	if (writepos >= safepos ||
-	    gap > (new_data_disks)*3000*2 /*3Meg*/) {
+	/* 'writepos' is the most advanced device address we might write.
+	 * 'readpos' is the least advanced device address we might read.
+	 * 'safepos' is the least address recorded in the metadata as having
+	 * been reshaped.
+	 * If 'readpos' is behind 'writepos', then there is no way that we can
+	 * ensure safety in the face of a crash - that must be done by userspace
+	 * making a backup of the data.  So in that case there is no particular
+	 * rush to update metadata.
+	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
+	 * update the metadata to advance 'safepos' to match 'readpos' so that
	 * we can be safe in the event of a crash.
+	 * So we insist on updating metadata if safepos is behind writepos and
+	 * readpos is beyond writepos.
+	 * In any case, update the metadata every 10 seconds.
+	 * Maybe that number should be configurable, but I'm not sure it is
+	 * worth it.... maybe it could be a multiple of safemode_delay???
+	 */
+	if ((mddev->delta_disks < 0
+	     ? (safepos > writepos && readpos < writepos)
+	     : (safepos < writepos && readpos > writepos)) ||
+	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes)==0);
-		mddev->reshape_position = conf->expand_progress;
+		mddev->reshape_position = conf->reshape_progress;
+		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait, mddev->flags == 0 ||
 			   kthread_should_stop());
 		spin_lock_irq(&conf->device_lock);
-		conf->expand_lo = mddev->reshape_position;
+		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 	}
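The safepos/readpos/writepos rule spelled out in the long comment above
condenses to a single predicate. A sketch with plain counters standing in for
jiffies/HZ (an assumed simplification, not the kernel's time_after macro):

	#include <stdbool.h>
	#include <stdio.h>

	static bool must_checkpoint(long long writepos, long long readpos,
				    long long safepos, int delta_disks,
				    unsigned long now, unsigned long last,
				    unsigned long hz)
	{
		/* metadata must advance when writes have passed the recorded
		 * safe point while reads are already beyond the writes */
		bool unsafe = (delta_disks < 0)
			? (safepos > writepos && readpos < writepos)
			: (safepos < writepos && readpos > writepos);

		return unsafe || (now - last) > 10 * hz;	/* 10 second fallback */
	}

	int main(void)
	{
		/* growing array: safepos lags the writes, reads are past them */
		printf("%d\n", must_checkpoint(2048, 4096, 1024, 1, 0, 0, 100)); /* 1 */
		return 0;
	}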
-	for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+	if (mddev->delta_disks < 0) {
+		BUG_ON(conf->reshape_progress == 0);
+		stripe_addr = writepos;
+		BUG_ON((mddev->dev_sectors &
+			~((sector_t)reshape_sectors - 1))
+		       - reshape_sectors - stripe_addr
+		       != sector_nr);
+	} else {
+		BUG_ON(writepos != sector_nr + reshape_sectors);
+		stripe_addr = sector_nr;
+	}
+	INIT_LIST_HEAD(&stripes);
+	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
 		int j;
 		int skipped = 0;
-		pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
-		sh = get_active_stripe(conf, sector_nr+i,
-				       conf->raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, stripe_addr+i, 0, 0);
 		set_bit(STRIPE_EXPANDING, &sh->state);
 		atomic_inc(&conf->reshape_stripes);
 		/* If any of this stripe is beyond the end of the old
@@ -3592,10 +3882,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			if (j == sh->pd_idx)
 				continue;
 			if (conf->level == 6 &&
-			    j == raid6_next_disk(sh->pd_idx, sh->disks))
+			    j == sh->qd_idx)
 				continue;
-			s = compute_blocknr(sh, j);
-			if (s < mddev->array_sectors) {
+			s = compute_blocknr(sh, j, 0);
+			if (s < raid5_size(mddev, 0, 0)) {
 				skipped = 1;
 				continue;
 			}
@@ -3607,10 +3897,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 			set_bit(STRIPE_EXPAND_READY, &sh->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
-		release_stripe(sh);
+		list_add(&sh->lru, &stripes);
 	}
 	spin_lock_irq(&conf->device_lock);
-	conf->expand_progress = (sector_nr + i) * new_data_disks;
+	if (mddev->delta_disks < 0)
+		conf->reshape_progress -= reshape_sectors * new_data_disks;
+	else
+		conf->reshape_progress += reshape_sectors * new_data_disks;
 	spin_unlock_irq(&conf->device_lock);
 	/* Ok, those stripes are ready. We can start scheduling
 	 * reads on the source stripes.
@@ -3618,46 +3911,50 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * block on the destination stripes.
 	 */
 	first_sector =
-		raid5_compute_sector(sector_nr*(new_data_disks),
-				     raid_disks, data_disks,
-				     &dd_idx, &pd_idx, conf);
+		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
				     1, &dd_idx, NULL);
 	last_sector =
-		raid5_compute_sector((sector_nr+conf->chunk_size/512)
-				     *(new_data_disks) -1,
-				     raid_disks, data_disks,
-				     &dd_idx, &pd_idx, conf);
-	if (last_sector >= (mddev->size<<1))
-		last_sector = (mddev->size<<1)-1;
+		raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512)
+					    *(new_data_disks) - 1),
+				     1, &dd_idx, NULL);
+	if (last_sector >= mddev->dev_sectors)
+		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
-		pd_idx = stripe_to_pdidx(first_sector, conf,
-					 conf->previous_raid_disks);
-		sh = get_active_stripe(conf, first_sector,
-				       conf->previous_raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, first_sector, 1, 0);
 		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 		first_sector += STRIPE_SECTORS;
 	}
+	/* Now that the sources are clearly marked, we can release
+	 * the destination stripes
+	 */
+	while (!list_empty(&stripes)) {
+		sh = list_entry(stripes.next, struct stripe_head, lru);
+		list_del_init(&sh->lru);
+		release_stripe(sh);
+	}
 	/* If this takes us to the resync_max point where we have to pause,
 	 * then we need to write out the superblock.
 	 */
-	sector_nr += conf->chunk_size>>9;
+	sector_nr += reshape_sectors;
 	if (sector_nr >= mddev->resync_max) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
-		mddev->reshape_position = conf->expand_progress;
+		mddev->reshape_position = conf->reshape_progress;
+		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
 			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
 			   || kthread_should_stop());
 		spin_lock_irq(&conf->device_lock);
-		conf->expand_lo = mddev->reshape_position;
+		conf->reshape_safe = mddev->reshape_position;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 	}
-	return conf->chunk_size>>9;
+	return reshape_sectors;
 }
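The new reshape unit is the larger of the two chunk sizes, so one pass always
covers a whole chunk in both the old and the new geometry. The arithmetic in
sectors (illustrative byte values):

	#include <stdio.h>

	int main(void)
	{
		int chunk_size = 65536, new_chunk = 131072;	/* bytes */
		int reshape_sectors = (new_chunk > chunk_size ? new_chunk
							      : chunk_size) / 512;

		printf("reshape unit: %d sectors\n", reshape_sectors);	/* 256 */
		return 0;
	}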
 /* FIXME go_faster isn't used */
@@ -3665,9 +3962,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 {
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
-	int pd_idx;
-	int raid_disks = conf->raid_disks;
-	sector_t max_sector = mddev->size << 1;
+	sector_t max_sector = mddev->dev_sectors;
 	int sync_blocks;
 	int still_degraded = 0;
 	int i;
@@ -3675,6 +3970,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	if (sector_nr >= max_sector) {
 		/* just being told to finish up .. nothing much to do */
 		unplug_slaves(mddev);
+
 		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
 			end_reshape(conf);
 			return 0;
@@ -3705,7 +4001,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 	 */
 	if (mddev->degraded >= conf->max_degraded &&
 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-		sector_t rv = (mddev->size << 1) - sector_nr;
+		sector_t rv = mddev->dev_sectors - sector_nr;
 		*skipped = 1;
 		return rv;
 	}
@@ -3721,10 +4017,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 
-	pd_idx = stripe_to_pdidx(sector_nr, conf, raid_disks);
-	sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 1);
+	sh = get_active_stripe(conf, sector_nr, 0, 1);
 	if (sh == NULL) {
-		sh = get_active_stripe(conf, sector_nr, raid_disks, pd_idx, 0);
+		sh = get_active_stripe(conf, sector_nr, 0, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
@@ -3766,19 +4061,15 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
 	 */
 	struct stripe_head *sh;
-	int dd_idx, pd_idx;
+	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
 	int remaining;
 	int handled = 0;
 
 	logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-	sector = raid5_compute_sector(	logical_sector,
-					conf->raid_disks,
-					conf->raid_disks - conf->max_degraded,
-					&dd_idx,
-					&pd_idx,
-					conf);
+	sector = raid5_compute_sector(conf, logical_sector,
+				      0, &dd_idx, NULL);
 	last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
 
 	for (; logical_sector < last_sector;
@@ -3790,7 +4081,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 			/* already done this stripe */
 			continue;
 
-		sh = get_active_stripe(conf, sector, conf->raid_disks, pd_idx, 1);
+		sh = get_active_stripe(conf, sector, 0, 1);
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
@@ -3992,89 +4283,69 @@ static struct attribute_group raid5_attrs_group = {
 	.attrs = raid5_attrs,
 };
 
-static int run(mddev_t *mddev)
+static sector_t
+raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+
+	if (!sectors)
+		sectors = mddev->dev_sectors;
+	if (!raid_disks) {
+		/* size is defined by the smallest of previous and new size */
+		if (conf->raid_disks < conf->previous_raid_disks)
+			raid_disks = conf->raid_disks;
+		else
+			raid_disks = conf->previous_raid_disks;
+	}
+
+	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+	sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+	return sectors * (raid_disks - conf->max_degraded);
+}
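raid5_size() above packs the capacity rule into three lines: the per-device
size is rounded down to a multiple of both the old and the new chunk size,
then multiplied by the number of data disks (the smaller geometry wins during
a reshape). A quick sketch of that arithmetic with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long dev_sectors = 1000000;	/* per-device */
		unsigned long long chunk = 128, new_chunk = 256; /* sectors */
		int raid_disks = 5, max_degraded = 1;

		dev_sectors &= ~(chunk - 1);		/* align to old chunk */
		dev_sectors &= ~(new_chunk - 1);	/* and to new chunk */
		printf("%llu sectors\n", dev_sectors * (raid_disks - max_degraded));
		return 0;
	}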
static raid5_conf_t *setup_conf(mddev_t *mddev)
{ {
raid5_conf_t *conf; raid5_conf_t *conf;
int raid_disk, memory; int raid_disk, memory;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
struct disk_info *disk; struct disk_info *disk;
int working_disks = 0;
if (mddev->level != 5 && mddev->level != 4 && mddev->level != 6) { if (mddev->new_level != 5
&& mddev->new_level != 4
&& mddev->new_level != 6) {
printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
mdname(mddev), mddev->level); mdname(mddev), mddev->new_level);
return -EIO; return ERR_PTR(-EIO);
} }
if ((mddev->new_level == 5
if (mddev->chunk_size < PAGE_SIZE) { && !algorithm_valid_raid5(mddev->new_layout)) ||
printk(KERN_ERR "md/raid5: chunk_size must be at least " (mddev->new_level == 6
"PAGE_SIZE but %d < %ld\n", && !algorithm_valid_raid6(mddev->new_layout))) {
mddev->chunk_size, PAGE_SIZE); printk(KERN_ERR "raid5: %s: layout %d not supported\n",
return -EINVAL; mdname(mddev), mddev->new_layout);
return ERR_PTR(-EIO);
}
if (mddev->new_level == 6 && mddev->raid_disks < 4) {
printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
mdname(mddev), mddev->raid_disks);
return ERR_PTR(-EINVAL);
} }
if (mddev->reshape_position != MaxSector) { if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
/* Check that we can continue the reshape. printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
* Currently only disks can change, it must mddev->new_chunk, mdname(mddev));
* increase, and we must be past the point where return ERR_PTR(-EINVAL);
* a stripe over-writes itself
*/
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 5 ? 1 : 2);
if (mddev->new_level != mddev->level ||
mddev->new_layout != mddev->layout ||
mddev->new_chunk != mddev->chunk_size) {
printk(KERN_ERR "raid5: %s: unsupported reshape "
"required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->delta_disks <= 0) {
printk(KERN_ERR "raid5: %s: unsupported reshape "
"(reduce disks) required - aborting.\n",
mdname(mddev));
return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
* further up in new geometry must map after here in old
* geometry.
*/
here_new = mddev->reshape_position;
if (sector_div(here_new, (mddev->chunk_size>>9)*
(mddev->raid_disks - max_degraded))) {
printk(KERN_ERR "raid5: reshape_position not "
"on a stripe boundary\n");
return -EINVAL;
}
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
sector_div(here_old, (mddev->chunk_size>>9)*
(old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
if (here_new >= here_old) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "raid5: reshape_position too early for "
"auto-recovery - aborting.\n");
return -EINVAL;
}
printk(KERN_INFO "raid5: reshape will continue\n");
/* OK, we should be able to continue; */
} }
conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); if (conf == NULL)
if ((conf = mddev->private) == NULL)
goto abort; goto abort;
if (mddev->reshape_position == MaxSector) {
conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks;
} else {
conf->raid_disks = mddev->raid_disks; conf->raid_disks = mddev->raid_disks;
if (mddev->reshape_position == MaxSector)
conf->previous_raid_disks = mddev->raid_disks;
else
conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
}
conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info),
GFP_KERNEL); GFP_KERNEL);
...@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev) ...@@ -4086,13 +4357,12 @@ static int run(mddev_t *mddev)
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
goto abort; goto abort;
if (mddev->level == 6) { if (mddev->new_level == 6) {
conf->spare_page = alloc_page(GFP_KERNEL); conf->spare_page = alloc_page(GFP_KERNEL);
if (!conf->spare_page) if (!conf->spare_page)
goto abort; goto abort;
} }
spin_lock_init(&conf->device_lock); spin_lock_init(&conf->device_lock);
mddev->queue->queue_lock = &conf->device_lock;
init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap); init_waitqueue_head(&conf->wait_for_overlap);
INIT_LIST_HEAD(&conf->handle_list); INIT_LIST_HEAD(&conf->handle_list);
...@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev) ...@@ -4121,47 +4391,134 @@ static int run(mddev_t *mddev)
printk(KERN_INFO "raid5: device %s operational as raid" printk(KERN_INFO "raid5: device %s operational as raid"
" disk %d\n", bdevname(rdev->bdev,b), " disk %d\n", bdevname(rdev->bdev,b),
raid_disk); raid_disk);
working_disks++;
} else } else
/* Cannot rely on bitmap to complete recovery */ /* Cannot rely on bitmap to complete recovery */
conf->fullsync = 1; conf->fullsync = 1;
} }
/* conf->chunk_size = mddev->new_chunk;
* 0 for a fully functional array, 1 or 2 for a degraded array. conf->level = mddev->new_level;
*/
mddev->degraded = conf->raid_disks - working_disks;
conf->mddev = mddev;
conf->chunk_size = mddev->chunk_size;
conf->level = mddev->level;
if (conf->level == 6) if (conf->level == 6)
conf->max_degraded = 2; conf->max_degraded = 2;
else else
conf->max_degraded = 1; conf->max_degraded = 1;
conf->algorithm = mddev->layout; conf->algorithm = mddev->new_layout;
conf->max_nr_stripes = NR_STRIPES; conf->max_nr_stripes = NR_STRIPES;
conf->expand_progress = mddev->reshape_position; conf->reshape_progress = mddev->reshape_position;
if (conf->reshape_progress != MaxSector) {
conf->prev_chunk = mddev->chunk_size;
conf->prev_algo = mddev->layout;
}
/* device size must be a multiple of chunk size */ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
mddev->size &= ~(mddev->chunk_size/1024 -1); conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
mddev->resync_max_sectors = mddev->size << 1; if (grow_stripes(conf, conf->max_nr_stripes)) {
printk(KERN_ERR
"raid5: couldn't allocate %dkB for buffers\n", memory);
goto abort;
} else
printk(KERN_INFO "raid5: allocated %dkB for %s\n",
memory, mdname(mddev));
if (conf->level == 6 && conf->raid_disks < 4) { conf->thread = md_register_thread(raid5d, mddev, "%s_raid5");
printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", if (!conf->thread) {
mdname(mddev), conf->raid_disks); printk(KERN_ERR
"raid5: couldn't allocate thread for %s\n",
mdname(mddev));
goto abort; goto abort;
} }
if (!conf->chunk_size || conf->chunk_size % 4) {
printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", return conf;
conf->chunk_size, mdname(mddev));
goto abort; abort:
if (conf) {
shrink_stripes(conf);
safe_put_page(conf->spare_page);
kfree(conf->disks);
kfree(conf->stripe_hashtbl);
kfree(conf);
return ERR_PTR(-EIO);
} else
return ERR_PTR(-ENOMEM);
}
static int run(mddev_t *mddev)
{
raid5_conf_t *conf;
int working_disks = 0;
mdk_rdev_t *rdev;
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
* Currently only disks can change, it must
* increase, and we must be past the point where
* a stripe over-writes itself
*/
sector_t here_new, here_old;
int old_disks;
int max_degraded = (mddev->level == 6 ? 2 : 1);
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "raid5: %s: unsupported reshape "
"required - aborting.\n",
mdname(mddev));
return -EINVAL;
} }
if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { old_disks = mddev->raid_disks - mddev->delta_disks;
printk(KERN_ERR /* reshape_position must be on a new-stripe boundary, and one
"raid5: unsupported parity algorithm %d for %s\n", * further up in new geometry must map after here in old
conf->algorithm, mdname(mddev)); * geometry.
goto abort; */
here_new = mddev->reshape_position;
if (sector_div(here_new, (mddev->new_chunk>>9)*
(mddev->raid_disks - max_degraded))) {
printk(KERN_ERR "raid5: reshape_position not "
"on a stripe boundary\n");
return -EINVAL;
} }
/* here_new is the stripe we will write to */
here_old = mddev->reshape_position;
sector_div(here_old, (mddev->chunk_size>>9)*
(old_disks-max_degraded));
/* here_old is the first stripe that we might need to read
* from */
if (here_new >= here_old) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "raid5: reshape_position too early for "
"auto-recovery - aborting.\n");
return -EINVAL;
}
printk(KERN_INFO "raid5: reshape will continue\n");
/* OK, we should be able to continue; */
} else {
BUG_ON(mddev->level != mddev->new_level);
BUG_ON(mddev->layout != mddev->new_layout);
BUG_ON(mddev->chunk_size != mddev->new_chunk);
BUG_ON(mddev->delta_disks != 0);
}
if (mddev->private == NULL)
conf = setup_conf(mddev);
else
conf = mddev->private;
if (IS_ERR(conf))
return PTR_ERR(conf);
mddev->thread = conf->thread;
conf->thread = NULL;
mddev->private = conf;
/*
* 0 for a fully functional array, 1 or 2 for a degraded array.
*/
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 &&
test_bit(In_sync, &rdev->flags))
working_disks++;
mddev->degraded = conf->raid_disks - working_disks;
	if (mddev->degraded > conf->max_degraded) {
		printk(KERN_ERR "raid5: not enough operational devices for %s"
		       " (%d/%d failed)\n",
@@ -4169,6 +4526,10 @@ static int run(mddev_t *mddev)
		goto abort;
	}
/* device size must be a multiple of chunk size */
mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
mddev->resync_max_sectors = mddev->dev_sectors;
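A brief aside on the mask arithmetic above (illustrative values):

/* With a 64KiB chunk, chunk_size / 512 is 128 sectors, so the mask
 * rounds each member device down to a whole number of chunks;
 * e.g. dev_sectors = 1000005 becomes 999936 (7812 chunks of 128).
 */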
	if (mddev->degraded > 0 &&
	    mddev->recovery_cp != MaxSector) {
		if (mddev->ok_start_degraded)
@@ -4184,43 +4545,22 @@ static int run(mddev_t *mddev)
		}
	}
{
mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
if (!mddev->thread) {
printk(KERN_ERR
"raid5: couldn't allocate thread for %s\n",
mdname(mddev));
goto abort;
}
}
memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
if (grow_stripes(conf, conf->max_nr_stripes)) {
printk(KERN_ERR
"raid5: couldn't allocate %dkB for buffers\n", memory);
shrink_stripes(conf);
md_unregister_thread(mddev->thread);
goto abort;
} else
printk(KERN_INFO "raid5: allocated %dkB for %s\n",
memory, mdname(mddev));
	if (mddev->degraded == 0)
		printk("raid5: raid level %d set %s active with %d out of %d"
		       " devices, algorithm %d\n", conf->level, mdname(mddev),
		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-		       conf->algorithm);
+		       mddev->new_layout);
	else
		printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
		       " out of %d devices, algorithm %d\n", conf->level,
		       mdname(mddev), mddev->raid_disks - mddev->degraded,
-		       mddev->raid_disks, conf->algorithm);
+		       mddev->raid_disks, mddev->new_layout);

	print_raid5_conf(conf);

-	if (conf->expand_progress != MaxSector) {
+	if (conf->reshape_progress != MaxSector) {
		printk("...ok start reshape thread\n");
-		conf->expand_lo = conf->expand_progress;
+		conf->reshape_safe = conf->reshape_progress;
		atomic_set(&conf->reshape_stripes, 0);
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4247,18 +4587,22 @@ static int run(mddev_t *mddev)
		       "raid5: failed to create sysfs attributes for %s\n",
		       mdname(mddev));
mddev->queue->queue_lock = &conf->device_lock;
	mddev->queue->unplug_fn = raid5_unplug_device;
	mddev->queue->backing_dev_info.congested_data = mddev;
	mddev->queue->backing_dev_info.congested_fn = raid5_congested;
-	mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks -
-						  conf->max_degraded);
+	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));

	blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);

	return 0;
abort:
+	md_unregister_thread(mddev->thread);
+	mddev->thread = NULL;
	if (conf) {
-		shrink_stripes(conf);
		print_raid5_conf(conf);
		safe_put_page(conf->spare_page);
		kfree(conf->disks);
@@ -4396,6 +4740,10 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
	print_raid5_conf(conf);
	rdev = p->rdev;
	if (rdev) {
+		if (number >= conf->raid_disks &&
+		    conf->reshape_progress == MaxSector)
+			clear_bit(In_sync, &rdev->flags);
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
			err = -EBUSY;
@@ -4405,7 +4753,8 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
		 * isn't possible.
		 */
		if (!test_bit(Faulty, &rdev->flags) &&
-		    mddev->degraded <= conf->max_degraded) {
+		    mddev->degraded <= conf->max_degraded &&
+		    number < conf->raid_disks) {
			err = -EBUSY;
			goto abort;
		}
@@ -4472,36 +4821,48 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
-	raid5_conf_t *conf = mddev_to_conf(mddev);
	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
-	mddev->array_sectors = sectors * (mddev->raid_disks
-					  - conf->max_degraded);
+	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
+					       mddev->raid_disks));
+	if (mddev->array_sectors >
+	    raid5_size(mddev, sectors, mddev->raid_disks))
+		return -EINVAL;
	set_capacity(mddev->gendisk, mddev->array_sectors);
	mddev->changed = 1;
-	if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
-		mddev->recovery_cp = mddev->size << 1;
+	if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
+		mddev->recovery_cp = mddev->dev_sectors;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
-	mddev->size = sectors /2;
+	mddev->dev_sectors = sectors;
	mddev->resync_max_sectors = sectors;
	return 0;
}
#ifdef CONFIG_MD_RAID5_RESHAPE
static int raid5_check_reshape(mddev_t *mddev)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
-	int err;

-	if (mddev->delta_disks < 0 ||
-	    mddev->new_level != mddev->level)
-		return -EINVAL; /* Cannot shrink array or change level yet */
-	if (mddev->delta_disks == 0)
-		return 0; /* nothing to do */
+	if (mddev->delta_disks == 0 &&
+	    mddev->new_layout == mddev->layout &&
+	    mddev->new_chunk == mddev->chunk_size)
+		return -EINVAL; /* nothing to do */
	if (mddev->bitmap)
		/* Cannot grow a bitmap yet */
		return -EBUSY;
+	if (mddev->degraded > conf->max_degraded)
+		return -EINVAL;
+	if (mddev->delta_disks < 0) {
+		/* We might be able to shrink, but the devices must
+		 * be made bigger first.
+		 * For raid6, 4 is the minimum size.
+		 * Otherwise 2 is the minimum
+		 */
+		int min = 2;
+		if (mddev->level == 6)
+			min = 4;
+		if (mddev->raid_disks + mddev->delta_disks < min)
+			return -EINVAL;
+	}

	/* Can only proceed if there are plenty of stripe_heads.
	 * We need a minimum of one full stripe, and for sensible progress
@@ -4514,18 +4875,12 @@ static int raid5_check_reshape(mddev_t *mddev)
	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (mddev->chunk_size / STRIPE_SIZE)*4);
+		       (max(mddev->chunk_size, mddev->new_chunk)
+			/ STRIPE_SIZE)*4);
		return -ENOSPC;
	}

-	err = resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
-	if (err)
-		return err;
-
-	if (mddev->degraded > conf->max_degraded)
-		return -EINVAL;
-	/* looks like we might be able to manage this */
-	return 0;
+	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
}
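The stripe-cache precondition also rewards a worked example; the numbers assume STRIPE_SIZE is one 4KiB page, which is the common configuration:

/* Illustrative: a reshape involving a 256KiB chunk needs
 * (262144 / 4096) * 4 = 256 stripe_heads, which is exactly the default
 * max_nr_stripes.  Any larger chunk fails with -ENOSPC until the
 * administrator enlarges the cache (e.g. via the stripe_cache_size
 * sysfs attribute).
 */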
static int raid5_start_reshape(mddev_t *mddev)
@@ -4550,12 +4905,31 @@ static int raid5_start_reshape(mddev_t *mddev)
	 */
		return -EINVAL;
/* Refuse to reduce size of the array. Any reductions in
* array size must be through explicit setting of array_size
* attribute.
*/
if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
< mddev->array_sectors) {
printk(KERN_ERR "md: %s: array size must be reduced "
"before number of disks\n", mdname(mddev));
return -EINVAL;
}
	atomic_set(&conf->reshape_stripes, 0);
	spin_lock_irq(&conf->device_lock);
	conf->previous_raid_disks = conf->raid_disks;
	conf->raid_disks += mddev->delta_disks;
-	conf->expand_progress = 0;
-	conf->expand_lo = 0;
+	conf->prev_chunk = conf->chunk_size;
+	conf->chunk_size = mddev->new_chunk;
+	conf->prev_algo = conf->algorithm;
+	conf->algorithm = mddev->new_layout;
+	if (mddev->delta_disks < 0)
+		conf->reshape_progress = raid5_size(mddev, 0, 0);
+	else
+		conf->reshape_progress = 0;
+	conf->reshape_safe = conf->reshape_progress;
+	conf->generation++;
	spin_unlock_irq(&conf->device_lock);
	/* Add some new drives, as many as will fit.
@@ -4580,9 +4954,12 @@ static int raid5_start_reshape(mddev_t *mddev)
				break;
		}
if (mddev->delta_disks > 0) {
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
			- added_devices;
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
	mddev->reshape_position = 0;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -4597,52 +4974,86 @@ static int raid5_start_reshape(mddev_t *mddev)
		mddev->recovery = 0;
		spin_lock_irq(&conf->device_lock);
		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
-		conf->expand_progress = MaxSector;
+		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
		return -EAGAIN;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event(mddev);
	return 0;
}
#endif
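For orientation, a sketch of how user space reaches these hooks; the exact command line is an assumption for illustration, not part of this commit:

/* Hypothetical operator view: a grow such as
 *
 *     mdadm --grow /dev/md0 --raid-devices=6
 *
 * ends up writing the md sysfs attributes, after which md core
 * validates the request through pers->check_reshape() and then kicks
 * off pers->start_reshape() as defined above.
 */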
/* This is called from the reshape thread and should make any
* changes needed in 'conf'
*/
static void end_reshape(raid5_conf_t *conf)
{
-	struct block_device *bdev;

	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
-		conf->mddev->array_sectors = 2 * conf->mddev->size *
-			(conf->raid_disks - conf->max_degraded);
-		set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors);
-		conf->mddev->changed = 1;
-
-		bdev = bdget_disk(conf->mddev->gendisk, 0);
-		if (bdev) {
-			mutex_lock(&bdev->bd_inode->i_mutex);
-			i_size_write(bdev->bd_inode,
-				     (loff_t)conf->mddev->array_sectors << 9);
-			mutex_unlock(&bdev->bd_inode->i_mutex);
-			bdput(bdev);
-		}
		spin_lock_irq(&conf->device_lock);
-		conf->expand_progress = MaxSector;
+		conf->previous_raid_disks = conf->raid_disks;
+		conf->reshape_progress = MaxSector;
		spin_unlock_irq(&conf->device_lock);
-		conf->mddev->reshape_position = MaxSector;
+		wake_up(&conf->wait_for_overlap);

		/* read-ahead size must cover two whole stripes, which is
		 * 2 * (datadisks) * chunksize where 'n' is the number of raid devices
		 */
		{
-			int data_disks = conf->previous_raid_disks - conf->max_degraded;
-			int stripe = data_disks *
-				(conf->mddev->chunk_size / PAGE_SIZE);
+			int data_disks = conf->raid_disks - conf->max_degraded;
+			int stripe = data_disks * (conf->chunk_size
+						   / PAGE_SIZE);
			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
		}
	}
}
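Plugging numbers into the read-ahead calculation above (illustrative):

/* Illustrative: 6 devices, max_degraded = 1, 64KiB chunk, 4KiB pages:
 * data_disks = 5 and stripe = 5 * (65536 / 4096) = 80 pages, so
 * ra_pages is raised to at least 160 pages, i.e. 640KiB or two full
 * data stripes.
 */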
/* This is called from the raid5d thread with mddev_lock held.
* It makes config changes to the device.
*/
static void raid5_finish_reshape(mddev_t *mddev)
{
struct block_device *bdev;
raid5_conf_t *conf = mddev_to_conf(mddev);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->delta_disks > 0) {
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
bdev = bdget_disk(mddev->gendisk, 0);
if (bdev) {
mutex_lock(&bdev->bd_inode->i_mutex);
i_size_write(bdev->bd_inode,
(loff_t)mddev->array_sectors << 9);
mutex_unlock(&bdev->bd_inode->i_mutex);
bdput(bdev);
}
} else {
int d;
mddev->degraded = conf->raid_disks;
for (d = 0; d < conf->raid_disks ; d++)
if (conf->disks[d].rdev &&
test_bit(In_sync,
&conf->disks[d].rdev->flags))
mddev->degraded--;
for (d = conf->raid_disks ;
d < conf->raid_disks - mddev->delta_disks;
d++)
raid5_remove_disk(mddev, d);
}
mddev->layout = conf->algorithm;
mddev->chunk_size = conf->chunk_size;
mddev->reshape_position = MaxSector;
mddev->delta_disks = 0;
}
}
static void raid5_quiesce(mddev_t *mddev, int state)
{
	raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4672,6 +5083,212 @@ static void raid5_quiesce(mddev_t *mddev, int state)
	}
}
static void *raid5_takeover_raid1(mddev_t *mddev)
{
int chunksect;
if (mddev->raid_disks != 2 ||
mddev->degraded > 1)
return ERR_PTR(-EINVAL);
/* Should check if there are write-behind devices? */
chunksect = 64*2; /* 64K by default */
/* The array must be an exact multiple of chunksize */
while (chunksect && (mddev->array_sectors & (chunksect-1)))
chunksect >>= 1;
if ((chunksect<<9) < STRIPE_SIZE)
/* array size does not allow a suitable chunk size */
return ERR_PTR(-EINVAL);
mddev->new_level = 5;
mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
mddev->new_chunk = chunksect << 9;
return setup_conf(mddev);
}
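The chunk-size search above starts at 64KiB and halves until the array size divides evenly; two illustrative cases (assuming a 4KiB STRIPE_SIZE):

/* chunksect starts at 128 sectors (64KiB).
 * array_sectors = 2097152 (1GiB): 2097152 % 128 == 0, so the takeover
 * keeps a 64KiB chunk.
 * array_sectors = 2097100: halving reaches chunksect = 4 before the
 * remainder clears, and 4 << 9 = 2048 bytes is below a 4KiB
 * STRIPE_SIZE, so the takeover is rejected with -EINVAL.
 */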
static void *raid5_takeover_raid6(mddev_t *mddev)
{
int new_layout;
switch (mddev->layout) {
case ALGORITHM_LEFT_ASYMMETRIC_6:
new_layout = ALGORITHM_LEFT_ASYMMETRIC;
break;
case ALGORITHM_RIGHT_ASYMMETRIC_6:
new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
break;
case ALGORITHM_LEFT_SYMMETRIC_6:
new_layout = ALGORITHM_LEFT_SYMMETRIC;
break;
case ALGORITHM_RIGHT_SYMMETRIC_6:
new_layout = ALGORITHM_RIGHT_SYMMETRIC;
break;
case ALGORITHM_PARITY_0_6:
new_layout = ALGORITHM_PARITY_0;
break;
case ALGORITHM_PARITY_N:
new_layout = ALGORITHM_PARITY_N;
break;
default:
return ERR_PTR(-EINVAL);
}
mddev->new_level = 5;
mddev->new_layout = new_layout;
mddev->delta_disks = -1;
mddev->raid_disks -= 1;
return setup_conf(mddev);
}
static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
{
/* For a 2-drive array, the layout and chunk size can be changed
 * immediately as no restriping is needed.
* For larger arrays we record the new value - after validation
* to be used by a reshape pass.
*/
raid5_conf_t *conf = mddev_to_conf(mddev);
if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
return -EINVAL;
if (new_chunk > 0) {
if (new_chunk & (new_chunk-1))
/* not a power of 2 */
return -EINVAL;
if (new_chunk < PAGE_SIZE)
return -EINVAL;
if (mddev->array_sectors & ((new_chunk>>9)-1))
/* not factor of array size */
return -EINVAL;
}
/* They look valid */
if (mddev->raid_disks == 2) {
if (new_layout >= 0) {
conf->algorithm = new_layout;
mddev->layout = mddev->new_layout = new_layout;
}
if (new_chunk > 0) {
conf->chunk_size = new_chunk;
mddev->chunk_size = mddev->new_chunk = new_chunk;
}
set_bit(MD_CHANGE_DEVS, &mddev->flags);
md_wakeup_thread(mddev->thread);
} else {
if (new_layout >= 0)
mddev->new_layout = new_layout;
if (new_chunk > 0)
mddev->new_chunk = new_chunk;
}
return 0;
}
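The new_chunk validation uses the usual single-bit trick; a minimal sketch with a hypothetical helper name:

/* A power of two has exactly one bit set, so n & (n - 1) clears it to
 * zero; any other value keeps a lower bit set.  For example,
 * 65536 & 65535 == 0 (accepted) while 65537 & 65536 != 0 (rejected).
 */
static inline int chunk_is_pow2(int n) /* hypothetical helper */
{
	return n > 0 && (n & (n - 1)) == 0;
}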
static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
{
if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
return -EINVAL;
if (new_chunk > 0) {
if (new_chunk & (new_chunk-1))
/* not a power of 2 */
return -EINVAL;
if (new_chunk < PAGE_SIZE)
return -EINVAL;
if (mddev->array_sectors & ((new_chunk>>9)-1))
/* not factor of array size */
return -EINVAL;
}
/* They look valid */
if (new_layout >= 0)
mddev->new_layout = new_layout;
if (new_chunk > 0)
mddev->new_chunk = new_chunk;
return 0;
}
static void *raid5_takeover(mddev_t *mddev)
{
/* raid5 can take over:
* raid0 - if all devices are the same - make it a raid4 layout
* raid1 - if there are two drives. We need to know the chunk size
* raid4 - trivial - just use a raid4 layout.
* raid6 - Providing it is a *_6 layout
*
* For now, just do raid1
*/
if (mddev->level == 1)
return raid5_takeover_raid1(mddev);
if (mddev->level == 4) {
mddev->new_layout = ALGORITHM_PARITY_N;
mddev->new_level = 5;
return setup_conf(mddev);
}
if (mddev->level == 6)
return raid5_takeover_raid6(mddev);
return ERR_PTR(-EINVAL);
}
static struct mdk_personality raid5_personality;
static void *raid6_takeover(mddev_t *mddev)
{
/* Currently can only take over a raid5. We map the
* personality to an equivalent raid6 personality
* with the Q block at the end.
*/
int new_layout;
if (mddev->pers != &raid5_personality)
return ERR_PTR(-EINVAL);
if (mddev->degraded > 1)
return ERR_PTR(-EINVAL);
if (mddev->raid_disks > 253)
return ERR_PTR(-EINVAL);
if (mddev->raid_disks < 3)
return ERR_PTR(-EINVAL);
switch (mddev->layout) {
case ALGORITHM_LEFT_ASYMMETRIC:
new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
break;
case ALGORITHM_RIGHT_ASYMMETRIC:
new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
break;
case ALGORITHM_LEFT_SYMMETRIC:
new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
break;
case ALGORITHM_RIGHT_SYMMETRIC:
new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
break;
case ALGORITHM_PARITY_0:
new_layout = ALGORITHM_PARITY_0_6;
break;
case ALGORITHM_PARITY_N:
new_layout = ALGORITHM_PARITY_N;
break;
default:
return ERR_PTR(-EINVAL);
}
mddev->new_level = 6;
mddev->new_layout = new_layout;
mddev->delta_disks = 1;
mddev->raid_disks += 1;
return setup_conf(mddev);
}
static struct mdk_personality raid6_personality =
{
	.name = "raid6",
@@ -4687,11 +5304,13 @@ static struct mdk_personality raid6_personality =
	.spare_active = raid5_spare_active,
	.sync_request = sync_request,
	.resize = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
-#endif
+	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
+	.takeover = raid6_takeover,
+	.reconfig = raid6_reconfig,
};
static struct mdk_personality raid5_personality =
{
@@ -4708,11 +5327,13 @@ static struct mdk_personality raid5_personality =
	.spare_active = raid5_spare_active,
	.sync_request = sync_request,
	.resize = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
-#endif
+	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
+	.takeover = raid5_takeover,
+	.reconfig = raid5_reconfig,
};
static struct mdk_personality raid4_personality =
@@ -4730,20 +5351,15 @@ static struct mdk_personality raid4_personality =
	.spare_active = raid5_spare_active,
	.sync_request = sync_request,
	.resize = raid5_resize,
-#ifdef CONFIG_MD_RAID5_RESHAPE
+	.size = raid5_size,
	.check_reshape = raid5_check_reshape,
	.start_reshape = raid5_start_reshape,
-#endif
+	.finish_reshape = raid5_finish_reshape,
	.quiesce = raid5_quiesce,
};
static int __init raid5_init(void)
{
-	int e;
-
-	e = raid6_select_algo();
-	if ( e )
-		return e;
	register_md_personality(&raid6_personality);
	register_md_personality(&raid5_personality);
	register_md_personality(&raid4_personality);
......
#ifndef _RAID5_H
#define _RAID5_H

-#include <linux/raid/md.h>
#include <linux/raid/xor.h>

/*
@@ -199,8 +198,12 @@ struct stripe_head {
	struct hlist_node hash;
	struct list_head lru; /* inactive_list or handle_list */
	struct raid5_private_data *raid_conf;
+	short generation; /* increments with every
+			   * reshape */
	sector_t sector; /* sector of this row */
-	int pd_idx; /* parity disk index */
+	short pd_idx; /* parity disk index */
+	short qd_idx; /* 'Q' disk index for raid6 */
+	short ddf_layout; /* use DDF ordering to calculate Q */
	unsigned long state; /* state flags */
	atomic_t count; /* nr of active thread/requests */
	spinlock_t lock;
@@ -238,7 +241,7 @@ struct stripe_head_state {
/* r6_state - extra state data only relevant to r6 */
struct r6_state {
-	int p_failed, q_failed, qd_idx, failed_num[2];
+	int p_failed, q_failed, failed_num[2];
};
/* Flags */
@@ -268,6 +271,8 @@ struct r6_state {
#define READ_MODIFY_WRITE 2
/* not a write method, but a compute_parity mode */
#define CHECK_PARITY 3
/* Additional compute_parity mode -- updates the parity w/o LOCKING */
#define UPDATE_PARITY 4
/*
 * Stripe state
@@ -334,12 +339,21 @@ struct raid5_private_data {
	int raid_disks;
	int max_nr_stripes;

-	/* used during an expand */
-	sector_t expand_progress; /* MaxSector when no expand happening */
-	sector_t expand_lo; /* from here up to expand_progress it out-of-bounds
-			     * as we haven't flushed the metadata yet
-			     */
+	/* reshape_progress is the leading edge of a 'reshape'.
+	 * It has value MaxSector when no reshape is happening.
+	 * If delta_disks < 0, it is the last sector we started work on,
+	 * else it is the next sector to work on.
+	 */
+	sector_t reshape_progress;
+	/* reshape_safe is the trailing edge of a reshape.  We know that
+	 * before (or after) this address, all reshape has completed.
+	 */
+	sector_t reshape_safe;
	int previous_raid_disks;
+	int prev_chunk, prev_algo;
+	short generation; /* increments with every reshape */
+	unsigned long reshape_checkpoint; /* Time we last updated
+					   * metadata */

	struct list_head handle_list; /* stripes needing handling */
	struct list_head hold_list; /* preread ready stripes */
@@ -385,6 +399,11 @@ struct raid5_private_data {
	int pool_size; /* number of disks in stripeheads in pool */
	spinlock_t device_lock;
	struct disk_info *disks;
+
+	/* When taking over an array from a different personality, we store
+	 * the new thread here until we fully activate the array.
+	 */
+	struct mdk_thread_s *thread;
};

typedef struct raid5_private_data raid5_conf_t;
@@ -394,9 +413,62 @@ typedef struct raid5_private_data raid5_conf_t;
/*
 * Our supported algorithms
 */
-#define ALGORITHM_LEFT_ASYMMETRIC	0
-#define ALGORITHM_RIGHT_ASYMMETRIC	1
-#define ALGORITHM_LEFT_SYMMETRIC	2
-#define ALGORITHM_RIGHT_SYMMETRIC	3
+#define ALGORITHM_LEFT_ASYMMETRIC	0 /* Rotating Parity N with Data Restart */
+#define ALGORITHM_RIGHT_ASYMMETRIC	1 /* Rotating Parity 0 with Data Restart */
+#define ALGORITHM_LEFT_SYMMETRIC	2 /* Rotating Parity N with Data Continuation */
+#define ALGORITHM_RIGHT_SYMMETRIC	3 /* Rotating Parity 0 with Data Continuation */
/* Define non-rotating (raid4) algorithms. These allow
* conversion of raid4 to raid5.
*/
#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
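Before the DDF variants below, a placement sketch of one classic layout; this is standard RAID-5 background, not text from the patch:

/* ALGORITHM_LEFT_SYMMETRIC on a 4-disk raid5 (Dn = data chunk n,
 * P = parity; parity rotates and data continues on the device after P,
 * wrapping around):
 *
 *     stripe 0:   D0   D1   D2   P
 *     stripe 1:   D4   D5   P    D3
 *     stripe 2:   D8   P    D6   D7
 *     stripe 3:   P    D9   D10  D11
 */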
/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
* Firstly, the exact positioning of the parity block is slightly
* different between the 'LEFT_*' modes of md and the "_N_*" modes
* of DDF.
 * Secondly, the order of data blocks over which the Q syndrome is computed
* is different.
* Consequently we have different layouts for DDF/raid6 than md/raid6.
* These layouts are from the DDFv1.2 spec.
* Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
* leaves RLQ=3 as 'Vendor Specific'
*/
#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
#define ALGORITHM_ROTATING_N_CONTINUE 10 /* DDF PRL=6 RLQ=3 */
/* For every RAID5 algorithm we define a RAID6 algorithm
* with exactly the same layout for data and parity, and
* with the Q block always on the last device (N-1).
* This allows trivial conversion from RAID5 to RAID6
*/
#define ALGORITHM_LEFT_ASYMMETRIC_6 16
#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
#define ALGORITHM_LEFT_SYMMETRIC_6 18
#define ALGORITHM_RIGHT_SYMMETRIC_6 19
#define ALGORITHM_PARITY_0_6 20
#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
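The *_6 layouts pin Q to the last device so an existing raid5 can become a raid6 without relocating any block; an illustrative placement (not from the patch):

/* ALGORITHM_LEFT_SYMMETRIC_6 on 5 devices: the first four columns are
 * exactly the 4-disk left-symmetric raid5 layout shown earlier, with Q
 * occupying the fixed last device:
 *
 *     stripe 0:   D0   D1   D2   P    Q
 *     stripe 1:   D4   D5   P    D3   Q
 *     stripe 2:   D8   P    D6   D7   Q
 *     stripe 3:   P    D9   D10  D11  Q
 */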
static inline int algorithm_valid_raid5(int layout)
{
return (layout >= 0) &&
(layout <= 5);
}
static inline int algorithm_valid_raid6(int layout)
{
return (layout >= 0 && layout <= 5)
||
(layout == 8 || layout == 10)
||
(layout >= 16 && layout <= 20);
}
static inline int algorithm_is_DDF(int layout)
{
return layout >= 8 && layout <= 10;
}
#endif
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -16,13 +16,20 @@
 * Algorithm list and algorithm selection for RAID-6
 */

-#include "raid6.h"
+#include <linux/raid/pq.h>
#ifndef __KERNEL__
#include <sys/mman.h>
#include <stdio.h>
+#else
+#if !RAID6_USE_EMPTY_ZERO_PAGE
+/* In .bss so it's zeroed */
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+EXPORT_SYMBOL(raid6_empty_zero_page);
+#endif
#endif

struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);

/* Various routine sets */
extern const struct raid6_calls raid6_intx1;
@@ -79,6 +86,7 @@ const struct raid6_calls * const raid6_algos[] = {
#else
/* Need more time to be stable in userspace */
#define RAID6_TIME_JIFFIES_LG2 9
+#define time_before(x, y) ((x) < (y))
#endif

/* Try to pick the best algorithm */
@@ -152,3 +160,12 @@ int __init raid6_select_algo(void)
	return best ? 0 : -EINVAL;
}
static void raid6_exit(void)
{
do { } while (0);
}
subsys_initcall(raid6_select_algo);
module_exit(raid6_exit);
MODULE_LICENSE("GPL");
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -22,7 +22,7 @@
 * bracket this with preempt_disable/enable or in a lock)
 */

-#include "raid6.h"
+#include <linux/raid/pq.h>

#ifdef CONFIG_ALTIVEC
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
 * This file is postprocessed using unroll.pl
 */

-#include "raid6.h"
+#include <linux/raid/pq.h>

/*
 * This is the C data type to use
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
#if defined(__i386__) && !defined(__arch_um__)

-#include "raid6.h"
+#include <linux/raid/pq.h>
#include "raid6x86.h"

/* Shared with raid6sse1.c */
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -18,7 +18,7 @@
 * the syndrome.)
 */

-#include "raid6.h"
+#include <linux/raid/pq.h>

/* Recover two failed data blocks. */
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
@@ -63,9 +63,7 @@ void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
		p++; q++;
	}
}
+EXPORT_SYMBOL_GPL(raid6_2data_recov);

/* Recover failure of one data block plus the P block */
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
@@ -97,9 +95,10 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
		q++; dq++;
	}
}
+EXPORT_SYMBOL_GPL(raid6_datap_recov);

-#ifndef __KERNEL__ /* Testing only */
+#ifndef __KERNEL__
+/* Testing only */
/* Recover two failed blocks. */
void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -23,7 +23,7 @@
#if defined(__i386__) && !defined(__arch_um__)

-#include "raid6.h"
+#include <linux/raid/pq.h>
#include "raid6x86.h"

/* Defined in raid6mmx.c */
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -19,7 +19,7 @@
#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__)

-#include "raid6.h"
+#include <linux/raid/pq.h>
#include "raid6x86.h"

static const struct raid6_sse_constants {
......
@@ -5,7 +5,7 @@
CC	 = gcc
OPTFLAGS = -O2	# Adjust as desired
-CFLAGS	 = -I.. -g $(OPTFLAGS)
+CFLAGS	 = -I.. -I ../../../include -g $(OPTFLAGS)
LD	 = ld
PERL	 = perl
AR	 = ar
......
@@ -17,7 +17,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
-#include "raid6.h"
+#include <linux/raid/pq.h>

#define NDISKS		16 /* Including P and Q */
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
......
@@ -23,7 +23,7 @@
#include <linux/if.h>
#include <linux/if_bridge.h>
#include <linux/slab.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
#include <linux/kd.h>
#include <linux/route.h>
#include <linux/in6.h>
......
/*
md.h : Multiple Devices driver for Linux
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
Copyright (C) 1994-96 Marc ZYNGIER
<zyngier@ufr-info-p7.ibp.fr> or
<maz@gloups.fdn.fr>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
You should have received a copy of the GNU General Public License
(for example /usr/src/linux/COPYING); if not, write to the Free
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _MD_H
#define _MD_H
#include <linux/blkdev.h>
#include <linux/seq_file.h>
/*
* 'md_p.h' holds the 'physical' layout of RAID devices
* 'md_u.h' holds the user <=> kernel API
*
* 'md_k.h' holds kernel internal definitions
*/
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/md_k.h>
#ifdef CONFIG_MD
/*
* Different major versions are not compatible.
* Different minor versions are only downward compatible.
* Different patchlevel versions are downward and upward compatible.
*/
#define MD_MAJOR_VERSION 0
#define MD_MINOR_VERSION 90
/*
* MD_PATCHLEVEL_VERSION indicates kernel functionality.
* >=1 means different superblock formats are selectable using SET_ARRAY_INFO
* and major_version/minor_version accordingly
* >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
* in the super status byte
* >=3 means that bitmap superblock version 4 is supported, which uses
 * little-endian representation rather than host-endian
*/
#define MD_PATCHLEVEL_VERSION 3
extern int mdp_major;
extern int register_md_personality(struct mdk_personality *p);
extern int unregister_md_personality(struct mdk_personality *p);
extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
mddev_t *mddev, const char *name);
extern void md_unregister_thread(mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_check_recovery(mddev_t *mddev);
extern void md_write_start(mddev_t *mddev, struct bio *bi);
extern void md_write_end(mddev_t *mddev);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* CONFIG_MD */
#endif
@@ -15,6 +15,24 @@
#ifndef _MD_U_H
#define _MD_U_H
/*
* Different major versions are not compatible.
* Different minor versions are only downward compatible.
* Different patchlevel versions are downward and upward compatible.
*/
#define MD_MAJOR_VERSION 0
#define MD_MINOR_VERSION 90
/*
* MD_PATCHLEVEL_VERSION indicates kernel functionality.
* >=1 means different superblock formats are selectable using SET_ARRAY_INFO
* and major_version/minor_version accordingly
* >=2 means that Internal bitmaps are supported by setting MD_SB_BITMAP_PRESENT
* in the super status byte
* >=3 means that bitmap superblock version 4 is supported, which uses
 * little-endian representation rather than host-endian
*/
#define MD_PATCHLEVEL_VERSION 3
/* ioctls */

/* status */
@@ -46,6 +64,12 @@
#define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)
#define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34)
/* 63 partitions with the alternate major number (mdp) */
#define MdpMinorShift 6
#ifdef __KERNEL__
extern int mdp_major;
#endif
typedef struct mdu_version_s {
	int major;
	int minor;
@@ -85,6 +109,17 @@ typedef struct mdu_array_info_s {
} mdu_array_info_t;
/* non-obvious values for 'level' */
#define LEVEL_MULTIPATH (-4)
#define LEVEL_LINEAR (-1)
#define LEVEL_FAULTY (-5)
/* we need a value for 'no level specified' and 0
* means 'raid0', so we need something else. This is
* for internal use only
*/
#define LEVEL_NONE (-1000000)
typedef struct mdu_disk_info_s {
	/*
	 * configuration/status of one particular disk
......
@@ -5,7 +5,7 @@
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
- * Bostom MA 02111-1307, USA; either version 2 of the License, or
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
 * (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */
@@ -17,14 +17,7 @@
/* Set to 1 to use kernel-wide empty_zero_page */
#define RAID6_USE_EMPTY_ZERO_PAGE 0
-#include <linux/blkdev.h>
-#include <linux/raid/md.h>
-#include <linux/raid/raid5.h>
-
-typedef raid5_conf_t raid6_conf_t; /* Same configuration */
-
-/* Additional compute_parity mode -- updates the parity w/o LOCKING */
-#define UPDATE_PARITY	4

/* We need a pre-zeroed page... if we don't want to use the kernel-provided
   one define it here */
@@ -68,6 +61,10 @@ extern const char raid6_empty_zero_page[PAGE_SIZE];
#define enable_kernel_altivec()
#define disable_kernel_altivec()
#define EXPORT_SYMBOL(sym)
#define MODULE_LICENSE(licence)
#define subsys_initcall(x)
#define module_exit(x)
#endif /* __KERNEL__ */

/* Routine choices */
@@ -98,9 +95,11 @@ extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));

/* Recovery routines */
-void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
+void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+		       void **ptrs);
void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
-void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
+void raid6_dual_recov(int disks, size_t bytes, int faila, int failb,
+		      void **ptrs);

/* Some definitions to allow code to be compiled for testing in userspace */
#ifndef __KERNEL__
@@ -108,8 +107,11 @@ void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs
# define jiffies	raid6_jiffies()
# define printk		printf
# define GFP_KERNEL	0
-# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
-# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE)
+# define __get_free_pages(x, y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), \
+						     PROT_READ|PROT_WRITE,   \
+						     MAP_PRIVATE|MAP_ANONYMOUS,\
+						     0, 0))
+# define free_pages(x, y) munmap((void *)(x), (y)*PAGE_SIZE)

static inline void cpu_relax(void)
{
......
#ifndef _XOR_H
#define _XOR_H

-#include <linux/raid/md.h>

#define MAX_XOR_BLOCKS 4

extern void xor_blocks(unsigned int count, unsigned int bytes,
......
#include <linux/kernel.h>
+#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
......
#include <linux/delay.h>
-#include <linux/raid/md.h>
+#include <linux/raid/md_u.h>
+#include <linux/raid/md_p.h>

#include "do_mounts.h"
@@ -112,8 +113,6 @@ static int __init md_setup(char *str)
	return 1;
}

-#define MdpMinorShift 6
-
static void __init md_setup_drive(void)
{
	int minor, i, ent, partitioned;
......