md.c 81.1 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
/*
   md.c : Multiple Devices driver for Linux
	  Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   You should have received a copy of the GNU General Public License
   (for example /usr/src/linux/COPYING); if not, write to the Free
   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <linux/module.h>
#include <linux/config.h>
Linus Torvalds's avatar
Linus Torvalds committed
33
#include <linux/linkage.h>
Linus Torvalds's avatar
Linus Torvalds committed
34 35
#include <linux/raid/md.h>
#include <linux/sysctl.h>
36
#include <linux/bio.h>
Linus Torvalds's avatar
Linus Torvalds committed
37 38
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>
39
#include <linux/buffer_head.h> /* for invalidate_bdev */
Linus Torvalds's avatar
Linus Torvalds committed
40 41 42 43 44 45 46 47 48 49 50 51 52 53

#include <linux/init.h>

#ifdef CONFIG_KMOD
#include <linux/kmod.h>
#endif

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>

#include <asm/unaligned.h>

#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
Alexander Viro's avatar
Alexander Viro committed
54
#define DEVICE_NR(device) (minor(device))
Linus Torvalds's avatar
Linus Torvalds committed
55 56 57 58 59 60 61 62 63 64

#include <linux/blk.h>

#define DEBUG 0
#if DEBUG
# define dprintk(x...) printk(x)
#else
# define dprintk(x...) do { } while(0)
#endif

Linus Torvalds's avatar
Linus Torvalds committed
65 66 67 68
#ifndef MODULE
static void autostart_arrays (void);
#endif

Linus Torvalds's avatar
Linus Torvalds committed
69 70 71 72
static mdk_personality_t *pers[MAX_PERSONALITY];

/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
Linus Torvalds's avatar
Linus Torvalds committed
73
 * is 1000 KB/sec, so the extra system load does not show up that much.
Linus Torvalds's avatar
Linus Torvalds committed
74 75 76 77 78 79 80 81 82
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
 */

Linus Torvalds's avatar
Linus Torvalds committed
83 84
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
Linus Torvalds's avatar
Linus Torvalds committed
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110

static struct ctl_table_header *raid_table_header;

/*
 * Leaf sysctl entries: the two resync speed limits.  Each one exposes an
 * int variable with mode 0644 and is handled by proc_dointvec.
 * The table is terminated by a zeroed entry.
 */
static ctl_table raid_table[] = {
	{DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
	 &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
	{DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
	 &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
	{0}
};

/* "raid" entry (mode 0555, no data of its own) that points at raid_table. */
static ctl_table raid_dir_table[] = {
	{DEV_RAID, "raid", NULL, 0, 0555, raid_table},
	{0}
};

/* "dev" entry (mode 0555, no data of its own) that points at raid_dir_table. */
static ctl_table raid_root_table[] = {
	{CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
	{0}
};

/*
 * these have to be allocated separately because external
 * subsystems want to have a pre-defined structure
 */
struct hd_struct md_hd_struct[MAX_MD_DEVS];
111
static void md_recover_arrays(void);
Linus Torvalds's avatar
Linus Torvalds committed
112 113 114 115
static mdk_thread_t *md_recovery_thread;

int md_size[MAX_MD_DEVS];

Linus Torvalds's avatar
Linus Torvalds committed
116
static struct block_device_operations md_fops;
Linus Torvalds's avatar
Linus Torvalds committed
117 118
static devfs_handle_t devfs_handle;

119
static struct gendisk *disks[MAX_MD_DEVS];
Linus Torvalds's avatar
Linus Torvalds committed
120 121 122

/*
 * Allows iteration over all existing md arrays.
123
 * all_mddevs_lock protects this list as well as mddev_map.
Linus Torvalds's avatar
Linus Torvalds committed
124
 */
Linus Torvalds's avatar
Linus Torvalds committed
125
static LIST_HEAD(all_mddevs);
126 127
static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;

Linus Torvalds's avatar
Linus Torvalds committed
128 129

/*
 * Iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop will own a reference
 * to the current mddev and must mddev_put it.
 *
 * Shape of the loop:
 *   init: take the lock, start at the first list entry, no current mddev;
 *   test: pin the entry 'tmp' points at (if it is not the list head),
 *         drop the lock, release the previous mddev, then publish the
 *         new entry as 'mddev';
 *   step: retake the lock and advance 'tmp'.
 */
#define ITERATE_MDDEV(mddev,tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		(tmp) = all_mddevs.next;				\
		(mddev) = NULL; });					\
	     ({ if ((tmp) != &all_mddevs)				\
			mddev_get(list_entry((tmp), mddev_t, all_mddevs)); \
		spin_unlock(&all_mddevs_lock);				\
		if (mddev)						\
			mddev_put(mddev);				\
		(mddev) = list_entry((tmp), mddev_t, all_mddevs);	\
		(tmp) != &all_mddevs; });				\
	     ({ spin_lock(&all_mddevs_lock);				\
		(tmp) = (tmp)->next; })					\
		)
Linus Torvalds's avatar
Linus Torvalds committed
150

151
static mddev_t *mddev_map[MAX_MD_DEVS];
Linus Torvalds's avatar
Linus Torvalds committed
152

153 154 155 156 157
/*
 * Request handler that rejects every bio handed to it: the bio is passed
 * straight to bio_io_error() and 0 is returned.  'q' is not used.
 */
static int md_fail_request (request_queue_t *q, struct bio *bio)
{
	bio_io_error(bio);
	return 0;
}
Linus Torvalds's avatar
Linus Torvalds committed
158

159
/*
 * Take one more reference on 'mddev' by incrementing its 'active'
 * counter.  The same pointer is handed back so the call can be nested
 * inside an expression.
 */
static inline mddev_t *mddev_get(mddev_t *mddev)
{
	atomic_inc(&mddev->active);

	return mddev;
}

165
/*
 * Drop one reference on 'mddev'.
 *
 * Nothing else happens unless this was the last reference, in which case
 * atomic_dec_and_lock() returns with all_mddevs_lock held.  An array that
 * then has no raid_disks and an empty disk list is removed from the
 * global list and from mddev_map[], freed, and the module use count is
 * dropped.  The lock is released before returning.
 */
static void mddev_put(mddev_t *mddev)
{
	int unused;

	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;

	unused = !mddev->raid_disks && list_empty(&mddev->disks);
	if (unused) {
		mddev_map[mdidx(mddev)] = NULL;
		list_del(&mddev->all_mddevs);
		kfree(mddev);
		MOD_DEC_USE_COUNT;
	}

	spin_unlock(&all_mddevs_lock);
}

178
static mddev_t * mddev_find(int unit)
Linus Torvalds's avatar
Linus Torvalds committed
179
{
180
	mddev_t *mddev, *new = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
181

182 183 184 185 186 187 188 189
 retry:
	spin_lock(&all_mddevs_lock);
	if (mddev_map[unit]) {
		mddev =  mddev_get(mddev_map[unit]);
		spin_unlock(&all_mddevs_lock);
		if (new)
			kfree(new);
		return mddev;
Linus Torvalds's avatar
Linus Torvalds committed
190
	}
191 192 193 194 195 196
	if (new) {
		mddev_map[unit] = new;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		MOD_INC_USE_COUNT;
		return new;
Linus Torvalds's avatar
Linus Torvalds committed
197
	}
198
	spin_unlock(&all_mddevs_lock);
Linus Torvalds's avatar
Linus Torvalds committed
199

200 201
	new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
Linus Torvalds's avatar
Linus Torvalds committed
202
		return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
203

204
	memset(new, 0, sizeof(*new));
Linus Torvalds's avatar
Linus Torvalds committed
205

206 207 208 209 210
	new->__minor = unit;
	init_MUTEX(&new->reconfig_sem);
	INIT_LIST_HEAD(&new->disks);
	INIT_LIST_HEAD(&new->all_mddevs);
	atomic_set(&new->active, 1);
Linus Torvalds's avatar
Linus Torvalds committed
211

212
	goto retry;
Linus Torvalds's avatar
Linus Torvalds committed
213 214
}

215 216 217 218
/*
 * Acquire the array's reconfig_sem with down_interruptible() and return
 * that call's result unchanged.
 */
static inline int mddev_lock(mddev_t * mddev)
{
	return down_interruptible(&mddev->reconfig_sem);
}
Linus Torvalds's avatar
Linus Torvalds committed
219

220 221 222 223 224 225 226 227
/*
 * Attempt to acquire the array's reconfig_sem with down_trylock() and
 * return that call's result unchanged.
 */
static inline int mddev_trylock(mddev_t * mddev)
{
	return down_trylock(&mddev->reconfig_sem);
}

/* Release the array's reconfig_sem. */
static inline void mddev_unlock(mddev_t * mddev)
{
	up(&mddev->reconfig_sem);
}

/*
 * Return the member device of 'mddev' whose desc_nr equals 'nr',
 * or NULL when no member has that descriptor number.
 */
mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
{
	struct list_head *pos;
	mdk_rdev_t *candidate;

	ITERATE_RDEV(mddev,candidate,pos) {
		if (candidate->desc_nr != nr)
			continue;
		return candidate;
	}

	return NULL;
}

242
/*
 * Return the member device of 'mddev' whose block device number
 * (bdev->bd_dev) equals 'dev', or NULL when there is none.
 */
static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
{
	struct list_head *pos;
	mdk_rdev_t *candidate;

	ITERATE_RDEV(mddev,candidate,pos) {
		if (candidate->bdev->bd_dev != dev)
			continue;
		return candidate;
	}

	return NULL;
}

Linus Torvalds's avatar
Linus Torvalds committed
254
static LIST_HEAD(device_names);
Linus Torvalds's avatar
Linus Torvalds committed
255

Linus Torvalds's avatar
Linus Torvalds committed
256
/*
 * Return a printable name for 'dev'.
 *
 * Names are cached on the device_names list: a hit returns the cached
 * string.  On a miss a new entry is allocated, named via disk_name()
 * when a gendisk is found (falling back to "[dev <kdevname>]"), added
 * to the list and returned.  If the entry cannot be allocated the
 * static string "<nomem>" is returned instead.
 */
char * partition_name(kdev_t dev)
{
	static char nomem [] = "<nomem>";
	struct list_head *pos;
	struct gendisk *disk;
	dev_name_t *entry;

	/* cache lookup */
	list_for_each(pos, &device_names) {
		entry = list_entry(pos, dev_name_t, list);
		if (kdev_same(entry->dev, dev))
			return entry->name;
	}

	entry = (dev_name_t *) kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return nomem;

	/*
	 * ok, build the name and add this new device to the list
	 */
	disk = get_gendisk (dev);
	entry->name = disk ? disk_name (disk, minor(dev), entry->namebuf) : NULL;
	if (!entry->name) {
		sprintf (entry->namebuf, "[dev %s]", kdevname(dev));
		entry->name = entry->namebuf;
	}
	entry->dev = dev;

	list_add(&entry->list, &device_names);

	return entry->name;
}

291
static unsigned int calc_dev_sboffset(struct block_device *bdev)
Linus Torvalds's avatar
Linus Torvalds committed
292
{
293 294
	unsigned int size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
	return MD_NEW_SIZE_BLOCKS(size);
Linus Torvalds's avatar
Linus Torvalds committed
295 296
}

297
/*
 * Compute the usable size of 'bdev' as a member of 'mddev'.
 *
 * For arrays with a persistent superblock the size is the superblock
 * offset; otherwise it is the raw inode size shifted by BLOCK_SIZE_BITS.
 * When the array has a chunk size, the result is masked down with
 * ~(chunk_size/1024 - 1).
 */
static unsigned int calc_dev_size(struct block_device *bdev, mddev_t *mddev)
{
	unsigned int blocks;

	blocks = mddev->persistent
		? calc_dev_sboffset(bdev)
		: (bdev->bd_inode->i_size >> BLOCK_SIZE_BITS);

	if (!mddev->chunk_size)
		return blocks;

	return blocks & ~(mddev->chunk_size/1024 - 1);
}

Linus Torvalds's avatar
Linus Torvalds committed
310
/*
 * Mask every member's size with ~(chunk_size/1024 - 1) and add the
 * masked sizes onto md_size[] for this array.  Always returns 0.
 */
static unsigned int zoned_raid_size(mddev_t *mddev)
{
	struct list_head *pos;
	mdk_rdev_t *member;
	unsigned int chunk_mask;

	chunk_mask = ~(mddev->chunk_size/1024 - 1);

	ITERATE_RDEV(mddev,member,pos) {
		member->size &= chunk_mask;
		md_size[mdidx(mddev)] += member->size;
	}

	return 0;
}


/*
 * printk format strings (log level included) shared by the superblock
 * handling code below.  The %s arguments are device names; BAD_MINOR
 * additionally takes the offending minor number (%x).
 */
#define BAD_MAGIC KERN_ERR \
"md: invalid raid superblock magic on %s\n"

#define BAD_MINOR KERN_ERR \
"md: %s: invalid raid minor (%x)\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_SB KERN_ERR \
"md: disabled device %s, could not read superblock.\n"

#define BAD_CSUM KERN_WARNING \
"md: invalid superblock checksum on %s\n"

Linus Torvalds's avatar
Linus Torvalds committed
344
/*
 * Allocate and zero the page that holds the in-memory copy of the
 * superblock for 'rdev', and point rdev->sb at it.
 *
 * Returns 0 on success or -ENOMEM when no page could be allocated.
 * Finding a page already attached is reported through MD_BUG().
 */
static int alloc_disk_sb(mdk_rdev_t * rdev)
{
	if (rdev->sb_page)
		MD_BUG();

	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page) {
		printk(OUT_OF_MEM);
		/* allocation failure, not a bad argument */
		return -ENOMEM;
	}
	rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
	clear_page(rdev->sb);

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
360
/*
 * Release the superblock page of 'rdev', if it has one, and reset the
 * fields that were set up for it (sb, sb_page, sb_offset, size).
 * Does nothing when no page is attached.
 */
static void free_disk_sb(mdk_rdev_t * rdev)
{
	if (!rdev->sb_page)
		return;

	page_cache_release(rdev->sb_page);
	rdev->sb_page = NULL;
	rdev->sb = NULL;
	rdev->sb_offset = 0;
	rdev->size = 0;
}

371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397

/*
 * bio end_io callback used by sync_page_io(): bi_private carries a
 * struct completion, which is signalled here.
 */
static void bi_complete(struct bio *bio)
{
	complete((struct completion*)bio->bi_private);
}

/*
 * Synchronously transfer 'size' bytes between 'page' (offset 0) and
 * 'bdev' starting at 'sector'; 'rw' is handed to submit_bio().
 *
 * A single-segment bio is built on the stack, submitted, and waited
 * for through a completion signalled by bi_complete().
 *
 * Returns non-zero if the bio ended with BIO_UPTODATE set, 0 otherwise.
 */
static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
		   struct page *page, int rw)
{
	struct completion done;
	struct bio_vec segment;
	struct bio bio;

	/* the one and only segment */
	segment.bv_page = page;
	segment.bv_offset = 0;
	segment.bv_len = size;

	bio_init(&bio);
	bio.bi_bdev = bdev;
	bio.bi_sector = sector;
	bio.bi_size = size;
	bio.bi_io_vec = &segment;
	bio.bi_vcnt = 1;
	bio.bi_idx = 0;

	init_completion(&done);
	bio.bi_private = &done;
	bio.bi_end_io = bi_complete;

	submit_bio(rw, &bio);
	blk_run_queues();
	wait_for_completion(&done);

	return test_bit(BIO_UPTODATE, &bio.bi_flags);
}

Linus Torvalds's avatar
Linus Torvalds committed
404
/*
 * Read the on-disk superblock of 'rdev' into its superblock page.
 *
 * The superblock lives at the end of the device; its offset is computed
 * by calc_dev_sboffset() and remembered in rdev->sb_offset.
 *
 * Returns 0 on success, -EINVAL if no superblock buffer is attached or
 * the read fails.
 */
static int read_disk_sb(mdk_rdev_t * rdev)
{
	unsigned long offset;

	if (!rdev->sb) {
		MD_BUG();
		return -EINVAL;
	}

	/*
	 * Calculate the position of the superblock,
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	offset = calc_dev_sboffset(rdev->bdev);
	rdev->sb_offset = offset;

	/* offset<<1: presumably 1K blocks converted to 512-byte sectors */
	if (sync_page_io(rdev->bdev, offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
		printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
		return 0;
	}

	printk(NO_SB,bdev_partition_name(rdev->bdev));
	return -EINVAL;
}

Linus Torvalds's avatar
Linus Torvalds committed
433
/*
 * Compute the checksum of superblock 'sb' over MD_SB_BYTES with the
 * sb_csum field treated as zero.  The stored sb_csum is saved and
 * restored, so 'sb' is unchanged on return.
 */
static unsigned int calc_sb_csum(mdp_super_t * sb)
{
	unsigned int saved = sb->sb_csum;
	unsigned int sum;

	sb->sb_csum = 0;
	sum = csum_partial((void *)sb, MD_SB_BYTES, 0);
	sb->sb_csum = saved;

	return sum;
}

/*
 * Check one RAID superblock for generic plausibility
 */

Linus Torvalds's avatar
Linus Torvalds committed
448
/*
 * Sanity-check the superblock attached to 'rdev': magic number, minor
 * number range and checksum, in that order.  The first failing check
 * is logged.
 *
 * Returns 0 if all checks pass, -EINVAL otherwise (including when no
 * superblock is attached).
 */
static int check_disk_sb(mdk_rdev_t * rdev)
{
	mdp_super_t *sb = rdev->sb;

	if (!sb) {
		MD_BUG();
		return -EINVAL;
	}

	if (sb->md_magic != MD_SB_MAGIC) {
		printk(BAD_MAGIC, bdev_partition_name(rdev->bdev));
		return -EINVAL;
	}

	if (sb->md_minor >= MAX_MD_DEVS) {
		printk(BAD_MINOR, bdev_partition_name(rdev->bdev), sb->md_minor);
		return -EINVAL;
	}

	if (calc_sb_csum(sb) != sb->sb_csum) {
		printk(BAD_CSUM, bdev_partition_name(rdev->bdev));
		return -EINVAL;
	}

	return 0;
}

Alexander Viro's avatar
Alexander Viro committed
478
/*
 * Return a member of 'mddev' whose block device has the same
 * bd_contains as that of 'dev', or NULL if no member does.
 */
static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
{
	struct list_head *pos;
	mdk_rdev_t *member;

	ITERATE_RDEV(mddev,member,pos) {
		if (member->bdev->bd_contains != dev->bdev->bd_contains)
			continue;
		return member;
	}

	return NULL;
}

/*
 * Return 1 if any member of 'mddev1' has a match in 'mddev2' according
 * to match_dev_unit(), 0 otherwise.
 */
static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
{
	struct list_head *pos;
	mdk_rdev_t *member;

	ITERATE_RDEV(mddev1,member,pos) {
		if (match_dev_unit(mddev2, member))
			return 1;
	}

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
502
static LIST_HEAD(pending_raid_disks);
Linus Torvalds's avatar
Linus Torvalds committed
503

Linus Torvalds's avatar
Linus Torvalds committed
504
/*
 * Attach 'rdev' to 'mddev': put it on the array's disk list and set its
 * back pointer.  An rdev that already belongs to an array is rejected
 * via MD_BUG().  A warning is logged when match_dev_unit() finds an
 * existing member that matches the new device.
 */
static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
	mdk_rdev_t *sibling;

	if (rdev->mddev) {
		MD_BUG();
		return;
	}

	sibling = match_dev_unit(mddev, rdev);
	if (sibling) {
		printk( KERN_WARNING
"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
"     protection against single-disk failure might be compromised.\n",
			mdidx(mddev), bdev_partition_name(rdev->bdev),
			bdev_partition_name(sibling->bdev));
	}

	rdev->mddev = mddev;
	list_add(&rdev->same_set, &mddev->disks);
	printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
}

Linus Torvalds's avatar
Linus Torvalds committed
525
/*
 * Detach 'rdev' from its array: take it off the array's disk list and
 * clear its back pointer.  An rdev that is not bound to any array is
 * reported via MD_BUG() and left untouched.
 */
static void unbind_rdev_from_array(mdk_rdev_t * rdev)
{
	if (rdev->mddev == NULL) {
		MD_BUG();
		return;
	}

	list_del_init(&rdev->same_set);
	printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
	rdev->mddev = NULL;
}

/*
 * prevent the device from being mounted, repartitioned or
 * otherwise reused by a RAID array (or any other kernel
 * subsystem), by opening the device. [simply getting an
 * inode is not enough, the SCSI module usage code needs
 * an explicit open() on the device]
 */
543
/*
 * Open block device 'dev' read/write, claim it with 'rdev' as the
 * holder argument to bd_claim(), and store the result in rdev->bdev.
 *
 * Returns 0 on success, -ENOMEM if bdget() fails, or the error from
 * blkdev_get()/bd_claim().  If the claim fails the device is put again.
 */
static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
{
	struct block_device *bdev = bdget(dev);
	int err;

	if (!bdev)
		return -ENOMEM;

	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
	if (err)
		return err;

	err = bd_claim(bdev, rdev);
	if (err) {
		blkdev_put(bdev, BDEV_RAW);
		return err;
	}

	rdev->bdev = bdev;

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
563
/*
 * Undo lock_rdev(): clear rdev->bdev, release the claim on the block
 * device and put it.
 *
 * An rdev without a block device is reported via MD_BUG() and nothing
 * further is done; MD_BUG() does not stop execution, so without the
 * early return the NULL pointer would be handed to bd_release() and
 * blkdev_put().
 */
static void unlock_rdev(mdk_rdev_t *rdev)
{
	struct block_device *bdev = rdev->bdev;

	rdev->bdev = NULL;
	if (!bdev) {
		MD_BUG();
		return;
	}
	bd_release(bdev);
	blkdev_put(bdev, BDEV_RAW);
}

573
void md_autodetect_dev(dev_t dev);
Linus Torvalds's avatar
Linus Torvalds committed
574

Linus Torvalds's avatar
Linus Torvalds committed
575
/*
 * Dispose of an rdev that is no longer part of any array: free its
 * superblock page, unlink it from whatever list it is on, release the
 * underlying block device and free the rdev itself.  In non-modular
 * builds the device number is first passed to md_autodetect_dev().
 *
 * An rdev that is still bound to an array is reported via MD_BUG().
 */
static void export_rdev(mdk_rdev_t * rdev)
{
	printk(KERN_INFO "md: export_rdev(%s)\n",bdev_partition_name(rdev->bdev));

	if (rdev->mddev)
		MD_BUG();

	list_del_init(&rdev->same_set);
	free_disk_sb(rdev);

#ifndef MODULE
	/* must happen while rdev->bdev is still set */
	md_autodetect_dev(rdev->bdev->bd_dev);
#endif

	unlock_rdev(rdev);
	kfree(rdev);
}

Linus Torvalds's avatar
Linus Torvalds committed
589
/*
 * Remove 'rdev' from its array and then dispose of it:
 * unbind_rdev_from_array() followed by export_rdev().
 */
static void kick_rdev_from_array(mdk_rdev_t * rdev)
{
	unbind_rdev_from_array(rdev);

	export_rdev(rdev);
}

Linus Torvalds's avatar
Linus Torvalds committed
595
/*
 * Kick every member device out of 'mddev' and reset raid_disks to 0.
 * Members without a back pointer to an array, and a disk list that is
 * still non-empty afterwards, are reported via MD_BUG().
 */
static void export_array(mddev_t *mddev)
{
	struct list_head *pos;
	mdk_rdev_t *member;

	ITERATE_RDEV(mddev,member,pos) {
		if (member->mddev)
			kick_rdev_from_array(member);
		else
			MD_BUG();
	}

	if (!list_empty(&mddev->disks))
		MD_BUG();

	mddev->raid_disks = 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
612
/*
 * Tear down the contents of 'mddev': export all member devices and zero
 * the array's entries in md_size[] and md_hd_struct[].  The mddev
 * structure itself is not freed here.  A NULL 'mddev' is reported via
 * MD_BUG().
 */
static void free_mddev(mddev_t *mddev)
{
	int idx;

	if (mddev == NULL) {
		MD_BUG();
		return;
	}

	export_array(mddev);

	idx = mdidx(mddev);
	md_size[idx] = 0;
	md_hd_struct[idx].nr_sects = 0;
}

#undef BAD_CSUM
#undef BAD_MAGIC
#undef OUT_OF_MEM
#undef NO_SB

/*
 * Log one superblock disk descriptor: number, device name with
 * major/minor, raid_disk slot and state.
 */
static void print_desc(mdp_disk_t *desc)
{
	char *devname = partition_name(mk_kdev(desc->major,desc->minor));

	printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
		devname,
		desc->major,desc->minor,desc->raid_disk,desc->state);
}

/*
 * Dump superblock 'sb' to the kernel log: version/UUID/ctime, array
 * geometry, state counters, every disk descriptor that has any field
 * set, and finally the descriptor of this disk.
 */
static void print_sb(mdp_super_t *sb)
{
	int slot;

	printk(KERN_INFO "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
		sb->major_version, sb->minor_version, sb->patch_version,
		sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
		sb->ctime);
	printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
		sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
		sb->layout, sb->chunk_size);
	printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
		sb->utime, sb->state, sb->active_disks, sb->working_disks,
		sb->failed_disks, sb->spare_disks,
		sb->sb_csum, (unsigned long)sb->events_lo);

	printk(KERN_INFO);
	for (slot = 0; slot < MD_SB_DISKS; slot++) {
		mdp_disk_t *desc = &sb->disks[slot];

		/* skip descriptors with nothing set (state 0 or 4 counts as unset) */
		if (!desc->number && !desc->major && !desc->minor &&
		    !desc->raid_disk && (!desc->state || desc->state == 4))
			continue;

		printk("     D %2d: ", slot);
		print_desc(desc);
	}

	printk(KERN_INFO "md:     THIS: ");
	print_desc(&sb->this_disk);
}

/*
 * Log the state of 'rdev' (name, size, faulty/in_sync flags, descriptor
 * number) followed by its superblock, or a note that it has none.
 */
static void print_rdev(mdk_rdev_t *rdev)
{
	printk(KERN_INFO "md: rdev %s, SZ:%08ld F:%d S:%d DN:%d ",
		bdev_partition_name(rdev->bdev),
		rdev->size, rdev->faulty, rdev->in_sync, rdev->desc_nr);

	if (!rdev->sb) {
		printk(KERN_INFO "md: no rdev superblock!\n");
		return;
	}

	printk(KERN_INFO "md: rdev superblock:\n");
	print_sb(rdev->sb);
}

Linus Torvalds's avatar
Linus Torvalds committed
680
void md_print_devices(void)
Linus Torvalds's avatar
Linus Torvalds committed
681
{
Linus Torvalds's avatar
Linus Torvalds committed
682
	struct list_head *tmp, *tmp2;
Linus Torvalds's avatar
Linus Torvalds committed
683 684 685 686
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	printk("\n");
Linus Torvalds's avatar
Linus Torvalds committed
687 688 689
	printk("md:	**********************************\n");
	printk("md:	* <COMPLETE RAID STATE PRINTOUT> *\n");
	printk("md:	**********************************\n");
Linus Torvalds's avatar
Linus Torvalds committed
690 691 692 693
	ITERATE_MDDEV(mddev,tmp) {
		printk("md%d: ", mdidx(mddev));

		ITERATE_RDEV(mddev,rdev,tmp2)
694
			printk("<%s>", bdev_partition_name(rdev->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
695 696 697 698

		ITERATE_RDEV(mddev,rdev,tmp2)
			print_rdev(rdev);
	}
Linus Torvalds's avatar
Linus Torvalds committed
699
	printk("md:	**********************************\n");
Linus Torvalds's avatar
Linus Torvalds committed
700 701 702
	printk("\n");
}

Linus Torvalds's avatar
Linus Torvalds committed
703
/*
 * Compare the constant generic part of two superblocks.
 *
 * Both superblocks are copied so that nr_disks (which is not constant)
 * can be zeroed before the first MD_SB_GENERIC_CONSTANT_WORDS words are
 * compared.
 *
 * Returns 1 if they match, 0 if they differ.  If the temporary copies
 * cannot be allocated the superblocks are treated as different (0), and
 * the log message says that memory ran out rather than claiming a real
 * mismatch.
 */
static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		printk(KERN_INFO "md.c: out of memory comparing superblocks, treating sb1 as not equal to sb2!\n");
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
		ret = 0;
	else
		ret = 1;

abort:
	/* either allocation may have succeeded on the failure path */
	if (tmp1)
		kfree(tmp1);
	if (tmp2)
		kfree(tmp2);

	return ret;
}

/*
 * Return 1 if the superblocks of both rdevs carry the same 128-bit set
 * UUID (all four set_uuid words equal), 0 otherwise.
 */
static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
{
	mdp_super_t *a = rdev1->sb;
	mdp_super_t *b = rdev2->sb;

	return a->set_uuid0 == b->set_uuid0 &&
	       a->set_uuid1 == b->set_uuid1 &&
	       a->set_uuid2 == b->set_uuid2 &&
	       a->set_uuid3 == b->set_uuid3;
}

/*
 * Write the in-memory superblock of 'rdev' to its device.
 *
 * Returns 1 when the rdev is unusable (no superblock, marked faulty,
 * bad magic) or the write fails.  Returns 0 when the superblock was
 * written, and also when the write is skipped because the superblock
 * offset or the device size no longer matches what was recorded in the
 * rdev.
 */
static int write_disk_sb(mdk_rdev_t * rdev)
{
	unsigned long sb_offset, size;

	if (!rdev->sb) {
		MD_BUG();
		return 1;
	}
	if (rdev->faulty) {
		MD_BUG();
		return 1;
	}
	if (rdev->sb->md_magic != MD_SB_MAGIC) {
		MD_BUG();
		return 1;
	}

	sb_offset = calc_dev_sboffset(rdev->bdev);
	if (rdev->sb_offset != sb_offset) {
		printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
		       bdev_partition_name(rdev->bdev), rdev->sb_offset, sb_offset);
		return 0;
	}

	/*
	 * If the disk went offline meanwhile and it's just a spare, then
	 * its size has changed to zero silently, and the MD code does
	 * not yet know that it's faulty.
	 */
	size = calc_dev_size(rdev->bdev, rdev->mddev);
	if (size != rdev->size) {
		printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
		       bdev_partition_name(rdev->bdev), rdev->size, size);
		return 0;
	}

	printk(KERN_INFO "(write) %s's sb offset: %ld\n", bdev_partition_name(rdev->bdev), sb_offset);

	if (sync_page_io(rdev->bdev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
		return 0;

	printk("md: write_disk_sb failed for device %s\n", bdev_partition_name(rdev->bdev));
	return 1;
}

798
/*
 * Regenerate the in-memory superblock of every member device from the
 * array-wide state kept in @mddev.
 *
 * The superblock of the first device on mddev->disks is used as the
 * master copy: it is cleared, filled in, and then duplicated into the
 * superblocks of all other members.  Each copy gets its own this_disk
 * descriptor and checksum.
 *
 * Descriptor table construction:
 *   1/ the whole superblock (and so every descriptor) is zeroed
 *   2/ one descriptor is written per member, remembering the highest
 *      descriptor number handed out
 *   3/ untouched descriptors below that highest number are flagged
 *      as removed
 *
 * Descriptor 0 is pre-marked REMOVED because an all-zero descriptor 0
 * cannot otherwise be told apart from one that was never filled in.
 */
static void sync_sbs(mddev_t * mddev)
{
	mdk_rdev_t *rdev;
	mdp_super_t *master;
	struct list_head *tmp;
	/* devices without a raid slot are numbered after the raid slots */
	int spare_slot = mddev->raid_disks;
	int top_nr = 0;
	int slot;
	int n_active = 0, n_working = 0, n_failed = 0, n_spare = 0;
	int n_total = 0;

	if (list_empty(&mddev->disks)) {
		MD_BUG();
		return;
	}

	rdev = list_entry(mddev->disks.next, mdk_rdev_t, same_set);
	master = rdev->sb;
	memset(master, 0, sizeof(*master));

	/* identity and version */
	master->md_magic = MD_SB_MAGIC;
	master->major_version = mddev->major_version;
	master->minor_version = mddev->minor_version;
	master->patch_version = mddev->patch_version;
	master->gvalid_words  = 0; /* ignored */
	memcpy(&master->set_uuid0, mddev->uuid+0, 4);
	memcpy(&master->set_uuid1, mddev->uuid+4, 4);
	memcpy(&master->set_uuid2, mddev->uuid+8, 4);
	memcpy(&master->set_uuid3, mddev->uuid+12,4);

	/* array geometry */
	master->ctime = mddev->ctime;
	master->level = mddev->level;
	master->size  = mddev->size;
	master->raid_disks = mddev->raid_disks;
	master->md_minor = mddev->__minor;
	master->not_persistent = !mddev->persistent;

	/* state and the 64-bit event counter, split into two words */
	master->utime = mddev->utime;
	master->state = mddev->state;
	master->events_hi = (mddev->events>>32);
	master->events_lo = (u32)mddev->events;

	master->layout = mddev->layout;
	master->chunk_size = mddev->chunk_size;

	master->disks[0].state = (1<<MD_DISK_REMOVED);

	ITERATE_RDEV(mddev,rdev,tmp) {
		mdp_disk_t *desc;
		int has_slot = (rdev->raid_disk >= 0);

		rdev->desc_nr = has_slot ? rdev->raid_disk : spare_slot++;
		desc = &master->disks[rdev->desc_nr];
		n_total++;

		desc->number = rdev->desc_nr;
		desc->major = MAJOR(rdev->bdev->bd_dev);
		desc->minor = MINOR(rdev->bdev->bd_dev);
		/* slot-less devices record their descriptor number (compatibility) */
		desc->raid_disk = has_slot ? rdev->raid_disk : rdev->desc_nr;

		if (rdev->faulty) {
			desc->state = (1<<MD_DISK_FAULTY);
			n_failed++;
		} else {
			n_working++;
			if (rdev->in_sync) {
				desc->state = (1<<MD_DISK_ACTIVE);
				desc->state |= (1<<MD_DISK_SYNC);
				n_active++;
			} else {
				desc->state = 0;
				n_spare++;
			}
		}

		if (top_nr < rdev->desc_nr)
			top_nr = rdev->desc_nr;
	}

	/* flag every non-trailing hole in the descriptor table as removed */
	for (slot = 0; slot < top_nr; slot++) {
		mdp_disk_t *desc = &master->disks[slot];

		if (desc->state != 0 || desc->number != 0)
			continue;
		desc->number = slot;
		desc->raid_disk = slot;
		desc->state = (1<<MD_DISK_REMOVED);
	}

	master->nr_disks = n_total;
	master->active_disks = n_active;
	master->working_disks = n_working;
	master->failed_disks = n_failed;
	master->spare_disks = n_spare;

	/* propagate the master copy, then personalise each one */
	ITERATE_RDEV(mddev,rdev,tmp) {
		mdp_super_t *copy = rdev->sb;

		if (copy != master)
			*copy = *master;
		copy->this_disk = copy->disks[rdev->desc_nr];
		copy->sb_csum = calc_sb_csum(copy);
	}
}

913
/*
 * Bump the array's update time and event counter, rebuild all member
 * superblocks via sync_sbs() and, for persistent arrays, write them out.
 *
 * If any write fails the whole sequence (including the event counter
 * bump) is repeated, up to 100 attempts in total.  For non-persistent
 * arrays nothing is written to disk.
 */
static void md_update_sb(mddev_t * mddev)
{
	int failures;
	int attempts_left = 100;
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	mddev->sb_dirty = 0;

	for (;;) {
		mddev->utime = CURRENT_TIME;

		if (!++mddev->events) {
			/*
			 * oops, this 64-bit counter should never wrap.
			 * Either we are in around ~1 trillion A.C., assuming
			 * 1 reboot per second, or we have a bug:
			 */
			MD_BUG();
			mddev->events--;
		}
		sync_sbs(mddev);

		/*
		 * nonpersistent superblocks live in memory only,
		 * so there is nothing to write
		 */
		if (!mddev->persistent)
			return;

		printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
						mdidx(mddev));

		failures = 0;
		ITERATE_RDEV(mddev,rdev,tmp) {
			printk(KERN_INFO "md: ");
			if (rdev->faulty) {
				printk("(skipping faulty ");
				printk("%s ", bdev_partition_name(rdev->bdev));
				printk(")\n");
			} else {
				printk("%s ", bdev_partition_name(rdev->bdev));
				printk("[events: %08lx]",
					(unsigned long)rdev->sb->events_lo);
				failures += write_disk_sb(rdev);
			}
			/* multipath: only need to write one superblock... */
			if (!failures && mddev->level == LEVEL_MULTIPATH)
				break;
		}

		if (!failures)
			return;
		if (!--attempts_left)
			break;
		printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
	}

	printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
}

/*
 * Import a device. If 'on_disk', then sanity check the superblock
 *
 * mark the device faulty if:
 *
 *   - the device is nonexistent (zero size)
 *   - the device has no valid superblock
 *
 * a faulty rdev _never_ has rdev->sb set.
 */
981
static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
Linus Torvalds's avatar
Linus Torvalds committed
982 983 984 985 986 987 988
{
	int err;
	mdk_rdev_t *rdev;
	unsigned int size;

	rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
	if (!rdev) {
989
		printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(to_kdev_t(newdev)));
990
		return ERR_PTR(-ENOMEM);
Linus Torvalds's avatar
Linus Torvalds committed
991 992 993 994 995 996
	}
	memset(rdev, 0, sizeof(*rdev));

	if ((err = alloc_disk_sb(rdev)))
		goto abort_free;

997
	err = lock_rdev(rdev, newdev);
998 999
	if (err) {
		printk(KERN_ERR "md: could not lock %s.\n",
1000
			partition_name(to_kdev_t(newdev)));
Linus Torvalds's avatar
Linus Torvalds committed
1001 1002 1003 1004
		goto abort_free;
	}
	rdev->desc_nr = -1;
	rdev->faulty = 0;
1005
	rdev->in_sync = 0;
1006
	atomic_set(&rdev->nr_pending, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1007

1008
	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
Linus Torvalds's avatar
Linus Torvalds committed
1009
	if (!size) {
Andries E. Brouwer's avatar
Andries E. Brouwer committed
1010 1011
		printk(KERN_WARNING
		       "md: %s has zero or unknown size, marking faulty!\n",
1012
		       bdev_partition_name(rdev->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
1013 1014 1015 1016 1017 1018
		err = -EINVAL;
		goto abort_free;
	}

	if (on_disk) {
		if ((err = read_disk_sb(rdev))) {
Linus Torvalds's avatar
Linus Torvalds committed
1019
			printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
1020
			       bdev_partition_name(rdev->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
1021 1022 1023
			goto abort_free;
		}
		if ((err = check_disk_sb(rdev))) {
Linus Torvalds's avatar
Linus Torvalds committed
1024
			printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
1025
			       bdev_partition_name(rdev->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
1026 1027 1028
			goto abort_free;
		}
	}
1029
	INIT_LIST_HEAD(&rdev->same_set);
Linus Torvalds's avatar
Linus Torvalds committed
1030

1031
	return rdev;
Linus Torvalds's avatar
Linus Torvalds committed
1032 1033 1034 1035 1036 1037 1038 1039

abort_free:
	if (rdev->sb) {
		if (rdev->bdev)
			unlock_rdev(rdev);
		free_disk_sb(rdev);
	}
	kfree(rdev);
1040
	return ERR_PTR(err);
Linus Torvalds's avatar
Linus Torvalds committed
1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061
}

/*
 * Check a full RAID array for plausibility
 */

#define INCONSISTENT KERN_ERR \
"md: fatal superblock inconsistency in %s -- removing from array\n"

#define OUT_OF_DATE KERN_ERR \
"md: superblock update time inconsistency -- using the most recent one\n"

#define OLD_VERSION KERN_ALERT \
"md: md%d: unsupported raid array version %d.%d.%d\n"

#define NOT_CLEAN_IGNORE KERN_ERR \
"md: md%d: raid array is not clean -- starting background reconstruction\n"

#define UNKNOWN_LEVEL KERN_ERR \
"md: md%d: unsupported raid level %d\n"

Linus Torvalds's avatar
Linus Torvalds committed
1062
/*
 * Examine the superblocks of all member devices of @mddev, pick the
 * freshest one and load the array-wide state from it.
 *
 * Steps:
 *   - every member must be non-faulty, have a superblock and pass
 *     check_disk_sb(), otherwise the analysis is aborted
 *   - members whose superblock does not match the first member's
 *     (sb_equal()) are kicked from the array
 *   - the superblock with the highest event count becomes the reference
 *     and is copied into @mddev
 *   - members more than one event behind the reference are kicked
 *   - desc_nr / raid_disk / in_sync / faulty are set up for every
 *     remaining member from the reference superblock's descriptors
 *   - the superblock version is checked against what this driver supports
 *
 * Returns 0 on success, 1 if the array cannot be used.
 */
static int analyze_sbs(mddev_t * mddev)
{
	int out_of_date = 0, i;
	struct list_head *tmp;
	mdk_rdev_t *rdev, *freshest;
	mdp_super_t *sb;

	/*
	 * Verify the RAID superblock on each real device
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty) {
			MD_BUG();
			goto abort;
		}
		if (!rdev->sb) {
			MD_BUG();
			goto abort;
		}
		if (check_disk_sb(rdev))
			goto abort;
	}

	/*
	 * The superblock constant part has to be the same
	 * for all disks in the array.
	 */
	sb = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (!sb) {
			sb = rdev->sb;
			continue;
		}
		if (!sb_equal(sb, rdev->sb)) {
			printk(INCONSISTENT, bdev_partition_name(rdev->bdev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/*
	 * OK, we have all disks and the array is ready to run. Let's
	 * find the freshest superblock, that one will be the superblock
	 * that represents the whole array.
	 */
	freshest = NULL;

	ITERATE_RDEV(mddev,rdev,tmp) {
		__u64 ev1, ev2;
		/*
		 * if the checksum is invalid, use the superblock
		 * only as a last resort. (decrease its age by
		 * one event, borrowing from events_hi when
		 * events_lo is already zero)
		 */
		if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
			if (rdev->sb->events_lo || rdev->sb->events_hi)
				if (!(rdev->sb->events_lo--))
					rdev->sb->events_hi--;
		}

		printk(KERN_INFO "md: %s's event counter: %08lx\n",
		       bdev_partition_name(rdev->bdev),
			(unsigned long)rdev->sb->events_lo);
		if (!freshest) {
			freshest = rdev;
			continue;
		}
		/*
		 * Find the newest superblock version
		 */
		ev1 = md_event(rdev->sb);
		ev2 = md_event(freshest->sb);
		if (ev1 != ev2) {
			out_of_date = 1;
			if (ev1 > ev2)
				freshest = rdev;
		}
	}
	if (out_of_date) {
		printk(OUT_OF_DATE);
		printk(KERN_INFO "md: freshest: %s\n", bdev_partition_name(freshest->bdev));
	}

	sb = freshest->sb;

	mddev->major_version = sb->major_version;
	mddev->minor_version = sb->minor_version;
	mddev->patch_version = sb->patch_version;
	mddev->persistent = ! sb->not_persistent;
	mddev->chunk_size = sb->chunk_size;
	mddev->ctime = sb->ctime;
	mddev->utime = sb->utime;
	mddev->level = sb->level;
	mddev->layout = sb->layout;
	mddev->raid_disks = sb->raid_disks;
	mddev->state = sb->state;
	mddev->size = sb->size;
	mddev->events = md_event(sb);
	
	memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
	memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
	memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
	memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

	/*
	 * at this point we have picked the 'best' superblock
	 * from all available superblocks.
	 * now we validate this superblock and kick out possibly
	 * failed disks.
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		/*
		 * Kick all non-fresh devices; being exactly one
		 * event behind the reference is still tolerated.
		 */
		__u64 ev1;
		ev1 = md_event(rdev->sb);
		++ev1;
		if (ev1 < mddev->events) {
			printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
						bdev_partition_name(rdev->bdev));
			kick_rdev_from_array(rdev);
			continue;
		}
	}

	/* set rdev->desc_nr for each device.
	 * for MULTIPATH, we just use a sequential number as
	 * nothing else is meaningful
	 */
	i = 0;
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (mddev->level == LEVEL_MULTIPATH) {
			rdev->desc_nr = i++;
			rdev->raid_disk = rdev->desc_nr;
			rdev->in_sync = 1;
		} else {
			mdp_disk_t *desc;
			/* number of entries in the superblock's descriptor table */
			const int nr_descs = sizeof(sb->disks) / sizeof(sb->disks[0]);

			rdev->desc_nr = rdev->sb->this_disk.number;
			/*
			 * this_disk.number comes straight from the disk;
			 * never use it as an index without range checking.
			 */
			if (rdev->desc_nr < 0 || rdev->desc_nr >= nr_descs) {
				printk(KERN_ERR "md: %s has invalid disk number %d in its superblock\n",
				       bdev_partition_name(rdev->bdev),
				       rdev->desc_nr);
				goto abort;
			}
			desc = sb->disks + rdev->desc_nr;
			rdev->raid_disk = -1;
			rdev->in_sync = rdev->faulty = 0;

			if (desc->state & (1<<MD_DISK_FAULTY)) {
				rdev->faulty = 1;
				kick_rdev_from_array(rdev);
			} else if (desc->state & (1<<MD_DISK_SYNC) &&
				   desc->raid_disk < mddev->raid_disks) {
				rdev->in_sync = 1;
				rdev->raid_disk = desc->raid_disk;
			}
		}
	}


	/*
	 * Check if we can support this RAID array
	 */
	if (mddev->major_version != MD_MAJOR_VERSION ||
			mddev->minor_version > MD_MINOR_VERSION) {

		printk(OLD_VERSION, mdidx(mddev), mddev->major_version,
				mddev->minor_version, mddev->patch_version);
		goto abort;
	}

	/* a dirty redundant array is only reported here, not rejected */
	if ((mddev->state != (1 << MD_SB_CLEAN)) && ((mddev->level == 1) ||
			(mddev->level == 4) || (mddev->level == 5)))
		printk(NOT_CLEAN_IGNORE, mdidx(mddev));

	return 0;
abort:
	return 1;
}

#undef INCONSISTENT
#undef OUT_OF_DATE
#undef OLD_VERSION
#undef OLD_LEVEL

Linus Torvalds's avatar
Linus Torvalds committed
1242
/*
 * Compute the usable size of every non-faulty member device, derive the
 * number of data disks from the RAID level, fill in the array size in
 * md_size[] (unless it is already set) and report the readahead window.
 *
 * Returns 0 on success, -EINVAL if a member is smaller than one chunk,
 * and 1 if the RAID level is unknown or the level/raid_disks combination
 * leaves no data disk.
 */
static int device_size_calculation(mddev_t * mddev)
{
	int data_disks = 0;
	unsigned int readahead;
	struct list_head *tmp;
	mdk_rdev_t *rdev;

	/*
	 * Do device size calculation. Bail out if too small.
	 * (we have to do this after having validated chunk_size,
	 * because device size has to be modulo chunk_size)
	 */

	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		/* the size is expected to be unset at this point */
		if (rdev->size) {
			MD_BUG();
			continue;
		}
		rdev->size = calc_dev_size(rdev->bdev, mddev);
		if (rdev->size < mddev->chunk_size / 1024) {
			printk(KERN_WARNING
				"md: Dev %s smaller than chunk_size: %ldk < %dk\n",
				bdev_partition_name(rdev->bdev),
				rdev->size, mddev->chunk_size / 1024);
			return -EINVAL;
		}
	}

	switch (mddev->level) {
		case LEVEL_MULTIPATH:
			data_disks = 1;
			break;
		case -3:
			data_disks = 1;
			break;
		case -2:
			data_disks = 1;
			break;
		case LEVEL_LINEAR:
			zoned_raid_size(mddev);
			data_disks = 1;
			break;
		case 0:
			zoned_raid_size(mddev);
			data_disks = mddev->raid_disks;
			break;
		case 1:
			data_disks = 1;
			break;
		case 4:
		case 5:
			data_disks = mddev->raid_disks-1;
			break;
		default:
			printk(UNKNOWN_LEVEL, mdidx(mddev), mddev->level);
			goto abort;
	}

	/*
	 * data_disks is used as a divisor below.  Level 0 with no raid
	 * disks, or level 4/5 with a single raid disk, would make it
	 * zero; refuse such a geometry instead of dividing by zero.
	 */
	if (data_disks < 1) {
		printk(KERN_ERR
			"md: md%d: invalid number of raid disks (%d) for raid level %d\n",
			mdidx(mddev), mddev->raid_disks, mddev->level);
		goto abort;
	}

	if (!md_size[mdidx(mddev)])
		md_size[mdidx(mddev)] = mddev->size * data_disks;

	readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
	if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
		/* striped levels: scale with chunk size and data disks */
		readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
		if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
			readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
	} else {
		/* (no multipath branch - it uses the default setting) */
		if (mddev->level == -3)
			readahead = 0;
	}

	printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
		mdidx(mddev), readahead*(PAGE_SIZE/1024));

	printk(KERN_INFO
		"md%d: %d data-disks, max readahead per data-disk: %ldk\n",
			mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
	return 0;
abort:
	return 1;
}


#define TOO_BIG_CHUNKSIZE KERN_ERR \
"too big chunk_size: %d > %d\n"

#define TOO_SMALL_CHUNKSIZE KERN_ERR \
"too small chunk_size: %d < %ld\n"

#define BAD_CHUNKSIZE KERN_ERR \
"no chunksize specified, see 'man raidtab'\n"

Linus Torvalds's avatar
Linus Torvalds committed
1336
/*
 * Start the array described by @mddev.
 *
 * Validates the chunk size, looks up (and, with CONFIG_KMOD, tries to
 * load) the personality for the array's level, sizes the member devices,
 * flushes their buffers, allocates a gendisk, runs the personality and
 * finally writes the superblocks and registers the disk.
 *
 * Returns 0 on success, -EBUSY if a personality is already attached,
 * -ENOMEM if the gendisk or its name cannot be allocated, and -EINVAL
 * for every other failure (including a failing pers->run(), whose own
 * error code is not passed on).
 */
static int do_md_run(mddev_t * mddev)
{
	int pnum, err;
	int chunk_size;
	struct list_head *tmp;
	mdk_rdev_t *rdev;
	struct gendisk *disk;
	char *major_name;


	if (list_empty(&mddev->disks)) {
		MD_BUG();
		return -EINVAL;
	}

	if (mddev->pers)
		return -EBUSY;

	/*
	 * Reset the recorded array size; device_size_calculation()
	 * below only fills it in while it is still zero.
	 */
	md_size[mdidx(mddev)] = 0;

	/*
	 * Analyze all RAID superblock(s).  This is only done while
	 * raid_disks is still zero; a non-zero value skips the analysis.
	 */
	if (!mddev->raid_disks && analyze_sbs(mddev)) {
		MD_BUG();
		return -EINVAL;
	}

	chunk_size = mddev->chunk_size;
	pnum = level_to_pers(mddev->level);

	if ((pnum != MULTIPATH) && (pnum != RAID1)) {
		if (!chunk_size) {
			/*
			 * 'default chunksize' in the old md code used to
			 * be PAGE_SIZE, baaad.
			 * we abort here to be on the safe side. We dont
			 * want to continue the bad practice.
			 */
			printk(BAD_CHUNKSIZE);
			return -EINVAL;
		}
		if (chunk_size > MAX_CHUNK_SIZE) {
			printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
			return -EINVAL;
		}
		/*
		 * chunk-size has to be a power of 2 and at least PAGE_SIZE.
		 * ffz(~x) is the index of the lowest set bit of x, so the
		 * comparison only holds when exactly one bit is set.
		 */
		if ( (1 << ffz(~chunk_size)) != chunk_size) {
			MD_BUG();
			return -EINVAL;
		}
		if (chunk_size < PAGE_SIZE) {
			printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
			return -EINVAL;
		}
	} else
		if (chunk_size)
			printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
			       mddev->level);

	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
		return -EINVAL;
	}

	if (!pers[pnum])
	{
#ifdef CONFIG_KMOD
		/*
		 * Try to load the personality module, then look again.
		 * Without CONFIG_KMOD the block below runs unconditionally.
		 */
		char module_name[80];
		sprintf (module_name, "md-personality-%d", pnum);
		request_module (module_name);
		if (!pers[pnum])
#endif
		{
			printk(KERN_ERR "md: personality %d is not loaded!\n",
				pnum);
			return -EINVAL;
		}
	}

	if (device_size_calculation(mddev))
		return -EINVAL;

	/*
	 * Drop all container device buffers, from now on
	 * the only valid external interface is through the md
	 * device.
	 * (Finding the largest hardsector size is disabled below.)
	 */
	ITERATE_RDEV(mddev,rdev,tmp) {
		if (rdev->faulty)
			continue;
		sync_blockdev(rdev->bdev);
		invalidate_bdev(rdev->bdev, 0);
#if 0
	/*
	 * Aside of obvious breakage (code below results in block size set
	 * according to the sector size of last component instead of the
	 * maximal sector size), we have more interesting problem here.
	 * Namely, we actually ought to set _sector_ size for the array
	 * and that requires per-array request queues.  Disabled for now.
	 */
		md_blocksizes[mdidx(mddev)] = 1024;
		if (bdev_hardsect_size(rdev->bdev) > md_blocksizes[mdidx(mddev)])
			md_blocksizes[mdidx(mddev)] = bdev_hardsect_size(rdev->bdev);
#endif
	}

	disk = kmalloc(sizeof(struct gendisk), GFP_KERNEL);
	if (!disk)
		return -ENOMEM;
	memset(disk, 0, sizeof(struct gendisk));
	/*
	 * 6 bytes hold "md" plus at most three digits and the NUL.
	 * NOTE(review): assumes mdidx() never exceeds 999 - confirm.
	 */
	major_name = kmalloc(6, GFP_KERNEL);
	if (!major_name) {
		kfree(disk);
		return -ENOMEM;
	}
	disk->major = MD_MAJOR;
	disk->first_minor = mdidx(mddev);
	disk->minor_shift = 0;
	sprintf(major_name, "md%d", mdidx(mddev));
	disk->major_name = major_name;
	disk->part = md_hd_struct + mdidx(mddev);
	disk->nr_real = 1;
	disk->fops = &md_fops;

	mddev->pers = pers[pnum];

	/* route requests on the array's queue to the personality */
	blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
	mddev->queue.queuedata = mddev;

	err = mddev->pers->run(mddev);
	if (err) {
		printk(KERN_ERR "md: pers->run() failed ...\n");
		/* undo the personality binding and the gendisk allocation */
		mddev->pers = NULL;
		kfree(disk->major_name);
		kfree(disk);
		return -EINVAL;
	}

	/* remember whether the superblock said the array was clean */
	mddev->in_sync = (mddev->state & (1<<MD_SB_CLEAN));
	/* if personality doesn't have "sync_request", then
	 * a dirty array doesn't mean anything
	 */
	if (mddev->pers->sync_request)
		mddev->state &= ~(1 << MD_SB_CLEAN);
	md_update_sb(mddev);
	md_recover_arrays();
	add_gendisk(disk);
	/* md_size[] is shifted left by one before being handed on as the size */
	register_disk(disk, mk_kdev(disk->major,disk->first_minor),
			1, &md_fops, md_size[mdidx(mddev)]<<1);
	/* do_md_stop() takes the gendisk back out of this table to free it */
	disks[mdidx(mddev)] = disk;

	return (0);
}

#undef TOO_BIG_CHUNKSIZE
#undef BAD_CHUNKSIZE

Linus Torvalds's avatar
Linus Torvalds committed
1501
/*
 * Switch a read-only array back to read-write mode.
 *
 * Returns 0 on success, -ENXIO if the array has no devices, -EINVAL if
 * no personality is attached, and -EBUSY if the array is not currently
 * read-only.
 */
static int restart_array(mddev_t *mddev)
{
	/*
	 * Complain if it has no devices
	 */
	if (list_empty(&mddev->disks))
		return -ENXIO;

	if (!mddev->pers) {
		printk(KERN_ERR "md: md%d has no personality assigned.\n",
			mdidx(mddev));
		return -EINVAL;
	}

	/* only an array that is currently read-only can be restarted */
	if (!mddev->ro)
		return -EBUSY;

	mddev->ro = 0;
	set_device_ro(mddev_to_kdev(mddev), 0);

	printk(KERN_INFO
		"md: md%d switched to read-write mode.\n", mdidx(mddev));
	/*
	 * Kick recovery or resync if necessary
	 */
	md_recover_arrays();

	return 0;
}

#define STILL_MOUNTED KERN_WARNING \
"md: md%d still mounted.\n"
#define	STILL_IN_USE \
"md: md%d still in use.\n"

Linus Torvalds's avatar
Linus Torvalds committed
1542
/*
 * Stop the array (@ro == 0) or switch it to read-only mode (@ro != 0).
 *
 * Returns 0 on success, -EBUSY if the array has more than one active
 * user or the personality refuses to stop, and -ENXIO if read-only mode
 * is requested for an array that is already read-only.
 */
static int do_md_stop(mddev_t * mddev, int ro)
{
	int err = 0;
	kdev_t dev = mddev_to_kdev(mddev);
	struct gendisk *disk;

	if (atomic_read(&mddev->active)>1) {
		printk(STILL_IN_USE, mdidx(mddev));
		err = -EBUSY;
		goto out;
	}

	if (mddev->pers) {
		/* interrupt a running resync/recovery and reap its thread */
		if (mddev->sync_thread) {
			if (mddev->recovery_running > 0)
				mddev->recovery_running = -EINTR;
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
		}

		invalidate_device(dev, 1);

		if (ro) {
			/* already read-only: nothing to switch */
			err  = -ENXIO;
			if (mddev->ro)
				goto out;
			mddev->ro = 1;
		} else {
			/*
			 * Lift the device-level read-only flag while stopping;
			 * it is restored if the personality refuses to stop.
			 */
			if (mddev->ro)
				set_device_ro(dev, 0);
			if (mddev->pers->stop(mddev)) {
				err = -EBUSY;
				if (mddev->ro)
					set_device_ro(dev, 1);
				goto out;
			}
			if (mddev->ro)
				mddev->ro = 0;
		}
		if (mddev->raid_disks) {
			/*
			 * mark it clean only if there was no resync
			 * interrupted.
			 */
			if (mddev->in_sync) {
				printk(KERN_INFO "md: marking sb clean...\n");
				mddev->state |= 1 << MD_SB_CLEAN;
			}
			md_update_sb(mddev);
		}
		if (ro)
			set_device_ro(dev, 1);
	}
	/*
	 * Tear down the gendisk that do_md_run() registered.
	 * NOTE(review): this runs for the read-only switch as well, and
	 * restart_array() does not register a gendisk again - confirm
	 * that this is intended.
	 */
	disk = disks[mdidx(mddev)];
	disks[mdidx(mddev)] = NULL;

	if (disk) {
		del_gendisk(disk);
		kfree(disk->major_name);
		kfree(disk);
	}

	/*
	 * Free resources if final stop
	 */
	if (!ro) {
		printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
		free_mddev(mddev);
	} else
		printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
	err = 0;
out:
	return err;
}

/*
 * We have to safely support old arrays too.
 */
Linus Torvalds's avatar
Linus Torvalds committed
1620
/*
 * Returns 0 if the superblock version is 0.90 or later (major version
 * above 0, or minor version of at least 90), -EINVAL otherwise.
 */
int detect_old_array(mdp_super_t *sb)
{
	int recent_enough = (sb->major_version > 0) ||
			    (sb->minor_version >= 90);

	return recent_enough ? 0 : -EINVAL;
}


Linus Torvalds's avatar
Linus Torvalds committed
1631
/*
 * Log the member devices of @mddev and try to start the array.
 * If starting fails, the array is stopped again without its
 * superblocks being written back.
 */
static void autorun_array(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	struct list_head *tmp;
	int rc;

	if (list_empty(&mddev->disks)) {
		MD_BUG();
		return;
	}

	/* one log line listing every member as <name> */
	printk(KERN_INFO "md: running: ");
	ITERATE_RDEV(mddev,rdev,tmp) {
		printk("<%s>", bdev_partition_name(rdev->bdev));
	}
	printk("\n");

	rc = do_md_run (mddev);
	if (!rc)
		return;

	printk(KERN_WARNING "md :do_md_run() returned %d\n", rc);
	/*
	 * prevent the writeback of an unrunnable array
	 */
	mddev->sb_dirty = 0;
	do_md_stop (mddev, 0);
}

/*
 * let's try to run arrays based on all disks that have arrived
Neil Brown's avatar
Neil Brown committed
1662
 * until now. (those are in pending_raid_disks)
Linus Torvalds's avatar
Linus Torvalds committed
1663 1664 1665 1666 1667 1668 1669 1670 1671
 *
 * the method: pick the first pending disk, collect all disks with
 * the same UUID, remove all from the pending list and put them into
 * the 'same_array' list. Then order this list based on superblock
 * update time (freshest comes first), kick out 'old' disks and
 * compare superblocks. If everything's fine then run it.
 *
 * If "unit" is allocated, then bump its reference count
 */
1672
static void autorun_devices(void)
Linus Torvalds's avatar
Linus Torvalds committed
1673
{
Linus Torvalds's avatar
Linus Torvalds committed
1674 1675
	struct list_head candidates;
	struct list_head *tmp;
Linus Torvalds's avatar
Linus Torvalds committed
1676 1677 1678
	mdk_rdev_t *rdev0, *rdev;
	mddev_t *mddev;

Linus Torvalds's avatar
Linus Torvalds committed
1679
	printk(KERN_INFO "md: autorun ...\n");
1680
	while (!list_empty(&pending_raid_disks)) {
Linus Torvalds's avatar
Linus Torvalds committed
1681
		rdev0 = list_entry(pending_raid_disks.next,
Neil Brown's avatar
Neil Brown committed
1682
					 mdk_rdev_t, same_set);
Linus Torvalds's avatar
Linus Torvalds committed
1683

1684
		printk(KERN_INFO "md: considering %s ...\n", bdev_partition_name(rdev0->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
1685
		INIT_LIST_HEAD(&candidates);
Linus Torvalds's avatar
Linus Torvalds committed
1686 1687 1688
		ITERATE_RDEV_PENDING(rdev,tmp) {
			if (uuid_equal(rdev0, rdev)) {
				if (!sb_equal(rdev0->sb, rdev->sb)) {
Linus Torvalds's avatar
Linus Torvalds committed
1689 1690
					printk(KERN_WARNING
					       "md: %s has same UUID as %s, but superblocks differ ...\n",
1691
					       bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
1692 1693
					continue;
				}
1694
				printk(KERN_INFO "md:  adding %s ...\n", bdev_partition_name(rdev->bdev));
Neil Brown's avatar
Neil Brown committed
1695
				list_move(&rdev->same_set, &candidates);
Linus Torvalds's avatar
Linus Torvalds committed
1696 1697 1698 1699 1700 1701 1702
			}
		}
		/*
		 * now we have a set of devices, with all of them having
		 * mostly sane superblocks. It's time to allocate the
		 * mddev.
		 */
1703 1704

		mddev = mddev_find(rdev0->sb->md_minor);
Linus Torvalds's avatar
Linus Torvalds committed
1705 1706 1707 1708
		if (!mddev) {
			printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
			break;
		}
1709 1710 1711
		if (mddev_lock(mddev)) 
			printk(KERN_WARNING "md: md%d locked, cannot run\n",
			       mdidx(mddev));
1712
		else if (mddev->raid_disks || !list_empty(&mddev->disks)) {
Linus Torvalds's avatar
Linus Torvalds committed
1713
			printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
1714
			       mdidx(mddev), bdev_partition_name(rdev0->bdev));
1715 1716 1717
			mddev_unlock(mddev);
		} else {
			printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
Neil Brown's avatar
Neil Brown committed
1718 1719
			ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
				list_del_init(&rdev->same_set);
1720 1721 1722 1723
				bind_rdev_to_array(rdev, mddev);
			}
			autorun_array(mddev);
			mddev_unlock(mddev);
Linus Torvalds's avatar
Linus Torvalds committed
1724
		}
1725 1726 1727
		/* on success, candidates will be empty, on error
		 * it wont...
		 */
Neil Brown's avatar
Neil Brown committed
1728
		ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
1729
			export_rdev(rdev);
1730
		mddev_put(mddev);
Linus Torvalds's avatar
Linus Torvalds committed
1731
	}
Linus Torvalds's avatar
Linus Torvalds committed
1732
	printk(KERN_INFO "md: ... autorun DONE.\n");
Linus Torvalds's avatar
Linus Torvalds committed
1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766
}

/*
 * import RAID devices based on one partition
 * if possible, the array gets run as well.
 */

/*
 * printk() templates for the autostart code: each one is a log level
 * followed by the format string.  Most of them are #undef'd again after
 * autostart_array().
 *
 * NOTE(review): the autostart_array() definition that follows does not
 * reference any of these templates; they look like leftovers - confirm
 * against the rest of the file before removing them.
 */
#define BAD_VERSION KERN_ERR \
"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"

#define OUT_OF_MEM KERN_ALERT \
"md: out of memory.\n"

#define NO_DEVICE KERN_ERR \
"md: disabled device %s\n"

#define AUTOADD_FAILED KERN_ERR \
"md: auto-adding devices to md%d FAILED (error %d).\n"

#define AUTOADD_FAILED_USED KERN_ERR \
"md: cannot auto-add device %s to md%d, already used.\n"

#define AUTORUN_FAILED KERN_ERR \
"md: auto-running md%d FAILED (error %d).\n"

#define MDDEV_BUSY KERN_ERR \
"md: cannot auto-add to md%d, already running.\n"

#define AUTOADDING KERN_INFO \
"md: auto-adding devices to md%d, based on %s's superblock.\n"

#define AUTORUNNING KERN_INFO \
"md: auto-running md%d.\n"

1767
/*
 * Import @startdev and every other device listed in its superblock, queue
 * them on pending_raid_disks and let autorun_devices() assemble and start
 * whatever can be formed from them.
 *
 * Returns 0 once autorun_devices() has been called (its outcome is not
 * reported back), -EINVAL if @startdev cannot be imported or is faulty, or
 * the error from detect_old_array() if the superblock is too old.
 */
static int autostart_array(dev_t startdev)
{
	int err = -EINVAL, i;
	mdp_super_t *sb;
	mdk_rdev_t *start_rdev, *rdev;

	start_rdev = md_import_device(startdev, 1);
	if (IS_ERR(start_rdev)) {
		printk(KERN_WARNING "md: could not import %s!\n", partition_name(to_kdev_t(startdev)));
		/*
		 * start_rdev is an ERR_PTR here, not a device: return
		 * directly instead of going through 'abort', which would
		 * hand the error pointer to export_rdev().
		 */
		return err;
	}

	if (start_rdev->faulty) {
		printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
						bdev_partition_name(start_rdev->bdev));
		goto abort;
	}
	list_add(&start_rdev->same_set, &pending_raid_disks);

	sb = start_rdev->sb;

	err = detect_old_array(sb);
	if (err) {
		printk(KERN_WARNING "md: array version is too old to be autostarted ,"
		       "use raidtools 0.90 mkraid --upgrade to upgrade the array "
		       "without data loss!\n");
		goto abort;
	}

	/* queue every other member named in the start device's superblock */
	for (i = 0; i < MD_SB_DISKS; i++) {
		mdp_disk_t *desc;
		dev_t dev;

		desc = sb->disks + i;
		dev = MKDEV(desc->major, desc->minor);

		/* skip unused descriptor slots and the start device itself */
		if (!dev)
			continue;
		if (dev == startdev)
			continue;
		rdev = md_import_device(dev, 1);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
			       partition_name(to_kdev_t(dev)));
			continue;
		}
		list_add(&rdev->same_set, &pending_raid_disks);
	}

	/*
	 * autorun_devices() returns nothing, so failures while assembling
	 * or running the array are not propagated to our caller.
	 */
	autorun_devices();
	return 0;

abort:
	/* only reached with a successfully imported start_rdev */
	export_rdev(start_rdev);
	return err;
}

/* drop the autostart message templates again, including MDDEV_BUSY */
#undef BAD_VERSION
#undef OUT_OF_MEM
#undef NO_DEVICE
#undef AUTOADD_FAILED_USED
#undef AUTOADD_FAILED
#undef AUTORUN_FAILED
#undef MDDEV_BUSY
#undef AUTOADDING
#undef AUTORUNNING


Linus Torvalds's avatar
Linus Torvalds committed
1838
static int get_version(void * arg)
Linus Torvalds's avatar
Linus Torvalds committed
1839 1840 1841 1842 1843 1844 1845
{
	mdu_version_t ver;

	ver.major = MD_MAJOR_VERSION;
	ver.minor = MD_MINOR_VERSION;
	ver.patchlevel = MD_PATCHLEVEL_VERSION;

Linus Torvalds's avatar
Linus Torvalds committed
1846
	if (copy_to_user(arg, &ver, sizeof(ver)))
Linus Torvalds's avatar
Linus Torvalds committed
1847 1848 1849 1850 1851
		return -EFAULT;

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
1852
/*
 * GET_ARRAY_INFO: fill an mdu_array_info_t from @mddev and copy it to the
 * user buffer @arg.  The per-state disk counters are computed by walking
 * the array's device list.  Returns 0, or -EFAULT if the copy fails.
 */
static int get_array_info(mddev_t * mddev, void * arg)
{
	mdu_array_info_t info;
	struct list_head *pos;
	mdk_rdev_t *disk;
	int total = 0, n_failed = 0, n_active = 0, n_spare = 0;

	/* classify every bound device as failed, active (in sync) or spare */
	ITERATE_RDEV(mddev,disk,pos) {
		total++;
		if (disk->faulty)
			n_failed++;
		else if (disk->in_sync)
			n_active++;
		else
			n_spare++;
	}

	info.major_version = mddev->major_version;
	info.minor_version = mddev->minor_version;
	info.patch_version = mddev->patch_version;
	info.ctime         = mddev->ctime;
	info.level         = mddev->level;
	info.size          = mddev->size;
	info.nr_disks      = total;
	info.raid_disks    = mddev->raid_disks;
	info.md_minor      = mddev->__minor;
	info.not_persistent= !mddev->persistent;

	info.utime         = mddev->utime;
	info.state         = mddev->state;
	info.active_disks  = n_active;
	/* working = every device that is not faulty */
	info.working_disks = n_active + n_spare;
	info.failed_disks  = n_failed;
	info.spare_disks   = n_spare;

	info.layout        = mddev->layout;
	info.chunk_size    = mddev->chunk_size;

	return copy_to_user(arg, &info, sizeof(info)) ? -EFAULT : 0;
}
#undef SET_FROM_SB

1902

Linus Torvalds's avatar
Linus Torvalds committed
1903
/*
 * GET_DISK_INFO: the caller passes an mdu_disk_info_t whose 'number' field
 * selects a descriptor slot; the remaining fields are filled in from the
 * matching device (or marked as removed if there is none) and the
 * structure is copied back to @arg.
 *
 * Returns 0, -EFAULT on a failed user copy, or -EINVAL if the requested
 * slot is >= MD_SB_DISKS.
 */
static int get_disk_info(mddev_t * mddev, void * arg)
{
	mdu_disk_info_t info;
	mdk_rdev_t *disk;
	unsigned int slot;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	slot = info.number;
	if (slot >= MD_SB_DISKS)
		return -EINVAL;

	disk = find_rdev_nr(mddev, slot);
	if (!disk) {
		/* nothing bound in this slot */
		info.major = info.minor = 0;
		info.raid_disk = -1;
		info.state = (1<<MD_DISK_REMOVED);
	} else {
		info.major = MAJOR(disk->bdev->bd_dev);
		info.minor = MINOR(disk->bdev->bd_dev);
		info.raid_disk = disk->raid_disk;
		if (disk->faulty)
			info.state = (1<<MD_DISK_FAULTY);
		else if (disk->in_sync)
			info.state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
		else
			info.state = 0;
	}

	return copy_to_user(arg, &info, sizeof(info)) ? -EFAULT : 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
1940
/*
 * ADD_NEW_DISK: bind the device described by @info to @mddev.
 *
 * While the array has no raid_disks configured, the device is imported
 * together with its superblock, which must match (UUID and contents) that
 * of the first device already bound, if any.
 *
 * Otherwise the device is imported without reading a superblock and its
 * role is taken from @info; descriptors flagged MD_DISK_FAULTY are
 * silently ignored.
 *
 * Returns 0 on success, the error from md_import_device(), or -EINVAL on
 * a UUID/superblock mismatch.
 */
static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
{
	dev_t devno = MKDEV(info->major,info->minor);
	mdk_rdev_t *new_rdev;
	int blocks;

	if (!mddev->raid_disks) {
		/* expecting a device which has a superblock */
		new_rdev = md_import_device(devno, 1);
		if (IS_ERR(new_rdev)) {
			printk(KERN_WARNING "md: md_import_device returned %ld\n", PTR_ERR(new_rdev));
			return PTR_ERR(new_rdev);
		}
		if (!list_empty(&mddev->disks)) {
			/* compare against the first device already in the array */
			mdk_rdev_t *first = list_entry(mddev->disks.next,
							mdk_rdev_t, same_set);

			if (!uuid_equal(first, new_rdev)) {
				printk(KERN_WARNING "md: %s has different UUID to %s\n",
				       bdev_partition_name(new_rdev->bdev), bdev_partition_name(first->bdev));
				export_rdev(new_rdev);
				return -EINVAL;
			}
			if (!sb_equal(first->sb, new_rdev->sb)) {
				printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
				       bdev_partition_name(new_rdev->bdev), bdev_partition_name(first->bdev));
				export_rdev(new_rdev);
				return -EINVAL;
			}
		}
		bind_rdev_to_array(new_rdev, mddev);
		return 0;
	}

	/* descriptors marked faulty are not added at all */
	if (info->state & (1<<MD_DISK_FAULTY))
		return 0;

	new_rdev = md_import_device (devno, 0);
	if (IS_ERR(new_rdev)) {
		printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(new_rdev));
		return PTR_ERR(new_rdev);
	}

	new_rdev->desc_nr = info->number;
	new_rdev->raid_disk = (info->raid_disk < mddev->raid_disks)
				? info->raid_disk : -1;
	new_rdev->faulty = 0;

	/*
	 * NOTE(review): raid_disk was just forced to either a value below
	 * raid_disks or -1, so this condition also holds for -1 whenever
	 * raid_disks is positive - confirm that is intended.
	 */
	if (new_rdev->raid_disk < mddev->raid_disks)
		new_rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
	else
		new_rdev->in_sync = 0;

	bind_rdev_to_array(new_rdev, mddev);

	if (!mddev->persistent)
		printk(KERN_INFO "md: nonpersistent superblock ...\n");

	blocks = calc_dev_size(new_rdev->bdev, mddev);
	new_rdev->sb_offset = calc_dev_sboffset(new_rdev->bdev);

	/* the array size is unset (0) or larger than this device: shrink it */
	if (!mddev->size || (mddev->size > blocks))
		mddev->size = blocks;

	return 0;
}

2006
/*
 * HOT_GENERATE_ERROR: validate that @dev is an in-sync member of the
 * running array @mddev and that it has a request queue.  The error
 * injection itself is disabled, so on success this only logs.
 *
 * Returns 0, -ENODEV (array not running, device not in sync, or no queue),
 * -ENXIO (device not part of the array) or -EINVAL (desc_nr is -1).
 */
static int hot_generate_error(mddev_t * mddev, dev_t dev)
{
	struct request_queue *queue;
	mdk_rdev_t *target;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
		partition_name(to_kdev_t(dev)), mdidx(mddev));

	target = find_rdev(mddev, dev);
	if (!target) {
		MD_BUG();
		return -ENXIO;
	}

	if (target->desc_nr == -1) {
		MD_BUG();
		return -EINVAL;
	}

	if (!target->in_sync)
		return -ENODEV;

	queue = bdev_get_queue(target->bdev);
	if (!queue) {
		MD_BUG();
		return -ENODEV;
	}

	printk(KERN_INFO "md: okay, generating error!\n");
	/*
	 * Setting the queue's oneshot_error flag here is disabled for now,
	 * so nothing is actually injected.
	 */

	return 0;
}

2041
/*
 * HOT_REMOVE_DISK: detach @dev from the running array @mddev and update
 * the superblock.  A device that still has a role (raid_disk >= 0) is
 * refused.
 *
 * Returns 0, -ENODEV (array not running), -ENXIO (device not in the array)
 * or -EBUSY (device still active).
 */
static int hot_remove_disk(mddev_t * mddev, dev_t dev)
{
	mdk_rdev_t *victim;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
		partition_name(to_kdev_t(dev)), mdidx(mddev));

	victim = find_rdev(mddev, dev);
	if (!victim)
		return -ENXIO;

	if (victim->raid_disk >= 0) {
		printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
			bdev_partition_name(victim->bdev), mdidx(mddev));
		return -EBUSY;
	}

	kick_rdev_from_array(victim);
	md_update_sb(mddev);

	return 0;
}

2068
/*
 * HOT_ADD_DISK: import @dev (without reading a superblock), bind it to the
 * running array @mddev as a not-in-sync device without a role, write the
 * superblock and kick recovery.
 *
 * Returns 0 on success, or:
 *   -ENODEV  the array is not running
 *   -EINVAL  the personality has no hot_add_disk method, the import
 *            failed, or the imported device is marked faulty
 *   -ENOSPC  the device is smaller than the array's per-device size
 *   -EBUSY   no free descriptor slot between raid_disks and MD_SB_DISKS
 */
static int hot_add_disk(mddev_t * mddev, dev_t dev)
{
	int i, err;
	unsigned int size;
	mdk_rdev_t *rdev;

	if (!mddev->pers)
		return -ENODEV;

	printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
		partition_name(to_kdev_t(dev)), mdidx(mddev));

	if (!mddev->pers->hot_add_disk) {
		printk(KERN_WARNING "md%d: personality does not support diskops!\n",
		       mdidx(mddev));
		return -EINVAL;
	}

	rdev = md_import_device (dev, 0);
	if (IS_ERR(rdev)) {
		printk(KERN_WARNING "md: error, md_import_device() returned %ld\n", PTR_ERR(rdev));
		return -EINVAL;
	}

	size = calc_dev_size(rdev->bdev, mddev);

	if (size < mddev->size) {
		/* size is unsigned int, so it is printed with %u */
		printk(KERN_WARNING "md%d: disk size %u blocks < array size %ld\n",
				mdidx(mddev), size, mddev->size);
		err = -ENOSPC;
		goto abort_export;
	}

	if (rdev->faulty) {
		printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
				bdev_partition_name(rdev->bdev), mdidx(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	rdev->in_sync = 0;
	bind_rdev_to_array(rdev, mddev);

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */
	rdev->size = size;
	rdev->sb_offset = calc_dev_sboffset(rdev->bdev);

	/* find the first descriptor number at or above raid_disks that is unused */
	for (i = mddev->raid_disks; i < MD_SB_DISKS; i++)
		if (find_rdev_nr(mddev,i)==NULL)
			break;

	if (i == MD_SB_DISKS) {
		printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
		       mdidx(mddev));
		err = -EBUSY;
		goto abort_unbind_export;
	}

	rdev->desc_nr = i;
	rdev->raid_disk = -1;

	md_update_sb(mddev);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	md_recover_arrays();

	return 0;

abort_unbind_export:
	unbind_rdev_from_array(rdev);

abort_export:
	export_rdev(rdev);
	return err;
}

Linus Torvalds's avatar
Linus Torvalds committed
2149
/*
 * SET_ARRAY_INFO: initialise @mddev from the user-supplied @info.  The
 * version fields and creation time are set by the driver, and a fresh
 * random UUID is generated.  Always returns 0.
 */
static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
{
	/*
	 * Settings taken from the caller.  __minor is deliberately not
	 * set here, it is determined by which /dev/md* was opened.
	 */
	mddev->level         = info->level;
	mddev->size          = info->size;
	mddev->raid_disks    = info->raid_disks;
	mddev->state         = info->state;
	mddev->persistent    = ! info->not_persistent;
	mddev->layout        = info->layout;
	mddev->chunk_size    = info->chunk_size;

	/* version and creation time come from the driver, not from @info */
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime         = CURRENT_TIME;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	return 0;
}

2179
/*
 * SET_DISK_FAULTY: report an error on member device @dev of @mddev via
 * md_error().
 *
 * The result is handed back unchanged as the ioctl's return value, so it
 * follows the usual convention: 0 on success, -ENODEV if @dev is not a
 * member of the array.
 */
static int set_disk_faulty(mddev_t *mddev, dev_t dev)
{
	mdk_rdev_t *rdev;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENODEV;

	md_error(mddev, rdev);
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
2191
static int md_ioctl(struct inode *inode, struct file *file,
Linus Torvalds's avatar
Linus Torvalds committed
2192 2193 2194 2195 2196 2197 2198 2199
			unsigned int cmd, unsigned long arg)
{
	unsigned int minor;
	int err = 0;
	struct hd_geometry *loc = (struct hd_geometry *) arg;
	mddev_t *mddev = NULL;
	kdev_t dev;

Linus Torvalds's avatar
Linus Torvalds committed
2200
	if (!capable(CAP_SYS_ADMIN))
Linus Torvalds's avatar
Linus Torvalds committed
2201 2202 2203
		return -EACCES;

	dev = inode->i_rdev;
Linus Torvalds's avatar
Linus Torvalds committed
2204
	minor = minor(dev);
Linus Torvalds's avatar
Linus Torvalds committed
2205 2206
	if (minor >= MAX_MD_DEVS) {
		MD_BUG();
Linus Torvalds's avatar
Linus Torvalds committed
2207
		return -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
2208
	}
Linus Torvalds's avatar
Linus Torvalds committed
2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd)
	{
		case RAID_VERSION:
			err = get_version((void *)arg);
			goto done;

		case PRINT_RAID_DEBUG:
			err = 0;
			md_print_devices();
2223
			goto done;
Linus Torvalds's avatar
Linus Torvalds committed
2224

Linus Torvalds's avatar
Linus Torvalds committed
2225
#ifndef MODULE
Linus Torvalds's avatar
Linus Torvalds committed
2226 2227 2228 2229
		case RAID_AUTORUN:
			err = 0;
			autostart_arrays();
			goto done;
Linus Torvalds's avatar
Linus Torvalds committed
2230
#endif
Linus Torvalds's avatar
Linus Torvalds committed
2231
		default:;
Linus Torvalds's avatar
Linus Torvalds committed
2232 2233 2234 2235 2236 2237
	}

	/*
	 * Commands creating/starting a new array:
	 */

2238
	mddev = inode->i_bdev->bd_inode->u.generic_ip;
Linus Torvalds's avatar
Linus Torvalds committed
2239

2240
	if (!mddev) {
2241
		BUG();
2242
		goto abort;
Linus Torvalds's avatar
Linus Torvalds committed
2243
	}
2244

2245 2246 2247 2248 2249

	if (cmd == START_ARRAY) {
		/* START_ARRAY doesn't need to lock the array as autostart_array
		 * does the locking, and it could even be a different array
		 */
2250
		err = autostart_array(arg);
2251 2252 2253 2254 2255 2256 2257 2258
		if (err) {
			printk(KERN_WARNING "md: autostart %s failed!\n",
			       partition_name(val_to_kdev(arg)));
			goto abort;
		}
		goto done;
	}

2259 2260 2261 2262 2263 2264 2265
	err = mddev_lock(mddev);
	if (err) {
		printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",
		       err, cmd);
		goto abort;
	}

Linus Torvalds's avatar
Linus Torvalds committed
2266 2267 2268 2269
	switch (cmd)
	{
		case SET_ARRAY_INFO:

2270 2271 2272 2273 2274
			if (!list_empty(&mddev->disks)) {
				printk(KERN_WARNING "md: array md%d already has disks!\n",
					mdidx(mddev));
				err = -EBUSY;
				goto abort_unlock;
Linus Torvalds's avatar
Linus Torvalds committed
2275
			}
2276 2277
			if (mddev->raid_disks) {
				printk(KERN_WARNING "md: array md%d already initialised!\n",
Linus Torvalds's avatar
Linus Torvalds committed
2278
					mdidx(mddev));
Linus Torvalds's avatar
Linus Torvalds committed
2279 2280 2281 2282 2283
				err = -EBUSY;
				goto abort_unlock;
			}
			if (arg) {
				mdu_array_info_t info;
Linus Torvalds's avatar
Linus Torvalds committed
2284
				if (copy_from_user(&info, (void*)arg, sizeof(info))) {
Linus Torvalds's avatar
Linus Torvalds committed
2285 2286 2287 2288 2289
					err = -EFAULT;
					goto abort_unlock;
				}
				err = set_array_info(mddev, &info);
				if (err) {
Linus Torvalds's avatar
Linus Torvalds committed
2290
					printk(KERN_WARNING "md: couldnt set array info. %d\n", err);
Linus Torvalds's avatar
Linus Torvalds committed
2291 2292 2293 2294 2295
					goto abort_unlock;
				}
			}
			goto done_unlock;

Linus Torvalds's avatar
Linus Torvalds committed
2296
		default:;
Linus Torvalds's avatar
Linus Torvalds committed
2297 2298 2299 2300 2301
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
2302 2303
	/* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
	if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
Linus Torvalds's avatar
Linus Torvalds committed
2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325
		err = -ENODEV;
		goto abort_unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd)
	{
		case GET_ARRAY_INFO:
			err = get_array_info(mddev, (void *)arg);
			goto done_unlock;

		case GET_DISK_INFO:
			err = get_disk_info(mddev, (void *)arg);
			goto done_unlock;

		case RESTART_ARRAY_RW:
			err = restart_array(mddev);
			goto done_unlock;

		case STOP_ARRAY:
2326
			err = do_md_stop (mddev, 0);
Linus Torvalds's avatar
Linus Torvalds committed
2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343
			goto done_unlock;

		case STOP_ARRAY_RO:
			err = do_md_stop (mddev, 1);
			goto done_unlock;

	/*
	 * We have a problem here : there is no easy way to give a CHS
	 * virtual geometry. We currently pretend that we have a 2 heads
	 * 4 sectors (with a BIG number of cylinders...). This drives
	 * dosfs just mad... ;-)
	 */
		case HDIO_GETGEO:
			if (!loc) {
				err = -EINVAL;
				goto abort_unlock;
			}
Linus Torvalds's avatar
Linus Torvalds committed
2344
			err = put_user (2, (char *) &loc->heads);
Linus Torvalds's avatar
Linus Torvalds committed
2345 2346
			if (err)
				goto abort_unlock;
Linus Torvalds's avatar
Linus Torvalds committed
2347
			err = put_user (4, (char *) &loc->sectors);
Linus Torvalds's avatar
Linus Torvalds committed
2348 2349
			if (err)
				goto abort_unlock;
Linus Torvalds's avatar
Linus Torvalds committed
2350
			err = put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
Linus Torvalds's avatar
Linus Torvalds committed
2351 2352 2353
						(short *) &loc->cylinders);
			if (err)
				goto abort_unlock;
2354
			err = put_user (get_start_sect(inode->i_bdev),
Linus Torvalds's avatar
Linus Torvalds committed
2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373
						(long *) &loc->start);
			goto done_unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow read-only arrays
	 * here:
	 */
	if (mddev->ro) {
		err = -EROFS;
		goto abort_unlock;
	}

	switch (cmd)
	{
		case ADD_NEW_DISK:
		{
			mdu_disk_info_t info;
Linus Torvalds's avatar
Linus Torvalds committed
2374
			if (copy_from_user(&info, (void*)arg, sizeof(info)))
Linus Torvalds's avatar
Linus Torvalds committed
2375 2376 2377 2378 2379
				err = -EFAULT;
			else
				err = add_new_disk(mddev, &info);
			goto done_unlock;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2380
		case HOT_GENERATE_ERROR:
2381
			err = hot_generate_error(mddev, arg);
Linus Torvalds's avatar
Linus Torvalds committed
2382
			goto done_unlock;
Linus Torvalds's avatar
Linus Torvalds committed
2383
		case HOT_REMOVE_DISK:
2384
			err = hot_remove_disk(mddev, arg);
Linus Torvalds's avatar
Linus Torvalds committed
2385 2386 2387
			goto done_unlock;

		case HOT_ADD_DISK:
2388
			err = hot_add_disk(mddev, arg);
Linus Torvalds's avatar
Linus Torvalds committed
2389 2390 2391
			goto done_unlock;

		case SET_DISK_FAULTY:
2392
			err = set_disk_faulty(mddev, arg);
Linus Torvalds's avatar
Linus Torvalds committed
2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404
			goto done_unlock;

		case RUN_ARRAY:
		{
			err = do_md_run (mddev);
			/*
			 * we have to clean up the mess if
			 * the array cannot be run for some
			 * reason ...
			 */
			if (err) {
				mddev->sb_dirty = 0;
2405
				do_md_stop (mddev, 0);
Linus Torvalds's avatar
Linus Torvalds committed
2406 2407 2408 2409 2410
			}
			goto done_unlock;
		}

		default:
Linus Torvalds's avatar
Linus Torvalds committed
2411 2412 2413
			printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
			       "upgrade your software to use new ictls.\n",
			       current->comm, current->pid);
Linus Torvalds's avatar
Linus Torvalds committed
2414 2415 2416 2417 2418 2419
			err = -EINVAL;
			goto abort_unlock;
	}

done_unlock:
abort_unlock:
2420
	mddev_unlock(mddev);
Linus Torvalds's avatar
Linus Torvalds committed
2421 2422 2423 2424

	return err;
done:
	if (err)
Linus Torvalds's avatar
Linus Torvalds committed
2425
		MD_BUG();
Linus Torvalds's avatar
Linus Torvalds committed
2426 2427 2428 2429
abort:
	return err;
}

Linus Torvalds's avatar
Linus Torvalds committed
2430
static int md_open(struct inode *inode, struct file *file)
Linus Torvalds's avatar
Linus Torvalds committed
2431 2432
{
	/*
2433
	 * Succeed if we can find or allocate a mddev structure.
Linus Torvalds's avatar
Linus Torvalds committed
2434
	 */
2435
	mddev_t *mddev = mddev_find(minor(inode->i_rdev));
2436
	int err = -ENOMEM;
2437

2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450
	if (!mddev)
		goto out;

	if ((err = mddev_lock(mddev)))
		goto put;

	err = 0;
	mddev_unlock(mddev);
	inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
 put:
	mddev_put(mddev);
 out:
	return err;
Linus Torvalds's avatar
Linus Torvalds committed
2451 2452
}

Linus Torvalds's avatar
Linus Torvalds committed
2453
static int md_release(struct inode *inode, struct file * file)
Linus Torvalds's avatar
Linus Torvalds committed
2454
{
2455 2456
 	mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;

2457 2458 2459 2460
	if (!mddev)
		BUG();
	mddev_put(mddev);

Linus Torvalds's avatar
Linus Torvalds committed
2461 2462 2463
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
2464
static struct block_device_operations md_fops =
Linus Torvalds's avatar
Linus Torvalds committed
2465
{
2466 2467 2468 2469
	.owner		= THIS_MODULE,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
Linus Torvalds's avatar
Linus Torvalds committed
2470 2471 2472
};


Linus Torvalds's avatar
Linus Torvalds committed
2473 2474 2475 2476 2477 2478 2479
static inline void flush_curr_signals(void)
{
	spin_lock(&current->sigmask_lock);
	flush_signals(current);
	spin_unlock(&current->sigmask_lock);
}

Linus Torvalds's avatar
Linus Torvalds committed
2480 2481 2482 2483
int md_thread(void * arg)
{
	mdk_thread_t *thread = arg;

Linus Torvalds's avatar
Linus Torvalds committed
2484
	lock_kernel();
Linus Torvalds's avatar
Linus Torvalds committed
2485 2486 2487 2488 2489 2490

	/*
	 * Detach thread
	 */

	daemonize();
2491
	reparent_to_init();
Linus Torvalds's avatar
Linus Torvalds committed
2492 2493

	sprintf(current->comm, thread->name);
Linus Torvalds's avatar
Linus Torvalds committed
2494 2495 2496
	current->exit_signal = SIGCHLD;
	siginitsetinv(&current->blocked, sigmask(SIGKILL));
	flush_curr_signals();
Linus Torvalds's avatar
Linus Torvalds committed
2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509
	thread->tsk = current;

	/*
	 * md_thread is a 'system-thread', it's priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
2510
	unlock_kernel();
Linus Torvalds's avatar
Linus Torvalds committed
2511

Linus Torvalds's avatar
Linus Torvalds committed
2512 2513 2514
	complete(thread->event);
	while (thread->run) {
		void (*run)(void *data);
Linus Torvalds's avatar
Linus Torvalds committed
2515

2516 2517 2518
		wait_event_interruptible(thread->wqueue,
					 test_bit(THREAD_WAKEUP, &thread->flags));

Linus Torvalds's avatar
Linus Torvalds committed
2519 2520
		clear_bit(THREAD_WAKEUP, &thread->flags);

Linus Torvalds's avatar
Linus Torvalds committed
2521 2522
		run = thread->run;
		if (run) {
Linus Torvalds's avatar
Linus Torvalds committed
2523
			run(thread->data);
Jens Axboe's avatar
Jens Axboe committed
2524
			blk_run_queues();
Linus Torvalds's avatar
Linus Torvalds committed
2525
		}
Linus Torvalds's avatar
Linus Torvalds committed
2526 2527
		if (signal_pending(current))
			flush_curr_signals();
Linus Torvalds's avatar
Linus Torvalds committed
2528
	}
Linus Torvalds's avatar
Linus Torvalds committed
2529
	complete(thread->event);
Linus Torvalds's avatar
Linus Torvalds committed
2530 2531 2532 2533 2534
	return 0;
}

/*
 * Ask an md thread to run: flag the wakeup request in thread->flags
 * first, then wake whoever sleeps on the thread's wait queue.
 */
void md_wakeup_thread(mdk_thread_t *thread)
{
	dprintk("md: waking up MD thread %p.\n", thread);

	/* the flag must be visible before the sleeper re-checks it */
	set_bit(THREAD_WAKEUP, &thread->flags);
	wake_up(&thread->wqueue);
}

Linus Torvalds's avatar
Linus Torvalds committed
2540
mdk_thread_t *md_register_thread(void (*run) (void *),
Linus Torvalds's avatar
Linus Torvalds committed
2541 2542 2543 2544
						void *data, const char *name)
{
	mdk_thread_t *thread;
	int ret;
Linus Torvalds's avatar
Linus Torvalds committed
2545
	struct completion event;
Linus Torvalds's avatar
Linus Torvalds committed
2546

Linus Torvalds's avatar
Linus Torvalds committed
2547 2548 2549 2550
	thread = (mdk_thread_t *) kmalloc
				(sizeof(mdk_thread_t), GFP_KERNEL);
	if (!thread)
		return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
2551

Linus Torvalds's avatar
Linus Torvalds committed
2552
	memset(thread, 0, sizeof(mdk_thread_t));
Linus Torvalds's avatar
Linus Torvalds committed
2553
	init_waitqueue_head(&thread->wqueue);
Linus Torvalds's avatar
Linus Torvalds committed
2554

Linus Torvalds's avatar
Linus Torvalds committed
2555
	init_completion(&event);
Linus Torvalds's avatar
Linus Torvalds committed
2556
	thread->event = &event;
Linus Torvalds's avatar
Linus Torvalds committed
2557 2558 2559 2560 2561 2562 2563 2564
	thread->run = run;
	thread->data = data;
	thread->name = name;
	ret = kernel_thread(md_thread, thread, 0);
	if (ret < 0) {
		kfree(thread);
		return NULL;
	}
Linus Torvalds's avatar
Linus Torvalds committed
2565
	wait_for_completion(&event);
Linus Torvalds's avatar
Linus Torvalds committed
2566 2567 2568
	return thread;
}

Linus Torvalds's avatar
Linus Torvalds committed
2569
/*
 * Send SIGKILL to the task behind an md thread.  A descriptor with no
 * task recorded is reported through MD_BUG() and otherwise ignored.
 */
void md_interrupt_thread(mdk_thread_t *thread)
{
	if (thread->tsk) {
		dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
		send_sig(SIGKILL, thread->tsk, 1);
		return;
	}

	MD_BUG();
}

Linus Torvalds's avatar
Linus Torvalds committed
2579
/*
 * Stop an md thread and free its descriptor.
 *
 * Clears thread->run (the loop condition of md_thread()), interrupts
 * the thread, waits for it to complete the on-stack completion and
 * then frees the descriptor.
 */
void md_unregister_thread(mdk_thread_t *thread)
{
	struct completion exited;

	init_completion(&exited);
	thread->event = &exited;

	thread->run = NULL;
	thread->name = NULL;

	md_interrupt_thread(thread);
	wait_for_completion(&exited);

	kfree(thread);
}

2593
static void md_recover_arrays(void)
Linus Torvalds's avatar
Linus Torvalds committed
2594 2595 2596 2597 2598 2599 2600 2601 2602
{
	if (!md_recovery_thread) {
		MD_BUG();
		return;
	}
	md_wakeup_thread(md_recovery_thread);
}


2603
/*
 * Report a failure of member device @rdev of array @mddev.
 *
 * A NULL @mddev is flagged with MD_BUG().  A NULL or already faulty
 * @rdev, or a personality without an error_handler, makes this a no-op.
 * Otherwise the personality's error_handler is called and the recovery
 * thread is woken.
 */
void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	if (!mddev) {
		MD_BUG();
		return;
	}
	if (!rdev)
		return;

	/*
	 * Only trace once both pointers are known to be valid; the device
	 * numbers come from rdev->bdev (there is no 'bdev' in this scope).
	 */
	dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
		MD_MAJOR,mdidx(mddev),
		MAJOR(rdev->bdev->bd_dev),MINOR(rdev->bdev->bd_dev),
		__builtin_return_address(0),__builtin_return_address(1),
		__builtin_return_address(2),__builtin_return_address(3));

	if (rdev->faulty)
		return;
	if (!mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev,rdev);
	md_recover_arrays();
}

Linus Torvalds's avatar
Linus Torvalds committed
2623
static int status_unused(char * page)
Linus Torvalds's avatar
Linus Torvalds committed
2624 2625 2626
{
	int sz = 0, i = 0;
	mdk_rdev_t *rdev;
Linus Torvalds's avatar
Linus Torvalds committed
2627
	struct list_head *tmp;
Linus Torvalds's avatar
Linus Torvalds committed
2628 2629 2630

	sz += sprintf(page + sz, "unused devices: ");

Neil Brown's avatar
Neil Brown committed
2631 2632 2633
	ITERATE_RDEV_PENDING(rdev,tmp) {
		i++;
		sz += sprintf(page + sz, "%s ",
2634
			      bdev_partition_name(rdev->bdev));
Linus Torvalds's avatar
Linus Torvalds committed
2635 2636 2637 2638 2639 2640 2641 2642 2643
	}
	if (!i)
		sz += sprintf(page + sz, "<none>");

	sz += sprintf(page + sz, "\n");
	return sz;
}


Linus Torvalds's avatar
Linus Torvalds committed
2644
/*
 * Format the resync/recovery progress of @mddev into @page: a progress
 * bar, the completed fraction in tenths of a percent, an estimated
 * finish time and the current speed.  Returns the number of characters
 * written, or 0 (after MD_BUG()) if the array size is zero.
 */
static int status_resync(char * page, mddev_t * mddev)
{
	unsigned long max_blocks, resync, res, dt, db, rt;
	int done, todo, k;
	int len = 0;

	resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
	max_blocks = mddev->size;

	/*
	 * Should not happen.
	 */
	if (max_blocks == 0) {
		MD_BUG();
		return 0;
	}

	/* progress in units of 0.1% */
	res = (resync/1024)*1000/(max_blocks/1024 + 1);

	/* the bar: one '=' per 5% done, '>' marker, '.' for the rest */
	done = res/50;
	todo = 20-done;
	page[len++] = '[';
	for (k = 0; k < done; k++)
		page[len++] = '=';
	page[len++] = '>';
	for (k = 0; k < todo; k++)
		page[len++] = '.';
	len += sprintf(page + len, "] ");

	len += sprintf(page + len, " %s =%3lu.%lu%% (%lu/%lu)",
		       (mddev->spares ? "recovery" : "resync"),
		       res/10, res % 10, resync, max_blocks);

	/*
	 * We do not want to overflow, so the order of operands and
	 * the * 100 / 100 trick are important. We do a +1 to be
	 * safe against division by zero. We only estimate anyway.
	 *
	 * dt: time from mark until now
	 * db: blocks written from mark until now
	 * rt: remaining time
	 */
	dt = ((jiffies - mddev->resync_mark) / HZ);
	if (dt == 0)
		dt = 1;
	db = resync - (mddev->resync_mark_cnt/2);
	rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;

	len += sprintf(page + len, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
	len += sprintf(page + len, " speed=%ldK/sec", db/dt);

	return len;
}

/*
 * Read handler installed for the "mdstat" proc entry (see md_geninit()).
 *
 * Writes into @page the list of registered personalities, one section
 * per mddev whose lock could be taken, and finally the list of unused
 * devices.  Returns the number of characters written.  @start, @off,
 * @count, @eof and @data are not used.
 */
static int md_status_read_proc(char *page, char **start, off_t off,
			int count, int *eof, void *data)
{
	struct list_head *mpos, *dpos;
	int len = 0, j, size;
	mdk_rdev_t *rdev;
	mddev_t *mddev;

	len += sprintf(page + len, "Personalities : ");
	for (j = 0; j < MAX_PERSONALITY; j++) {
		if (!pers[j])
			continue;
		len += sprintf(page + len, "[%s] ", pers[j]->name);
	}
	len += sprintf(page + len, "\n");

	ITERATE_MDDEV(mddev,mpos) {
		if (mddev_lock(mddev) != 0)
			continue;

		len += sprintf(page + len, "md%d : %sactive", mdidx(mddev),
						mddev->pers ? "" : "in");
		if (mddev->pers) {
			if (mddev->ro)
				len += sprintf(page + len, " (read-only)");
			len += sprintf(page + len, " %s", mddev->pers->name);
		}

		/* member devices; only non-faulty ones count towards size */
		size = 0;
		ITERATE_RDEV(mddev,rdev,dpos) {
			len += sprintf(page + len, " %s[%d]",
				bdev_partition_name(rdev->bdev), rdev->desc_nr);
			if (rdev->faulty)
				len += sprintf(page + len, "(F)");
			else
				size += rdev->size;
		}

		if (!list_empty(&mddev->disks)) {
			if (mddev->pers)
				len += sprintf(page + len, "\n      %d blocks",
						 md_size[mdidx(mddev)]);
			else
				len += sprintf(page + len, "\n      %d blocks", size);
		}

		if (!mddev->pers) {
			/* inactive array: nothing more to report */
			len += sprintf(page + len, "\n");
		} else {
			len += mddev->pers->status (page + len, mddev);

			len += sprintf(page + len, "\n      ");
			if (mddev->curr_resync > 1)
				len += status_resync (page + len, mddev);
			else if (mddev->curr_resync == 1)
				len += sprintf(page + len, "	resync=DELAYED");

			len += sprintf(page + len, "\n");
		}
		mddev_unlock(mddev);
	}
	len += status_unused(page + len);

	return len;
}

Linus Torvalds's avatar
Linus Torvalds committed
2760
/*
 * Register personality @p in slot @pnum of the pers[] table.
 *
 * Returns 0 on success, -EINVAL if @pnum is outside
 * [0, MAX_PERSONALITY) or @p is NULL, and -EBUSY if the slot is
 * already taken.  Every failure is also flagged with MD_BUG().
 */
int register_md_personality(int pnum, mdk_personality_t *p)
{
	/* pnum is signed: a negative value must not index pers[] */
	if (pnum < 0 || pnum >= MAX_PERSONALITY || !p) {
		MD_BUG();
		return -EINVAL;
	}

	if (pers[pnum]) {
		MD_BUG();
		return -EBUSY;
	}

	pers[pnum] = p;
	printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
2777
int unregister_md_personality(int pnum)
Linus Torvalds's avatar
Linus Torvalds committed
2778
{
Linus Torvalds's avatar
Linus Torvalds committed
2779 2780
	if (pnum >= MAX_PERSONALITY) {
		MD_BUG();
Linus Torvalds's avatar
Linus Torvalds committed
2781
		return -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
2782
	}
Linus Torvalds's avatar
Linus Torvalds committed
2783

Linus Torvalds's avatar
Linus Torvalds committed
2784
	printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
Linus Torvalds's avatar
Linus Torvalds committed
2785 2786 2787 2788 2789
	pers[pnum] = NULL;
	return 0;
}

static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
2790
void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
Linus Torvalds's avatar
Linus Torvalds committed
2791
{
2792
	kdev_t dev = to_kdev_t(rdev->bdev->bd_dev);
Linus Torvalds's avatar
Linus Torvalds committed
2793
	unsigned int major = major(dev);
Linus Torvalds's avatar
Linus Torvalds committed
2794 2795 2796 2797 2798 2799 2800 2801 2802
	unsigned int index;

	index = disk_index(dev);
	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
		return;

	sync_io[major][index] += nr_sectors;
}

Linus Torvalds's avatar
Linus Torvalds committed
2803
/*
 * Decide whether the member disks of @mddev are idle apart from resync
 * traffic.
 *
 * For each member the read+write block counters from kstat, minus the
 * resync I/O recorded in sync_io[], are compared with the value saved
 * in rdev->last_events.  A difference of more than 32 updates the saved
 * value and marks the array busy.  Returns 1 if idle, 0 otherwise.
 */
static int is_mddev_idle(mddev_t *mddev)
{
	struct list_head *pos;
	unsigned long events;
	mdk_rdev_t * rdev;
	int idle = 1;

	ITERATE_RDEV(mddev,rdev,pos) {
		kdev_t dev = to_kdev_t(rdev->bdev->bd_dev);
		int maj = major(dev);
		int idx = disk_index(dev);

		/* no statistics are kept for this device */
		if ((idx >= DK_MAX_DISK) || (maj >= DK_MAX_MAJOR))
			continue;

		events = kstat.dk_drive_rblk[maj][idx] +
						kstat.dk_drive_wblk[maj][idx] ;
		events -= sync_io[maj][idx];

		if ((events - rdev->last_events) <= 32)
			continue;

		rdev->last_events = events;
		idle = 0;
	}
	return idle;
}

/*
 * Called when @blocks (512-byte) blocks of resync I/O have finished.
 *
 * Takes them off mddev->recovery_active and wakes anyone sleeping on
 * recovery_wait.  When @ok is zero the recovery is marked failed with
 * -EIO and the recovery thread is woken.
 */
void md_done_sync(mddev_t *mddev, int blocks, int ok)
{
	atomic_sub(blocks, &mddev->recovery_active);
	wake_up(&mddev->recovery_wait);

	if (ok)
		return;

	/* stop recovery, signal do_sync .... */
	mddev->recovery_running = -EIO;
	md_recover_arrays();
}

2842 2843 2844

DECLARE_WAIT_QUEUE_HEAD(resync_wait);

Linus Torvalds's avatar
Linus Torvalds committed
2845 2846
#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
2847
static void md_do_sync(void *data)
Linus Torvalds's avatar
Linus Torvalds committed
2848
{
2849
	mddev_t *mddev = data;
Linus Torvalds's avatar
Linus Torvalds committed
2850
	mddev_t *mddev2;
Linus Torvalds's avatar
Linus Torvalds committed
2851
	unsigned int max_sectors, currspeed = 0,
2852
		j, window, err;
Linus Torvalds's avatar
Linus Torvalds committed
2853
	unsigned long mark[SYNC_MARKS];
Linus Torvalds's avatar
Linus Torvalds committed
2854
	unsigned long mark_cnt[SYNC_MARKS];
Linus Torvalds's avatar
Linus Torvalds committed
2855
	int last_mark,m;
Linus Torvalds's avatar
Linus Torvalds committed
2856
	struct list_head *tmp;
Linus Torvalds's avatar
Linus Torvalds committed
2857 2858
	unsigned long last_check;

2859 2860 2861
	/* just incase thread restarts... */
	if (mddev->recovery_running <= 0)
		return;
Linus Torvalds's avatar
Linus Torvalds committed
2862

2863 2864 2865 2866 2867 2868 2869 2870 2871
	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commense
	 * other == active in resync - this many blocks
	 */
	do {
		mddev->curr_resync = 2;
Linus Torvalds's avatar
Linus Torvalds committed
2872

2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886
		ITERATE_MDDEV(mddev2,tmp) {
			if (mddev2 == mddev)
				continue;
			if (mddev2->curr_resync && 
			    match_mddev_units(mddev,mddev2)) {
				printk(KERN_INFO "md: delaying resync of md%d until md%d "
				       "has finished resync (they share one or more physical units)\n",
				       mdidx(mddev), mdidx(mddev2));
				if (mddev < mddev2) /* arbitrarily yield */
					mddev->curr_resync = 1;
				if (wait_event_interruptible(resync_wait,
							     mddev2->curr_resync < 2)) {
					flush_curr_signals();
					err = -EINTR;
2887
					mddev_put(mddev2);
2888
					goto skip;
2889 2890
				}
			}
Linus Torvalds's avatar
Linus Torvalds committed
2891
		}
2892
	} while (mddev->curr_resync < 2);
Linus Torvalds's avatar
Linus Torvalds committed
2893

2894
	max_sectors = mddev->size << 1;
Linus Torvalds's avatar
Linus Torvalds committed
2895 2896

	printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
Linus Torvalds's avatar
Linus Torvalds committed
2897
	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", sysctl_speed_limit_min);
Linus Torvalds's avatar
Linus Torvalds committed
2898 2899 2900
	printk(KERN_INFO "md: using maximum available idle IO bandwith "
	       "(but not more than %d KB/sec) for reconstruction.\n",
	       sysctl_speed_limit_max);
Linus Torvalds's avatar
Linus Torvalds committed
2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913

	is_mddev_idle(mddev); /* this also initializes IO event counters */
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = 0;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
Andrew Morton's avatar
Andrew Morton committed
2914
	window = 32*(PAGE_SIZE/512);
Linus Torvalds's avatar
Linus Torvalds committed
2915 2916
	printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
	       window/2,max_sectors/2);
Linus Torvalds's avatar
Linus Torvalds committed
2917 2918 2919 2920

	atomic_set(&mddev->recovery_active, 0);
	init_waitqueue_head(&mddev->recovery_wait);
	last_check = 0;
Linus Torvalds's avatar
Linus Torvalds committed
2921 2922
	for (j = 0; j < max_sectors;) {
		int sectors;
Linus Torvalds's avatar
Linus Torvalds committed
2923

Linus Torvalds's avatar
Linus Torvalds committed
2924
		sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
Linus Torvalds's avatar
Linus Torvalds committed
2925 2926
		if (sectors < 0) {
			err = sectors;
Linus Torvalds's avatar
Linus Torvalds committed
2927 2928
			goto out;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2929 2930
		atomic_add(sectors, &mddev->recovery_active);
		j += sectors;
2931
		if (j>1) mddev->curr_resync = j;
Linus Torvalds's avatar
Linus Torvalds committed
2932 2933 2934

		if (last_check + window > j)
			continue;
Linus Torvalds's avatar
Linus Torvalds committed
2935 2936

		last_check = j;
Linus Torvalds's avatar
Linus Torvalds committed
2937

Jens Axboe's avatar
Jens Axboe committed
2938
		blk_run_queues();
Linus Torvalds's avatar
Linus Torvalds committed
2939

Linus Torvalds's avatar
Linus Torvalds committed
2940
	repeat:
Linus Torvalds's avatar
Linus Torvalds committed
2941 2942 2943
		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;
Linus Torvalds's avatar
Linus Torvalds committed
2944

Linus Torvalds's avatar
Linus Torvalds committed
2945 2946 2947 2948 2949 2950
			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}
Linus Torvalds's avatar
Linus Torvalds committed
2951

Linus Torvalds's avatar
Linus Torvalds committed
2952

Linus Torvalds's avatar
Linus Torvalds committed
2953
		if (signal_pending(current)) {
Linus Torvalds's avatar
Linus Torvalds committed
2954 2955 2956
			/*
			 * got a signal, exit.
			 */
Linus Torvalds's avatar
Linus Torvalds committed
2957
			printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
Linus Torvalds's avatar
Linus Torvalds committed
2958
			flush_curr_signals();
Linus Torvalds's avatar
Linus Torvalds committed
2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970
			err = -EINTR;
			goto out;
		}

		/*
		 * this loop exits only if either when we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
Linus Torvalds's avatar
Linus Torvalds committed
2971
		cond_resched();
Linus Torvalds's avatar
Linus Torvalds committed
2972

Linus Torvalds's avatar
Linus Torvalds committed
2973
		currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
Linus Torvalds's avatar
Linus Torvalds committed
2974 2975 2976 2977 2978

		if (currspeed > sysctl_speed_limit_min) {
			if ((currspeed > sysctl_speed_limit_max) ||
					!is_mddev_idle(mddev)) {
				current->state = TASK_INTERRUPTIBLE;
Linus Torvalds's avatar
Linus Torvalds committed
2979
				schedule_timeout(HZ/4);
Linus Torvalds's avatar
Linus Torvalds committed
2980
				goto repeat;
Linus Torvalds's avatar
Linus Torvalds committed
2981
			}
Linus Torvalds's avatar
Linus Torvalds committed
2982
		}
Linus Torvalds's avatar
Linus Torvalds committed
2983 2984 2985 2986 2987 2988
	}
	printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
	err = 0;
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
2989
 out:
Linus Torvalds's avatar
Linus Torvalds committed
2990
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
2991 2992
	/* tell personality that we are finished */
	mddev->pers->sync_request(mddev, max_sectors, 1);
2993
 skip:
Linus Torvalds's avatar
Linus Torvalds committed
2994
	mddev->curr_resync = 0;
2995 2996 2997 2998 2999 3000 3001
	if (err)
		mddev->recovery_running = err;
	if (mddev->recovery_running > 0)
		mddev->recovery_running = 0;
	if (mddev->recovery_running == 0)
		mddev->in_sync = 1;
	md_recover_arrays();
Linus Torvalds's avatar
Linus Torvalds committed
3002 3003 3004 3005
}


/*
3006 3007
 * This is the kernel thread that watches all md arrays for re-sync and other
 * action that might be needed.
3008 3009 3010 3011
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set "->recovery_running" and
 * create a thread at ->sync_thread.
3012
 * When the thread finishes it clears recovery_running (or sets an error)
3013
 * and wakeup up this thread which will reap the thread and finish up.
3014 3015 3016 3017 3018 3019 3020 3021 3022
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spares devices
 *  6/ If array has spares or is not in-sync, start a resync thread.
Linus Torvalds's avatar
Linus Torvalds committed
3023
 */
Linus Torvalds's avatar
Linus Torvalds committed
3024
void md_do_recovery(void *data)
Linus Torvalds's avatar
Linus Torvalds committed
3025 3026
{
	mddev_t *mddev;
3027 3028 3029
	mdk_rdev_t *rdev;
	struct list_head *tmp, *rtmp;

Linus Torvalds's avatar
Linus Torvalds committed
3030

3031 3032 3033
	dprintk(KERN_INFO "md: recovery thread got woken up ...\n");

	ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
3034
		if (!mddev->raid_disks || !mddev->pers || mddev->ro)
3035
			goto unlock;
3036 3037
		if (mddev->sb_dirty)
			md_update_sb(mddev);
3038 3039
		if (mddev->recovery_running > 0)
			/* resync/recovery still happening */
3040
			goto unlock;
3041 3042 3043 3044
		if (mddev->sync_thread) {
			/* resync has finished, collect result */
			md_unregister_thread(mddev->sync_thread);
			mddev->sync_thread = NULL;
3045
			if (mddev->recovery_running == 0) {
3046
				/* success...*/
3047 3048 3049
				/* activate any spares */
				mddev->pers->spare_active(mddev);
				mddev->spares = 0;
Linus Torvalds's avatar
Linus Torvalds committed
3050
			}
3051
			md_update_sb(mddev);
Linus Torvalds's avatar
Linus Torvalds committed
3052
			mddev->recovery_running = 0;
3053
			wake_up(&resync_wait);
3054
			goto unlock;
Linus Torvalds's avatar
Linus Torvalds committed
3055
		}
3056 3057
		if (mddev->recovery_running) {
			/* that's odd.. */
Linus Torvalds's avatar
Linus Torvalds committed
3058
			mddev->recovery_running = 0;
3059
			wake_up(&resync_wait);
Linus Torvalds's avatar
Linus Torvalds committed
3060
		}
3061

3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076
		/* no recovery is running.
		 * remove any failed drives, then
		 * add spares if possible
		 */
		mddev->spares = 0;
		ITERATE_RDEV(mddev,rdev,rtmp) {
			if (rdev->raid_disk >= 0 &&
			    rdev->faulty &&
			    atomic_read(&rdev->nr_pending)==0) {
				mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
				rdev->raid_disk = -1;
			}
			if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
				mddev->spares++;
		}
3077
		if (mddev->degraded) {
3078 3079 3080 3081 3082 3083 3084 3085
			ITERATE_RDEV(mddev,rdev,rtmp)
				if (rdev->raid_disk < 0
				    && !rdev->faulty) {
					if (mddev->pers->hot_add_disk(mddev,rdev))
						mddev->spares++;
					else
						break;
				}
3086
		}
3087 3088

		if (!mddev->spares && mddev->in_sync) {
3089
			/* nothing we can do ... */
3090
			goto unlock;
Linus Torvalds's avatar
Linus Torvalds committed
3091
		}
3092 3093 3094 3095 3096 3097
		if (mddev->pers->sync_request) {
			mddev->sync_thread = md_register_thread(md_do_sync,
								mddev,
								"md_resync");
			if (!mddev->sync_thread) {
				printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
3098
				/* leave the spares where they are, it shouldn't hurt */
3099 3100 3101 3102 3103
				mddev->recovery_running = 0;
			} else {
				mddev->recovery_running = 1;
				md_wakeup_thread(mddev->sync_thread);
			}
Linus Torvalds's avatar
Linus Torvalds committed
3104
		}
3105 3106
	unlock:
		mddev_unlock(mddev);
Linus Torvalds's avatar
Linus Torvalds committed
3107
	}
3108
	dprintk(KERN_INFO "md: recovery thread finished ...\n");
Linus Torvalds's avatar
Linus Torvalds committed
3109

Linus Torvalds's avatar
Linus Torvalds committed
3110 3111 3112 3113 3114
}

int md_notify_reboot(struct notifier_block *this,
					unsigned long code, void *x)
{
Linus Torvalds's avatar
Linus Torvalds committed
3115
	struct list_head *tmp;
Linus Torvalds's avatar
Linus Torvalds committed
3116 3117
	mddev_t *mddev;

Linus Torvalds's avatar
Linus Torvalds committed
3118
	if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
Linus Torvalds's avatar
Linus Torvalds committed
3119

Linus Torvalds's avatar
Linus Torvalds committed
3120
		printk(KERN_INFO "md: stopping all md devices.\n");
Linus Torvalds's avatar
Linus Torvalds committed
3121
		return NOTIFY_DONE;
Linus Torvalds's avatar
Linus Torvalds committed
3122 3123

		ITERATE_MDDEV(mddev,tmp)
3124 3125
			if (mddev_trylock(mddev)==0)
				do_md_stop (mddev, 1);
Linus Torvalds's avatar
Linus Torvalds committed
3126 3127 3128 3129 3130 3131
		/*
		 * certain more exotic SCSI devices are known to be
		 * volatile wrt too early system reboots. While the
		 * right place to handle this issue is the given
		 * driver, we do want to have a safe RAID driver ...
		 */
Linus Torvalds's avatar
Linus Torvalds committed
3132
		mdelay(1000*1);
Linus Torvalds's avatar
Linus Torvalds committed
3133 3134 3135 3136 3137
	}
	return NOTIFY_DONE;
}

struct notifier_block md_notifier = {
3138 3139 3140
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
Linus Torvalds's avatar
Linus Torvalds committed
3141 3142
};

Linus Torvalds's avatar
Linus Torvalds committed
3143
static void md_geninit(void)
Linus Torvalds's avatar
Linus Torvalds committed
3144 3145 3146 3147 3148 3149 3150
{
	int i;

	for(i = 0; i < MAX_MD_DEVS; i++) {
		md_size[i] = 0;
	}

Linus Torvalds's avatar
Linus Torvalds committed
3151
	dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
Linus Torvalds's avatar
Linus Torvalds committed
3152 3153 3154 3155 3156 3157

#ifdef CONFIG_PROC_FS
	create_proc_read_entry("mdstat", 0, NULL, md_status_read_proc, NULL);
#endif
}

3158 3159
/*
 * Return the request queue to use for md device @dev: the array's own
 * queue when a personality is attached, the default queue of the md
 * major otherwise.  It is a BUG() if no mddev is found or fewer than
 * two references are held on it.
 */
request_queue_t * md_queue_proc(kdev_t dev)
{
	mddev_t *md = mddev_find(minor(dev));
	request_queue_t *queue;

	if (!md || atomic_read(&md->active)<2)
		BUG();

	if (md->pers)
		queue = &md->queue;
	else
		queue = BLK_DEFAULT_QUEUE(MAJOR_NR);

	mddev_put(md); /* the caller must hold a reference... */
	return queue;
}

Linus Torvalds's avatar
Linus Torvalds committed
3170
/*
 * Driver initialisation.
 *
 * Registers the md block major, creates one devfs node per possible md
 * device, installs the failing default request function and the
 * per-device queue lookup, starts the recovery thread, and registers
 * the reboot notifier and the sysctl table.  Returns 0 on success or -1
 * if the block major could not be registered; failure to start the
 * recovery thread is only logged.
 */
int __init md_init(void)
{
	static char * name = "mdrecoveryd";
	int unit;

	printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
			MD_MAJOR_VERSION, MD_MINOR_VERSION,
			MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);

	if (register_blkdev (MAJOR_NR, "md", &md_fops) != 0) {
		printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
		return -1;
	}

	devfs_handle = devfs_mk_dir (NULL, "md", NULL);
	/* we don't use devfs_register_series because we want to fill md_hd_struct */
	for (unit = 0; unit < MAX_MD_DEVS; unit++) {
		char devname[128];

		sprintf (devname, "%u", unit);
		md_hd_struct[unit].de = devfs_register (devfs_handle,
			devname, DEVFS_FL_DEFAULT, MAJOR_NR, unit,
			S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
	}

	/* all requests on an uninitialised device get failed... */
	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
	blk_dev[MAJOR_NR].queue = md_queue_proc;

	md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
	if (md_recovery_thread == NULL)
		printk(KERN_ALERT
		       "md: bug: couldn't allocate md_recovery_thread\n");

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table, 1);

	md_geninit();
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223

#ifndef MODULE

/*
 * When md (and any required personalities) is compiled into the kernel
 * (not as a module), arrays can be assembled at boot time using AUTODETECT,
 * where specially marked partitions are registered with md_autodetect_dev(),
 * and with MD_BOOT, where devices to be collected are given on the boot line
 * with md=.....
 * The code for that is here.
 */

/* State recorded by raid_setup() while handling the "raid=" boot option. */
struct {
	int set;		/* set to 1 once raid_setup() has run */
	int noautodetect;	/* set to 1 when raid_setup() matches "noautodetect" */
Linus Torvalds's avatar
Linus Torvalds committed
3224
} raid_setup_args __initdata;
Linus Torvalds's avatar
Linus Torvalds committed
3225 3226 3227 3228 3229

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */
3230
/* Devices queued by md_autodetect_dev() until autostart_arrays() runs. */
static dev_t detected_devices[128];
/* Number of valid entries at the front of detected_devices[]. */
static int dev_cnt;

/*
 * md_autodetect_dev - queue a device for boot-time RAID autodetection.
 * @dev: device number to remember.
 *
 * Appends @dev to detected_devices[].  The bound is derived from the array
 * itself so that every slot, including the last one, can be used.  Once the
 * table is full, further devices are silently dropped.
 */
void md_autodetect_dev(dev_t dev)
{
	const int max_devs = sizeof(detected_devices) / sizeof(detected_devices[0]);

	if (dev_cnt >= 0 && dev_cnt < max_devs)
		detected_devices[dev_cnt++] = dev;
}


Linus Torvalds's avatar
Linus Torvalds committed
3240
static void autostart_arrays(void)
Linus Torvalds's avatar
Linus Torvalds committed
3241 3242 3243 3244
{
	mdk_rdev_t *rdev;
	int i;

Linus Torvalds's avatar
Linus Torvalds committed
3245
	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
Linus Torvalds's avatar
Linus Torvalds committed
3246

Linus Torvalds's avatar
Linus Torvalds committed
3247
	for (i = 0; i < dev_cnt; i++) {
3248
		dev_t dev = detected_devices[i];
Linus Torvalds's avatar
Linus Torvalds committed
3249

3250 3251
		rdev = md_import_device(dev,1);
		if (IS_ERR(rdev)) {
Linus Torvalds's avatar
Linus Torvalds committed
3252
			printk(KERN_ALERT "md: could not import %s!\n",
3253
				partition_name(to_kdev_t(dev)));
Linus Torvalds's avatar
Linus Torvalds committed
3254 3255 3256 3257 3258 3259
			continue;
		}
		if (rdev->faulty) {
			MD_BUG();
			continue;
		}
Neil Brown's avatar
Neil Brown committed
3260
		list_add(&rdev->same_set, &pending_raid_disks);
Linus Torvalds's avatar
Linus Torvalds committed
3261
	}
Linus Torvalds's avatar
Linus Torvalds committed
3262
	dev_cnt = 0;
Linus Torvalds's avatar
Linus Torvalds committed
3263

3264
	autorun_devices();
Linus Torvalds's avatar
Linus Torvalds committed
3265 3266
}

Linus Torvalds's avatar
Linus Torvalds committed
3267 3268 3269 3270
/*
 * Per-minor settings collected by md_setup() from "md=" boot parameters and
 * consumed by md_setup_drive().  Every array is indexed by md minor number.
 */
static struct {
	char device_set [MAX_MD_DEVS];	/* set to 1 by md_setup_drive() once a listed device resolves */
	int pers[MAX_MD_DEVS];		/* personality chosen by md_setup(); 0 selects the superblock path */
	int chunk[MAX_MD_DEVS];		/* 1 << (factor + 12), passed on as ainfo.chunk_size */
Linus Torvalds's avatar
Linus Torvalds committed
3271
	char *device_names[MAX_MD_DEVS];	/* comma-separated device list; non-NULL means md= was given */
Linus Torvalds's avatar
Linus Torvalds committed
3272
} md_setup_args __initdata;
Linus Torvalds's avatar
Linus Torvalds committed
3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283

/*
 * Parse the command-line parameters given our kernel, but do not
 * actually try to invoke the MD device now; that is handled by
 * md_setup_drive after the low-level disk drivers have initialised.
 *
 * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
 *             assigns the task of parsing integer arguments to the
 *             invoked program now).  Added ability to initialise all
 *             the MD devices (by specifying multiple "md=" lines)
 *             instead of just one.  -- KTK
3284
 * 18May2000: Added support for persistent-superblock arrays:
Linus Torvalds's avatar
Linus Torvalds committed
3285 3286 3287 3288 3289
 *             md=n,0,factor,fault,device-list   uses RAID0 for device n
 *             md=n,-1,factor,fault,device-list  uses LINEAR for device n
 *             md=n,device-list      reads a RAID superblock from the devices
 *             elements in device-list are read by name_to_kdev_t so can be
 *             a hex number or something like /dev/hda1 /dev/sdb
Linus Torvalds's avatar
Linus Torvalds committed
3290 3291 3292
 * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
 *		Shifted name_to_kdev_t() and related operations to md_setup_drive()
 *		for later execution. Rewrote section to make devfs compatible.
Linus Torvalds's avatar
Linus Torvalds committed
3293
 */
Linus Torvalds's avatar
Linus Torvalds committed
3294
/*
 * md_setup - handle one "md=" boot parameter.
 * @str: the text that follows "md=".
 *
 * Accepted forms:
 *   md=n,0,factor,fault,device-list    RAID0 without a persistent superblock
 *   md=n,-1,factor,fault,device-list   LINEAR without a persistent superblock
 *   md=n,device-list                   array described by its RAID superblock
 *
 * Only records the request in md_setup_args; device names are resolved and
 * the array is started later by md_setup_drive().
 *
 * Returns 1 when the parameter was accepted and 0 when it was rejected.
 */
static int __init md_setup(char *str)
{
	int minor, level, factor, fault, pers;
	char *pername = "";
	char *str1;

	if (get_option(&str, &minor) != 2) {	/* MD Number */
		printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
		return 0;
	}
	/*
	 * Remember where the text after the md number starts.  If the next
	 * field turns out to be a numeric device rather than a personality,
	 * the device list must restart here, not at the md number itself.
	 */
	str1 = str;

	/* minor indexes the md_setup_args arrays, so reject both out-of-range sides */
	if (minor < 0) {
		printk(KERN_WARNING "md: md=%d, Minor device number must not be negative.\n", minor);
		return 0;
	}
	if (minor >= MAX_MD_DEVS) {
		printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
		return 0;
	} else if (md_setup_args.device_names[minor]) {
		printk(KERN_WARNING "md: md=%d, Specified more than once. "
		       "Replacing previous definition.\n", minor);
	}
	switch (get_option(&str, &level)) {	/* RAID Personality */
	case 2: /* could be 0 or -1.. */
		if (level == 0 || level == LEVEL_LINEAR) {
			if (get_option(&str, &factor) != 2 ||	/* Chunk Size */
					get_option(&str, &fault) != 2) {
				printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
				return 0;
			}
			/*
			 * chunk is an int computed as 1 << (factor + 12); keep
			 * the shift count inside the range where that is defined.
			 */
			if (factor < 0 || factor > 18) {
				printk(KERN_WARNING "md: md=%d, Invalid chunk size factor %d.\n",
				       minor, factor);
				return 0;
			}
			md_setup_args.chunk[minor] = 1 << (factor+12);
			/* level is either LEVEL_LINEAR or 0 at this point */
			if (level == LEVEL_LINEAR) {
				pers = LINEAR;
				pername = "linear";
			} else {
				pers = RAID0;
				pername = "raid0";
			}
			md_setup_args.pers[minor] = pers;
			break;
		}
		/* FALL THROUGH */
	case 1: /* the first device is numeric */
		str = str1;
		/* FALL THROUGH */
	case 0:
		md_setup_args.pers[minor] = 0;
		pername="super-block";
	}

	printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
		minor, pername, str);
	md_setup_args.device_names[minor] = str;

	return 1;
}

Linus Torvalds's avatar
Linus Torvalds committed
3355 3356
extern kdev_t name_to_kdev_t(char *line) __init;
/*
 * md_setup_drive - start the arrays requested with "md=" boot parameters.
 *
 * For every minor that md_setup() recorded a device list for:
 *   1. split the list in place at the commas and resolve each name to a
 *      device number (via name_to_kdev_t(), overridden by devfs if devfs
 *      knows the entry);
 *   2. find and lock the mddev, skipping minors that already have disks;
 *   3. either build a non-persistent array (set_array_info() followed by
 *      add_new_disk() per device) or just add the devices so that their
 *      superblocks describe the array;
 *   4. run the array, stopping it again if anything failed.
 */
void __init md_setup_drive(void)
{
	int minor, i;
	kdev_t dev;
	mddev_t*mddev;
	kdev_t devices[MD_SB_DISKS+1];	/* one extra slot for the terminator */

	for (minor = 0; minor < MAX_MD_DEVS; minor++) {
		int err = 0;
		char *devname;
		mdu_disk_info_t dinfo;

		if (!(devname = md_setup_args.device_names[minor]))
			continue;

		for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) {
			char *p;
			void *handle;

			/* cut the current name off the rest of the list */
			p = strchr(devname, ',');
			if (p)
				*p++ = 0;

			dev = name_to_kdev_t(devname);
			handle = devfs_get_handle(NULL, devname, major(dev), minor(dev),
						  DEVFS_SPECIAL_BLK, 1);
			if (handle) {
				/*
				 * Named so they do not shadow the loop's minor
				 * or the major()/minor() accessors.
				 */
				unsigned devfs_major, devfs_minor;

				devfs_get_maj_min(handle, &devfs_major, &devfs_minor);
				dev = mk_kdev(devfs_major, devfs_minor);
				devfs_put(handle);
			}
			if (kdev_none(dev)) {
				printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
				break;
			}

			devices[i] = dev;
			md_setup_args.device_set[minor] = 1;

			devname = p;
		}
		/* terminate the list; i is at most MD_SB_DISKS here */
		devices[i] = to_kdev_t(0);

		if (!md_setup_args.device_set[minor])
			continue;

		/* the list was split in place above, so only its first name is printed */
		printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);

		mddev = mddev_find(minor);
		if (!mddev) {
			printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
			continue;
		}
		if (mddev_lock(mddev)) {
			printk(KERN_WARNING
			       "md: Ignoring md=%d, cannot lock!\n",
			       minor);
			mddev_put(mddev);
			continue;
		}

		if (mddev->raid_disks || !list_empty(&mddev->disks)) {
			printk(KERN_WARNING
			       "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
			       minor);
			mddev_unlock(mddev);
			mddev_put(mddev);
			continue;
		}

		/*
		 * The persistent path below fills in only major/minor, so start
		 * from a zeroed descriptor instead of handing uninitialised
		 * stack contents to add_new_disk().
		 */
		memset(&dinfo, 0, sizeof(dinfo));

		if (md_setup_args.pers[minor]) {
			/* non-persistent */
			mdu_array_info_t ainfo;
			ainfo.level = pers_to_level(md_setup_args.pers[minor]);
			ainfo.size = 0;
			ainfo.nr_disks =0;
			ainfo.raid_disks =0;
			ainfo.md_minor =minor;
			ainfo.not_persistent = 1;

			ainfo.state = (1 << MD_SB_CLEAN);
			ainfo.layout = 0;
			ainfo.chunk_size = md_setup_args.chunk[minor];
			err = set_array_info(mddev, &ainfo);
			for (i = 0; !err && i <= MD_SB_DISKS; i++) {
				dev = devices[i];
				if (kdev_none(dev))
					break;
				dinfo.number = i;
				dinfo.raid_disk = i;
				dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
				dinfo.major = major(dev);
				dinfo.minor = minor(dev);
				mddev->raid_disks++;
				err = add_new_disk (mddev, &dinfo);
			}
		} else {
			/* persistent */
			for (i = 0; i <= MD_SB_DISKS; i++) {
				dev = devices[i];
				if (kdev_none(dev))
					break;
				dinfo.major = major(dev);
				dinfo.minor = minor(dev);
				/* result ignored here: a device that fails to add is skipped */
				add_new_disk (mddev, &dinfo);
			}
		}
		if (!err)
			err = do_md_run(mddev);
		if (err) {
			mddev->sb_dirty = 0;
			do_md_stop(mddev, 0);
			printk(KERN_WARNING "md: starting md%d failed\n", minor);
		}
		mddev_unlock(mddev);
		mddev_put(mddev);
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
3476
/*
 * raid_setup - handle the "raid=" boot parameter.
 * @str: comma-separated list of option words.
 *
 * Walks the list one word at a time and sets raid_setup_args.noautodetect
 * when a word is exactly "noautodetect".  Each comparison is made against
 * the current word (not the start of the string), and empty words or mere
 * prefixes of the option do not match.  Other words are ignored.
 *
 * Always marks raid_setup_args.set and returns 1.
 */
static int __init raid_setup(char *str)
{
	int len, pos;

	len = strlen(str) + 1;
	pos = 0;

	while (pos < len) {
		char *word = str + pos;
		char *comma = strchr(word, ',');
		int wlen;

		if (comma)
			wlen = comma - word;
		else
			wlen = (len-1)-pos;

		if (wlen == (int)strlen("noautodetect") &&
		    !strncmp(word, "noautodetect", wlen))
			raid_setup_args.noautodetect = 1;
		/* step over the word and the separator that follows it */
		pos += wlen+1;
	}
	raid_setup_args.set = 1;
	return 1;
}

3498
/*
 * Boot-time array assembly: run autodetection unless "raid=noautodetect"
 * was recorded, then start the arrays requested with "md=".  Always
 * returns 0.
 */
static int __init md_run_setup(void)
{
	if (!raid_setup_args.noautodetect) {
		autostart_arrays();
	} else {
		printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
	}

	md_setup_drive();

	return 0;
}

/* Hook raid_setup() and md_setup() up to the "raid=" and "md=" boot parameters. */
__setup("raid=", raid_setup);
Linus Torvalds's avatar
Linus Torvalds committed
3509 3510
__setup("md=", md_setup);

Linus Torvalds's avatar
Linus Torvalds committed
3511 3512 3513 3514 3515
/*
 * Built-in case: register md_init() and md_run_setup() as initcalls.
 * NOTE(review): md_run_setup() presumably relies on running after md_init();
 * confirm that the registration order here guarantees it.
 */
__initcall(md_init);
__initcall(md_run_setup);

#else /* It is a MODULE */

Linus Torvalds's avatar
Linus Torvalds committed
3516
/* Module entry point: delegates to md_init() and returns its result. */
int init_module(void)
Linus Torvalds's avatar
Linus Torvalds committed
3517 3518 3519 3520 3521 3522
{
	return md_init();
}

static void free_device_names(void)
{
3523
	while (!list_empty(&device_names)) {
3524 3525
		dev_name_t *tmp = list_entry(device_names.next,
					     dev_name_t, list);
3526
		list_del(&tmp->list);
Linus Torvalds's avatar
Linus Torvalds committed
3527 3528 3529 3530 3531
		kfree(tmp);
	}
}


Linus Torvalds's avatar
Linus Torvalds committed
3532
/*
 * Module exit: undo the registrations made at init time (recovery thread,
 * devfs directory, block major, reboot notifier, sysctl table, /proc entry
 * and request queue hook), then free the cached device names.
 *
 * NOTE(review): md_init() only logs an alert when md_register_thread()
 * fails, so md_recovery_thread may be NULL here; confirm that
 * md_unregister_thread() tolerates that.
 */
void cleanup_module(void)
Linus Torvalds's avatar
Linus Torvalds committed
3533 3534 3535 3536
{
	md_unregister_thread(md_recovery_thread);
	devfs_unregister(devfs_handle);

3537
	unregister_blkdev(MAJOR_NR,"md");
Linus Torvalds's avatar
Linus Torvalds committed
3538 3539 3540 3541 3542
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("mdstat", NULL);
#endif
Linus Torvalds's avatar
Linus Torvalds committed
3543

Linus Torvalds's avatar
Linus Torvalds committed
3544
	/* clear the queue hook that md_init() installed for this major */
	blk_dev[MAJOR_NR].queue = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
3545 3546
	blk_clear(MAJOR_NR);
	
Linus Torvalds's avatar
Linus Torvalds committed
3547 3548 3549 3550
	free_device_names();
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562
/*
 * Symbols exported to other kernel modules (presumably the RAID personality
 * modules -- confirm against their users).
 */
EXPORT_SYMBOL(md_size);
EXPORT_SYMBOL(register_md_personality);
EXPORT_SYMBOL(unregister_md_personality);
EXPORT_SYMBOL(partition_name);
EXPORT_SYMBOL(md_error);
EXPORT_SYMBOL(md_sync_acct);
EXPORT_SYMBOL(md_done_sync);
EXPORT_SYMBOL(md_register_thread);
EXPORT_SYMBOL(md_unregister_thread);
EXPORT_SYMBOL(md_wakeup_thread);
EXPORT_SYMBOL(md_print_devices);
EXPORT_SYMBOL(md_interrupt_thread);
Linus Torvalds's avatar
Linus Torvalds committed
3563
MODULE_LICENSE("GPL");