Commit df38421d authored by Kai Germaschewski

Hand merged.

parents be5178be 44a706d5
@@ -69,7 +69,9 @@ void generic_set_mtrr(unsigned int reg, unsigned long base,
 {
     u32 cr0, cr4 = 0;
     u32 deftype_lo, deftype_hi;
+    static spinlock_t set_atomicity_lock = SPIN_LOCK_UNLOCKED;

+    spin_lock(&set_atomicity_lock);
     /* Save value of CR4 and clear Page Global Enable (bit 7) */
     if ( cpu_has_pge ) {
         cr4 = read_cr4();
@@ -112,6 +114,7 @@ void generic_set_mtrr(unsigned int reg, unsigned long base,
     /* Restore value of CR4 */
     if ( cpu_has_pge )
         write_cr4(cr4);
+    spin_unlock(&set_atomicity_lock);
 }

 int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
...
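The mtrr fix above is the function-local static lock idiom: every caller of generic_set_mtrr() now serializes on one lock that never leaks out of the function. A hedged, userspace C sketch of the same idiom (illustrative only; the names below are invented, not the kernel's):

    #include <pthread.h>

    /* Sketch: a static lock inside the function serializes all callers
     * without exposing the lock in any header. */
    static void set_state(int value)
    {
        static pthread_mutex_t set_atomicity_lock = PTHREAD_MUTEX_INITIALIZER;
        static int state;

        pthread_mutex_lock(&set_atomicity_lock);
        state = value;    /* stands in for the multi-step MTRR/CR4 update */
        pthread_mutex_unlock(&set_atomicity_lock);
    }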
@@ -184,7 +184,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
     * If we're in an interrupt, have no user context or are running in an
     * atomic region then we must not take the fault..
     */
-    if (preempt_count() || !mm)
+    if (in_atomic() || !mm)
         goto no_context;

    down_read(&mm->mmap_sem);
...
@@ -55,12 +55,12 @@ static int linear_run (mddev_t *mddev)
        int j = rdev->raid_disk;
        dev_info_t *disk = conf->disks + j;

-       if (j < 0 || j > mddev->raid_disks || disk->bdev) {
+       if (j < 0 || j > mddev->raid_disks || disk->rdev) {
            printk("linear: disk numbering problem. Aborting!\n");
            goto out;
        }

-       disk->bdev = rdev->bdev;
+       disk->rdev = rdev;
        disk->size = rdev->size;

        if (!conf->smallest || (disk->size < conf->smallest->size))
@@ -153,11 +153,11 @@ static int linear_make_request (request_queue_t *q, struct bio *bio)
    if (block >= (tmp_dev->size + tmp_dev->offset)
        || block < tmp_dev->offset) {
-       printk ("linear_make_request: Block %ld out of bounds on dev %s size %ld offset %ld\n", block, bdevname(tmp_dev->bdev), tmp_dev->size, tmp_dev->offset);
+       printk ("linear_make_request: Block %ld out of bounds on dev %s size %ld offset %ld\n", block, bdevname(tmp_dev->rdev->bdev), tmp_dev->size, tmp_dev->offset);
        bio_io_error(bio);
        return 0;
    }
-   bio->bi_bdev = tmp_dev->bdev;
+   bio->bi_bdev = tmp_dev->rdev->bdev;
    bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1);

    return 1;
@@ -176,11 +176,11 @@ static int linear_status (char *page, mddev_t *mddev)
    for (j = 0; j < conf->nr_zones; j++)
    {
        sz += sprintf(page+sz, "[%s",
-           bdev_partition_name(conf->hash_table[j].dev0->bdev));
+           bdev_partition_name(conf->hash_table[j].dev0->rdev->bdev));

        if (conf->hash_table[j].dev1)
            sz += sprintf(page+sz, "/%s] ",
-               bdev_partition_name(conf->hash_table[j].dev1->bdev));
+               bdev_partition_name(conf->hash_table[j].dev1->rdev->bdev));
        else
            sz += sprintf(page+sz, "] ");
    }
...
@@ -233,7 +233,7 @@ mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
    struct list_head *tmp;

    ITERATE_RDEV(mddev,rdev,tmp) {
-       if (rdev->raid_disk == nr)
+       if (rdev->desc_nr == nr)
            return rdev;
    }
    return NULL;
@@ -251,18 +251,6 @@ static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
    return NULL;
 }

-static mdk_rdev_t * find_rdev_bdev(mddev_t * mddev, struct block_device *bdev)
-{
-   struct list_head *tmp;
-   mdk_rdev_t *rdev;
-
-   ITERATE_RDEV(mddev,rdev,tmp) {
-       if (rdev->bdev == bdev)
-           return rdev;
-   }
-   return NULL;
-}
-
 static LIST_HEAD(device_names);
 char * partition_name(kdev_t dev)
@@ -377,9 +365,6 @@ static void free_disk_sb(mdk_rdev_t * rdev)
        rdev->sb_page = NULL;
        rdev->sb_offset = 0;
        rdev->size = 0;
-   } else {
-       if (!rdev->faulty)
-           MD_BUG();
    }
 }
@@ -594,11 +579,10 @@ static void export_rdev(mdk_rdev_t * rdev)
        MD_BUG();
    free_disk_sb(rdev);
    list_del_init(&rdev->same_set);
-   unlock_rdev(rdev);
 #ifndef MODULE
    md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
-   rdev->faulty = 0;
+   unlock_rdev(rdev);
    kfree(rdev);
 }
@@ -683,9 +667,9 @@ static void print_sb(mdp_super_t *sb)
 static void print_rdev(mdk_rdev_t *rdev)
 {
-   printk(KERN_INFO "md: rdev %s, SZ:%08ld F:%d DN:%d ",
+   printk(KERN_INFO "md: rdev %s, SZ:%08ld F:%d S:%d DN:%d ",
        bdev_partition_name(rdev->bdev),
-       rdev->size, rdev->faulty, rdev->desc_nr);
+       rdev->size, rdev->faulty, rdev->in_sync, rdev->desc_nr);
    if (rdev->sb) {
        printk(KERN_INFO "md: rdev superblock:\n");
        print_sb(rdev->sb);
@@ -816,6 +800,7 @@ static void sync_sbs(mddev_t * mddev)
    mdk_rdev_t *rdev;
    mdp_super_t *sb;
    struct list_head *tmp;
+   int next_spare = mddev->raid_disks;

    /* make all rdev->sb match mddev data..
     * we setup the data in the first rdev and copy it
@@ -868,12 +853,20 @@ static void sync_sbs(mddev_t * mddev)
    sb->disks[0].state = (1<<MD_DISK_REMOVED);
    ITERATE_RDEV(mddev,rdev,tmp) {
-       mdp_disk_t *d = &sb->disks[rdev->desc_nr];
+       mdp_disk_t *d;
+       if (rdev->raid_disk >= 0)
+           rdev->desc_nr = rdev->raid_disk;
+       else
+           rdev->desc_nr = next_spare++;
+       d = &sb->disks[rdev->desc_nr];
        nr_disks++;
        d->number = rdev->desc_nr;
        d->major = MAJOR(rdev->bdev->bd_dev);
        d->minor = MINOR(rdev->bdev->bd_dev);
-       d->raid_disk = rdev->raid_disk;
+       if (rdev->raid_disk >= 0)
+           d->raid_disk = rdev->raid_disk;
+       else
+           d->raid_disk = rdev->desc_nr; /* compatability */
        if (rdev->faulty) {
            d->state = (1<<MD_DISK_FAULTY);
            failed++;
@@ -909,8 +902,6 @@ static void sync_sbs(mddev_t * mddev)
    ITERATE_RDEV(mddev,rdev,tmp) {
        mdp_super_t *this_sb;

-       if (rdev->faulty || rdev->alias_device)
-           continue;
        this_sb = rdev->sb;
        if (this_sb != sb)
            *this_sb = *sb;
@@ -956,16 +947,17 @@ static void md_update_sb(mddev_t * mddev)
        printk(KERN_INFO "md: ");
        if (rdev->faulty)
            printk("(skipping faulty ");
-       if (rdev->alias_device)
-           printk("(skipping alias ");
        printk("%s ", bdev_partition_name(rdev->bdev));
-       if (!rdev->faulty && !rdev->alias_device) {
+       if (!rdev->faulty) {
            printk("[events: %08lx]",
                (unsigned long)rdev->sb->events_lo);
            err += write_disk_sb(rdev);
        } else
            printk(")\n");
+       if (!err && mddev->level == LEVEL_MULTIPATH)
+           /* only need to write one superblock... */
+           break;
    }
    if (err) {
        if (--count) {
@@ -1010,6 +1002,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk)
    }
    rdev->desc_nr = -1;
    rdev->faulty = 0;
+   rdev->in_sync = 0;
+   atomic_set(&rdev->nr_pending, 0);

    size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
    if (!size) {
@@ -1198,7 +1192,6 @@ static int analyze_sbs(mddev_t * mddev)
    i = 0;
    ITERATE_RDEV(mddev,rdev,tmp) {
        if (mddev->level == LEVEL_MULTIPATH) {
-           rdev->alias_device = !!i;
            rdev->desc_nr = i++;
            rdev->raid_disk = rdev->desc_nr;
            rdev->in_sync = 1;
@@ -1206,15 +1199,17 @@ static int analyze_sbs(mddev_t * mddev)
            mdp_disk_t *desc;
            rdev->desc_nr = rdev->sb->this_disk.number;
            desc = sb->disks + rdev->desc_nr;
-           rdev->raid_disk = desc->raid_disk;
+           rdev->raid_disk = -1;
            rdev->in_sync = rdev->faulty = 0;
            if (desc->state & (1<<MD_DISK_FAULTY)) {
                rdev->faulty = 1;
                kick_rdev_from_array(rdev);
            } else if (desc->state & (1<<MD_DISK_SYNC) &&
-                  rdev->raid_disk < mddev->raid_disks)
+                  desc->raid_disk < mddev->raid_disks) {
                rdev->in_sync = 1;
+               rdev->raid_disk = desc->raid_disk;
+           }
        }
    }
@@ -1345,6 +1340,8 @@ static int do_md_run(mddev_t * mddev)
    struct list_head *tmp;
    mdk_rdev_t *rdev;
    struct gendisk *disk;
+   char *major_name;

    if (list_empty(&mddev->disks)) {
        MD_BUG();
@@ -1397,10 +1394,7 @@ static int do_md_run(mddev_t * mddev)
            printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
            return -EINVAL;
        }
-   } else
-       if (chunk_size)
-           printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
-                  mddev->level);
+   }

    if (pnum >= MAX_PERSONALITY) {
        MD_BUG();
@@ -1454,15 +1448,16 @@ static int do_md_run(mddev_t * mddev)
    if (!disk)
        return -ENOMEM;
    memset(disk, 0, sizeof(struct gendisk));
-   disk->major_name = kmalloc(6, GFP_KERNEL);
-   if (!disk->major_name) {
+   major_name = kmalloc(6, GFP_KERNEL);
+   if (!major_name) {
        kfree(disk);
        return -ENOMEM;
    }
    disk->major = MD_MAJOR;
    disk->first_minor = mdidx(mddev);
    disk->minor_shift = 0;
-   sprintf(disk->major_name, "md%d", mdidx(mddev));
+   sprintf(major_name, "md%d", mdidx(mddev));
+   disk->major_name = major_name;
    disk->part = md_hd_struct + mdidx(mddev);
    disk->nr_real = 1;
    disk->fops = &md_fops;
@@ -1559,10 +1554,6 @@ static int do_md_stop(mddev_t * mddev, int ro)
            mddev->recovery_running = -EINTR;
            md_unregister_thread(mddev->sync_thread);
            mddev->sync_thread = NULL;
-           if (mddev->spare) {
-               mddev->pers->spare_inactive(mddev);
-               mddev->spare = NULL;
-           }
        }

        invalidate_device(dev, 1);
@@ -1933,7 +1924,7 @@ static int get_disk_info(mddev_t * mddev, void * arg)
        }
    } else {
        info.major = info.minor = 0;
-       info.raid_disk = 0;
+       info.raid_disk = -1;
        info.state = (1<<MD_DISK_REMOVED);
    }
@@ -1983,7 +1974,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
            return PTR_ERR(rdev);
        }
        rdev->desc_nr = info->number;
-       rdev->raid_disk = info->raid_disk;
+       if (info->raid_disk < mddev->raid_disks)
+           rdev->raid_disk = info->raid_disk;
+       else
+           rdev->raid_disk = -1;
        rdev->faulty = 0;
        if (rdev->raid_disk < mddev->raid_disks)
            rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
@@ -2042,7 +2037,6 @@ static int hot_generate_error(mddev_t * mddev, dev_t dev)
 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
 {
-   int err;
    mdk_rdev_t *rdev;

    if (!mddev->pers)
@@ -2051,28 +2045,12 @@ static int hot_remove_disk(mddev_t * mddev, dev_t dev)
    printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
        partition_name(to_kdev_t(dev)), mdidx(mddev));

-   if (!mddev->pers->hot_remove_disk) {
-       printk(KERN_WARNING "md%d: personality does not support diskops!\n",
-              mdidx(mddev));
-       return -EINVAL;
-   }
-
    rdev = find_rdev(mddev, dev);
    if (!rdev)
        return -ENXIO;

-   if (rdev->in_sync && ! rdev->faulty)
+   if (rdev->raid_disk >= 0)
        goto busy;
-
-   err = mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
-   if (err == -EBUSY) {
-       MD_BUG();
-       goto busy;
-   }
-   if (err) {
-       MD_BUG();
-       return -EINVAL;
-   }

    kick_rdev_from_array(rdev);
    md_update_sb(mddev);
@@ -2145,13 +2123,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
    }

    rdev->desc_nr = i;
-   rdev->raid_disk = i;
-
-   if (mddev->pers->hot_add_disk(mddev, rdev)) {
-       MD_BUG();
-       err = -EINVAL;
-       goto abort_unbind_export;
-   }
+   rdev->raid_disk = -1;

    md_update_sb(mddev);
@@ -2204,14 +2176,13 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 {
    mdk_rdev_t *rdev;
-   int ret;

    rdev = find_rdev(mddev, dev);
    if (!rdev)
        return 0;

-   ret = md_error(mddev, rdev->bdev);
-   return ret;
+   md_error(mddev, rdev);
+   return 1;
 }

 static int md_ioctl(struct inode *inode, struct file *file,
@@ -2434,9 +2405,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
        }

        default:
-           printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
-                  "upgrade your software to use new ictls.\n",
-                  current->comm, current->pid);
+           if (_IOC_TYPE(cmd) == MD_MAJOR)
+               printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+                      "upgrade your software to use new ictls.\n",
+                      current->comm, current->pid);
            err = -EINVAL;
            goto abort_unlock;
    }
@@ -2626,10 +2598,8 @@ static void md_recover_arrays(void)
 }

-int md_error(mddev_t *mddev, struct block_device *bdev)
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
-   mdk_rdev_t * rrdev;
-
    dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
        MD_MAJOR,mdidx(mddev),MAJOR(bdev->bd_dev),MINOR(bdev->bd_dev),
        __builtin_return_address(0),__builtin_return_address(1),
@@ -2637,25 +2607,15 @@ int md_error(mddev_t *mddev, struct block_device *bdev)
    if (!mddev) {
        MD_BUG();
-       return 0;
+       return;
    }

-   rrdev = find_rdev_bdev(mddev, bdev);
-   if (!rrdev || rrdev->faulty)
-       return 0;
-   if (!mddev->pers->error_handler
-       || mddev->pers->error_handler(mddev,bdev) <= 0) {
-       rrdev->faulty = 1;
-       rrdev->in_sync = 0;
-   } else
-       return 1;
-   /*
-    * if recovery was running, stop it now.
-    */
-   if (mddev->recovery_running)
-       mddev->recovery_running = -EIO;
-   md_recover_arrays();
-   return 0;
+   if (!rdev || rdev->faulty)
+       return;
+   if (!mddev->pers->error_handler)
+       return;
+   mddev->pers->error_handler(mddev,rdev);
+   md_recover_arrays();
 }

 static int status_unused(char * page)
@@ -2706,7 +2666,7 @@ static int status_resync(char * page, mddev_t * mddev)
        sz += sprintf(page + sz, "] ");
    }
    sz += sprintf(page + sz, " %s =%3lu.%lu%% (%lu/%lu)",
-             (mddev->spare ? "recovery" : "resync"),
+             (mddev->spares ? "recovery" : "resync"),
              res/10, res % 10, resync, max_blocks);

    /*
@@ -2824,26 +2784,10 @@ int unregister_md_personality(int pnum)
    return 0;
 }

-static mdk_rdev_t *get_spare(mddev_t *mddev)
-{
-   mdk_rdev_t *rdev;
-   struct list_head *tmp;
-
-   ITERATE_RDEV(mddev,rdev,tmp) {
-       if (rdev->faulty)
-           continue;
-       if (rdev->in_sync)
-           continue;
-       return rdev;
-   }
-   return NULL;
-}
-
 static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
-void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
+void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
 {
-   kdev_t dev = to_kdev_t(bdev->bd_dev);
+   kdev_t dev = to_kdev_t(rdev->bdev->bd_dev);
    unsigned int major = major(dev);
    unsigned int index;
@@ -3057,19 +3001,30 @@ static void md_do_sync(void *data)
 /*
- * This is the kernel thread that watches all md arrays for re-sync action
- * that might be needed.
+ * This is the kernel thread that watches all md arrays for re-sync and other
+ * action that might be needed.
  * It does not do any resync itself, but rather "forks" off other threads
  * to do that as needed.
  * When it is determined that resync is needed, we set "->recovery_running" and
  * create a thread at ->sync_thread.
- * When the thread finishes is clears recovery_running (or set and error)
+ * When the thread finishes it clears recovery_running (or sets an error)
  * and wakeup up this thread which will reap the thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ *  1/ if the superblock needs updating, update it.
+ *  2/ If a recovery thread is running, don't do anything else.
+ *  3/ If recovery has finished, clean up, possibly marking spares active.
+ *  4/ If there are any faulty devices, remove them.
+ *  5/ If array is degraded, try to add spares devices
+ *  6/ If array has spares or is not in-sync, start a resync thread.
  */
 void md_do_recovery(void *data)
 {
    mddev_t *mddev;
-   struct list_head *tmp;
+   mdk_rdev_t *rdev;
+   struct list_head *tmp, *rtmp;

    dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
@@ -3085,26 +3040,11 @@ void md_do_recovery(void *data)
            /* resync has finished, collect result */
            md_unregister_thread(mddev->sync_thread);
            mddev->sync_thread = NULL;
-           if (mddev->recovery_running < 0) {
-               /* some sort of failure.
-                * If we were doing a reconstruction,
-                * we need to retrieve the spare
-                */
-               if (!mddev->pers->spare_inactive)
-                   goto unlock;
-               if (mddev->spare) {
-                   mddev->pers->spare_inactive(mddev);
-                   mddev->spare = NULL;
-               }
-           } else {
-               if (!mddev->pers->spare_active)
-                   goto unlock;
+           if (mddev->recovery_running == 0) {
                /* success...*/
-               if (mddev->spare) {
-                   mddev->pers->spare_active(mddev);
-                   mddev->spare->in_sync = 1;
-                   mddev->spare = NULL;
-               }
+               /* activate any spares */
+               mddev->pers->spare_active(mddev);
+               mddev->spares = 0;
            }
            md_update_sb(mddev);
            mddev->recovery_running = 0;
@@ -3117,16 +3057,33 @@ void md_do_recovery(void *data)
            wake_up(&resync_wait);
        }

+       /* no recovery is running.
+        * remove any failed drives, then
+        * add spares if possible
+        */
+       mddev->spares = 0;
+       ITERATE_RDEV(mddev,rdev,rtmp) {
+           if (rdev->raid_disk >= 0 &&
+               rdev->faulty &&
+               atomic_read(&rdev->nr_pending)==0) {
+               mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
+               rdev->raid_disk = -1;
+           }
+           if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
+               mddev->spares++;
+       }
+
        if (mddev->degraded) {
-           mddev->spare = get_spare(mddev);
-           if (!mddev->spare)
-               printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
-                      "-- continuing in degraded mode\n", mdidx(mddev));
-           else
-               printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
-                      mdidx(mddev), bdev_partition_name(mddev->spare->bdev));
+           ITERATE_RDEV(mddev,rdev,rtmp)
+               if (rdev->raid_disk < 0
+                   && !rdev->faulty) {
+                   if (mddev->pers->hot_add_disk(mddev,rdev))
+                       mddev->spares++;
+                   else
+                       break;
+               }
        }

-       if (!mddev->spare && mddev->in_sync) {
+       if (!mddev->spares && mddev->in_sync) {
            /* nothing we can do ... */
            goto unlock;
        }
@@ -3136,13 +3093,9 @@ void md_do_recovery(void *data)
                            "md_resync");
        if (!mddev->sync_thread) {
            printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
-           if (mddev->spare)
-               mddev->pers->spare_inactive(mddev);
-           mddev->spare = NULL;
+           /* leave the spares where they are, it shouldn't hurt */
            mddev->recovery_running = 0;
        } else {
-           if (mddev->spare)
-               mddev->pers->spare_write(mddev);
            mddev->recovery_running = 1;
            md_wakeup_thread(mddev->sync_thread);
        }
@@ -3540,7 +3493,7 @@ static int __init raid_setup(char *str)
    return 1;
 }

-int __init md_run_setup(void)
+static int __init md_run_setup(void)
 {
    if (raid_setup_args.noautodetect)
        printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
@@ -3604,6 +3557,5 @@ EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_print_devices);
-EXPORT_SYMBOL(find_rdev_nr);
 EXPORT_SYMBOL(md_interrupt_thread);
 MODULE_LICENSE("GPL");
@@ -70,7 +70,7 @@ static void mp_pool_free(void *mpb, void *data)
    kfree(mpb);
 }

-static int multipath_map (mddev_t *mddev, struct block_device **bdev)
+static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdevp)
 {
    multipath_conf_t *conf = mddev_to_conf(mddev);
    int i, disks = MD_SB_DISKS;
@@ -80,12 +80,17 @@ static int multipath_map (mddev_t *mddev, struct block_device **bdev)
     * now we use the first available disk.
     */

+   spin_lock_irq(&conf->device_lock);
    for (i = 0; i < disks; i++) {
-       if (conf->multipaths[i].operational) {
-           *bdev = conf->multipaths[i].bdev;
-           return (0);
+       mdk_rdev_t *rdev = conf->multipaths[i].rdev;
+       if (rdev && rdev->in_sync) {
+           *rdevp = rdev;
+           atomic_inc(&rdev->nr_pending);
+           spin_unlock_irq(&conf->device_lock);
+           return 0;
        }
    }
+   spin_unlock_irq(&conf->device_lock);

    printk (KERN_ERR "multipath_map(): no more operational IO paths?\n");
    return (-1);
@@ -126,21 +131,21 @@ void multipath_end_request(struct bio *bio)
 {
    int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
    struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
-   multipath_conf_t *conf;
-   struct block_device *bdev;
+   multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
+   mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;

-   if (uptodate) {
+   if (uptodate)
        multipath_end_bh_io(mp_bh, uptodate);
-       return;
+   else {
+       /*
+        * oops, IO error:
+        */
+       md_error (mp_bh->mddev, rdev);
+       printk(KERN_ERR "multipath: %s: rescheduling sector %lu\n",
+              bdev_partition_name(rdev->bdev), bio->bi_sector);
+       multipath_reschedule_retry(mp_bh);
    }
-   /*
-    * oops, IO error:
-    */
-   conf = mddev_to_conf(mp_bh->mddev);
-   bdev = conf->multipaths[mp_bh->path].bdev;
-   md_error (mp_bh->mddev, bdev);
-   printk(KERN_ERR "multipath: %s: rescheduling sector %lu\n",
-          bdev_partition_name(bdev), bio->bi_sector);
-   multipath_reschedule_retry(mp_bh);
+   atomic_dec(&rdev->nr_pending);
    return;
 }
@@ -153,9 +158,11 @@ static int multipath_read_balance (multipath_conf_t *conf)
 {
    int disk;

-   for (disk = 0; disk < MD_SB_DISKS; disk++)
-       if (conf->multipaths[disk].operational)
+   for (disk = 0; disk < MD_SB_DISKS; disk++) {
+       mdk_rdev_t *rdev = conf->multipaths[disk].rdev;
+       if (rdev && rdev->in_sync)
            return disk;
+   }
    BUG();
    return 0;
 }
@@ -175,11 +182,14 @@ static int multipath_make_request (request_queue_t *q, struct bio * bio)
    /*
     * read balancing logic:
     */
+   spin_lock_irq(&conf->device_lock);
    mp_bh->path = multipath_read_balance(conf);
    multipath = conf->multipaths + mp_bh->path;
+   atomic_inc(&multipath->rdev->nr_pending);
+   spin_unlock_irq(&conf->device_lock);

    mp_bh->bio = *bio;
-   mp_bh->bio.bi_bdev = multipath->bdev;
+   mp_bh->bio.bi_bdev = multipath->rdev->bdev;
    mp_bh->bio.bi_end_io = multipath_end_request;
    mp_bh->bio.bi_private = mp_bh;
    generic_make_request(&mp_bh->bio);
@@ -195,7 +205,8 @@ static int multipath_status (char *page, mddev_t *mddev)
              conf->working_disks);
    for (i = 0; i < conf->raid_disks; i++)
        sz += sprintf (page+sz, "%s",
-                  conf->multipaths[i].operational ? "U" : "_");
+                  conf->multipaths[i].rdev &&
+                  conf->multipaths[i].rdev->in_sync ? "U" : "_");
    sz += sprintf (page+sz, "]");
    return sz;
 }
@@ -210,28 +221,13 @@ static int multipath_status (char *page, mddev_t *mddev)
    "multipath: IO failure on %s, disabling IO path. \n" \
    "   Operation continuing on %d IO paths.\n"

-static void mark_disk_bad (mddev_t *mddev, int failed)
-{
-   multipath_conf_t *conf = mddev_to_conf(mddev);
-   struct multipath_info *multipath = conf->multipaths+failed;
-
-   multipath->operational = 0;
-   mddev->sb_dirty = 1;
-   conf->working_disks--;
-   printk (DISK_FAILED, bdev_partition_name (multipath->bdev),
-       conf->working_disks);
-}
-
 /*
  * Careful, this can execute in IRQ contexts as well!
  */
-static int multipath_error (mddev_t *mddev, struct block_device *bdev)
+static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
    multipath_conf_t *conf = mddev_to_conf(mddev);
-   struct multipath_info * multipaths = conf->multipaths;
-   int disks = MD_SB_DISKS;
-   int i;

    if (conf->working_disks <= 1) {
        /*
@@ -239,24 +235,21 @@ static int multipath_error (mddev_t *mddev, struct block_device *bdev)
         * first check if this is a queued request for a device
         * which has just failed.
         */
-       for (i = 0; i < disks; i++) {
-           if (multipaths[i].bdev == bdev && !multipaths[i].operational)
-               return 0;
-       }
        printk (LAST_DISK);
-       return 1; /* leave it active... it's all we have */
+       /* leave it active... it's all we have */
    } else {
        /*
         * Mark disk as unusable
         */
-       for (i = 0; i < disks; i++) {
-           if (multipaths[i].bdev == bdev && multipaths[i].operational) {
-               mark_disk_bad(mddev, i);
-               break;
-           }
+       if (!rdev->faulty) {
+           rdev->in_sync = 0;
+           rdev->faulty = 1;
+           mddev->sb_dirty = 1;
+           conf->working_disks--;
+           printk (DISK_FAILED, bdev_partition_name (rdev->bdev),
+               conf->working_disks);
        }
    }
-   return 0;
 }

 #undef LAST_DISK
@@ -279,11 +272,10 @@ static void print_multipath_conf (multipath_conf_t *conf)
    for (i = 0; i < MD_SB_DISKS; i++) {
        tmp = conf->multipaths + i;
-       if (tmp->operational || tmp->used_slot)
-           printk(" disk%d, o:%d, us:%d dev:%s\n",
-               i,tmp->operational,
-               tmp->used_slot,
-               bdev_partition_name(tmp->bdev));
+       if (tmp->rdev)
+           printk(" disk%d, o:%d, dev:%s\n",
+               i,!tmp->rdev->faulty,
+               bdev_partition_name(tmp->rdev->bdev));
    }
 }
@@ -291,24 +283,23 @@ static void print_multipath_conf (multipath_conf_t *conf)
 static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 {
    multipath_conf_t *conf = mddev->private;
-   int err = 1;
-   struct multipath_info *p = conf->multipaths + rdev->raid_disk;
+   int found = 0;
+   int path;
+   struct multipath_info *p;

    print_multipath_conf(conf);
    spin_lock_irq(&conf->device_lock);
-   if (!p->used_slot) {
-       p->bdev = rdev->bdev;
-       p->operational = 1;
-       p->used_slot = 1;
-       conf->working_disks++;
-       err = 0;
-   }
-   if (err)
-       MD_BUG();
+   for (path=0; path<mddev->raid_disks; path++)
+       if ((p=conf->multipaths+path)->rdev == NULL) {
+           p->rdev = rdev;
+           conf->working_disks++;
+           rdev->raid_disk = path;
+           found = 1;
+       }
    spin_unlock_irq(&conf->device_lock);

    print_multipath_conf(conf);
-   return err;
+   return found;
 }

 static int multipath_remove_disk(mddev_t *mddev, int number)
@@ -320,14 +311,14 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
    print_multipath_conf(conf);
    spin_lock_irq(&conf->device_lock);
-   if (p->used_slot) {
-       if (p->operational) {
+   if (p->rdev) {
+       if (p->rdev->in_sync ||
+           atomic_read(&p->rdev->nr_pending)) {
            printk(KERN_ERR "hot-remove-disk, slot %d is identified but is still operational!\n", number);
            err = -EBUSY;
            goto abort;
        }
-       p->bdev = NULL;
-       p->used_slot = 0;
+       p->rdev = NULL;
        err = 0;
    }
    if (err)
@@ -359,7 +350,7 @@ static void multipathd (void *data)
    struct bio *bio;
    unsigned long flags;
    mddev_t *mddev;
-   struct block_device *bdev;
+   mdk_rdev_t *rdev;

    for (;;) {
        spin_lock_irqsave(&retry_list_lock, flags);
@@ -372,16 +363,16 @@ static void multipathd (void *data)
        mddev = mp_bh->mddev;
        bio = &mp_bh->bio;
        bio->bi_sector = mp_bh->master_bio->bi_sector;
-       bdev = bio->bi_bdev;

-       multipath_map (mddev, &bio->bi_bdev);
-       if (bio->bi_bdev == bdev) {
+       rdev = NULL;
+       if (multipath_map (mddev, &rdev)<0) {
            printk(IO_ERROR,
                   bdev_partition_name(bio->bi_bdev), bio->bi_sector);
            multipath_end_bh_io(mp_bh, 0);
        } else {
            printk(REDIRECT_SECTOR,
                   bdev_partition_name(bio->bi_bdev), bio->bi_sector);
+           bio->bi_bdev = rdev->bdev;
            generic_make_request(bio);
        }
    }
@@ -436,7 +427,6 @@ static int multipath_run (mddev_t *mddev)
    struct multipath_info *disk;
    mdk_rdev_t *rdev;
    struct list_head *tmp;
-   int num_rdevs = 0;

    MOD_INC_USE_COUNT;
@@ -458,40 +448,20 @@ static int multipath_run (mddev_t *mddev)
    }
    memset(conf, 0, sizeof(*conf));

+   conf->working_disks = 0;
    ITERATE_RDEV(mddev,rdev,tmp) {
-       if (rdev->faulty) {
-           /* this is a "should never happen" case and if it */
-           /* ever does happen, a continue; won't help */
-           printk(ERRORS, bdev_partition_name(rdev->bdev));
-           continue;
-       } else {
-           /* this is a "should never happen" case and if it */
-           /* ever does happen, a continue; won't help */
-           if (!rdev->sb) {
-               MD_BUG();
-               continue;
-           }
-       }
-       if (rdev->desc_nr == -1) {
-           MD_BUG();
-           continue;
-       }
-       disk_idx = rdev->raid_disk;
+       disk_idx = rdev->raid_disk;
+       if (disk_idx < 0 ||
+           disk_idx >= mddev->raid_disks)
+           continue;
        disk = conf->multipaths + disk_idx;
+       disk->rdev = rdev;

-       /*
-        * Mark all disks as active to start with, there are no
-        * spares.  multipath_read_balance deals with choose
-        * the "best" operational device.
-        */
-       disk->bdev = rdev->bdev;
-       disk->operational = 1;
-       disk->used_slot = 1;
-       num_rdevs++;
+       if (!rdev->faulty)
+           conf->working_disks++;
    }

-   conf->raid_disks = mddev->raid_disks = num_rdevs;
+   conf->raid_disks = mddev->raid_disks;
    mddev->sb_dirty = 1;
    conf->mddev = mddev;
    conf->device_lock = SPIN_LOCK_UNLOCKED;
@@ -500,6 +470,7 @@ static int multipath_run (mddev_t *mddev)
        printk(NONE_OPERATIONAL, mdidx(mddev));
        goto out_free_conf;
    }
+   mddev->degraded = conf->raid_disks - conf->working_disks;

    conf->pool = mempool_create(NR_RESERVED_BUFS,
                    mp_pool_alloc, mp_pool_free,
...
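The multipath hunks above (and the raid1 hunks below) all repeat one pattern: choose a device under conf->device_lock, raise rdev->nr_pending before dropping the lock, and lower it again when the I/O completes, so hot-removal can simply wait for the count to drain. A self-contained userspace analogue of that pattern (only the names rdev, in_sync, and nr_pending come from the patch; everything else is an assumption made for the sketch):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct path_rdev {
        int in_sync;              /* path usable? */
        atomic_int nr_pending;    /* I/O currently pinned on this path */
    };

    struct path_conf {
        pthread_mutex_t device_lock;
        struct path_rdev *paths;
        int npaths;
    };

    /* multipath_map() analogue: pick a live path and pin it before unlocking */
    static struct path_rdev *map_path(struct path_conf *conf)
    {
        pthread_mutex_lock(&conf->device_lock);
        for (int i = 0; i < conf->npaths; i++) {
            struct path_rdev *rdev = &conf->paths[i];
            if (rdev->in_sync) {
                atomic_fetch_add(&rdev->nr_pending, 1);
                pthread_mutex_unlock(&conf->device_lock);
                return rdev;
            }
        }
        pthread_mutex_unlock(&conf->device_lock);
        return NULL;              /* no operational I/O paths */
    }

    /* end_request() analogue: unpin when the I/O completes */
    static void end_io(struct path_rdev *rdev)
    {
        atomic_fetch_sub(&rdev->nr_pending, 1);
    }

    /* hot-removal is only safe once the pin count has drained to zero */
    static int removable(struct path_rdev *rdev)
    {
        return !rdev->in_sync && atomic_load(&rdev->nr_pending) == 0;
    }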
@@ -87,7 +87,7 @@ static int create_strip_zones (mddev_t *mddev)
    cnt = 0;
    smallest = NULL;
    ITERATE_RDEV(mddev, rdev1, tmp1) {
-       int j = rdev1->sb->this_disk.raid_disk;
+       int j = rdev1->raid_disk;
        if (j < 0 || j >= mddev->raid_disks) {
            printk("raid0: bad disk number %d - aborting!\n", j);
...
@@ -135,7 +135,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
        bio_put(r1_bio->read_bio);
        r1_bio->read_bio = NULL;
    }
-   for (i = 0; i < MD_SB_DISKS; i++) {
+   for (i = 0; i < conf->raid_disks; i++) {
        struct bio **bio = r1_bio->write_bios + i;
        if (*bio) {
            if (atomic_read(&(*bio)->bi_cnt) != 1)
@@ -188,22 +188,27 @@ static inline void put_buf(r1bio_t *r1_bio)
    mempool_free(r1_bio, conf->r1buf_pool);
 }

-static int map(mddev_t *mddev, struct block_device **bdev)
+static int map(mddev_t *mddev, mdk_rdev_t **rdevp)
 {
    conf_t *conf = mddev_to_conf(mddev);
-   int i, disks = MD_SB_DISKS;
+   int i, disks = conf->raid_disks;

    /*
     * Later we do read balancing on the read side
     * now we use the first available disk.
     */

+   spin_lock_irq(&conf->device_lock);
    for (i = 0; i < disks; i++) {
-       if (conf->mirrors[i].operational) {
-           *bdev = conf->mirrors[i].bdev;
+       mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+       if (rdev && rdev->in_sync) {
+           *rdevp = rdev;
+           atomic_inc(&rdev->nr_pending);
+           spin_unlock_irq(&conf->device_lock);
            return 0;
        }
    }
+   spin_unlock_irq(&conf->device_lock);

    printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
    return -1;
@@ -244,7 +249,6 @@ static void inline update_head_pos(int disk, r1bio_t *r1_bio)
    conf->mirrors[disk].head_position =
        r1_bio->sector + (r1_bio->master_bio->bi_size >> 9);
-   atomic_dec(&conf->mirrors[disk].nr_pending);
 }

 static void end_request(struct bio *bio)
@@ -257,7 +261,7 @@ static void end_request(struct bio *bio)
    if (r1_bio->cmd == READ || r1_bio->cmd == READA)
        mirror = r1_bio->read_disk;
    else {
-       for (mirror = 0; mirror < MD_SB_DISKS; mirror++)
+       for (mirror = 0; mirror < conf->raid_disks; mirror++)
            if (r1_bio->write_bios[mirror] == bio)
                break;
    }
@@ -265,7 +269,7 @@ static void end_request(struct bio *bio)
     * this branch is our 'one mirror IO has finished' event handler:
     */
    if (!uptodate)
-       md_error(r1_bio->mddev, conf->mirrors[mirror].bdev);
+       md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
    else
        /*
         * Set R1BIO_Uptodate in our master bio, so that
@@ -285,29 +289,30 @@ static void end_request(struct bio *bio)
        /*
         * we have only one bio on the read side
         */
-       if (uptodate) {
+       if (uptodate)
            raid_end_bio_io(r1_bio, uptodate);
-           return;
+       else {
+           /*
+            * oops, read error:
+            */
+           printk(KERN_ERR "raid1: %s: rescheduling sector %lu\n",
+                  bdev_partition_name(conf->mirrors[mirror].rdev->bdev), r1_bio->sector);
+           reschedule_retry(r1_bio);
        }
-       /*
-        * oops, read error:
-        */
-       printk(KERN_ERR "raid1: %s: rescheduling sector %lu\n",
-              bdev_partition_name(conf->mirrors[mirror].bdev), r1_bio->sector);
-       reschedule_retry(r1_bio);
-       return;
-   }
-
-   if (r1_bio->read_bio)
-       BUG();
-   /*
-    * WRITE:
-    *
-    * Let's see if all mirrored write operations have finished
-    * already.
-    */
-   if (atomic_dec_and_test(&r1_bio->remaining))
-       raid_end_bio_io(r1_bio, uptodate);
+   } else {
+       if (r1_bio->read_bio)
+           BUG();
+       /*
+        * WRITE:
+        *
+        * Let's see if all mirrored write operations have finished
+        * already.
+        */
+       if (atomic_dec_and_test(&r1_bio->remaining))
+           raid_end_bio_io(r1_bio, uptodate);
+   }
+   atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
 }

 /*
@@ -321,6 +326,8 @@ static void end_request(struct bio *bio)
  *
  * If there are 2 mirrors in the same 2 devices, performance degrades
  * because position is mirror, not device based.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
  */
 static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
 {
@@ -329,6 +336,7 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
    const int sectors = bio->bi_size >> 9;
    sector_t new_distance, current_distance;

+   spin_lock_irq(&conf->device_lock);
    /*
     * Check if it if we can balance. We can balance on the whole
     * device if no resync is going on, or below the resync window.
@@ -337,7 +345,9 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
    if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
        /* make sure that disk is operational */
        new_disk = 0;
-       while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
+
+       while (!conf->mirrors[new_disk].rdev ||
+              !conf->mirrors[new_disk].rdev->in_sync) {
            new_disk++;
            if (new_disk == conf->raid_disks) {
                new_disk = 0;
@@ -349,7 +359,8 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
    /* make sure the disk is operational */
-   while (!conf->mirrors[new_disk].operational) {
+   while (!conf->mirrors[new_disk].rdev ||
+          !conf->mirrors[new_disk].rdev->in_sync) {
        if (new_disk <= 0)
            new_disk = conf->raid_disks;
        new_disk--;
@@ -378,11 +389,11 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
            disk = conf->raid_disks;
        disk--;

-       if ((conf->mirrors[disk].write_only) ||
-           (!conf->mirrors[disk].operational))
+       if (!conf->mirrors[disk].rdev ||
+           !conf->mirrors[disk].rdev->in_sync)
            continue;

-       if (!atomic_read(&conf->mirrors[disk].nr_pending)) {
+       if (!atomic_read(&conf->mirrors[disk].rdev->nr_pending)) {
            new_disk = disk;
            break;
        }
@@ -399,6 +410,10 @@ static int read_balance(conf_t *conf, struct bio *bio, r1bio_t *r1_bio)
    conf->last_used = new_disk;

+   if (conf->mirrors[new_disk].rdev)
+       atomic_inc(&conf->mirrors[new_disk].rdev->nr_pending);
+   spin_unlock_irq(&conf->device_lock);
+
    return new_disk;
 }
@@ -441,7 +456,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
    mirror_info_t *mirror;
    r1bio_t *r1_bio;
    struct bio *read_bio;
-   int i, sum_bios = 0, disks = MD_SB_DISKS;
+   int i, sum_bios = 0, disks = conf->raid_disks;

    /*
     * Register the new request and wait if the reconstruction
@@ -478,31 +493,42 @@ static int make_request(request_queue_t *q, struct bio * bio)
        r1_bio->read_bio = read_bio;

        read_bio->bi_sector = r1_bio->sector;
-       read_bio->bi_bdev = mirror->bdev;
+       read_bio->bi_bdev = mirror->rdev->bdev;
        read_bio->bi_end_io = end_request;
        read_bio->bi_rw = r1_bio->cmd;
        read_bio->bi_private = r1_bio;

        generic_make_request(read_bio);
-       atomic_inc(&conf->mirrors[r1_bio->read_disk].nr_pending);
        return 0;
    }

    /*
     * WRITE:
     */
+   /* first select target devices under spinlock and
+    * inc refcount on their rdev.  Record them by setting
+    * write_bios[x] to bio
+    */
+   spin_lock_irq(&conf->device_lock);
+   for (i = 0; i < disks; i++) {
+       if (conf->mirrors[i].rdev &&
+           !conf->mirrors[i].rdev->faulty) {
+           atomic_inc(&conf->mirrors[i].rdev->nr_pending);
+           r1_bio->write_bios[i] = bio;
+       } else
+           r1_bio->write_bios[i] = NULL;
+   }
+   spin_unlock_irq(&conf->device_lock);
+
    for (i = 0; i < disks; i++) {
        struct bio *mbio;
-       if (!conf->mirrors[i].operational)
+       if (!r1_bio->write_bios[i])
            continue;

        mbio = bio_clone(bio, GFP_NOIO);
-       if (r1_bio->write_bios[i])
-           BUG();
        r1_bio->write_bios[i] = mbio;

        mbio->bi_sector = r1_bio->sector;
-       mbio->bi_bdev = conf->mirrors[i].bdev;
+       mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
        mbio->bi_end_io = end_request;
        mbio->bi_rw = r1_bio->cmd;
        mbio->bi_private = r1_bio;
@@ -529,14 +555,13 @@ static int make_request(request_queue_t *q, struct bio * bio)
     * do end_request by hand if all requests finish until we had a
     * chance to set up the semaphore correctly ... lots of races).
     */
-   for (i = 0; i < disks; i++) {
+   for (i=disks; i--; ) {
        struct bio *mbio;
        mbio = r1_bio->write_bios[i];
        if (!mbio)
            continue;

        generic_make_request(mbio);
-       atomic_inc(&conf->mirrors[i].nr_pending);
    }
    return 0;
 }
@@ -550,7 +575,8 @@ static int status(char *page, mddev_t *mddev)
              conf->working_disks);
    for (i = 0; i < conf->raid_disks; i++)
        sz += sprintf(page+sz, "%s",
-                 conf->mirrors[i].operational ? "U" : "_");
+                 conf->mirrors[i].rdev &&
+                 conf->mirrors[i].rdev->in_sync ? "U" : "_");
    sz += sprintf (page+sz, "]");
    return sz;
 }
@@ -571,48 +597,37 @@ static int status(char *page, mddev_t *mddev)
 #define ALREADY_SYNCING KERN_INFO \
 "raid1: syncing already in progress.\n"

-static void mark_disk_bad(mddev_t *mddev, int failed)
-{
-   conf_t *conf = mddev_to_conf(mddev);
-   mirror_info_t *mirror = conf->mirrors+failed;
-
-   mirror->operational = 0;
-   if (!mirror->write_only) {
-       mddev->degraded++;
-       conf->working_disks--;
-   }
-   mddev->sb_dirty = 1;
-   printk(DISK_FAILED, bdev_partition_name(mirror->bdev), conf->working_disks);
-}
-
-static int error(mddev_t *mddev, struct block_device *bdev)
+static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
    conf_t *conf = mddev_to_conf(mddev);
-   mirror_info_t * mirrors = conf->mirrors;
-   int disks = MD_SB_DISKS;
-   int i;

    /*
-    * Find the drive.
     * If it is not operational, then we have already marked it as dead
     * else if it is the last working disks, ignore the error, let the
     * next level up know.
     * else mark the drive as failed
     */
-   for (i = 0; i < disks; i++)
-       if (mirrors[i].bdev == bdev && mirrors[i].operational)
-           break;
-   if (i == disks)
-       return 0;
-
-   if (i < conf->raid_disks && conf->working_disks == 1)
+   if (rdev->in_sync
+       && conf->working_disks == 1)
        /*
         * Don't fail the drive, act as though we were just a
         * normal single drive
         */
-       return 1;
-   mark_disk_bad(mddev, i);
-   return 0;
+       return;
+   if (rdev->in_sync) {
+       mddev->degraded++;
+       conf->working_disks--;
+       /*
+        * if recovery was running, stop it now.
+        */
+       if (mddev->recovery_running)
+           mddev->recovery_running = -EIO;
+   }
+   rdev->in_sync = 0;
+   rdev->faulty = 1;
+   mddev->sb_dirty = 1;
+   printk(DISK_FAILED, bdev_partition_name(rdev->bdev), conf->working_disks);
 }

 static void print_conf(conf_t *conf)
@@ -628,12 +643,12 @@ static void print_conf(conf_t *conf)
    printk(" --- wd:%d rd:%d\n", conf->working_disks,
        conf->raid_disks);

-   for (i = 0; i < MD_SB_DISKS; i++) {
+   for (i = 0; i < conf->raid_disks; i++) {
        tmp = conf->mirrors + i;
-       printk(" disk %d, s:%d, o:%d, us:%d dev:%s\n",
-           i, tmp->spare, tmp->operational,
-           tmp->used_slot,
-           bdev_partition_name(tmp->bdev));
+       if (tmp->rdev)
+           printk(" disk %d, wo:%d, o:%d, dev:%s\n",
+               i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
+               bdev_partition_name(tmp->rdev->bdev));
    }
 }
...@@ -653,160 +668,52 @@ static void close_sync(conf_t *conf) ...@@ -653,160 +668,52 @@ static void close_sync(conf_t *conf)
static int raid1_spare_active(mddev_t *mddev) static int raid1_spare_active(mddev_t *mddev)
{ {
int err = 0; int i;
int i, failed_disk = -1, spare_disk = -1;
conf_t *conf = mddev->private; conf_t *conf = mddev->private;
mirror_info_t *tmp, *sdisk, *fdisk; mirror_info_t *tmp;
mdk_rdev_t *spare_rdev, *failed_rdev;
print_conf(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
/* /*
* Find the failed disk within the RAID1 configuration ... * Find all failed disks within the RAID1 configuration
* (this can only be in the first conf->working_disks part) * and mark them readable
*/ */
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->mirrors + i; tmp = conf->mirrors + i;
if ((!tmp->operational && !tmp->spare) || if (tmp->rdev
!tmp->used_slot) { && !tmp->rdev->faulty
failed_disk = i; && !tmp->rdev->in_sync) {
break; conf->working_disks++;
mddev->degraded--;
tmp->rdev->in_sync = 1;
} }
} }
/*
* When we activate a spare disk we _must_ have a disk in
* the lower (active) part of the array to replace.
*/
if (failed_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
/*
* Find the spare disk ... (can only be in the 'high'
* area of the array)
*/
spare_disk = mddev->spare->raid_disk;
sdisk = conf->mirrors + spare_disk;
fdisk = conf->mirrors + failed_disk;
/*
* do the switch finally
*/
spare_rdev = find_rdev_nr(mddev, spare_disk);
failed_rdev = find_rdev_nr(mddev, failed_disk);
/*
* There must be a spare_rdev, but there may not be a
* failed_rdev. That slot might be empty...
*/
spare_rdev->desc_nr = failed_disk;
spare_rdev->raid_disk = failed_disk;
if (failed_rdev) {
failed_rdev->desc_nr = spare_disk;
failed_rdev->raid_disk = spare_disk;
}
xchg_values(*fdisk, *sdisk);
/*
* (careful, 'failed' and 'spare' are switched from now on)
*
* we want to preserve linear numbering and we want to
* give the proper raid_disk number to the now activated
* disk. (this means we switch back these values)
*/
if (!sdisk->bdev)
sdisk->used_slot = 0;
/*
* this really activates the spare.
*/
fdisk->spare = 0;
fdisk->write_only = 0;
/*
* if we activate a spare, we definitely replace a
* non-operational disk slot in the 'low' area of
* the disk array.
*/
conf->working_disks++;
mddev->degraded--;
abort:
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
print_conf(conf); print_conf(conf);
return err; return 0;
}
static int raid1_spare_inactive(mddev_t *mddev)
{
conf_t *conf = mddev->private;
mirror_info_t *p;
int err = 0;
print_conf(conf);
spin_lock_irq(&conf->device_lock);
p = conf->mirrors + mddev->spare->raid_disk;
if (p) {
p->operational = 0;
p->write_only = 0;
} else {
MD_BUG();
err = 1;
}
spin_unlock_irq(&conf->device_lock);
print_conf(conf);
return err;
} }
static int raid1_spare_write(mddev_t *mddev)
{
conf_t *conf = mddev->private;
mirror_info_t *p;
int err = 0;
print_conf(conf);
spin_lock_irq(&conf->device_lock);
p = conf->mirrors + mddev->spare->raid_disk;
if (p) {
p->operational = 1;
p->write_only = 1;
} else {
MD_BUG();
err = 1;
}
spin_unlock_irq(&conf->device_lock);
print_conf(conf);
return err;
}
static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
conf_t *conf = mddev->private; conf_t *conf = mddev->private;
int err = 1; int found = 0;
mirror_info_t *p = conf->mirrors + rdev->raid_disk; int mirror;
mirror_info_t *p;
print_conf(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (!p->used_slot) { for (mirror=0; mirror < mddev->raid_disks; mirror++)
/* it will be held open by rdev */ if ( !(p=conf->mirrors+mirror)->rdev) {
p->bdev = rdev->bdev; p->rdev = rdev;
p->operational = 0; p->head_position = 0;
p->write_only = 0; rdev->raid_disk = mirror;
p->spare = 1; found = 1;
p->used_slot = 1; break;
p->head_position = 0; }
err = 0;
}
if (err)
MD_BUG();
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
print_conf(conf); print_conf(conf);
return err; return found;
} }
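raid1_add_disk() now reports success as a found flag rather than an inverted error code: it claims the first empty mirror slot and binds the rdev to it. A hedged plain-C sketch of the scan (simplified types; in the kernel this runs under conf->device_lock):

#include <stddef.h>

struct rdev { int raid_disk; };
struct slot { struct rdev *rdev; int head_position; };

static int add_disk(struct slot *mirrors, int raid_disks, struct rdev *rdev)
{
	for (int mirror = 0; mirror < raid_disks; mirror++) {
		if (mirrors[mirror].rdev == NULL) {
			mirrors[mirror].rdev = rdev;		/* claim the slot */
			mirrors[mirror].head_position = 0;
			rdev->raid_disk = mirror;		/* remember where we live */
			return 1;				/* found */
		}
	}
	return 0;						/* array already full */
}

int main(void)
{
	struct slot mirrors[2] = { { NULL, 0 }, { NULL, 0 } };
	struct rdev new_disk = { -1 };
	return add_disk(mirrors, 2, &new_disk) ? 0 : 1;
}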
static int raid1_remove_disk(mddev_t *mddev, int number) static int raid1_remove_disk(mddev_t *mddev, int number)
...@@ -817,13 +724,13 @@ static int raid1_remove_disk(mddev_t *mddev, int number) ...@@ -817,13 +724,13 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
print_conf(conf); print_conf(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (p->used_slot) { if (p->rdev) {
if (p->operational) { if (p->rdev->in_sync ||
atomic_read(&p->rdev->nr_pending)) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
p->bdev = NULL; p->rdev = NULL;
p->used_slot = 0;
err = 0; err = 0;
} }
if (err) if (err)
...@@ -857,9 +764,10 @@ static void end_sync_read(struct bio *bio) ...@@ -857,9 +764,10 @@ static void end_sync_read(struct bio *bio)
*/ */
if (!uptodate) if (!uptodate)
md_error(r1_bio->mddev, md_error(r1_bio->mddev,
conf->mirrors[r1_bio->read_disk].bdev); conf->mirrors[r1_bio->read_disk].rdev);
else else
set_bit(R1BIO_Uptodate, &r1_bio->state); set_bit(R1BIO_Uptodate, &r1_bio->state);
atomic_dec(&conf->mirrors[r1_bio->read_disk].rdev->nr_pending);
reschedule_retry(r1_bio); reschedule_retry(r1_bio);
} }
...@@ -872,13 +780,13 @@ static void end_sync_write(struct bio *bio) ...@@ -872,13 +780,13 @@ static void end_sync_write(struct bio *bio)
int i; int i;
int mirror=0; int mirror=0;
for (i = 0; i < MD_SB_DISKS; i++) for (i = 0; i < conf->raid_disks; i++)
if (r1_bio->write_bios[i] == bio) { if (r1_bio->write_bios[i] == bio) {
mirror = i; mirror = i;
break; break;
} }
if (!uptodate) if (!uptodate)
md_error(mddev, conf->mirrors[mirror].bdev); md_error(mddev, conf->mirrors[mirror].rdev);
update_head_pos(mirror, r1_bio); update_head_pos(mirror, r1_bio);
if (atomic_dec_and_test(&r1_bio->remaining)) { if (atomic_dec_and_test(&r1_bio->remaining)) {
...@@ -886,13 +794,14 @@ static void end_sync_write(struct bio *bio) ...@@ -886,13 +794,14 @@ static void end_sync_write(struct bio *bio)
resume_device(conf); resume_device(conf);
put_buf(r1_bio); put_buf(r1_bio);
} }
atomic_dec(&conf->mirrors[mirror].rdev->nr_pending);
} }
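Both sync completion handlers now drop the rdev->nr_pending reference taken when the bio was submitted. Together with the nr_pending test added to raid1_remove_disk() below, this forms a simple in-flight I/O count: a device can only be unplugged once it is out of sync and idle. A toy model of the discipline (plain ints stand in for the kernel's atomic_t and atomic_inc/atomic_dec/atomic_read):

#include <assert.h>

struct rdev { int nr_pending, in_sync; };

static void submit_io(struct rdev *r)   { r->nr_pending++; }	/* before generic_make_request() */
static void complete_io(struct rdev *r) { r->nr_pending--; }	/* in the bi_end_io handler */

static int try_remove(struct rdev *r)
{
	/* mirrors the remove-disk test: busy while in sync or I/O in flight */
	if (r->in_sync || r->nr_pending)
		return -1;	/* -EBUSY */
	return 0;
}

int main(void)
{
	struct rdev r = { 0, 0 };
	submit_io(&r);
	assert(try_remove(&r) == -1);	/* in flight: refuse */
	complete_io(&r);
	assert(try_remove(&r) == 0);	/* idle and out of sync: safe */
	return 0;
}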
static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
{ {
conf_t *conf = mddev_to_conf(mddev); conf_t *conf = mddev_to_conf(mddev);
int i, sum_bios = 0; int i, sum_bios = 0;
int disks = MD_SB_DISKS; int disks = conf->raid_disks;
struct bio *bio, *mbio; struct bio *bio, *mbio;
bio = r1_bio->master_bio; bio = r1_bio->master_bio;
...@@ -913,25 +822,33 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) ...@@ -913,25 +822,33 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
return; return;
} }
spin_lock_irq(&conf->device_lock);
for (i = 0; i < disks ; i++) { for (i = 0; i < disks ; i++) {
if (!conf->mirrors[i].operational) r1_bio->write_bios[i] = NULL;
if (!conf->mirrors[i].rdev ||
conf->mirrors[i].rdev->faulty)
continue; continue;
if (i == conf->last_used) if (i == conf->last_used)
/* /*
* we read from here, no need to write * we read from here, no need to write
*/ */
continue; continue;
if (i < conf->raid_disks && mddev->in_sync) if (conf->mirrors[i].rdev->in_sync && mddev->in_sync)
/* /*
* don't need to write this we are just rebuilding * don't need to write this we are just rebuilding
*/ */
continue; continue;
atomic_inc(&conf->mirrors[i].rdev->nr_pending);
r1_bio->write_bios[i] = bio;
}
spin_unlock_irq(&conf->device_lock);
for (i = 0; i < disks ; i++) {
if (!r1_bio->write_bios[i])
continue;
mbio = bio_clone(bio, GFP_NOIO); mbio = bio_clone(bio, GFP_NOIO);
if (r1_bio->write_bios[i])
BUG();
r1_bio->write_bios[i] = mbio; r1_bio->write_bios[i] = mbio;
mbio->bi_bdev = conf->mirrors[i].bdev; mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_sector = r1_bio->sector; mbio->bi_sector = r1_bio->sector;
mbio->bi_end_io = end_sync_write; mbio->bi_end_io = end_sync_write;
mbio->bi_rw = WRITE; mbio->bi_rw = WRITE;
...@@ -949,7 +866,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) ...@@ -949,7 +866,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
* Nowhere to write this to... I guess we * Nowhere to write this to... I guess we
* must be done * must be done
*/ */
printk(IO_ERROR, bdev_partition_name(bio->bi_bdev), r1_bio->sector); printk(KERN_ALERT "raid1: sync aborting as there is nowhere to write sector %lu\n", r1_bio->sector);
md_done_sync(mddev, r1_bio->master_bio->bi_size >> 9, 0); md_done_sync(mddev, r1_bio->master_bio->bi_size >> 9, 0);
resume_device(conf); resume_device(conf);
put_buf(r1_bio); put_buf(r1_bio);
...@@ -960,9 +877,8 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) ...@@ -960,9 +877,8 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
if (!mbio) if (!mbio)
continue; continue;
md_sync_acct(mbio->bi_bdev, mbio->bi_size >> 9); md_sync_acct(conf->mirrors[i].rdev, mbio->bi_size >> 9);
generic_make_request(mbio); generic_make_request(mbio);
atomic_inc(&conf->mirrors[i].nr_pending);
} }
} }
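sync_request_write() above is now two-phase: targets are chosen and pinned (nr_pending++) under conf->device_lock, and only after the lock is dropped are the bios cloned with GFP_NOIO and issued, since the allocation may block and must not happen inside a spinlock section. A user-space sketch of the shape (a pthread mutex stands in for the spinlock; the disk count and print are illustrative):

#include <pthread.h>
#include <stdio.h>

#define DISKS 4

struct rdev { int nr_pending, faulty; };

static pthread_mutex_t device_lock = PTHREAD_MUTEX_INITIALIZER;
static struct rdev *mirror[DISKS];
static struct rdev *target[DISKS];

static void write_all(void)
{
	/* phase 1: under the lock, pick targets and pin them */
	pthread_mutex_lock(&device_lock);
	for (int i = 0; i < DISKS; i++) {
		target[i] = NULL;
		if (mirror[i] && !mirror[i]->faulty) {
			mirror[i]->nr_pending++;
			target[i] = mirror[i];
		}
	}
	pthread_mutex_unlock(&device_lock);

	/* phase 2: lock dropped; cloning and submitting may block safely */
	for (int i = 0; i < DISKS; i++)
		if (target[i])
			printf("issue write to disk %d\n", i);
}

int main(void)
{
	struct rdev healthy = { 0, 0 }, dead = { 0, 1 };
	mirror[0] = &healthy;
	mirror[2] = &dead;
	write_all();		/* only disk 0 is written */
	return 0;
}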
...@@ -982,7 +898,7 @@ static void raid1d(void *data) ...@@ -982,7 +898,7 @@ static void raid1d(void *data)
unsigned long flags; unsigned long flags;
mddev_t *mddev; mddev_t *mddev;
conf_t *conf; conf_t *conf;
struct block_device *bdev; mdk_rdev_t *rdev;
for (;;) { for (;;) {
...@@ -1002,20 +918,18 @@ static void raid1d(void *data) ...@@ -1002,20 +918,18 @@ static void raid1d(void *data)
break; break;
case READ: case READ:
case READA: case READA:
bdev = bio->bi_bdev; if (map(mddev, &rdev) == -1) {
map(mddev, &bio->bi_bdev);
if (bio->bi_bdev == bdev) {
printk(IO_ERROR, bdev_partition_name(bio->bi_bdev), r1_bio->sector); printk(IO_ERROR, bdev_partition_name(bio->bi_bdev), r1_bio->sector);
raid_end_bio_io(r1_bio, 0); raid_end_bio_io(r1_bio, 0);
break; break;
} }
printk(REDIRECT_SECTOR, printk(REDIRECT_SECTOR,
bdev_partition_name(bio->bi_bdev), r1_bio->sector); bdev_partition_name(rdev->bdev), r1_bio->sector);
bio->bi_bdev = rdev->bdev;
bio->bi_sector = r1_bio->sector; bio->bi_sector = r1_bio->sector;
bio->bi_rw = r1_bio->cmd; bio->bi_rw = r1_bio->cmd;
generic_make_request(bio); generic_make_request(bio);
atomic_inc(&conf->mirrors[r1_bio->read_disk].nr_pending);
break; break;
} }
} }
...@@ -1081,7 +995,9 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1081,7 +995,9 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
*/ */
disk = conf->last_used; disk = conf->last_used;
/* make sure disk is operational */ /* make sure disk is operational */
while (!conf->mirrors[disk].operational) { spin_lock_irq(&conf->device_lock);
while (conf->mirrors[disk].rdev == NULL ||
!conf->mirrors[disk].rdev->in_sync) {
if (disk <= 0) if (disk <= 0)
disk = conf->raid_disks; disk = conf->raid_disks;
disk--; disk--;
...@@ -1089,6 +1005,8 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1089,6 +1005,8 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
break; break;
} }
conf->last_used = disk; conf->last_used = disk;
atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
spin_unlock_irq(&conf->device_lock);
mirror = conf->mirrors + conf->last_used; mirror = conf->mirrors + conf->last_used;
...@@ -1119,7 +1037,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1119,7 +1037,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
read_bio = bio_clone(r1_bio->master_bio, GFP_NOIO); read_bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
read_bio->bi_sector = sector_nr; read_bio->bi_sector = sector_nr;
read_bio->bi_bdev = mirror->bdev; read_bio->bi_bdev = mirror->rdev->bdev;
read_bio->bi_end_io = end_sync_read; read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ; read_bio->bi_rw = READ;
read_bio->bi_private = r1_bio; read_bio->bi_private = r1_bio;
...@@ -1128,10 +1046,9 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) ...@@ -1128,10 +1046,9 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
BUG(); BUG();
r1_bio->read_bio = read_bio; r1_bio->read_bio = read_bio;
md_sync_acct(read_bio->bi_bdev, nr_sectors); md_sync_acct(mirror->rdev, nr_sectors);
generic_make_request(read_bio); generic_make_request(read_bio);
atomic_inc(&conf->mirrors[conf->last_used].nr_pending);
return nr_sectors; return nr_sectors;
} }
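The resync read source is now picked under the device lock: scan downward from last_used, with wraparound, for a slot whose rdev exists and is in_sync, then pin it with nr_pending before dropping the lock. A standalone sketch of the wraparound scan (an explicit "nothing usable" return is added here for clarity; the kernel loop instead breaks when it arrives back at last_used):

#include <stdio.h>

struct rdev { int in_sync; };
struct slot { struct rdev *rdev; };

static int pick_source(struct slot *m, int raid_disks, int last_used)
{
	int disk = last_used;
	while (m[disk].rdev == NULL || !m[disk].rdev->in_sync) {
		if (disk <= 0)
			disk = raid_disks;
		disk--;
		if (disk == last_used)
			return -1;	/* every slot scanned: none usable */
	}
	return disk;
}

int main(void)
{
	struct rdev ok = { 1 };
	struct slot m[3] = { { NULL }, { &ok }, { NULL } };
	printf("source=%d\n", pick_source(m, 3, 2));	/* prints 1 */
	return 0;
}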
...@@ -1209,61 +1126,18 @@ static int run(mddev_t *mddev) ...@@ -1209,61 +1126,18 @@ static int run(mddev_t *mddev)
goto out; goto out;
} }
/* ITERATE_RDEV(mddev, rdev, tmp) expands to a list walk of mddev->disks via the same_set member, setting rdev to each entry in turn. */
ITERATE_RDEV(mddev, rdev, tmp) { ITERATE_RDEV(mddev, rdev, tmp) {
if (rdev->faulty) {
printk(ERRORS, bdev_partition_name(rdev->bdev));
} else {
if (!rdev->sb) {
MD_BUG();
continue;
}
}
if (rdev->desc_nr == -1) {
MD_BUG();
continue;
}
disk_idx = rdev->raid_disk; disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
disk = conf->mirrors + disk_idx; disk = conf->mirrors + disk_idx;
if (rdev->faulty) { disk->rdev = rdev;
disk->bdev = rdev->bdev; disk->head_position = 0;
disk->operational = 0; if (!rdev->faulty && rdev->in_sync)
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
disk->head_position = 0;
continue;
}
if (rdev->in_sync) {
if (disk->operational) {
printk(ALREADY_RUNNING,
bdev_partition_name(rdev->bdev),
disk_idx);
continue;
}
printk(OPERATIONAL, bdev_partition_name(rdev->bdev),
disk_idx);
disk->bdev = rdev->bdev;
disk->operational = 1;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
disk->head_position = 0;
conf->working_disks++; conf->working_disks++;
} else {
/*
* Must be a spare disk ..
*/
printk(SPARE, bdev_partition_name(rdev->bdev));
disk->bdev = rdev->bdev;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 1;
disk->used_slot = 1;
disk->head_position = 0;
}
} }
conf->raid_disks = mddev->raid_disks; conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev; conf->mddev = mddev;
...@@ -1283,23 +1157,19 @@ static int run(mddev_t *mddev) ...@@ -1283,23 +1157,19 @@ static int run(mddev_t *mddev)
disk = conf->mirrors + i; disk = conf->mirrors + i;
if (!disk->used_slot) { if (!disk->rdev) {
disk->bdev = NULL;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
disk->head_position = 0; disk->head_position = 0;
}
if (!disk->used_slot)
mddev->degraded++; mddev->degraded++;
}
} }
/* /*
* find the first working one and use it as a starting point * find the first working one and use it as a starting point
* to read balancing. * to read balancing.
*/ */
for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) for (j = 0; j < conf->raid_disks &&
(!conf->mirrors[j].rdev ||
!conf->mirrors[j].rdev->in_sync) ; j++)
/* nothing */; /* nothing */;
conf->last_used = j; conf->last_used = j;
...@@ -1354,8 +1224,6 @@ static mdk_personality_t raid1_personality = ...@@ -1354,8 +1224,6 @@ static mdk_personality_t raid1_personality =
.error_handler = error, .error_handler = error,
.hot_add_disk = raid1_add_disk, .hot_add_disk = raid1_add_disk,
.hot_remove_disk= raid1_remove_disk, .hot_remove_disk= raid1_remove_disk,
.spare_write = raid1_spare_write,
.spare_inactive = raid1_spare_inactive,
.spare_active = raid1_spare_active, .spare_active = raid1_spare_active,
.sync_request = sync_request, .sync_request = sync_request,
}; };
......
...@@ -371,9 +371,10 @@ static void raid5_end_read_request (struct bio * bi) ...@@ -371,9 +371,10 @@ static void raid5_end_read_request (struct bio * bi)
set_bit(R5_UPTODATE, &sh->dev[i].flags); set_bit(R5_UPTODATE, &sh->dev[i].flags);
#endif #endif
} else { } else {
md_error(conf->mddev, conf->disks[i].bdev); md_error(conf->mddev, conf->disks[i].rdev);
clear_bit(R5_UPTODATE, &sh->dev[i].flags); clear_bit(R5_UPTODATE, &sh->dev[i].flags);
} }
atomic_dec(&conf->disks[i].rdev->nr_pending);
#if 0 #if 0
/* must restore b_page before unlocking buffer... */ /* must restore b_page before unlocking buffer... */
if (sh->bh_page[i] != bh->b_page) { if (sh->bh_page[i] != bh->b_page) {
...@@ -407,7 +408,9 @@ static void raid5_end_write_request (struct bio *bi) ...@@ -407,7 +408,9 @@ static void raid5_end_write_request (struct bio *bi)
spin_lock_irqsave(&conf->device_lock, flags); spin_lock_irqsave(&conf->device_lock, flags);
if (!uptodate) if (!uptodate)
md_error(conf->mddev, conf->disks[i].bdev); md_error(conf->mddev, conf->disks[i].rdev);
atomic_dec(&conf->disks[i].rdev->nr_pending);
clear_bit(R5_LOCKED, &sh->dev[i].flags); clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
...@@ -420,7 +423,6 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i); ...@@ -420,7 +423,6 @@ static unsigned long compute_blocknr(struct stripe_head *sh, int i);
static void raid5_build_block (struct stripe_head *sh, int i) static void raid5_build_block (struct stripe_head *sh, int i)
{ {
raid5_conf_t *conf = sh->raid_conf;
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
bio_init(&dev->req); bio_init(&dev->req);
...@@ -430,7 +432,6 @@ static void raid5_build_block (struct stripe_head *sh, int i) ...@@ -430,7 +432,6 @@ static void raid5_build_block (struct stripe_head *sh, int i)
dev->vec.bv_len = STRIPE_SIZE; dev->vec.bv_len = STRIPE_SIZE;
dev->vec.bv_offset = 0; dev->vec.bv_offset = 0;
dev->req.bi_bdev = conf->disks[i].bdev;
dev->req.bi_sector = sh->sector; dev->req.bi_sector = sh->sector;
dev->req.bi_private = sh; dev->req.bi_private = sh;
...@@ -439,54 +440,30 @@ static void raid5_build_block (struct stripe_head *sh, int i) ...@@ -439,54 +440,30 @@ static void raid5_build_block (struct stripe_head *sh, int i)
dev->sector = compute_blocknr(sh, i); dev->sector = compute_blocknr(sh, i);
} }
static int error(mddev_t *mddev, struct block_device *bdev) static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
raid5_conf_t *conf = (raid5_conf_t *) mddev->private; raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
struct disk_info *disk;
int i;
PRINTK("raid5: error called\n"); PRINTK("raid5: error called\n");
for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { if (!rdev->faulty) {
if (disk->bdev != bdev) mddev->sb_dirty = 1;
continue; conf->working_disks--;
if (disk->operational) { if (rdev->in_sync) {
disk->operational = 0;
mddev->sb_dirty = 1;
mddev->degraded++; mddev->degraded++;
conf->working_disks--;
conf->failed_disks++; conf->failed_disks++;
printk (KERN_ALERT rdev->in_sync = 0;
"raid5: Disk failure on %s, disabling device." /*
" Operation continuing on %d devices\n", * if recovery was running, stop it now.
bdev_partition_name(bdev), conf->working_disks); */
} if (mddev->recovery_running)
return 0; mddev->recovery_running = -EIO;
}
/*
* handle errors in spares (during reconstruction)
*/
if (conf->spare) {
disk = conf->spare;
if (disk->bdev == bdev) {
printk (KERN_ALERT
"raid5: Disk failure on spare %s\n",
bdev_partition_name (bdev));
if (!conf->spare->operational) {
/* probably a SET_DISK_FAULTY ioctl */
return -EIO;
}
disk->operational = 0;
disk->write_only = 0;
conf->spare = NULL;
mddev->sb_dirty = 1;
return 0;
} }
rdev->faulty = 1;
printk (KERN_ALERT
"raid5: Disk failure on %s, disabling device."
" Operation continuing on %d devices\n",
bdev_partition_name(rdev->bdev), conf->working_disks);
} }
MD_BUG();
return -EIO;
} }
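The rewritten error handler keys everything off the rdev itself: a second failure report on an already-faulty device is a no-op, and only an in-sync member bumps the degraded/failed counters and aborts a running resync. A compilable model of the transitions (plain fields instead of the mddev/conf kernel types; -5 stands in for -EIO):

#include <stdio.h>

struct rdev { int faulty, in_sync; };
struct conf { int working_disks, failed_disks; };
struct md   { int degraded, sb_dirty, recovery_running; };

static void md_error_model(struct md *mddev, struct conf *conf, struct rdev *rdev)
{
	if (rdev->faulty)
		return;			/* duplicate report: nothing to do */
	mddev->sb_dirty = 1;
	conf->working_disks--;
	if (rdev->in_sync) {
		mddev->degraded++;
		conf->failed_disks++;
		rdev->in_sync = 0;
		if (mddev->recovery_running)
			mddev->recovery_running = -5;	/* stop a resync in progress */
	}
	rdev->faulty = 1;
}

int main(void)
{
	struct md m = { .degraded = 0, .recovery_running = 1 };
	struct conf c = { .working_disks = 4, .failed_disks = 0 };
	struct rdev r = { .faulty = 0, .in_sync = 1 };

	md_error_model(&m, &c, &r);
	md_error_model(&m, &c, &r);	/* idempotent on the second call */
	printf("working=%d failed=%d degraded=%d recovery=%d\n",
	       c.working_disks, c.failed_disks, m.degraded, m.recovery_running);
	return 0;
}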
/* /*
...@@ -681,7 +658,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx) ...@@ -681,7 +658,7 @@ static void compute_block(struct stripe_head *sh, int dd_idx)
} }
if (count != 1) if (count != 1)
xor_block(count, STRIPE_SIZE, ptr); xor_block(count, STRIPE_SIZE, ptr);
set_bit(R5_UPTODATE, &sh->dev[i].flags); set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
} }
static void compute_parity(struct stripe_head *sh, int method) static void compute_parity(struct stripe_head *sh, int method)
...@@ -840,7 +817,6 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -840,7 +817,6 @@ static void handle_stripe(struct stripe_head *sh)
int disks = conf->raid_disks; int disks = conf->raid_disks;
struct bio *return_bi= NULL; struct bio *return_bi= NULL;
struct bio *bi; struct bio *bi;
int action[MD_SB_DISKS];
int i; int i;
int syncing; int syncing;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
...@@ -848,7 +824,6 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -848,7 +824,6 @@ static void handle_stripe(struct stripe_head *sh)
struct r5dev *dev; struct r5dev *dev;
PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx); PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
memset(action, 0, sizeof(action));
spin_lock(&sh->lock); spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_HANDLE, &sh->state);
...@@ -858,7 +833,13 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -858,7 +833,13 @@ static void handle_stripe(struct stripe_head *sh)
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
for (i=disks; i--; ) { for (i=disks; i--; ) {
mdk_rdev_t *rdev;
dev = &sh->dev[i]; dev = &sh->dev[i];
clear_bit(R5_Wantread, &dev->flags);
clear_bit(R5_Wantwrite, &dev->flags);
clear_bit(R5_Insync, &dev->flags);
clear_bit(R5_Syncio, &dev->flags);
PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i, PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i,
dev->flags, dev->toread, dev->towrite, dev->written); dev->flags, dev->toread, dev->towrite, dev->written);
/* maybe we can reply to a read */ /* maybe we can reply to a read */
...@@ -890,10 +871,12 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -890,10 +871,12 @@ static void handle_stripe(struct stripe_head *sh)
if (dev->toread) to_read++; if (dev->toread) to_read++;
if (dev->towrite) to_write++; if (dev->towrite) to_write++;
if (dev->written) written++; if (dev->written) written++;
if (!conf->disks[i].operational) { rdev = conf->disks[i].rdev; /* FIXME, should I be looking at rdev */
if (!rdev || !rdev->in_sync) {
failed++; failed++;
failed_num = i; failed_num = i;
} } else
set_bit(R5_Insync, &dev->flags);
} }
PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n", PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
locked, uptodate, to_read, to_write, failed, failed_num); locked, uptodate, to_read, to_write, failed, failed_num);
...@@ -918,7 +901,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -918,7 +901,7 @@ static void handle_stripe(struct stripe_head *sh)
bi = nextbi; bi = nextbi;
} }
/* fail any reads if this device is non-operational */ /* fail any reads if this device is non-operational */
if (!conf->disks[i].operational) { if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
bi = sh->dev[i].toread; bi = sh->dev[i].toread;
sh->dev[i].toread = NULL; sh->dev[i].toread = NULL;
if (bi) to_read--; if (bi) to_read--;
...@@ -946,7 +929,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -946,7 +929,7 @@ static void handle_stripe(struct stripe_head *sh)
*/ */
dev = &sh->dev[sh->pd_idx]; dev = &sh->dev[sh->pd_idx];
if ( written && if ( written &&
( (conf->disks[sh->pd_idx].operational && !test_bit(R5_LOCKED, &dev->flags) && ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags)) test_bit(R5_UPTODATE, &dev->flags))
|| (failed == 1 && failed_num == sh->pd_idx)) || (failed == 1 && failed_num == sh->pd_idx))
) { ) {
...@@ -954,7 +937,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -954,7 +937,7 @@ static void handle_stripe(struct stripe_head *sh)
for (i=disks; i--; ) for (i=disks; i--; )
if (sh->dev[i].written) { if (sh->dev[i].written) {
dev = &sh->dev[i]; dev = &sh->dev[i];
if (!conf->disks[sh->pd_idx].operational || if (!test_bit(R5_Insync, &dev->flags) &&
(!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) { (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) {
/* maybe we can return some write requests */ /* maybe we can return some write requests */
struct bio *wbi, *wbi2; struct bio *wbi, *wbi2;
...@@ -988,9 +971,9 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -988,9 +971,9 @@ static void handle_stripe(struct stripe_head *sh)
PRINTK("Computing block %d\n", i); PRINTK("Computing block %d\n", i);
compute_block(sh, i); compute_block(sh, i);
uptodate++; uptodate++;
} else if (conf->disks[i].operational) { } else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
action[i] = READ+1; set_bit(R5_Wantread, &dev->flags);
#if 0 #if 0
/* if I am just reading this block and we don't have /* if I am just reading this block and we don't have
a failed drive, or any pending writes then sidestep the cache */ a failed drive, or any pending writes then sidestep the cache */
...@@ -1003,7 +986,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1003,7 +986,7 @@ static void handle_stripe(struct stripe_head *sh)
locked++; locked++;
PRINTK("Reading block %d (sync=%d)\n", i, syncing); PRINTK("Reading block %d (sync=%d)\n", i, syncing);
if (syncing) if (syncing)
md_sync_acct(conf->disks[i].bdev, STRIPE_SECTORS); md_sync_acct(conf->disks[i].rdev, STRIPE_SECTORS);
} }
} }
} }
...@@ -1023,7 +1006,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1023,7 +1006,7 @@ static void handle_stripe(struct stripe_head *sh)
#endif #endif
) && ) &&
!test_bit(R5_UPTODATE, &dev->flags)) { !test_bit(R5_UPTODATE, &dev->flags)) {
if (conf->disks[i].operational if (test_bit(R5_Insync, &dev->flags)
/* && !(!mddev->insync && i == sh->pd_idx) */ /* && !(!mddev->insync && i == sh->pd_idx) */
) )
rmw++; rmw++;
...@@ -1037,7 +1020,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1037,7 +1020,7 @@ static void handle_stripe(struct stripe_head *sh)
#endif #endif
) && ) &&
!test_bit(R5_UPTODATE, &dev->flags)) { !test_bit(R5_UPTODATE, &dev->flags)) {
if (conf->disks[i].operational) rcw++; if (test_bit(R5_Insync, &dev->flags)) rcw++;
else rcw += 2*disks; else rcw += 2*disks;
} }
} }
...@@ -1049,12 +1032,12 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1049,12 +1032,12 @@ static void handle_stripe(struct stripe_head *sh)
dev = &sh->dev[i]; dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) && if ((dev->towrite || i == sh->pd_idx) &&
!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
conf->disks[i].operational) { test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
{ {
PRINTK("Read_old block %d for r-m-w\n", i); PRINTK("Read_old block %d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
action[i] = READ+1; set_bit(R5_Wantread, &dev->flags);
locked++; locked++;
} else { } else {
set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_DELAYED, &sh->state);
...@@ -1068,12 +1051,12 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1068,12 +1051,12 @@ static void handle_stripe(struct stripe_head *sh)
dev = &sh->dev[i]; dev = &sh->dev[i];
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
conf->disks[i].operational) { test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
{ {
PRINTK("Read_old block %d for Reconstruct\n", i); PRINTK("Read_old block %d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
action[i] = READ+1; set_bit(R5_Wantread, &dev->flags);
locked++; locked++;
} else { } else {
set_bit(STRIPE_DELAYED, &sh->state); set_bit(STRIPE_DELAYED, &sh->state);
...@@ -1090,8 +1073,8 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1090,8 +1073,8 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(R5_LOCKED, &sh->dev[i].flags)) { if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
PRINTK("Writing block %d\n", i); PRINTK("Writing block %d\n", i);
locked++; locked++;
action[i] = WRITE+1; set_bit(R5_Wantwrite, &sh->dev[i].flags);
if (!conf->disks[i].operational if (!test_bit(R5_Insync, &sh->dev[i].flags)
|| (i==sh->pd_idx && failed == 0)) || (i==sh->pd_idx && failed == 0))
set_bit(STRIPE_INSYNC, &sh->state); set_bit(STRIPE_INSYNC, &sh->state);
} }
...@@ -1124,7 +1107,6 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1124,7 +1107,6 @@ static void handle_stripe(struct stripe_head *sh)
} }
} }
if (!test_bit(STRIPE_INSYNC, &sh->state)) { if (!test_bit(STRIPE_INSYNC, &sh->state)) {
struct disk_info *spare;
if (failed==0) if (failed==0)
failed_num = sh->pd_idx; failed_num = sh->pd_idx;
/* should be able to compute the missing block and write it to spare */ /* should be able to compute the missing block and write it to spare */
...@@ -1138,14 +1120,10 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1138,14 +1120,10 @@ static void handle_stripe(struct stripe_head *sh)
BUG(); BUG();
dev = &sh->dev[failed_num]; dev = &sh->dev[failed_num];
set_bit(R5_LOCKED, &dev->flags); set_bit(R5_LOCKED, &dev->flags);
action[failed_num] = WRITE+1; set_bit(R5_Wantwrite, &dev->flags);
locked++; locked++;
set_bit(STRIPE_INSYNC, &sh->state); set_bit(STRIPE_INSYNC, &sh->state);
if (conf->disks[failed_num].operational) set_bit(R5_Syncio, &dev->flags);
md_sync_acct(conf->disks[failed_num].bdev, STRIPE_SECTORS);
else if ((spare=conf->spare))
md_sync_acct(spare->bdev, STRIPE_SECTORS);
} }
} }
if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
...@@ -1161,27 +1139,34 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1161,27 +1139,34 @@ static void handle_stripe(struct stripe_head *sh)
bi->bi_end_io(bi); bi->bi_end_io(bi);
} }
for (i=disks; i-- ;) for (i=disks; i-- ;)
if (action[i]) { if (sh->dev[i].flags & ((1<<R5_Wantwrite)|(1<<R5_Wantread))) {
struct bio *bi = &sh->dev[i].req; struct bio *bi = &sh->dev[i].req;
struct disk_info *spare = conf->spare; mdk_rdev_t *rdev ;
int skip = 0;
if (action[i] == READ+1) bi->bi_rw = 0;
if (test_bit(R5_Wantread, &sh->dev[i].flags))
bi->bi_end_io = raid5_end_read_request; bi->bi_end_io = raid5_end_read_request;
else else {
bi->bi_end_io = raid5_end_write_request; bi->bi_end_io = raid5_end_write_request;
if (conf->disks[i].operational) bi->bi_rw = 1;
bi->bi_bdev = conf->disks[i].bdev; }
else if (spare && action[i] == WRITE+1)
bi->bi_bdev = spare->bdev; spin_lock_irq(&conf->device_lock);
else skip=1; rdev = conf->disks[i].rdev;
if (!skip) { if (rdev && rdev->faulty)
PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i); rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
spin_unlock_irq(&conf->device_lock);
if (rdev) {
if (test_bit(R5_Syncio, &sh->dev[i].flags))
md_sync_acct(rdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, bi->bi_rw, i);
atomic_inc(&sh->count); atomic_inc(&sh->count);
bi->bi_sector = sh->sector; bi->bi_sector = sh->sector;
if (action[i] == READ+1)
bi->bi_rw = 0;
else
bi->bi_rw = 1;
bi->bi_flags = 0; bi->bi_flags = 0;
bi->bi_vcnt = 1; bi->bi_vcnt = 1;
bi->bi_idx = 0; bi->bi_idx = 0;
...@@ -1190,7 +1175,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -1190,7 +1175,7 @@ static void handle_stripe(struct stripe_head *sh)
bi->bi_next = NULL; bi->bi_next = NULL;
generic_make_request(bi); generic_make_request(bi);
} else { } else {
PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector); PRINTK("skip op %d on disc %d for sector %ld\n", bi->bi_rw, i, sh->sector);
clear_bit(R5_LOCKED, &dev->flags); clear_bit(R5_LOCKED, &dev->flags);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
} }
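With the action[MD_SB_DISKS] array gone, request intent lives in each dev's flags word (R5_Wantread/R5_Wantwrite, plus R5_Syncio for resync accounting), and the rdev is pinned under device_lock before submission. A small model of the flag test that now drives the submit loop (bit numbers are illustrative):

#include <stdio.h>

enum { R5_Wantread, R5_Wantwrite };

struct dev { unsigned long flags; };

static void submit_pending(struct dev *devs, int n)
{
	for (int i = n; i--; ) {
		if (!(devs[i].flags & ((1 << R5_Wantwrite) | (1 << R5_Wantread))))
			continue;
		int rw = (devs[i].flags & (1 << R5_Wantwrite)) ? 1 : 0;
		printf("schedule op %d on disc %d\n", rw, i);	/* 0 = read, 1 = write */
	}
}

int main(void)
{
	struct dev devs[4] = { { 0 } };
	devs[0].flags |= 1 << R5_Wantread;
	devs[3].flags |= 1 << R5_Wantwrite;
	submit_pending(devs, 4);
	return 0;
}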
...@@ -1363,7 +1348,7 @@ static void raid5d (void *data) ...@@ -1363,7 +1348,7 @@ static void raid5d (void *data)
static int run (mddev_t *mddev) static int run (mddev_t *mddev)
{ {
raid5_conf_t *conf; raid5_conf_t *conf;
int i, raid_disk, memory; int raid_disk, memory;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
struct disk_info *disk; struct disk_info *disk;
struct list_head *tmp; struct list_head *tmp;
...@@ -1399,60 +1384,17 @@ static int run (mddev_t *mddev) ...@@ -1399,60 +1384,17 @@ static int run (mddev_t *mddev)
PRINTK("raid5: run(md%d) called.\n", mdidx(mddev)); PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
ITERATE_RDEV(mddev,rdev,tmp) { ITERATE_RDEV(mddev,rdev,tmp) {
/*
* This is important -- we are using the descriptor on
* the disk only to get a pointer to the descriptor on
* the main superblock, which might be more recent.
*/
raid_disk = rdev->raid_disk; raid_disk = rdev->raid_disk;
if (raid_disk > mddev->raid_disks
|| raid_disk < 0)
continue;
disk = conf->disks + raid_disk; disk = conf->disks + raid_disk;
if (rdev->faulty) { disk->rdev = rdev;
printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", bdev_partition_name(rdev->bdev));
disk->bdev = rdev->bdev;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
continue;
}
if (rdev->in_sync) { if (rdev->in_sync) {
if (disk->operational) {
printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", bdev_partition_name(rdev->bdev), raid_disk);
continue;
}
printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", bdev_partition_name(rdev->bdev), raid_disk); printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", bdev_partition_name(rdev->bdev), raid_disk);
disk->bdev = rdev->bdev;
disk->operational = 1;
disk->used_slot = 1;
conf->working_disks++; conf->working_disks++;
} else {
/*
* Must be a spare disk ..
*/
printk(KERN_INFO "raid5: spare disk %s\n", bdev_partition_name(rdev->bdev));
disk->bdev = rdev->bdev;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 1;
disk->used_slot = 1;
}
}
for (i = 0; i < conf->raid_disks; i++) {
disk = conf->disks + i;
if (!disk->used_slot) {
disk->bdev = NULL;
disk->operational = 0;
disk->write_only = 0;
disk->spare = 0;
disk->used_slot = 1;
} }
} }
...@@ -1467,14 +1409,6 @@ static int run (mddev_t *mddev) ...@@ -1467,14 +1409,6 @@ static int run (mddev_t *mddev)
conf->algorithm = mddev->layout; conf->algorithm = mddev->layout;
conf->max_nr_stripes = NR_STRIPES; conf->max_nr_stripes = NR_STRIPES;
#if 0
for (i = 0; i < conf->raid_disks; i++) {
if (!conf->disks[i].used_slot) {
MD_BUG();
goto abort;
}
}
#endif
if (!conf->chunk_size || conf->chunk_size % 4) { if (!conf->chunk_size || conf->chunk_size % 4) {
printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev)); printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
goto abort; goto abort;
...@@ -1519,7 +1453,7 @@ static int run (mddev_t *mddev) ...@@ -1519,7 +1453,7 @@ static int run (mddev_t *mddev)
mddev->raid_disks-mddev->degraded, mddev->raid_disks, conf->algorithm); mddev->raid_disks-mddev->degraded, mddev->raid_disks, conf->algorithm);
else else
printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev),
mddev->raid_disks = mddev->degraded, mddev->raid_disks, conf->algorithm); mddev->raid_disks - mddev->degraded, mddev->raid_disks, conf->algorithm);
print_raid5_conf(conf); print_raid5_conf(conf);
...@@ -1596,7 +1530,9 @@ static int status (char *page, mddev_t *mddev) ...@@ -1596,7 +1530,9 @@ static int status (char *page, mddev_t *mddev)
sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout); sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks);
for (i = 0; i < conf->raid_disks; i++) for (i = 0; i < conf->raid_disks; i++)
sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); sz += sprintf (page+sz, "%s",
conf->disks[i].rdev &&
conf->disks[i].rdev->in_sync ? "U" : "_");
sz += sprintf (page+sz, "]"); sz += sprintf (page+sz, "]");
#if RAID5_DEBUG #if RAID5_DEBUG
#define D(x) \ #define D(x) \
...@@ -1619,149 +1555,36 @@ static void print_raid5_conf (raid5_conf_t *conf) ...@@ -1619,149 +1555,36 @@ static void print_raid5_conf (raid5_conf_t *conf)
printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
conf->working_disks, conf->failed_disks); conf->working_disks, conf->failed_disks);
#if RAID5_DEBUG for (i = 0; i < conf->raid_disks; i++) {
for (i = 0; i < MD_SB_DISKS; i++) {
#else
for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
#endif
tmp = conf->disks + i; tmp = conf->disks + i;
printk(" disk %d, s:%d, o:%d, us:%d dev:%s\n", if (tmp->rdev)
i, tmp->spare,tmp->operational, printk(" disk %d, o:%d, dev:%s\n",
tmp->used_slot, i, !tmp->rdev->faulty,
bdev_partition_name(tmp->bdev)); bdev_partition_name(tmp->rdev->bdev));
} }
} }
static int raid5_spare_active(mddev_t *mddev) static int raid5_spare_active(mddev_t *mddev)
{ {
int err = 0; int i;
int i, failed_disk=-1, spare_disk=-1;
raid5_conf_t *conf = mddev->private; raid5_conf_t *conf = mddev->private;
struct disk_info *tmp, *sdisk, *fdisk; struct disk_info *tmp;
mdk_rdev_t *spare_rdev, *failed_rdev;
print_raid5_conf(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
tmp = conf->disks + i; tmp = conf->disks + i;
if ((!tmp->operational && !tmp->spare) || if (tmp->rdev
!tmp->used_slot) { && !tmp->rdev->faulty
failed_disk = i; && !tmp->rdev->in_sync) {
break; mddev->degraded--;
conf->failed_disks--;
conf->working_disks++;
tmp->rdev->in_sync = 1;
} }
} }
if (failed_disk == -1) {
MD_BUG();
err = 1;
goto abort;
}
/*
* Find the spare disk ... (can only be in the 'high'
* area of the array)
*/
spare_disk = mddev->spare->raid_disk;
if (!conf->spare) {
MD_BUG();
err = 1;
goto abort;
}
sdisk = conf->disks + spare_disk;
fdisk = conf->disks + failed_disk;
/*
* do the switch finally
*/
spare_rdev = find_rdev_nr(mddev, spare_disk);
failed_rdev = find_rdev_nr(mddev, failed_disk);
/* There must be a spare_rdev, but there may not be a
* failed_rdev. That slot might be empty...
*/
spare_rdev->desc_nr = failed_disk;
spare_rdev->raid_disk = failed_disk;
if (failed_rdev) {
failed_rdev->desc_nr = spare_disk;
failed_rdev->raid_disk = spare_disk;
}
xchg_values(*fdisk, *sdisk);
/*
* (careful, 'failed' and 'spare' are switched from now on)
*
* we want to preserve linear numbering and we want to
* give the proper raid_disk number to the now activated
* disk. (this means we switch back these values)
*/
if (!sdisk->bdev)
sdisk->used_slot = 0;
/*
* this really activates the spare.
*/
fdisk->spare = 0;
fdisk->write_only = 0;
/*
* if we activate a spare, we definitely replace a
* non-operational disk slot in the 'low' area of
* the disk array.
*/
mddev->degraded--;
conf->failed_disks--;
conf->working_disks++;
conf->spare = NULL;
abort:
spin_unlock_irq(&conf->device_lock);
print_raid5_conf(conf);
return err;
}
static int raid5_spare_inactive(mddev_t *mddev)
{
raid5_conf_t *conf = mddev->private;
struct disk_info *p;
int err = 0;
print_raid5_conf(conf);
spin_lock_irq(&conf->device_lock);
p = conf->disks + mddev->spare->raid_disk;
if (p) {
p->operational = 0;
p->write_only = 0;
if (conf->spare == p)
conf->spare = NULL;
} else {
MD_BUG();
err = 1;
}
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
print_raid5_conf(conf); print_raid5_conf(conf);
return err; return 0;
}
static int raid5_spare_write(mddev_t *mddev)
{
raid5_conf_t *conf = mddev->private;
struct disk_info *p;
int err = 0;
print_raid5_conf(conf);
spin_lock_irq(&conf->device_lock);
p = conf->disks + mddev->spare->raid_disk;
if (p && !conf->spare) {
p->operational = 1;
p->write_only = 1;
conf->spare = p;
} else {
MD_BUG();
err = 1;
}
spin_unlock_irq(&conf->device_lock);
print_raid5_conf(conf);
return err;
} }
static int raid5_remove_disk(mddev_t *mddev, int number) static int raid5_remove_disk(mddev_t *mddev, int number)
...@@ -1773,13 +1596,13 @@ static int raid5_remove_disk(mddev_t *mddev, int number) ...@@ -1773,13 +1596,13 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
print_raid5_conf(conf); print_raid5_conf(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
if (p->used_slot) { if (p->rdev) {
if (p->operational) { if (p->rdev->in_sync ||
atomic_read(&p->rdev->nr_pending)) {
err = -EBUSY; err = -EBUSY;
goto abort; goto abort;
} }
p->bdev = NULL; p->rdev = NULL;
p->used_slot = 0;
err = 0; err = 0;
} }
if (err) if (err)
...@@ -1793,29 +1616,25 @@ static int raid5_remove_disk(mddev_t *mddev, int number) ...@@ -1793,29 +1616,25 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
{ {
raid5_conf_t *conf = mddev->private; raid5_conf_t *conf = mddev->private;
int err = 1; int found = 0;
struct disk_info *p = conf->disks + rdev->raid_disk; int disk;
struct disk_info *p;
print_raid5_conf(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
/* /*
* find the disk ... * find the disk ...
*/ */
for (disk=0; disk < mddev->raid_disks; disk++)
if (!p->used_slot) { if ((p=conf->disks + disk)->rdev == NULL) {
/* it will be held open by rdev */ p->rdev = rdev;
p->bdev = rdev->bdev; rdev->in_sync = 0;
p->operational = 0; rdev->raid_disk = disk;
p->write_only = 0; found = 1;
p->spare = 1; break;
p->used_slot = 1; }
err = 0;
}
if (err)
MD_BUG();
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
print_raid5_conf(conf); print_raid5_conf(conf);
return err; return found;
} }
static mdk_personality_t raid5_personality= static mdk_personality_t raid5_personality=
...@@ -1828,8 +1647,6 @@ static mdk_personality_t raid5_personality= ...@@ -1828,8 +1647,6 @@ static mdk_personality_t raid5_personality=
.error_handler = error, .error_handler = error,
.hot_add_disk = raid5_add_disk, .hot_add_disk = raid5_add_disk,
.hot_remove_disk= raid5_remove_disk, .hot_remove_disk= raid5_remove_disk,
.spare_write = raid5_spare_write,
.spare_inactive = raid5_spare_inactive,
.spare_active = raid5_spare_active, .spare_active = raid5_spare_active,
.sync_request = sync_request, .sync_request = sync_request,
}; };
......
...@@ -16,3 +16,4 @@ if [ "$CONFIG_DEV_APPLETALK" = "y" ]; then ...@@ -16,3 +16,4 @@ if [ "$CONFIG_DEV_APPLETALK" = "y" ]; then
bool ' Appletalk-IP to IP Decapsulation support' CONFIG_IPDDP_DECAP bool ' Appletalk-IP to IP Decapsulation support' CONFIG_IPDDP_DECAP
fi fi
fi fi
......
...@@ -134,7 +134,7 @@ nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, ...@@ -134,7 +134,7 @@ nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
fh_copy(&resp->fh, &argp->fh); fh_copy(&resp->fh, &argp->fh);
resp->access = argp->access; resp->access = argp->access;
nfserr = nfsd_access(rqstp, &resp->fh, &resp->access); nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
RETURN_STATUS(nfserr); RETURN_STATUS(nfserr);
} }
...@@ -267,7 +267,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, ...@@ -267,7 +267,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
/* Now create the file and set attributes */ /* Now create the file and set attributes */
nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len, nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len,
attr, newfhp, attr, newfhp,
argp->createmode, argp->verf); argp->createmode, argp->verf, NULL);
RETURN_STATUS(nfserr); RETURN_STATUS(nfserr);
} }
...@@ -460,7 +460,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, ...@@ -460,7 +460,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
fh_copy(&resp->fh, &argp->fh); fh_copy(&resp->fh, &argp->fh);
nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t) argp->cookie, nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t) argp->cookie,
nfs3svc_encode_entry, nfs3svc_encode_entry,
buffer, &count, argp->verf); buffer, &count, argp->verf, NULL);
memcpy(resp->verf, argp->verf, 8); memcpy(resp->verf, argp->verf, 8);
resp->count = count; resp->count = count;
...@@ -495,7 +495,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, ...@@ -495,7 +495,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
fh_copy(&resp->fh, &argp->fh); fh_copy(&resp->fh, &argp->fh);
nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t) argp->cookie, nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t) argp->cookie,
nfs3svc_encode_entry_plus, nfs3svc_encode_entry_plus,
buffer, &count, argp->verf); buffer, &count, argp->verf, NULL);
memcpy(resp->verf, argp->verf, 8); memcpy(resp->verf, argp->verf, 8);
resp->count = count; resp->count = count;
......
...@@ -107,8 +107,10 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) ...@@ -107,8 +107,10 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
int fsid = 0; int fsid = 0;
error = nfserr_stale; error = nfserr_stale;
if (rqstp->rq_vers == 3) if (rqstp->rq_vers > 2)
error = nfserr_badhandle; error = nfserr_badhandle;
if (rqstp->rq_vers == 4 && fh->fh_size == 0)
return nfserr_nofilehandle;
if (fh->fh_version == 1) { if (fh->fh_version == 1) {
datap = fh->fh_auth; datap = fh->fh_auth;
...@@ -171,7 +173,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) ...@@ -171,7 +173,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
* Look up the dentry using the NFS file handle. * Look up the dentry using the NFS file handle.
*/ */
error = nfserr_stale; error = nfserr_stale;
if (rqstp->rq_vers == 3) if (rqstp->rq_vers > 2)
error = nfserr_badhandle; error = nfserr_badhandle;
if (fh->fh_version != 1) { if (fh->fh_version != 1) {
...@@ -234,11 +236,23 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) ...@@ -234,11 +236,23 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
/* Type can be negative when creating hardlinks - not to a dir */ /* Type can be negative when creating hardlinks - not to a dir */
if (type > 0 && (inode->i_mode & S_IFMT) != type) { if (type > 0 && (inode->i_mode & S_IFMT) != type) {
error = (type == S_IFDIR)? nfserr_notdir : nfserr_isdir; if (rqstp->rq_vers == 4 && (inode->i_mode & S_IFMT) == S_IFLNK)
error = nfserr_symlink;
else if (type == S_IFDIR)
error = nfserr_notdir;
else if ((inode->i_mode & S_IFMT) == S_IFDIR)
error = nfserr_isdir;
else
error = nfserr_inval;
goto out; goto out;
} }
if (type < 0 && (inode->i_mode & S_IFMT) == -type) { if (type < 0 && (inode->i_mode & S_IFMT) == -type) {
error = (type == -S_IFDIR)? nfserr_notdir : nfserr_isdir; if (rqstp->rq_vers == 4 && (inode->i_mode & S_IFMT) == S_IFLNK)
error = nfserr_symlink;
else if (type == -S_IFDIR)
error = nfserr_isdir;
else
error = nfserr_notdir;
goto out; goto out;
} }
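fh_verify() now distinguishes the mismatch cases instead of the old two-way notdir/isdir pick, and NFSv4 clients get the precise nfserr_symlink when the object is a symlink. A user-space rendering of the selection (plain enum values stand in for the htonl-encoded nfserr_* constants):

#include <assert.h>
#include <sys/stat.h>

enum nfserr_model { OK, NOTDIR, ISDIR, INVAL, SYMLINK };

static enum nfserr_model check_type(int vers, unsigned int mode, int type)
{
	unsigned int fmt = mode & S_IFMT;

	/* positive type: the object must be exactly this type */
	if (type > 0 && fmt != (unsigned int)type) {
		if (vers == 4 && fmt == S_IFLNK)
			return SYMLINK;
		if (type == S_IFDIR)
			return NOTDIR;
		if (fmt == S_IFDIR)
			return ISDIR;
		return INVAL;
	}
	/* negative type: the object must NOT be this type */
	if (type < 0 && fmt == (unsigned int)-type) {
		if (vers == 4 && fmt == S_IFLNK)
			return SYMLINK;
		return (type == -S_IFDIR) ? ISDIR : NOTDIR;
	}
	return OK;
}

int main(void)
{
	assert(check_type(4, S_IFLNK, S_IFDIR) == SYMLINK);	/* v4: precise */
	assert(check_type(3, S_IFLNK, S_IFDIR) == NOTDIR);	/* v3: generic */
	assert(check_type(4, S_IFREG, -S_IFDIR) == OK);
	return 0;
}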
...@@ -302,7 +316,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st ...@@ -302,7 +316,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
* Then create a 32byte filehandle using nfs_fhbase_old * Then create a 32byte filehandle using nfs_fhbase_old
* *
*/ */
u8 ref_fh_version = 0;
u8 ref_fh_fsid_type = 0;
struct inode * inode = dentry->d_inode; struct inode * inode = dentry->d_inode;
struct dentry *parent = dentry->d_parent; struct dentry *parent = dentry->d_parent;
__u32 *datap; __u32 *datap;
...@@ -312,6 +328,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st ...@@ -312,6 +328,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
parent->d_name.name, dentry->d_name.name, parent->d_name.name, dentry->d_name.name,
(inode ? inode->i_ino : 0)); (inode ? inode->i_ino : 0));
if (ref_fh) {
ref_fh_version = ref_fh->fh_handle.fh_version;
ref_fh_fsid_type = ref_fh->fh_handle.fh_fsid_type;
if (ref_fh == fhp)
fh_put(ref_fh);
}
if (fhp->fh_locked || fhp->fh_dentry) { if (fhp->fh_locked || fhp->fh_dentry) {
printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n", printk(KERN_ERR "fh_compose: fh %s/%s not initialized!\n",
parent->d_name.name, dentry->d_name.name); parent->d_name.name, dentry->d_name.name);
...@@ -323,8 +346,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st ...@@ -323,8 +346,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
fhp->fh_dentry = dentry; /* our internal copy */ fhp->fh_dentry = dentry; /* our internal copy */
fhp->fh_export = exp; fhp->fh_export = exp;
if (ref_fh && if (ref_fh_version == 0xca) {
ref_fh->fh_handle.fh_version == 0xca) {
/* old style filehandle please */ /* old style filehandle please */
memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
fhp->fh_handle.fh_size = NFS_FHSIZE; fhp->fh_handle.fh_size = NFS_FHSIZE;
...@@ -340,7 +362,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st ...@@ -340,7 +362,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
fhp->fh_handle.fh_auth_type = 0; fhp->fh_handle.fh_auth_type = 0;
datap = fhp->fh_handle.fh_auth+0; datap = fhp->fh_handle.fh_auth+0;
if ((exp->ex_flags & NFSEXP_FSID) && if ((exp->ex_flags & NFSEXP_FSID) &&
(!ref_fh || ref_fh->fh_handle.fh_fsid_type == 1)) { (ref_fh_fsid_type == 1)) {
fhp->fh_handle.fh_fsid_type = 1; fhp->fh_handle.fh_fsid_type = 1;
/* fsid_type 1 == 4 bytes filesystem id */ /* fsid_type 1 == 4 bytes filesystem id */
*datap++ = exp->ex_fsid; *datap++ = exp->ex_fsid;
...@@ -424,6 +446,10 @@ fh_put(struct svc_fh *fhp) ...@@ -424,6 +446,10 @@ fh_put(struct svc_fh *fhp)
fh_unlock(fhp); fh_unlock(fhp);
fhp->fh_dentry = NULL; fhp->fh_dentry = NULL;
dput(dentry); dput(dentry);
#ifdef CONFIG_NFSD_V3
fhp->fh_pre_saved = 0;
fhp->fh_post_saved = 0;
#endif
nfsd_nr_put++; nfsd_nr_put++;
} }
return; return;
......
...@@ -492,7 +492,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, ...@@ -492,7 +492,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
/* Read directory and encode entries on the fly */ /* Read directory and encode entries on the fly */
nfserr = nfsd_readdir(rqstp, &argp->fh, (loff_t) argp->cookie, nfserr = nfsd_readdir(rqstp, &argp->fh, (loff_t) argp->cookie,
nfssvc_encode_entry, nfssvc_encode_entry,
buffer, &count, NULL); buffer, &count, NULL, NULL);
resp->count = count; resp->count = count;
fh_put(&argp->fh); fh_put(&argp->fh);
......
...@@ -348,12 +348,12 @@ static struct accessmap nfs3_anyaccess[] = { ...@@ -348,12 +348,12 @@ static struct accessmap nfs3_anyaccess[] = {
}; };
int int
nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access) nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
{ {
struct accessmap *map; struct accessmap *map;
struct svc_export *export; struct svc_export *export;
struct dentry *dentry; struct dentry *dentry;
u32 query, result = 0; u32 query, result = 0, sresult = 0;
unsigned int error; unsigned int error;
error = fh_verify(rqstp, fhp, 0, MAY_NOP); error = fh_verify(rqstp, fhp, 0, MAY_NOP);
...@@ -375,6 +375,9 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access) ...@@ -375,6 +375,9 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access)
for (; map->access; map++) { for (; map->access; map++) {
if (map->access & query) { if (map->access & query) {
unsigned int err2; unsigned int err2;
sresult |= map->access;
err2 = nfsd_permission(export, dentry, map->how); err2 = nfsd_permission(export, dentry, map->how);
switch (err2) { switch (err2) {
case nfs_ok: case nfs_ok:
...@@ -395,6 +398,8 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access) ...@@ -395,6 +398,8 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access)
} }
} }
*access = result; *access = result;
if (supported)
*supported = sresult;
out: out:
return error; return error;
...@@ -756,6 +761,9 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, ...@@ -756,6 +761,9 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file file; struct file file;
int err; int err;
if ((u64)count > ~(u64)offset)
return nfserr_inval;
if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
return err; return err;
if (EX_ISSYNC(fhp->fh_export)) { if (EX_ISSYNC(fhp->fh_export)) {
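The new guard at the top of nfsd_commit() rejects a byte range whose end would wrap 64 bits: (u64)count > ~(u64)offset is exactly count > U64_MAX - offset, i.e. offset + count overflows. The identity checked in isolation:

#include <assert.h>
#include <stdint.h>

static int commit_range_invalid(uint64_t offset, uint64_t count)
{
	return count > ~offset;		/* ~offset == UINT64_MAX - offset */
}

int main(void)
{
	assert(!commit_range_invalid(0, UINT64_MAX));		/* exactly fits */
	assert( commit_range_invalid(1, UINT64_MAX));		/* wraps */
	assert(!commit_range_invalid(1ULL << 62, 1ULL << 62));	/* no wrap */
	return 0;
}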
...@@ -904,7 +912,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, ...@@ -904,7 +912,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
int int
nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap, char *fname, int flen, struct iattr *iap,
struct svc_fh *resfhp, int createmode, u32 *verifier) struct svc_fh *resfhp, int createmode, u32 *verifier,
int *truncp)
{ {
struct dentry *dentry, *dchild; struct dentry *dentry, *dchild;
struct inode *dirp; struct inode *dirp;
...@@ -966,6 +975,16 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, ...@@ -966,6 +975,16 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_UNCHECKED: case NFS3_CREATE_UNCHECKED:
if (! S_ISREG(dchild->d_inode->i_mode)) if (! S_ISREG(dchild->d_inode->i_mode))
err = nfserr_exist; err = nfserr_exist;
else if (truncp) {
/* in nfsv4, we need to treat this case a little
* differently. we don't want to truncate the
* file now; this would be wrong if the OPEN
* fails for some other reason. furthermore,
* if the size is nonzero, we should ignore it
* according to spec!
*/
*truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
}
else { else {
iap->ia_valid &= ATTR_SIZE; iap->ia_valid &= ATTR_SIZE;
goto set_attr; goto set_attr;
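The truncp decision follows the comment above: for an NFSv4 UNCHECKED open of an existing file, truncation is only recorded for after the OPEN has succeeded, and only when the requested size is zero; a nonzero size is ignored per the spec. A toy rendering of the predicate (ATTR_SIZE is redefined locally for the example; the real flag lives in linux/fs.h):

#include <assert.h>

#define ATTR_SIZE 1

struct iattr { unsigned int ia_valid; unsigned long ia_size; };

static int defer_trunc(const struct iattr *iap)
{
	/* truncate later only if the client explicitly asked for size 0 */
	return (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
}

int main(void)
{
	struct iattr zero = { ATTR_SIZE, 0 }, big = { ATTR_SIZE, 4096 };
	assert(defer_trunc(&zero) == 1);
	assert(defer_trunc(&big) == 0);		/* nonzero size: ignored */
	return 0;
}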
...@@ -1326,6 +1345,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, ...@@ -1326,6 +1345,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
goto out; goto out;
} }
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
if (type != S_IFDIR) { /* It's UNLINK */ if (type != S_IFDIR) { /* It's UNLINK */
#ifdef MSNFS #ifdef MSNFS
if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
...@@ -1359,7 +1381,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, ...@@ -1359,7 +1381,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
*/ */
int int
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
encode_dent_fn func, u32 *buffer, int *countp, u32 *verf) encode_dent_fn func, u32 *buffer, int *countp, u32 *verf, u32 *bmval)
{ {
u32 *p; u32 *p;
int oldlen, eof, err; int oldlen, eof, err;
...@@ -1380,6 +1402,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, ...@@ -1380,6 +1402,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
cd.buffer = buffer; cd.buffer = buffer;
cd.buflen = *countp; /* count of words */ cd.buflen = *countp; /* count of words */
cd.dirfh = fhp; cd.dirfh = fhp;
if (bmval) {
cd.bmval[0] = bmval[0];
cd.bmval[1] = bmval[1];
}
/* /*
* Read the directory entries. This silly loop is necessary because * Read the directory entries. This silly loop is necessary because
...@@ -1395,13 +1421,20 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, ...@@ -1395,13 +1421,20 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
if (err < 0) if (err < 0)
goto out_nfserr; goto out_nfserr;
err = cd.nfserr;
if (err)
goto out_close;
} while (oldlen != cd.buflen && !cd.eob); } while (oldlen != cd.buflen && !cd.eob);
err = nfserr_readdir_nospc;
if (rqstp->rq_vers == 4 && cd.eob && cd.buffer == buffer)
goto out_close;
/* If we didn't fill the buffer completely, we're at EOF */ /* If we didn't fill the buffer completely, we're at EOF */
eof = !cd.eob; eof = !cd.eob;
if (cd.offset) { if (cd.offset) {
if (rqstp->rq_vers == 3) if (rqstp->rq_vers > 2)
(void)xdr_encode_hyper(cd.offset, file.f_pos); (void)xdr_encode_hyper(cd.offset, file.f_pos);
else else
*cd.offset = htonl(file.f_pos); *cd.offset = htonl(file.f_pos);
......
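Since NFSv4 shares the v3 readdir path, the cookie write-back above now tests rq_vers > 2: v3 and v4 store the directory position as a 64-bit XDR hyper, v2 as a single 32-bit word. A sketch of the two encodings (assuming standard XDR layout, high word first; encode_cookie is a hypothetical helper, not a kernel function):

#include <arpa/inet.h>
#include <stdint.h>

static uint32_t *encode_cookie(uint32_t *p, int vers, uint64_t pos)
{
	if (vers > 2) {
		/* like xdr_encode_hyper(): two big-endian words, high first */
		*p++ = htonl((uint32_t)(pos >> 32));
		*p++ = htonl((uint32_t)(pos & 0xffffffffu));
	} else {
		*p++ = htonl((uint32_t)pos);	/* v2: one 32-bit cookie */
	}
	return p;
}

int main(void)
{
	uint32_t buf[2];
	encode_cookie(buf, 4, 0x100000002ULL);	/* v4: fills both words */
	encode_cookie(buf, 2, 7);		/* v2: fills the first word */
	return 0;
}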
...@@ -77,8 +77,10 @@ typedef struct { ...@@ -77,8 +77,10 @@ typedef struct {
#define irq_enter() (preempt_count() += HARDIRQ_OFFSET) #define irq_enter() (preempt_count() += HARDIRQ_OFFSET)
#if CONFIG_PREEMPT #if CONFIG_PREEMPT
# define in_atomic() (preempt_count() != kernel_locked())
# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
#else #else
# define in_atomic() (preempt_count() != 0)
# define IRQ_EXIT_OFFSET HARDIRQ_OFFSET # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET
#endif #endif
#define irq_exit() \ #define irq_exit() \
......
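The in_atomic() definitions above are subtle under CONFIG_PREEMPT: the BKL depth (kernel_locked()) is folded into preempt_count, and a task holding only the BKL may still sleep, so it must not be reported as atomic. A user-space model of the two variants:

#include <assert.h>

/* preempt selects the CONFIG_PREEMPT definition */
static int in_atomic_model(int preempt_count, int bkl_depth, int preempt)
{
	return preempt ? (preempt_count != bkl_depth)
	               : (preempt_count != 0);
}

int main(void)
{
	/* BKL held once, nothing else: still allowed to sleep */
	assert(in_atomic_model(1, 1, 1) == 0);
	/* a spinlock section on top of the BKL really is atomic */
	assert(in_atomic_model(2, 1, 1) == 1);
	/* !CONFIG_PREEMPT: only irq/softirq counts make it nonzero */
	assert(in_atomic_model(0, 0, 0) == 0);
	return 0;
}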
...@@ -39,40 +39,69 @@ ...@@ -39,40 +39,69 @@
* standard, but seem to be widely used nevertheless. * standard, but seem to be widely used nevertheless.
*/ */
enum nfs_stat { enum nfs_stat {
NFS_OK = 0, /* v2 v3 */ NFS_OK = 0, /* v2 v3 v4 */
NFSERR_PERM = 1, /* v2 v3 */ NFSERR_PERM = 1, /* v2 v3 v4 */
NFSERR_NOENT = 2, /* v2 v3 */ NFSERR_NOENT = 2, /* v2 v3 v4 */
NFSERR_IO = 5, /* v2 v3 */ NFSERR_IO = 5, /* v2 v3 v4 */
NFSERR_NXIO = 6, /* v2 v3 */ NFSERR_NXIO = 6, /* v2 v3 v4 */
NFSERR_EAGAIN = 11, /* v2 v3 */ NFSERR_EAGAIN = 11, /* v2 v3 */
NFSERR_ACCES = 13, /* v2 v3 */ NFSERR_ACCES = 13, /* v2 v3 v4 */
NFSERR_EXIST = 17, /* v2 v3 */ NFSERR_EXIST = 17, /* v2 v3 v4 */
NFSERR_XDEV = 18, /* v3 */ NFSERR_XDEV = 18, /* v3 v4 */
NFSERR_NODEV = 19, /* v2 v3 */ NFSERR_NODEV = 19, /* v2 v3 v4 */
NFSERR_NOTDIR = 20, /* v2 v3 */ NFSERR_NOTDIR = 20, /* v2 v3 v4 */
NFSERR_ISDIR = 21, /* v2 v3 */ NFSERR_ISDIR = 21, /* v2 v3 v4 */
NFSERR_INVAL = 22, /* v2 v3 that Sun forgot */ NFSERR_INVAL = 22, /* v2 v3 v4 */
NFSERR_FBIG = 27, /* v2 v3 */ NFSERR_FBIG = 27, /* v2 v3 v4 */
NFSERR_NOSPC = 28, /* v2 v3 */ NFSERR_NOSPC = 28, /* v2 v3 v4 */
NFSERR_ROFS = 30, /* v2 v3 */ NFSERR_ROFS = 30, /* v2 v3 v4 */
NFSERR_MLINK = 31, /* v3 */ NFSERR_MLINK = 31, /* v3 v4 */
NFSERR_OPNOTSUPP = 45, /* v2 v3 */ NFSERR_OPNOTSUPP = 45, /* v2 v3 */
NFSERR_NAMETOOLONG = 63, /* v2 v3 */ NFSERR_NAMETOOLONG = 63, /* v2 v3 v4 */
NFSERR_NOTEMPTY = 66, /* v2 v3 */ NFSERR_NOTEMPTY = 66, /* v2 v3 v4 */
NFSERR_DQUOT = 69, /* v2 v3 */ NFSERR_DQUOT = 69, /* v2 v3 v4 */
NFSERR_STALE = 70, /* v2 v3 */ NFSERR_STALE = 70, /* v2 v3 v4 */
NFSERR_REMOTE = 71, /* v2 v3 */ NFSERR_REMOTE = 71, /* v2 v3 */
NFSERR_WFLUSH = 99, /* v2 */ NFSERR_WFLUSH = 99, /* v2 */
NFSERR_BADHANDLE = 10001, /* v3 */ NFSERR_BADHANDLE = 10001, /* v3 v4 */
NFSERR_NOT_SYNC = 10002, /* v3 */ NFSERR_NOT_SYNC = 10002, /* v3 */
NFSERR_BAD_COOKIE = 10003, /* v3 */ NFSERR_BAD_COOKIE = 10003, /* v3 v4 */
NFSERR_NOTSUPP = 10004, /* v3 */ NFSERR_NOTSUPP = 10004, /* v3 v4 */
NFSERR_TOOSMALL = 10005, /* v3 */ NFSERR_TOOSMALL = 10005, /* v3 v4 */
NFSERR_SERVERFAULT = 10006, /* v3 */ NFSERR_SERVERFAULT = 10006, /* v3 v4 */
NFSERR_BADTYPE = 10007, /* v3 */ NFSERR_BADTYPE = 10007, /* v3 v4 */
NFSERR_JUKEBOX = 10008 /* v3 */ NFSERR_JUKEBOX = 10008, /* v3 v4 */
}; NFSERR_SAME = 10009, /* v4 */
NFSERR_DENIED = 10010, /* v4 */
NFSERR_EXPIRED = 10011, /* v4 */
NFSERR_LOCKED = 10012, /* v4 */
NFSERR_GRACE = 10013, /* v4 */
NFSERR_FHEXPIRED = 10014, /* v4 */
NFSERR_SHARE_DENIED = 10015, /* v4 */
NFSERR_WRONGSEC = 10016, /* v4 */
NFSERR_CLID_INUSE = 10017, /* v4 */
NFSERR_RESOURCE = 10018, /* v4 */
NFSERR_MOVED = 10019, /* v4 */
NFSERR_NOFILEHANDLE = 10020, /* v4 */
NFSERR_MINOR_VERS_MISMATCH = 10021, /* v4 */
NFSERR_STALE_CLIENTID = 10022, /* v4 */
NFSERR_STALE_STATEID = 10023, /* v4 */
NFSERR_OLD_STATEID = 10024, /* v4 */
NFSERR_BAD_STATEID = 10025, /* v4 */
NFSERR_BAD_SEQID = 10026, /* v4 */
NFSERR_NOT_SAME = 10027, /* v4 */
NFSERR_LOCK_RANGE = 10028, /* v4 */
NFSERR_SYMLINK = 10029, /* v4 */
NFSERR_READDIR_NOSPC = 10030, /* v4 */
NFSERR_LEASE_MOVED = 10031, /* v4 */
NFSERR_ATTRNOTSUPP = 10032, /* v4 */
NFSERR_NO_GRACE = 10033, /* v4 */
NFSERR_RECLAIM_BAD = 10034, /* v4 */
NFSERR_RECLAIM_CONFLICT = 10035,/* v4 */
NFSERR_BAD_XDR = 10036, /* v4 */
NFSERR_LOCKS_HELD = 10037 /* v4 */
};
/* NFSv2 file types - beware, these are not the same in NFSv3 */ /* NFSv2 file types - beware, these are not the same in NFSv3 */
enum nfs_ftype { enum nfs_ftype {
......
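The v4 additions extend the same wire-status namespace, so existing consumers keep working; a client still has to fold these values into local errnos. A hedged sketch of such a translation (hypothetical helper, not the kernel's own nfs_stat_to_errno()):

        static int example_stat_to_errno(enum nfs_stat status)
        {
                switch (status) {
                case NFS_OK:            return 0;
                case NFSERR_PERM:       return -EPERM;
                case NFSERR_NOENT:      return -ENOENT;
                case NFSERR_STALE:      return -ESTALE;
                case NFSERR_JUKEBOX:    return -EAGAIN;  /* retry later */
                default:                return -EIO;     /* conservative fallback */
                }
        }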
...@@ -55,6 +55,8 @@ struct readdir_cd { ...@@ -55,6 +55,8 @@ struct readdir_cd {
char plus; /* readdirplus */ char plus; /* readdirplus */
char eob; /* end of buffer */ char eob; /* end of buffer */
char dotonly; char dotonly;
int nfserr; /* v4 only */
u32 bmval[2]; /* v4 only */
}; };
typedef int (*encode_dent_fn)(struct readdir_cd *, const char *, typedef int (*encode_dent_fn)(struct readdir_cd *, const char *,
int, loff_t, ino_t, unsigned int); int, loff_t, ino_t, unsigned int);
...@@ -86,11 +88,11 @@ int nfsd_create(struct svc_rqst *, struct svc_fh *, ...@@ -86,11 +88,11 @@ int nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs, char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res); int type, dev_t rdev, struct svc_fh *res);
#ifdef CONFIG_NFSD_V3 #ifdef CONFIG_NFSD_V3
int nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *); int nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
int nfsd_create_v3(struct svc_rqst *, struct svc_fh *, int nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs, char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode, struct svc_fh *res, int createmode,
u32 *verifier); u32 *verifier, int *truncp);
int nfsd_commit(struct svc_rqst *, struct svc_fh *, int nfsd_commit(struct svc_rqst *, struct svc_fh *,
off_t, unsigned long); off_t, unsigned long);
#endif /* CONFIG_NFSD_V3 */ #endif /* CONFIG_NFSD_V3 */
...@@ -119,7 +121,8 @@ int nfsd_truncate(struct svc_rqst *, struct svc_fh *, ...@@ -119,7 +121,8 @@ int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
unsigned long size); unsigned long size);
int nfsd_readdir(struct svc_rqst *, struct svc_fh *, int nfsd_readdir(struct svc_rqst *, struct svc_fh *,
loff_t, encode_dent_fn, loff_t, encode_dent_fn,
u32 *buffer, int *countp, u32 *verf); u32 *buffer, int *countp, u32 *verf,
u32 *bmval);
int nfsd_statfs(struct svc_rqst *, struct svc_fh *, int nfsd_statfs(struct svc_rqst *, struct svc_fh *,
struct statfs *); struct statfs *);
...@@ -170,6 +173,16 @@ void nfsd_lockd_unexport(struct svc_client *); ...@@ -170,6 +173,16 @@ void nfsd_lockd_unexport(struct svc_client *);
#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT) #define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT)
#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE) #define nfserr_badtype __constant_htonl(NFSERR_BADTYPE)
#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX) #define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX)
#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE)
#define nfserr_same __constant_htonl(NFSERR_SAME)
#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE)
#define nfserr_resource __constant_htonl(NFSERR_RESOURCE)
#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE)
#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH)
#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK)
#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME)
#define nfserr_readdir_nospc __constant_htonl(NFSERR_READDIR_NOSPC)
#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR)
/* error code for internal use - if a request fails due to /* error code for internal use - if a request fails due to
* kmalloc failure, it gets dropped. Client should resend eventually * kmalloc failure, it gets dropped. Client should resend eventually
......
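The nfserr_* macros are pre-swapped with __constant_htonl() because NFS status codes are written straight into big-endian XDR words; call sites then need no per-use conversion. A sketch of the pattern (buffer handling simplified, not from this commit):

        /* Emit a status word into an XDR reply buffer; the constant is
         * already in network byte order, so it is stored as-is. */
        static u32 *encode_status(u32 *p, u32 nfserr)
        {
                *p++ = nfserr;          /* e.g. nfserr_bad_xdr */
                return p;
        }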
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include <linux/raid/md.h> #include <linux/raid/md.h>
struct dev_info { struct dev_info {
struct block_device *bdev; mdk_rdev_t *rdev;
unsigned long size; unsigned long size;
unsigned long offset; unsigned long offset;
}; };
......
...@@ -76,9 +76,8 @@ extern void md_unregister_thread (mdk_thread_t *thread); ...@@ -76,9 +76,8 @@ extern void md_unregister_thread (mdk_thread_t *thread);
extern void md_wakeup_thread(mdk_thread_t *thread); extern void md_wakeup_thread(mdk_thread_t *thread);
extern void md_interrupt_thread (mdk_thread_t *thread); extern void md_interrupt_thread (mdk_thread_t *thread);
extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors); extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
extern int md_error (mddev_t *mddev, struct block_device *bdev); extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
extern int md_run_setup(void);
extern void md_print_devices (void); extern void md_print_devices (void);
......
...@@ -154,12 +154,26 @@ struct mdk_rdev_s ...@@ -154,12 +154,26 @@ struct mdk_rdev_s
mdp_super_t *sb; mdp_super_t *sb;
unsigned long sb_offset; unsigned long sb_offset;
int alias_device; /* device alias to the same disk */ /* A device can be in one of three states based on two flags:
* Not working: faulty==1 in_sync==0
* Fully working: faulty==0 in_sync==1
* Working, but not
* in sync with array
* faulty==0 in_sync==0
*
* It can never have faulty==1, in_sync==1
* This reduces the burden of testing multiple flags in many cases
*/
int faulty; /* if faulty do not issue IO requests */ int faulty; /* if faulty do not issue IO requests */
int in_sync; /* device is a full member of the array */ int in_sync; /* device is a full member of the array */
int desc_nr; /* descriptor index in the superblock */ int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */ int raid_disk; /* role of device in array */
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
* support hot removal
*/
}; };
typedef struct mdk_personality_s mdk_personality_t; typedef struct mdk_personality_s mdk_personality_t;
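The comment added to mdk_rdev_s pins down three legal flag states, and nr_pending gives hot removal a reference count to drain. A hedged helper sketch making the interesting third state explicit (name hypothetical, not part of this commit):

        /* The only state that wants recovery is faulty==0 && in_sync==0. */
        static inline int rdev_needs_resync(mdk_rdev_t *rdev)
        {
                return !rdev->faulty && !rdev->in_sync;
        }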
...@@ -202,7 +216,7 @@ struct mddev_s ...@@ -202,7 +216,7 @@ struct mddev_s
int in_sync; /* know to not need resync */ int in_sync; /* know to not need resync */
struct semaphore reconfig_sem; struct semaphore reconfig_sem;
atomic_t active; atomic_t active;
mdk_rdev_t *spare; int spares;
int degraded; /* whether md should consider int degraded; /* whether md should consider
* adding a spare * adding a spare
...@@ -223,11 +237,12 @@ struct mdk_personality_s ...@@ -223,11 +237,12 @@ struct mdk_personality_s
int (*run)(mddev_t *mddev); int (*run)(mddev_t *mddev);
int (*stop)(mddev_t *mddev); int (*stop)(mddev_t *mddev);
int (*status)(char *page, mddev_t *mddev); int (*status)(char *page, mddev_t *mddev);
int (*error_handler)(mddev_t *mddev, struct block_device *bdev); /* error_handler must set ->faulty and clear ->in_sync
* if appropriate, and should abort recovery if needed
*/
void (*error_handler)(mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev); int (*hot_add_disk) (mddev_t *mddev, mdk_rdev_t *rdev);
int (*hot_remove_disk) (mddev_t *mddev, int number); int (*hot_remove_disk) (mddev_t *mddev, int number);
int (*spare_write) (mddev_t *mddev);
int (*spare_inactive) (mddev_t *mddev);
int (*spare_active) (mddev_t *mddev); int (*spare_active) (mddev_t *mddev);
int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster); int (*sync_request)(mddev_t *mddev, sector_t sector_nr, int go_faster);
}; };
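error_handler now returns void and, per the new comment, owns the flag transitions itself. A skeleton of a conforming handler (illustrative; the superblock-dirty step is an assumption, not shown in this commit):

        static void example_error_handler(mddev_t *mddev, mdk_rdev_t *rdev)
        {
                if (rdev->faulty)
                        return;                 /* already failed, nothing to do */
                rdev->faulty = 1;               /* stop issuing I/O to this device */
                rdev->in_sync = 0;              /* no longer a full array member */
                mddev->sb_dirty = 1;            /* assumption: persist the new state */
        }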
...@@ -272,9 +287,6 @@ extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr); ...@@ -272,9 +287,6 @@ extern mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr);
#define ITERATE_RDEV_PENDING(rdev,tmp) \ #define ITERATE_RDEV_PENDING(rdev,tmp) \
ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp) ITERATE_RDEV_GENERIC(pending_raid_disks,rdev,tmp)
#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
x = y; y = __tmp; } while (0)
typedef struct mdk_thread_s { typedef struct mdk_thread_s {
void (*run) (void *data); void (*run) (void *data);
void *data; void *data;
......
...@@ -5,14 +5,7 @@ ...@@ -5,14 +5,7 @@
#include <linux/bio.h> #include <linux/bio.h>
struct multipath_info { struct multipath_info {
struct block_device *bdev; mdk_rdev_t *rdev;
/*
* State bits:
*/
int operational;
int used_slot;
}; };
struct multipath_private_data { struct multipath_private_data {
......
...@@ -6,18 +6,8 @@ ...@@ -6,18 +6,8 @@
typedef struct mirror_info mirror_info_t; typedef struct mirror_info mirror_info_t;
struct mirror_info { struct mirror_info {
struct block_device *bdev; mdk_rdev_t *rdev;
sector_t head_position; sector_t head_position;
atomic_t nr_pending;
/*
* State bits:
*/
int operational;
int write_only;
int spare;
int used_slot;
}; };
typedef struct r1bio_s r1bio_t; typedef struct r1bio_s r1bio_t;
...@@ -30,7 +20,6 @@ struct r1_private_data_s { ...@@ -30,7 +20,6 @@ struct r1_private_data_s {
int last_used; int last_used;
sector_t next_seq_sect; sector_t next_seq_sect;
mdk_thread_t *thread; mdk_thread_t *thread;
mirror_info_t *spare;
spinlock_t device_lock; spinlock_t device_lock;
/* for use when syncing mirrors: */ /* for use when syncing mirrors: */
......
...@@ -148,6 +148,11 @@ struct stripe_head { ...@@ -148,6 +148,11 @@ struct stripe_head {
#define R5_UPTODATE 0 /* page contains current data */ #define R5_UPTODATE 0 /* page contains current data */
#define R5_LOCKED 1 /* IO has been submitted on "req" */ #define R5_LOCKED 1 /* IO has been submitted on "req" */
#define R5_OVERWRITE 2 /* towrite covers whole page */ #define R5_OVERWRITE 2 /* towrite covers whole page */
/* and some that are internal to handle_stripe */
#define R5_Insync 3 /* rdev && rdev->in_sync at start */
#define R5_Wantread 4 /* want to schedule a read */
#define R5_Wantwrite 5
#define R5_Syncio 6 /* this io needs to be accounted as resync io */
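The R5_* values above are bit numbers rather than masks; a hedged usage fragment (assuming a per-device unsigned long flags word internal to handle_stripe, which this commit does not show):

        /* Assumption: dev->flags is an unsigned long holding R5_* bits. */
        set_bit(R5_Wantwrite, &dev->flags);
        if (test_bit(R5_Insync, &dev->flags))
                set_bit(R5_Syncio, &dev->flags);        /* account as resync io */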
/* /*
* Write method * Write method
...@@ -192,11 +197,7 @@ struct stripe_head { ...@@ -192,11 +197,7 @@ struct stripe_head {
struct disk_info { struct disk_info {
struct block_device *bdev; mdk_rdev_t *rdev;
int operational;
int write_only;
int spare;
int used_slot;
}; };
struct raid5_private_data { struct raid5_private_data {
......
...@@ -76,21 +76,19 @@ void lru_add_drain(void) ...@@ -76,21 +76,19 @@ void lru_add_drain(void)
*/ */
void __page_cache_release(struct page *page) void __page_cache_release(struct page *page)
{ {
unsigned long flags;
BUG_ON(page_count(page) != 0); BUG_ON(page_count(page) != 0);
if (PageLRU(page)) {
unsigned long flags;
spin_lock_irqsave(&_pagemap_lru_lock, flags); spin_lock_irqsave(&_pagemap_lru_lock, flags);
if (TestClearPageLRU(page)) { if (TestClearPageLRU(page)) {
if (PageActive(page)) if (PageActive(page))
del_page_from_active_list(page); del_page_from_active_list(page);
else else
del_page_from_inactive_list(page); del_page_from_inactive_list(page);
}
if (page_count(page) != 0)
page = NULL;
spin_unlock_irqrestore(&_pagemap_lru_lock, flags);
} }
if (page_count(page) != 0)
page = NULL;
spin_unlock_irqrestore(&_pagemap_lru_lock, flags);
if (page) if (page)
__free_pages_ok(page, 0); __free_pages_ok(page, 0);
} }
......
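The reworked __page_cache_release() pays for the irq-safe LRU lock only when the page is actually on an LRU, then re-checks under the lock. The test-then-retest shape, sketched:

        if (PageLRU(page)) {                    /* cheap, lock-free hint */
                unsigned long flags;
                spin_lock_irqsave(&_pagemap_lru_lock, flags);
                if (TestClearPageLRU(page)) {
                        /* still on an LRU: unlink from active/inactive list */
                }
                spin_unlock_irqrestore(&_pagemap_lru_lock, flags);
        }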
...@@ -153,15 +153,20 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) ...@@ -153,15 +153,20 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
unsigned long address = VMALLOC_VMADDR(area->addr); unsigned long address = VMALLOC_VMADDR(area->addr);
unsigned long end = address + (area->size-PAGE_SIZE); unsigned long end = address + (area->size-PAGE_SIZE);
pgd_t *dir; pgd_t *dir;
int err = 0;
dir = pgd_offset_k(address); dir = pgd_offset_k(address);
spin_lock(&init_mm.page_table_lock); spin_lock(&init_mm.page_table_lock);
do { do {
pmd_t *pmd = pmd_alloc(&init_mm, dir, address); pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
if (!pmd) if (!pmd) {
return -ENOMEM; err = -ENOMEM;
if (map_area_pmd(pmd, address, end - address, prot, pages)) break;
return -ENOMEM; }
if (map_area_pmd(pmd, address, end - address, prot, pages)) {
err = -ENOMEM;
break;
}
address = (address + PGDIR_SIZE) & PGDIR_MASK; address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++; dir++;
...@@ -169,7 +174,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) ...@@ -169,7 +174,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
spin_unlock(&init_mm.page_table_lock); spin_unlock(&init_mm.page_table_lock);
flush_cache_all(); flush_cache_all();
return 0; return err;
} }
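The map_vm_area() hunk fixes error paths that previously returned with init_mm.page_table_lock still held; failures now record err and break to the single unlock site. The shape of the fix (failure point hypothetical):

        int err = 0;
        spin_lock(&init_mm.page_table_lock);
        do {
                if (!allocate_step()) {         /* hypothetical failure point */
                        err = -ENOMEM;
                        break;                  /* fall through to the unlock */
                }
                /* ... advance address and dir ... */
        } while (address && (address < end));
        spin_unlock(&init_mm.page_table_lock);
        return err;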
...@@ -379,14 +384,20 @@ void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot) ...@@ -379,14 +384,20 @@ void *__vmalloc(unsigned long size, int gfp_mask, pgprot_t prot)
area->nr_pages = nr_pages; area->nr_pages = nr_pages;
area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); area->pages = pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM));
if (!area->pages) if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL; return NULL;
}
memset(area->pages, 0, array_size); memset(area->pages, 0, array_size);
for (i = 0; i < area->nr_pages; i++) { for (i = 0; i < area->nr_pages; i++) {
area->pages[i] = alloc_page(gfp_mask); area->pages[i] = alloc_page(gfp_mask);
if (unlikely(!area->pages[i])) if (unlikely(!area->pages[i])) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail; goto fail;
}
} }
if (map_vm_area(area, prot, &pages)) if (map_vm_area(area, prot, &pages))
......
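__vmalloc() plugs two leaks here: the vm area is torn down when the pages array cannot be allocated, and on a partial page-allocation failure area->nr_pages is trimmed to the count actually obtained so __vunmap() frees exactly those pages. The second idiom, sketched:

        for (i = 0; i < area->nr_pages; i++) {
                area->pages[i] = alloc_page(gfp_mask);
                if (unlikely(!area->pages[i])) {
                        area->nr_pages = i;     /* only i pages exist */
                        goto fail;              /* __vunmap() frees pages[0..i-1] */
                }
        }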
...@@ -140,7 +140,7 @@ svc_sock_enqueue(struct svc_sock *svsk) ...@@ -140,7 +140,7 @@ svc_sock_enqueue(struct svc_sock *svsk)
&& !test_bit(SK_CLOSE, &svsk->sk_flags) && !test_bit(SK_CLOSE, &svsk->sk_flags)
&& !test_bit(SK_CONN, &svsk->sk_flags)) { && !test_bit(SK_CONN, &svsk->sk_flags)) {
/* Don't enqueue while not enough space for reply */ /* Don't enqueue while not enough space for reply */
dprintk("svc: socket %p no space, %d > %ld, not enqueued\n", dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz,
sock_wspace(svsk->sk_sk)); sock_wspace(svsk->sk_sk));
goto out_unlock; goto out_unlock;
...@@ -574,6 +574,14 @@ svc_udp_init(struct svc_sock *svsk) ...@@ -574,6 +574,14 @@ svc_udp_init(struct svc_sock *svsk)
svsk->sk_recvfrom = svc_udp_recvfrom; svsk->sk_recvfrom = svc_udp_recvfrom;
svsk->sk_sendto = svc_udp_sendto; svsk->sk_sendto = svc_udp_sendto;
/* initially the socket buffer must have enough space to
* receive and respond to one request.
* svc_udp_recvfrom will re-adjust if necessary
*/
svc_sock_setbufsize(svsk->sk_sock,
3 * svsk->sk_server->sv_bufsz,
3 * svsk->sk_server->sv_bufsz);
set_bit(SK_CHNGBUF, &svsk->sk_flags); set_bit(SK_CHNGBUF, &svsk->sk_flags);
return 0; return 0;
...@@ -679,6 +687,8 @@ svc_tcp_accept(struct svc_sock *svsk) ...@@ -679,6 +687,8 @@ svc_tcp_accept(struct svc_sock *svsk)
goto failed; /* aborted connection or whatever */ goto failed; /* aborted connection or whatever */
} }
set_bit(SK_CONN, &svsk->sk_flags); set_bit(SK_CONN, &svsk->sk_flags);
svc_sock_enqueue(svsk);
slen = sizeof(sin); slen = sizeof(sin);
err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1); err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1);
if (err < 0) { if (err < 0) {
...@@ -944,6 +954,14 @@ svc_tcp_init(struct svc_sock *svsk) ...@@ -944,6 +954,14 @@ svc_tcp_init(struct svc_sock *svsk)
svsk->sk_reclen = 0; svsk->sk_reclen = 0;
svsk->sk_tcplen = 0; svsk->sk_tcplen = 0;
/* initially the socket buffer must have enough space to
* receive and respond to one request.
* svc_tcp_recvfrom will re-adjust if necessary
*/
svc_sock_setbufsize(svsk->sk_sock,
3 * svsk->sk_server->sv_bufsz,
3 * svsk->sk_server->sv_bufsz);
set_bit(SK_CHNGBUF, &svsk->sk_flags); set_bit(SK_CHNGBUF, &svsk->sk_flags);
} }
...@@ -1220,7 +1238,7 @@ svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) ...@@ -1220,7 +1238,7 @@ svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
} }
if (protocol == IPPROTO_TCP) { if (protocol == IPPROTO_TCP) {
if ((error = sock->ops->listen(sock, 5)) < 0) if ((error = sock->ops->listen(sock, 64)) < 0)
goto bummer; goto bummer;
} }
......
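Both transports now pre-size their socket buffers to 3 * sv_bufsz, enough in the initial window to take in one request and send its reply with slack, and SK_CHNGBUF lets the receive path re-adjust later; the TCP listen backlog also grows from 5 to 64 so a burst of clients is not turned away. The sizing step as a standalone sketch (helper name hypothetical):

        static void example_presize(struct svc_sock *svsk)
        {
                int want = 3 * svsk->sk_server->sv_bufsz; /* request + reply + slack */
                svc_sock_setbufsize(svsk->sk_sock, want, want);
                set_bit(SK_CHNGBUF, &svsk->sk_flags);     /* allow later re-adjust */
        }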