/* * multipath.c : Multiple Devices driver for Linux * * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat * * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman * * MULTIPATH management functions. * * derived from raid1.c. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * You should have received a copy of the GNU General Public License * (for example /usr/src/linux/COPYING); if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/raid/multipath.h> #include <linux/bio.h> #include <linux/buffer_head.h> #include <asm/atomic.h> #define MAJOR_NR MD_MAJOR #define MD_DRIVER #define MD_PERSONALITY #define DEVICE_NR(device) (minor(device)) #define MAX_WORK_PER_DISK 128 #define NR_RESERVED_BUFS 32 /* * The following can be used to debug the driver */ #define MULTIPATH_DEBUG 0 #if MULTIPATH_DEBUG #define PRINTK(x...) printk(x) #define inline #define __inline__ #else #define PRINTK(x...) do { } while (0) #endif static mdk_personality_t multipath_personality; static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED; struct multipath_bh *multipath_retry_list = NULL, **multipath_retry_tail; static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state); static struct multipath_bh *multipath_alloc_mpbh(multipath_conf_t *conf) { struct multipath_bh *mp_bh = NULL; do { spin_lock_irq(&conf->device_lock); if (!conf->freer1_blocked && conf->freer1) { mp_bh = conf->freer1; conf->freer1 = mp_bh->next_mp; conf->freer1_cnt--; mp_bh->next_mp = NULL; mp_bh->state = (1 << MPBH_PreAlloc); } spin_unlock_irq(&conf->device_lock); if (mp_bh) return mp_bh; mp_bh = (struct multipath_bh *) kmalloc(sizeof(struct multipath_bh), GFP_NOIO); if (mp_bh) { memset(mp_bh, 0, sizeof(*mp_bh)); return mp_bh; } conf->freer1_blocked = 1; wait_disk_event(conf->wait_buffer, !conf->freer1_blocked || conf->freer1_cnt > NR_RESERVED_BUFS/2 ); conf->freer1_blocked = 0; } while (1); } static inline void multipath_free_mpbh(struct multipath_bh *mp_bh) { multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); if (test_bit(MPBH_PreAlloc, &mp_bh->state)) { unsigned long flags; mp_bh->bio = NULL; spin_lock_irqsave(&conf->device_lock, flags); mp_bh->next_mp = conf->freer1; conf->freer1 = mp_bh; conf->freer1_cnt++; spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_buffer); } else { kfree(mp_bh); } } static int multipath_grow_mpbh (multipath_conf_t *conf, int cnt) { int i = 0; while (i < cnt) { struct multipath_bh *mp_bh; mp_bh = (struct multipath_bh*)kmalloc(sizeof(*mp_bh), GFP_KERNEL); if (!mp_bh) break; memset(mp_bh, 0, sizeof(*mp_bh)); set_bit(MPBH_PreAlloc, &mp_bh->state); mp_bh->mddev = conf->mddev; multipath_free_mpbh(mp_bh); i++; } return i; } static void multipath_shrink_mpbh(multipath_conf_t *conf) { spin_lock_irq(&conf->device_lock); while (conf->freer1) { struct multipath_bh *mp_bh = conf->freer1; conf->freer1 = mp_bh->next_mp; conf->freer1_cnt--; kfree(mp_bh); } spin_unlock_irq(&conf->device_lock); } static int multipath_map (mddev_t *mddev, struct block_device **bdev) { multipath_conf_t *conf = mddev_to_conf(mddev); int i, disks = MD_SB_DISKS; /* * Later we do read balancing on the read side * now we use the first available disk. */ for (i = 0; i < disks; i++) { if (conf->multipaths[i].operational) { *bdev = conf->multipaths[i].bdev; return (0); } } printk (KERN_ERR "multipath_map(): no more operational IO paths?\n"); return (-1); } static void multipath_reschedule_retry (struct multipath_bh *mp_bh) { unsigned long flags; mddev_t *mddev = mp_bh->mddev; multipath_conf_t *conf = mddev_to_conf(mddev); spin_lock_irqsave(&retry_list_lock, flags); if (multipath_retry_list == NULL) multipath_retry_tail = &multipath_retry_list; *multipath_retry_tail = mp_bh; multipath_retry_tail = &mp_bh->next_mp; mp_bh->next_mp = NULL; spin_unlock_irqrestore(&retry_list_lock, flags); md_wakeup_thread(conf->thread); } /* * multipath_end_bh_io() is called when we have finished servicing a multipathed * operation and are ready to return a success/failure code to the buffer * cache layer. */ static void multipath_end_bh_io (struct multipath_bh *mp_bh, int uptodate) { struct bio *bio = mp_bh->master_bio; bio_endio(bio, uptodate); bio_put(mp_bh->bio); multipath_free_mpbh(mp_bh); } void multipath_end_request(struct bio *bio) { int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); /* * this branch is our 'one multipath IO has finished' event handler: */ if (!uptodate) md_error (mp_bh->mddev, bio->bi_bdev); else /* * Set MPBH_Uptodate in our master buffer_head, so that * we will return a good error code for to the higher * levels even if IO on some other multipathed buffer fails. * * The 'master' represents the complex operation to * user-side. So if something waits for IO, then it will * wait for the 'master' buffer_head. */ set_bit (MPBH_Uptodate, &mp_bh->state); if (uptodate) { multipath_end_bh_io(mp_bh, uptodate); return; } /* * oops, IO error: */ printk(KERN_ERR "multipath: %s: rescheduling sector %lu\n", bdev_partition_name(bio->bi_bdev), bio->bi_sector); multipath_reschedule_retry(mp_bh); return; } /* * This routine returns the disk from which the requested read should * be done. */ static int multipath_read_balance (multipath_conf_t *conf) { int disk; for (disk = 0; disk < conf->raid_disks; disk++) if (conf->multipaths[disk].operational) return disk; BUG(); return 0; } static int multipath_make_request (request_queue_t *q, struct bio * bio) { mddev_t *mddev = q->queuedata; multipath_conf_t *conf = mddev_to_conf(mddev); struct bio *real_bio; struct multipath_bh * mp_bh; struct multipath_info *multipath; mp_bh = multipath_alloc_mpbh (conf); mp_bh->master_bio = bio; mp_bh->mddev = mddev; mp_bh->cmd = bio_data_dir(bio); /* * read balancing logic: */ multipath = conf->multipaths + multipath_read_balance(conf); real_bio = bio_clone(bio, GFP_NOIO); real_bio->bi_bdev = multipath->bdev; real_bio->bi_rw = bio_data_dir(bio); real_bio->bi_end_io = multipath_end_request; real_bio->bi_private = mp_bh; mp_bh->bio = real_bio; generic_make_request(real_bio); return 0; } static int multipath_status (char *page, mddev_t *mddev) { multipath_conf_t *conf = mddev_to_conf(mddev); int sz = 0, i; sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); for (i = 0; i < conf->raid_disks; i++) sz += sprintf (page+sz, "%s", conf->multipaths[i].operational ? "U" : "_"); sz += sprintf (page+sz, "]"); return sz; } #define LAST_DISK KERN_ALERT \ "multipath: only one IO path left and IO error.\n" #define NO_SPARE_DISK KERN_ALERT \ "multipath: no spare IO path left!\n" #define DISK_FAILED KERN_ALERT \ "multipath: IO failure on %s, disabling IO path. \n" \ " Operation continuing on %d IO paths.\n" static void mark_disk_bad (mddev_t *mddev, int failed) { multipath_conf_t *conf = mddev_to_conf(mddev); struct multipath_info *multipath = conf->multipaths+failed; mdp_super_t *sb = mddev->sb; multipath->operational = 0; mark_disk_faulty(sb->disks+multipath->number); mark_disk_nonsync(sb->disks+multipath->number); mark_disk_inactive(sb->disks+multipath->number); sb->active_disks--; sb->working_disks--; sb->failed_disks++; mddev->sb_dirty = 1; md_wakeup_thread(conf->thread); conf->working_disks--; printk (DISK_FAILED, partition_name (multipath->dev), conf->working_disks); } /* * Careful, this can execute in IRQ contexts as well! */ static int multipath_error (mddev_t *mddev, kdev_t dev) { multipath_conf_t *conf = mddev_to_conf(mddev); struct multipath_info * multipaths = conf->multipaths; int disks = MD_SB_DISKS; int other_paths = 1; int i; if (conf->working_disks == 1) { other_paths = 0; for (i = 0; i < disks; i++) { if (multipaths[i].spare) { other_paths = 1; break; } } } if (!other_paths) { /* * Uh oh, we can do nothing if this is our last path, but * first check if this is a queued request for a device * which has just failed. */ for (i = 0; i < disks; i++) { if (kdev_same(multipaths[i].dev, dev) && !multipaths[i].operational) return 0; } printk (LAST_DISK); } else { /* * Mark disk as unusable */ for (i = 0; i < disks; i++) { if (kdev_same(multipaths[i].dev,dev) && multipaths[i].operational) { mark_disk_bad(mddev, i); break; } } if (!conf->working_disks) { int err = 1; mdp_disk_t *spare; mdp_super_t *sb = mddev->sb; spare = get_spare(mddev); if (spare) { err = multipath_diskop(mddev, &spare, DISKOP_SPARE_WRITE); printk("got DISKOP_SPARE_WRITE err: %d. (spare_faulty(): %d)\n", err, disk_faulty(spare)); } if (!err && !disk_faulty(spare)) { multipath_diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); mark_disk_sync(spare); mark_disk_active(spare); sb->active_disks++; sb->spare_disks--; } } } return 0; } #undef LAST_DISK #undef NO_SPARE_DISK #undef DISK_FAILED static void print_multipath_conf (multipath_conf_t *conf) { int i; struct multipath_info *tmp; printk("MULTIPATH conf printout:\n"); if (!conf) { printk("(conf==NULL)\n"); return; } printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks, conf->raid_disks, conf->nr_disks); for (i = 0; i < MD_SB_DISKS; i++) { tmp = conf->multipaths + i; if (tmp->spare || tmp->operational || tmp->number || tmp->raid_disk || tmp->used_slot) printk(" disk%d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", i, tmp->spare,tmp->operational, tmp->number,tmp->raid_disk,tmp->used_slot, partition_name(tmp->dev)); } } static int multipath_diskop(mddev_t *mddev, mdp_disk_t **d, int state) { int err = 0; int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1; multipath_conf_t *conf = mddev->private; struct multipath_info *tmp, *sdisk, *fdisk, *rdisk, *adisk; mdp_super_t *sb = mddev->sb; mdp_disk_t *failed_desc, *spare_desc, *added_desc; mdk_rdev_t *spare_rdev, *failed_rdev; struct block_device *bdev; print_multipath_conf(conf); spin_lock_irq(&conf->device_lock); /* * find the disk ... */ switch (state) { case DISKOP_SPARE_ACTIVE: /* * Find the failed disk within the MULTIPATH configuration ... * (this can only be in the first conf->working_disks part) */ for (i = 0; i < conf->raid_disks; i++) { tmp = conf->multipaths + i; if ((!tmp->operational && !tmp->spare) || !tmp->used_slot) { failed_disk = i; break; } } /* * When we activate a spare disk we _must_ have a disk in * the lower (active) part of the array to replace. */ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { MD_BUG(); err = 1; goto abort; } /* fall through */ case DISKOP_SPARE_WRITE: case DISKOP_SPARE_INACTIVE: /* * Find the spare disk ... (can only be in the 'high' * area of the array) */ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { tmp = conf->multipaths + i; if (tmp->spare && tmp->number == (*d)->number) { spare_disk = i; break; } } if (spare_disk == -1) { MD_BUG(); err = 1; goto abort; } break; case DISKOP_HOT_REMOVE_DISK: for (i = 0; i < MD_SB_DISKS; i++) { tmp = conf->multipaths + i; if (tmp->used_slot && (tmp->number == (*d)->number)) { if (tmp->operational) { printk(KERN_ERR "hot-remove-disk, slot %d is identified to be the requested disk (number %d), but is still operational!\n", i, (*d)->number); err = -EBUSY; goto abort; } removed_disk = i; break; } } if (removed_disk == -1) { MD_BUG(); err = 1; goto abort; } break; case DISKOP_HOT_ADD_DISK: for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { tmp = conf->multipaths + i; if (!tmp->used_slot) { added_disk = i; break; } } if (added_disk == -1) { MD_BUG(); err = 1; goto abort; } break; } switch (state) { /* * Switch the spare disk to write-only mode: */ case DISKOP_SPARE_WRITE: sdisk = conf->multipaths + spare_disk; sdisk->operational = 1; break; /* * Deactivate a spare disk: */ case DISKOP_SPARE_INACTIVE: sdisk = conf->multipaths + spare_disk; sdisk->operational = 0; break; /* * Activate (mark read-write) the (now sync) spare disk, * which means we switch it's 'raid position' (->raid_disk) * with the failed disk. (only the first 'conf->nr_disks' * slots are used for 'real' disks and we must preserve this * property) */ case DISKOP_SPARE_ACTIVE: sdisk = conf->multipaths + spare_disk; fdisk = conf->multipaths + failed_disk; spare_desc = &sb->disks[sdisk->number]; failed_desc = &sb->disks[fdisk->number]; if (spare_desc != *d) { MD_BUG(); err = 1; goto abort; } if (spare_desc->raid_disk != sdisk->raid_disk) { MD_BUG(); err = 1; goto abort; } if (sdisk->raid_disk != spare_disk) { MD_BUG(); err = 1; goto abort; } if (failed_desc->raid_disk != fdisk->raid_disk) { MD_BUG(); err = 1; goto abort; } if (fdisk->raid_disk != failed_disk) { MD_BUG(); err = 1; goto abort; } /* * do the switch finally */ spare_rdev = find_rdev_nr(mddev, spare_desc->number); failed_rdev = find_rdev_nr(mddev, failed_desc->number); xchg_values(spare_rdev->desc_nr, failed_rdev->desc_nr); spare_rdev->alias_device = 0; failed_rdev->alias_device = 1; xchg_values(*spare_desc, *failed_desc); xchg_values(*fdisk, *sdisk); /* * (careful, 'failed' and 'spare' are switched from now on) * * we want to preserve linear numbering and we want to * give the proper raid_disk number to the now activated * disk. (this means we switch back these values) */ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); xchg_values(sdisk->raid_disk, fdisk->raid_disk); xchg_values(spare_desc->number, failed_desc->number); xchg_values(sdisk->number, fdisk->number); *d = failed_desc; if (!sdisk->bdev) sdisk->used_slot = 0; /* * this really activates the spare. */ fdisk->spare = 0; /* * if we activate a spare, we definitely replace a * non-operational disk slot in the 'low' area of * the disk array. */ conf->working_disks++; break; case DISKOP_HOT_REMOVE_DISK: rdisk = conf->multipaths + removed_disk; if (rdisk->spare && (removed_disk < conf->raid_disks)) { MD_BUG(); err = 1; goto abort; } bdev = rdisk->bdev; rdisk->dev = NODEV; rdisk->bdev = NULL; rdisk->used_slot = 0; conf->nr_disks--; bdput(bdev); break; case DISKOP_HOT_ADD_DISK: adisk = conf->multipaths + added_disk; added_desc = *d; if (added_disk != added_desc->number) { MD_BUG(); err = 1; goto abort; } adisk->number = added_desc->number; adisk->raid_disk = added_desc->raid_disk; adisk->dev = mk_kdev(added_desc->major,added_desc->minor); /* it will be held open by rdev */ adisk->bdev = bdget(kdev_t_to_nr(adisk->dev)); adisk->operational = 0; adisk->spare = 1; adisk->used_slot = 1; conf->nr_disks++; break; default: MD_BUG(); err = 1; goto abort; } abort: spin_unlock_irq(&conf->device_lock); print_multipath_conf(conf); return err; } #define IO_ERROR KERN_ALERT \ "multipath: %s: unrecoverable IO read error for block %lu\n" #define REDIRECT_SECTOR KERN_ERR \ "multipath: %s: redirecting sector %lu to another IO path\n" /* * This is a kernel thread which: * * 1. Retries failed read operations on working multipaths. * 2. Updates the raid superblock when problems encounter. * 3. Performs writes following reads for array syncronising. */ static void multipathd (void *data) { struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; mddev_t *mddev; struct block_device *bdev; for (;;) { spin_lock_irqsave(&retry_list_lock, flags); mp_bh = multipath_retry_list; if (!mp_bh) break; multipath_retry_list = mp_bh->next_mp; spin_unlock_irqrestore(&retry_list_lock, flags); mddev = mp_bh->mddev; if (mddev->sb_dirty) { printk(KERN_INFO "dirty sb detected, updating.\n"); md_update_sb(mddev); } bio = mp_bh->bio; bdev = bio->bi_bdev; multipath_map (mddev, &bio->bi_bdev); if (bio->bi_bdev == bdev) { printk(IO_ERROR, bdev_partition_name(bio->bi_bdev), bio->bi_sector); multipath_end_bh_io(mp_bh, 0); } else { printk(REDIRECT_SECTOR, bdev_partition_name(bio->bi_bdev), bio->bi_sector); generic_make_request(bio); } } spin_unlock_irqrestore(&retry_list_lock, flags); } #undef IO_ERROR #undef REDIRECT_SECTOR /* * This will catch the scenario in which one of the multipaths was * mounted as a normal device rather than as a part of a raid set. * * check_consistency is very personality-dependent, eg. RAID5 cannot * do this check, it uses another method. */ static int __check_consistency (mddev_t *mddev, int row) { multipath_conf_t *conf = mddev_to_conf(mddev); int disks = MD_SB_DISKS; struct block_device *bdev; int i, rc = 0; char *buffer; struct page *page = NULL; int first = 1; int order = PAGE_CACHE_SHIFT-PAGE_SHIFT; buffer = (char *) __get_free_pages(GFP_KERNEL, order); if (!buffer) return rc; for (i = 0; i < disks; i++) { struct address_space *mapping; char *p; if (!conf->multipaths[i].operational) continue; printk("(checking disk %d)\n",i); bdev = conf->multipaths[i].bdev; mapping = bdev->bd_inode->i_mapping; page = read_cache_page(mapping, row/(PAGE_CACHE_SIZE/1024), (filler_t *)mapping->a_ops->readpage, NULL); if (IS_ERR(page)) { page = NULL; break; } wait_on_page_locked(page); if (!PageUptodate(page)) break; if (PageError(page)) break; p = page_address(page); if (first) { memcpy(buffer, p, PAGE_CACHE_SIZE); first = 0; } else if (memcmp(buffer, p, PAGE_CACHE_SIZE)) { rc = 1; break; } page_cache_release(page); fsync_bdev(bdev); invalidate_bdev(bdev, 0); page = NULL; } if (page) { bdev = page->mapping->host->i_bdev; page_cache_release(page); fsync_bdev(bdev); invalidate_bdev(bdev, 0); } free_pages((unsigned long) buffer, order); return rc; } static int check_consistency (mddev_t *mddev) { if (__check_consistency(mddev, 0)) /* * we do not do this currently, as it's perfectly possible to * have an inconsistent array when it's freshly created. Only * newly written data has to be consistent. */ return 0; return 0; } #define INVALID_LEVEL KERN_WARNING \ "multipath: md%d: raid level not set to multipath IO (%d)\n" #define NO_SB KERN_ERR \ "multipath: disabled IO path %s (couldn't access raid superblock)\n" #define ERRORS KERN_ERR \ "multipath: disabled IO path %s (errors detected)\n" #define NOT_IN_SYNC KERN_ERR \ "multipath: making IO path %s a spare path (not in sync)\n" #define INCONSISTENT KERN_ERR \ "multipath: disabled IO path %s (inconsistent descriptor)\n" #define ALREADY_RUNNING KERN_ERR \ "multipath: disabled IO path %s (multipath %d already operational)\n" #define OPERATIONAL KERN_INFO \ "multipath: device %s operational as IO path %d\n" #define MEM_ERROR KERN_ERR \ "multipath: couldn't allocate memory for md%d\n" #define SPARE KERN_INFO \ "multipath: spare IO path %s\n" #define NONE_OPERATIONAL KERN_ERR \ "multipath: no operational IO paths for md%d\n" #define SB_DIFFERENCES KERN_ERR \ "multipath: detected IO path differences!\n" #define ARRAY_IS_ACTIVE KERN_INFO \ "multipath: array md%d active with %d out of %d IO paths (%d spare IO paths)\n" #define THREAD_ERROR KERN_ERR \ "multipath: couldn't allocate thread for md%d\n" static int multipath_run (mddev_t *mddev) { multipath_conf_t *conf; int i, j, disk_idx; struct multipath_info *disk, *disk2; mdp_super_t *sb = mddev->sb; mdp_disk_t *desc, *desc2; mdk_rdev_t *rdev, *def_rdev = NULL; struct list_head *tmp; int num_rdevs = 0; MOD_INC_USE_COUNT; if (sb->level != -4) { printk(INVALID_LEVEL, mdidx(mddev), sb->level); goto out; } /* * copy the already verified devices into our private MULTIPATH * bookkeeping area. [whatever we allocate in multipath_run(), * should be freed in multipath_stop()] */ conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); mddev->private = conf; if (!conf) { printk(MEM_ERROR, mdidx(mddev)); goto out; } memset(conf, 0, sizeof(*conf)); ITERATE_RDEV(mddev,rdev,tmp) { if (rdev->faulty) { /* this is a "should never happen" case and if it */ /* ever does happen, a continue; won't help */ printk(ERRORS, partition_name(rdev->dev)); continue; } else { /* this is a "should never happen" case and if it */ /* ever does happen, a continue; won't help */ if (!rdev->sb) { MD_BUG(); continue; } } if (rdev->desc_nr == -1) { MD_BUG(); continue; } desc = &sb->disks[rdev->desc_nr]; disk_idx = desc->raid_disk; disk = conf->multipaths + disk_idx; if (!disk_sync(desc)) printk(NOT_IN_SYNC, partition_name(rdev->dev)); /* * Mark all disks as spare to start with, then pick our * active disk. If we have a disk that is marked active * in the sb, then use it, else use the first rdev. */ disk->number = desc->number; disk->raid_disk = desc->raid_disk; disk->dev = rdev->dev; disk->bdev = rdev->bdev; atomic_inc(&rdev->bdev->bd_count); disk->operational = 0; disk->spare = 1; disk->used_slot = 1; mark_disk_sync(desc); if (disk_active(desc)) { if(!conf->working_disks) { printk(OPERATIONAL, partition_name(rdev->dev), desc->raid_disk); disk->operational = 1; disk->spare = 0; conf->working_disks++; def_rdev = rdev; } else { mark_disk_spare(desc); } } else mark_disk_spare(desc); if(!num_rdevs++) def_rdev = rdev; } if(!conf->working_disks && num_rdevs) { desc = &sb->disks[def_rdev->desc_nr]; disk = conf->multipaths + desc->raid_disk; printk(OPERATIONAL, partition_name(def_rdev->dev), disk->raid_disk); disk->operational = 1; disk->spare = 0; conf->working_disks++; mark_disk_active(desc); } /* * Make sure our active path is in desc spot 0 */ if(def_rdev->desc_nr != 0) { rdev = find_rdev_nr(mddev, 0); desc = &sb->disks[def_rdev->desc_nr]; desc2 = sb->disks; disk = conf->multipaths + desc->raid_disk; disk2 = conf->multipaths + desc2->raid_disk; xchg_values(*desc2,*desc); xchg_values(*disk2,*disk); xchg_values(desc2->number, desc->number); xchg_values(disk2->number, disk->number); xchg_values(desc2->raid_disk, desc->raid_disk); xchg_values(disk2->raid_disk, disk->raid_disk); if(rdev) { xchg_values(def_rdev->desc_nr,rdev->desc_nr); } else { def_rdev->desc_nr = 0; } } conf->raid_disks = sb->raid_disks = sb->active_disks = 1; conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs; sb->failed_disks = 0; sb->spare_disks = num_rdevs - 1; mddev->sb_dirty = 1; conf->mddev = mddev; conf->device_lock = SPIN_LOCK_UNLOCKED; init_waitqueue_head(&conf->wait_buffer); if (!conf->working_disks) { printk(NONE_OPERATIONAL, mdidx(mddev)); goto out_free_conf; } /* pre-allocate some buffer_head structures. * As a minimum, 1 mpbh and raid_disks buffer_heads * would probably get us by in tight memory situations, * but a few more is probably a good idea. * For now, try NR_RESERVED_BUFS mpbh and * NR_RESERVED_BUFS*raid_disks bufferheads * This will allow at least NR_RESERVED_BUFS concurrent * reads or writes even if kmalloc starts failing */ if (multipath_grow_mpbh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) { printk(MEM_ERROR, mdidx(mddev)); goto out_free_conf; } if ((sb->state & (1 << MD_SB_CLEAN))) { /* * we do sanity checks even if the device says * it's clean ... */ if (check_consistency(mddev)) { printk(SB_DIFFERENCES); sb->state &= ~(1 << MD_SB_CLEAN); } } { const char * name = "multipathd"; conf->thread = md_register_thread(multipathd, conf, name); if (!conf->thread) { printk(THREAD_ERROR, mdidx(mddev)); goto out_free_conf; } } /* * Regenerate the "device is in sync with the raid set" bit for * each device. */ for (i = 0; i < MD_SB_DISKS; i++) { mark_disk_nonsync(sb->disks+i); for (j = 0; j < sb->raid_disks; j++) { if (sb->disks[i].number == conf->multipaths[j].number) mark_disk_sync(sb->disks+i); } } printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks, sb->spare_disks); /* * Ok, everything is just fine now */ return 0; out_free_conf: multipath_shrink_mpbh(conf); for (i = 0; i < MD_SB_DISKS; i++) if (conf->multipaths[i].bdev) bdput(conf->multipaths[i].bdev); kfree(conf); mddev->private = NULL; out: MOD_DEC_USE_COUNT; return -EIO; } #undef INVALID_LEVEL #undef NO_SB #undef ERRORS #undef NOT_IN_SYNC #undef INCONSISTENT #undef ALREADY_RUNNING #undef OPERATIONAL #undef SPARE #undef NONE_OPERATIONAL #undef SB_DIFFERENCES #undef ARRAY_IS_ACTIVE static int multipath_stop (mddev_t *mddev) { multipath_conf_t *conf = mddev_to_conf(mddev); int i; md_unregister_thread(conf->thread); multipath_shrink_mpbh(conf); for (i = 0; i < MD_SB_DISKS; i++) if (conf->multipaths[i].bdev) bdput(conf->multipaths[i].bdev); kfree(conf); mddev->private = NULL; MOD_DEC_USE_COUNT; return 0; } static mdk_personality_t multipath_personality= { name: "multipath", make_request: multipath_make_request, run: multipath_run, stop: multipath_stop, status: multipath_status, error_handler: multipath_error, diskop: multipath_diskop, }; static int __init multipath_init (void) { return register_md_personality (MULTIPATH, &multipath_personality); } static void __exit multipath_exit (void) { unregister_md_personality (MULTIPATH); } module_init(multipath_init); module_exit(multipath_exit); MODULE_LICENSE("GPL");