Commit 197e67c5 authored by Linus Torvalds

Import 2.3.12pre7

parent 72be09b0
......@@ -55,7 +55,6 @@
unsigned long init_user_stack[1024] = { STACK_MAGIC, };
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -6,7 +6,6 @@
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -8,7 +8,6 @@
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -40,7 +40,6 @@
*/
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -6,7 +6,6 @@
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -47,7 +47,6 @@ extern unsigned long _get_SP(void);
struct task_struct *last_task_used_math = NULL;
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -6,7 +6,6 @@
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -6,7 +6,6 @@
static struct vm_area_struct init_mmap = INIT_MMAP;
static struct fs_struct init_fs = INIT_FS;
static struct file * init_fd_array[NR_OPEN] = { NULL, };
static struct files_struct init_files = INIT_FILES;
static struct signal_struct init_signals = INIT_SIGNALS;
struct mm_struct init_mm = INIT_MM(init_mm);
......
......@@ -866,7 +866,7 @@ asmlinkage int solaris_getmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
SOLD("entry");
lock_kernel();
if(fd >= NR_OPEN) goto out;
if(fd >= current->files->max_fds) goto out;
filp = current->files->fd[fd];
if(!filp) goto out;
......@@ -933,7 +933,7 @@ asmlinkage int solaris_putmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
SOLD("entry");
lock_kernel();
if(fd >= NR_OPEN) goto out;
if(fd >= current->files->max_fds) goto out;
filp = current->files->fd[fd];
if(!filp) goto out;
......
......@@ -428,6 +428,9 @@ void make_request(int major,int rw, struct buffer_head * bh)
kstat.pgpgin++;
max_req = NR_REQUEST; /* reads take precedence */
break;
case WRITERAW:
rw = WRITE;
goto do_write; /* Skip the buffer refile */
case WRITEA:
rw_ahead = 1;
rw = WRITE; /* drop into WRITE */
......@@ -435,6 +438,7 @@ void make_request(int major,int rw, struct buffer_head * bh)
if (!test_and_clear_bit(BH_Dirty, &bh->b_state))
goto end_io; /* Hmmph! Nothing to write */
refile_buffer(bh);
do_write:
/*
* We don't allow the write-requests to fill up the
* queue completely: we want some room for reads,
......@@ -641,7 +645,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head * bh[])
#endif
}
if ((rw == WRITE || rw == WRITEA) && is_read_only(bh[0]->b_dev)) {
if ((rw & WRITE) && is_read_only(bh[0]->b_dev)) {
printk(KERN_NOTICE "Can't write to read-only device %s\n",
kdevname(bh[0]->b_dev));
goto sorry;
......
......@@ -20,7 +20,7 @@ FONTMAPFILE = cp437.uni
L_TARGET := char.a
M_OBJS :=
L_OBJS := tty_io.o n_tty.o tty_ioctl.o mem.o random.o
L_OBJS := tty_io.o n_tty.o tty_ioctl.o mem.o random.o raw.o
LX_OBJS := pty.o misc.o
ifdef CONFIG_VT
......
......@@ -16,6 +16,7 @@
#include <linux/init.h>
#include <linux/joystick.h>
#include <linux/i2c.h>
#include <linux/raw.h>
#include <asm/uaccess.h>
#include <asm/io.h>
......@@ -602,6 +603,7 @@ __initfunc(int chr_dev_init(void))
if (register_chrdev(MEM_MAJOR,"mem",&memory_fops))
printk("unable to get major %d for memory devs\n", MEM_MAJOR);
rand_initialize();
raw_init();
#ifdef CONFIG_USB
usb_init();
#endif
......
/*
* linux/drivers/char/raw.c
*
* Front-end raw character devices. These can be bound to any block
* devices to provide genuine Unix raw character device semantics.
*
* We reserve minor number 0 for a control interface. ioctl()s on this
* device are used to bind the other minor numbers to block devices.
*/
#include <linux/fs.h>
#include <linux/iobuf.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/raw.h>
#include <asm/uaccess.h>
#define dprintk(x...)
static kdev_t raw_device_bindings[256] = {};
static int raw_device_inuse[256] = {};
static int raw_device_sector_size[256] = {};
static int raw_device_sector_bits[256] = {};
extern struct file_operations * get_blkfops(unsigned int major);
static ssize_t rw_raw_dev(int rw, struct file *, char *, size_t, loff_t *);
ssize_t raw_read(struct file *, char *, size_t, loff_t *);
ssize_t raw_write(struct file *, const char *, size_t, loff_t *);
int raw_open(struct inode *, struct file *);
int raw_release(struct inode *, struct file *);
int raw_ctl_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
static struct file_operations raw_fops = {
NULL, /* llseek */
raw_read, /* read */
raw_write, /* write */
NULL, /* readdir */
NULL, /* poll */
NULL, /* ioctl */
NULL, /* mmap */
raw_open, /* open */
NULL, /* flush */
raw_release, /* release */
NULL /* fsync */
};
static struct file_operations raw_ctl_fops = {
NULL, /* llseek */
NULL, /* read */
NULL, /* write */
NULL, /* readdir */
NULL, /* poll */
raw_ctl_ioctl, /* ioctl */
NULL, /* mmap */
raw_open, /* open */
NULL, /* flush */
NULL, /* no special release code */
NULL /* fsync */
};
void __init raw_init(void)
{
register_chrdev(RAW_MAJOR, "raw", &raw_fops);
}
/*
* The raw IO open and release code needs to fake appropriate
* open/release calls to the underlying block devices.
*/
static int bdev_open(kdev_t dev, int mode)
{
int err = 0;
struct file dummy_file = {};
struct dentry dummy_dentry = {};
struct inode * inode = get_empty_inode();
if (!inode)
return -ENOMEM;
dummy_file.f_op = get_blkfops(MAJOR(dev));
if (!dummy_file.f_op) {
err = -ENODEV;
goto done;
}
if (dummy_file.f_op->open) {
inode->i_rdev = dev;
dummy_dentry.d_inode = inode;
dummy_file.f_dentry = &dummy_dentry;
dummy_file.f_mode = mode;
err = dummy_file.f_op->open(inode, &dummy_file);
}
done:
iput(inode);
return err;
}
static int bdev_close(kdev_t dev)
{
int err;
struct inode * inode = get_empty_inode();
if (!inode)
return -ENOMEM;
inode->i_rdev = dev;
err = blkdev_release(inode);
iput(inode);
return err;
}
/*
* Open/close code for raw IO.
*/
int raw_open(struct inode *inode, struct file *filp)
{
int minor;
kdev_t bdev;
int err;
int sector_size;
int sector_bits;
minor = MINOR(inode->i_rdev);
/*
* Is it the control device?
*/
if (minor == 0) {
filp->f_op = &raw_ctl_fops;
return 0;
}
/*
* No, it is a normal raw device. All we need to do on open is
* to check that the device is bound, and force the underlying
* block device to a sector-size blocksize.
*/
bdev = raw_device_bindings[minor];
if (bdev == NODEV)
return -ENODEV;
err = bdev_open(bdev, filp->f_mode);
if (err)
return err;
/*
* Don't change the blocksize if we already have users using
* this device
*/
if (raw_device_inuse[minor]++)
return 0;
/*
* Don't interfere with mounted devices: we cannot safely set
* the blocksize on a device which is already mounted.
*/
sector_size = 512;
if (lookup_vfsmnt(bdev) != NULL) {
if (blksize_size[MAJOR(bdev)])
sector_size = blksize_size[MAJOR(bdev)][MINOR(bdev)];
} else {
if (hardsect_size[MAJOR(bdev)])
sector_size = hardsect_size[MAJOR(bdev)][MINOR(bdev)];
}
set_blocksize(bdev, sector_size);
raw_device_sector_size[minor] = sector_size;
for (sector_bits = 0; !(sector_size & 1); )
sector_size>>=1, sector_bits++;
raw_device_sector_bits[minor] = sector_bits;
return 0;
}
int raw_release(struct inode *inode, struct file *filp)
{
int minor;
kdev_t bdev;
minor = MINOR(inode->i_rdev);
bdev = raw_device_bindings[minor];
bdev_close(bdev);
raw_device_inuse[minor]--;
return 0;
}
/*
* Deal with ioctls against the raw-device control interface, to bind
* and unbind other raw devices.
*/
int raw_ctl_ioctl(struct inode *inode,
struct file *filp,
unsigned int command,
unsigned long arg)
{
struct raw_config_request rq;
int err = 0;
int minor;
switch (command) {
case RAW_SETBIND:
case RAW_GETBIND:
/* First, find out which raw minor we want */
err = copy_from_user(&rq, (void *) arg, sizeof(rq));
if (err)
break;
minor = rq.raw_minor;
if (minor == 0 || minor > MINORMASK) {
err = -EINVAL;
break;
}
if (command == RAW_SETBIND) {
/*
* For now, we don't need to check that the underlying
* block device is present or not: we can do that when
* the raw device is opened. Just check that the
* major/minor numbers make sense.
*/
if (rq.block_major == NODEV ||
rq.block_major > MAX_BLKDEV ||
rq.block_minor > MINORMASK) {
err = -EINVAL;
break;
}
if (raw_device_inuse[minor]) {
err = -EBUSY;
break;
}
raw_device_bindings[minor] =
MKDEV(rq.block_major, rq.block_minor);
} else {
rq.block_major = MAJOR(raw_device_bindings[minor]);
rq.block_minor = MINOR(raw_device_bindings[minor]);
err = copy_to_user((void *) arg, &rq, sizeof(rq));
}
break;
default:
err = -EINVAL;
}
return err;
}
ssize_t raw_read(struct file *filp, char * buf,
size_t size, loff_t *offp)
{
return rw_raw_dev(READ, filp, buf, size, offp);
}
ssize_t raw_write(struct file *filp, const char *buf,
size_t size, loff_t *offp)
{
return rw_raw_dev(WRITE, filp, (char *) buf, size, offp);
}
#define SECTOR_BITS 9
#define SECTOR_SIZE (1U << SECTOR_BITS)
#define SECTOR_MASK (SECTOR_SIZE - 1)
ssize_t rw_raw_dev(int rw, struct file *filp, char *buf,
size_t size, loff_t *offp)
{
struct kiobuf * iobuf;
int err;
unsigned long blocknr, blocks;
unsigned long b[KIO_MAX_SECTORS];
size_t transferred;
int iosize;
int i;
int minor;
kdev_t dev;
unsigned long limit;
int sector_size, sector_bits, sector_mask;
int max_sectors;
/*
* First, a few checks on device size limits
*/
minor = MINOR(filp->f_dentry->d_inode->i_rdev);
dev = raw_device_bindings[minor];
sector_size = raw_device_sector_size[minor];
sector_bits = raw_device_sector_bits[minor];
sector_mask = sector_size - 1;
max_sectors = KIO_MAX_SECTORS >> (sector_bits - 9);
if (blk_size[MAJOR(dev)])
limit = (((loff_t) blk_size[MAJOR(dev)][MINOR(dev)]) << BLOCK_SIZE_BITS) >> sector_bits;
else
limit = INT_MAX;
dprintk ("rw_raw_dev: dev %d:%d (+%d)\n",
MAJOR(dev), MINOR(dev), limit);
if ((*offp & sector_mask) || (size & sector_mask))
return -EINVAL;
if ((*offp >> sector_bits) > limit)
return 0;
/*
* We'll just use one kiobuf
*/
err = alloc_kiovec(1, &iobuf);
if (err)
return err;
/*
* Split the IO into KIO_MAX_SECTORS chunks, mapping and
* unmapping the single kiobuf as we go to perform each chunk of
* IO.
*/
transferred = 0;
blocknr = *offp >> sector_bits;
while (size > 0) {
blocks = size >> sector_bits;
if (blocks > max_sectors)
blocks = max_sectors;
if (blocks > limit - blocknr)
blocks = limit - blocknr;
if (!blocks)
break;
iosize = blocks << sector_bits;
err = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
if (err)
break;
for (i=0; i < blocks; i++)
b[i] = blocknr++;
err = brw_kiovec(rw, 1, &iobuf, dev, b, sector_size, 0);
if (err >= 0) {
transferred += err;
size -= err;
buf += err;
}
unmap_kiobuf(iobuf);
if (err != iosize)
break;
}
free_kiovec(1, &iobuf);
if (transferred) {
*offp += transferred;
return transferred;
}
return err;
}
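As a quick illustration of the alignment rule enforced above (rw_raw_dev returns -EINVAL unless both the file offset and the transfer size are multiples of the sector size), here is a minimal user-space sketch; the /dev/raw1 node name is an assumption, not part of this patch.
/* Hedged sketch: one aligned 512-byte read from a bound raw device.
   The node /dev/raw1 (RAW_MAJOR, minor 1) is assumed to exist; only
   the offset/size alignment rule comes from rw_raw_dev() above. */
#include <fcntl.h>
#include <unistd.h>
int main(void)
{
	static char buf[512];			/* one sector's worth */
	int fd = open("/dev/raw1", O_RDONLY);
	if (fd < 0)
		return 1;
	/* offset 0 and size 512 are both sector-aligned, so this passes
	   the (*offp & sector_mask) || (size & sector_mask) check */
	if (read(fd, buf, 512) != 512)
		return 1;
	close(fd);
	return 0;
}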
......@@ -432,38 +432,14 @@ int usb_hub_init(void)
void usb_hub_cleanup(void)
{
struct list_head *next, *tmp, *head = &all_hubs_list;
struct usb_hub *hub;
unsigned long flags, flags2;
/* Free the resources allocated by each hub */
spin_lock_irqsave(&hub_list_lock, flags);
spin_lock_irqsave(&hub_event_lock, flags2);
tmp = head->next;
while (tmp != head) {
hub = list_entry(tmp, struct usb_hub, hub_list);
next = tmp->next;
list_del(&hub->event_list);
INIT_LIST_HEAD(&hub->event_list);
list_del(tmp); /* &hub->hub_list */
INIT_LIST_HEAD(tmp); /* &hub->hub_list */
/* XXX we should disconnect each connected port here */
usb_release_irq(hub->dev, hub->irq_handle);
hub->irq_handle = NULL;
kfree(hub);
tmp = next;
}
/*
* Hub resources are freed for us by usb_deregister. It calls
* usb_driver_purge on every device, which in turn calls that
* device's disconnect function if it is using this driver.
* The hub_disconnect function takes care of releasing the
* individual hub resources. -greg
*/
usb_deregister(&hub_driver);
spin_unlock_irqrestore(&hub_event_lock, flags2);
spin_unlock_irqrestore(&hub_list_lock, flags);
} /* usb_hub_cleanup() */
#ifdef MODULE
......
......@@ -13,7 +13,7 @@ O_TARGET := fs.o
O_OBJS = open.o read_write.o devices.o file_table.o buffer.o \
super.o block_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o fifo.o locks.o filesystems.o \
dcache.o inode.o attr.o bad_inode.o $(BINFMTS)
dcache.o inode.o attr.o bad_inode.o file.o iobuf.o $(BINFMTS)
MOD_LIST_NAME := FS_MODULES
ALL_SUB_DIRS = coda minix ext2 fat msdos vfat proc isofs nfs umsdos ntfs \
......
......@@ -40,6 +40,7 @@
#include <linux/file.h>
#include <linux/init.h>
#include <linux/quotaops.h>
#include <linux/iobuf.h>
#include <asm/uaccess.h>
#include <asm/io.h>
......@@ -1527,6 +1528,221 @@ int block_write_partial_page(struct file *file, struct page *page, unsigned long
return err;
}
/*
* IO completion routine for a buffer_head being used for kiobuf IO: we
* can't dispatch the kiobuf callback until io_count reaches 0.
*/
static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
{
struct kiobuf *kiobuf;
mark_buffer_uptodate(bh, uptodate);
kiobuf = bh->b_kiobuf;
if (atomic_dec_and_test(&kiobuf->io_count))
kiobuf->end_io(kiobuf);
if (!uptodate)
kiobuf->errno = -EIO;
}
/*
* For brw_kiovec: submit a set of buffer_head temporary IOs and wait
* for them to complete. Clean up the buffer_heads afterwards.
*/
#define dprintk(x...)
static int do_kio(struct kiobuf *kiobuf,
int rw, int nr, struct buffer_head *bh[], int size)
{
int iosize;
int i;
struct buffer_head *tmp;
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
dprintk ("do_kio start %d\n", rw);
if (rw == WRITE)
rw = WRITERAW;
atomic_add(nr, &kiobuf->io_count);
kiobuf->errno = 0;
ll_rw_block(rw, nr, bh);
kiobuf_wait_for_io(kiobuf);
spin_lock(&unused_list_lock);
iosize = 0;
for (i = nr; --i >= 0; ) {
iosize += size;
tmp = bh[i];
if (!buffer_uptodate(tmp)) {
/* We are traversing bh'es in reverse order so
clearing iosize on error calculates the
amount of IO before the first error. */
iosize = 0;
}
__put_unused_buffer_head(tmp);
}
spin_unlock(&unused_list_lock);
dprintk ("do_kio end %d %d\n", iosize, err);
if (iosize)
return iosize;
if (kiobuf->errno)
return kiobuf->errno;
return -EIO;
}
/*
* Start I/O on a physical range of kernel memory, defined by a vector
* of kiobuf structs (much like a user-space iovec list).
*
* The kiobuf must already be locked for IO. IO is submitted
* asynchronously: you need to check page->locked, page->uptodate, and
* maybe wait on page->wait.
*
* It is up to the caller to make sure that there are enough blocks
* passed in to completely map the iobufs to disk.
*/
int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
kdev_t dev, unsigned long b[], int size, int bmap)
{
int err;
int length;
int transferred;
int i;
int bufind;
int pageind;
int bhind;
int offset;
unsigned long blocknr;
struct kiobuf * iobuf = NULL;
unsigned long page;
struct page * map;
struct buffer_head *tmp, *bh[KIO_MAX_SECTORS];
if (!nr)
return 0;
/*
* First, do some alignment and validity checks
*/
for (i = 0; i < nr; i++) {
iobuf = iovec[i];
if ((iobuf->offset & (size-1)) ||
(iobuf->length & (size-1)))
return -EINVAL;
if (!iobuf->locked)
panic("brw_kiovec: iobuf not locked for I/O");
if (!iobuf->nr_pages)
panic("brw_kiovec: iobuf not initialised");
}
/* DEBUG */
#if 0
return iobuf->length;
#endif
dprintk ("brw_kiovec: start\n");
/*
* OK to walk down the iovec doing page IO on each page we find.
*/
bufind = bhind = transferred = err = 0;
for (i = 0; i < nr; i++) {
iobuf = iovec[i];
offset = iobuf->offset;
length = iobuf->length;
dprintk ("iobuf %d %d %d\n", offset, length, size);
for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
page = iobuf->pagelist[pageind];
map = iobuf->maplist[pageind];
while (length > 0) {
blocknr = b[bufind++];
tmp = get_unused_buffer_head(0);
if (!tmp) {
err = -ENOMEM;
goto error;
}
tmp->b_dev = B_FREE;
tmp->b_size = size;
tmp->b_data = (char *) (page + offset);
tmp->b_this_page = tmp;
init_buffer(tmp, end_buffer_io_kiobuf, NULL);
tmp->b_dev = dev;
tmp->b_blocknr = blocknr;
tmp->b_state = 1 << BH_Mapped;
tmp->b_kiobuf = iobuf;
if (rw == WRITE) {
set_bit(BH_Uptodate, &tmp->b_state);
set_bit(BH_Dirty, &tmp->b_state);
}
dprintk ("buffer %d (%d) at %p\n",
bhind, tmp->b_blocknr, tmp->b_data);
bh[bhind++] = tmp;
length -= size;
offset += size;
/*
* Start the IO if we have got too much
*/
if (bhind >= KIO_MAX_SECTORS) {
err = do_kio(iobuf, rw, bhind, bh, size);
if (err >= 0)
transferred += err;
else
goto finished;
bhind = 0;
}
if (offset >= PAGE_SIZE) {
offset = 0;
break;
}
} /* End of block loop */
} /* End of page loop */
} /* End of iovec loop */
/* Is there any IO still left to submit? */
if (bhind) {
err = do_kio(iobuf, rw, bhind, bh, size);
if (err >= 0)
transferred += err;
else
goto finished;
}
finished:
dprintk ("brw_kiovec: end (%d, %d)\n", transferred, err);
if (transferred)
return transferred;
return err;
error:
/* We got an error allocating the bh'es. Just free the current
buffer_heads and exit. */
spin_lock(&unused_list_lock);
for (i = bhind; --i >= 0; ) {
__put_unused_buffer_head(bh[i]);
}
spin_unlock(&unused_list_lock);
goto finished;
}
/*
* Start I/O on a page.
* This function expects the page to be locked and may return
......
......@@ -449,9 +449,9 @@ static inline void flush_old_files(struct files_struct * files)
unsigned long set, i;
i = j * __NFDBITS;
if (i >= files->max_fds)
if (i >= files->max_fds || i >= files->max_fdset)
break;
set = xchg(&files->close_on_exec.fds_bits[j], 0);
set = xchg(&files->close_on_exec->fds_bits[j], 0);
j++;
for ( ; set ; i++,set >>= 1) {
if (set & 1)
......
......@@ -12,36 +12,89 @@
extern int sock_fcntl (struct file *, unsigned int cmd, unsigned long arg);
static inline int dupfd(struct file *file, unsigned int arg)
/*
* locate_fd finds a free file descriptor in the open_fds fdset,
* expanding the fd arrays if necessary. The files write lock will be
* held on exit to ensure that the fd can be entered atomically.
*/
static inline int locate_fd(struct files_struct *files,
struct file *file, int start)
{
struct files_struct * files = current->files;
unsigned int newfd;
int error;
error = -EMFILE;
write_lock(&files->file_lock);
arg = find_next_zero_bit(&files->open_fds, NR_OPEN, arg);
if (arg >= current->rlim[RLIMIT_NOFILE].rlim_cur)
goto out_putf;
FD_SET(arg, &files->open_fds);
FD_CLR(arg, &files->close_on_exec);
write_unlock(&files->file_lock);
fd_install(arg, file);
error = arg;
repeat:
error = -EMFILE;
if (start < files->next_fd)
start = files->next_fd;
if (start >= files->max_fdset) {
expand:
error = expand_files(files, start);
if (error < 0)
goto out;
goto repeat;
}
newfd = find_next_zero_bit(files->open_fds->fds_bits,
files->max_fdset, start);
error = -EMFILE;
if (newfd >= current->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
if (newfd >= files->max_fdset)
goto expand;
error = expand_files(files, newfd);
if (error < 0)
goto out;
if (error) /* If we might have blocked, try again. */
goto repeat;
if (start <= files->next_fd)
files->next_fd = newfd + 1;
error = newfd;
out:
return error;
}
static inline void allocate_fd(struct files_struct *files,
struct file *file, int fd)
{
FD_SET(fd, files->open_fds);
FD_CLR(fd, files->close_on_exec);
write_unlock(&files->file_lock);
fd_install(fd, file);
}
static int dupfd(struct file *file, int start)
{
struct files_struct * files = current->files;
int ret;
ret = locate_fd(files, file, start);
if (ret < 0)
goto out_putf;
allocate_fd(files, file, ret);
return ret;
out_putf:
write_unlock(&files->file_lock);
fput(file);
goto out;
return ret;
}
asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd)
{
int err = -EBADF;
struct file * file;
struct files_struct * files = current->files;
read_lock(&current->files->file_lock);
write_lock(&current->files->file_lock);
if (!(file = fcheck(oldfd)))
goto out_unlock;
err = newfd;
......@@ -50,15 +103,33 @@ asmlinkage int sys_dup2(unsigned int oldfd, unsigned int newfd)
err = -EBADF;
if (newfd >= NR_OPEN)
goto out_unlock; /* following POSIX.1 6.2.1 */
get_file(file);
read_unlock(&current->files->file_lock);
get_file(file); /* We are now finished with oldfd */
err = expand_files(files, newfd);
if (err < 0) {
write_unlock(&files->file_lock);
fput(file);
goto out;
}
/* To avoid races with open() and dup(), we will mark the fd as
* in-use in the open-file bitmap throughout the entire dup2()
* process. This is quite safe: do_close() uses the fd array
* entry, not the bitmap, to decide what work needs to be
* done. --sct */
FD_SET(newfd, files->open_fds);
write_unlock(&files->file_lock);
do_close(newfd, 0);
write_lock(&files->file_lock);
allocate_fd(files, file, newfd);
err = newfd;
sys_close(newfd);
err = dupfd(file, newfd);
out:
return err;
out_unlock:
read_unlock(&current->files->file_lock);
write_unlock(&current->files->file_lock);
goto out;
}
......@@ -66,6 +137,7 @@ asmlinkage int sys_dup(unsigned int fildes)
{
int ret = -EBADF;
struct file * file = fget(fildes);
if (file)
ret = dupfd(file, 0);
return ret;
......@@ -118,13 +190,13 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
}
break;
case F_GETFD:
err = FD_ISSET(fd, &current->files->close_on_exec);
err = FD_ISSET(fd, current->files->close_on_exec);
break;
case F_SETFD:
if (arg&1)
FD_SET(fd, &current->files->close_on_exec);
FD_SET(fd, current->files->close_on_exec);
else
FD_CLR(fd, &current->files->close_on_exec);
FD_CLR(fd, current->files->close_on_exec);
break;
case F_GETFL:
err = filp->f_flags;
......@@ -152,7 +224,6 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
err = filp->f_owner.pid;
break;
case F_SETOWN:
err = 0;
filp->f_owner.pid = arg;
filp->f_owner.uid = current->uid;
filp->f_owner.euid = current->euid;
......@@ -172,10 +243,9 @@ asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
break;
default:
/* sockets need a few special fcntls. */
err = -EINVAL;
if (S_ISSOCK (filp->f_dentry->d_inode->i_mode))
err = sock_fcntl (filp, cmd, arg);
else
err = -EINVAL;
break;
}
fput(filp);
......
/*
* linux/fs/open.c
*
* Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
*
* Manage the dynamic fd arrays in the process files_struct.
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/malloc.h>
#include <linux/vmalloc.h>
#include <asm/bitops.h>
/*
* Allocate an fd array, using get_free_page() if possible.
* Note: the array isn't cleared at allocation time.
*/
struct file ** alloc_fd_array(int num)
{
struct file **new_fds;
int size = num * sizeof(struct file *);
if (size < PAGE_SIZE)
new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
else if (size == PAGE_SIZE)
new_fds = (struct file **) __get_free_page(GFP_KERNEL);
else
new_fds = (struct file **) vmalloc(size);
return new_fds;
}
void free_fd_array(struct file **array, int num)
{
int size = num * sizeof(struct file *);
if (!array) {
printk (KERN_ERR __FUNCTION__ "array = 0 (num = %d)\n", num);
return;
}
if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
return;
else if (size < PAGE_SIZE)
kfree(array);
else if (size == PAGE_SIZE)
free_page((unsigned long) array);
else
vfree(array);
}
/*
* Expand the fd array in the files_struct. Called with the files
* spinlock held for write.
*/
int expand_fd_array(struct files_struct *files, int nr)
{
struct file **new_fds;
int error, nfds;
error = -EMFILE;
if (files->max_fds >= NR_OPEN || nr > NR_OPEN)
goto out;
nfds = files->max_fds;
write_unlock(&files->file_lock);
/*
* Expand to the max in easy steps, and keep expanding it until
* we have enough for the requested fd array size.
*/
do {
#if NR_OPEN_DEFAULT < 256
if (nfds < 256)
nfds = 256;
else
#endif
if (nfds < (PAGE_SIZE / sizeof(struct file *)))
nfds = PAGE_SIZE / sizeof(struct file *);
else {
nfds = nfds * 2;
if (nfds > NR_OPEN)
nfds = NR_OPEN;
}
} while (nfds < nr);
error = -ENOMEM;
new_fds = alloc_fd_array(nfds);
write_lock(&files->file_lock);
if (!new_fds)
goto out;
/* Copy the existing array and install the new pointer */
if (nfds > files->max_fds) {
struct file **old_fds;
int i;
old_fds = xchg(&files->fd, new_fds);
i = xchg(&files->max_fds, nfds);
/* Don't copy/clear the array if we are creating a new
fd array for fork() */
if (i) {
memcpy(new_fds, old_fds, i * sizeof(struct file *));
/* clear the remainder of the array */
memset(&new_fds[i], 0,
(nfds-i) * sizeof(struct file *));
write_unlock(&files->file_lock);
free_fd_array(old_fds, i);
write_lock(&files->file_lock);
}
} else {
/* Somebody expanded the array while we slept ... */
write_unlock(&files->file_lock);
free_fd_array(new_fds, nfds);
write_lock(&files->file_lock);
}
error = 0;
out:
return error;
}
/*
* Allocate an fdset array, using get_free_page() if possible.
* Note: the array isn't cleared at allocation time.
*/
fd_set * alloc_fdset(int num)
{
fd_set *new_fdset;
int size = num / 8;
if (size < PAGE_SIZE)
new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
else if (size == PAGE_SIZE)
new_fdset = (fd_set *) __get_free_page(GFP_KERNEL);
else
new_fdset = (fd_set *) vmalloc(size);
return new_fdset;
}
void free_fdset(fd_set *array, int num)
{
int size = num / 8;
if (!array) {
printk (KERN_ERR __FUNCTION__ "array = 0 (num = %d)\n", num);
return;
}
if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */
return;
else if (size < PAGE_SIZE)
kfree(array);
else if (size == PAGE_SIZE)
free_page((unsigned long) array);
else
vfree(array);
}
/*
* Expand the fdset in the files_struct. Called with the files spinlock
* held for write.
*/
int expand_fdset(struct files_struct *files, int nr)
{
fd_set *new_openset = 0, *new_execset = 0;
int error, nfds = 0;
error = -EMFILE;
if (files->max_fdset >= NR_OPEN || nr > NR_OPEN)
goto out;
nfds = files->max_fdset;
write_unlock(&files->file_lock);
/* Expand to the max in easy steps */
do {
if (nfds < (PAGE_SIZE * 8))
nfds = PAGE_SIZE * 8;
else {
nfds = nfds * 2;
if (nfds > NR_OPEN)
nfds = NR_OPEN;
}
} while (nfds < nr);
error = -ENOMEM;
new_openset = alloc_fdset(nfds);
new_execset = alloc_fdset(nfds);
write_lock(&files->file_lock);
if (!new_openset || !new_execset)
goto out;
error = 0;
/* Copy the existing tables and install the new pointers */
if (nfds > files->max_fdset) {
int i = files->max_fdset / (sizeof(unsigned long) * 8);
int count = (nfds - files->max_fdset) / 8;
/*
* Don't copy the entire array if the current fdset is
* not yet initialised.
*/
if (i) {
memcpy (new_openset, files->open_fds, files->max_fdset/8);
memcpy (new_execset, files->close_on_exec, files->max_fdset/8);
memset (&new_openset->fds_bits[i], 0, count);
memset (&new_execset->fds_bits[i], 0, count);
}
nfds = xchg(&files->max_fdset, nfds);
new_openset = xchg(&files->open_fds, new_openset);
new_execset = xchg(&files->close_on_exec, new_execset);
write_unlock(&files->file_lock);
free_fdset (new_openset, nfds);
free_fdset (new_execset, nfds);
write_lock(&files->file_lock);
return 0;
}
/* Somebody expanded the array while we slept ... */
out:
write_unlock(&files->file_lock);
if (new_openset)
free_fdset(new_openset, nfds);
if (new_execset)
free_fdset(new_execset, nfds);
write_lock(&files->file_lock);
return error;
}
/*
* iobuf.c
*
* Keep track of the general-purpose IO-buffer structures used to track
* abstract kernel-space io buffers.
*
*/
#include <linux/iobuf.h>
#include <linux/malloc.h>
#include <linux/slab.h>
static kmem_cache_t *kiobuf_cachep;
/*
* The default IO completion routine for kiobufs: just wake up
* the kiobuf, nothing more.
*/
void simple_wakeup_kiobuf(struct kiobuf *kiobuf)
{
wake_up(&kiobuf->wait_queue);
}
void __init kiobuf_init(void)
{
kiobuf_cachep = kmem_cache_create("kiobuf",
sizeof(struct kiobuf),
0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if(!kiobuf_cachep)
panic("Cannot create kernel iobuf cache\n");
}
int alloc_kiovec(int nr, struct kiobuf **bufp)
{
int i;
struct kiobuf *iobuf;
for (i = 0; i < nr; i++) {
iobuf = kmem_cache_alloc(kiobuf_cachep, SLAB_KERNEL);
if (!iobuf) {
free_kiovec(i, bufp);
return -ENOMEM;
}
memset(iobuf, 0, sizeof(*iobuf));
init_waitqueue_head(&iobuf->wait_queue);
iobuf->end_io = simple_wakeup_kiobuf;
iobuf->array_len = KIO_STATIC_PAGES;
iobuf->pagelist = iobuf->page_array;
iobuf->maplist = iobuf->map_array;
*bufp++ = iobuf;
}
return 0;
}
void free_kiovec(int nr, struct kiobuf **bufp)
{
int i;
struct kiobuf *iobuf;
for (i = 0; i < nr; i++) {
iobuf = bufp[i];
if (iobuf->array_len > KIO_STATIC_PAGES) {
kfree (iobuf->pagelist);
kfree (iobuf->maplist);
}
kmem_cache_free(kiobuf_cachep, bufp[i]);
}
}
int expand_kiobuf(struct kiobuf *iobuf, int wanted)
{
unsigned long * pagelist;
struct page ** maplist;
if (iobuf->array_len >= wanted)
return 0;
pagelist = (unsigned long *)
kmalloc(wanted * sizeof(unsigned long), GFP_KERNEL);
if (!pagelist)
return -ENOMEM;
maplist = (struct page **)
kmalloc(wanted * sizeof(struct page **), GFP_KERNEL);
if (!maplist) {
kfree(pagelist);
return -ENOMEM;
}
/* Did it grow while we waited? */
if (iobuf->array_len >= wanted) {
kfree(pagelist);
kfree(maplist);
return 0;
}
memcpy (pagelist, iobuf->pagelist, iobuf->array_len * sizeof(unsigned long));
memcpy (maplist, iobuf->maplist, iobuf->array_len * sizeof(struct page **));
if (iobuf->array_len > KIO_STATIC_PAGES) {
kfree (iobuf->pagelist);
kfree (iobuf->maplist);
}
iobuf->pagelist = pagelist;
iobuf->maplist = maplist;
iobuf->array_len = wanted;
return 0;
}
void kiobuf_wait_for_io(struct kiobuf *kiobuf)
{
struct task_struct *tsk = current;
DECLARE_WAITQUEUE(wait, tsk);
add_wait_queue(&kiobuf->wait_queue, &wait);
repeat:
tsk->state = TASK_UNINTERRUPTIBLE;
run_task_queue(&tq_disk);
if (atomic_read(&kiobuf->io_count) != 0) {
schedule();
goto repeat;
}
tsk->state = TASK_RUNNING;
remove_wait_queue(&kiobuf->wait_queue, &wait);
}
......@@ -61,11 +61,11 @@ asmlinkage int sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
lock_kernel();
switch (cmd) {
case FIOCLEX:
FD_SET(fd, &current->files->close_on_exec);
FD_SET(fd, current->files->close_on_exec);
break;
case FIONCLEX:
FD_CLR(fd, &current->files->close_on_exec);
FD_CLR(fd, current->files->close_on_exec);
break;
case FIONBIO:
......
......@@ -685,10 +685,14 @@ int get_unused_fd(void)
struct files_struct * files = current->files;
int fd, error;
error = -EMFILE;
write_lock(&files->file_lock);
fd = find_first_zero_bit(&files->open_fds, NR_OPEN);
repeat:
fd = find_next_zero_bit(files->open_fds,
current->files->max_fdset,
files->next_fd);
/*
* N.B. For clone tasks sharing a files structure, this test
* will limit the total number of files that can be opened.
......@@ -696,10 +700,31 @@ int get_unused_fd(void)
if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur)
goto out;
/* Check here for fd > files->max_fds to do dynamic expansion */
/* Do we need to expand the fdset array? */
if (fd >= current->files->max_fdset) {
error = expand_fdset(files, 0);
if (!error) {
error = -EMFILE;
goto repeat;
}
goto out;
}
/*
* Check whether we need to expand the fd array.
*/
if (fd >= files->max_fds) {
error = expand_fd_array(files, 0);
if (!error) {
error = -EMFILE;
goto repeat;
}
goto out;
}
FD_SET(fd, &files->open_fds);
FD_CLR(fd, &files->close_on_exec);
FD_SET(fd, files->open_fds);
FD_CLR(fd, files->close_on_exec);
files->next_fd = fd + 1;
#if 1
/* Sanity check */
if (files->fd[fd] != NULL) {
......@@ -717,7 +742,9 @@ int get_unused_fd(void)
inline void put_unused_fd(unsigned int fd)
{
write_lock(&current->files->file_lock);
FD_CLR(fd, &current->files->open_fds);
FD_CLR(fd, current->files->open_fds);
if (fd < current->files->next_fd)
current->files->next_fd = fd;
write_unlock(&current->files->file_lock);
}
......@@ -790,8 +817,12 @@ int filp_close(struct file *filp, fl_owner_t id)
* Careful here! We test whether the file pointer is NULL before
* releasing the fd. This ensures that one clone task can't release
* an fd while another clone is opening it.
*
* The "release" argument tells us whether or not to mark the fd as free
* or not in the open-files bitmap. dup2 uses this to retain the fd
* without races.
*/
asmlinkage int sys_close(unsigned int fd)
int do_close(unsigned int fd, int release)
{
int error;
struct file * filp;
......@@ -802,9 +833,10 @@ asmlinkage int sys_close(unsigned int fd)
filp = frip(fd);
if (!filp)
goto out_unlock;
FD_CLR(fd, &files->close_on_exec);
FD_CLR(fd, files->close_on_exec);
write_unlock(&files->file_lock);
put_unused_fd(fd);
if (release)
put_unused_fd(fd);
lock_kernel();
error = filp_close(filp, files);
unlock_kernel();
......@@ -815,6 +847,11 @@ asmlinkage int sys_close(unsigned int fd)
goto out;
}
asmlinkage int sys_close(unsigned int fd)
{
return do_close(fd, 1);
}
/*
* This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time.
......
......@@ -725,11 +725,13 @@ static inline char * task_state(struct task_struct *p, char *buffer)
"PPid:\t%d\n"
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n"
"FDSize:\t%d\n"
"Groups:\t",
get_task_state(p),
p->pid, p->p_pptr->pid,
p->uid, p->euid, p->suid, p->fsuid,
p->gid, p->egid, p->sgid, p->fsgid);
p->gid, p->egid, p->sgid, p->fsgid,
p->files ? p->files->max_fds : 0);
for (g = 0; g < p->ngroups; g++)
buffer += sprintf(buffer, "%d ", p->groups[g]);
......
......@@ -106,7 +106,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
/* handle last in-complete long-word first */
set = ~(~0UL << (n & (__NFDBITS-1)));
n /= __NFDBITS;
open_fds = current->files->open_fds.fds_bits+n;
open_fds = current->files->open_fds->fds_bits+n;
max = 0;
if (set) {
set &= BITS(fds, n);
......@@ -268,8 +268,8 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
if (n < 0)
goto out_nofds;
if (n > KFDS_NR)
n = KFDS_NR;
if (n > current->files->max_fdset + 1)
n = current->files->max_fdset + 1;
/*
* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
......@@ -277,7 +277,7 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
* long-words.
*/
ret = -ENOMEM;
size = FDS_BYTES(n);
size = (n + 8 * sizeof(long) - 1) / (8 * sizeof(long)) * sizeof(long);
bits = kmalloc(6 * size, GFP_KERNEL);
if (!bits)
goto out_nofds;
......@@ -380,7 +380,7 @@ asmlinkage int sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
lock_kernel();
/* Do a sanity check on nfds ... */
err = -EINVAL;
if (nfds > NR_OPEN)
if (nfds > current->files->max_fds)
goto out;
if (timeout) {
......
......@@ -28,7 +28,7 @@
{_STK_LIM, _STK_LIM}, /* RLIMIT_STACK */ \
{ 0, LONG_MAX}, /* RLIMIT_CORE */ \
{LONG_MAX, LONG_MAX}, /* RLIMIT_RSS */ \
{ NR_OPEN, NR_OPEN}, /* RLIMIT_NOFILE */ \
{INR_OPEN, INR_OPEN}, /* RLIMIT_NOFILE */ \
{LONG_MAX, LONG_MAX}, /* RLIMIT_AS */ \
{MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, /* RLIMIT_NPROC */ \
{LONG_MAX, LONG_MAX}, /* RLIMIT_MEMLOCK */ \
......
......@@ -29,7 +29,7 @@
{ 0, LONG_MAX }, \
{ LONG_MAX, LONG_MAX }, \
{ MAX_TASKS_PER_USER, MAX_TASKS_PER_USER }, \
{ NR_OPEN, NR_OPEN }, \
{ INR_OPEN, INR_OPEN }, \
{ LONG_MAX, LONG_MAX }, \
{ LONG_MAX, LONG_MAX }, \
}
......
......@@ -29,7 +29,7 @@
{ 0, LONG_MAX }, \
{ LONG_MAX, LONG_MAX }, \
{ 0, 0 }, \
{ NR_OPEN, NR_OPEN }, \
{ INR_OPEN, INR_OPEN }, \
{ LONG_MAX, LONG_MAX }, \
{ LONG_MAX, LONG_MAX }, \
}
......
......@@ -29,7 +29,7 @@
{ 0, LONG_MAX}, \
{LONG_MAX, LONG_MAX}, \
{MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \
{NR_OPEN, NR_OPEN}, \
{INR_OPEN, INR_OPEN}, \
{LONG_MAX, LONG_MAX}, \
{LONG_MAX, LONG_MAX} \
}
......
......@@ -34,7 +34,7 @@
{ LONG_MAX, LONG_MAX }, \
{ _STK_LIM, LONG_MAX }, \
{ 0, LONG_MAX }, \
{ NR_OPEN, NR_OPEN }, \
{ INR_OPEN, INR_OPEN }, \
{ LONG_MAX, LONG_MAX }, \
{ LONG_MAX, LONG_MAX }, \
{ MAX_TASKS_PER_USER, MAX_TASKS_PER_USER }, \
......
......@@ -25,7 +25,7 @@
{ 0, LONG_MAX}, /* RLIMIT_CORE */ \
{LONG_MAX, LONG_MAX}, /* RLIMIT_RSS */ \
{MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, /* RLIMIT_NPROC */ \
{ NR_OPEN, NR_OPEN}, /* RLIMIT_NOFILE */ \
{INR_OPEN, INR_OPEN}, /* RLIMIT_NOFILE */ \
{LONG_MAX, LONG_MAX}, /* RLIMIT_MEMLOCK */ \
{LONG_MAX, LONG_MAX}, /* RLIMIT_AS */ \
}
......
......@@ -31,7 +31,7 @@
{LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX}, \
{LONG_MAX, LONG_MAX}, {_STK_LIM, LONG_MAX}, \
{ 0, LONG_MAX}, {LONG_MAX, LONG_MAX}, \
{NR_OPEN, NR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \
{INR_OPEN, INR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \
{LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX} \
}
......
......@@ -30,7 +30,7 @@
{LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX}, \
{LONG_MAX, LONG_MAX}, {_STK_LIM, LONG_MAX}, \
{ 0, LONG_MAX}, {LONG_MAX, LONG_MAX}, \
{NR_OPEN, NR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \
{INR_OPEN, INR_OPEN}, {MAX_TASKS_PER_USER, MAX_TASKS_PER_USER}, \
{LONG_MAX, LONG_MAX}, {LONG_MAX, LONG_MAX} \
}
......
......@@ -55,18 +55,6 @@ extern inline struct file * fget(unsigned int fd)
return file;
}
/*
* Install a file pointer in the fd array.
*/
extern inline void fd_install(unsigned int fd, struct file * file)
{
struct files_struct *files = current->files;
write_lock(&files->file_lock);
files->fd[fd] = file;
write_unlock(&files->file_lock);
}
/*
* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
*
......@@ -90,4 +78,26 @@ extern inline void fput(struct file * file)
}
extern void put_filp(struct file *);
/*
* Install a file pointer in the fd array.
*
* The VFS is full of places where we drop the files lock between
* setting the open_fds bitmap and installing the file in the file
* array. At any such point, we are vulnerable to a dup2() race
* installing a file in the array before us. We need to detect this and
* fput() the struct file we are about to overwrite in this case.
*/
extern inline void fd_install(unsigned int fd, struct file * file)
{
struct files_struct *files = current->files;
struct file * result;
write_lock(&files->file_lock);
result = xchg(&files->fd[fd], file);
write_unlock(&files->file_lock);
if (result)
fput(result);
}
#endif /* __LINUX_FILE_H */
......@@ -27,17 +27,19 @@ struct poll_table_struct;
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but I'll fix
* that later. Anyway, now the file code is no longer dependent
* on bitmaps in unsigned longs, but uses the new fd_set structure..
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
* the file limit at runtime and only root can increase the per-process
* nr_file rlimit, so it's safe to set up a ridiculously high absolute
* upper limit on files-per-process.
*
* Some programs (notably those using select()) may have to be
* recompiled to take full advantage of the new limits..
*/
/* Fixed constants first: */
#undef NR_OPEN
#define NR_OPEN 1024
#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */
#define INR_OPEN 1024 /* Initial setting for nfile rlimits */
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
......@@ -62,6 +64,7 @@ extern int max_super_blocks, nr_super_blocks;
#define WRITE 1
#define READA 2 /* read-ahead - don't block if no resources */
#define WRITEA 3 /* write-ahead - don't block if no resources */
#define WRITERAW 5 /* raw write - don't play with buffer lists */
#ifndef NULL
#define NULL ((void *) 0)
......@@ -228,6 +231,7 @@ struct buffer_head {
unsigned long b_rsector; /* Real buffer location on disk */
wait_queue_head_t b_wait;
struct kiobuf * b_kiobuf; /* kiobuf which owns this IO */
};
typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
......@@ -704,6 +708,7 @@ extern inline int locks_verify_area(int read_write, struct inode *inode,
asmlinkage int sys_open(const char *, int, int);
asmlinkage int sys_close(unsigned int); /* yes, it's really unsigned */
extern int do_close(unsigned int, int); /* yes, it's really unsigned */
extern int do_truncate(struct dentry *, unsigned long);
extern int get_unused_fd(void);
extern void put_unused_fd(unsigned int);
......
/*
* iobuf.h
*
* Defines the structures used to track abstract kernel-space io buffers.
*
*/
#ifndef __LINUX_IOBUF_H
#define __LINUX_IOBUF_H
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <asm/atomic.h>
/*
* The kiobuf structure describes a physical set of pages reserved
* and locked for IO. The reference counts on each page will have been
* incremented, and the flags field will indicate whether or not we have
* pre-locked all of the pages for IO.
*
* kiobufs may be passed in arrays to form a kiovec, but we must
* preserve the property that no page is present more than once over the
* entire iovec.
*/
#define KIO_MAX_ATOMIC_IO 64 /* in kb */
#define KIO_MAX_ATOMIC_BYTES (64 * 1024)
#define KIO_STATIC_PAGES (KIO_MAX_ATOMIC_IO / (PAGE_SIZE >> 10) + 1)
#define KIO_MAX_SECTORS (KIO_MAX_ATOMIC_IO * 2)
struct kiobuf
{
int nr_pages; /* Pages actually referenced */
int array_len; /* Space in the allocated lists */
int offset; /* Offset to start of valid data */
int length; /* Number of valid bytes of data */
/* Keep separate track of the physical addresses and page
* structs involved. If we do IO to a memory-mapped device
* region, there won't necessarily be page structs defined for
* every address. */
unsigned long * pagelist;
struct page ** maplist;
unsigned int locked : 1; /* If set, the pages have been locked */
/* Always embed enough struct pages for 64k of IO */
unsigned long page_array[KIO_STATIC_PAGES];
struct page * map_array[KIO_STATIC_PAGES];
/* Dynamic state for IO completion: */
atomic_t io_count; /* IOs still in progress */
int errno; /* Status of completed IO */
void (*end_io) (struct kiobuf *); /* Completion callback */
wait_queue_head_t wait_queue;
};
/* mm/memory.c */
int map_user_kiobuf(int rw, struct kiobuf *, unsigned long va, size_t len);
void unmap_kiobuf(struct kiobuf *iobuf);
/* fs/iobuf.c */
void __init kiobuf_init(void);
void simple_wakeup_kiobuf(struct kiobuf *);
int alloc_kiovec(int nr, struct kiobuf **);
void free_kiovec(int nr, struct kiobuf **);
int expand_kiobuf(struct kiobuf *, int);
void kiobuf_wait_for_io(struct kiobuf *);
/* fs/buffer.c */
int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
kdev_t dev, unsigned long b[], int size, int bmap);
#endif /* __LINUX_IOBUF_H */
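To make the intended calling sequence concrete, here is a minimal kernel-side sketch of the kiobuf lifecycle, mirroring how rw_raw_dev() in drivers/char/raw.c above drives this API; the function name and the caller-supplied dev, block list, and user buffer are assumptions for illustration only.
/* Hedged sketch (hypothetical helper): allocate one kiobuf, pin the
   user pages, do the block IO, then tear everything down again. */
#include <linux/fs.h>
#include <linux/iobuf.h>
static int demo_kiobuf_read(kdev_t dev, unsigned long blocks[],
			    char *ubuf, size_t len)
{
	struct kiobuf *iobuf;
	int err;
	err = alloc_kiovec(1, &iobuf);		/* get one kiobuf */
	if (err)
		return err;
	/* pin + lock the user pages; rw==READ means the device will
	   write into user memory, so the pages are faulted writable */
	err = map_user_kiobuf(READ, iobuf, (unsigned long) ubuf, len);
	if (err)
		goto out_free;
	/* 512-byte blocks here; caller must supply enough block numbers */
	err = brw_kiovec(READ, 1, &iobuf, dev, blocks, 512, 0);
	unmap_kiobuf(iobuf);			/* release + unlock the pages */
out_free:
	free_kiovec(1, &iobuf);
	return err;				/* >= 0: bytes transferred */
}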
#ifndef _LINUX_LIMITS_H
#define _LINUX_LIMITS_H
#define NR_OPEN 1024
#define NGROUPS_MAX 32 /* supplemental group IDs are available */
#define ARG_MAX 131072 /* # bytes of args + environ for exec() */
......
......@@ -115,6 +115,8 @@
#define AURORA_MAJOR 79
#define RAW_MAJOR 162
#define UNIX98_PTY_MASTER_MAJOR 128
#define UNIX98_PTY_MAJOR_COUNT 8
#define UNIX98_PTY_SLAVE_MAJOR (UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT)
......
#ifndef __LINUX_RAW_H
#define __LINUX_RAW_H
#include <linux/types.h>
#define RAW_SETBIND _IO( 0xac, 0 )
#define RAW_GETBIND _IO( 0xac, 1 )
struct raw_config_request
{
int raw_minor;
__u64 block_major;
__u64 block_minor;
};
#ifdef __KERNEL__
/* drivers/char/raw.c */
extern void raw_init(void);
#endif /* __KERNEL__ */
#endif /* __LINUX_RAW_H */
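A user-space sketch of the control protocol defined by this header: bind raw minor 1 to a block device by issuing RAW_SETBIND on the minor-0 control node. The /dev/rawctl and /dev/sda1 node names are assumptions; only the ioctl numbers and struct raw_config_request come from linux/raw.h.
/* Hedged sketch: bind raw minor 1 to the block device backing /dev/sda1.
   Node names are illustrative, not created by this patch. */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <linux/raw.h>
int main(void)
{
	struct raw_config_request rq;
	struct stat st;
	int ctl = open("/dev/rawctl", O_RDWR);	/* raw minor 0: control device */
	if (ctl < 0 || stat("/dev/sda1", &st) < 0)
		return 1;
	rq.raw_minor   = 1;			/* bind raw minor 1 ... */
	rq.block_major = major(st.st_rdev);	/* ... to this block device */
	rq.block_minor = minor(st.st_rdev);
	if (ioctl(ctl, RAW_SETBIND, &rq) < 0) {	/* kernel checks major/minor */
		perror("RAW_SETBIND");
		return 1;
	}
	return 0;
}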
......@@ -127,6 +127,12 @@ extern void trap_init(void);
extern signed long FASTCALL(schedule_timeout(signed long timeout));
asmlinkage void schedule(void);
/*
* The default fd array needs to be at least BITS_PER_LONG,
* as this is the granularity returned by copy_fdset().
*/
#define NR_OPEN_DEFAULT BITS_PER_LONG
/*
* Open file table structure
*/
......@@ -134,18 +140,28 @@ struct files_struct {
atomic_t count;
rwlock_t file_lock;
int max_fds;
int max_fdset;
int next_fd;
struct file ** fd; /* current fd array */
fd_set close_on_exec;
fd_set open_fds;
fd_set *close_on_exec;
fd_set *open_fds;
fd_set close_on_exec_init;
fd_set open_fds_init;
struct file * fd_array[NR_OPEN_DEFAULT];
};
#define INIT_FILES { \
ATOMIC_INIT(1), \
RW_LOCK_UNLOCKED, \
NR_OPEN, \
&init_fd_array[0], \
NR_OPEN_DEFAULT, \
__FD_SETSIZE, \
0, \
&init_files.fd_array[0], \
&init_files.close_on_exec_init, \
&init_files.open_fds_init, \
{ { 0, } }, \
{ { 0, } } \
{ { 0, } }, \
{ NULL, } \
}
struct fs_struct {
......@@ -633,6 +649,48 @@ extern void mmput(struct mm_struct *);
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(void);
/*
* Routines for handling the fd arrays
*/
extern struct file ** alloc_fd_array(int);
extern int expand_fd_array(struct files_struct *, int nr);
extern void free_fd_array(struct file **, int);
extern fd_set *alloc_fdset(int);
extern int expand_fdset(struct files_struct *, int nr);
extern void free_fdset(fd_set *, int);
/* Expand files. Return <0 on error; 0 nothing done; 1 files expanded,
* we may have blocked.
*
* Should be called with the files->file_lock spinlock held for write.
*/
static inline int expand_files(struct files_struct *files, int nr)
{
int err, expand = 0;
#ifdef FDSET_DEBUG
printk (KERN_ERR __FUNCTION__ " %d: nr = %d\n", current->pid, nr);
#endif
if (nr >= files->max_fdset) {
expand = 1;
if ((err = expand_fdset(files, nr)))
goto out;
}
if (nr >= files->max_fds) {
expand = 1;
if ((err = expand_fd_array(files, nr)))
goto out;
}
err = expand;
out:
#ifdef FDSET_DEBUG
if (err)
printk (KERN_ERR __FUNCTION__ " %d: return %d\n", current->pid, err);
#endif
return err;
}
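For illustration, a hedged sketch of the caller contract stated above, the same pattern locate_fd() in fs/fcntl.c uses: call with the file_lock write-held, and retry after a return of 1 because the expansion may have dropped the lock and blocked. The function name is hypothetical.
/* Hedged sketch (hypothetical helper): reserve fd in the bitmap once
   expand_files() guarantees both arrays are big enough for it. */
static inline int reserve_fd_example(struct files_struct *files, int fd)
{
	int err;
repeat:
	err = expand_files(files, fd);
	if (err < 0)
		return err;		/* -EMFILE or -ENOMEM */
	if (err)
		goto repeat;		/* expanded: we may have blocked, so
					   re-run the checks from the top */
	FD_SET(fd, files->open_fds);	/* fd now fits both arrays */
	return fd;
}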
extern int copy_thread(int, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
extern void flush_thread(void);
extern void exit_thread(void);
......
......@@ -23,6 +23,7 @@
#include <linux/smp_lock.h>
#include <linux/blk.h>
#include <linux/hdreg.h>
#include <linux/iobuf.h>
#include <asm/io.h>
#include <asm/bugs.h>
......@@ -1193,6 +1194,7 @@ asmlinkage void __init start_kernel(void)
vma_init();
buffer_init(memory_end-memory_start);
page_cache_init(memory_end-memory_start);
kiobuf_init();
signals_init();
inode_init();
file_table_init();
......
......@@ -149,11 +149,11 @@ static inline void close_files(struct files_struct * files)
j = 0;
for (;;) {
unsigned long set = files->open_fds.fds_bits[j];
unsigned long set;
i = j * __NFDBITS;
j++;
if (i >= files->max_fds)
if (i >= files->max_fdset || i >= files->max_fds)
break;
set = files->open_fds->fds_bits[j++];
while (set) {
if (set & 1) {
struct file * file = xchg(&files->fd[i], NULL);
......@@ -176,12 +176,14 @@ static inline void __exit_files(struct task_struct *tsk)
if (atomic_dec_and_test(&files->count)) {
close_files(files);
/*
* Free the fd array as appropriate ...
* Free the fd and fdset arrays if we expanded them.
*/
if (NR_OPEN * sizeof(struct file *) == PAGE_SIZE)
free_page((unsigned long) files->fd);
else
kfree(files->fd);
if (files->fd != &files->fd_array[0])
free_fd_array(files->fd, files->max_fds);
if (files->max_fdset > __FD_SETSIZE) {
free_fdset(files->open_fds, files->max_fdset);
free_fdset(files->close_on_exec, files->max_fdset);
}
kmem_cache_free(files_cachep, files);
}
}
......
......@@ -433,32 +433,24 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
return 0;
}
/*
* Copy a fd_set and compute the maximum fd it contains.
*/
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
static int count_open_files(struct files_struct *files, int size)
{
int i;
unsigned long *p = src;
unsigned long *max = src;
for (i = __FDSET_LONGS; i; --i) {
if ((*d++ = *p++) != 0)
max = p;
int i;
/* Find the last open fd */
for (i = size/(8*sizeof(long)); i > 0; ) {
if (files->open_fds->fds_bits[--i])
break;
}
return (max - src)*sizeof(long)*8;
}
static inline int copy_fdset(fd_set *dst, fd_set *src)
{
return __copy_fdset(dst->fds_bits, src->fds_bits);
i = (i+1) * 8 * sizeof(long);
return i;
}
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
struct file **old_fds, **new_fds;
int size, i, error = 0;
int open_files, nfds, size, i, error = 0;
/*
* A background process may not have any files ...
......@@ -478,43 +470,85 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
if (!newf)
goto out;
/*
* Allocate the fd array, using get_free_page() if possible.
* Eventually we want to make the array size variable ...
*/
size = NR_OPEN * sizeof(struct file *);
if (size == PAGE_SIZE)
new_fds = (struct file **) __get_free_page(GFP_KERNEL);
else
new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
if (!new_fds)
goto out_release;
newf->file_lock = RW_LOCK_UNLOCKED;
atomic_set(&newf->count, 1);
newf->max_fds = NR_OPEN;
newf->fd = new_fds;
newf->file_lock = RW_LOCK_UNLOCKED;
newf->next_fd = 0;
newf->max_fds = NR_OPEN_DEFAULT;
newf->max_fdset = __FD_SETSIZE;
newf->close_on_exec = &newf->close_on_exec_init;
newf->open_fds = &newf->open_fds_init;
newf->fd = &newf->fd_array[0];
/* We don't yet have the oldf readlock, but even if the old
fdset gets grown now, we'll only copy up to "size" fds */
size = oldf->max_fdset;
if (size > __FD_SETSIZE) {
newf->max_fdset = 0;
write_lock(&newf->file_lock);
error = expand_fdset(newf, size);
write_unlock(&newf->file_lock);
if (error)
goto out_release;
}
read_lock(&oldf->file_lock);
newf->close_on_exec = oldf->close_on_exec;
i = copy_fdset(&newf->open_fds, &oldf->open_fds);
open_files = count_open_files(oldf, size);
/*
* Check whether we need to allocate a larger fd array.
* Note: we're not a clone task, so the open count won't
* change.
*/
nfds = NR_OPEN_DEFAULT;
if (open_files > nfds) {
read_unlock(&oldf->file_lock);
newf->max_fds = 0;
write_lock(&newf->file_lock);
error = expand_fd_array(newf, open_files);
write_unlock(&newf->file_lock);
if (error)
goto out_release;
nfds = newf->max_fds;
read_lock(&oldf->file_lock);
}
old_fds = oldf->fd;
for (; i != 0; i--) {
new_fds = newf->fd;
memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f)
get_file(f);
*new_fds++ = f;
}
read_unlock(&oldf->file_lock);
/* compute the remainder to be cleared */
size = (newf->max_fds - open_files) * sizeof(struct file *);
/* This is long word aligned thus could use a optimized version */
memset(new_fds, 0, (char *)newf->fd + size - (char *)new_fds);
memset(new_fds, 0, size);
if (newf->max_fdset > open_files) {
int left = (newf->max_fdset-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
memset(&newf->open_fds->fds_bits[start], 0, left);
memset(&newf->close_on_exec->fds_bits[start], 0, left);
}
tsk->files = newf;
error = 0;
out:
return error;
out_release:
free_fdset (newf->close_on_exec, newf->max_fdset);
free_fdset (newf->open_fds, newf->max_fdset);
kmem_cache_free(files_cachep, newf);
goto out;
}
......
......@@ -39,6 +39,7 @@
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/swapctl.h>
#include <linux/iobuf.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
......@@ -406,6 +407,181 @@ void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long s
}
}
/*
* Do a quick page-table lookup for a single page.
*/
static unsigned long follow_page(unsigned long address)
{
pgd_t *pgd;
pmd_t *pmd;
pgd = pgd_offset(current->mm, address);
pmd = pmd_offset(pgd, address);
if (pmd) {
pte_t * pte = pte_offset(pmd, address);
if (pte && pte_present(*pte)) {
return pte_page(*pte);
}
}
printk(KERN_ERR "Missing page in follow_page\n");
return 0;
}
/*
* Given a physical address, is there a useful struct page pointing to it?
*/
static struct page * get_page_map(unsigned long page)
{
struct page *map;
if (MAP_NR(page) >= max_mapnr)
return 0;
if (page == ZERO_PAGE(page))
return 0;
map = mem_map + MAP_NR(page);
if (PageReserved(map))
return 0;
return map;
}
/*
* Force in an entire range of pages from the current process's user VA,
* and pin and lock the pages for IO.
*/
#define dprintk(x...)
int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
{
unsigned long ptr, end;
int err;
struct mm_struct * mm;
struct vm_area_struct * vma = 0;
unsigned long page;
struct page * map;
int doublepage = 0;
int repeat = 0;
int i;
/* Make sure the iobuf is not already mapped somewhere. */
if (iobuf->nr_pages)
return -EINVAL;
mm = current->mm;
dprintk ("map_user_kiobuf: begin\n");
ptr = va & PAGE_MASK;
end = (va + len + PAGE_SIZE - 1) & PAGE_MASK;
err = expand_kiobuf(iobuf, (end - ptr) >> PAGE_SHIFT);
if (err)
return err;
repeat:
down(&mm->mmap_sem);
err = -EFAULT;
iobuf->locked = 1;
iobuf->offset = va & ~PAGE_MASK;
iobuf->length = len;
i = 0;
/*
* First of all, try to fault in all of the necessary pages
*/
while (ptr < end) {
if (!vma || ptr >= vma->vm_end) {
vma = find_vma(current->mm, ptr);
if (!vma)
goto out_unlock;
}
if (!handle_mm_fault(current, vma, ptr, (rw==READ)))
goto out_unlock;
page = follow_page(ptr);
if (!page) {
printk (KERN_ERR "Missing page in map_user_kiobuf\n");
goto out_unlock;
}
map = get_page_map(page);
if (map) {
if (TryLockPage(map))
goto retry;
atomic_inc(&map->count);
}
dprintk ("Installing page %p %p: %d\n", (void *)page, map, i);
iobuf->pagelist[i] = page;
iobuf->maplist[i] = map;
iobuf->nr_pages = ++i;
ptr += PAGE_SIZE;
}
up(&mm->mmap_sem);
dprintk ("map_user_kiobuf: end OK\n");
return 0;
out_unlock:
up(&mm->mmap_sem);
unmap_kiobuf(iobuf);
dprintk ("map_user_kiobuf: end %d\n", err);
return err;
retry:
/*
* Undo the locking so far, wait on the page we got to, and try again.
*/
unmap_kiobuf(iobuf);
up(&mm->mmap_sem);
/*
* Did the release also unlock the page we got stuck on?
*/
if (!PageLocked(map)) {
/* If so, we may well have the page mapped twice in the
* IO address range. Bad news. Of course, it _might_
* just be a coincidence, but if it happens more than
* once, chances are we have a double-mapped page. */
if (++doublepage >= 3) {
return -EINVAL;
}
}
/*
* Try again...
*/
wait_on_page(map);
if (++repeat < 16)
goto repeat;
return -EAGAIN;
}
/*
* Unmap all of the pages referenced by a kiobuf. We release the pages,
* and unlock them if they were locked.
*/
void unmap_kiobuf (struct kiobuf *iobuf)
{
int i;
struct page *map;
for (i = 0; i < iobuf->nr_pages; i++) {
map = iobuf->maplist[i];
if (map && iobuf->locked) {
__free_page(map);
UnlockPage(map);
}
}
iobuf->nr_pages = 0;
iobuf->locked = 0;
}
static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
unsigned long size, pgprot_t prot)
{
......