Commit 2ce067b0 authored by Linus Torvalds's avatar Linus Torvalds

Merge bk://ldm.bkbits.net/linux-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 56d8b39d 4ab1a3e6
...@@ -121,7 +121,7 @@ static int vidport; ...@@ -121,7 +121,7 @@ static int vidport;
static int lines, cols; static int lines, cols;
#ifdef CONFIG_MULTIQUAD #ifdef CONFIG_MULTIQUAD
static void * const xquad_portio = NULL; static void * xquad_portio = NULL;
#endif #endif
#include "../../../../lib/inflate.c" #include "../../../../lib/inflate.c"
......
...@@ -1060,11 +1060,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus) ...@@ -1060,11 +1060,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
if (clustered_apic_mode && (numnodes > 1)) { if (clustered_apic_mode && (numnodes > 1)) {
printk("Remapping cross-quad port I/O for %d quads\n", printk("Remapping cross-quad port I/O for %d quads\n",
numnodes); numnodes);
xquad_portio = ioremap (XQUAD_PORTIO_BASE,
numnodes * XQUAD_PORTIO_QUAD);
printk("xquad_portio vaddr 0x%08lx, len %08lx\n", printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
(u_long) xquad_portio, (u_long) xquad_portio,
(u_long) numnodes * XQUAD_PORTIO_LEN); (u_long) numnodes * XQUAD_PORTIO_QUAD);
xquad_portio = ioremap (XQUAD_PORTIO_BASE,
numnodes * XQUAD_PORTIO_LEN);
} }
/* /*
......
...@@ -272,10 +272,9 @@ get_addr(unsigned long addr, unsigned long len) ...@@ -272,10 +272,9 @@ get_addr(unsigned long addr, unsigned long len)
return -ENOMEM; return -ENOMEM;
if (!vma || ((addr + len) < vma->vm_start)) if (!vma || ((addr + len) < vma->vm_start))
goto found_addr; goto found_addr;
addr = vma->vm_end; addr = HPAGE_ALIGN(vma->vm_end);
} }
found_addr: found_addr:
addr = HPAGE_ALIGN(addr);
return addr; return addr;
} }
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
# #
export-objs := elevator.o ll_rw_blk.o loop.o genhd.o acsi.o \ export-objs := elevator.o ll_rw_blk.o loop.o genhd.o acsi.o \
block_ioctl.o block_ioctl.o deadline-iosched.o
obj-y := elevator.o ll_rw_blk.o blkpg.o genhd.o block_ioctl.o obj-y := elevator.o ll_rw_blk.o blkpg.o genhd.o block_ioctl.o deadline-iosched.o
obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o
obj-$(CONFIG_BLK_DEV_FD) += floppy.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o
......
/*
* linux/drivers/block/deadline-iosched.c
*
* Deadline i/o scheduler.
*
* Copyright (C) 2002 Jens Axboe <axboe@suse.de>
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/blk.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
/*
* feel free to try other values :-). read_expire value is the timeout for
* reads, our goal is to start a request "around" the time when it expires.
* fifo_batch is how many steps along the sorted list we will take when the
* front fifo request expires.
*/
/* default tunables; copied into each queue's deadline_data at init time */
static int read_expire = HZ / 2;	/* 500ms start timeout */
static int fifo_batch = 64;		/* 4 seeks, or 64 contig */
static int seek_cost = 16;		/* seek is 16 times more expensive */
/*
 * how many times reads are allowed to starve writes
 */
static int writes_starved = 2;
/* back-merge hash: requests are hashed on their end sector */
static const int deadline_hash_shift = 8;
#define DL_HASH_BLOCK(sec)	((sec) >> 3)
#define DL_HASH_FN(sec)		(hash_long(DL_HASH_BLOCK((sec)), deadline_hash_shift))
#define DL_HASH_ENTRIES		(1 << deadline_hash_shift)
/*
 * bump the queue-wide hash generation counter, skipping 0 (a zero stamp
 * marks a request as off-hash); every already-hashed request becomes
 * stale and is lazily pruned on the next bucket walk
 */
#define DL_INVALIDATE_HASH(dd)				\
	do {						\
		if (!++(dd)->hash_valid_count)		\
			(dd)->hash_valid_count = 1;	\
	} while (0)
/*
 * per-queue scheduler state, hung off q->elevator.elevator_data
 */
struct deadline_data {
	/*
	 * run time data
	 */
	struct list_head sort_list[2];	/* sector-sorted lists, one per data direction */
	struct list_head read_fifo;	/* reads in submission (and thus expiry) order */
	struct list_head *dispatch;	/* driver dispatch queue */
	struct list_head *hash;		/* request hash table, for back merges */
	sector_t last_sector;		/* last sector sent to drive */
	unsigned long hash_valid_count;	/* hash generation; bumped to flush hash */
	unsigned int starved;		/* how often writes have been starved */
	/*
	 * settings that change how the i/o scheduler behaves
	 */
	unsigned int fifo_batch;	/* cost budget for one dispatch batch */
	unsigned long read_expire;	/* read deadline, in jiffies */
	unsigned int seek_cost;		/* batch cost of a non-contiguous request */
	unsigned int writes_starved;	/* read batches allowed before writes run */
};
/*
 * per-request data, allocated from drq_pool and reachable through
 * rq->elevator_private (see RQ_DATA below).
 */
struct deadline_rq {
	struct list_head fifo;		/* entry on dd->read_fifo (reads only) */
	struct list_head hash;		/* entry in a dd->hash[] bucket */
	unsigned long hash_valid_count;	/* generation stamp; 0 = not hashed */
	struct request *request;	/* the request this shadows */
	unsigned long expires;		/* jiffies deadline (reads only) */
};
/* slab cache for struct deadline_rq, created at module init */
static kmem_cache_t *drq_pool;
/* fetch the deadline_rq shadowing a request */
#define RQ_DATA(rq)	((struct deadline_rq *) (rq)->elevator_private)
/*
 * rq hash
 */
static inline void __deadline_del_rq_hash(struct deadline_rq *drq)
{
	/* clear the generation stamp so ON_HASH() sees the drq as unhashed */
	drq->hash_valid_count = 0;
	list_del_init(&drq->hash);
}
/* nonzero generation stamp <=> currently on the merge hash */
#define ON_HASH(drq)	(drq)->hash_valid_count
static inline void deadline_del_rq_hash(struct deadline_rq *drq)
{
	/* safe to call on an unhashed drq: only unlink when actually hashed */
	if (ON_HASH(drq))
		__deadline_del_rq_hash(drq);
}
/*
 * hash drq on the end sector of its request and stamp it with the
 * current hash generation; caller must ensure it is not already hashed
 */
static inline void
deadline_add_rq_hash(struct deadline_data *dd, struct deadline_rq *drq)
{
	struct request *rq = drq->request;
	BUG_ON(ON_HASH(drq));
	drq->hash_valid_count = dd->hash_valid_count;
	list_add(&drq->hash, &dd->hash[DL_HASH_FN(rq->sector +rq->nr_sectors)]);
}
#define list_entry_hash(ptr)	list_entry((ptr), struct deadline_rq, hash)
/*
 * look up a request whose last sector is exactly @offset, so that a bio
 * starting at @offset can be back merged into it. entries that went
 * stale (old generation) or are no longer mergeable are pruned lazily
 * while walking the bucket. returns NULL when no candidate exists.
 */
static struct request *
deadline_find_hash(struct deadline_data *dd, sector_t offset)
{
	struct list_head *bucket = &dd->hash[DL_HASH_FN(offset)];
	struct list_head *pos = bucket->next;

	while (pos != bucket) {
		struct deadline_rq *cand = list_entry_hash(pos);
		struct request *crq = cand->request;

		/* grab the successor first, pruning may unlink @pos */
		pos = pos->next;

		BUG_ON(!cand->hash_valid_count);

		if (cand->hash_valid_count != dd->hash_valid_count
		    || !rq_mergeable(crq)) {
			__deadline_del_rq_hash(cand);
			continue;
		}

		if (crq->sector + crq->nr_sectors == offset)
			return crq;
	}

	return NULL;
}
/*
 * decide whether and where @bio can be merged with a pending request.
 * tries, in order: the cached last merge point, a back merge via the
 * hash (bio starts where a request ends), then a back-to-front scan of
 * the sorted list for a front merge, remembering a suitable insertion
 * point (*req) for the no-merge case along the way.
 * returns ELEVATOR_{NO,BACK,FRONT}_MERGE.
 */
static int
deadline_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct deadline_data *dd = q->elevator.elevator_data;
	const int data_dir = bio_data_dir(bio);
	struct list_head *entry, *sort_list;
	struct request *__rq;
	int ret;

	/*
	 * try last_merge to avoid going to hash
	 */
	ret = elv_try_last_merge(q, req, bio);
	if (ret != ELEVATOR_NO_MERGE)
		goto out;

	/*
	 * see if the merge hash can satisfy a back merge
	 */
	if ((__rq = deadline_find_hash(dd, bio->bi_sector))) {
		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);

		if (elv_rq_merge_ok(__rq, bio)) {
			*req = __rq;
			q->last_merge = &__rq->queuelist;
			ret = ELEVATOR_BACK_MERGE;
			goto out_ret;
		}
	}

	entry = sort_list = &dd->sort_list[data_dir];
	while ((entry = entry->prev) != sort_list) {
		__rq = list_entry_rq(entry);

		BUG_ON(__rq->flags & REQ_STARTED);

		if (!(__rq->flags & REQ_CMD))
			continue;

		if (!*req && bio_rq_in_between(bio, __rq, sort_list))
			*req = __rq;

		/* never merge across a barrier */
		if (__rq->flags & REQ_BARRIER)
			break;

		/*
		 * checking for a front merge, hash will miss those
		 */
		if (__rq->sector - bio_sectors(bio) == bio->bi_sector) {
			ret = elv_try_merge(__rq, bio);
			if (ret != ELEVATOR_NO_MERGE) {
				*req = __rq;
				q->last_merge = &__rq->queuelist;
				break;
			}
		}
	}

out:
	/*
	 * the merged request's end sector may have moved, so re-hash it
	 */
	if (ret != ELEVATOR_NO_MERGE) {
		struct deadline_rq *drq = RQ_DATA(*req);

		deadline_del_rq_hash(drq);
		deadline_add_rq_hash(dd, drq);
	}
out_ret:
	return ret;
}
/*
 * two requests are about to be merged: @next will disappear, so @req
 * inherits whichever of the two deadlines is earlier (taking over
 * next's fifo slot), and req is re-hashed since its end sector grows
 */
static void
deadline_merge_request(request_queue_t *q, struct request *req, struct request *next)
{
	struct deadline_data *dd = q->elevator.elevator_data;
	struct deadline_rq *dreq = RQ_DATA(req);
	struct deadline_rq *dnxt = RQ_DATA(next);

	BUG_ON(dreq == NULL);
	BUG_ON(dnxt == NULL);

	deadline_del_rq_hash(dreq);
	deadline_add_rq_hash(dd, dreq);

	/*
	 * if next expires before req, assign its expire time to req
	 * and move into next's position (next will be deleted) in fifo
	 */
	if (!list_empty(&dreq->fifo) && !list_empty(&dnxt->fifo)
	    && time_before(dnxt->expires, dreq->expires)) {
		list_move(&dreq->fifo, &dnxt->fifo);
		dreq->expires = dnxt->expires;
	}
}
/*
 * move request from sort list to dispatch queue. maybe remove from rq hash
 * here too?
 */
static inline void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
	struct deadline_rq *shadow = RQ_DATA(rq);

	/* off the read fifo, onto the tail of the dispatch queue */
	list_del_init(&shadow->fifo);
	list_move_tail(&rq->queuelist, dd->dispatch);
}
/*
 * move along sort list and move entries to dispatch queue, starting from rq.
 * each request contiguous with its predecessor costs 1 from the batch
 * budget, each seek costs dd->seek_cost; stop once the budget runs out
 * or the sort list is exhausted.
 */
static void deadline_move_requests(struct deadline_data *dd, struct request *rq)
{
	struct list_head *sort_head = &dd->sort_list[rq_data_dir(rq)];
	sector_t last_sec = dd->last_sector;
	int batch_count = dd->fifo_batch;
	do {
		/* grab the successor before rq is unlinked from the sort list */
		struct list_head *nxt = rq->queuelist.next;
		/*
		 * take it off the sort and fifo list, move
		 * to dispatch queue
		 */
		deadline_move_to_dispatch(dd, rq);
		/* sequential with the previous request is cheap, a seek is not */
		if (rq->sector == last_sec)
			batch_count--;
		else
			batch_count -= dd->seek_cost;
		if (nxt == sort_head)
			break;
		last_sec = rq->sector + rq->nr_sectors;
		rq = list_entry_rq(nxt);
	} while (batch_count > 0);
}
/*
 * returns 0 if there are no expired reads on the fifo, 1 otherwise
 */
#define list_entry_fifo(ptr)	list_entry((ptr), struct deadline_rq, fifo)
static inline int deadline_check_fifo(struct deadline_data *dd)
{
	if (!list_empty(&dd->read_fifo)) {
		/* the fifo is kept in submission order, so the head is oldest */
		struct deadline_rq *oldest = list_entry_fifo(dd->read_fifo.next);

		if (!time_before(jiffies, oldest->expires))
			return 1;
	}

	return 0;
}
/*
 * hand the driver its next request. priority order: whatever is already
 * on the dispatch queue; a batch of reads when the oldest read has
 * expired; a batch of reads from the sort list; a batch of writes.
 * reads defer to writes once they have starved them
 * dd->writes_starved times in a row.
 */
static struct request *deadline_next_request(request_queue_t *q)
{
	struct deadline_data *dd = q->elevator.elevator_data;
	struct deadline_rq *drq;
	struct list_head *nxt;
	struct request *rq;
	int writes;
	/*
	 * if still requests on the dispatch queue, just grab the first one
	 */
	if (!list_empty(&q->queue_head)) {
dispatch:
		rq = list_entry_rq(q->queue_head.next);
		/* remember where the head leaves off, for contiguity costing */
		dd->last_sector = rq->sector + rq->nr_sectors;
		return rq;
	}
	writes = !list_empty(&dd->sort_list[WRITE]);
	/*
	 * if we have expired entries on the fifo list, move some to dispatch
	 */
	if (deadline_check_fifo(dd)) {
		/* but reads must yield once they starve writes long enough */
		if (writes && (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;
		nxt = dd->read_fifo.next;
		drq = list_entry_fifo(nxt);
		deadline_move_requests(dd, drq->request);
		goto dispatch;
	}
	if (!list_empty(&dd->sort_list[READ])) {
		if (writes && (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;
		nxt = dd->sort_list[READ].next;
		deadline_move_requests(dd, list_entry_rq(nxt));
		goto dispatch;
	}
	/*
	 * either there are no reads expired or on sort list, or the reads
	 * have starved writes for too long. dispatch some writes
	 */
	if (writes) {
dispatch_writes:
		nxt = dd->sort_list[WRITE].next;
		deadline_move_requests(dd, list_entry_rq(nxt));
		/* writes got through; reset the starvation counter */
		dd->starved = 0;
		goto dispatch;
	}
	BUG_ON(!list_empty(&dd->sort_list[READ]));
	BUG_ON(writes);
	return NULL;
}
/*
 * insert a new request into the scheduler: onto the sort list (at the
 * caller-chosen spot, or at the tail), onto the merge hash if it is
 * mergeable, and onto the read fifo with a deadline if it is a read
 */
static void
deadline_add_request(request_queue_t *q, struct request *rq, struct list_head *insert_here)
{
	struct deadline_data *dd = q->elevator.elevator_data;
	struct deadline_rq *drq = RQ_DATA(rq);

	/*
	 * flush hash on barrier insert, as not to allow merges before a
	 * barrier.
	 */
	if (unlikely(rq->flags & REQ_BARRIER)) {
		DL_INVALIDATE_HASH(dd);
		q->last_merge = NULL;
	}

	/*
	 * add to sort list
	 */
	if (insert_here == NULL)
		insert_here = dd->sort_list[rq_data_dir(rq)].prev;

	list_add(&rq->queuelist, insert_here);

	/* non-fs requests never merge and never expire */
	if (unlikely(!(rq->flags & REQ_CMD)))
		return;

	if (rq_mergeable(rq)) {
		deadline_add_rq_hash(dd, drq);
		if (!q->last_merge)
			q->last_merge = &rq->queuelist;
	}

	if (rq_data_dir(rq) == READ) {
		/*
		 * set expire time and add to fifo list
		 */
		drq->expires = jiffies + dd->read_expire;
		list_add_tail(&drq->fifo, &dd->read_fifo);
	}
}
/*
 * a request is leaving the elevator: drop it from the fifo and the hash
 */
static void deadline_remove_request(request_queue_t *q, struct request *rq)
{
	struct deadline_rq *drq = RQ_DATA(rq);

	if (drq == NULL)
		return;

	list_del_init(&drq->fifo);
	deadline_del_rq_hash(drq);
}
/*
 * returns nonzero iff the scheduler holds no work at all
 */
static int deadline_queue_empty(request_queue_t *q)
{
	struct deadline_data *dd = q->elevator.elevator_data;

	if (list_empty(&q->queue_head)
	    && list_empty(&dd->sort_list[READ])
	    && list_empty(&dd->sort_list[WRITE])) {
		/* no reads anywhere means the read fifo must be drained too */
		BUG_ON(!list_empty(&dd->read_fifo));
		return 1;
	}

	return 0;
}
/*
 * hand back the sort list that requests of rq's data direction live on
 */
static struct list_head *
deadline_get_sort_head(request_queue_t *q, struct request *rq)
{
	struct deadline_data *dd = q->elevator.elevator_data;
	const int ddir = rq_data_dir(rq);

	return &dd->sort_list[ddir];
}
/*
 * tear down scheduler state. the scheduler lists must already be empty;
 * walk both request free lists releasing the per-request deadline_rq
 * shadows, then free the hash table and the deadline_data itself.
 * also used by deadline_init() to unwind a partial initialization.
 */
static void deadline_exit(request_queue_t *q, elevator_t *e)
{
	struct deadline_data *dd = e->elevator_data;
	struct deadline_rq *drq;
	struct request *rq;
	int i;
	BUG_ON(!list_empty(&dd->read_fifo));
	BUG_ON(!list_empty(&dd->sort_list[READ]));
	BUG_ON(!list_empty(&dd->sort_list[WRITE]));
	for (i = READ; i <= WRITE; i++) {
		struct request_list *rl = &q->rq[i];
		struct list_head *entry = &rl->free;
		if (list_empty(&rl->free))
			continue;
		while ((entry = entry->next) != &rl->free) {
			rq = list_entry_rq(entry);
			/* NULL when deadline_init() bailed out early — skip */
			if ((drq = RQ_DATA(rq)) == NULL)
				continue;
			rq->elevator_private = NULL;
			kmem_cache_free(drq_pool, drq);
		}
	}
	kfree(dd->hash);
	kfree(dd);
}
/*
 * initialize elevator private data (deadline_data), and alloc a drq for
 * each request on the free lists. on any allocation failure everything
 * done so far is unwound via deadline_exit() and -ENOMEM is returned.
 */
static int deadline_init(request_queue_t *q, elevator_t *e)
{
	struct deadline_data *dd;
	struct deadline_rq *drq;
	struct request *rq;
	int i, ret = 0;

	if (!drq_pool)
		return -ENOMEM;

	dd = kmalloc(sizeof(*dd), GFP_KERNEL);
	if (!dd)
		return -ENOMEM;
	memset(dd, 0, sizeof(*dd));

	dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL);
	if (!dd->hash) {
		kfree(dd);
		return -ENOMEM;
	}

	for (i = 0; i < DL_HASH_ENTRIES; i++)
		INIT_LIST_HEAD(&dd->hash[i]);

	INIT_LIST_HEAD(&dd->read_fifo);
	INIT_LIST_HEAD(&dd->sort_list[READ]);
	INIT_LIST_HEAD(&dd->sort_list[WRITE]);
	dd->dispatch = &q->queue_head;
	dd->fifo_batch = fifo_batch;
	dd->read_expire = read_expire;
	dd->seek_cost = seek_cost;
	dd->hash_valid_count = 1;	/* generation 0 means "not hashed" */
	dd->writes_starved = writes_starved;
	e->elevator_data = dd;		/* needed by deadline_exit() below */

	/*
	 * attach a deadline_rq shadow to every request on the free lists
	 */
	for (i = READ; i <= WRITE; i++) {
		struct request_list *rl = &q->rq[i];
		struct list_head *entry = &rl->free;

		while ((entry = entry->next) != &rl->free) {
			rq = list_entry_rq(entry);

			drq = kmem_cache_alloc(drq_pool, GFP_KERNEL);
			if (!drq) {
				/*
				 * no point allocating shadows for the
				 * remaining requests when they will all be
				 * freed again right away -- bail out of both
				 * loops and unwind
				 */
				ret = -ENOMEM;
				goto out;
			}

			memset(drq, 0, sizeof(*drq));
			INIT_LIST_HEAD(&drq->fifo);
			INIT_LIST_HEAD(&drq->hash);
			drq->request = rq;
			rq->elevator_private = drq;
		}
	}
out:
	if (ret)
		deadline_exit(q, e);

	return ret;
}
/*
 * one-time module init: create the slab cache that per-request
 * deadline_rq shadows are allocated from. the scheduler cannot operate
 * without it, so failure here is fatal.
 */
static int __init deadline_slab_setup(void)
{
	drq_pool = kmem_cache_create("deadline_drq", sizeof(struct deadline_rq),
				     0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!drq_pool)
		panic("deadline: can't init slab pool\n");
	return 0;
}
module_init(deadline_slab_setup);
/*
 * method table registered with the elevator core; wired up as the
 * queue's i/o scheduler via elevator_init()
 */
elevator_t iosched_deadline = {
	.elevator_merge_fn =		deadline_merge,
	.elevator_merge_req_fn =	deadline_merge_request,
	.elevator_next_req_fn =		deadline_next_request,
	.elevator_add_req_fn =		deadline_add_request,
	.elevator_remove_req_fn =	deadline_remove_request,
	.elevator_queue_empty_fn =	deadline_queue_empty,
	.elevator_get_sort_head_fn =	deadline_get_sort_head,
	.elevator_init_fn =		deadline_init,
	.elevator_exit_fn =		deadline_exit,
};
EXPORT_SYMBOL(iosched_deadline);
...@@ -157,114 +157,6 @@ inline int elv_try_last_merge(request_queue_t *q, struct request **req, ...@@ -157,114 +157,6 @@ inline int elv_try_last_merge(request_queue_t *q, struct request **req,
return ret; return ret;
} }
static int bio_rq_before(struct bio *bio, struct request *rq)
{
if (!kdev_same(to_kdev_t(bio->bi_bdev->bd_dev), rq->rq_dev))
return 0;
return bio->bi_sector < rq->sector;
}
/*
* elevator_linux starts here
*/
int elevator_linus_merge(request_queue_t *q, struct request **req,
struct bio *bio)
{
struct list_head *entry, *good;
struct request *__rq;
int ret;
if ((ret = elv_try_last_merge(q, req, bio)))
return ret;
entry = &q->queue_head;
good = &q->queue_head;
ret = ELEVATOR_NO_MERGE;
while ((entry = entry->prev) != &q->queue_head) {
__rq = list_entry_rq(entry);
if (__rq->flags & (REQ_BARRIER | REQ_STARTED))
break;
if (!(__rq->flags & REQ_CMD))
break;
if (bio_data_dir(bio) != rq_data_dir(__rq)) {
if (bio_data_dir(bio) == WRITE)
break;
good = entry->prev;
continue;
}
ret = elv_try_merge(__rq, bio);
if (ret) {
*req = __rq;
q->last_merge = &__rq->queuelist;
return ret;
}
if (bio_rq_before(bio, __rq))
good = entry->prev;
}
if (good != &q->queue_head)
*req = list_entry_rq(good);
return ELEVATOR_NO_MERGE;
}
void elevator_linus_merge_req(request_queue_t *q, struct request *req,
struct request *next)
{
if (elv_linus_sequence(next) < elv_linus_sequence(req))
elv_linus_sequence(req) = elv_linus_sequence(next);
}
void elevator_linus_add_request(request_queue_t *q, struct request *rq,
struct list_head *insert_here)
{
elevator_t *e = &q->elevator;
int lat = 0, *latency = e->elevator_data;
if (!insert_here)
insert_here = q->queue_head.prev;
if (!(rq->flags & REQ_BARRIER))
lat = latency[rq_data_dir(rq)];
elv_linus_sequence(rq) = lat;
list_add(&rq->queuelist, insert_here);
/*
* new merges must not precede this barrier
*/
if (rq->flags & REQ_BARRIER)
q->last_merge = NULL;
else if (!q->last_merge)
q->last_merge = &rq->queuelist;
}
int elevator_linus_init(request_queue_t *q, elevator_t *e)
{
int *latency;
latency = kmalloc(2 * sizeof(int), GFP_KERNEL);
if (!latency)
return -ENOMEM;
latency[READ] = 1024;
latency[WRITE] = 2048;
e->elevator_data = latency;
return 0;
}
void elevator_linus_exit(request_queue_t *q, elevator_t *e)
{
kfree(e->elevator_data);
}
/* /*
* elevator noop * elevator noop
* *
...@@ -442,15 +334,6 @@ inline struct list_head *elv_get_sort_head(request_queue_t *q, ...@@ -442,15 +334,6 @@ inline struct list_head *elv_get_sort_head(request_queue_t *q,
return &q->queue_head; return &q->queue_head;
} }
elevator_t elevator_linus = {
elevator_merge_fn: elevator_linus_merge,
elevator_merge_req_fn: elevator_linus_merge_req,
elevator_next_req_fn: elevator_noop_next_request,
elevator_add_req_fn: elevator_linus_add_request,
elevator_init_fn: elevator_linus_init,
elevator_exit_fn: elevator_linus_exit,
};
elevator_t elevator_noop = { elevator_t elevator_noop = {
elevator_merge_fn: elevator_noop_merge, elevator_merge_fn: elevator_noop_merge,
elevator_next_req_fn: elevator_noop_next_request, elevator_next_req_fn: elevator_noop_next_request,
...@@ -459,7 +342,6 @@ elevator_t elevator_noop = { ...@@ -459,7 +342,6 @@ elevator_t elevator_noop = {
module_init(elevator_global_init); module_init(elevator_global_init);
EXPORT_SYMBOL(elevator_linus);
EXPORT_SYMBOL(elevator_noop); EXPORT_SYMBOL(elevator_noop);
EXPORT_SYMBOL(__elv_add_request); EXPORT_SYMBOL(__elv_add_request);
......
...@@ -1175,7 +1175,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) ...@@ -1175,7 +1175,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock)
if (blk_init_free_list(q)) if (blk_init_free_list(q))
return -ENOMEM; return -ENOMEM;
if ((ret = elevator_init(q, &q->elevator, elevator_linus))) { if ((ret = elevator_init(q, &q->elevator, iosched_deadline))) {
blk_cleanup_queue(q); blk_cleanup_queue(q);
return ret; return ret;
} }
...@@ -1233,24 +1233,23 @@ static struct request *get_request(request_queue_t *q, int rw) ...@@ -1233,24 +1233,23 @@ static struct request *get_request(request_queue_t *q, int rw)
*/ */
static struct request *get_request_wait(request_queue_t *q, int rw) static struct request *get_request_wait(request_queue_t *q, int rw)
{ {
DECLARE_WAITQUEUE(wait, current); DEFINE_WAIT(wait);
struct request_list *rl = &q->rq[rw]; struct request_list *rl = &q->rq[rw];
struct request *rq; struct request *rq;
spin_lock_prefetch(q->queue_lock); spin_lock_prefetch(q->queue_lock);
generic_unplug_device(q); generic_unplug_device(q);
add_wait_queue_exclusive(&rl->wait, &wait);
do { do {
set_current_state(TASK_UNINTERRUPTIBLE); prepare_to_wait_exclusive(&rl->wait, &wait,
TASK_UNINTERRUPTIBLE);
if (!rl->count) if (!rl->count)
schedule(); schedule();
finish_wait(&rl->wait, &wait);
spin_lock_irq(q->queue_lock); spin_lock_irq(q->queue_lock);
rq = get_request(q, rw); rq = get_request(q, rw);
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
} while (rq == NULL); } while (rq == NULL);
remove_wait_queue(&rl->wait, &wait);
current->state = TASK_RUNNING;
return rq; return rq;
} }
...@@ -1460,18 +1459,16 @@ void blk_put_request(struct request *req) ...@@ -1460,18 +1459,16 @@ void blk_put_request(struct request *req)
*/ */
void blk_congestion_wait(int rw, long timeout) void blk_congestion_wait(int rw, long timeout)
{ {
DECLARE_WAITQUEUE(wait, current); DEFINE_WAIT(wait);
struct congestion_state *cs = &congestion_states[rw]; struct congestion_state *cs = &congestion_states[rw];
if (atomic_read(&cs->nr_congested_queues) == 0) if (atomic_read(&cs->nr_congested_queues) == 0)
return; return;
blk_run_queues(); blk_run_queues();
set_current_state(TASK_UNINTERRUPTIBLE); prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE);
add_wait_queue(&cs->wqh, &wait);
if (atomic_read(&cs->nr_congested_queues) != 0) if (atomic_read(&cs->nr_congested_queues) != 0)
schedule_timeout(timeout); schedule_timeout(timeout);
set_current_state(TASK_RUNNING); finish_wait(&cs->wqh, &wait);
remove_wait_queue(&cs->wqh, &wait);
} }
/* /*
......
...@@ -157,18 +157,12 @@ struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { ...@@ -157,18 +157,12 @@ struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
#define MAX_DISK_SIZE 1024*1024*1024 #define MAX_DISK_SIZE 1024*1024*1024
static unsigned long
compute_loop_size(struct loop_device *lo, struct dentry * lo_dentry)
{
loff_t size = lo_dentry->d_inode->i_mapping->host->i_size;
return (size - lo->lo_offset) >> BLOCK_SIZE_BITS;
}
static void figure_loop_size(struct loop_device *lo) static void figure_loop_size(struct loop_device *lo)
{ {
set_capacity(disks + lo->lo_number, compute_loop_size(lo, loff_t size = lo->lo_backing_file->f_dentry->d_inode->i_size;
lo->lo_backing_file->f_dentry));
set_capacity(disks + lo->lo_number,
(size - lo->lo_offset) >> 9);
} }
static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf, static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf,
......
...@@ -338,6 +338,9 @@ static void cy82c693_tune_drive (ide_drive_t *drive, u8 pio) ...@@ -338,6 +338,9 @@ static void cy82c693_tune_drive (ide_drive_t *drive, u8 pio)
*/ */
unsigned int __init init_chipset_cy82c693(struct pci_dev *dev, const char *name) unsigned int __init init_chipset_cy82c693(struct pci_dev *dev, const char *name)
{ {
if (PCI_FUNC(dev->devfn) != 1)
return 0;
#ifdef CY82C693_SETDMA_CLOCK #ifdef CY82C693_SETDMA_CLOCK
u8 data = 0; u8 data = 0;
#endif /* CY82C693_SETDMA_CLOCK */ #endif /* CY82C693_SETDMA_CLOCK */
...@@ -411,20 +414,30 @@ void __init init_hwif_cy82c693(ide_hwif_t *hwif) ...@@ -411,20 +414,30 @@ void __init init_hwif_cy82c693(ide_hwif_t *hwif)
#endif /* CONFIG_BLK_DEV_IDEDMA */ #endif /* CONFIG_BLK_DEV_IDEDMA */
} }
void __init init_dma_cy82c693 (ide_hwif_t *hwif, unsigned long dmabase) static __initdata ide_hwif_t *primary;
void __init init_iops_cy82c693(ide_hwif_t *hwif)
{ {
ide_setup_dma(hwif, dmabase, 8); if (PCI_FUNC(hwif->pci_dev->devfn) == 1)
primary = hwif;
else {
hwif->mate = primary;
hwif->channel = 1;
}
} }
extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
static int __devinit cy82c693_init_one(struct pci_dev *dev, const struct pci_device_id *id) static int __devinit cy82c693_init_one(struct pci_dev *dev, const struct pci_device_id *id)
{ {
ide_pci_device_t *d = &cy82c693_chipsets[id->driver_data]; ide_pci_device_t *d = &cy82c693_chipsets[id->driver_data];
if ((!(PCI_FUNC(dev->devfn) & 1) || struct pci_dev *dev2;
(!((dev->class >> 8) == PCI_CLASS_STORAGE_IDE))))
return 0; /* CY82C693 is more than only a IDE controller */ /* CY82C693 is more than only a IDE controller.
ide_setup_pci_device(dev, d); Function 1 is primary IDE channel, function 2 - secondary. */
if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE &&
PCI_FUNC(dev->devfn) == 1) {
dev2 = pci_find_slot(dev->bus->number, dev->devfn + 1);
ide_setup_pci_devices(dev, dev2, d);
}
return 0; return 0;
} }
......
...@@ -66,7 +66,7 @@ typedef struct pio_clocks_s { ...@@ -66,7 +66,7 @@ typedef struct pio_clocks_s {
extern unsigned int init_chipset_cy82c693(struct pci_dev *, const char *); extern unsigned int init_chipset_cy82c693(struct pci_dev *, const char *);
extern void init_hwif_cy82c693(ide_hwif_t *); extern void init_hwif_cy82c693(ide_hwif_t *);
extern void init_dma_cy82c693(ide_hwif_t *, unsigned long); extern void init_iops_cy82c693(ide_hwif_t *);
static ide_pci_device_t cy82c693_chipsets[] __initdata = { static ide_pci_device_t cy82c693_chipsets[] __initdata = {
{ /* 0 */ { /* 0 */
...@@ -74,10 +74,10 @@ static ide_pci_device_t cy82c693_chipsets[] __initdata = { ...@@ -74,10 +74,10 @@ static ide_pci_device_t cy82c693_chipsets[] __initdata = {
device: PCI_DEVICE_ID_CONTAQ_82C693, device: PCI_DEVICE_ID_CONTAQ_82C693,
name: "CY82C693", name: "CY82C693",
init_chipset: init_chipset_cy82c693, init_chipset: init_chipset_cy82c693,
init_iops: NULL, init_iops: init_iops_cy82c693,
init_hwif: init_hwif_cy82c693, init_hwif: init_hwif_cy82c693,
init_dma: init_dma_cy82c693, init_dma: NULL,
channels: 2, channels: 1,
autodma: AUTODMA, autodma: AUTODMA,
enablebits: {{0x00,0x00,0x00}, {0x00,0x00,0x00}}, enablebits: {{0x00,0x00,0x00}, {0x00,0x00,0x00}},
bootable: ON_BOARD, bootable: ON_BOARD,
......
...@@ -250,6 +250,7 @@ static unsigned long __init ide_get_or_set_dma_base (ide_hwif_t *hwif) ...@@ -250,6 +250,7 @@ static unsigned long __init ide_get_or_set_dma_base (ide_hwif_t *hwif)
switch(dev->device) { switch(dev->device) {
case PCI_DEVICE_ID_AL_M5219: case PCI_DEVICE_ID_AL_M5219:
case PCI_DEVICE_ID_AL_M5229:
case PCI_DEVICE_ID_AMD_VIPER_7409: case PCI_DEVICE_ID_AMD_VIPER_7409:
case PCI_DEVICE_ID_CMD_643: case PCI_DEVICE_ID_CMD_643:
case PCI_DEVICE_ID_SERVERWORKS_CSB5IDE: case PCI_DEVICE_ID_SERVERWORKS_CSB5IDE:
......
...@@ -68,6 +68,7 @@ static int proc_read_escdinfo(char *buf, char **start, off_t pos, ...@@ -68,6 +68,7 @@ static int proc_read_escdinfo(char *buf, char **start, off_t pos,
); );
} }
#define MAX_SANE_ESCD_SIZE (32*1024)
static int proc_read_escd(char *buf, char **start, off_t pos, static int proc_read_escd(char *buf, char **start, off_t pos,
int count, int *eof, void *data) int count, int *eof, void *data)
{ {
...@@ -79,8 +80,8 @@ static int proc_read_escd(char *buf, char **start, off_t pos, ...@@ -79,8 +80,8 @@ static int proc_read_escd(char *buf, char **start, off_t pos,
return -EIO; return -EIO;
/* sanity check */ /* sanity check */
if (escd.escd_size > (32*1024)) { if (escd.escd_size > MAX_SANE_ESCD_SIZE) {
printk(KERN_ERR "PnPBIOS: proc_read_escd: ESCD size is too great\n"); printk(KERN_ERR "PnPBIOS: proc_read_escd: ESCD size reported by BIOS escd_info call is too great\n");
return -EFBIG; return -EFBIG;
} }
...@@ -90,7 +91,14 @@ static int proc_read_escd(char *buf, char **start, off_t pos, ...@@ -90,7 +91,14 @@ static int proc_read_escd(char *buf, char **start, off_t pos,
if (pnp_bios_read_escd(tmpbuf, escd.nv_storage_base)) if (pnp_bios_read_escd(tmpbuf, escd.nv_storage_base))
return -EIO; return -EIO;
escd_size = (unsigned char)(buf[0]) + (unsigned char)(buf[1])*256; escd_size = (unsigned char)(tmpbuf[0]) + (unsigned char)(tmpbuf[1])*256;
/* sanity check */
if (escd_size > MAX_SANE_ESCD_SIZE) {
printk(KERN_ERR "PnPBIOS: proc_read_escd: ESCD size reported by BIOS read_escd call is too great\n");
return -EFBIG;
}
escd_left_to_read = escd_size - pos; escd_left_to_read = escd_size - pos;
if (escd_left_to_read < 0) escd_left_to_read = 0; if (escd_left_to_read < 0) escd_left_to_read = 0;
if (escd_left_to_read == 0) *eof = 1; if (escd_left_to_read == 0) *eof = 1;
......
...@@ -148,6 +148,11 @@ ...@@ -148,6 +148,11 @@
Fix bug in raw command post with data ioctl method. Fix bug in raw command post with data ioctl method.
Fix bug where rollcall sometimes failed with cable errors. Fix bug where rollcall sometimes failed with cable errors.
Print unit # on all command timeouts. Print unit # on all command timeouts.
1.02.00.026 - Fix possible infinite retry bug with power glitch induced
drive timeouts.
Cleanup some AEN severity levels.
1.02.00.027 - Add drive not supported AEN code for SATA controllers.
Remove spurious unknown ioctl error message.
*/ */
#include <linux/module.h> #include <linux/module.h>
...@@ -201,7 +206,7 @@ static struct notifier_block tw_notifier = { ...@@ -201,7 +206,7 @@ static struct notifier_block tw_notifier = {
}; };
/* Globals */ /* Globals */
char *tw_driver_version="1.02.00.025"; char *tw_driver_version="1.02.00.027";
TW_Device_Extension *tw_device_extension_list[TW_MAX_SLOT]; TW_Device_Extension *tw_device_extension_list[TW_MAX_SLOT];
int tw_device_extension_count = 0; int tw_device_extension_count = 0;
...@@ -212,7 +217,7 @@ int tw_aen_complete(TW_Device_Extension *tw_dev, int request_id) ...@@ -212,7 +217,7 @@ int tw_aen_complete(TW_Device_Extension *tw_dev, int request_id)
{ {
TW_Param *param; TW_Param *param;
unsigned short aen; unsigned short aen;
int error = 0; int error = 0, table_max = 0;
dprintk(KERN_WARNING "3w-xxxx: tw_aen_complete()\n"); dprintk(KERN_WARNING "3w-xxxx: tw_aen_complete()\n");
if (tw_dev->alignment_virtual_address[request_id] == NULL) { if (tw_dev->alignment_virtual_address[request_id] == NULL) {
...@@ -227,7 +232,8 @@ int tw_aen_complete(TW_Device_Extension *tw_dev, int request_id) ...@@ -227,7 +232,8 @@ int tw_aen_complete(TW_Device_Extension *tw_dev, int request_id)
if (aen == 0x0ff) { if (aen == 0x0ff) {
printk(KERN_WARNING "3w-xxxx: scsi%d: AEN: INFO: AEN queue overflow.\n", tw_dev->host->host_no); printk(KERN_WARNING "3w-xxxx: scsi%d: AEN: INFO: AEN queue overflow.\n", tw_dev->host->host_no);
} else { } else {
if ((aen & 0x0ff) < TW_AEN_STRING_MAX) { table_max = sizeof(tw_aen_string)/sizeof(char *);
if ((aen & 0x0ff) < table_max) {
if ((tw_aen_string[aen & 0xff][strlen(tw_aen_string[aen & 0xff])-1]) == '#') { if ((tw_aen_string[aen & 0xff][strlen(tw_aen_string[aen & 0xff])-1]) == '#') {
printk(KERN_WARNING "3w-xxxx: scsi%d: AEN: %s%d.\n", tw_dev->host->host_no, tw_aen_string[aen & 0xff], aen >> 8); printk(KERN_WARNING "3w-xxxx: scsi%d: AEN: %s%d.\n", tw_dev->host->host_no, tw_aen_string[aen & 0xff], aen >> 8);
} else { } else {
...@@ -289,7 +295,7 @@ int tw_aen_drain_queue(TW_Device_Extension *tw_dev) ...@@ -289,7 +295,7 @@ int tw_aen_drain_queue(TW_Device_Extension *tw_dev)
int first_reset = 0; int first_reset = 0;
int queue = 0; int queue = 0;
int imax, i; int imax, i;
int found = 0; int found = 0, table_max = 0;
dprintk(KERN_NOTICE "3w-xxxx: tw_aen_drain_queue()\n"); dprintk(KERN_NOTICE "3w-xxxx: tw_aen_drain_queue()\n");
...@@ -409,7 +415,8 @@ int tw_aen_drain_queue(TW_Device_Extension *tw_dev) ...@@ -409,7 +415,8 @@ int tw_aen_drain_queue(TW_Device_Extension *tw_dev)
if (aen == 0x0ff) { if (aen == 0x0ff) {
printk(KERN_WARNING "3w-xxxx: AEN: INFO: AEN queue overflow.\n"); printk(KERN_WARNING "3w-xxxx: AEN: INFO: AEN queue overflow.\n");
} else { } else {
if ((aen & 0x0ff) < TW_AEN_STRING_MAX) { table_max = sizeof(tw_aen_string)/sizeof(char *);
if ((aen & 0x0ff) < table_max) {
if ((tw_aen_string[aen & 0xff][strlen(tw_aen_string[aen & 0xff])-1]) == '#') { if ((tw_aen_string[aen & 0xff][strlen(tw_aen_string[aen & 0xff])-1]) == '#') {
printk(KERN_WARNING "3w-xxxx: AEN: %s%d.\n", tw_aen_string[aen & 0xff], aen >> 8); printk(KERN_WARNING "3w-xxxx: AEN: %s%d.\n", tw_aen_string[aen & 0xff], aen >> 8);
} else { } else {
...@@ -1442,7 +1449,8 @@ static void tw_interrupt(int irq, void *dev_instance, struct pt_regs *regs) ...@@ -1442,7 +1449,8 @@ static void tw_interrupt(int irq, void *dev_instance, struct pt_regs *regs)
/* If error, command failed */ /* If error, command failed */
if (error == 1) { if (error == 1) {
tw_dev->srb[request_id]->result = (DID_RESET << 16); /* Ask for a host reset */
tw_dev->srb[request_id]->result = (DID_OK << 16) | (CHECK_CONDITION << 1);
} }
/* Now complete the io */ /* Now complete the io */
...@@ -1784,7 +1792,7 @@ int tw_ioctl(TW_Device_Extension *tw_dev, int request_id) ...@@ -1784,7 +1792,7 @@ int tw_ioctl(TW_Device_Extension *tw_dev, int request_id)
return 1; return 1;
} }
default: default:
printk(KERN_WARNING "3w-xxxx: Unknown ioctl 0x%x.\n", opcode); dprintk(KERN_WARNING "3w-xxxx: Unknown ioctl 0x%x.\n", opcode);
tw_dev->state[request_id] = TW_S_COMPLETED; tw_dev->state[request_id] = TW_S_COMPLETED;
tw_state_request_finish(tw_dev, request_id); tw_state_request_finish(tw_dev, request_id);
tw_dev->srb[request_id]->result = (DID_OK << 16); tw_dev->srb[request_id]->result = (DID_OK << 16);
......
...@@ -90,14 +90,13 @@ static char *tw_aen_string[] = { ...@@ -90,14 +90,13 @@ static char *tw_aen_string[] = {
"INFO: Verify started: Unit #", // 0x029 "INFO: Verify started: Unit #", // 0x029
"ERROR: Verify failed: Port #", // 0x02A "ERROR: Verify failed: Port #", // 0x02A
"INFO: Verify complete: Unit #", // 0x02B "INFO: Verify complete: Unit #", // 0x02B
"ERROR: Overwrote bad sector during rebuild: Port #", //0x02C "WARNING: Overwrote bad sector during rebuild: Port #", //0x02C
"ERROR: Encountered bad sector during rebuild: Port #", //0x02D "ERROR: Encountered bad sector during rebuild: Port #", //0x02D
"INFO: Replacement drive is too small: Port #", //0x02E "ERROR: Replacement drive is too small: Port #", //0x02E
"WARNING: Verify error: Unit not previously initialized: Unit #" //0x02F "WARNING: Verify error: Unit not previously initialized: Unit #", //0x02F
"ERROR: Drive not supported: Port #" // 0x030
}; };
#define TW_AEN_STRING_MAX 0x030
/* /*
Sense key lookup table Sense key lookup table
Format: ESDC/flags,SenseKey,AdditionalSenseCode,AdditionalSenseCodeQualifier Format: ESDC/flags,SenseKey,AdditionalSenseCode,AdditionalSenseCodeQualifier
......
...@@ -128,22 +128,18 @@ void unlock_buffer(struct buffer_head *bh) ...@@ -128,22 +128,18 @@ void unlock_buffer(struct buffer_head *bh)
*/ */
void __wait_on_buffer(struct buffer_head * bh) void __wait_on_buffer(struct buffer_head * bh)
{ {
wait_queue_head_t *wq = bh_waitq_head(bh); wait_queue_head_t *wqh = bh_waitq_head(bh);
struct task_struct *tsk = current; DEFINE_WAIT(wait);
DECLARE_WAITQUEUE(wait, tsk);
get_bh(bh); get_bh(bh);
add_wait_queue(wq, &wait);
do { do {
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
blk_run_queues(); blk_run_queues();
set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh))
if (!buffer_locked(bh))
break;
schedule(); schedule();
} while (buffer_locked(bh)); } while (buffer_locked(bh));
tsk->state = TASK_RUNNING;
remove_wait_queue(wq, &wait);
put_bh(bh); put_bh(bh);
finish_wait(wqh, &wait);
} }
static inline void static inline void
...@@ -246,10 +242,12 @@ int fsync_bdev(struct block_device *bdev) ...@@ -246,10 +242,12 @@ int fsync_bdev(struct block_device *bdev)
} }
/* /*
* sync everything. * sync everything. Start out by waking pdflush, because that writes back
* all queues in parallel.
*/ */
asmlinkage long sys_sync(void) asmlinkage long sys_sync(void)
{ {
wakeup_bdflush(0);
sync_inodes(0); /* All mappings and inodes, including block devices */ sync_inodes(0); /* All mappings and inodes, including block devices */
DQUOT_SYNC(NULL); DQUOT_SYNC(NULL);
sync_supers(); /* Write the superblocks */ sync_supers(); /* Write the superblocks */
......
...@@ -329,12 +329,11 @@ static inline void prune_one_dentry(struct dentry * dentry) ...@@ -329,12 +329,11 @@ static inline void prune_one_dentry(struct dentry * dentry)
void prune_dcache(int count) void prune_dcache(int count)
{ {
spin_lock(&dcache_lock); spin_lock(&dcache_lock);
for (;;) { for (; count ; count--) {
struct dentry *dentry; struct dentry *dentry;
struct list_head *tmp; struct list_head *tmp;
tmp = dentry_unused.prev; tmp = dentry_unused.prev;
if (tmp == &dentry_unused) if (tmp == &dentry_unused)
break; break;
list_del_init(tmp); list_del_init(tmp);
...@@ -349,12 +348,8 @@ void prune_dcache(int count) ...@@ -349,12 +348,8 @@ void prune_dcache(int count)
dentry_stat.nr_unused--; dentry_stat.nr_unused--;
/* Unused dentry with a count? */ /* Unused dentry with a count? */
if (atomic_read(&dentry->d_count)) BUG_ON(atomic_read(&dentry->d_count));
BUG();
prune_one_dentry(dentry); prune_one_dentry(dentry);
if (!--count)
break;
} }
spin_unlock(&dcache_lock); spin_unlock(&dcache_lock);
} }
...@@ -573,19 +568,11 @@ void shrink_dcache_anon(struct list_head *head) ...@@ -573,19 +568,11 @@ void shrink_dcache_anon(struct list_head *head)
/* /*
* This is called from kswapd when we think we need some * This is called from kswapd when we think we need some
* more memory, but aren't really sure how much. So we * more memory.
* carefully try to free a _bit_ of our dcache, but not
* too much.
*
* Priority:
* 1 - very urgent: shrink everything
* ...
* 6 - base-level: try to shrink a bit.
*/ */
int shrink_dcache_memory(int priority, unsigned int gfp_mask) int shrink_dcache_memory(int ratio, unsigned int gfp_mask)
{ {
int count = 0; int entries = dentry_stat.nr_dentry / ratio + 1;
/* /*
* Nasty deadlock avoidance. * Nasty deadlock avoidance.
* *
...@@ -600,11 +587,8 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask) ...@@ -600,11 +587,8 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask)
if (!(gfp_mask & __GFP_FS)) if (!(gfp_mask & __GFP_FS))
return 0; return 0;
count = dentry_stat.nr_unused / priority; prune_dcache(entries);
return entries;
prune_dcache(count);
kmem_cache_shrink(dentry_cache);
return 0;
} }
#define NAME_ALLOC_LEN(len) ((len+16) & ~15) #define NAME_ALLOC_LEN(len) ((len+16) & ~15)
......
...@@ -480,26 +480,17 @@ static void prune_dqcache(int count) ...@@ -480,26 +480,17 @@ static void prune_dqcache(int count)
/* /*
* This is called from kswapd when we think we need some * This is called from kswapd when we think we need some
* more memory, but aren't really sure how much. So we * more memory
* carefully try to free a _bit_ of our dqcache, but not
* too much.
*
* Priority:
* 1 - very urgent: shrink everything
* ...
* 6 - base-level: try to shrink a bit.
*/ */
int shrink_dqcache_memory(int priority, unsigned int gfp_mask) int shrink_dqcache_memory(int ratio, unsigned int gfp_mask)
{ {
int count = 0; int entries = dqstats.allocated_dquots / ratio + 1;
lock_kernel(); lock_kernel();
count = dqstats.free_dquots / priority; prune_dqcache(entries);
prune_dqcache(count);
unlock_kernel(); unlock_kernel();
kmem_cache_shrink(dquot_cachep); return entries;
return 0;
} }
/* /*
......
...@@ -386,10 +386,11 @@ void prune_icache(int goal) ...@@ -386,10 +386,11 @@ void prune_icache(int goal)
count = 0; count = 0;
entry = inode_unused.prev; entry = inode_unused.prev;
while (entry != &inode_unused) for(; goal; goal--) {
{
struct list_head *tmp = entry; struct list_head *tmp = entry;
if (entry == &inode_unused)
break;
entry = entry->prev; entry = entry->prev;
inode = INODE(tmp); inode = INODE(tmp);
if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK))
...@@ -403,8 +404,6 @@ void prune_icache(int goal) ...@@ -403,8 +404,6 @@ void prune_icache(int goal)
list_add(tmp, freeable); list_add(tmp, freeable);
inode->i_state |= I_FREEING; inode->i_state |= I_FREEING;
count++; count++;
if (!--goal)
break;
} }
inodes_stat.nr_unused -= count; inodes_stat.nr_unused -= count;
spin_unlock(&inode_lock); spin_unlock(&inode_lock);
...@@ -414,19 +413,11 @@ void prune_icache(int goal) ...@@ -414,19 +413,11 @@ void prune_icache(int goal)
/* /*
* This is called from kswapd when we think we need some * This is called from kswapd when we think we need some
* more memory, but aren't really sure how much. So we * more memory.
* carefully try to free a _bit_ of our icache, but not
* too much.
*
* Priority:
* 1 - very urgent: shrink everything
* ...
* 6 - base-level: try to shrink a bit.
*/ */
int shrink_icache_memory(int priority, int gfp_mask) int shrink_icache_memory(int ratio, unsigned int gfp_mask)
{ {
int count = 0; int entries = inodes_stat.nr_inodes / ratio + 1;
/* /*
* Nasty deadlock avoidance.. * Nasty deadlock avoidance..
* *
...@@ -437,12 +428,10 @@ int shrink_icache_memory(int priority, int gfp_mask) ...@@ -437,12 +428,10 @@ int shrink_icache_memory(int priority, int gfp_mask)
if (!(gfp_mask & __GFP_FS)) if (!(gfp_mask & __GFP_FS))
return 0; return 0;
count = inodes_stat.nr_unused / priority; prune_icache(entries);
return entries;
prune_icache(count);
kmem_cache_shrink(inode_cachep);
return 0;
} }
EXPORT_SYMBOL(shrink_icache_memory);
/* /*
* Called with the inode lock held. * Called with the inode lock held.
......
...@@ -252,7 +252,7 @@ static int flock_make_lock(struct file *filp, ...@@ -252,7 +252,7 @@ static int flock_make_lock(struct file *filp,
return -ENOMEM; return -ENOMEM;
fl->fl_file = filp; fl->fl_file = filp;
fl->fl_pid = current->pid; fl->fl_pid = current->tgid;
fl->fl_flags = (cmd & LOCK_NB) ? FL_FLOCK : FL_FLOCK | FL_SLEEP; fl->fl_flags = (cmd & LOCK_NB) ? FL_FLOCK : FL_FLOCK | FL_SLEEP;
fl->fl_type = type; fl->fl_type = type;
fl->fl_end = OFFSET_MAX; fl->fl_end = OFFSET_MAX;
...@@ -308,7 +308,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, ...@@ -308,7 +308,7 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
fl->fl_end = OFFSET_MAX; fl->fl_end = OFFSET_MAX;
fl->fl_owner = current->files; fl->fl_owner = current->files;
fl->fl_pid = current->pid; fl->fl_pid = current->tgid;
fl->fl_file = filp; fl->fl_file = filp;
fl->fl_flags = FL_POSIX; fl->fl_flags = FL_POSIX;
fl->fl_notify = NULL; fl->fl_notify = NULL;
...@@ -348,7 +348,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, ...@@ -348,7 +348,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
fl->fl_end = OFFSET_MAX; fl->fl_end = OFFSET_MAX;
fl->fl_owner = current->files; fl->fl_owner = current->files;
fl->fl_pid = current->pid; fl->fl_pid = current->tgid;
fl->fl_file = filp; fl->fl_file = filp;
fl->fl_flags = FL_POSIX; fl->fl_flags = FL_POSIX;
fl->fl_notify = NULL; fl->fl_notify = NULL;
...@@ -377,7 +377,7 @@ static int lease_alloc(struct file *filp, int type, struct file_lock **flp) ...@@ -377,7 +377,7 @@ static int lease_alloc(struct file *filp, int type, struct file_lock **flp)
return -ENOMEM; return -ENOMEM;
fl->fl_owner = current->files; fl->fl_owner = current->files;
fl->fl_pid = current->pid; fl->fl_pid = current->tgid;
fl->fl_file = filp; fl->fl_file = filp;
fl->fl_flags = FL_LEASE; fl->fl_flags = FL_LEASE;
...@@ -669,7 +669,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, ...@@ -669,7 +669,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
int error; int error;
fl.fl_owner = current->files; fl.fl_owner = current->files;
fl.fl_pid = current->pid; fl.fl_pid = current->tgid;
fl.fl_file = filp; fl.fl_file = filp;
fl.fl_flags = FL_POSIX | FL_ACCESS | FL_SLEEP; fl.fl_flags = FL_POSIX | FL_ACCESS | FL_SLEEP;
fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
...@@ -1241,7 +1241,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) ...@@ -1241,7 +1241,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
*before = fl; *before = fl;
list_add(&fl->fl_link, &file_lock_list); list_add(&fl->fl_link, &file_lock_list);
error = f_setown(filp, current->pid, 1); error = f_setown(filp, current->tgid, 1);
out_unlock: out_unlock:
unlock_kernel(); unlock_kernel();
return error; return error;
...@@ -1632,7 +1632,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) ...@@ -1632,7 +1632,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
lock.fl_start = 0; lock.fl_start = 0;
lock.fl_end = OFFSET_MAX; lock.fl_end = OFFSET_MAX;
lock.fl_owner = owner; lock.fl_owner = owner;
lock.fl_pid = current->pid; lock.fl_pid = current->tgid;
lock.fl_file = filp; lock.fl_file = filp;
if (filp->f_op && filp->f_op->lock != NULL) { if (filp->f_op && filp->f_op->lock != NULL) {
......
...@@ -40,7 +40,6 @@ ...@@ -40,7 +40,6 @@
#define XQUAD_PORTIO_BASE 0xfe400000 #define XQUAD_PORTIO_BASE 0xfe400000
#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
#define XQUAD_PORTIO_LEN 0x80000 /* Only remapping first 2 quads */
#ifdef __KERNEL__ #ifdef __KERNEL__
......
...@@ -116,7 +116,7 @@ static inline void down(struct semaphore * sem) ...@@ -116,7 +116,7 @@ static inline void down(struct semaphore * sem)
#if WAITQUEUE_DEBUG #if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic); CHECK_MAGIC(sem->__magic);
#endif #endif
might_sleep();
__asm__ __volatile__( __asm__ __volatile__(
"# atomic down operation\n\t" "# atomic down operation\n\t"
LOCK "decl %0\n\t" /* --sem->count */ LOCK "decl %0\n\t" /* --sem->count */
...@@ -142,7 +142,7 @@ static inline int down_interruptible(struct semaphore * sem) ...@@ -142,7 +142,7 @@ static inline int down_interruptible(struct semaphore * sem)
#if WAITQUEUE_DEBUG #if WAITQUEUE_DEBUG
CHECK_MAGIC(sem->__magic); CHECK_MAGIC(sem->__magic);
#endif #endif
might_sleep();
__asm__ __volatile__( __asm__ __volatile__(
"# atomic interruptible down operation\n\t" "# atomic interruptible down operation\n\t"
LOCK "decl %1\n\t" /* --sem->count */ LOCK "decl %1\n\t" /* --sem->count */
......
...@@ -186,7 +186,7 @@ extern int shrink_dcache_memory(int, unsigned int); ...@@ -186,7 +186,7 @@ extern int shrink_dcache_memory(int, unsigned int);
extern void prune_dcache(int); extern void prune_dcache(int);
/* icache memory management (defined in linux/fs/inode.c) */ /* icache memory management (defined in linux/fs/inode.c) */
extern int shrink_icache_memory(int, int); extern int shrink_icache_memory(int, unsigned int);
extern void prune_icache(int); extern void prune_icache(int);
/* quota cache memory management (defined in linux/fs/dquot.c) */ /* quota cache memory management (defined in linux/fs/dquot.c) */
......
...@@ -52,12 +52,10 @@ extern inline struct list_head *elv_get_sort_head(request_queue_t *, struct requ ...@@ -52,12 +52,10 @@ extern inline struct list_head *elv_get_sort_head(request_queue_t *, struct requ
extern elevator_t elevator_noop; extern elevator_t elevator_noop;
/* /*
* elevator linus. based on linus ideas of starvation control, using * deadline i/o scheduler. uses request time outs to prevent indefinite
* sequencing to manage inserts and merges. * starvation
*/ */
extern elevator_t elevator_linus; extern elevator_t iosched_deadline;
#define elv_linus_sequence(rq) ((long)(rq)->elevator_private)
#define ELV_LINUS_SEEK_COST 16
/* /*
* use the /proc/iosched interface, all the below is history -> * use the /proc/iosched interface, all the below is history ->
......
...@@ -40,6 +40,13 @@ ...@@ -40,6 +40,13 @@
struct completion; struct completion;
#ifdef CONFIG_DEBUG_KERNEL
void __might_sleep(char *file, int line);
#define might_sleep() __might_sleep(__FILE__, __LINE__)
#else
#define might_sleep() do {} while(0)
#endif
extern struct notifier_block *panic_notifier_list; extern struct notifier_block *panic_notifier_list;
NORET_TYPE void panic(const char * fmt, ...) NORET_TYPE void panic(const char * fmt, ...)
__attribute__ ((NORET_AND format (printf, 1, 2))); __attribute__ ((NORET_AND format (printf, 1, 2)));
......
...@@ -524,6 +524,7 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned lon ...@@ -524,6 +524,7 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned lon
extern struct page * vmalloc_to_page(void *addr); extern struct page * vmalloc_to_page(void *addr);
extern unsigned long get_page_cache_size(void); extern unsigned long get_page_cache_size(void);
extern unsigned int nr_used_zone_pages(void);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
......
...@@ -74,9 +74,15 @@ static inline void ___add_to_page_cache(struct page *page, ...@@ -74,9 +74,15 @@ static inline void ___add_to_page_cache(struct page *page,
inc_page_state(nr_pagecache); inc_page_state(nr_pagecache);
} }
extern void FASTCALL(lock_page(struct page *page)); extern void FASTCALL(__lock_page(struct page *page));
extern void FASTCALL(unlock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page));
static inline void lock_page(struct page *page)
{
if (TestSetPageLocked(page))
__lock_page(page);
}
/* /*
* This is exported only for wait_on_page_locked/wait_on_page_writeback. * This is exported only for wait_on_page_locked/wait_on_page_writeback.
* Never use this directly! * Never use this directly!
......
...@@ -40,6 +40,7 @@ extern void FASTCALL(rwsemtrace(struct rw_semaphore *sem, const char *str)); ...@@ -40,6 +40,7 @@ extern void FASTCALL(rwsemtrace(struct rw_semaphore *sem, const char *str));
*/ */
static inline void down_read(struct rw_semaphore *sem) static inline void down_read(struct rw_semaphore *sem)
{ {
might_sleep();
rwsemtrace(sem,"Entering down_read"); rwsemtrace(sem,"Entering down_read");
__down_read(sem); __down_read(sem);
rwsemtrace(sem,"Leaving down_read"); rwsemtrace(sem,"Leaving down_read");
...@@ -62,6 +63,7 @@ static inline int down_read_trylock(struct rw_semaphore *sem) ...@@ -62,6 +63,7 @@ static inline int down_read_trylock(struct rw_semaphore *sem)
*/ */
static inline void down_write(struct rw_semaphore *sem) static inline void down_write(struct rw_semaphore *sem)
{ {
might_sleep();
rwsemtrace(sem,"Entering down_write"); rwsemtrace(sem,"Entering down_write");
__down_write(sem); __down_write(sem);
rwsemtrace(sem,"Leaving down_write"); rwsemtrace(sem,"Leaving down_write");
......
...@@ -100,8 +100,9 @@ extern unsigned long nr_uninterruptible(void); ...@@ -100,8 +100,9 @@ extern unsigned long nr_uninterruptible(void);
#define TASK_RUNNING 0 #define TASK_RUNNING 0
#define TASK_INTERRUPTIBLE 1 #define TASK_INTERRUPTIBLE 1
#define TASK_UNINTERRUPTIBLE 2 #define TASK_UNINTERRUPTIBLE 2
#define TASK_ZOMBIE 4 #define TASK_STOPPED 4
#define TASK_STOPPED 8 #define TASK_ZOMBIE 8
#define TASK_DEAD 16
#define __set_task_state(tsk, state_value) \ #define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0) do { (tsk)->state = (state_value); } while (0)
......
...@@ -119,6 +119,32 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, ...@@ -119,6 +119,32 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
_raced; \ _raced; \
}) })
/*
* Waitqueue's which are removed from the waitqueue_head at wakeup time
*/
void FASTCALL(prepare_to_wait(wait_queue_head_t *q,
wait_queue_t *wait, int state));
void FASTCALL(prepare_to_wait_exclusive(wait_queue_head_t *q,
wait_queue_t *wait, int state));
void FASTCALL(finish_wait(wait_queue_head_t *q, wait_queue_t *wait));
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync);
#define DEFINE_WAIT(name) \
wait_queue_t name = { \
.task = current, \
.func = autoremove_wake_function, \
.task_list = { .next = &name.task_list, \
.prev = &name.task_list, \
}, \
}
#define init_wait(wait) \
do { \
wait->task = current; \
wait->func = autoremove_wake_function; \
INIT_LIST_HEAD(&wait->task_list); \
} while (0)
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif #endif
...@@ -32,6 +32,7 @@ int getrusage(struct task_struct *, int, struct rusage *); ...@@ -32,6 +32,7 @@ int getrusage(struct task_struct *, int, struct rusage *);
static struct dentry * __unhash_process(struct task_struct *p) static struct dentry * __unhash_process(struct task_struct *p)
{ {
struct dentry *proc_dentry; struct dentry *proc_dentry;
nr_threads--; nr_threads--;
detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_PID);
detach_pid(p, PIDTYPE_TGID); detach_pid(p, PIDTYPE_TGID);
...@@ -57,31 +58,31 @@ static struct dentry * __unhash_process(struct task_struct *p) ...@@ -57,31 +58,31 @@ static struct dentry * __unhash_process(struct task_struct *p)
void release_task(struct task_struct * p) void release_task(struct task_struct * p)
{ {
struct dentry *proc_dentry; struct dentry *proc_dentry;
task_t *leader;
if (p->state != TASK_ZOMBIE) if (p->state < TASK_ZOMBIE)
BUG(); BUG();
if (p != current) if (p != current)
wait_task_inactive(p); wait_task_inactive(p);
atomic_dec(&p->user->processes); atomic_dec(&p->user->processes);
security_ops->task_free_security(p); security_ops->task_free_security(p);
free_uid(p->user); free_uid(p->user);
if (unlikely(p->ptrace)) {
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
if (unlikely(p->ptrace))
__ptrace_unlink(p); __ptrace_unlink(p);
write_unlock_irq(&tasklist_lock);
}
BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
write_lock_irq(&tasklist_lock);
__exit_sighand(p); __exit_sighand(p);
proc_dentry = __unhash_process(p); proc_dentry = __unhash_process(p);
/* /*
* If we are the last non-leader member of the thread * If we are the last non-leader member of the thread
* group, and the leader is zombie, then notify the * group, and the leader is zombie, then notify the
* group leader's parent process. * group leader's parent process. (if it wants notification.)
*/ */
if (p->group_leader != p && thread_group_empty(p)) leader = p->group_leader;
do_notify_parent(p->group_leader, p->group_leader->exit_signal); if (leader != p && thread_group_empty(leader) &&
leader->state == TASK_ZOMBIE && leader->exit_signal != -1)
do_notify_parent(leader, leader->exit_signal);
p->parent->cutime += p->utime + p->cutime; p->parent->cutime += p->utime + p->cutime;
p->parent->cstime += p->stime + p->cstime; p->parent->cstime += p->stime + p->cstime;
...@@ -159,7 +160,7 @@ static int __will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) ...@@ -159,7 +160,7 @@ static int __will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) {
if (p == ignored_task if (p == ignored_task
|| p->state == TASK_ZOMBIE || p->state >= TASK_ZOMBIE
|| p->real_parent->pid == 1) || p->real_parent->pid == 1)
continue; continue;
if (p->real_parent->pgrp != pgrp if (p->real_parent->pgrp != pgrp
...@@ -435,8 +436,11 @@ void exit_mm(struct task_struct *tsk) ...@@ -435,8 +436,11 @@ void exit_mm(struct task_struct *tsk)
static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
{ {
/* Make sure we're not reparenting to ourselves. */ /*
if (p == reaper) * Make sure we're not reparenting to ourselves and that
* the parent is not a zombie.
*/
if (p == reaper || reaper->state >= TASK_ZOMBIE)
p->real_parent = child_reaper; p->real_parent = child_reaper;
else else
p->real_parent = reaper; p->real_parent = reaper;
...@@ -774,9 +778,10 @@ static int eligible_child(pid_t pid, int options, task_t *p) ...@@ -774,9 +778,10 @@ static int eligible_child(pid_t pid, int options, task_t *p)
asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru) asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru)
{ {
int flag, retval;
DECLARE_WAITQUEUE(wait, current); DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk; struct task_struct *tsk;
unsigned long state;
int flag, retval;
if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL; return -EINVAL;
...@@ -827,7 +832,15 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc ...@@ -827,7 +832,15 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc
*/ */
if (ret == 2) if (ret == 2)
continue; continue;
/*
* Try to move the task's state to DEAD
* only one thread is allowed to do this:
*/
state = xchg(&p->state, TASK_DEAD);
if (state != TASK_ZOMBIE)
continue;
read_unlock(&tasklist_lock); read_unlock(&tasklist_lock);
retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
if (!retval && stat_addr) { if (!retval && stat_addr) {
if (p->sig->group_exit) if (p->sig->group_exit)
...@@ -835,13 +848,16 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc ...@@ -835,13 +848,16 @@ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struc
else else
retval = put_user(p->exit_code, stat_addr); retval = put_user(p->exit_code, stat_addr);
} }
if (retval) if (retval) {
p->state = TASK_ZOMBIE;
goto end_wait4; goto end_wait4;
}
retval = p->pid; retval = p->pid;
if (p->real_parent != p->parent) { if (p->real_parent != p->parent) {
write_lock_irq(&tasklist_lock); write_lock_irq(&tasklist_lock);
__ptrace_unlink(p); __ptrace_unlink(p);
do_notify_parent(p, SIGCHLD); do_notify_parent(p, SIGCHLD);
p->state = TASK_ZOMBIE;
write_unlock_irq(&tasklist_lock); write_unlock_irq(&tasklist_lock);
} else } else
release_task(p); release_task(p);
......
...@@ -103,6 +103,52 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) ...@@ -103,6 +103,52 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
spin_unlock_irqrestore(&q->lock, flags); spin_unlock_irqrestore(&q->lock, flags);
} }
void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
__set_current_state(state);
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
void
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
__set_current_state(state);
wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue_tail(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
__set_current_state(TASK_RUNNING);
if (!list_empty(&wait->task_list)) {
spin_lock_irqsave(&q->lock, flags);
list_del_init(&wait->task_list);
spin_unlock_irqrestore(&q->lock, flags);
}
}
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync)
{
int ret = default_wake_function(wait, mode, sync);
if (ret)
list_del_init(&wait->task_list);
return ret;
}
void __init fork_init(unsigned long mempages) void __init fork_init(unsigned long mempages)
{ {
/* create a slab on which task_structs can be allocated */ /* create a slab on which task_structs can be allocated */
......
...@@ -400,6 +400,10 @@ EXPORT_SYMBOL(irq_stat); ...@@ -400,6 +400,10 @@ EXPORT_SYMBOL(irq_stat);
EXPORT_SYMBOL(add_wait_queue); EXPORT_SYMBOL(add_wait_queue);
EXPORT_SYMBOL(add_wait_queue_exclusive); EXPORT_SYMBOL(add_wait_queue_exclusive);
EXPORT_SYMBOL(remove_wait_queue); EXPORT_SYMBOL(remove_wait_queue);
EXPORT_SYMBOL(prepare_to_wait);
EXPORT_SYMBOL(prepare_to_wait_exclusive);
EXPORT_SYMBOL(finish_wait);
EXPORT_SYMBOL(autoremove_wake_function);
/* completion handling */ /* completion handling */
EXPORT_SYMBOL(wait_for_completion); EXPORT_SYMBOL(wait_for_completion);
...@@ -493,7 +497,9 @@ EXPORT_SYMBOL(jiffies_64); ...@@ -493,7 +497,9 @@ EXPORT_SYMBOL(jiffies_64);
EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(xtime);
EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_gettimeofday);
EXPORT_SYMBOL(do_settimeofday); EXPORT_SYMBOL(do_settimeofday);
#ifdef CONFIG_DEBUG_KERNEL
EXPORT_SYMBOL(__might_sleep);
#endif
#if !defined(__ia64__) #if !defined(__ia64__)
EXPORT_SYMBOL(loops_per_jiffy); EXPORT_SYMBOL(loops_per_jiffy);
#endif #endif
......
...@@ -53,6 +53,8 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] = ...@@ -53,6 +53,8 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES; static pidmap_t *map_limit = pidmap_array + PIDMAP_ENTRIES;
static spinlock_t pidmap_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
inline void free_pidmap(int pid) inline void free_pidmap(int pid)
{ {
pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
...@@ -77,8 +79,13 @@ static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps) ...@@ -77,8 +79,13 @@ static inline pidmap_t *next_free_map(pidmap_t *map, int *max_steps)
* Free the page if someone raced with us * Free the page if someone raced with us
* installing it: * installing it:
*/ */
if (cmpxchg(&map->page, NULL, (void *) page)) spin_lock(&pidmap_lock);
if (map->page)
free_page(page); free_page(page);
else
map->page = (void *)page;
spin_unlock(&pidmap_lock);
if (!map->page) if (!map->page)
break; break;
} }
......
...@@ -2150,3 +2150,20 @@ void __init sched_init(void) ...@@ -2150,3 +2150,20 @@ void __init sched_init(void)
enter_lazy_tlb(&init_mm, current, smp_processor_id()); enter_lazy_tlb(&init_mm, current, smp_processor_id());
} }
#ifdef CONFIG_DEBUG_KERNEL
void __might_sleep(char *file, int line)
{
#if defined(in_atomic)
static unsigned long prev_jiffy; /* ratelimiting */
if (in_atomic()) {
if (time_before(jiffies, prev_jiffy + HZ))
return;
prev_jiffy = jiffies;
printk("Sleeping function called from illegal"
" context at %s:%d\n", file, line);
dump_stack();
}
#endif
}
#endif
...@@ -888,20 +888,6 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) ...@@ -888,20 +888,6 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0)
return -EINVAL; return -EINVAL;
if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
current->policy != SCHED_NORMAL)
{
/*
* Short delay requests up to 2 ms will be handled with
* high precision by a busy wait for all real-time processes.
*
* Its important on SMP not to do this holding locks.
*/
udelay((t.tv_nsec + 999) / 1000);
return 0;
}
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
current->state = TASK_INTERRUPTIBLE; current->state = TASK_INTERRUPTIBLE;
......
...@@ -632,19 +632,15 @@ static inline wait_queue_head_t *page_waitqueue(struct page *page) ...@@ -632,19 +632,15 @@ static inline wait_queue_head_t *page_waitqueue(struct page *page)
void wait_on_page_bit(struct page *page, int bit_nr) void wait_on_page_bit(struct page *page, int bit_nr)
{ {
wait_queue_head_t *waitqueue = page_waitqueue(page); wait_queue_head_t *waitqueue = page_waitqueue(page);
struct task_struct *tsk = current; DEFINE_WAIT(wait);
DECLARE_WAITQUEUE(wait, tsk);
add_wait_queue(waitqueue, &wait);
do { do {
set_task_state(tsk, TASK_UNINTERRUPTIBLE); prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
if (!test_bit(bit_nr, &page->flags))
break;
sync_page(page); sync_page(page);
if (test_bit(bit_nr, &page->flags))
schedule(); schedule();
} while (test_bit(bit_nr, &page->flags)); } while (test_bit(bit_nr, &page->flags));
__set_task_state(tsk, TASK_RUNNING); finish_wait(waitqueue, &wait);
remove_wait_queue(waitqueue, &wait);
} }
EXPORT_SYMBOL(wait_on_page_bit); EXPORT_SYMBOL(wait_on_page_bit);
...@@ -690,38 +686,27 @@ void end_page_writeback(struct page *page) ...@@ -690,38 +686,27 @@ void end_page_writeback(struct page *page)
EXPORT_SYMBOL(end_page_writeback); EXPORT_SYMBOL(end_page_writeback);
/* /*
* Get a lock on the page, assuming we need to sleep * Get a lock on the page, assuming we need to sleep to get it.
* to get it.. *
* Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
* random driver's requestfn sets TASK_RUNNING, we could busywait. However
* chances are that on the second loop, the block layer's plug list is empty,
* so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/ */
static void __lock_page(struct page *page) void __lock_page(struct page *page)
{ {
wait_queue_head_t *waitqueue = page_waitqueue(page); wait_queue_head_t *wqh = page_waitqueue(page);
struct task_struct *tsk = current; DEFINE_WAIT(wait);
DECLARE_WAITQUEUE(wait, tsk);
add_wait_queue_exclusive(waitqueue, &wait); while (TestSetPageLocked(page)) {
for (;;) { prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (PageLocked(page)) {
sync_page(page); sync_page(page);
if (PageLocked(page))
schedule(); schedule();
} }
if (!TestSetPageLocked(page)) finish_wait(wqh, &wait);
break;
}
__set_task_state(tsk, TASK_RUNNING);
remove_wait_queue(waitqueue, &wait);
}
/*
* Get an exclusive lock on the page, optimistically
* assuming it's not locked..
*/
void lock_page(struct page *page)
{
if (TestSetPageLocked(page))
__lock_page(page);
} }
EXPORT_SYMBOL(__lock_page);
/* /*
* a rather lightweight function, finding and getting a reference to a * a rather lightweight function, finding and getting a reference to a
......
...@@ -187,7 +187,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, ...@@ -187,7 +187,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* Try to merge with the previous vma. * Try to merge with the previous vma.
*/ */
if (mprotect_attempt_merge(vma, *pprev, end, newflags)) if (mprotect_attempt_merge(vma, *pprev, end, newflags))
return 0; goto success;
} else { } else {
error = split_vma(mm, vma, start, 1); error = split_vma(mm, vma, start, 1);
if (error) if (error)
...@@ -209,7 +209,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, ...@@ -209,7 +209,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
vma->vm_flags = newflags; vma->vm_flags = newflags;
vma->vm_page_prot = newprot; vma->vm_page_prot = newprot;
spin_unlock(&mm->page_table_lock); spin_unlock(&mm->page_table_lock);
success:
change_protection(vma, start, end, newprot); change_protection(vma, start, end, newprot);
return 0; return 0;
......
...@@ -321,6 +321,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -321,6 +321,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
struct page * page; struct page * page;
int freed, i; int freed, i;
if (gfp_mask & __GFP_WAIT)
might_sleep();
KERNEL_STAT_ADD(pgalloc, 1<<order); KERNEL_STAT_ADD(pgalloc, 1<<order);
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
...@@ -479,6 +482,17 @@ unsigned int nr_free_pages(void) ...@@ -479,6 +482,17 @@ unsigned int nr_free_pages(void)
return sum; return sum;
} }
unsigned int nr_used_zone_pages(void)
{
unsigned int pages = 0;
struct zone *zone;
for_each_zone(zone)
pages += zone->nr_active + zone->nr_inactive;
return pages;
}
static unsigned int nr_free_zone_pages(int offset) static unsigned int nr_free_zone_pages(int offset)
{ {
pg_data_t *pgdat; pg_data_t *pgdat;
......
...@@ -79,9 +79,9 @@ static unsigned long last_empty_jifs; ...@@ -79,9 +79,9 @@ static unsigned long last_empty_jifs;
*/ */
struct pdflush_work { struct pdflush_work {
struct task_struct *who; /* The thread */ struct task_struct *who; /* The thread */
void (*fn)(unsigned long); /* A callback function for pdflush to work on */ void (*fn)(unsigned long); /* A callback function */
unsigned long arg0; /* An argument to the callback function */ unsigned long arg0; /* An argument to the callback */
struct list_head list; /* On pdflush_list, when the thread is idle */ struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep; unsigned long when_i_went_to_sleep;
}; };
...@@ -99,23 +99,34 @@ static int __pdflush(struct pdflush_work *my_work) ...@@ -99,23 +99,34 @@ static int __pdflush(struct pdflush_work *my_work)
current->flags |= PF_FLUSHER; current->flags |= PF_FLUSHER;
my_work->fn = NULL; my_work->fn = NULL;
my_work->who = current; my_work->who = current;
INIT_LIST_HEAD(&my_work->list);
spin_lock_irq(&pdflush_lock); spin_lock_irq(&pdflush_lock);
nr_pdflush_threads++; nr_pdflush_threads++;
// printk("pdflush %d [%d] starts\n", nr_pdflush_threads, current->pid);
for ( ; ; ) { for ( ; ; ) {
struct pdflush_work *pdf; struct pdflush_work *pdf;
list_add(&my_work->list, &pdflush_list);
my_work->when_i_went_to_sleep = jiffies;
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
list_move(&my_work->list, &pdflush_list);
my_work->when_i_went_to_sleep = jiffies;
spin_unlock_irq(&pdflush_lock); spin_unlock_irq(&pdflush_lock);
if (current->flags & PF_FREEZE) if (current->flags & PF_FREEZE)
refrigerator(PF_IOTHREAD); refrigerator(PF_IOTHREAD);
schedule(); schedule();
if (my_work->fn) spin_lock_irq(&pdflush_lock);
if (!list_empty(&my_work->list)) {
printk("pdflush: bogus wakeup!\n");
my_work->fn = NULL;
continue;
}
if (my_work->fn == NULL) {
printk("pdflush: NULL work function\n");
continue;
}
spin_unlock_irq(&pdflush_lock);
(*my_work->fn)(my_work->arg0); (*my_work->fn)(my_work->arg0);
/* /*
...@@ -132,6 +143,7 @@ static int __pdflush(struct pdflush_work *my_work) ...@@ -132,6 +143,7 @@ static int __pdflush(struct pdflush_work *my_work)
} }
spin_lock_irq(&pdflush_lock); spin_lock_irq(&pdflush_lock);
my_work->fn = NULL;
/* /*
* Thread destruction: For how long has the sleepiest * Thread destruction: For how long has the sleepiest
...@@ -143,13 +155,12 @@ static int __pdflush(struct pdflush_work *my_work) ...@@ -143,13 +155,12 @@ static int __pdflush(struct pdflush_work *my_work)
continue; continue;
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
pdf->when_i_went_to_sleep = jiffies; /* Limit exit rate */ /* Limit exit rate */
pdf->when_i_went_to_sleep = jiffies;
break; /* exeunt */ break; /* exeunt */
} }
my_work->fn = NULL;
} }
nr_pdflush_threads--; nr_pdflush_threads--;
// printk("pdflush %d [%d] ends\n", nr_pdflush_threads, current->pid);
spin_unlock_irq(&pdflush_lock); spin_unlock_irq(&pdflush_lock);
return 0; return 0;
} }
...@@ -191,11 +202,10 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) ...@@ -191,11 +202,10 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
list_del_init(&pdf->list); list_del_init(&pdf->list);
if (list_empty(&pdflush_list)) if (list_empty(&pdflush_list))
last_empty_jifs = jiffies; last_empty_jifs = jiffies;
spin_unlock_irqrestore(&pdflush_lock, flags);
pdf->fn = fn; pdf->fn = fn;
pdf->arg0 = arg0; pdf->arg0 = arg0;
wmb(); /* ? */
wake_up_process(pdf->who); wake_up_process(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
} }
return ret; return ret;
} }
......
...@@ -1370,6 +1370,9 @@ static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags) ...@@ -1370,6 +1370,9 @@ static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags)
unsigned long save_flags; unsigned long save_flags;
void* objp; void* objp;
if (flags & __GFP_WAIT)
might_sleep();
kmem_cache_alloc_head(cachep, flags); kmem_cache_alloc_head(cachep, flags);
try_again: try_again:
local_irq_save(save_flags); local_irq_save(save_flags);
...@@ -1496,7 +1499,11 @@ static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp) ...@@ -1496,7 +1499,11 @@ static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
if (unlikely(!--slabp->inuse)) { if (unlikely(!--slabp->inuse)) {
/* Was partial or full, now empty. */ /* Was partial or full, now empty. */
list_del(&slabp->list); list_del(&slabp->list);
list_add(&slabp->list, &cachep->slabs_free); /* list_add(&slabp->list, &cachep->slabs_free); */
if (unlikely(list_empty(&cachep->slabs_partial)))
list_add(&slabp->list, &cachep->slabs_partial);
else
kmem_slab_destroy(cachep, slabp);
} else if (unlikely(inuse == cachep->num)) { } else if (unlikely(inuse == cachep->num)) {
/* Was full. */ /* Was full. */
list_del(&slabp->list); list_del(&slabp->list);
...@@ -1970,7 +1977,7 @@ static int s_show(struct seq_file *m, void *p) ...@@ -1970,7 +1977,7 @@ static int s_show(struct seq_file *m, void *p)
} }
list_for_each(q,&cachep->slabs_partial) { list_for_each(q,&cachep->slabs_partial) {
slabp = list_entry(q, slab_t, list); slabp = list_entry(q, slab_t, list);
if (slabp->inuse == cachep->num || !slabp->inuse) if (slabp->inuse == cachep->num)
BUG(); BUG();
active_objs += slabp->inuse; active_objs += slabp->inuse;
active_slabs++; active_slabs++;
......
...@@ -70,6 +70,10 @@ ...@@ -70,6 +70,10 @@
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif #endif
#ifndef CONFIG_QUOTA
#define shrink_dqcache_memory(ratio, gfp_mask) do { } while (0)
#endif
/* Must be called with page's pte_chain_lock held. */ /* Must be called with page's pte_chain_lock held. */
static inline int page_mapping_inuse(struct page * page) static inline int page_mapping_inuse(struct page * page)
{ {
...@@ -97,7 +101,7 @@ static inline int is_page_cache_freeable(struct page *page) ...@@ -97,7 +101,7 @@ static inline int is_page_cache_freeable(struct page *page)
static /* inline */ int static /* inline */ int
shrink_list(struct list_head *page_list, int nr_pages, shrink_list(struct list_head *page_list, int nr_pages,
unsigned int gfp_mask, int *max_scan) unsigned int gfp_mask, int *max_scan, int *nr_mapped)
{ {
struct address_space *mapping; struct address_space *mapping;
LIST_HEAD(ret_pages); LIST_HEAD(ret_pages);
...@@ -116,6 +120,10 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -116,6 +120,10 @@ shrink_list(struct list_head *page_list, int nr_pages,
if (TestSetPageLocked(page)) if (TestSetPageLocked(page))
goto keep; goto keep;
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
(*nr_mapped)++;
BUG_ON(PageActive(page)); BUG_ON(PageActive(page));
may_enter_fs = (gfp_mask & __GFP_FS) || may_enter_fs = (gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (gfp_mask & __GFP_IO)); (PageSwapCache(page) && (gfp_mask & __GFP_IO));
...@@ -320,7 +328,7 @@ shrink_list(struct list_head *page_list, int nr_pages, ...@@ -320,7 +328,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/ */
static /* inline */ int static /* inline */ int
shrink_cache(int nr_pages, struct zone *zone, shrink_cache(int nr_pages, struct zone *zone,
unsigned int gfp_mask, int max_scan) unsigned int gfp_mask, int max_scan, int *nr_mapped)
{ {
LIST_HEAD(page_list); LIST_HEAD(page_list);
struct pagevec pvec; struct pagevec pvec;
...@@ -371,7 +379,8 @@ shrink_cache(int nr_pages, struct zone *zone, ...@@ -371,7 +379,8 @@ shrink_cache(int nr_pages, struct zone *zone,
max_scan -= nr_scan; max_scan -= nr_scan;
KERNEL_STAT_ADD(pgscan, nr_scan); KERNEL_STAT_ADD(pgscan, nr_scan);
nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan); nr_pages = shrink_list(&page_list, nr_pages,
gfp_mask, &max_scan, nr_mapped);
if (nr_pages <= 0 && list_empty(&page_list)) if (nr_pages <= 0 && list_empty(&page_list))
goto done; goto done;
...@@ -522,14 +531,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) ...@@ -522,14 +531,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in)
static /* inline */ int static /* inline */ int
shrink_zone(struct zone *zone, int max_scan, shrink_zone(struct zone *zone, int max_scan,
unsigned int gfp_mask, int nr_pages) unsigned int gfp_mask, int nr_pages, int *nr_mapped)
{ {
unsigned long ratio; unsigned long ratio;
/* This is bogus for ZONE_HIGHMEM? */
if (kmem_cache_reap(gfp_mask) >= nr_pages)
return 0;
/* /*
* Try to keep the active list 2/3 of the size of the cache. And * Try to keep the active list 2/3 of the size of the cache. And
* make sure that refill_inactive is given a decent number of pages. * make sure that refill_inactive is given a decent number of pages.
...@@ -547,7 +552,8 @@ shrink_zone(struct zone *zone, int max_scan, ...@@ -547,7 +552,8 @@ shrink_zone(struct zone *zone, int max_scan,
atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter);
refill_inactive_zone(zone, SWAP_CLUSTER_MAX); refill_inactive_zone(zone, SWAP_CLUSTER_MAX);
} }
nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan); nr_pages = shrink_cache(nr_pages, zone, gfp_mask,
max_scan, nr_mapped);
return nr_pages; return nr_pages;
} }
...@@ -557,6 +563,9 @@ shrink_caches(struct zone *classzone, int priority, ...@@ -557,6 +563,9 @@ shrink_caches(struct zone *classzone, int priority,
{ {
struct zone *first_classzone; struct zone *first_classzone;
struct zone *zone; struct zone *zone;
int ratio;
int nr_mapped = 0;
int pages = nr_used_zone_pages();
first_classzone = classzone->zone_pgdat->node_zones; first_classzone = classzone->zone_pgdat->node_zones;
for (zone = classzone; zone >= first_classzone; zone--) { for (zone = classzone; zone >= first_classzone; zone--) {
...@@ -581,16 +590,28 @@ shrink_caches(struct zone *classzone, int priority, ...@@ -581,16 +590,28 @@ shrink_caches(struct zone *classzone, int priority,
max_scan = zone->nr_inactive >> priority; max_scan = zone->nr_inactive >> priority;
if (max_scan < to_reclaim * 2) if (max_scan < to_reclaim * 2)
max_scan = to_reclaim * 2; max_scan = to_reclaim * 2;
unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim); unreclaimed = shrink_zone(zone, max_scan,
gfp_mask, to_reclaim, &nr_mapped);
nr_pages -= to_reclaim - unreclaimed; nr_pages -= to_reclaim - unreclaimed;
*total_scanned += max_scan; *total_scanned += max_scan;
} }
shrink_dcache_memory(priority, gfp_mask); /*
shrink_icache_memory(1, gfp_mask); * Here we assume it costs one seek to replace a lru page and that
#ifdef CONFIG_QUOTA * it also takes a seek to recreate a cache object. With this in
shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); * mind we age equal percentages of the lru and ageable caches.
#endif * This should balance the seeks generated by these structures.
*
* NOTE: for now I do this for all zones. If we find this is too
* aggressive on large boxes we may want to exclude ZONE_HIGHMEM
*
* If we're encountering mapped pages on the LRU then increase the
* pressure on slab to avoid swapping.
*/
ratio = (pages / (*total_scanned + nr_mapped + 1)) + 1;
shrink_dcache_memory(ratio, gfp_mask);
shrink_icache_memory(ratio, gfp_mask);
shrink_dqcache_memory(ratio, gfp_mask);
return nr_pages; return nr_pages;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment