Commit d88ebc0a authored by Linus Torvalds

Merge master.kernel.org:/home/hch/BK/xfs/linux-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux
parents 3f614a3d 3f27dd28
......@@ -50,7 +50,7 @@
#include "proto.h"
#include "irq_impl.h"
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
extern unsigned long wall_jiffies; /* kernel/timer.c */
......
......@@ -32,7 +32,7 @@
#include <asm/irq.h>
#include <asm/leds.h>
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
extern unsigned long wall_jiffies;
......
......@@ -45,7 +45,7 @@
#include <asm/svinto.h>
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
static int have_rtc; /* used to remember if we have an RTC or not */
......
......@@ -75,6 +75,11 @@ config X86_SUMMIT
If you don't have one of these computers, you should say N here.
config ACPI_SRAT
bool
default y
depends on NUMA && X86_SUMMIT
config X86_BIGSMP
bool "Support for other sub-arch SMP systems with more than 8 CPUs"
help
......@@ -483,7 +488,7 @@ config NR_CPUS
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation Support"
depends on X86_NUMAQ
depends on (HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT && ACPI && !ACPI_HT_ONLY)))
config DISCONTIGMEM
bool
......@@ -752,6 +757,13 @@ config HAVE_DEC_LOCK
depends on (SMP || PREEMPT) && X86_CMPXCHG
default y
# turning this on wastes a bunch of space.
# Summit needs it only when NUMA is on
config BOOT_IOREMAP
bool
depends on (X86_SUMMIT && NUMA)
default y
endmenu
......
......@@ -28,6 +28,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o
obj-$(CONFIG_EDD) += edd.o
obj-$(CONFIG_MODULES) += module.o
obj-y += sysenter.o
obj-$(CONFIG_ACPI_SRAT) += srat.o
EXTRA_AFLAGS := -traditional
......
......@@ -223,7 +223,7 @@ static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
extern unsigned long irq_affinity [NR_IRQS];
int __cacheline_aligned pending_irq_balance_apicid [NR_IRQS];
static int irqbalance_disabled __initdata = 0;
static int irqbalance_disabled = NO_BALANCE_IRQ;
static int physical_balance = 0;
struct irq_cpu_info {
......@@ -492,7 +492,7 @@ static inline void balance_irq (int cpu, int irq)
unsigned long allowed_mask;
unsigned int new_cpu;
if (no_balance_irq)
if (irqbalance_disabled)
return;
allowed_mask = cpu_online_map & irq_affinity[irq];
......
/*
* Some of the code in this file has been gleaned from the 64 bit
* discontigmem support code base.
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to Pat Gaughen <gone@us.ibm.com>
*/
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/acpi.h>
#include <asm/srat.h>
/*
* proximity macros and definitions
*/
#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
#define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */
/* bitmap length; _PXM is at most 255 */
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
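The BMAP_SET/BMAP_TEST macros above pack one bit per proximity domain (_PXM value) into pxm_bitmap. A standalone sketch, not part of the patch, showing how they behave in plain user-space C:
#include <stdio.h>
#include <string.h>
#define NODE_ARRAY_INDEX(x)	((x) / 8)	/* 8 bits/char */
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
int main(void)
{
	unsigned char bmap[256 / 8];	/* one bit per possible _PXM value */
	memset(bmap, 0, sizeof(bmap));
	BMAP_SET(bmap, 0);		/* SRAT mentioned proximity domain 0 */
	BMAP_SET(bmap, 10);		/* ... and proximity domain 10 */
	printf("pxm 0: %d  pxm 5: %d  pxm 10: %d\n",
	       !!BMAP_TEST(bmap, 0), !!BMAP_TEST(bmap, 5), !!BMAP_TEST(bmap, 10));
	return 0;			/* prints: pxm 0: 1  pxm 5: 0  pxm 10: 1 */
}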
#define MAX_CHUNKS_PER_NODE 4
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
struct node_memory_chunk_s {
unsigned long start_pfn;
unsigned long end_pfn;
u8 pxm; // proximity domain of node
u8 nid; // which cnode contains this chunk?
u8 bank; // which mem bank on this node
};
static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
static int num_memory_chunks; /* total number of memory chunks */
static int zholes_size_init;
static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
unsigned long node_start_pfn[MAX_NUMNODES];
unsigned long node_end_pfn[MAX_NUMNODES];
extern void * boot_ioremap(unsigned long, unsigned long);
/* Identify CPU proximity domains */
static void __init parse_cpu_affinity_structure(char *p)
{
struct acpi_table_processor_affinity *cpu_affinity =
(struct acpi_table_processor_affinity *) p;
if (!cpu_affinity->flags.enabled)
return; /* empty entry */
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain);
printk("CPU 0x%02X in proximity domain 0x%02X\n",
cpu_affinity->apic_id, cpu_affinity->proximity_domain);
}
/*
* Identify memory proximity domains and hot-remove capabilities.
* Fill node memory chunk list structure.
*/
static void __init parse_memory_affinity_structure (char *sratp)
{
unsigned long long paddr, size;
unsigned long start_pfn, end_pfn;
u8 pxm;
struct node_memory_chunk_s *p, *q, *pend;
struct acpi_table_memory_affinity *memory_affinity =
(struct acpi_table_memory_affinity *) sratp;
if (!memory_affinity->flags.enabled)
return; /* empty entry */
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain);
/* calculate info for memory chunk structure */
paddr = memory_affinity->base_addr_hi;
paddr = (paddr << 32) | memory_affinity->base_addr_lo;
size = memory_affinity->length_hi;
size = (size << 32) | memory_affinity->length_lo;
start_pfn = paddr >> PAGE_SHIFT;
end_pfn = (paddr + size) >> PAGE_SHIFT;
pxm = memory_affinity->proximity_domain;
if (num_memory_chunks >= MAXCHUNKS) {
printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n",
size/(1024*1024), paddr);
return;
}
/* Insertion sort based on base address */
pend = &node_memory_chunk[num_memory_chunks];
for (p = &node_memory_chunk[0]; p < pend; p++) {
if (start_pfn < p->start_pfn)
break;
}
if (p < pend) {
for (q = pend; q >= p; q--)
*(q + 1) = *q;
}
p->start_pfn = start_pfn;
p->end_pfn = end_pfn;
p->pxm = pxm;
num_memory_chunks++;
printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n",
start_pfn, end_pfn,
memory_affinity->memory_type,
memory_affinity->proximity_domain,
(memory_affinity->flags.hot_pluggable ?
"enabled and removable" : "enabled" ) );
}
#if MAX_NR_ZONES != 3
#error "MAX_NR_ZONES != 3, chunk_to_zone requires review"
#endif
/* Take a chunk of pages from page frame cstart to cend and count the number
* of pages in each zone, returned via zones[].
*/
static __init void chunk_to_zones(unsigned long cstart, unsigned long cend,
unsigned long *zones)
{
unsigned long max_dma;
extern unsigned long max_low_pfn;
int z;
unsigned long rend;
/* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
* similarly scoped information and should be handled in a consistent
* manner.
*/
max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
/* Split the hole into the zones in which it falls. Repeatedly
* take the segment in which the remaining hole starts, round it
* to the end of that zone.
*/
memset(zones, 0, MAX_NR_ZONES * sizeof(long));
while (cstart < cend) {
if (cstart < max_dma) {
z = ZONE_DMA;
rend = (cend < max_dma)? cend : max_dma;
} else if (cstart < max_low_pfn) {
z = ZONE_NORMAL;
rend = (cend < max_low_pfn)? cend : max_low_pfn;
} else {
z = ZONE_HIGHMEM;
rend = cend;
}
zones[z] += rend - cstart;
cstart = rend;
}
}
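To make the zone-splitting loop above concrete, here is a standalone sketch (not kernel code; the 16MB DMA and 896MB lowmem boundaries are assumed, illustrative values) that apportions one pfn range across the three zones the same way:
#include <stdio.h>
enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };
/* Same walk as chunk_to_zones(): repeatedly take the zone the remaining
 * range starts in and clip the range to that zone's upper boundary. */
static void split_range_into_zones(unsigned long cstart, unsigned long cend,
				   unsigned long max_dma, unsigned long max_low_pfn,
				   unsigned long *zones)
{
	while (cstart < cend) {
		unsigned long rend;
		int z;
		if (cstart < max_dma) {
			z = ZONE_DMA;
			rend = (cend < max_dma) ? cend : max_dma;
		} else if (cstart < max_low_pfn) {
			z = ZONE_NORMAL;
			rend = (cend < max_low_pfn) ? cend : max_low_pfn;
		} else {
			z = ZONE_HIGHMEM;
			rend = cend;
		}
		zones[z] += rend - cstart;
		cstart = rend;
	}
}
int main(void)
{
	/* assumed boundaries, in 4K pages: 16MB DMA limit, 896MB lowmem limit */
	unsigned long max_dma = 0x1000, max_low_pfn = 0x38000;
	unsigned long zones[MAX_NR_ZONES] = { 0 };
	split_range_into_zones(0x800, 0x40000, max_dma, max_low_pfn, zones);
	printf("DMA=%lu NORMAL=%lu HIGHMEM=%lu pages\n",
	       zones[ZONE_DMA], zones[ZONE_NORMAL], zones[ZONE_HIGHMEM]);
	return 0;	/* DMA=2048 NORMAL=225280 HIGHMEM=32768 */
}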
/*
* pfnnode_map keeps track of the physical memory layout of the
* nodes at a 256Mb granularity: each element of the array
* represents 256Mb of memory and is marked with the node id. So,
* if the first gig is on node 0 and the second gig is on node 1,
* pfnnode_map will contain:
* pfnnode_map[0-3] = 0;
* pfnnode_map[4-7] = 1;
* pfnnode_map[8- ] = -1;
*/
int pfnnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
EXPORT_SYMBOL(pfnnode_map);
static void __init initialize_pfnnode_map(void)
{
unsigned long topofchunk, cur = 0;
int i;
for (i = 0; i < num_memory_chunks; i++) {
cur = node_memory_chunk[i].start_pfn;
topofchunk = node_memory_chunk[i].end_pfn;
while (cur < topofchunk) {
pfnnode_map[PFN_TO_ELEMENT(cur)] = node_memory_chunk[i].nid;
cur ++;
}
}
}
/* Parse the ACPI Static Resource Affinity Table */
static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
{
u8 *start, *end, *p;
int i, j, nid;
u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */
u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */
start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
p = start;
end = (u8 *)sratp + sratp->header.length;
memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
memset(zholes_size, 0, sizeof(zholes_size));
/* -1 in these maps means not available */
memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
num_memory_chunks = 0;
while (p < end) {
switch (*p) {
case ACPI_SRAT_PROCESSOR_AFFINITY:
parse_cpu_affinity_structure(p);
break;
case ACPI_SRAT_MEMORY_AFFINITY:
parse_memory_affinity_structure(p);
break;
default:
printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
break;
}
p += p[1];
if (p[1] == 0) {
printk("acpi20_parse_srat: Entry length value is zero;"
" can't parse any further!\n");
break;
}
}
/* Calculate total number of nodes in system from PXM bitmap and create
* a set of sequential node IDs starting at zero. (ACPI doesn't seem
* to specify the range of _PXM values.)
*/
numnodes = 0; /* init total nodes in system */
for (i = 0; i < MAX_PXM_DOMAINS; i++) {
if (BMAP_TEST(pxm_bitmap, i)) {
pxm_to_nid_map[i] = numnodes;
nid_to_pxm_map[numnodes] = i;
node_set_online(numnodes);
++numnodes;
}
}
if (numnodes == 0)
BUG();
/* set cnode id in memory chunk structure */
for (i = 0; i < num_memory_chunks; i++)
node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm];
initialize_pfnnode_map();
printk("pxm bitmap: ");
for (i = 0; i < sizeof(pxm_bitmap); i++) {
printk("%02X ", pxm_bitmap[i]);
}
printk("\n");
printk("Number of logical nodes in system = %d\n", numnodes);
printk("Number of memory chunks in system = %d\n", num_memory_chunks);
for (j = 0; j < num_memory_chunks; j++){
printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
j, node_memory_chunk[j].nid,
node_memory_chunk[j].start_pfn,
node_memory_chunk[j].end_pfn);
}
/*calculate node_start_pfn/node_end_pfn arrays*/
for (nid = 0; nid < numnodes; nid++) {
int been_here_before = 0;
for (j = 0; j < num_memory_chunks; j++){
if (node_memory_chunk[j].nid == nid) {
if (been_here_before == 0) {
node_start_pfn[nid] = node_memory_chunk[j].start_pfn;
node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
been_here_before = 1;
} else { /* We've found another chunk of memory for the node */
if (node_start_pfn[nid] < node_memory_chunk[j].start_pfn) {
node_end_pfn[nid] = node_memory_chunk[j].end_pfn;
}
}
}
}
}
return 0;
}
void __init get_memcfg_from_srat(void)
{
struct acpi_table_header *header = NULL;
struct acpi_table_rsdp *rsdp = NULL;
struct acpi_table_rsdt *rsdt = NULL;
struct acpi_pointer *rsdp_address = NULL;
struct acpi_table_rsdt saved_rsdt;
int tables = 0;
int i = 0;
acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, rsdp_address);
if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) {
printk("%s: assigning address to rsdp\n", __FUNCTION__);
rsdp = (struct acpi_table_rsdp *)rsdp_address->pointer.physical;
} else {
printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__);
return;
}
if (!rsdp) {
printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
return;
}
printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
rsdp->oem_id);
if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) {
printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
return;
}
rsdt = (struct acpi_table_rsdt *)
boot_ioremap(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt));
if (!rsdt) {
printk(KERN_WARNING
"%s: ACPI: Invalid root system description tables (RSDT)\n",
__FUNCTION__);
return;
}
header = & rsdt->header;
if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) {
printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
return;
}
/*
* The number of tables is computed by taking the
* size of all entries (total size of the RSDT minus
* the header size) divided by the size of each entry
* (a 4-byte table pointer).
*/
tables = (header->length - sizeof(struct acpi_table_header)) / 4;
memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
if (saved_rsdt.header.length > sizeof(saved_rsdt)) {
printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
saved_rsdt.header.length);
return;
}
printk("Begin table scan....\n");
for (i = 0; i < tables; i++) {
/* Map in header, then map in full table length. */
header = (struct acpi_table_header *)
boot_ioremap(saved_rsdt.entry[i], sizeof(struct acpi_table_header));
if (!header)
break;
header = (struct acpi_table_header *)
boot_ioremap(saved_rsdt.entry[i], header->length);
if (!header)
break;
if (strncmp((char *) &header->signature, "SRAT", 4))
continue;
acpi20_parse_srat((struct acpi_table_srat *)header);
/* we've found the srat table. don't need to look at any more tables */
break;
}
}
/* For each node run the memory list to determine whether there are
* any memory holes. For each hole determine which ZONE they fall
* into.
*
* NOTE#1: this requires knowledge of the zone boundaries and so
* _cannot_ be performed before those are calculated in setup_memory.
*
* NOTE#2: we rely on the fact that the memory chunks are ordered by
* start pfn number during setup.
*/
static void __init get_zholes_init(void)
{
int nid;
int c;
int first;
unsigned long end = 0;
for (nid = 0; nid < numnodes; nid++) {
first = 1;
for (c = 0; c < num_memory_chunks; c++){
if (node_memory_chunk[c].nid == nid) {
if (first) {
end = node_memory_chunk[c].end_pfn;
first = 0;
} else {
/* Record any gap between this chunk
* and the previous chunk on this node
* against the zones it spans.
*/
chunk_to_zones(end,
node_memory_chunk[c].start_pfn,
&zholes_size[nid * MAX_NR_ZONES]);
}
}
}
}
}
unsigned long * __init get_zholes_size(int nid)
{
if (!zholes_size_init) {
zholes_size_init++;
get_zholes_init();
}
if((nid >= numnodes) | (nid >= MAX_NUMNODES))
printk("%s: nid = %d is invalid. numnodes = %d",
__FUNCTION__, nid, numnodes);
return &zholes_size[nid * MAX_NR_ZONES];
}
......@@ -66,7 +66,7 @@ int pit_latch_buggy; /* extern */
#include "do_timer.h"
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
unsigned long cpu_khz; /* Detected as we calibrate the TSC */
......
......@@ -1230,9 +1230,10 @@ flush_tlb_all_function(void* info)
void
flush_tlb_all(void)
{
preempt_disable();
smp_call_function (flush_tlb_all_function, 0, 1, 1);
do_flush_tlb_all_local();
preempt_enable();
}
/* used to set up the trampoline for other CPUs when the memory manager
......
......@@ -2,8 +2,9 @@
# Makefile for the linux i386-specific parts of the memory manager.
#
obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o
obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o
obj-$(CONFIG_DISCONTIGMEM) += discontig.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem.o
obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
/*
* arch/i386/mm/boot_ioremap.c
*
* Re-map functions for early boot-time before paging_init() when the
* boot-time pagetables are still in use
*
* Written by Dave Hansen <haveblue@us.ibm.com>
*/
/*
* We need to use the 2-level pagetable functions, but CONFIG_X86_PAE
* keeps that from happening. If anyone has a better way, I'm listening.
*
* boot_pte_t is defined only if this all works correctly
*/
#include <linux/config.h>
#undef CONFIG_X86_PAE
#include <asm/page.h>
#include <asm/pgtable.h>
#include <linux/init.h>
#include <linux/stddef.h>
/*
* I'm cheating here. It is known that the two boot PTE pages are
* allocated next to each other. I'm pretending that they're just
* one big array.
*/
#define BOOT_PTE_PTRS (PTRS_PER_PTE*2)
#define boot_pte_index(address) \
(((address) >> PAGE_SHIFT) & (BOOT_PTE_PTRS - 1))
static inline boot_pte_t* boot_vaddr_to_pte(void *address)
{
boot_pte_t* boot_pg = (boot_pte_t*)pg0;
return &boot_pg[boot_pte_index((unsigned long)address)];
}
/*
* This is only for a caller who is clever enough to page-align
* phys_addr and virtual_source, and who also has a preference
* about which virtual address from which to steal ptes
*/
static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages,
void* virtual_source)
{
boot_pte_t* pte;
int i;
pte = boot_vaddr_to_pte(virtual_source);
for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) {
set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL));
}
}
/* the virtual space we're going to remap comes from this array */
#define BOOT_IOREMAP_PAGES 4
#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE)
__initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE]
__attribute__ ((aligned (PAGE_SIZE)));
/*
* This only applies to things which need to ioremap before paging_init();
* bt_ioremap() and plain ioremap() are both useless at this point.
*
* When used, we're still using the boot-time pagetables, which only
* have 2 PTE pages mapping the first 8MB
*
* There is no unmap. The boot-time PTE pages aren't used after boot.
* If you really want the space back, just remap it yourself.
* boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE)
*/
__init void* boot_ioremap(unsigned long phys_addr, unsigned long size)
{
unsigned long last_addr, offset;
unsigned int nrpages;
last_addr = phys_addr + size - 1;
/* page align the requested address */
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr) - phys_addr;
nrpages = size >> PAGE_SHIFT;
if (nrpages > BOOT_IOREMAP_PAGES)
return NULL;
__boot_ioremap(phys_addr, nrpages, boot_ioremap_space);
return &boot_ioremap_space[offset];
}
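A standalone sketch of the address math boot_ioremap() performs (user-space C, illustrative only; the real function also installs boot-time PTEs for the window): page-align the request, reject anything larger than the fixed 4-page window, and return the window address plus the sub-page offset.
#include <stdio.h>
#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)
#define BOOT_IOREMAP_PAGES 4
static char window[BOOT_IOREMAP_PAGES * PAGE_SIZE];	/* stands in for boot_ioremap_space */
static void *sketch_boot_ioremap(unsigned long phys_addr, unsigned long size)
{
	unsigned long last_addr = phys_addr + size - 1;
	unsigned long offset = phys_addr & ~PAGE_MASK;
	unsigned long nrpages;
	phys_addr &= PAGE_MASK;
	size = PAGE_ALIGN(last_addr) - phys_addr;
	nrpages = size >> PAGE_SHIFT;
	if (nrpages > BOOT_IOREMAP_PAGES)
		return NULL;		/* request does not fit the window */
	/* the real code would remap phys_addr..phys_addr+size onto the window here */
	return &window[offset];
}
int main(void)
{
	/* a 0x40-byte table that starts 0x234 bytes into a physical page */
	void *p = sketch_boot_ioremap(0x000FE234UL, 0x40);
	printf("returned pointer is %ld bytes into the window\n",
	       (long)((char *)p - window));	/* prints 564 (= 0x234) */
	return 0;
}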
......@@ -284,6 +284,7 @@ void __init zone_sizes_init(void)
for (nid = 0; nid < numnodes; nid++) {
unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
unsigned long *zholes_size;
unsigned int max_dma;
unsigned long low = max_low_pfn;
......@@ -307,6 +308,7 @@ void __init zone_sizes_init(void)
#endif
}
}
zholes_size = get_zholes_size(nid);
/*
* We let the lmem_map for node 0 be allocated from the
* normal bootmem allocator, but other nodes come from the
......@@ -315,10 +317,10 @@ void __init zone_sizes_init(void)
if (nid)
free_area_init_node(nid, NODE_DATA(nid),
node_remap_start_vaddr[nid], zones_size,
start, 0);
start, zholes_size);
else
free_area_init_node(nid, NODE_DATA(nid), 0,
zones_size, start, 0);
zones_size, start, zholes_size);
}
return;
}
......
......@@ -29,6 +29,8 @@ static long htlbzone_pages;
static LIST_HEAD(htlbpage_freelist);
static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
void free_huge_page(struct page *page);
static struct page *alloc_hugetlb_page(void)
{
int i;
......@@ -45,7 +47,7 @@ static struct page *alloc_hugetlb_page(void)
htlbpagemem--;
spin_unlock(&htlbpage_lock);
set_page_count(page, 1);
page->lru.prev = (void *)huge_page_release;
page->lru.prev = (void *)free_huge_page;
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
clear_highpage(&page[i]);
return page;
......
......@@ -205,6 +205,7 @@ void *ioremap_nocache (unsigned long phys_addr, unsigned long size)
iounmap(p);
p = NULL;
}
global_flush_tlb();
}
return p;
......@@ -226,6 +227,7 @@ void iounmap(void *addr)
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);
global_flush_tlb();
}
kfree(p);
}
......
......@@ -27,7 +27,7 @@
extern unsigned long wall_jiffies;
extern unsigned long last_time_offset;
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
#ifdef CONFIG_IA64_DEBUG_IRQ
......
......@@ -26,6 +26,8 @@ static long htlbzone_pages;
static LIST_HEAD(htlbpage_freelist);
static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
void free_huge_page(struct page *page);
static struct page *alloc_hugetlb_page(void)
{
int i;
......@@ -42,6 +44,7 @@ static struct page *alloc_hugetlb_page(void)
htlbpagemem--;
spin_unlock(&htlbpage_lock);
set_page_count(page, 1);
page->lru.prev = (void *)free_huge_page;
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
clear_highpage(&page[i]);
return page;
......
......@@ -26,7 +26,7 @@
#include <linux/timex.h>
#include <linux/profile.h>
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
static inline int set_rtc_mmss(unsigned long nowtime)
{
......
......@@ -26,7 +26,7 @@
#define TICK_SIZE (tick_nsec / 1000)
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
static inline int set_rtc_mmss(unsigned long nowtime)
{
......
......@@ -32,7 +32,7 @@
#define USECS_PER_JIFFY (1000000/HZ)
#define USECS_PER_JIFFY_FRAC ((1000000ULL << 32) / HZ & 0xffffffff)
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
/*
* forward reference
......
......@@ -32,7 +32,7 @@
#include <linux/timex.h>
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
/* xtime and wall_jiffies keep wall-clock time */
extern unsigned long wall_jiffies;
......
......@@ -68,7 +68,7 @@
#include <asm/time.h>
/* XXX false sharing with below? */
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
unsigned long disarm_decr[NR_CPUS];
......
......@@ -65,7 +65,7 @@
void smp_local_timer_interrupt(struct pt_regs *);
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
/* keep track of when we need to update the rtc */
time_t last_rtc_update;
......
......@@ -46,7 +46,7 @@
#define TICK_SIZE tick
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
static ext_int_info_t ext_int_info_timer;
static uint64_t xtime_cc;
......
......@@ -45,7 +45,7 @@
#define TICK_SIZE tick
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
static ext_int_info_t ext_int_info_timer;
static uint64_t xtime_cc;
......
......@@ -70,7 +70,7 @@
#endif /* CONFIG_CPU_SUBTYPE_ST40STB1 */
#endif /* __sh3__ or __SH4__ */
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
extern unsigned long wall_jiffies;
#define TICK_SIZE tick
......
......@@ -45,7 +45,7 @@
extern unsigned long wall_jiffies;
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
enum sparc_clock_type sp_clock_typ;
......
......@@ -47,7 +47,7 @@ unsigned long ds1287_regs = 0UL;
extern unsigned long wall_jiffies;
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
static unsigned long mstk48t08_regs = 0UL;
static unsigned long mstk48t59_regs = 0UL;
......
......@@ -25,6 +25,7 @@ spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
extern long htlbpagemem;
static void zap_hugetlb_resources(struct vm_area_struct *);
void free_huge_page(struct page *page);
#define MAX_ID 32
struct htlbpagekey {
......@@ -64,6 +65,7 @@ static struct page *alloc_hugetlb_page(void)
spin_unlock(&htlbpage_lock);
set_page_count(page, 1);
page->lru.prev = (void *)free_huge_page;
memset(page_address(page), 0, HPAGE_SIZE);
return page;
......
......@@ -25,7 +25,7 @@
#include "mach.h"
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
#define TICK_SIZE (tick_nsec / 1000)
......
......@@ -437,6 +437,7 @@ static __init int init_k8_gatt(agp_kern_info *info)
}
flush_gart();
global_flush_tlb();
printk("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10);
return 0;
......
......@@ -30,7 +30,7 @@
#include <asm/apic.h>
#endif
u64 jiffies_64;
u64 jiffies_64 = INITIAL_JIFFIES;
extern int using_apic_timer;
......
......@@ -205,6 +205,7 @@ void *ioremap_nocache (unsigned long phys_addr, unsigned long size)
iounmap(p);
p = NULL;
}
global_flush_tlb();
}
return p;
......@@ -226,6 +227,7 @@ void iounmap(void *addr)
change_page_attr(virt_to_page(__va(p->phys_addr)),
p->size >> PAGE_SHIFT,
PAGE_KERNEL);
global_flush_tlb();
}
kfree(p);
}
/*
* Code extracted from
* linux/kernel/hd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
*
* devfs support - jj, rgooch, 980122
*
* Moved partition checking code to fs/partitions* - Russell King
* (linux@arm.uk.linux.org)
*/
/*
* TODO: rip out the remaining init crap from this file --hch
* gendisk handling
*/
#include <linux/config.h>
......@@ -29,8 +17,9 @@
static struct subsystem block_subsys;
#define MAX_PROBE_HASH 23 /* random */
struct blk_probe {
static struct blk_probe {
struct blk_probe *next;
dev_t dev;
unsigned long range;
......@@ -38,21 +27,27 @@ struct blk_probe {
struct gendisk *(*get)(dev_t dev, int *part, void *data);
int (*lock)(dev_t, void *);
void *data;
} *probes[MAX_BLKDEV];
} *probes[MAX_PROBE_HASH];
/* index in the above */
/* index in the above - for now: assume no multimajor ranges */
static inline int dev_to_index(dev_t dev)
{
return MAJOR(dev);
return MAJOR(dev) % MAX_PROBE_HASH;
}
/*
* Register device numbers dev..(dev+range-1)
* range must be nonzero
* The hash chain is sorted on range, so that subranges can override.
*/
void blk_register_region(dev_t dev, unsigned long range, struct module *module,
struct gendisk *(*probe)(dev_t, int *, void *),
int (*lock)(dev_t, void *), void *data)
struct gendisk *(*probe)(dev_t, int *, void *),
int (*lock)(dev_t, void *), void *data)
{
int index = dev_to_index(dev);
struct blk_probe *p = kmalloc(sizeof(struct blk_probe), GFP_KERNEL);
struct blk_probe **s;
p->owner = module;
p->get = probe;
p->lock = lock;
......@@ -71,6 +66,7 @@ void blk_unregister_region(dev_t dev, unsigned long range)
{
int index = dev_to_index(dev);
struct blk_probe **s;
down_write(&block_subsys.rwsem);
for (s = &probes[index]; *s; s = &(*s)->next) {
struct blk_probe *p = *s;
......@@ -94,6 +90,7 @@ static struct gendisk *exact_match(dev_t dev, int *part, void *data)
static int exact_lock(dev_t dev, void *data)
{
struct gendisk *p = data;
if (!get_disk(p))
return -1;
return 0;
......@@ -109,14 +106,14 @@ static int exact_lock(dev_t dev, void *data)
void add_disk(struct gendisk *disk)
{
disk->flags |= GENHD_FL_UP;
blk_register_region(MKDEV(disk->major, disk->first_minor), disk->minors,
NULL, exact_match, exact_lock, disk);
blk_register_region(MKDEV(disk->major, disk->first_minor),
disk->minors, NULL, exact_match, exact_lock, disk);
register_disk(disk);
elv_register_queue(disk);
}
EXPORT_SYMBOL(add_disk);
EXPORT_SYMBOL(del_gendisk);
EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
void unlink_gendisk(struct gendisk *disk)
{
......@@ -146,18 +143,17 @@ get_gendisk(dev_t dev, int *part)
struct gendisk *(*probe)(dev_t, int *, void *);
struct module *owner;
void *data;
if (p->dev > dev || p->dev + p->range <= dev)
if (p->dev > dev || p->dev + p->range - 1 < dev)
continue;
if (p->range >= best) {
up_read(&block_subsys.rwsem);
return NULL;
}
if (p->range - 1 >= best)
break;
if (!try_module_get(p->owner))
continue;
owner = p->owner;
data = p->data;
probe = p->get;
best = p->range;
best = p->range - 1;
*part = dev - p->dev;
if (p->lock && p->lock(dev, data) < 0) {
module_put(owner);
......@@ -169,7 +165,7 @@ get_gendisk(dev_t dev, int *part)
module_put(owner);
if (disk)
return disk;
goto retry;
goto retry; /* this terminates: best decreases */
}
up_read(&block_subsys.rwsem);
return NULL;
......@@ -245,7 +241,7 @@ extern int blk_dev_init(void);
static struct gendisk *base_probe(dev_t dev, int *part, void *data)
{
char name[20];
char name[30];
sprintf(name, "block-major-%d", MAJOR(dev));
request_module(name);
return NULL;
......@@ -256,11 +252,11 @@ int __init device_init(void)
struct blk_probe *base = kmalloc(sizeof(struct blk_probe), GFP_KERNEL);
int i;
memset(base, 0, sizeof(struct blk_probe));
base->dev = MKDEV(1,0);
base->range = MKDEV(MAX_BLKDEV-1, 255) - base->dev + 1;
base->dev = 1;
base->range = ~0; /* range 1 .. ~0 */
base->get = base_probe;
for (i = 1; i < MAX_BLKDEV; i++)
probes[i] = base;
for (i = 0; i < MAX_PROBE_HASH; i++)
probes[i] = base; /* must remain last in chain */
blk_dev_init();
subsystem_register(&block_subsys);
return 0;
......@@ -281,12 +277,14 @@ struct disk_attribute {
ssize_t (*show)(struct gendisk *, char *);
};
static ssize_t disk_attr_show(struct kobject * kobj, struct attribute * attr,
char * page)
static ssize_t disk_attr_show(struct kobject *kobj, struct attribute *attr,
char *page)
{
struct gendisk * disk = to_disk(kobj);
struct disk_attribute * disk_attr = container_of(attr,struct disk_attribute,attr);
struct gendisk *disk = to_disk(kobj);
struct disk_attribute *disk_attr =
container_of(attr,struct disk_attribute,attr);
ssize_t ret = 0;
if (disk_attr->show)
ret = disk_attr->show(disk,page);
return ret;
......@@ -303,11 +301,11 @@ static ssize_t disk_dev_read(struct gendisk * disk, char *page)
}
static ssize_t disk_range_read(struct gendisk * disk, char *page)
{
return sprintf(page, "%d\n",disk->minors);
return sprintf(page, "%d\n", disk->minors);
}
static ssize_t disk_size_read(struct gendisk * disk, char *page)
{
return sprintf(page, "%llu\n",(unsigned long long)get_capacity(disk));
return sprintf(page, "%llu\n", (unsigned long long)get_capacity(disk));
}
static inline unsigned jiffies_to_msec(unsigned jif)
......
......@@ -1461,6 +1461,7 @@ void blk_insert_request(request_queue_t *q, struct request *rq,
if (blk_rq_tagged(rq))
blk_queue_end_tag(q, rq);
drive_stat_acct(rq, rq->nr_sectors, 1);
__elv_add_request(q, rq, !at_head, 0);
q->request_fn(q);
spin_unlock_irqrestore(q->queue_lock, flags);
......
......@@ -60,6 +60,7 @@ int blk_do_rq(request_queue_t *q, struct block_device *bdev, struct request *rq)
rq->flags |= REQ_NOMERGE;
rq->waiting = &wait;
drive_stat_acct(rq, rq->nr_sectors, 1);
elv_add_request(q, rq, 1, 1);
generic_unplug_device(q);
wait_for_completion(&wait);
......
......@@ -119,7 +119,7 @@ static ctl_table raid_root_table[] = {
.procname = "dev",
.maxlen = 0,
.mode = 0555,
.proc_handler = raid_dir_table,
.child = raid_dir_table,
},
{ .ctl_name = 0 }
};
......
......@@ -151,7 +151,7 @@ static int irda_thread(void *startup)
while (irda_rq_queue.thread != NULL) {
set_task_state(current, TASK_UNINTERRUPTIBLE);
set_task_state(current, TASK_INTERRUPTIBLE);
add_wait_queue(&irda_rq_queue.kick, &wait);
if (list_empty(&irda_rq_queue.request_list))
schedule();
......
......@@ -1004,8 +1004,11 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
*/
if (unlikely(move_count != dentry->d_move_count))
break;
if (!d_unhashed(dentry))
found = dget(dentry);
if (!d_unhashed(dentry)) {
atomic_inc(&dentry->d_count);
dentry->d_vfs_flags |= DCACHE_REFERENCED;
found = dentry;
}
spin_unlock(&dentry->d_lock);
break;
}
......
......@@ -33,12 +33,17 @@ static unsigned char ext3_filetype_table[] = {
static int ext3_readdir(struct file *, void *, filldir_t);
static int ext3_dx_readdir(struct file * filp,
void * dirent, filldir_t filldir);
static int ext3_release_dir (struct inode * inode,
struct file * filp);
struct file_operations ext3_dir_operations = {
.read = generic_read_dir,
.readdir = ext3_readdir, /* we take BKL. needed?*/
.ioctl = ext3_ioctl, /* BKL held */
.fsync = ext3_sync_file, /* BKL held */
#ifdef CONFIG_EXT3_INDEX
.release = ext3_release_dir,
#endif
};
......@@ -275,7 +280,11 @@ static void free_rb_tree_fname(struct rb_root *root)
*/
parent = n->rb_parent;
fname = rb_entry(n, struct fname, rb_hash);
kfree(fname);
while (fname) {
struct fname * old = fname;
fname = fname->next;
kfree (old);
}
if (!parent)
root->rb_node = 0;
else if (parent->rb_left == n)
......@@ -481,4 +490,13 @@ static int ext3_dx_readdir(struct file * filp,
UPDATE_ATIME(inode);
return 0;
}
static int ext3_release_dir (struct inode * inode, struct file * filp)
{
if (is_dx(inode) && filp->private_data)
ext3_htree_free_dir_info(filp->private_data);
return 0;
}
#endif
......@@ -55,29 +55,61 @@ static int ext3_open_file (struct inode * inode, struct file * filp)
return 0;
}
/*
* ext3_file_write().
*
* Most things are done in ext3_prepare_write() and ext3_commit_write().
*/
static ssize_t
ext3_file_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_dentry->d_inode;
int ret, err;
ret = generic_file_aio_write(iocb, buf, count, pos);
/*
* Nasty: if the file is subject to synchronous writes then we need
* to force generic_osync_inode() to call ext3_write_inode().
* We do that by marking the inode dirty. This adds much more
* computational expense than we need, but we're going to sync
* anyway.
* Skip flushing if there was an error, or if nothing was written.
*/
if (ret <= 0)
return ret;
/*
* If the inode is IS_SYNC, or is O_SYNC and we are doing data
* journalling then we need to make sure that we force the transaction
* to disk to keep all metadata uptodate synchronously.
*/
if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
mark_inode_dirty(inode);
if (file->f_flags & O_SYNC) {
/*
* If we are non-data-journaled, then the dirty data has
* already been flushed to backing store by generic_osync_inode,
* and the inode has been flushed too if there have been any
* modifications other than mere timestamp updates.
*
* Open question --- do we care about flushing timestamps too
* if the inode is IS_SYNC?
*/
if (!ext3_should_journal_data(inode))
return ret;
goto force_commit;
}
return generic_file_aio_write(iocb, buf, count, pos);
/*
* So we know that there has been no forced data flush. If the inode
* is marked IS_SYNC, we need to force one ourselves.
*/
if (!IS_SYNC(inode))
return ret;
/*
* Open question #2 --- should we force data to disk here too? If we
* don't, the only impact is that data=writeback filesystems won't
* flush data to disk automatically on IS_SYNC, only metadata (but
* historically, that is what ext2 has done.)
*/
force_commit:
err = ext3_force_commit(inode->i_sb);
if (err)
return err;
return ret;
}
struct file_operations ext3_file_operations = {
......
......@@ -80,6 +80,16 @@ static LIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
*/
spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
/*
* iprune_sem provides exclusion between the kswapd or try_to_free_pages
* icache shrinking path, and the umount path. Without this exclusion,
* by the time prune_icache calls iput for the inode whose pages it has
* been invalidating, or by the time it calls clear_inode & destroy_inode
* from its final dispose_list, the struct super_block they refer to
* (for inode->i_sb->s_op) may already have been freed and reused.
*/
static DECLARE_MUTEX(iprune_sem);
/*
* Statistics gathering..
*/
......@@ -320,6 +330,7 @@ int invalidate_inodes(struct super_block * sb)
int busy;
LIST_HEAD(throw_away);
down(&iprune_sem);
spin_lock(&inode_lock);
busy = invalidate_list(&inode_in_use, sb, &throw_away);
busy |= invalidate_list(&inode_unused, sb, &throw_away);
......@@ -328,6 +339,7 @@ int invalidate_inodes(struct super_block * sb)
spin_unlock(&inode_lock);
dispose_list(&throw_away);
up(&iprune_sem);
return busy;
}
......@@ -395,6 +407,7 @@ static void prune_icache(int nr_to_scan)
int nr_scanned;
unsigned long reap = 0;
down(&iprune_sem);
spin_lock(&inode_lock);
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
......@@ -429,7 +442,10 @@ static void prune_icache(int nr_to_scan)
}
inodes_stat.nr_unused -= nr_pruned;
spin_unlock(&inode_lock);
dispose_list(&freeable);
up(&iprune_sem);
if (current_is_kswapd)
mod_page_state(kswapd_inodesteal, reap);
else
......
......@@ -358,7 +358,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
nice,
0UL /* removed */,
jiffies_to_clock_t(task->it_real_value),
(unsigned long long) jiffies_64_to_clock_t(task->start_time),
(unsigned long long)
jiffies_64_to_clock_t(task->start_time - INITIAL_JIFFIES),
vsize,
mm ? mm->rss : 0, /* you might want to shift this left 3 */
task->rlim[RLIMIT_RSS].rlim_cur,
......
......@@ -104,7 +104,7 @@ static int uptime_read_proc(char *page, char **start, off_t off,
unsigned long uptime_remainder;
int len;
uptime = get_jiffies_64();
uptime = get_jiffies_64() - INITIAL_JIFFIES;
uptime_remainder = (unsigned long) do_div(uptime, HZ);
#if HZ!=100
......@@ -320,7 +320,7 @@ static int kstat_read_proc(char *page, char **start, off_t off,
{
int i, len;
extern unsigned long total_forks;
u64 jif = get_jiffies_64();
u64 jif = get_jiffies_64() - INITIAL_JIFFIES;
unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
for (i = 0 ; i < NR_CPUS; i++) {
......
......@@ -10,7 +10,7 @@
((phys_apic) & (~0xf)) )
#endif
#define no_balance_irq (1)
#define NO_BALANCE_IRQ (1)
#define esr_disable (1)
static inline int apic_id_registered(void)
......
......@@ -9,7 +9,7 @@
#define TARGET_CPUS 0x01
#endif
#define no_balance_irq (0)
#define NO_BALANCE_IRQ (0)
#define esr_disable (0)
#define INT_DELIVERY_MODE dest_LowestPrio
......
......@@ -5,7 +5,7 @@
#define TARGET_CPUS (0xf)
#define no_balance_irq (1)
#define NO_BALANCE_IRQ (1)
#define esr_disable (1)
#define INT_DELIVERY_MODE dest_LowestPrio
......
......@@ -4,7 +4,7 @@
extern int x86_summit;
#define esr_disable (x86_summit ? 1 : 0)
#define no_balance_irq (0)
#define NO_BALANCE_IRQ (0)
#define XAPIC_DEST_CPUS_MASK 0x0Fu
#define XAPIC_DEST_CLUSTER_MASK 0xF0u
......
......@@ -12,6 +12,8 @@
#ifdef CONFIG_X86_NUMAQ
#include <asm/numaq.h>
#elif CONFIG_X86_SUMMIT
#include <asm/srat.h>
#else
#define pfn_to_nid(pfn) (0)
#endif /* CONFIG_X86_NUMAQ */
......
......@@ -168,6 +168,10 @@ struct sys_cfg_data {
struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
};
static inline unsigned long get_zholes_size(int nid)
{
return 0;
}
#endif /* CONFIG_X86_NUMAQ */
#endif /* NUMAQ_H */
......@@ -5,6 +5,8 @@
#ifdef CONFIG_X86_NUMAQ
#include <asm/numaq.h>
#elif CONFIG_X86_SUMMIT
#include <asm/srat.h>
#else
#define MAX_NUMNODES 1
#endif /* CONFIG_X86_NUMAQ */
......
......@@ -49,6 +49,7 @@ typedef struct { unsigned long long pgd; } pgd_t;
typedef struct { unsigned long pte_low; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
typedef struct { unsigned long pgd; } pgd_t;
#define boot_pte_t pte_t /* or would you rather have a typedef */
#define pte_val(x) ((x).pte_low)
#define HPAGE_SHIFT 22
#endif
......
/*
* Some of the code in this file has been gleaned from the 64 bit
* discontigmem support code base.
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to Pat Gaughen <gone@us.ibm.com>
*/
#ifndef _ASM_SRAT_H_
#define _ASM_SRAT_H_
/*
* each element in pfnnode_map represents 256 MB (2^28 bytes) of memory,
* so to represent 64GB we need 256 elements.
*/
#define MAX_ELEMENTS 256
#define PFN_TO_ELEMENT(pfn) ((pfn)>>(28 - PAGE_SHIFT))
extern int pfnnode_map[];
#define pfn_to_nid(pfn) ({ pfnnode_map[PFN_TO_ELEMENT(pfn)]; })
#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn))
#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT)
#define MAX_NUMNODES 8
extern void get_memcfg_from_srat(void);
extern unsigned long *get_zholes_size(int);
#define get_memcfg_numa() get_memcfg_from_srat()
#endif /* _ASM_SRAT_H_ */
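The PFN_TO_ELEMENT() arithmetic above relies on a 2^28-byte element covering 2^(28 - PAGE_SHIFT) pfns, i.e. 65536 pfns per 256MB element with 4K pages. A quick standalone check (PAGE_SHIFT = 12 is assumed):
#include <stdio.h>
#define PAGE_SHIFT	12			/* 4K pages assumed */
#define MAX_ELEMENTS	256			/* 256 elements * 256MB = 64GB */
#define PFN_TO_ELEMENT(pfn)	((pfn) >> (28 - PAGE_SHIFT))
int main(void)
{
	unsigned long pfn = 0x40000;		/* first pfn of the second gigabyte */
	/* one gigabyte is four 256MB elements, so this pfn lands in element 4 */
	printf("pfn 0x%lx -> element %lu of %d\n",
	       pfn, PFN_TO_ELEMENT(pfn), MAX_ELEMENTS);
	return 0;
}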
......@@ -79,7 +79,7 @@ typedef struct {
struct acpi_table_rsdt {
struct acpi_table_header header;
u32 entry[1];
u32 entry[8];
} __attribute__ ((packed));
/* Extended System Description Table (XSDT) */
......
......@@ -262,6 +262,8 @@ extern char * d_path(struct dentry *, struct vfsmount *, char *, int);
static __inline__ struct dentry * dget(struct dentry *dentry)
{
if (dentry) {
if (!atomic_read(&dentry->d_count))
BUG();
atomic_inc(&dentry->d_count);
dentry->d_vfs_flags |= DCACHE_REFERENCED;
}
......
......@@ -232,11 +232,15 @@ static inline void get_page(struct page *page)
static inline void put_page(struct page *page)
{
if (PageCompound(page)) {
page = (struct page *)page->lru.next;
if (page->lru.prev) { /* destructor? */
(*(void (*)(struct page *))page->lru.prev)(page);
return;
if (put_page_testzero(page)) {
page = (struct page *)page->lru.next;
if (page->lru.prev) { /* destructor? */
(*(void (*)(struct page *))page->lru.prev)(page);
} else {
__page_cache_release(page);
}
}
return;
}
if (!PageReserved(page) && put_page_testzero(page))
__page_cache_release(page);
......
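The hugetlb hunks earlier in this patch stash free_huge_page in page->lru.prev, and the reworked put_page() above now calls that destructor only after put_page_testzero() reports the last reference is gone. A user-space sketch of the pattern, with made-up types (not the kernel's struct page):
#include <stdio.h>
struct fake_page {
	int count;
	void (*dtor)(struct fake_page *);	/* stands in for page->lru.prev */
};
static void free_huge_page_sketch(struct fake_page *p)
{
	printf("destructor called, page returned to the huge page pool\n");
}
static void put_page_sketch(struct fake_page *p)
{
	if (--p->count == 0) {
		if (p->dtor)
			p->dtor(p);		/* compound page with a destructor */
		else
			printf("default release path\n");
	}
}
int main(void)
{
	struct fake_page page = { .count = 2, .dtor = free_huge_page_sketch };
	put_page_sketch(&page);			/* count 2 -> 1: nothing happens */
	put_page_sketch(&page);			/* count 1 -> 0: destructor runs */
	return 0;
}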
......@@ -27,6 +27,12 @@ struct timezone {
#include <linux/spinlock.h>
#include <linux/seqlock.h>
/*
* Have the 32 bit jiffies value wrap 5 minutes after boot
* so jiffies wrap bugs show up earlier.
*/
#define INITIAL_JIFFIES ((unsigned int) (-300*HZ))
/*
* Change timeval to jiffies, trying to avoid the
* most obvious overflows..
......
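A quick standalone check of what INITIAL_JIFFIES buys: with HZ = 100 assumed (the value is arch-dependent), the 32-bit counter starts at 0xffff8ad0 and overflows 30000 ticks, i.e. 300 seconds, after boot, which is why the uptime and /proc paths in this patch subtract INITIAL_JIFFIES back out.
#include <stdio.h>
#define HZ 100						/* assumed; arch-dependent */
#define INITIAL_JIFFIES ((unsigned int) (-300*HZ))
int main(void)
{
	unsigned int jiffies = INITIAL_JIFFIES;
	unsigned int ticks_to_wrap = 0u - jiffies;	/* ticks until the 32-bit wrap */
	printf("INITIAL_JIFFIES = 0x%08x\n", jiffies);
	printf("wraps after %u ticks = %u seconds\n",
	       ticks_to_wrap, ticks_to_wrap / HZ);	/* 30000 ticks = 300 s */
	return 0;
}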
......@@ -870,7 +870,7 @@ asmlinkage long sys_times(struct tms * tbuf)
if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
return -EFAULT;
}
return jiffies_to_clock_t(jiffies);
return (long) jiffies_64_to_clock_t(get_jiffies_64());
}
/*
......
......@@ -757,7 +757,7 @@ static inline void calc_load(unsigned long ticks)
}
/* jiffies at the most recent update of wall time */
unsigned long wall_jiffies;
unsigned long wall_jiffies = INITIAL_JIFFIES;
/*
* This read-write spinlock protects us from races in SMP while
......@@ -1104,7 +1104,7 @@ asmlinkage long sys_sysinfo(struct sysinfo *info)
do {
seq = read_seqbegin(&xtime_lock);
uptime = jiffies_64;
uptime = jiffies_64 - INITIAL_JIFFIES;
do_div(uptime, HZ);
val.uptime = (unsigned long) uptime;
......@@ -1180,6 +1180,13 @@ static void __devinit init_timers_cpu(int cpu)
}
for (j = 0; j < TVR_SIZE; j++)
INIT_LIST_HEAD(base->tv1.vec + j);
base->timer_jiffies = INITIAL_JIFFIES;
base->tv1.index = INITIAL_JIFFIES & TVR_MASK;
base->tv2.index = (INITIAL_JIFFIES >> TVR_BITS) & TVN_MASK;
base->tv3.index = (INITIAL_JIFFIES >> (TVR_BITS+TVN_BITS)) & TVN_MASK;
base->tv4.index = (INITIAL_JIFFIES >> (TVR_BITS+2*TVN_BITS)) & TVN_MASK;
base->tv5.index = (INITIAL_JIFFIES >> (TVR_BITS+3*TVN_BITS)) & TVN_MASK;
}
static int __devinit timer_cpu_notify(struct notifier_block *self,
......
......@@ -90,19 +90,16 @@ u32 attribute((pure)) crc32_le(u32 crc, unsigned char const *p, size_t len)
const u32 *tab = crc32table_le;
# ifdef __LITTLE_ENDIAN
# define DO_CRC crc = (crc>>8) ^ tab[ crc & 255 ]
# define ENDIAN_SHIFT 0
# define DO_CRC(x) crc = tab[ (crc ^ (x)) & 255 ] ^ (crc>>8)
# else
# define DO_CRC crc = (crc<<8) ^ tab[ crc >> 24 ]
# define ENDIAN_SHIFT 24
# define DO_CRC(x) crc = tab[ ((crc >> 24) ^ (x)) & 255] ^ (crc<<8)
# endif
crc = __cpu_to_le32(crc);
/* Align it */
if(unlikely(((long)b)&3 && len)){
do {
crc ^= *((u8 *)b)++ << ENDIAN_SHIFT;
DO_CRC;
DO_CRC(*((u8 *)b)++);
} while ((--len) && ((long)b)&3 );
}
if(likely(len >= 4)){
......@@ -112,10 +109,10 @@ u32 attribute((pure)) crc32_le(u32 crc, unsigned char const *p, size_t len)
--b; /* use pre increment below(*++b) for speed */
do {
crc ^= *++b;
DO_CRC;
DO_CRC;
DO_CRC;
DO_CRC;
DO_CRC(0);
DO_CRC(0);
DO_CRC(0);
DO_CRC(0);
} while (--len);
b++; /* point to next byte(s) */
len = save_len;
......@@ -123,8 +120,7 @@ u32 attribute((pure)) crc32_le(u32 crc, unsigned char const *p, size_t len)
/* And the last few bytes */
if(len){
do {
crc ^= *((u8 *)b)++ << ENDIAN_SHIFT;
DO_CRC;
DO_CRC(*((u8 *)b)++);
} while (--len);
}
......@@ -195,19 +191,16 @@ u32 attribute((pure)) crc32_be(u32 crc, unsigned char const *p, size_t len)
const u32 *tab = crc32table_be;
# ifdef __LITTLE_ENDIAN
# define DO_CRC crc = (crc>>8) ^ tab[ crc & 255 ]
# define ENDIAN_SHIFT 24
# define DO_CRC(x) crc = tab[ (crc ^ (x)) & 255 ] ^ (crc>>8)
# else
# define DO_CRC crc = (crc<<8) ^ tab[ crc >> 24 ]
# define ENDIAN_SHIFT 0
# define DO_CRC(x) crc = tab[ ((crc >> 24) ^ (x)) & 255] ^ (crc<<8)
# endif
crc = __cpu_to_be32(crc);
/* Align it */
if(unlikely(((long)b)&3 && len)){
do {
crc ^= *((u8 *)b)++ << ENDIAN_SHIFT;
DO_CRC;
DO_CRC(*((u8 *)b)++);
} while ((--len) && ((long)b)&3 );
}
if(likely(len >= 4)){
......@@ -217,10 +210,10 @@ u32 attribute((pure)) crc32_be(u32 crc, unsigned char const *p, size_t len)
--b; /* use pre increment below(*++b) for speed */
do {
crc ^= *++b;
DO_CRC;
DO_CRC;
DO_CRC;
DO_CRC;
DO_CRC(0);
DO_CRC(0);
DO_CRC(0);
DO_CRC(0);
} while (--len);
b++; /* point to next byte(s) */
len = save_len;
......@@ -228,8 +221,7 @@ u32 attribute((pure)) crc32_be(u32 crc, unsigned char const *p, size_t len)
/* And the last few bytes */
if(len){
do {
crc ^= *((u8 *)b)++ << ENDIAN_SHIFT;
DO_CRC;
DO_CRC(*((u8 *)b)++);
} while (--len);
}
return __be32_to_cpu(crc);
......
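For reference, this is the byte-at-a-time, table-driven form the new DO_CRC(x) macro expresses in the little-endian (reflected) case: crc = tab[(crc ^ byte) & 255] ^ (crc >> 8). A standalone sketch, not the kernel's implementation; it folds the conventional pre/post inversion into the helper, which the kernel leaves to the caller.
#include <stdio.h>
#include <stdint.h>
static uint32_t tab[256];
static void crc32_init(void)
{
	uint32_t i, c;
	int k;
	for (i = 0; i < 256; i++) {
		c = i;
		for (k = 0; k < 8; k++)
			c = (c & 1) ? 0xedb88320u ^ (c >> 1) : c >> 1;
		tab[i] = c;		/* reflected CRC-32 (IEEE 802.3) table */
	}
}
static uint32_t crc32_sketch(uint32_t crc, const unsigned char *p, size_t len)
{
	crc ^= 0xffffffffu;		/* conventional pre-inversion */
	while (len--)
		crc = tab[(crc ^ *p++) & 255] ^ (crc >> 8);	/* == DO_CRC(*p++) */
	return crc ^ 0xffffffffu;	/* conventional post-inversion */
}
int main(void)
{
	const unsigned char *s = (const unsigned char *)"123456789";
	crc32_init();
	/* the standard CRC-32 check value for "123456789" is 0xcbf43926 */
	printf("crc32 = 0x%08x\n", crc32_sketch(0, s, 9));
	return 0;
}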
......@@ -8,8 +8,12 @@
/* How many bits at a time to use. Requires a table of 4<<CRC_xx_BITS bytes. */
/* For less performance-sensitive, use 4 */
#define CRC_LE_BITS 8
#define CRC_BE_BITS 8
#ifndef CRC_LE_BITS
# define CRC_LE_BITS 8
#endif
#ifndef CRC_BE_BITS
# define CRC_BE_BITS 8
#endif
/*
* Little-endian CRC computation. Used with serial bit streams sent
......
......@@ -559,21 +559,12 @@ void do_generic_mapping_read(struct address_space *mapping,
page_cache_readahead(mapping, ra, filp, index);
nr = nr - offset;
/*
* Try to find the data in the page cache..
*/
find_page:
read_lock(&mapping->page_lock);
page = radix_tree_lookup(&mapping->page_tree, index);
if (!page) {
read_unlock(&mapping->page_lock);
handle_ra_miss(mapping,ra);
page = find_get_page(mapping, index);
if (unlikely(page == NULL)) {
handle_ra_miss(mapping, ra);
goto no_cached_page;
}
page_cache_get(page);
read_unlock(&mapping->page_lock);
if (!PageUptodate(page))
goto page_not_up_to_date;
page_ok:
......
......@@ -158,9 +158,7 @@ pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
pmd_populate(mm, pmd, new);
}
out:
if (pmd_present(*pmd))
return pte_offset_map(pmd, address);
return NULL;
return pte_offset_map(pmd, address);
}
pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
......
......@@ -61,6 +61,9 @@ static int badness(struct task_struct *p)
if (!p->mm)
return 0;
if (p->flags & PF_MEMDIE)
return 0;
/*
* The memory size of the process is the basis for the badness.
*/
......
......@@ -1643,7 +1643,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
if (cachep->ctor && cachep->flags & SLAB_POISON) {
unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
if (!flags & __GFP_WAIT)
if (!(flags & __GFP_WAIT))
ctor_flags |= SLAB_CTOR_ATOMIC;
cachep->ctor(objp, cachep, ctor_flags);
......@@ -2064,7 +2064,7 @@ static void enable_cpucache (kmem_cache_t *cachep)
else
limit = 248;
#ifndef DEBUG
#if DEBUG
/* With debugging enabled, large batchcount lead to excessively
* long periods with disabled local interrupts. Limit the
* batchcount
......
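The first slab hunk fixes a C precedence bug: ! binds tighter than &, so "!flags & __GFP_WAIT" evaluates as "(!flags) & __GFP_WAIT", which is 0 whenever any flag at all is set. A minimal demonstration (the flag value is illustrative, not the kernel's):
#include <stdio.h>
#define __GFP_WAIT 0x10u		/* illustrative value only */
int main(void)
{
	unsigned int flags = 0xc0;	/* some flags set, __GFP_WAIT not among them */
	/* buggy:  (!flags) & __GFP_WAIT  ==  0 & 0x10  ==  0, never true */
	printf("buggy form:   %u\n", !flags & __GFP_WAIT);
	/* fixed:  tests the intended bit, so the atomic ctor path is taken */
	printf("correct form: %u\n", !(flags & __GFP_WAIT));
	return 0;
}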