[PATCH] x86_64: Use function pointers to call DMA mapping functions

AK: I hacked Muli's original patch a lot and there were a lot of changes - all bugs are probably to blame on me now. There were also some changes in the fall back behaviour for swiotlb - in particular it doesn't try to use GFP_DMA now anymore. Also all DMA mapping operations use the same core dma_alloc_coherent code with proper fallbacks now. And various other changes and cleanups. Known problems: iommu=force swiotlb=force together breaks needs more testing. This patch cleans up x86_64's DMA mapping dispatching code. Right now we have three possible IOMMU types: AGP GART, swiotlb and nommu, and in the future we will also have Xen's x86_64 swiotlb and other HW IOMMUs for x86_64. In order to support all of them cleanly, this patch: - introduces a struct dma_mapping_ops with function pointers for each of the DMA mapping operations of gart (AMD HW IOMMU), swiotlb (software IOMMU) and nommu (no IOMMU). - gets rid of: if (swiotlb) return swiotlb_xxx(); - PCI_DMA_BUS_IS_PHYS is now checked against the dma_ops being set This makes swiotlb faster by avoiding double copying in some cases. Signed-Off-By: Muli Ben-Yehuda <mulix@mulix.org> Signed-Off-By: Jon D. Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>

[PATCH] x86_64: Use function pointers to call DMA mapping functions
AK: I hacked Muli's original patch a lot and there were a lot of changes - all bugs are probably to blame on me now. There were also some changes in the fall back behaviour for swiotlb - in particular it doesn't try to use GFP_DMA now anymore. Also all DMA mapping operations use the same core dma_alloc_coherent code with proper fallbacks now. And various other changes and cleanups. Known problems: iommu=force swiotlb=force together breaks needs more testing. This patch cleans up x86_64's DMA mapping dispatching code. Right now we have three possible IOMMU types: AGP GART, swiotlb and nommu, and in the future we will also have Xen's x86_64 swiotlb and other HW IOMMUs for x86_64. In order to support all of them cleanly, this patch: - introduces a struct dma_mapping_ops with function pointers for each of the DMA mapping operations of gart (AMD HW IOMMU), swiotlb (software IOMMU) and nommu (no IOMMU). - gets rid of: if (swiotlb) return swiotlb_xxx(); - PCI_DMA_BUS_IS_PHYS is now checked against the dma_ops being set This makes swiotlb faster by avoiding double copying in some cases. Signed-Off-By: Muli Ben-Yehuda <mulix@mulix.org> Signed-Off-By: Jon D. Mason <jdmason@us.ibm.com> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
17a941d8 · Muli Ben-Yehuda · Linus Torvalds · 8a6fdd3e · 17a941d8 · 17a941d8
Commit 17a941d8 authored Jan 11, 2006 by Muli Ben-Yehuda Committed by Linus Torvalds Jan 11, 2006
14 changed files
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -351,32 +351,24 @@ config HPET_EMULATE_RTC
 	depends on HPET_TIMER && RTC=y

 config GART_IOMMU
-	bool "IOMMU support"
+	bool "K8 GART IOMMU support"
 	default y
+	select SWIOTLB
 	depends on PCI
 	help
 	  Support the IOMMU. Needed to run systems with more than 3GB of memory
 	  properly with 32-bit PCI devices that do not support DAC (Double Address
 	  Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter.
 	  Normally the kernel will take the right choice by itself.
-	  This option includes a driver for the AMD Opteron/Athlon64 IOMMU
-	  and a software emulation used on some other systems.
+	  This option includes a driver for the AMD Opteron/Athlon64 northbridge IOMMU
+	  and a software emulation used on other systems.
 	  If unsure, say Y.

 # need this always enabled with GART_IOMMU for the VIA workaround
 config SWIOTLB
-       bool
-       depends on GART_IOMMU
-       default y
-
-config DUMMY_IOMMU
 	bool
-	depends on !GART_IOMMU && !SWIOTLB
 	default y
-	help
-	  Don't use IOMMU code. This will cause problems when you have more than 4GB
-	  of memory and any 32-bit devices. Don't turn on unless you know what you
-	  are doing.
+	depends on GART_IOMMU

 config X86_MCE
 	bool "Machine check support" if EMBEDDED

--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
-		dmi_scan.o
+		dmi_scan.o pci-dma.o pci-nommu.o

 obj-$(CONFIG_X86_MCE)         += mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
@@ -29,7 +29,7 @@ obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o
-obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
+obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o


--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -8,53 +8,259 @@
 #include <linux/pci.h>
 #include <linux/module.h>
 #include <asm/io.h>
+#include <asm/proto.h>

-/* Map a set of buffers described by scatterlist in streaming
- * mode for DMA.  This is the scatter-gather version of the
- * above pci_map_single interface.  Here the scatter gather list
- * elements are each tagged with the appropriate dma address
- * and length.  They are obtained via sg_dma_{address,length}(SG).
- *
- * NOTE: An implementation may be able to use a smaller number of
- *       DMA address/length pairs than there are SG table elements.
- *       (for example via virtual mapping capabilities)
- *       The routine returns the number of addr/length pairs actually
- *       used, at most nents.
- *
- * Device ownership issues as mentioned above for pci_map_single are
- * the same here.
+int iommu_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_merge);
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+/* This tells the BIO block layer to assume merging. Default to off
+   because we cannot guarantee merging later. */
+int iommu_bio_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+int iommu_sac_force __read_mostly = 0;
+EXPORT_SYMBOL(iommu_sac_force);
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int panic_on_overflow __read_mostly = 1;
+int force_iommu __read_mostly = 1;
+#else
+int panic_on_overflow __read_mostly = 0;
+int force_iommu __read_mostly= 0;
+#endif
+
+/* Dummy device used for NULL arguments (normally ISA). Better would
+   be probably a smaller DMA mask, but this is bug-to-bug compatible
+   to i386. */
+struct device fallback_dev = {
+	.bus_id = "fallback device",
+	.coherent_dma_mask = 0xffffffff,
+	.dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+/* Allocate DMA memory on node near device */
+noinline static void *
+dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
+{
+	struct page *page;
+	int node;
+	if (dev->bus == &pci_bus_type)
+		node = pcibus_to_node(to_pci_dev(dev)->bus);
+	else
+		node = numa_node_id();
+	page = alloc_pages_node(node, gfp, order);
+	return page ? page_address(page) : NULL;
+}
+
+/*
+ * Allocate memory for a coherent mapping.
 */
-int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
-	       int nents, int direction)
+void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		   gfp_t gfp)
 {
-	int i;
-
-	BUG_ON(direction == DMA_NONE);
- 	for (i = 0; i < nents; i++ ) {
-		struct scatterlist *s = &sg[i];
-		BUG_ON(!s->page); 
-		s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
-		s->dma_length = s->length;
+	void *memory;
+	unsigned long dma_mask = 0;
+	u64 bus;
+
+	if (!dev)
+		dev = &fallback_dev;
+	dma_mask = dev->coherent_dma_mask;
+	if (dma_mask == 0)
+		dma_mask = 0xffffffff;
+
+	/* Kludge to make it bug-to-bug compatible with i386. i386
+	   uses the normal dma_mask for alloc_coherent. */
+	dma_mask &= *dev->dma_mask;
+
+	/* Why <=? Even when the mask is smaller than 4GB it is often
+	   larger than 16MB and in this case we have a chance of
+	   finding fitting memory in the next higher zone first. If
+	   not retry with true GFP_DMA. -AK */
+	if (dma_mask <= 0xffffffff)
+		gfp |= GFP_DMA32;
+
+ again:
+	memory = dma_alloc_pages(dev, gfp, get_order(size));
+	if (memory == NULL)
+		return NULL;
+
+	{
+		int high, mmu;
+		bus = virt_to_bus(memory);
+	        high = (bus + size) >= dma_mask;
+		mmu = high;
+		if (force_iommu && !(gfp & GFP_DMA))
+			mmu = 1;
+		else if (high) {
+			free_pages((unsigned long)memory,
+				   get_order(size));
+
+			/* Don't use the 16MB ZONE_DMA unless absolutely
+			   needed. It's better to use remapping first. */
+			if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) {
+				gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
+				goto again;
+			}
+
+			if (dma_ops->alloc_coherent)
+				return dma_ops->alloc_coherent(dev, size,
+							   dma_handle, gfp);
+			return NULL;
+		}
+
+		memset(memory, 0, size);
+		if (!mmu) {
+			*dma_handle = virt_to_bus(memory);
+			return memory;
+		}
+	}
+
+	if (dma_ops->alloc_coherent) {
+		free_pages((unsigned long)memory, get_order(size));
+		gfp &= ~(GFP_DMA|GFP_DMA32);
+		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+	}
+
+	if (dma_ops->map_simple) {
+		*dma_handle = dma_ops->map_simple(dev, memory,
+					      size,
+					      PCI_DMA_BIDIRECTIONAL);
+		if (*dma_handle != bad_dma_address)
+			return memory;
 	}
-	return nents;
-}

-EXPORT_SYMBOL(dma_map_sg);
+	if (panic_on_overflow)
+		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
+	free_pages((unsigned long)memory, get_order(size));
+	return NULL;
+}
+EXPORT_SYMBOL(dma_alloc_coherent);

-/* Unmap a set of streaming mode DMA translations.
- * Again, cpu read rules concerning calls here are the same as for
- * pci_unmap_single() above.
+/*
+ * Unmap coherent memory.
+ * The caller must ensure that the device has finished accessing the mapping.
 */
-void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-		  int nents, int dir)
+void dma_free_coherent(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t bus)
+{
+	if (dma_ops->unmap_single)
+		dma_ops->unmap_single(dev, bus, size, 0);
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+EXPORT_SYMBOL(dma_free_coherent);
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	if (dma_ops->dma_supported)
+		return dma_ops->dma_supported(dev, mask);
+
+	/* Copied from i386. Doesn't make much sense, because it will
+	   only work for pci_alloc_coherent.
+	   The caller just has to use GFP_DMA in this case. */
+        if (mask < 0x00ffffff)
+                return 0;
+
+	/* Tell the device to use SAC when IOMMU force is on.  This
+	   allows the driver to use cheaper accesses in some cases.
+
+	   Problem with this is that if we overflow the IOMMU area and
+	   return DAC as fallback address the device may not handle it
+	   correctly.
+
+	   As a special case some controllers have a 39bit address
+	   mode that is as efficient as 32bit (aic79xx). Don't force
+	   SAC for these.  Assume all masks <= 40 bits are of this
+	   type. Normally this doesn't make any difference, but gives
+	   more gentle handling of IOMMU overflow. */
+	if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
+		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
+int dma_set_mask(struct device *dev, u64 mask)
 {
-	int i;
-	for (i = 0; i < nents; i++) { 
-		struct scatterlist *s = &sg[i];
-		BUG_ON(s->page == NULL); 
-		BUG_ON(s->dma_address == 0); 
-		dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
-	} 
+	if (!dev->dma_mask || !dma_supported(dev, mask))
+		return -EIO;
+	*dev->dma_mask = mask;
+	return 0;
 }
+EXPORT_SYMBOL(dma_set_mask);
+
+/* iommu=[size][,noagp][,off][,force][,noforce][,leak][,memaper[=order]][,merge]
+         [,forcesac][,fullflush][,nomerge][,biomerge]
+   size  set size of iommu (in bytes)
+   noagp don't initialize the AGP driver and use full aperture.
+   off   don't use the IOMMU
+   leak  turn on simple iommu leak tracing (only when CONFIG_IOMMU_LEAK is on)
+   memaper[=order] allocate an own aperture over RAM with size 32MB^order.
+   noforce don't force IOMMU usage. Default.
+   force  Force IOMMU.
+   merge  Do lazy merging. This may improve performance on some block devices.
+          Implies force (experimental)
+   biomerge Do merging at the BIO layer. This is more efficient than merge,
+            but should be only done with very big IOMMUs. Implies merge,force.
+   nomerge Don't do SG merging.
+   forcesac For SAC mode for masks <40bits  (experimental)
+   fullflush Flush IOMMU on each allocation (default)
+   nofullflush Don't use IOMMU fullflush
+   allowed  overwrite iommu off workarounds for specific chipsets.
+   soft	 Use software bounce buffering (default for Intel machines)
+   noaperture Don't touch the aperture for AGP.
+*/
+__init int iommu_setup(char *p)
+{
+    iommu_merge = 1;

-EXPORT_SYMBOL(dma_unmap_sg);
+    while (*p) {
+	    if (!strncmp(p,"off",3))
+		    no_iommu = 1;
+	    /* gart_parse_options has more force support */
+	    if (!strncmp(p,"force",5))
+		    force_iommu = 1;
+	    if (!strncmp(p,"noforce",7)) {
+		    iommu_merge = 0;
+		    force_iommu = 0;
+	    }
+
+	    if (!strncmp(p, "biomerge",8)) {
+		    iommu_bio_merge = 4096;
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "panic",5))
+		    panic_on_overflow = 1;
+	    if (!strncmp(p, "nopanic",7))
+		    panic_on_overflow = 0;
+	    if (!strncmp(p, "merge",5)) {
+		    iommu_merge = 1;
+		    force_iommu = 1;
+	    }
+	    if (!strncmp(p, "nomerge",7))
+		    iommu_merge = 0;
+	    if (!strncmp(p, "forcesac",8))
+		    iommu_sac_force = 1;
+
+#ifdef CONFIG_SWIOTLB
+	    if (!strncmp(p, "soft",4))
+		    swiotlb = 1;
+#endif
+
+#ifdef CONFIG_GART_IOMMU
+	    gart_parse_options(p);
+#endif
+
+	    p += strcspn(p, ",");
+	    if (*p == ',')
+		    ++p;
+    }
+    return 1;
+}
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -6,89 +6,93 @@
 #include <linux/string.h>
 #include <asm/proto.h>
 #include <asm/processor.h>
+#include <asm/dma.h>

-int iommu_merge = 0;
-EXPORT_SYMBOL(iommu_merge);
-
-dma_addr_t bad_dma_address;
-EXPORT_SYMBOL(bad_dma_address);
-
-int iommu_bio_merge = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
-int iommu_sac_force = 0;
-EXPORT_SYMBOL(iommu_sac_force);
-
-/* 
- * Dummy IO MMU functions
- */
-
-void *dma_alloc_coherent(struct device *hwdev, size_t size,
-			 dma_addr_t *dma_handle, gfp_t gfp)
+static int
+check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
 {
-	void *ret;
-	u64 mask;
-	int order = get_order(size);
-
-	if (hwdev)
-		mask = hwdev->coherent_dma_mask & *hwdev->dma_mask;
-	else
-		mask = 0xffffffff;
-	for (;;) {
-		ret = (void *)__get_free_pages(gfp, order);
-		if (ret == NULL)
-			return NULL;
-		*dma_handle = virt_to_bus(ret);
-		if ((*dma_handle & ~mask) == 0)
-			break;
-		free_pages((unsigned long)ret, order);
-		if (gfp & GFP_DMA)
-			return NULL;
-		gfp |= GFP_DMA;
+        if (hwdev && bus + size > *hwdev->dma_mask) {
+		printk(KERN_ERR
+		    "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
+	       name, (long long)bus, size, (long long)*hwdev->dma_mask);
+		return 0;
 	}
+	return 1;
+}

-	memset(ret, 0, size);
-	return ret;
+static dma_addr_t
+nommu_map_single(struct device *hwdev, void *ptr, size_t size,
+	       int direction)
+{
+	dma_addr_t bus = virt_to_bus(ptr);
+	if (!check_addr("map_single", hwdev, bus, size))
+				return bad_dma_address;
+	return bus;
 }
-EXPORT_SYMBOL(dma_alloc_coherent);

-void dma_free_coherent(struct device *hwdev, size_t size,
-			 void *vaddr, dma_addr_t dma_handle)
+void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
+			int direction)
 {
-	free_pages((unsigned long)vaddr, get_order(size));
 }
-EXPORT_SYMBOL(dma_free_coherent);

-int dma_supported(struct device *hwdev, u64 mask)
+/* Map a set of buffers described by scatterlist in streaming
+ * mode for DMA.  This is the scatter-gather version of the
+ * above pci_map_single interface.  Here the scatter gather list
+ * elements are each tagged with the appropriate dma address
+ * and length.  They are obtained via sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ *       DMA address/length pairs than there are SG table elements.
+ *       (for example via virtual mapping capabilities)
+ *       The routine returns the number of addr/length pairs actually
+ *       used, at most nents.
+ *
+ * Device ownership issues as mentioned above for pci_map_single are
+ * the same here.
+ */
+int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
+	       int nents, int direction)
 {
-        /*
-         * we fall back to GFP_DMA when the mask isn't all 1s,
-         * so we can't guarantee allocations that must be
-         * within a tighter range than GFP_DMA..
-	 * RED-PEN this won't work for pci_map_single. Caller has to
-	 * use GFP_DMA in the first place.
-         */
-        if (mask < 0x00ffffff)
-                return 0;
+	int i;

-	return 1;
-} 
-EXPORT_SYMBOL(dma_supported);
+	BUG_ON(direction == DMA_NONE);
+ 	for (i = 0; i < nents; i++ ) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page);
+		s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+		if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
+			return 0;
+		s->dma_length = s->length;
+	}
+	return nents;
+}

-int dma_get_cache_alignment(void)
+/* Unmap a set of streaming mode DMA translations.
+ * Again, cpu read rules concerning calls here are the same as for
+ * pci_unmap_single() above.
+ */
+void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
+		  int nents, int dir)
 {
-	return boot_cpu_data.x86_clflush_size;
 }
-EXPORT_SYMBOL(dma_get_cache_alignment);

-static int __init check_ram(void) 
-{ 
-	if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { 
-		printk(
-		KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n"
-		KERN_ERR "WARNING 32bit PCI may malfunction.\n");
-	} 
-	return 0;
-} 
-__initcall(check_ram);
+struct dma_mapping_ops nommu_dma_ops = {
+	.map_single = nommu_map_single,
+	.unmap_single = nommu_unmap_single,
+	.map_sg = nommu_map_sg,
+	.unmap_sg = nommu_unmap_sg,
+	.is_phys = 1,
+};

+void __init no_iommu_init(void)
+{
+	if (dma_ops)
+		return;
+	printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
+	dma_ops = &nommu_dma_ops;
+	if (end_pfn > MAX_DMA32_PFN) {
+		printk(KERN_ERR
+		       "WARNING more than 4GB of memory but IOMMU disabled.\n"
+		       KERN_ERR "WARNING 32bit PCI may malfunction.\n");
+	}
+}
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/dma-mapping.h>
+#include <asm/proto.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+struct dma_mapping_ops swiotlb_dma_ops = {
+	.mapping_error = swiotlb_dma_mapping_error,
+	.alloc_coherent = swiotlb_alloc_coherent,
+	.free_coherent = swiotlb_free_coherent,
+	.map_single = swiotlb_map_single,
+	.unmap_single = swiotlb_unmap_single,
+	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+	.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = swiotlb_sync_sg_for_device,
+	.map_sg = swiotlb_map_sg,
+	.unmap_sg = swiotlb_unmap_sg,
+	.dma_supported = NULL,
+};
+
+void pci_swiotlb_init(void)
+{
+	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
+	if (!iommu_aperture && !no_iommu &&
+	    (end_pfn > MAX_DMA32_PFN || force_iommu))
+	       swiotlb = 1;
+	if (swiotlb) {
+		swiotlb_init();
+		printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
+		dma_ops = &swiotlb_dma_ops;
+	}
+}
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -45,6 +45,7 @@
 #include <linux/kexec.h>
 #include <linux/cpufreq.h>
 #include <linux/dmi.h>
+#include <linux/dma-mapping.h>

 #include <asm/mtrr.h>
 #include <asm/uaccess.h>
@@ -63,7 +64,9 @@
 #include <asm/setup.h>
 #include <asm/mach_apic.h>
 #include <asm/numa.h>
+#include <asm/swiotlb.h>
 #include <asm/sections.h>
+#include <asm/gart-mapping.h>

 /*
 * Machine setup..
@@ -88,11 +91,6 @@ int bootloader_type;

 unsigned long saved_video_mode;

-#ifdef CONFIG_SWIOTLB
-int swiotlb;
-EXPORT_SYMBOL(swiotlb);
-#endif
-
 /*
 * Setup options
 */
@@ -389,11 +387,9 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 			numa_setup(from+5); 
 #endif

-#ifdef CONFIG_GART_IOMMU 
 		if (!memcmp(from,"iommu=",6)) { 
 			iommu_setup(from+6); 
 		}
-#endif

 		if (!memcmp(from,"oops=panic", 10))
 			panic_on_oops = 1;

--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -23,6 +23,7 @@
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>

 #include <asm/processor.h>
 #include <asm/system.h>
@@ -38,11 +39,16 @@
 #include <asm/proto.h>
 #include <asm/smp.h>
 #include <asm/sections.h>
+#include <asm/dma-mapping.h>
+#include <asm/swiotlb.h>

 #ifndef Dprintk
 #define Dprintk(x...)
 #endif

+struct dma_mapping_ops* dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
 static unsigned long dma_reserve __initdata;

 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -423,12 +429,9 @@ void __init mem_init(void)
 	long codesize, reservedpages, datasize, initsize;

 #ifdef CONFIG_SWIOTLB
-	if (!iommu_aperture &&
-	    ((end_pfn-1) >= 0xffffffff>>PAGE_SHIFT || force_iommu))
-	       swiotlb = 1;
-	if (swiotlb)
-		swiotlb_init();	
+	pci_swiotlb_init();
 #endif
+	no_iommu_init();

 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;

--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -12,155 +12,176 @@
 #include <asm/io.h>
 #include <asm/swiotlb.h>

-extern dma_addr_t bad_dma_address;
-#define dma_mapping_error(x) \
-	(swiotlb ? swiotlb_dma_mapping_error(x) : ((x) == bad_dma_address))
-
-void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-			 gfp_t gfp);
-void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
-			 dma_addr_t dma_handle);
+struct dma_mapping_ops {
+	int             (*mapping_error)(dma_addr_t dma_addr);
+	void*           (*alloc_coherent)(struct device *dev, size_t size,
+                                dma_addr_t *dma_handle, gfp_t gfp);
+	void            (*free_coherent)(struct device *dev, size_t size,
+                                void *vaddr, dma_addr_t dma_handle);
+	dma_addr_t      (*map_single)(struct device *hwdev, void *ptr,
+                                size_t size, int direction);
+	/* like map_single, but doesn't check the device mask */
+	dma_addr_t      (*map_simple)(struct device *hwdev, char *ptr,
+                                size_t size, int direction);
+	void            (*unmap_single)(struct device *dev, dma_addr_t addr,
+		                size_t size, int direction);
+	void            (*sync_single_for_cpu)(struct device *hwdev,
+		                dma_addr_t dma_handle, size_t size,
+				int direction);
+	void            (*sync_single_for_device)(struct device *hwdev,
+                                dma_addr_t dma_handle, size_t size,
+				int direction);
+	void            (*sync_single_range_for_cpu)(struct device *hwdev,
+                                dma_addr_t dma_handle, unsigned long offset,
+		                size_t size, int direction);
+	void            (*sync_single_range_for_device)(struct device *hwdev,
+				dma_addr_t dma_handle, unsigned long offset,
+		                size_t size, int direction);
+	void            (*sync_sg_for_cpu)(struct device *hwdev,
+                                struct scatterlist *sg, int nelems,
+				int direction);
+	void            (*sync_sg_for_device)(struct device *hwdev,
+				struct scatterlist *sg, int nelems,
+				int direction);
+	int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+		                int nents, int direction);
+	void            (*unmap_sg)(struct device *hwdev,
+				struct scatterlist *sg, int nents,
+				int direction);
+	int             (*dma_supported)(struct device *hwdev, u64 mask);
+	int		is_phys;
+};

-#ifdef CONFIG_GART_IOMMU
+extern dma_addr_t bad_dma_address;
+extern struct dma_mapping_ops* dma_ops;
+extern int iommu_merge;

-extern dma_addr_t dma_map_single(struct device *hwdev, void *ptr, size_t size,
-				 int direction);
-extern void dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
-			     int direction);
+static inline int dma_mapping_error(dma_addr_t dma_addr)
+{
+	if (dma_ops->mapping_error)
+		return dma_ops->mapping_error(dma_addr);

-#else
+	return (dma_addr == bad_dma_address);
+}

-/* No IOMMU */
+extern void *dma_alloc_coherent(struct device *dev, size_t size,
+				dma_addr_t *dma_handle, gfp_t gfp);
+extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
+			      dma_addr_t dma_handle);

-static inline dma_addr_t dma_map_single(struct device *hwdev, void *ptr,
-					size_t size, int direction)
+static inline dma_addr_t
+dma_map_single(struct device *hwdev, void *ptr, size_t size,
+	       int direction)
 {
-	dma_addr_t addr;
-
-	if (direction == DMA_NONE)
-		out_of_line_bug();
-	addr = virt_to_bus(ptr);
-
-	if ((addr+size) & ~*hwdev->dma_mask)
-		out_of_line_bug();
-	return addr;
+	return dma_ops->map_single(hwdev, ptr, size, direction);
 }

-static inline void dma_unmap_single(struct device *hwdev, dma_addr_t dma_addr,
-				    size_t size, int direction)
+static inline void
+dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
+		 int direction)
 {
-	if (direction == DMA_NONE)
-		out_of_line_bug();
-	/* Nothing to do */
+	dma_ops->unmap_single(dev, addr, size, direction);
 }

-#endif
-
 #define dma_map_page(dev,page,offset,size,dir) \
 	dma_map_single((dev), page_address(page)+(offset), (size), (dir))

-static inline void dma_sync_single_for_cpu(struct device *hwdev,
-					       dma_addr_t dma_handle,
-					       size_t size, int direction)
-{
-	if (direction == DMA_NONE)
-		out_of_line_bug();
-
-	if (swiotlb)
-		return swiotlb_sync_single_for_cpu(hwdev,dma_handle,size,direction);
+#define dma_unmap_page dma_unmap_single

+static inline void
+dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+			size_t size, int direction)
+{
+	if (dma_ops->sync_single_for_cpu)
+		dma_ops->sync_single_for_cpu(hwdev, dma_handle, size,
+					     direction);
 	flush_write_buffers();
 }

-static inline void dma_sync_single_for_device(struct device *hwdev,
-						  dma_addr_t dma_handle,
-						  size_t size, int direction)
+static inline void
+dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
+			   size_t size, int direction)
 {
-        if (direction == DMA_NONE)
-		out_of_line_bug();
-
-	if (swiotlb)
-		return swiotlb_sync_single_for_device(hwdev,dma_handle,size,direction);
-
+	if (dma_ops->sync_single_for_device)
+		dma_ops->sync_single_for_device(hwdev, dma_handle, size,
+						direction);
 	flush_write_buffers();
 }

-static inline void dma_sync_single_range_for_cpu(struct device *hwdev,
-						 dma_addr_t dma_handle,
-						 unsigned long offset,
-						 size_t size, int direction)
+static inline void
+dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
+			      unsigned long offset, size_t size, int direction)
 {
-	if (direction == DMA_NONE)
-		out_of_line_bug();
-
-	if (swiotlb)
-		return swiotlb_sync_single_range_for_cpu(hwdev,dma_handle,offset,size,direction);
+	if (dma_ops->sync_single_range_for_cpu) {
+		dma_ops->sync_single_range_for_cpu(hwdev, dma_handle, offset, size, direction);
+	}

 	flush_write_buffers();
 }

-static inline void dma_sync_single_range_for_device(struct device *hwdev,
-						    dma_addr_t dma_handle,
-						    unsigned long offset,
-						    size_t size, int direction)
+static inline void
+dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
+				 unsigned long offset, size_t size, int direction)
 {
-        if (direction == DMA_NONE)
-		out_of_line_bug();
-
-	if (swiotlb)
-		return swiotlb_sync_single_range_for_device(hwdev,dma_handle,offset,size,direction);
+	if (dma_ops->sync_single_range_for_device)
+		dma_ops->sync_single_range_for_device(hwdev, dma_handle,
+						      offset, size, direction);

 	flush_write_buffers();
 }

-static inline void dma_sync_sg_for_cpu(struct device *hwdev,
-				       struct scatterlist *sg,
-				       int nelems, int direction)
+static inline void
+dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+		    int nelems, int direction)
 {
-	if (direction == DMA_NONE)
-		out_of_line_bug();
-
-	if (swiotlb)
-		return swiotlb_sync_sg_for_cpu(hwdev,sg,nelems,direction);
-
+	if (dma_ops->sync_sg_for_cpu)
+		dma_ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
 	flush_write_buffers();
 }

-static inline void dma_sync_sg_for_device(struct device *hwdev,
-					  struct scatterlist *sg,
-					  int nelems, int direction)
+static inline void
+dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+		       int nelems, int direction)
 {
-	if (direction == DMA_NONE)
-		out_of_line_bug();
-
-	if (swiotlb)
-		return swiotlb_sync_sg_for_device(hwdev,sg,nelems,direction);
+	if (dma_ops->sync_sg_for_device) {
+		dma_ops->sync_sg_for_device(hwdev, sg, nelems, direction);
+	}

 	flush_write_buffers();
 }

-extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg,
-		      int nents, int direction);
-extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg,
-			 int nents, int direction);
+static inline int
+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction)
+{
+	return dma_ops->map_sg(hwdev, sg, nents, direction);
+}

-#define dma_unmap_page dma_unmap_single
+static inline void
+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
+	     int direction)
+{
+	dma_ops->unmap_sg(hwdev, sg, nents, direction);
+}

 extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_get_cache_alignment(void);
-#define dma_is_consistent(h) 1

-static inline int dma_set_mask(struct device *dev, u64 mask)
+/* same for gart, swiotlb, and nommu */
+static inline int dma_get_cache_alignment(void)
 {
-	if (!dev->dma_mask || !dma_supported(dev, mask))
-		return -EIO;
-	*dev->dma_mask = mask;
-	return 0;
+	return boot_cpu_data.x86_clflush_size;
 }

-static inline void dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
+#define dma_is_consistent(h) 1
+
+extern int dma_set_mask(struct device *dev, u64 mask);
+
+static inline void
+dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
 {
 	flush_write_buffers();
 }

-#endif
+extern struct device fallback_dev;
+extern int panic_on_overflow;
+
+#endif /* _X8664_DMA_MAPPING_H */
--- a/include/asm-x86_64/gart-mapping.h
+++ b/include/asm-x86_64/gart-mapping.h
+#ifndef _X8664_GART_MAPPING_H
+#define _X8664_GART_MAPPING_H 1
+
+#include <linux/types.h>
+#include <asm/types.h>
+
+struct device;
+
+extern void*
+gart_alloc_coherent(struct device *dev, size_t size,
+        dma_addr_t *dma_handle, gfp_t gfp);
+
+extern int
+gart_dma_supported(struct device *hwdev, u64 mask);
+
+#endif /* _X8664_GART_MAPPING_H */
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -42,18 +42,20 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 #include <asm/scatterlist.h>
 #include <linux/string.h>
 #include <asm/page.h>
+#include <linux/dma-mapping.h> /* for have_iommu */

 extern int iommu_setup(char *opt);

-#ifdef CONFIG_GART_IOMMU
 /* The PCI address space does equal the physical memory
 * address space.  The networking and block device layers use
 * this boolean for bounce buffer decisions
 *
- * On AMD64 it mostly equals, but we set it to zero to tell some subsystems
- * that an IOMMU is available.
+ * On AMD64 it mostly equals, but we set it to zero if a hardware
+ * IOMMU (gart) of sotware IOMMU (swiotlb) is available.
 */
-#define PCI_DMA_BUS_IS_PHYS	(no_iommu ? 1 : 0)
+#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
+
+#ifdef CONFIG_GART_IOMMU

 /*
 * x86-64 always supports DAC, but sometimes it is useful to force
@@ -79,7 +81,6 @@ extern int iommu_sac_force;
 #else
 /* No IOMMU */

-#define PCI_DMA_BUS_IS_PHYS	1
 #define pci_dac_dma_supported(pci_dev, mask)    1

 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)

--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -92,7 +92,9 @@ extern void check_efer(void);
 extern int unhandled_signal(struct task_struct *tsk, int sig);

 extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern void swiotlb_init(void);
+
+extern void gart_parse_options(char *);
+extern void __init no_iommu_init(void);

 extern unsigned long table_start, table_end;

@@ -106,12 +108,17 @@ extern int skip_ioapic_setup;
 extern int acpi_ht;
 extern int acpi_disabled;

+#ifdef CONFIG_GART_IOMMU
 extern int fallback_aper_order;
 extern int fallback_aper_force;
 extern int iommu_aperture;
-extern int iommu_aperture_disabled;
 extern int iommu_aperture_allowed;
+extern int iommu_aperture_disabled;
 extern int fix_aperture;
+#else
+#define iommu_aperture 0
+#define iommu_aperture_allowed 0
+#endif
 extern int force_iommu;

 extern int reboot_force;

--- a/include/asm-x86_64/swiotlb.h
+++ b/include/asm-x86_64/swiotlb.h
@@ -3,10 +3,14 @@

 #include <linux/config.h>

+#include <asm/dma-mapping.h>
+
 /* SWIOTLB interface */

-extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size,
-				      int dir);
+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
+				     size_t size, int dir);
+extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+                       dma_addr_t *dma_handle, gfp_t flags);
 extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
 				  size_t size, int dir);
 extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
@@ -34,10 +38,10 @@ extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
 extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
 			 int nents, int direction);
 extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr);
-extern void *swiotlb_alloc_coherent (struct device *hwdev, size_t size,
-				     dma_addr_t *dma_handle, gfp_t flags);
 extern void swiotlb_free_coherent (struct device *hwdev, size_t size,
 				   void *vaddr, dma_addr_t dma_handle);
+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
+extern void swiotlb_init(void);

 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
@@ -45,4 +49,6 @@ extern int swiotlb;
 #define swiotlb 0
 #endif

-#endif
+extern void pci_swiotlb_init(void);
+
+#endif /* _ASM_SWTIOLB_H */
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -463,7 +463,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 */
 		dma_addr_t handle;
 		handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE);
-		if (dma_mapping_error(handle))
+		if (swiotlb_dma_mapping_error(handle))
 			return NULL;

 		ret = phys_to_virt(handle);