Commit f2059100 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Critical x86-64 IOMMU fixes for 2.6.0

From: Andi Kleen <ak@muc.de>

Please consider applying this patch; I would consider it critical for x86-64.

The 2.6.0 x86-64 IOMMU code unfortunately had a few problems, leading
to non-booting systems and in a few cases to data corruption.

It fixes two serious bugs in the handling of special kinds of
scatter-gather lists in pci_map_sg.
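
As a rough illustration of the arithmetic behind these fixes (the helper
below and its name to_pages are only a sketch of the page-count idea used
in the patch, not the kernel macro itself): a scatter-gather element that
starts part-way into a page can need one more IOMMU page than its length
alone suggests.

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* sketch only: round offset+length up to whole IOMMU pages */
    static unsigned long to_pages(unsigned long offset, unsigned long len)
    {
            return (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    int main(void)
    {
            /* 4096 bytes starting 512 bytes into a page span two pages, not one */
            printf("%lu\n", to_pages(512, 4096));  /* prints 2 */
            return 0;
    }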

AGP was completely broken with IOMMU because of a wrong #ifdef.
Fix that.

One TLB flush optimization I did a long time ago seems to break on
some 3ware boards (which require the IOMMU because they don't support
64bit addresses).  The breakage led to data corruption. This patch disables
the optimization for now and also fixes a potential SMP race in the flush
code. The TLB flush is now done in a slower, but more reliable way.
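
A minimal sketch of that more reliable flush (the function name here is
invented; the config-space offset 0x9c and the read-back loop follow the
pci-gart.c hunk further down): kick the GART flush in the northbridge and
busy-wait until the hardware clears the flush bit again.

    #include <linux/pci.h>

    /* sketch, assuming kernel context; nb is one of the Opteron northbridges */
    static void gart_flush_northbridge(struct pci_dev *nb, u32 flush_word)
    {
            u32 w;

            pci_write_config_dword(nb, 0x9c, flush_word | 1);
            do {
                    /* wait until the flush actually executed */
                    pci_read_config_dword(nb, 0x9c, &w);
            } while (w & 1);
    }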

This patch fixes them. Please consider applying, because some of these
problems hit quite a few people.

This also disables IOMMU_DEBUG in the defconfig. A lot of people
were using the IOMMU when they didn't need to, which multiplied the
problems.

IOMMU merge is disabled for now. This was an experimental optimization
which helped with some block devices, but for production it seems
better to disable it because there are some questionable corner cases
when the IOMMU aperture fragments. The same is done for IOMMU SAC
force, which was related to that.
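
Concretely, both features are now gated behind new flags that default to
off; the helper below is only an illustration of the resulting
pci_dma_supported() behaviour (see the hunk further down), not code from
the patch.

    #include <linux/types.h>

    int iommu_merge = 0;            /* virtual merging across the aperture: off */
    int iommu_sac_force = 0;        /* forcing single address cycle: off */

    /* illustration only: SAC is forced only when explicitly asked for,
       and masks of 40 bits or less are never forced */
    static int force_sac(u64 mask)
    {
            return iommu_sac_force && (mask >= 0xffffffffffULL);
    }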

i386 has quite broken semantics for pci_alloc_consistent(). It uses
the standard device DMA mask instead of the consistent mask. Make us
bug-to-bug compatible here. This fixes problems with some sound
drivers that don't support full 32bit addressing.
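
A sketch of what the change amounts to (the helper name is invented; the
masking follows the pci_alloc_consistent() hunk below): the consistent mask
is additionally clipped by the device's streaming dma_mask, so a driver that
only set a narrow dma_mask still gets suitably low memory.

    #include <linux/pci.h>

    /* sketch, assuming kernel context */
    static u64 effective_consistent_mask(struct pci_dev *hwdev)
    {
            u64 dma_mask = hwdev ? hwdev->consistent_dma_mask : 0xffffffff;

            if (dma_mask == 0)
                    dma_mask = 0xffffffff;
            if (hwdev)
                    dma_mask &= hwdev->dma_mask;    /* the i386-compatible kludge */
            return dma_mask;
    }
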
parent b14a4258
@@ -511,6 +511,7 @@ config FRAME_POINTER
 	  Normally you should say N.
 
 config IOMMU_DEBUG
+	depends on GART_IOMMU && DEBUG_KERNEL
 	bool "Force IOMMU to on"
 	help
 	  Force the IOMMU to on even when you have less than 4GB of memory and add
......
@@ -743,7 +743,7 @@ CONFIG_MAGIC_SYSRQ=y
 # CONFIG_INIT_DEBUG is not set
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_FRAME_POINTER is not set
-CONFIG_IOMMU_DEBUG=y
+# CONFIG_IOMMU_DEBUG is not set
 CONFIG_IOMMU_LEAK=y
 CONFIG_MCE_DEBUG=y
......
@@ -44,12 +44,13 @@ static int no_agp;
 #ifdef CONFIG_IOMMU_DEBUG
 int panic_on_overflow = 1;
 int force_iommu = 1;
-int sac_force_size = 0;
 #else
-int panic_on_overflow = 1; /* for testing */
+int panic_on_overflow = 0;
 int force_iommu = 0;
-int sac_force_size = 256*1024*1024;
 #endif
+int iommu_merge = 0;
+int iommu_sac_force = 0;
+int iommu_fullflush = 1;
 
 /* Allocation bitmap for the remapping area */
 static spinlock_t iommu_bitmap_lock = SPIN_LOCK_UNLOCKED;
@@ -125,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
 /*
  * Use global flush state to avoid races with multiple flushers.
  */
-static void __flush_gart(struct pci_dev *dev)
+static void flush_gart(struct pci_dev *dev)
 {
 	unsigned long flags;
 	int bus = dev ? dev->bus->number : -1;
@@ -134,13 +135,17 @@ static void __flush_gart(struct pci_dev *dev)
 	int i;
 
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
-	/* recheck flush count inside lock */
-	if (need_flush) {
+	if (need_flush || iommu_fullflush) {
 		for (i = 0; northbridges[i]; i++) {
+			u32 w;
 			if (bus >= 0 && !(cpu_isset_const(i, bus_cpumask)))
 				continue;
 			pci_write_config_dword(northbridges[i], 0x9c,
 					       northbridge_flush_word[i] | 1);
+			/* Make sure the hardware actually executed the flush. */
+			do {
+				pci_read_config_dword(northbridges[i], 0x9c, &w);
+			} while (w & 1);
 			flushed++;
 		}
 		if (!flushed)
@@ -150,12 +155,6 @@ static void __flush_gart(struct pci_dev *dev)
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
 }
 
-static inline void flush_gart(struct pci_dev *dev)
-{
-	if (need_flush)
-		__flush_gart(dev);
-}
-
 /*
  * Allocate memory for a consistent mapping.
  * All mappings are consistent here, so this is just a wrapper around
@@ -174,11 +173,16 @@ void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size,
 	} else {
 		dma_mask = hwdev->consistent_dma_mask;
 	}
 
 	if (dma_mask == 0)
 		dma_mask = 0xffffffff;
 	if (dma_mask < 0xffffffff || no_iommu)
 		gfp |= GFP_DMA;
+
+	/* Kludge to make it bug-to-bug compatible with i386. i386
+	   uses the normal dma_mask for alloc_consistent. */
+	dma_mask &= hwdev->dma_mask;
+
 	memory = (void *)__get_free_pages(gfp, get_order(size));
 	if (memory == NULL) {
 		return NULL;
@@ -394,7 +398,9 @@ static int __pci_map_cont(struct scatterlist *sg, int start, int stopat,
 
 	for (i = start; i < stopat; i++) {
 		struct scatterlist *s = &sg[i];
-		unsigned long start_addr = s->dma_address;
+		unsigned long pages, addr;
+		unsigned long phys_addr = s->dma_address;
+
 		BUG_ON(i > start && s->offset);
 		if (i == start) {
 			*sout = *s;
@@ -403,8 +409,10 @@ static int __pci_map_cont(struct scatterlist *sg, int start, int stopat,
 		} else {
 			sout->length += s->length;
 		}
-		unsigned long addr = start_addr;
-		while (addr < start_addr + s->length) {
+
+		addr = phys_addr;
+		pages = to_pages(s->offset, s->length);
+		while (pages--) {
 			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
 			SET_LEAK(iommu_page);
 			addr += PAGE_SIZE;
@@ -437,7 +445,7 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
 	int out;
 	int start;
 	unsigned long pages = 0;
-	int need = 0;
+	int need = 0, nextneed;
 	unsigned long size = 0;
@@ -453,13 +461,14 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
 		BUG_ON(s->length == 0);
 
 		size += s->length;
+		nextneed = need_iommu(dev, addr, s->length);
 
 		/* Handle the previous not yet processed entries */
 		if (i > start) {
 			struct scatterlist *ps = &sg[i-1];
 			/* Can only merge when the last chunk ends on a page
-			   boundary. */
-			if (!force_iommu || !need || (i-1 > start && ps->offset) ||
+			   boundary and the new one doesn't have an offset. */
+			if (!iommu_merge || !nextneed || !need || s->offset ||
 			    (ps->offset + ps->length) % PAGE_SIZE) {
 				if (pci_map_cont(sg, start, i, sg+out, pages,
 						 need) < 0)
@@ -470,7 +479,7 @@ int pci_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents, int dir)
 			}
 		}
 
-		need = need_iommu(dev, addr, s->length);
+		need = nextneed;
 		pages += to_pages(s->offset, s->length);
 	}
 	if (pci_map_cont(sg, start, i, sg+out, pages, need) < 0)
@@ -544,14 +553,12 @@ int pci_dma_supported(struct pci_dev *dev, u64 mask)
 	   Problem with this is that if we overflow the IOMMU area
 	   and return DAC as fallback address the device may not handle it correctly.
-	   As a compromise we only do this if the IOMMU area is >= 256MB
-	   which should make overflow unlikely enough.
 	   As a special case some controllers have a 39bit address mode
 	   that is as efficient as 32bit (aic79xx). Don't force SAC for these.
 	   Assume all masks <= 40 bits are of this type. Normally this doesn't
 	   make any difference, but gives more gentle handling of IOMMU overflow. */
-	if (force_iommu && (mask > 0xffffffffffULL) && (iommu_size >= sac_force_size)){
+	if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
 		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->slot_name,mask);
 		return 0;
 	}
@@ -680,7 +687,7 @@ static int __init pci_iommu_init(void)
 	unsigned long iommu_start;
 	struct pci_dev *dev;
 
-#ifndef CONFIG_AGP_AMD_8151
+#ifndef CONFIG_AGP_AMD64
 	no_agp = 1;
 #else
 	/* Makefile puts PCI initialization via subsys_initcall first. */
......
@@ -4,6 +4,8 @@
 #include <linux/string.h>
 #include <asm/proto.h>
 
+int iommu_merge = 0;
+
 /*
  * Dummy IO MMU functions
  */
......
@@ -304,8 +304,8 @@ static inline int isa_check_signature(unsigned long io_addr,
 /* Disable vmerge for now. Need to fix the block layer code
    to check for non iommu addresses first.
    When the IOMMU is force it is safe to enable. */
-extern int force_iommu;
-#define BIO_VERMGE_BOUNDARY (force_iommu ? 4096 : 0)
+extern int iommu_merge;
+#define BIO_VMERGE_BOUNDARY (iommu_merge ? 4096 : 0)
 
 #endif /* __KERNEL__ */
......