firewire: ohci: Asynchronous Reception rewrite

Move the AR DMA descriptors out of the buffer pages, and map the buffer pages linearly into the kernel's address space. This allows the driver to ignore any page boundaries in the DMA data and thus to avoid any copying around of packet payloads. This fixes the bug where S800 packets that are so big (> 4080 bytes) that they can be split over three pages were not handled correctly. Due to the changed algorithm, we can now use arbitrarily many buffer pages, which improves performance because the controller can more easily unload its DMA FIFO. Furthermore, using streaming DMA mappings should improve perfomance on architectures where coherent DMA mappings are not cacheable. Even on other architectures, the caching behaviour should be improved slightly because the CPU no longer writes to the buffer pages. v2: Detect the last filled buffer page by searching the descriptor's residual count value fields in order (like in the old code), instead of going backwards through the transfer status fields; it looks as if some controllers do not set the latter correctly. v3: Fix an old resume bug that would now make the handler run into a BUG_ON, and replace that check with more useful error handling. Increase the buffer size for better performance with non-TI chips. Signed-off-by: Clemens Ladisch <clemens@ladisch.de> Maxim Levitsky writes: Works almost perfectly. I can still see RCODE_BUSY errors sometimes, not very often though. 64K here eliminates these errors completely. This is most likely due to nouveau drivers and lowest perf level I use to lower card temperature. That increases latencies too much I think. Besides that the IO is just perfect. Tested-by: Maxim Levitsky <maximlevitsky@gmail.com> Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>

firewire: ohci: Asynchronous Reception rewrite
Move the AR DMA descriptors out of the buffer pages, and map the buffer pages linearly into the kernel's address space. This allows the driver to ignore any page boundaries in the DMA data and thus to avoid any copying around of packet payloads. This fixes the bug where S800 packets that are so big (> 4080 bytes) that they can be split over three pages were not handled correctly. Due to the changed algorithm, we can now use arbitrarily many buffer pages, which improves performance because the controller can more easily unload its DMA FIFO. Furthermore, using streaming DMA mappings should improve perfomance on architectures where coherent DMA mappings are not cacheable. Even on other architectures, the caching behaviour should be improved slightly because the CPU no longer writes to the buffer pages. v2: Detect the last filled buffer page by searching the descriptor's residual count value fields in order (like in the old code), instead of going backwards through the transfer status fields; it looks as if some controllers do not set the latter correctly. v3: Fix an old resume bug that would now make the handler run into a BUG_ON, and replace that check with more useful error handling. Increase the buffer size for better performance with non-TI chips. Signed-off-by: Clemens Ladisch <clemens@ladisch.de> Maxim Levitsky writes: Works almost perfectly. I can still see RCODE_BUSY errors sometimes, not very often though. 64K here eliminates these errors completely. This is most likely due to nouveau drivers and lowest perf level I use to lower card temperature. That increases latencies too much I think. Besides that the IO is just perfect. Tested-by: Maxim Levitsky <maximlevitsky@gmail.com> Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
7a39d8b8 · Clemens Ladisch · Stefan Richter · 5878730b · 7a39d8b8 · 7a39d8b8
Commit 7a39d8b8 authored Nov 26, 2010 by Clemens Ladisch Committed by Stefan Richter Dec 13, 2010
Hide whitespace changes
Inline Side-by-side

Showing with 278 additions and 135 deletions

drivers/firewire/Kconfig drivers/firewire/Kconfig +1 -1

drivers/firewire/ohci.c drivers/firewire/ohci.c +277 -134

No files found.
--- a/drivers/firewire/Kconfig
+++ b/drivers/firewire/Kconfig
@@ -19,7 +19,7 @@ config FIREWIRE

 config FIREWIRE_OHCI
 	tristate "OHCI-1394 controllers"
-	depends on PCI && FIREWIRE
+	depends on PCI && FIREWIRE && MMU
 	help
 	  Enable this driver if you have a FireWire controller based
 	  on the OHCI specification.  For all practical purposes, this

--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -40,6 +40,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>

 #include <asm/byteorder.h>
 #include <asm/page.h>
@@ -80,17 +81,23 @@ struct descriptor {
 #define COMMAND_PTR(regs)	((regs) + 12)
 #define CONTEXT_MATCH(regs)	((regs) + 16)

-struct ar_buffer {
-	struct descriptor descriptor;
-	struct ar_buffer *next;
-	__le32 data[0];
-};
+#define AR_BUFFER_SIZE	(32*1024)
+#define AR_BUFFERS_MIN	DIV_ROUND_UP(AR_BUFFER_SIZE, PAGE_SIZE)
+/* we need at least two pages for proper list management */
+#define AR_BUFFERS	(AR_BUFFERS_MIN >= 2 ? AR_BUFFERS_MIN : 2)
+
+#define MAX_ASYNC_PAYLOAD	4096
+#define MAX_AR_PACKET_SIZE	(16 + MAX_ASYNC_PAYLOAD + 4)
+#define AR_WRAPAROUND_PAGES	DIV_ROUND_UP(MAX_AR_PACKET_SIZE, PAGE_SIZE)

 struct ar_context {
 	struct fw_ohci *ohci;
-	struct ar_buffer *current_buffer;
-	struct ar_buffer *last_buffer;
+	struct page *pages[AR_BUFFERS];
+	void *buffer;
+	struct descriptor *descriptors;
+	dma_addr_t descriptors_bus;
 	void *pointer;
+	unsigned int last_buffer_index;
 	u32 regs;
 	struct tasklet_struct tasklet;
 };
@@ -594,59 +601,155 @@ static int ohci_update_phy_reg(struct fw_card *card, int addr,
 	return ret;
 }

-static void ar_context_link_page(struct ar_context *ctx,
-				 struct ar_buffer *ab, dma_addr_t ab_bus)
+static inline dma_addr_t ar_buffer_bus(struct ar_context *ctx, unsigned int i)
+{
+	return page_private(ctx->pages[i]);
+}
+
+static void ar_context_link_page(struct ar_context *ctx, unsigned int index)
 {
-	size_t offset;
+	struct descriptor *d;

-	ab->next = NULL;
-	memset(&ab->descriptor, 0, sizeof(ab->descriptor));
-	ab->descriptor.control        = cpu_to_le16(DESCRIPTOR_INPUT_MORE |
-						    DESCRIPTOR_STATUS |
-						    DESCRIPTOR_BRANCH_ALWAYS);
-	offset = offsetof(struct ar_buffer, data);
-	ab->descriptor.req_count      = cpu_to_le16(PAGE_SIZE - offset);
-	ab->descriptor.data_address   = cpu_to_le32(ab_bus + offset);
-	ab->descriptor.res_count      = cpu_to_le16(PAGE_SIZE - offset);
-	ab->descriptor.branch_address = 0;
+	d = &ctx->descriptors[index];
+	d->branch_address  &= cpu_to_le32(~0xf);
+	d->res_count       =  cpu_to_le16(PAGE_SIZE);
+	d->transfer_status =  0;

 	wmb(); /* finish init of new descriptors before branch_address update */
-	ctx->last_buffer->descriptor.branch_address = cpu_to_le32(ab_bus | 1);
-	ctx->last_buffer->next = ab;
-	ctx->last_buffer = ab;
+	d = &ctx->descriptors[ctx->last_buffer_index];
+	d->branch_address  |= cpu_to_le32(1);
+
+	ctx->last_buffer_index = index;

 	reg_write(ctx->ohci, CONTROL_SET(ctx->regs), CONTEXT_WAKE);
 	flush_writes(ctx->ohci);
 }

-static int ar_context_add_page(struct ar_context *ctx)
+static void ar_context_release(struct ar_context *ctx)
 {
-	struct device *dev = ctx->ohci->card.device;
-	struct ar_buffer *ab;
-	dma_addr_t uninitialized_var(ab_bus);
+	unsigned int i;

-	ab = dma_alloc_coherent(dev, PAGE_SIZE, &ab_bus, GFP_ATOMIC);
-	if (ab == NULL)
-		return -ENOMEM;
+	if (ctx->descriptors)
+		dma_free_coherent(ctx->ohci->card.device,
+				  AR_BUFFERS * sizeof(struct descriptor),
+				  ctx->descriptors, ctx->descriptors_bus);

-	ar_context_link_page(ctx, ab, ab_bus);
+	if (ctx->buffer)
+		vm_unmap_ram(ctx->buffer, AR_BUFFERS + AR_WRAPAROUND_PAGES);

-	return 0;
+	for (i = 0; i < AR_BUFFERS; i++)
+		if (ctx->pages[i]) {
+			dma_unmap_page(ctx->ohci->card.device,
+				       ar_buffer_bus(ctx, i),
+				       PAGE_SIZE, DMA_FROM_DEVICE);
+			__free_page(ctx->pages[i]);
+		}
 }

-static void ar_context_release(struct ar_context *ctx)
+static void ar_context_abort(struct ar_context *ctx, const char *error_msg)
 {
-	struct ar_buffer *ab, *ab_next;
-	size_t offset;
-	dma_addr_t ab_bus;
+	if (reg_read(ctx->ohci, CONTROL_CLEAR(ctx->regs)) & CONTEXT_RUN) {
+		reg_write(ctx->ohci, CONTROL_CLEAR(ctx->regs), CONTEXT_RUN);
+		flush_writes(ctx->ohci);

-	for (ab = ctx->current_buffer; ab; ab = ab_next) {
-		ab_next = ab->next;
-		offset = offsetof(struct ar_buffer, data);
-		ab_bus = le32_to_cpu(ab->descriptor.data_address) - offset;
-		dma_free_coherent(ctx->ohci->card.device, PAGE_SIZE,
-				  ab, ab_bus);
+		fw_error("AR error: %s; DMA stopped\n", error_msg);
 	}
+	/* FIXME: restart? */
+}
+
+static inline unsigned int ar_next_buffer_index(unsigned int index)
+{
+	return (index + 1) % AR_BUFFERS;
+}
+
+static inline unsigned int ar_prev_buffer_index(unsigned int index)
+{
+	return (index - 1 + AR_BUFFERS) % AR_BUFFERS;
+}
+
+static inline unsigned int ar_first_buffer_index(struct ar_context *ctx)
+{
+	return ar_next_buffer_index(ctx->last_buffer_index);
+}
+
+/*
+ * We search for the buffer that contains the last AR packet DMA data written
+ * by the controller.
+ */
+static unsigned int ar_search_last_active_buffer(struct ar_context *ctx,
+						 unsigned int *buffer_offset)
+{
+	unsigned int i, next_i, last = ctx->last_buffer_index;
+	__le16 res_count, next_res_count;
+
+	i = ar_first_buffer_index(ctx);
+	res_count = ACCESS_ONCE(ctx->descriptors[i].res_count);
+
+	/* A buffer that is not yet completely filled must be the last one. */
+	while (i != last && res_count == 0) {
+
+		/* Peek at the next descriptor. */
+		next_i = ar_next_buffer_index(i);
+		rmb(); /* read descriptors in order */
+		next_res_count = ACCESS_ONCE(
+				ctx->descriptors[next_i].res_count);
+		/*
+		 * If the next descriptor is still empty, we must stop at this
+		 * descriptor.
+		 */
+		if (next_res_count == cpu_to_le16(PAGE_SIZE)) {
+			/*
+			 * The exception is when the DMA data for one packet is
+			 * split over three buffers; in this case, the middle
+			 * buffer's descriptor might be never updated by the
+			 * controller and look still empty, and we have to peek
+			 * at the third one.
+			 */
+			if (MAX_AR_PACKET_SIZE > PAGE_SIZE && i != last) {
+				next_i = ar_next_buffer_index(next_i);
+				rmb();
+				next_res_count = ACCESS_ONCE(
+					ctx->descriptors[next_i].res_count);
+				if (next_res_count != cpu_to_le16(PAGE_SIZE))
+					goto next_buffer_is_active;
+			}
+
+			break;
+		}
+
+next_buffer_is_active:
+		i = next_i;
+		res_count = next_res_count;
+	}
+
+	rmb(); /* read res_count before the DMA data */
+
+	*buffer_offset = PAGE_SIZE - le16_to_cpu(res_count);
+	if (*buffer_offset > PAGE_SIZE) {
+		*buffer_offset = 0;
+		ar_context_abort(ctx, "corrupted descriptor");
+	}
+
+	return i;
+}
+
+static void ar_sync_buffers_for_cpu(struct ar_context *ctx,
+				    unsigned int end_buffer_index,
+				    unsigned int end_buffer_offset)
+{
+	unsigned int i;
+
+	i = ar_first_buffer_index(ctx);
+	while (i != end_buffer_index) {
+		dma_sync_single_for_cpu(ctx->ohci->card.device,
+					ar_buffer_bus(ctx, i),
+					PAGE_SIZE, DMA_FROM_DEVICE);
+		i = ar_next_buffer_index(i);
+	}
+	if (end_buffer_offset > 0)
+		dma_sync_single_for_cpu(ctx->ohci->card.device,
+					ar_buffer_bus(ctx, i),
+					end_buffer_offset, DMA_FROM_DEVICE);
 }

 #if defined(CONFIG_PPC_PMAC) && defined(CONFIG_PPC32)
@@ -689,6 +792,10 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer)
 		p.header[3] = cond_le32_to_cpu(buffer[3]);
 		p.header_length = 16;
 		p.payload_length = p.header[3] >> 16;
+		if (p.payload_length > MAX_ASYNC_PAYLOAD) {
+			ar_context_abort(ctx, "invalid packet length");
+			return NULL;
+		}
 		break;

 	case TCODE_WRITE_RESPONSE:
@@ -699,9 +806,8 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer)
 		break;

 	default:
-		/* FIXME: Stop context, discard everything, and restart? */
-		p.header_length = 0;
-		p.payload_length = 0;
+		ar_context_abort(ctx, "invalid tcode");
+		return NULL;
 	}

 	p.payload = (void *) buffer + p.header_length;
@@ -751,121 +857,152 @@ static __le32 *handle_ar_packet(struct ar_context *ctx, __le32 *buffer)
 	return buffer + length + 1;
 }

+static void *handle_ar_packets(struct ar_context *ctx, void *p, void *end)
+{
+	void *next;
+
+	while (p < end) {
+		next = handle_ar_packet(ctx, p);
+		if (!next)
+			return p;
+		p = next;
+	}
+
+	return p;
+}
+
+static void ar_recycle_buffers(struct ar_context *ctx, unsigned int end_buffer)
+{
+	unsigned int i;
+
+	i = ar_first_buffer_index(ctx);
+	while (i != end_buffer) {
+		dma_sync_single_for_device(ctx->ohci->card.device,
+					   ar_buffer_bus(ctx, i),
+					   PAGE_SIZE, DMA_FROM_DEVICE);
+		ar_context_link_page(ctx, i);
+		i = ar_next_buffer_index(i);
+	}
+}
+
 static void ar_context_tasklet(unsigned long data)
 {
 	struct ar_context *ctx = (struct ar_context *)data;
-	struct ar_buffer *ab;
-	struct descriptor *d;
-	void *buffer, *end;
-	__le16 res_count;
+	unsigned int end_buffer_index, end_buffer_offset;
+	void *p, *end;

-	ab = ctx->current_buffer;
-	d = &ab->descriptor;
+	p = ctx->pointer;
+	if (!p)
+		return;

-	res_count = ACCESS_ONCE(d->res_count);
-	if (res_count == 0) {
-		size_t size, size2, rest, pktsize, size3, offset;
-		dma_addr_t start_bus;
-		void *start;
+	end_buffer_index = ar_search_last_active_buffer(ctx,
+							&end_buffer_offset);
+	ar_sync_buffers_for_cpu(ctx, end_buffer_index, end_buffer_offset);
+	end = ctx->buffer + end_buffer_index * PAGE_SIZE + end_buffer_offset;

+	if (end_buffer_index < ar_first_buffer_index(ctx)) {
 		/*
-		 * This descriptor is finished and we may have a
-		 * packet split across this and the next buffer. We
-		 * reuse the page for reassembling the split packet.
+		 * The filled part of the overall buffer wraps around; handle
+		 * all packets up to the buffer end here.  If the last packet
+		 * wraps around, its tail will be visible after the buffer end
+		 * because the buffer start pages are mapped there again.
 		 */
+		void *buffer_end = ctx->buffer + AR_BUFFERS * PAGE_SIZE;
+		p = handle_ar_packets(ctx, p, buffer_end);
+		if (p < buffer_end)
+			goto error;
+		/* adjust p to point back into the actual buffer */
+		p -= AR_BUFFERS * PAGE_SIZE;
+	}

-		offset = offsetof(struct ar_buffer, data);
-		start = ab;
-		start_bus = le32_to_cpu(ab->descriptor.data_address) - offset;
-		buffer = ab->data;
-
-		ab = ab->next;
-		d = &ab->descriptor;
-		size = start + PAGE_SIZE - ctx->pointer;
-		/* valid buffer data in the next page */
-		rest = le16_to_cpu(d->req_count) - le16_to_cpu(d->res_count);
-		/* what actually fits in this page */
-		size2 = min(rest, (size_t)PAGE_SIZE - offset - size);
-		memmove(buffer, ctx->pointer, size);
-		memcpy(buffer + size, ab->data, size2);
-
-		while (size > 0) {
-			void *next = handle_ar_packet(ctx, buffer);
-			pktsize = next - buffer;
-			if (pktsize >= size) {
-				/*
-				 * We have handled all the data that was
-				 * originally in this page, so we can now
-				 * continue in the next page.
-				 */
-				buffer = next;
-				break;
-			}
-			/* move the next packet to the start of the buffer */
-			memmove(buffer, next, size + size2 - pktsize);
-			size -= pktsize;
-			/* fill up this page again */
-			size3 = min(rest - size2,
-				    (size_t)PAGE_SIZE - offset - size - size2);
-			memcpy(buffer + size + size2,
-			       (void *) ab->data + size2, size3);
-			size2 += size3;
-		}
-
-		if (rest > 0) {
-			/* handle the packets that are fully in the next page */
-			buffer = (void *) ab->data +
-					(buffer - (start + offset + size));
-			end = (void *) ab->data + rest;
+	p = handle_ar_packets(ctx, p, end);
+	if (p != end) {
+		if (p > end)
+			ar_context_abort(ctx, "inconsistent descriptor");
+		goto error;
+	}

-			while (buffer < end)
-				buffer = handle_ar_packet(ctx, buffer);
+	ctx->pointer = p;
+	ar_recycle_buffers(ctx, end_buffer_index);

-			ctx->current_buffer = ab;
-			ctx->pointer = end;
+	return;

-			ar_context_link_page(ctx, start, start_bus);
-		} else {
-			ctx->pointer = start + PAGE_SIZE;
-		}
-	} else {
-		buffer = ctx->pointer;
-		ctx->pointer = end =
-			(void *) ab + PAGE_SIZE - le16_to_cpu(res_count);
-
-		while (buffer < end)
-			buffer = handle_ar_packet(ctx, buffer);
-	}
+error:
+	ctx->pointer = NULL;
 }

 static int ar_context_init(struct ar_context *ctx,
 			   struct fw_ohci *ohci, u32 regs)
 {
-	struct ar_buffer ab;
+	unsigned int i;
+	dma_addr_t dma_addr;
+	struct page *pages[AR_BUFFERS + AR_WRAPAROUND_PAGES];
+	struct descriptor *d;

 	ctx->regs        = regs;
 	ctx->ohci        = ohci;
-	ctx->last_buffer = &ab;
 	tasklet_init(&ctx->tasklet, ar_context_tasklet, (unsigned long)ctx);

-	ar_context_add_page(ctx);
-	ar_context_add_page(ctx);
-	ctx->current_buffer = ab.next;
-	ctx->pointer = ctx->current_buffer->data;
+	for (i = 0; i < AR_BUFFERS; i++) {
+		ctx->pages[i] = alloc_page(GFP_KERNEL | GFP_DMA32);
+		if (!ctx->pages[i])
+			goto out_of_memory;
+		dma_addr = dma_map_page(ohci->card.device, ctx->pages[i],
+					0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(ohci->card.device, dma_addr)) {
+			__free_page(ctx->pages[i]);
+			ctx->pages[i] = NULL;
+			goto out_of_memory;
+		}
+		set_page_private(ctx->pages[i], dma_addr);
+	}
+
+	for (i = 0; i < AR_BUFFERS; i++)
+		pages[i]              = ctx->pages[i];
+	for (i = 0; i < AR_WRAPAROUND_PAGES; i++)
+		pages[AR_BUFFERS + i] = ctx->pages[i];
+	ctx->buffer = vm_map_ram(pages, AR_BUFFERS + AR_WRAPAROUND_PAGES,
+				 -1, PAGE_KERNEL_RO);
+	if (!ctx->buffer)
+		goto out_of_memory;
+
+	ctx->descriptors =
+		dma_alloc_coherent(ohci->card.device,
+				   AR_BUFFERS * sizeof(struct descriptor),
+				   &ctx->descriptors_bus,
+				   GFP_KERNEL);
+	if (!ctx->descriptors)
+		goto out_of_memory;
+
+	for (i = 0; i < AR_BUFFERS; i++) {
+		d = &ctx->descriptors[i];
+		d->req_count      = cpu_to_le16(PAGE_SIZE);
+		d->control        = cpu_to_le16(DESCRIPTOR_INPUT_MORE |
+						DESCRIPTOR_STATUS |
+						DESCRIPTOR_BRANCH_ALWAYS);
+		d->data_address   = cpu_to_le32(ar_buffer_bus(ctx, i));
+		d->branch_address = cpu_to_le32(ctx->descriptors_bus +
+			ar_next_buffer_index(i) * sizeof(struct descriptor));
+	}

 	return 0;
+
+out_of_memory:
+	ar_context_release(ctx);
+
+	return -ENOMEM;
 }

 static void ar_context_run(struct ar_context *ctx)
 {
-	struct ar_buffer *ab = ctx->current_buffer;
-	dma_addr_t ab_bus;
-	size_t offset;
+	unsigned int i;
+
+	for (i = 0; i < AR_BUFFERS; i++)
+		ar_context_link_page(ctx, i);

-	offset = offsetof(struct ar_buffer, data);
-	ab_bus = le32_to_cpu(ab->descriptor.data_address) - offset;
+	ctx->pointer = ctx->buffer;

-	reg_write(ctx->ohci, COMMAND_PTR(ctx->regs), ab_bus | 1);
+	reg_write(ctx->ohci, COMMAND_PTR(ctx->regs), ctx->descriptors_bus | 1);
 	reg_write(ctx->ohci, CONTROL_SET(ctx->regs), CONTEXT_RUN);
 	flush_writes(ctx->ohci);
 }
@@ -2955,11 +3092,15 @@ static int __devinit pci_probe(struct pci_dev *dev,
 	if (param_quirks)
 		ohci->quirks = param_quirks;

-	ar_context_init(&ohci->ar_request_ctx, ohci,
-			OHCI1394_AsReqRcvContextControlSet);
+	err = ar_context_init(&ohci->ar_request_ctx, ohci,
+			      OHCI1394_AsReqRcvContextControlSet);
+	if (err < 0)
+		goto fail_iounmap;

-	ar_context_init(&ohci->ar_response_ctx, ohci,
-			OHCI1394_AsRspRcvContextControlSet);
+	err = ar_context_init(&ohci->ar_response_ctx, ohci,
+			      OHCI1394_AsRspRcvContextControlSet);
+	if (err < 0)
+		goto fail_arreq_ctx;

 	context_init(&ohci->at_request_ctx, ohci,
 		     OHCI1394_AsReqTrContextControlSet, handle_at_packet);
@@ -3024,7 +3165,9 @@ static int __devinit pci_probe(struct pci_dev *dev,
 	context_release(&ohci->at_response_ctx);
 	context_release(&ohci->at_request_ctx);
 	ar_context_release(&ohci->ar_response_ctx);
+ fail_arreq_ctx:
 	ar_context_release(&ohci->ar_request_ctx);
+ fail_iounmap:
 	pci_iounmap(dev, ohci->registers);
 fail_iomem:
 	pci_release_region(dev, 0);