Fix ia64-version of thread_info. Add McKinley-optimized copy_page().

3c4cefb3 · David Mosberger · 8c9ac7c2 · 3c4cefb3 · 3c4cefb3
Commit 3c4cefb3 authored Mar 26, 2002 by David Mosberger
Show whitespace changes
Inline Side-by-side

Showing with 188 additions and 1 deletion

arch/ia64/lib/copy_page_mck.S arch/ia64/lib/copy_page_mck.S +184 -0

include/asm-ia64/thread_info.h include/asm-ia64/thread_info.h +4 -1

No files found.
--- a/arch/ia64/lib/copy_page_mck.S
+++ b/arch/ia64/lib/copy_page_mck.S
+/*
+ * McKinley-optimized version of copy_page().
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ *	David Mosberger <davidm@hpl.hp.com>
+ *
+ * Inputs:
+ *	in0:	address of target page
+ *	in1:	address of source page
+ * Output:
+ *	no return value
+ *
+ * General idea:
+ *	- use regular loads and stores to prefetch data to avoid consuming M-slot just for
+ *	  lfetches => good for in-cache performance
+ *	- avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
+ *	  cycle
+ *
+ * Principle of operation:
+ *	We use a software-pipelined loop to control the overall operation.  The pipeline
+ *	has 2*PREFETCH_DIST+2 stages.  The first PREFETCH_DIST stages are used for prefetching
+ *	source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
+ *	cache-lines, the two last stages are used to copy the cache-line words not copied by
+ *	the prefetches.  The four relevant points in the pipeline are called A, B, C, D:
+ *	p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
+ *	should be prefetched, p[C] is TRUE if at least one more cacheline needs to be copied,
+ *	and p[D] is TRUE if a cacheline needs to be copied.
+ *
+ *	Note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.  To avoid
+ *	secondary misses in L2, we prefetch both source and destination with a line-size
+ *	of 128 bytes.  When both of these lines are in the L2 and the first half of the
+ *	source line is in L1, we start copying the remaining words.  The second half of the
+ *	source line is prefetched in the previous iteration, so that by the time we start
+ *	accessing it, it's also present in the L1.
+ *
+ *	This all sounds very complicated, but thanks to the modulo-scheduled loop support,
+ *	the resulting code is very regular and quite easy to follow (once you get the idea).
+ *
+ *	As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
+ *	as the separate .prefetch_loop.  Logically, this loop performs exactly like the
+ *	main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
+ *	so that each loop iteration is faster (again, good for cached case).
+ *
+ *	When reading the code, it helps to keep the following picture in mind:
+ *
+ *	       bank 0 bank 1
+ *            +------+------+---
+ *	      |	v[x] | 	t1  | ^
+ *	      |	t2   |	t3  | |
+ *	      |	t4   |	t5  | |
+ *	      |	t6   |	t7  | | 128 bytes
+ *     	      |	n8   | 	t9  | |	(L2 cache line)
+ *	      |	t10  | 	t11 | |
+ *	      |	t12  | 	t13 | |
+ *	      |	t14  | 	t15 | v
+ *	      +------+------+---
+ *
+ *	Here, v[x] is copied by the (memory) prefetch.  n8 is loaded in the previous iteration
+ *	to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
+ *	an order that avoids bank conflicts.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+#define PREFETCH_DIST	8		// McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
+#define src0		r2		// 1st source pointer for the tX word loads (starts at the t1 word)
+#define src1		r3		// 2nd source pointer for the tX word loads (starts at the t3 word)
+#define dst0		r9		// 1st destination pointer for the tX word stores (starts at the t1 word)
+#define dst1		r10		// 2nd destination pointer for the tX word stores (starts at the t3 word)
+#define src_pre_mem	r11		// source address of the ld8 into v[A]; advances 128 bytes per iteration
+#define dst_pre_mem	r14		// destination address of the st8 from v[B]; advances 128 bytes per iteration
+#define src_pre_l2	r15		// source address of the n8 load (word 8 of each 128-byte line)
+#define dst_pre_l2	r16		// destination address of the n8 store (word 8 of each 128-byte line)
+#define t1		r17		// t1..t4: base temporaries; the aliases below reuse them
+#define t2		r18
+#define t3		r19
+#define t4		r20
+#define t5		t1	// alias!
+#define t6		t2	// alias!
+#define t7		t3	// alias!
+#define n8		r21		// word 8 of the line; loaded under p[C], stored under p[D]
+#define t9		t5	// alias!
+#define t10		t4	// alias!
+#define t11		t7	// alias!
+#define t12		t6	// alias!
+#define t14		t10	// alias!
+#define t13		r22		// dedicated register (not an alias)
+#define t15		r23		// dedicated register (not an alias)
+#define saved_lc	r24		// caller's ar.lc, restored before returning
+#define saved_pr	r25		// caller's predicate registers, restored before returning
+#define	A	0			// stage index: prefetch of a source line (p[A])
+#define B	(PREFETCH_DIST)		// stage index: prefetch of a destination line (p[B])
+#define C	(B + PREFETCH_DIST)	// stage index: one iteration before the copy stage (p[C])
+#define D	(C + 1)			// stage index: copy of the remaining words of a line (p[D])
+#define N	(D + 1)			// number of pipeline stages; loaded into ar.ec for .line_copy
+#define Nrot	((N + 7) & ~7)		// N rounded up to a multiple of 8; used as the alloc frame/rotating size
+GLOBAL_ENTRY(copy_page)
+	// copy_page: in0 = address of target page, in1 = address of source page, no return
+	// value.  The page is copied in 128-byte lines by two modulo-scheduled loops:
+	// .prefetch_loop runs the first 2*PREFETCH_DIST iterations, .line_copy the rest.
+	.prologue
+	// frame: 2 inputs, Nrot-2 locals, 0 outputs, Nrot rotating registers
+	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
+	.rotr v[2*PREFETCH_DIST]
+	.rotp p[N]
+	.save ar.lc, saved_lc
+	mov saved_lc = ar.lc
+	.save pr, saved_pr
+	mov saved_pr = pr
+	.body
+	mov src_pre_mem = in1
+	mov pr.rot = 0x10000			// of the rotating predicates, only p16 (= p[A]) starts out set
+	mov ar.ec = 1				// special unrolled loop
+	mov dst_pre_mem = in0
+	mov ar.lc = 2*PREFETCH_DIST - 1
+	add src_pre_l2 = 8*8, in1
+	add dst_pre_l2 = 8*8, in0
+	add src0 = 8, in1			// first t1 src
+	add src1 = 3*8, in1			// first t3 src
+	add dst0 = 8, in0			// first t1 dst
+	add dst1 = 3*8, in0			// first t3 dst
+	// Stage the .line_copy trip count in a register: the immediate form of
+	// "mov ar.lc = ..." only encodes an 8-bit value, and with 64KB pages the count
+	// (512 - 16 - 1 = 495) does not fit.  t1 is free here: .prefetch_loop does not
+	// touch it and .line_copy first loads it after ar.lc has been written.
+	mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
+	;;
+	// same as .line_copy loop, but with all predicated-off instructions removed:
+.prefetch_loop:
+(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0
+(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2
+	br.ctop.sptk .prefetch_loop
+	;;
+	cmp.eq p16, p0 = r0, r0			// reset p16 to 1 (br.ctop cleared it to zero)
+	mov ar.lc = t1				// trip count staged above (may exceed 8 bits)
+	mov ar.ec = N				// # of stages in pipeline
+	;;
+	.align 32
+	// Each iteration handles one 128-byte line.  Words 1-7 and 9-15 go through the tX
+	// temporaries, word 0 through v[] and word 8 through n8; see the picture in the
+	// header comment for the word/bank layout.
+.line_copy:
+(p[D])	ld8 t2 = [src0], 3*8			// M0
+(p[D])	ld8 t4 = [src1], 3*8			// M1
+(p[B])	st8 [dst_pre_mem] = v[B], 128		// M2 prefetch dst from memory
+(p[D])	st8 [dst_pre_l2] = n8, 128		// M3 prefetch dst from L2
+	;;
+(p[A])	ld8 v[A] = [src_pre_mem], 128		// M0 prefetch src from memory
+(p[C])	ld8 n8 = [src_pre_l2], 128		// M1 prefetch src from L2
+(p[D])	st8 [dst0] =  t1, 8			// M2
+(p[D])	st8 [dst1] =  t3, 8			// M3
+	;;
+(p[D])	ld8  t5 = [src0], 8
+(p[D])	ld8  t7 = [src1], 3*8
+(p[D])	st8 [dst0] =  t2, 3*8
+(p[D])	st8 [dst1] =  t4, 3*8
+	;;
+(p[D])	ld8  t6 = [src0], 3*8
+(p[D])	ld8 t10 = [src1], 8
+(p[D])	st8 [dst0] =  t5, 8
+(p[D])	st8 [dst1] =  t7, 3*8
+	;;
+(p[D])	ld8  t9 = [src0], 3*8
+(p[D])	ld8 t11 = [src1], 3*8
+(p[D])	st8 [dst0] =  t6, 3*8
+(p[D])	st8 [dst1] = t10, 8
+	;;
+(p[D])	ld8 t12 = [src0], 8
+(p[D])	ld8 t14 = [src1], 8
+(p[D])	st8 [dst0] =  t9, 3*8
+(p[D])	st8 [dst1] = t11, 3*8
+	;;
+(p[D])	ld8 t13 = [src0], 4*8
+(p[D])	ld8 t15 = [src1], 4*8
+(p[D])	st8 [dst0] = t12, 8
+(p[D])	st8 [dst1] = t14, 8
+	;;
+	// t1/t3 of the NEXT line are loaded one stage early (under p[C]) and stored at the
+	// top of the following iteration (under p[D]).
+(p[C])	ld8  t1 = [src0], 8
+(p[C])	ld8  t3 = [src1], 8
+(p[D])	st8 [dst0] = t13, 4*8
+(p[D])	st8 [dst1] = t15, 4*8
+	br.ctop.sptk .line_copy
+	;;
+	// restore the caller's loop counter and predicates before returning
+	mov ar.lc = saved_lc
+	mov pr = saved_pr, -1
+	br.ret.sptk.many rp
+END(copy_page)
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -12,7 +12,8 @@
 #define TI_EXEC_DOMAIN	0x00
 #define TI_FLAGS	0x08
 #define TI_CPU		0x0c
-#define TI_ADDR_LIMI	0x10
+#define TI_ADDR_LIMIT	0x10
+#define TI_PRE_COUNT	0x18
 #ifndef __ASSEMBLY__
@@ -26,6 +27,7 @@ struct thread_info {
 	__u32 flags;			/* thread_info flags (see TIF_*) */
 	__u32 cpu;			/* current CPU */
 	mm_segment_t addr_limit;	/* user-level address space limit */
+	__s32 preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
 };
 #define INIT_THREAD_SIZE		/* tell sched.h not to declare the thread_union */
@@ -37,6 +39,7 @@ struct thread_info {
 	flags:		0,			\
 	cpu:		0,			\
 	addr_limit:	KERNEL_DS,		\
+	preempt_count:	0,			\
 }
 /* how to get the thread information struct from C */