Commit 58cef2ea authored by David Mosberger

arch/ia64/lib/copy_page_mck.S:

    Tweak for better performance when data is in L2 or L3 cache.
parent 8b39f58f
@@ -17,28 +17,28 @@
  * cycle
  *
  * Principle of operation:
- * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
- * To avoid secondary misses in L2, we prefetch both source and destination with a line-size
- * of 128 bytes.  When both of these lines are in the L2 and the first half of the
- * source line is in L1, we start copying the remaining words.  The second half of the
- * source line is prefetched in an earlier iteration, so that by the time we start
- * accessing it, it's also present in the L1.
- *
  * We use a software-pipelined loop to control the overall operation.  The pipeline
- * has 2*PREFETCH_DIST+2 stages.  The first PREFETCH_DIST stages are used for prefetching
+ * has 2*PREFETCH_DIST+K stages.  The first PREFETCH_DIST stages are used for prefetching
  * source cache-lines.  The second PREFETCH_DIST stages are used for prefetching destination
- * cache-lines, the two last stages are used to copy the cache-line words not copied by
+ * cache-lines, the last K stages are used to copy the cache-line words not copied by
  * the prefetches.  The four relevant points in the pipelined are called A, B, C, D:
  * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
- * should be prefetched, p[C] is TRUE if at least one more cacheline needs to be copied,
- * and p[D] is TRUE if a cachline needs to be copied.
+ * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
+ * into L1D and p[D] is TRUE if a cacheline needs to be copied.
+ *
+ * Note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.  To avoid
+ * secondary misses in L2, we prefetch both source and destination with a line-size
+ * of 128 bytes.  When both of these lines are in the L2 and the first half of the
+ * source line is in L1, we start copying the remaining words.  The second half of the
+ * source line is prefetched in the previous iteration, so that by the time we start
+ * accessing it, it's also present in the L1.
  *
  * This all sounds very complicated, but thanks to the modulo-scheduled loop support,
  * the resulting code is very regular and quite easy to follow (once you get the idea).
  *
  * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
  * as the separate .prefetch_loop.  Logically, this loop performs exactly like the
- * main-loop (.line_copy), but has all know-to-be-predicated-off instructions removed,
+ * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
  * so that each loop iteration is faster (again, good for cached case).
  *
  * When reading the code, it helps to keep the following picture in mind:
@@ -49,13 +49,13 @@
  *   | t2   | t3   |   |
  *   | t4   | t5   |   |
  *   | t6   | t7   |   | 128 bytes
- *   | n8   | t9   |   | (L2 cache line)
+ *   | n[y] | t9   |   | (L2 cache line)
  *   | t10  | t11  |   |
  *   | t12  | t13  |   |
  *   | t14  | t15  |   v
  *   +------+------+---
  *
- * Here, v[x] is copied by the (memory) prefetch.  n8 is loaded in the previous iteration
+ * Here, v[x] is copied by the (memory) prefetch.  n[y] is loaded at p[C]
  * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
  * an order that avoids bank conflicts.
  */
@@ -79,22 +79,21 @@
 #define t5        t1   // alias!
 #define t6        t2   // alias!
 #define t7        t3   // alias!
-#define n8        r21
 #define t9        t5   // alias!
 #define t10       t4   // alias!
 #define t11       t7   // alias!
 #define t12       t6   // alias!
 #define t14       t10  // alias!
-#define t13       r22
-#define t15       r23
-#define saved_lc  r24
-#define saved_pr  r25
+#define t13       r21
+#define t15       r22
+#define saved_lc  r23
+#define saved_pr  r24
 
 #define A    0
 #define B    (PREFETCH_DIST)
 #define C    (B + PREFETCH_DIST)
-#define D    (C + 1)
+#define D    (C + 3)
 #define N    (D + 1)
 #define Nrot ((N + 7) & ~7)
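
A side note on the new stage arithmetic (not part of the patch): with a hypothetical PREFETCH_DIST of 8 (the real value is defined earlier in the file, outside this diff), the defines above evaluate to:

/* Hypothetical PREFETCH_DIST = 8, chosen only to make the numbers concrete */
#define PREFETCH_DIST 8
#define A    0                     /* stage 0:  start prefetching a src line        */
#define B    (PREFETCH_DIST)       /* stage 8:  start prefetching the dst line      */
#define C    (B + PREFETCH_DIST)   /* stage 16: load 2nd half of the src L2 line    */
#define D    (C + 3)               /* stage 19: copy the line, 3 stages after C     */
#define N    (D + 1)               /* 20 stages in the pipeline, loaded into ar.ec  */
#define Nrot ((N + 7) & ~7)        /* 24 rotating registers, rounded up to a multiple of 8 */

The word loaded under p[C] lands in n[0]; after three register rotations the same value is visible as n[D-C] = n[3], which is what the store under p[D] consumes, hence the n[D-C+1] = 4-deep rotating array declared below.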
@@ -102,7 +101,7 @@ GLOBAL_ENTRY(copy_page)
 	.prologue
 	alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
-	.rotr v[2*PREFETCH_DIST]
+	.rotr v[2*PREFETCH_DIST], n[D-C+1]
 	.rotp p[N]
 	.save ar.lc, saved_lc
@@ -124,6 +123,9 @@ GLOBAL_ENTRY(copy_page)
 	add src1 = 3*8, in1	// first t3 src
 	add dst0 = 8, in0	// first t1 dst
 	add dst1 = 3*8, in0	// first t3 dst
+	nop.m 0
+	nop.m 0
+	nop.i 0
 	;;
 	// same as .line_copy loop, but with all predicated-off instructions removed:
 .prefetch_loop:
@@ -135,15 +137,14 @@ GLOBAL_ENTRY(copy_page)
 	mov ar.lc = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
 	mov ar.ec = N			// # of stages in pipeline
 	;;
-	.align 32
 .line_copy:
 (p[D])	ld8 t2 = [src0], 3*8		// M0
 (p[D])	ld8 t4 = [src1], 3*8		// M1
 (p[B])	st8 [dst_pre_mem] = v[B], 128	// M2 prefetch dst from memory
-(p[D])	st8 [dst_pre_l2] = n8, 128	// M3 prefetch dst from L2
+(p[D])	st8 [dst_pre_l2] = n[D-C], 128	// M3 prefetch dst from L2
 	;;
 (p[A])	ld8 v[A] = [src_pre_mem], 128	// M0 prefetch src from memory
-(p[C])	ld8 n8 = [src_pre_l2], 128	// M1 prefetch src from L2
+(p[C])	ld8 n[0] = [src_pre_l2], 128	// M1 prefetch src from L2
 (p[D])	st8 [dst0] = t1, 8		// M2
 (p[D])	st8 [dst1] = t3, 8		// M3
 	;;
@@ -172,8 +173,8 @@ GLOBAL_ENTRY(copy_page)
 (p[D])	st8 [dst0] = t12, 8
 (p[D])	st8 [dst1] = t14, 8
 	;;
-(p[C])	ld8 t1 = [src0], 8
-(p[C])	ld8 t3 = [src1], 8
+(p[D-1])ld8 t1 = [src0], 8
+(p[D-1])ld8 t3 = [src1], 8
 (p[D])	st8 [dst0] = t13, 4*8
 (p[D])	st8 [dst1] = t15, 4*8
 	br.ctop.sptk .line_copy
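
For readers who want the whole pipeline in one place, the following C sketch (an illustration only, not code from this patch) mimics the schedule the comment and loop above describe, using plain arrays where the assembly uses rotating registers. PREFETCH_DIST, the function name copy_page_sketch, and the choice of words 0 and 8 for the prefetch stages are assumptions made for the sketch:

#include <stddef.h>
#include <stdint.h>

#define LINE		128		/* L2 line size in bytes */
#define WORDS		(LINE / 8)	/* 8-byte words per L2 line (16) */
#define PREFETCH_DIST	8		/* hypothetical value, see note above */

/* Conceptual model of the modulo-scheduled loop: on iteration i, stage A
 * handles line i, stage B line i-PREFETCH_DIST, stage C line i-2*PREFETCH_DIST,
 * and stage D line i-2*PREFETCH_DIST-3.  Out-of-range line numbers wrap to
 * huge values, so the "< nlines" tests play the role of the p[] predicates. */
void copy_page_sketch(uint64_t *dst, const uint64_t *src, size_t page_size)
{
	uint64_t v[2 * PREFETCH_DIST];	/* stands in for .rotr v[2*PREFETCH_DIST] */
	uint64_t n[4];			/* stands in for .rotr n[D-C+1] */
	size_t nlines = page_size / LINE;

	for (size_t i = 0; i < nlines + 2 * PREFETCH_DIST + 3; i++) {
		size_t a = i;
		size_t b = i - PREFETCH_DIST;
		size_t c = i - 2 * PREFETCH_DIST;
		size_t d = c - 3;

		if (a < nlines)		/* p[A]: load word 0, pulling the src line toward the caches */
			v[a % (2 * PREFETCH_DIST)] = src[a * WORDS];
		if (b < nlines)		/* p[B]: store word 0, pulling the dst line in and copying it */
			dst[b * WORDS] = v[b % (2 * PREFETCH_DIST)];
		if (c < nlines)		/* p[C]: load word 8, bringing the 2nd half of the src line into L1D */
			n[c % 4] = src[c * WORDS + WORDS / 2];
		if (d < nlines) {	/* p[D]: the line is now cached; copy its remaining words */
			dst[d * WORDS + WORDS / 2] = n[d % 4];
			for (int w = 1; w < WORDS; w++)
				if (w != WORDS / 2)
					dst[d * WORDS + w] = src[d * WORDS + w];
		}
	}
}

In the real routine all four stages issue together each cycle under the rotating predicates; the C version only makes the per-line bookkeeping explicit.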