Commit 789c299c authored by Anton Blanchard's avatar Anton Blanchard Committed by Benjamin Herrenschmidt

powerpc: Improve 64bit copy_tofrom_user

Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user.
The loop now does 32 bytes at a time and as well as pairing loads and stores.

A quick test case that reads 8kB over and over shows the improvement:

POWER6: 53% faster
POWER7: 51% faster

#define _XOPEN_SOURCE 500
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>

#define BUFSIZE (8 * 1024)
#define ITERATIONS 10000000

int main()
{
	char tmpfile[] = "/tmp/copy_to_user_testXXXXXX";
	int fd;
	char *buf[BUFSIZE];
	unsigned long i;

	fd = mkstemp(tmpfile);
	if (fd < 0) {
		perror("open");
		exit(1);
	}

	if (write(fd, buf, BUFSIZE) != BUFSIZE) {
		perror("open");
		exit(1);
	}

	for (i = 0; i < 10000000; i++) {
		if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) {
			perror("pread");
			exit(1);
		}
	}

	unlink(tmpfile);

	return 0;
}
Signed-off-by: default avatarAnton Blanchard <anton@samba.org>
Signed-off-by: default avatarBenjamin Herrenschmidt <benh@kernel.crashing.org>
parent 63e6c5b8
......@@ -44,37 +44,55 @@ BEGIN_FTR_SECTION
andi. r0,r4,7
bne .Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
srdi r7,r5,4
20: ld r9,0(r4)
addi r4,r4,-8
mtctr r7
andi. r5,r5,7
bf cr7*4+0,22f
addi r3,r3,8
addi r4,r4,8
mr r8,r9
blt cr1,72f
21: ld r9,8(r4)
70: std r8,8(r3)
22: ldu r8,16(r4)
71: stdu r9,16(r3)
blt cr1,.Ldo_tail /* if < 16 bytes to copy */
srdi r0,r5,5
cmpdi cr1,r0,0
20: ld r7,0(r4)
220: ld r6,8(r4)
addi r4,r4,16
mtctr r0
andi. r0,r5,0x10
beq 22f
addi r3,r3,16
addi r4,r4,-16
mr r9,r7
mr r8,r6
beq cr1,72f
21: ld r7,16(r4)
221: ld r6,24(r4)
addi r4,r4,32
70: std r9,0(r3)
270: std r8,8(r3)
22: ld r9,0(r4)
222: ld r8,8(r4)
71: std r7,16(r3)
271: std r6,24(r3)
addi r3,r3,32
bdnz 21b
72: std r8,8(r3)
72: std r9,0(r3)
272: std r8,8(r3)
andi. r5,r5,0xf
beq+ 3f
addi r3,r3,16
addi r4,r4,16
.Ldo_tail:
bf cr7*4+1,1f
23: lwz r9,8(r4)
addi r3,r3,16
bf cr7*4+0,246f
244: ld r9,0(r4)
addi r4,r4,8
245: std r9,0(r3)
addi r3,r3,8
246: bf cr7*4+1,1f
23: lwz r9,0(r4)
addi r4,r4,4
73: stw r9,0(r3)
addi r3,r3,4
1: bf cr7*4+2,2f
44: lhz r9,8(r4)
44: lhz r9,0(r4)
addi r4,r4,2
74: sth r9,0(r3)
addi r3,r3,2
2: bf cr7*4+3,3f
45: lbz r9,8(r4)
45: lbz r9,0(r4)
75: stb r9,0(r3)
3: li r3,0
blr
......@@ -220,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
131:
addi r3,r3,8
120:
320:
122:
322:
124:
125:
126:
......@@ -229,9 +249,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
129:
133:
addi r3,r3,8
121:
132:
addi r3,r3,8
121:
321:
344:
134:
135:
138:
......@@ -303,18 +325,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
183:
add r3,r3,r7
b 1f
371:
180:
addi r3,r3,8
171:
177:
addi r3,r3,8
170:
172:
370:
372:
176:
178:
addi r3,r3,4
185:
addi r3,r3,4
170:
172:
345:
173:
174:
175:
......@@ -341,11 +367,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
.section __ex_table,"a"
.align 3
.llong 20b,120b
.llong 220b,320b
.llong 21b,121b
.llong 221b,321b
.llong 70b,170b
.llong 270b,370b
.llong 22b,122b
.llong 222b,322b
.llong 71b,171b
.llong 271b,371b
.llong 72b,172b
.llong 272b,372b
.llong 244b,344b
.llong 245b,345b
.llong 23b,123b
.llong 73b,173b
.llong 44b,144b
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment