Commit a792a27c authored by Andrew Morton, committed by Jens Axboe

[PATCH] faster copy_*_user for bad alignments on intel ia32

This patch speeds up copy_*_user for some Intel ia32 processors.  It is
based on work by Mala Anand.

It is a good win.  Around 30% for all src/dest alignments except 32/32.

In this test a fully-cached one gigabyte file was read into an
8192-byte userspace buffer using read(fd, buf, 8192).  The alignment of
the user-side buffer was altered between runs.  This is a PIII.  Times
are in seconds.

User buffer	2.5.41		2.5.41+patch

0x804c000	4.373		4.343
0x804c001	10.024		6.401
0x804c002	10.002		6.347
0x804c003	10.013		6.328
0x804c004	10.105		6.273
0x804c005	10.184		6.323
0x804c006	10.179		6.322
0x804c007	10.185		6.319
0x804c008	9.725		6.347
0x804c009	9.780		6.275
0x804c00a	9.779		6.355
0x804c00b	9.778		6.350
0x804c00c	9.723		6.351
0x804c00d	9.790		6.307
0x804c00e	9.790		6.289
0x804c00f	9.785		6.294
0x804c010	9.727		6.277
0x804c011	9.779		6.251
0x804c012	9.783		6.246
0x804c013	9.786		6.245
0x804c014	9.772		6.063
0x804c015	9.919		6.237
0x804c016	9.920		6.234
0x804c017	9.918		6.237
0x804c018	9.846		6.372
0x804c019	10.060		6.294
0x804c01a	10.049		6.328
0x804c01b	10.041		6.337
0x804c01c	9.931		6.347
0x804c01d	10.013		6.273
0x804c01e	10.020		6.346
0x804c01f	10.016		6.356
0x804c020	4.442		4.366

So `rep;movsl' is slower at all non-cache-aligned offsets.
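
For reference, the measurement loop itself is not part of the patch.  A
minimal userspace sketch of the kind of test described above might look
like this (the file name, the malloc'd buffer and the use of clock() are
assumptions for illustration, not the original harness):

/*
 * Illustration only: read a fully-cached file into an 8192-byte buffer
 * whose address is deliberately misaligned by `offset' bytes, and time it.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	long offset = (argc > 1) ? strtol(argv[1], NULL, 0) : 0;
	char *base = malloc(8192 + 64);		/* room to misalign the buffer */
	char *buf = base + offset;		/* e.g. 0, 1, 2, ... 0x20 */
	int fd = open("bigfile", O_RDONLY);	/* assumed ~1GB, already in page cache */
	clock_t t0 = clock();
	ssize_t n;

	while ((n = read(fd, buf, 8192)) > 0)
		;				/* copy_to_user() dominates here */
	printf("offset %ld: %.3f s\n", offset,
	       (double)(clock() - t0) / CLOCKS_PER_SEC);
	close(fd);
	free(base);
	return 0;
}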

PII is using the PIII alignment.  I don't have a PII any more, but I do
recall that it demonstrated the same behaviour as the PIII.

The patch contains an enhancement (based on careful testing) from
Hirokazu Takahashi <taka@valinux.co.jp>.  In cases where source and
dest have the same alignment, but that alignment is poor, we do a short
copy of a few bytes to bring the two pointers onto a favourable
boundary and then do the big copy.
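
In plain C, the idea (which the patch implements inside __copy_user()'s
inline assembly, keyed off the same mask that movsl_is_ok() tests) looks
roughly like this.  copy_with_prealign() and ALIGN_MASK are illustrative
names, not kernel symbols:

#include <string.h>

/* Illustration only -- not the kernel implementation.  ALIGN_MASK mirrors
 * movsl_mask.mask (7 for PII/PIII/P4). */
#define ALIGN_MASK 7UL

static void copy_with_prealign(void *dst, const void *src, unsigned long n)
{
	char *d = dst;
	const char *s = src;

	/* The pre-copy only helps when src and dst share the same low bits:
	 * then one short byte copy brings BOTH onto an 8-byte boundary. */
	if (n >= 64 && ((((unsigned long)d ^ (unsigned long)s) & ALIGN_MASK) == 0)) {
		unsigned long lead = (0UL - (unsigned long)d) & ALIGN_MASK;

		memcpy(d, s, lead);	/* a few bytes, up to the boundary */
		d += lead;
		s += lead;
		n -= lead;
	}
	memcpy(d, s, n);		/* bulk copy, now favourably aligned;
					   stands in for `rep;movsl' */
}

When the two alignments differ, no single pre-copy can align both pointers
at once, which is why movsl_is_ok() sends those cases to the unrolled
__copy_user_int()/__copy_user_zeroing_int() routines instead.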

And also a bugfix from Hirokazu Takahashi.

As an added bonus, this patch decreases the kernel text by 28 kbytes.
22k of this is in .text and the rest in __ex_table.  I'm not really
sure why .text shrunk so much.

These copy routines have no special-case for constant-sized copies.  So
a lot of uaccess.h becomes dead code with this patch.  The next patch
which uninlines the copy_*_user functions cleans all that up and saves
an additional 5k.
parent 43c8cc21

@@ -6,6 +6,7 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/msr.h>
+#include <asm/uaccess.h>
 #include "cpu.h"
@@ -13,6 +14,11 @@ static int disable_x86_serial_nr __initdata = 1;
 static int disable_P4_HT __initdata = 0;
 extern int trap_init_f00f_bug(void);
+#ifdef INTEL_MOVSL
+struct movsl_mask movsl_mask;	/* alignment at which movsl is preferred for
+				   bulk memory copies */
+#endif
 /*
  * Early probe support logic for ppro memory erratum #50
  *
@@ -348,6 +354,25 @@ static void __init init_intel(struct cpuinfo_x86 *c)
 	/* Work around errata */
 	Intel_errata_workarounds(c);
+#ifdef INTEL_MOVSL
+	/*
+	 * Set up the preferred alignment for movsl bulk memory moves
+	 */
+	switch (c->x86) {
+	case 4:		/* 486: untested */
+		break;
+	case 5:		/* Old Pentia: untested */
+		break;
+	case 6:		/* PII/PIII only like movsl with 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	case 15:	/* P4 is OK down to 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	}
+#endif
 }

@@ -119,6 +119,11 @@ EXPORT_SYMBOL(__clear_user);
 EXPORT_SYMBOL(__generic_copy_from_user);
 EXPORT_SYMBOL(__generic_copy_to_user);
 EXPORT_SYMBOL(strnlen_user);
+#ifdef INTEL_MOVSL
+EXPORT_SYMBOL(movsl_mask);
+EXPORT_SYMBOL(__copy_user_int);
+EXPORT_SYMBOL(__copy_user_zeroing_int);
+#endif
 EXPORT_SYMBOL(pci_alloc_consistent);
 EXPORT_SYMBOL(pci_free_consistent);

@@ -45,8 +45,12 @@ unsigned long
 __generic_copy_to_user(void *to, const void *from, unsigned long n)
 {
 	prefetch(from);
-	if (access_ok(VERIFY_WRITE, to, n))
-		__copy_user(to,from,n);
+	if (access_ok(VERIFY_WRITE, to, n)) {
+		if (movsl_is_ok(to, from, n))
+			__copy_user(to, from, n);
+		else
+			n = __copy_user_int(to, from, n);
+	}
 	return n;
 }
@@ -54,10 +58,14 @@ unsigned long
 __generic_copy_from_user(void *to, const void *from, unsigned long n)
 {
 	prefetchw(to);
-	if (access_ok(VERIFY_READ, from, n))
-		__copy_user_zeroing(to,from,n);
-	else
+	if (access_ok(VERIFY_READ, from, n)) {
+		if (movsl_is_ok(to, from, n))
+			__copy_user_zeroing(to,from,n);
+		else
+			n = __copy_user_zeroing_int(to, from, n);
+	} else {
 		memset(to, 0, n);
+	}
 	return n;
 }
@@ -188,3 +196,191 @@ long strnlen_user(const char *s, long n)
 	:"cc");
 	return res & mask;
 }
+
+#ifdef INTEL_MOVSL
+/*
+ * Copy To/From Userspace
+ */
+
+/* Generic arbitrary sized copy. */
+unsigned long __copy_user_int(void *to, const void *from, unsigned long size)
+{
+	int d0, d1;
+	__asm__ __volatile__(
+		" .align 2,0x90\n"
+		"0: movl 32(%4), %%eax\n"
+		" cmpl $67, %0\n"
+		" jbe 1f\n"
+		" movl 64(%4), %%eax\n"
+		" .align 2,0x90\n"
+		"1: movl 0(%4), %%eax\n"
+		" movl 4(%4), %%edx\n"
+		"2: movl %%eax, 0(%3)\n"
+		"21: movl %%edx, 4(%3)\n"
+		" movl 8(%4), %%eax\n"
+		" movl 12(%4),%%edx\n"
+		"3: movl %%eax, 8(%3)\n"
+		"31: movl %%edx, 12(%3)\n"
+		" movl 16(%4), %%eax\n"
+		" movl 20(%4), %%edx\n"
+		"4: movl %%eax, 16(%3)\n"
+		"41: movl %%edx, 20(%3)\n"
+		" movl 24(%4), %%eax\n"
+		" movl 28(%4), %%edx\n"
+		"10: movl %%eax, 24(%3)\n"
+		"51: movl %%edx, 28(%3)\n"
+		" movl 32(%4), %%eax\n"
+		" movl 36(%4), %%edx\n"
+		"11: movl %%eax, 32(%3)\n"
+		"61: movl %%edx, 36(%3)\n"
+		" movl 40(%4), %%eax\n"
+		" movl 44(%4), %%edx\n"
+		"12: movl %%eax, 40(%3)\n"
+		"71: movl %%edx, 44(%3)\n"
+		" movl 48(%4), %%eax\n"
+		" movl 52(%4), %%edx\n"
+		"13: movl %%eax, 48(%3)\n"
+		"81: movl %%edx, 52(%3)\n"
+		" movl 56(%4), %%eax\n"
+		" movl 60(%4), %%edx\n"
+		"14: movl %%eax, 56(%3)\n"
+		"91: movl %%edx, 60(%3)\n"
+		" addl $-64, %0\n"
+		" addl $64, %4\n"
+		" addl $64, %3\n"
+		" cmpl $63, %0\n"
+		" ja 0b\n"
+		"5: movl %0, %%eax\n"
+		" shrl $2, %0\n"
+		" andl $3, %%eax\n"
+		" cld\n"
+		"6: rep; movsl\n"
+		" movl %%eax, %0\n"
+		"7: rep; movsb\n"
+		"8:\n"
+		".section .fixup,\"ax\"\n"
+		"9: lea 0(%%eax,%0,4),%0\n"
+		" jmp 8b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		" .align 4\n"
+		" .long 2b,8b\n"
+		" .long 21b,8b\n"
+		" .long 3b,8b\n"
+		" .long 31b,8b\n"
+		" .long 4b,8b\n"
+		" .long 41b,8b\n"
+		" .long 10b,8b\n"
+		" .long 51b,8b\n"
+		" .long 11b,8b\n"
+		" .long 61b,8b\n"
+		" .long 12b,8b\n"
+		" .long 71b,8b\n"
+		" .long 13b,8b\n"
+		" .long 81b,8b\n"
+		" .long 14b,8b\n"
+		" .long 91b,8b\n"
+		" .long 6b,9b\n"
+		" .long 7b,8b\n"
+		".previous"
+		: "=&c"(size), "=&D" (d0), "=&S" (d1)
+		: "1"(to), "2"(from), "0"(size)
+		: "eax", "edx", "memory");
+	return size;
+}
+
+unsigned long
+__copy_user_zeroing_int(void *to, const void *from, unsigned long size)
+{
+	int d0, d1;
+	__asm__ __volatile__(
+		" .align 2,0x90\n"
+		"0: movl 32(%4), %%eax\n"
+		" cmpl $67, %0\n"
+		" jbe 2f\n"
+		"1: movl 64(%4), %%eax\n"
+		" .align 2,0x90\n"
+		"2: movl 0(%4), %%eax\n"
+		"21: movl 4(%4), %%edx\n"
+		" movl %%eax, 0(%3)\n"
+		" movl %%edx, 4(%3)\n"
+		"3: movl 8(%4), %%eax\n"
+		"31: movl 12(%4),%%edx\n"
+		" movl %%eax, 8(%3)\n"
+		" movl %%edx, 12(%3)\n"
+		"4: movl 16(%4), %%eax\n"
+		"41: movl 20(%4), %%edx\n"
+		" movl %%eax, 16(%3)\n"
+		" movl %%edx, 20(%3)\n"
+		"10: movl 24(%4), %%eax\n"
+		"51: movl 28(%4), %%edx\n"
+		" movl %%eax, 24(%3)\n"
+		" movl %%edx, 28(%3)\n"
+		"11: movl 32(%4), %%eax\n"
+		"61: movl 36(%4), %%edx\n"
+		" movl %%eax, 32(%3)\n"
+		" movl %%edx, 36(%3)\n"
+		"12: movl 40(%4), %%eax\n"
+		"71: movl 44(%4), %%edx\n"
+		" movl %%eax, 40(%3)\n"
+		" movl %%edx, 44(%3)\n"
+		"13: movl 48(%4), %%eax\n"
+		"81: movl 52(%4), %%edx\n"
+		" movl %%eax, 48(%3)\n"
+		" movl %%edx, 52(%3)\n"
+		"14: movl 56(%4), %%eax\n"
+		"91: movl 60(%4), %%edx\n"
+		" movl %%eax, 56(%3)\n"
+		" movl %%edx, 60(%3)\n"
+		" addl $-64, %0\n"
+		" addl $64, %4\n"
+		" addl $64, %3\n"
+		" cmpl $63, %0\n"
+		" ja 0b\n"
+		"5: movl %0, %%eax\n"
+		" shrl $2, %0\n"
+		" andl $3, %%eax\n"
+		" cld\n"
+		"6: rep; movsl\n"
+		" movl %%eax,%0\n"
+		"7: rep; movsb\n"
+		"8:\n"
+		".section .fixup,\"ax\"\n"
+		"9: lea 0(%%eax,%0,4),%0\n"
+		"16: pushl %0\n"
+		" pushl %%eax\n"
+		" xorl %%eax,%%eax\n"
+		" rep; stosb\n"
+		" popl %%eax\n"
+		" popl %0\n"
+		" jmp 8b\n"
+		".previous\n"
+		".section __ex_table,\"a\"\n"
+		" .align 4\n"
+		" .long 0b,16b\n"
+		" .long 1b,16b\n"
+		" .long 2b,16b\n"
+		" .long 21b,16b\n"
+		" .long 3b,16b\n"
+		" .long 31b,16b\n"
+		" .long 4b,16b\n"
+		" .long 41b,16b\n"
+		" .long 10b,16b\n"
+		" .long 51b,16b\n"
+		" .long 11b,16b\n"
+		" .long 61b,16b\n"
+		" .long 12b,16b\n"
+		" .long 71b,16b\n"
+		" .long 13b,16b\n"
+		" .long 81b,16b\n"
+		" .long 14b,16b\n"
+		" .long 91b,16b\n"
+		" .long 6b,9b\n"
+		" .long 7b,16b\n"
+		".previous"
+		: "=&c"(size), "=&D" (d0), "=&S" (d1)
+		: "1"(to), "2"(from), "0"(size)
+		: "eax", "edx", "memory");
+	return size;
+}
+#endif	/* INTEL_MOVSL */

@@ -33,7 +33,39 @@
 #define segment_eq(a,b)	((a).seg == (b).seg)
-extern int __verify_write(const void *, unsigned long);
+/*
+ * movsl can be slow when source and dest are not both 8-byte aligned
+ */
+#if defined(CONFIG_M586MMX) || defined(CONFIG_M686) || \
+	defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUM4)
+#define INTEL_MOVSL
+#endif
+
+#ifdef INTEL_MOVSL
+extern struct movsl_mask {
+	int mask;
+} ____cacheline_aligned_in_smp movsl_mask;
+
+static inline int movsl_is_ok(const void *a1, const void *a2, unsigned long n)
+{
+	if (n < 64)
+		return 1;
+	if ((((const long)a1 ^ (const long)a2) & movsl_mask.mask) == 0)
+		return 1;
+	return 0;
+}
+#else
+static inline int movsl_is_ok(const void *a1, const void *a2, unsigned long n)
+{
+	return 1;
+}
+#endif
+
+/* These are undefined on !INTEL_MOVSL. And they should be unreferenced. */
+unsigned long __copy_user_int(void *, const void *, unsigned long);
+unsigned long __copy_user_zeroing_int(void *, const void *, unsigned long);
+
+int __verify_write(const void *, unsigned long);
 #define __addr_ok(addr) ((unsigned long)(addr) < (current_thread_info()->addr_limit.seg))
@@ -255,37 +287,64 @@ do { \
 /* Generic arbitrary sized copy. */
 #define __copy_user(to,from,size) \
 do { \
-	int __d0, __d1; \
+	int __d0, __d1, __d2; \
 	__asm__ __volatile__( \
+		" cmp $7,%0\n" \
+		" jbe 1f\n" \
+		" movl %1,%0\n" \
+		" negl %0\n" \
+		" andl $7,%0\n" \
+		" subl %0,%3\n" \
+		"4: rep; movsb\n" \
+		" movl %3,%0\n" \
+		" shrl $2,%0\n" \
+		" andl $3,%3\n" \
+		" .align 2,0x90\n" \
 		"0: rep; movsl\n" \
 		" movl %3,%0\n" \
 		"1: rep; movsb\n" \
 		"2:\n" \
 		".section .fixup,\"ax\"\n" \
+		"5: addl %3,%0\n" \
+		" jmp 2b\n" \
 		"3: lea 0(%3,%0,4),%0\n" \
 		" jmp 2b\n" \
 		".previous\n" \
 		".section __ex_table,\"a\"\n" \
 		" .align 4\n" \
+		" .long 4b,5b\n" \
 		" .long 0b,3b\n" \
 		" .long 1b,2b\n" \
 		".previous" \
-		: "=&c"(size), "=&D" (__d0), "=&S" (__d1) \
-		: "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+		: "3"(size), "0"(size), "1"(to), "2"(from) \
 		: "memory"); \
 } while (0)
 
 #define __copy_user_zeroing(to,from,size) \
 do { \
-	int __d0, __d1; \
+	int __d0, __d1, __d2; \
 	__asm__ __volatile__( \
+		" cmp $7,%0\n" \
+		" jbe 1f\n" \
+		" movl %1,%0\n" \
+		" negl %0\n" \
+		" andl $7,%0\n" \
+		" subl %0,%3\n" \
+		"4: rep; movsb\n" \
+		" movl %3,%0\n" \
+		" shrl $2,%0\n" \
+		" andl $3,%3\n" \
+		" .align 2,0x90\n" \
		"0: rep; movsl\n" \
 		" movl %3,%0\n" \
 		"1: rep; movsb\n" \
 		"2:\n" \
 		".section .fixup,\"ax\"\n" \
+		"5: addl %3,%0\n" \
+		" jmp 6f\n" \
 		"3: lea 0(%3,%0,4),%0\n" \
-		"4: pushl %0\n" \
+		"6: pushl %0\n" \
 		" pushl %%eax\n" \
 		" xorl %%eax,%%eax\n" \
 		" rep; stosb\n" \
@@ -295,28 +354,37 @@ do { \
 		".previous\n" \
 		".section __ex_table,\"a\"\n" \
 		" .align 4\n" \
+		" .long 4b,5b\n" \
 		" .long 0b,3b\n" \
-		" .long 1b,4b\n" \
+		" .long 1b,6b\n" \
 		".previous" \
-		: "=&c"(size), "=&D" (__d0), "=&S" (__d1) \
-		: "r"(size & 3), "0"(size / 4), "1"(to), "2"(from) \
+		: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+		: "3"(size), "0"(size), "1"(to), "2"(from) \
 		: "memory"); \
 } while (0)
 
 /* We let the __ versions of copy_from/to_user inline, because they're often
  * used in fast paths and have only a small space overhead.
  */
 static inline unsigned long
 __generic_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
 {
-	__copy_user_zeroing(to,from,n);
+	if (movsl_is_ok(to, from, n))
+		__copy_user_zeroing(to, from, n);
+	else
+		n = __copy_user_zeroing_int(to, from, n);
 	return n;
 }
 
 static inline unsigned long
 __generic_copy_to_user_nocheck(void *to, const void *from, unsigned long n)
 {
-	__copy_user(to,from,n);
+	if (movsl_is_ok(to, from, n))
+		__copy_user(to, from, n);
+	else
+		n = __copy_user_int(to, from, n);
 	return n;
 }
@@ -578,24 +646,16 @@ __constant_copy_from_user_nocheck(void *to, const void *from, unsigned long n)
 }
 
 #define copy_to_user(to,from,n) \
-	(__builtin_constant_p(n) ? \
-	 __constant_copy_to_user((to),(from),(n)) : \
-	 __generic_copy_to_user((to),(from),(n)))
+	__generic_copy_to_user((to),(from),(n))
 
 #define copy_from_user(to,from,n) \
-	(__builtin_constant_p(n) ? \
-	 __constant_copy_from_user((to),(from),(n)) : \
-	 __generic_copy_from_user((to),(from),(n)))
+	__generic_copy_from_user((to),(from),(n))
 
 #define __copy_to_user(to,from,n) \
-	(__builtin_constant_p(n) ? \
-	 __constant_copy_to_user_nocheck((to),(from),(n)) : \
-	 __generic_copy_to_user_nocheck((to),(from),(n)))
+	__generic_copy_to_user_nocheck((to),(from),(n))
 
 #define __copy_from_user(to,from,n) \
-	(__builtin_constant_p(n) ? \
-	 __constant_copy_from_user_nocheck((to),(from),(n)) : \
-	 __generic_copy_from_user_nocheck((to),(from),(n)))
+	__generic_copy_from_user_nocheck((to),(from),(n))
 
 long strncpy_from_user(char *dst, const char *src, long count);
 long __strncpy_from_user(char *dst, const char *src, long count);