[SPARC64]: Revamped memcpy infrastructure.

- Make it easier to maintain the Ultra-I vs. Ultra-III memcpy implementations. Before you had to maintain 3 different entire copies of the routines.
- Kill %asi register writing Ultra-I single memcpy loop for both user and kernel. Was not worth it.
- Simplify exception detection and handling enormously.

Signed-off-by: David S. Miller <davem@redhat.com>

[SPARC64]: Revamped memcpy infrastructure.
- Make it easier to maintain the Ultra-I vs. Ultra-III memcpy implementations. Before you had to maintain 3 different entire copies of the routines.
- Kill %asi register writing Ultra-I single memcpy loop for both user and kernel. Was not worth it.
- Simplify exception detection and handling enormously.

Signed-off-by: David S. Miller <davem@redhat.com>
2657fd8f · David S. Miller · d360f1ee · 2657fd8f · 2657fd8f · 2657fd8f
Commit 2657fd8f authored Aug 23, 2004 by David S. Miller
15 changed files
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -334,7 +334,6 @@ EXPORT_SYMBOL(sys_close);
 #endif

 /* Special internal versions of library functions. */
-EXPORT_SYMBOL(__memset);
 EXPORT_SYMBOL(_clear_page);
 EXPORT_SYMBOL(clear_user_page);
 EXPORT_SYMBOL(copy_user_page);
@@ -343,7 +342,7 @@ EXPORT_SYMBOL(__memscan_zero);
 EXPORT_SYMBOL(__memscan_generic);
 EXPORT_SYMBOL(__memcmp);
 EXPORT_SYMBOL(__strncmp);
-EXPORT_SYMBOL(__memmove);
+EXPORT_SYMBOL(__memset);
 EXPORT_SYMBOL(memchr);

 EXPORT_SYMBOL(csum_partial);
@@ -351,9 +350,12 @@ EXPORT_SYMBOL(csum_partial_copy_sparc64);
 EXPORT_SYMBOL(ip_fast_csum);

 /* Moving data to/from/in userspace. */
-EXPORT_SYMBOL(__copy_to_user);
-EXPORT_SYMBOL(__copy_from_user);
-EXPORT_SYMBOL(__copy_in_user);
+EXPORT_SYMBOL(___copy_to_user);
+EXPORT_SYMBOL(___copy_from_user);
+EXPORT_SYMBOL(___copy_in_user);
+EXPORT_SYMBOL(copy_to_user_fixup);
+EXPORT_SYMBOL(copy_from_user_fixup);
+EXPORT_SYMBOL(copy_in_user_fixup);
 EXPORT_SYMBOL(__strncpy_from_user);
 EXPORT_SYMBOL(__bzero_noasi);


--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -7,11 +7,12 @@ EXTRA_CFLAGS := -Werror

 lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
 	 memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
-	 VIScopy.o VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \
+	 VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \
 	 VIScsumcopyusr.o VISsave.o atomic.o rwlock.o bitops.o \
-	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o \
-	 U3copy_in_user.o mcount.o ipcsum.o rwsem.o xor.o splock.o \
-	 find_bit.o
+	 U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
+	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
+	 copy_in_user.o user_fixup.o memmove.o \
+	 mcount.o ipcsum.o rwsem.o xor.o splock.o find_bit.o

 lib-$(CONFIG_DEBUG_SPINLOCK) += debuglocks.o
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
--- a/arch/sparc64/lib/U1copy_from_user.S
+++ b/arch/sparc64/lib/U1copy_from_user.S
+/* U1copy_from_user.S: UltraSparc-I/II/IIi/IIe optimized copy from userspace.
+ *
+ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
+ */
+
+/* EX_LD(x): wrap one load instruction 'x' that may fault.
+ *
+ * The instruction is emitted at local label 98 and a (98b, 99b) pair
+ * is added to __ex_table.  The code at 99 lives in .fixup and simply
+ * returns 1 in %o0 (the mov executes in the delay slot of retl).
+ * Afterwards assembly resumes in .text, 4-byte aligned.
+ *
+ * The inline copy_from_user() wrapper in uaccess.h treats any non-zero
+ * return value as the cue to call copy_from_user_fixup().
+ *
+ * NOTE(review): presumably the fault handler uses the __ex_table pair
+ * to redirect a fault at 98 to 99, and U1memcpy.S applies EX_LD to
+ * each of its loads -- neither is visible here, confirm.
+ */
+#define EX_LD(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	retl;			\
+	 mov	1, %o0;		\
+	.section __ex_table;	\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+/* Parameters for the U1memcpy.S template included below:
+ *
+ *   FUNC_NAME  - symbol name given to this instance, ___copy_from_user.
+ *   LOAD       - pastes an 'a' suffix onto the mnemonic passed as 'type'
+ *                and performs the access through the ASI held in %asi.
+ *   LOAD_BLK   - ldda using the hard-coded ASI_BLK_AIUS.
+ *   EX_RETVAL  - ignores its argument and yields 0.
+ *
+ * NOTE(review): how U1memcpy.S consumes these macros is not visible
+ * here; presumably EX_RETVAL selects the success return value.
+ */
+#define FUNC_NAME		___copy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
+#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_AIUS, dest
+#define EX_RETVAL(x)		0
+
+#include "U1memcpy.S"
--- a/arch/sparc64/lib/U1copy_to_user.S
+++ b/arch/sparc64/lib/U1copy_to_user.S
+/* U1copy_to_user.S: UltraSparc-I/II/IIi/IIe optimized copy to userspace.
+ *
+ * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
+ */
+
+/* EX_ST(x): wrap one store instruction 'x' that may fault.
+ *
+ * The instruction is emitted at local label 98 and a (98b, 99b) pair
+ * is added to __ex_table.  The code at 99 lives in .fixup and simply
+ * returns 1 in %o0 (the mov executes in the delay slot of retl).
+ * Afterwards assembly resumes in .text, 4-byte aligned.
+ *
+ * The inline copy_to_user() wrapper in uaccess.h treats any non-zero
+ * return value as the cue to call copy_to_user_fixup().
+ *
+ * NOTE(review): presumably the fault handler uses the __ex_table pair
+ * to redirect a fault at 98 to 99, and U1memcpy.S applies EX_ST to
+ * each of its stores -- neither is visible here, confirm.
+ */
+#define EX_ST(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	retl;			\
+	 mov	1, %o0;		\
+	.section __ex_table;	\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+/* Parameters for the U1memcpy.S template included below:
+ *
+ *   FUNC_NAME  - symbol name given to this instance, ___copy_to_user.
+ *   STORE      - pastes an 'a' suffix onto the mnemonic passed as 'type'
+ *                and stores through the hard-coded ASI_AIUS.
+ *   STORE_BLK  - stda using the hard-coded ASI_BLK_AIUS.
+ *   EX_RETVAL  - ignores its argument and yields 0.
+ *
+ * NOTE(review): how U1memcpy.S consumes these macros is not visible
+ * here; presumably EX_RETVAL selects the success return value.
+ */
+#define FUNC_NAME		___copy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] ASI_AIUS
+#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_AIUS
+#define EX_RETVAL(x)		0
+
+	/* PREAMBLE: guard for the hard-coded user ASI.
+	 *
+	 * Writing to %asi is _expensive_ so the stores in this file
+	 * hardcode ASI_AIUS instead.  Reading %asi to check for
+	 * KERNEL_DS is comparatively cheap: the current %asi is read
+	 * into %g1 and, if it is not ASI_AIUS, control branches to
+	 * memcpy_user_stub, which does a plain memcpy and returns 0.
+	 * %g1 is clobbered.
+	 *
+	 * NOTE(review): presumably U1memcpy.S expands PREAMBLE at
+	 * function entry -- not visible here, confirm.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop;						\
+
+#include "U1memcpy.S"
--- a/arch/sparc64/lib/U1memcpy.S
+++ b/arch/sparc64/lib/U1memcpy.S
--- a/arch/sparc64/lib/U3copy_from_user.S
+++ b/arch/sparc64/lib/U3copy_from_user.S
@@ -3,410 +3,20 @@
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

-#include <asm/visasm.h>
-#include <asm/asi.h>
-#include <asm/dcu.h>
-#include <asm/spitfire.h>
-
-#define XCC xcc
-
-#define EXNV_RAW(x,y,a,b)		\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	ba U3cfu_fixup;			\
-	 a, b, %o1;			\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	add %o1, %o3, %o0;		\
-	ba U3cfu_fixup;			\
-	 a, b, %o1;			\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV4(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	add %o1, %o3, %o0;		\
-	a, b, %o1;			\
-	ba U3cfu_fixup;			\
-	 add %o1, 4, %o1;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV8(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	add %o1, %o3, %o0;		\
-	a, b, %o1;			\
-	ba U3cfu_fixup;			\
-	 add %o1, 8, %o1;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EX(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	ba U3cfu_fixup;			\
-	 a, b, %o1;			\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EX2(x,y)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	and %o2, (0x40 - 1), %o1;	\
-	add %o1, %o4, %o1;		\
-	ba U3cfu_fixup;			\
-	 add %o1, 0x1c0, %o1;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EX3(x,y)			\
-98:	x,y;				\
+#define EX_LD(x)		\
+98:	x;			\
 	.section .fixup;	\
 	.align 4;		\
-99:	VISExitHalf;			\
-	and %o2, (0x40 - 1), %o1;	\
-	sll %g3, 6, %g3;		\
-	add %o1, 0x80, %o1;		\
-	ba U3cfu_fixup;			\
-	 add %o1, %g3, %o1;		\
+99:	retl;			\
+	 mov	1, %o0;		\
 	.section __ex_table;	\
 	.align 4;		\
 	.word 98b, 99b;		\
 	.text;			\
 	.align 4;
-#define EX4(x,y)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	and %o2, (0x40 - 1), %o1;	\
-	add %o1, 0x40, %o1;		\
-	ba U3cfu_fixup;			\
-	 add %o1, %g3, %o1;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-
-	.register	%g2,#scratch
-	.register	%g3,#scratch
-
-	/* Special/non-trivial issues of this code:
-	 *
-	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
-	 * 2) Only low 32 FPU registers are used so that only the
-	 *    lower half of the FPU register set is dirtied by this
-	 *    code.  This is especially important in the kernel.
-	 * 3) This code never prefetches cachelines past the end
-	 *    of the source buffer.
-	 */
-
-	.text
-	.align	32
-
-	/* The cheetah's flexible spine, oversized liver, enlarged heart,
-	 * slender muscular body, and claws make it the swiftest hunter
-	 * in Africa and the fastest animal on land.  Can reach speeds
-	 * of up to 2.4GB per second.
-	 */
-
-	.globl	U3copy_from_user
-U3copy_from_user:	/* %o0=dst, %o1=src, %o2=len */
-	cmp		%o2, 0
-	be,pn		%XCC, 85f
-	 or		%o0, %o1, %o3
-	cmp		%o2, 16
-	bleu,a,pn	%XCC, 80f
-	 or		%o3, %o2, %o3
-
-	cmp		%o2, 256
-	blu,pt		%XCC, 70f
-	 andcc		%o3, 0x7, %g0
-
-	ba,pt		%xcc, 1f
-	 andcc		%o0, 0x3f, %g2
-
-	/* Here len >= 256 and condition codes reflect execution
-	 * of "andcc %o0, 0x7, %g2", done by caller.
-	 */
-	.align		64
-1:
-	/* Is 'dst' already aligned on an 64-byte boundary? */
-	be,pt		%XCC, 2f
-
-	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
-	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
-	 * subtract this from 'len'.
-	 */
-	 sub		%g2, 0x40, %g2
-	sub		%g0, %g2, %g2
-	sub		%o2, %g2, %o2
-
-	/* Copy %g2 bytes from src to dst, one byte at a time. */
-1:	EXNV_RAW(lduba [%o1 + 0x00] %asi, %o3, add %o2, %g2)
-	add		%o1, 0x1, %o1
-	add		%o0, 0x1, %o0
-	subcc		%g2, 0x1, %g2
-
-	bg,pt		%XCC, 1b
-	 stb		%o3, [%o0 + -1]
-
-2:	VISEntryHalf
-	and		%o1, 0x7, %g1
-	ba,pt		%xcc, 1f
-	 alignaddr	%o1, %g0, %o1
-
-	.align		64
-1:
-	membar		#StoreLoad | #StoreStore | #LoadStore
-	prefetcha	[%o1 + 0x000] %asi, #one_read
-	prefetcha	[%o1 + 0x040] %asi, #one_read
-	andn		%o2, (0x40 - 1), %o4
-	prefetcha	[%o1 + 0x080] %asi, #one_read
-	prefetcha	[%o1 + 0x0c0] %asi, #one_read
-	EX(ldda [%o1 + 0x000] %asi, %f0, add %o2, %g0)
-	prefetcha	[%o1 + 0x100] %asi, #one_read
-	EX(ldda [%o1 + 0x008] %asi, %f2, add %o2, %g0)
-	prefetcha	[%o1 + 0x140] %asi, #one_read
-	EX(ldda [%o1 + 0x010] %asi, %f4, add %o2, %g0)
-	prefetcha	[%o1 + 0x180] %asi, #one_read
-	faligndata	%f0, %f2, %f16
-	EX(ldda [%o1 + 0x018] %asi, %f6, add %o2, %g0)
-	faligndata	%f2, %f4, %f18
-	EX(ldda [%o1 + 0x020] %asi, %f8, add %o2, %g0)
-	faligndata	%f4, %f6, %f20
-	EX(ldda [%o1 + 0x028] %asi, %f10, add %o2, %g0)
-	faligndata	%f6, %f8, %f22
-
-	EX(ldda [%o1 + 0x030] %asi, %f12, add %o2, %g0)
-	faligndata	%f8, %f10, %f24
-	EX(ldda [%o1 + 0x038] %asi, %f14, add %o2, %g0)
-	faligndata	%f10, %f12, %f26
-	EX(ldda [%o1 + 0x040] %asi, %f0, add %o2, %g0)
-
-	sub		%o4, 0x80, %o4
-	add		%o1, 0x40, %o1
-	ba,pt		%xcc, 1f
-	 srl		%o4, 6, %o3
-
-	.align		64
-1:
-	EX3(ldda [%o1 + 0x008] %asi, %f2)
-	faligndata	%f12, %f14, %f28
-	EX3(ldda [%o1 + 0x010] %asi, %f4)
-	faligndata	%f14, %f0, %f30
-	stda		%f16, [%o0] ASI_BLK_P
-	EX3(ldda [%o1 + 0x018] %asi, %f6)
-	faligndata	%f0, %f2, %f16
-
-	EX3(ldda [%o1 + 0x020] %asi, %f8)
-	faligndata	%f2, %f4, %f18
-	EX3(ldda [%o1 + 0x028] %asi, %f10)
-	faligndata	%f4, %f6, %f20
-	EX3(ldda [%o1 + 0x030] %asi, %f12)
-	faligndata	%f6, %f8, %f22
-	EX3(ldda [%o1 + 0x038] %asi, %f14)
-	faligndata	%f8, %f10, %f24
-
-	EX3(ldda [%o1 + 0x040] %asi, %f0)
-	prefetcha	[%o1 + 0x180] %asi, #one_read
-	faligndata	%f10, %f12, %f26
-	subcc		%o3, 0x01, %o3
-	add		%o1, 0x40, %o1
-	bg,pt		%XCC, 1b
-	 add		%o0, 0x40, %o0
-
-	/* Finally we copy the last full 64-byte block. */
-	EX3(ldda [%o1 + 0x008] %asi, %f2)
-	faligndata	%f12, %f14, %f28
-	EX3(ldda [%o1 + 0x010] %asi, %f4)
-	faligndata	%f14, %f0, %f30
-	stda		%f16, [%o0] ASI_BLK_P
-	EX3(ldda [%o1 + 0x018] %asi, %f6)
-	faligndata	%f0, %f2, %f16
-	EX3(ldda [%o1 + 0x020] %asi, %f8)
-	faligndata	%f2, %f4, %f18
-	EX3(ldda [%o1 + 0x028] %asi, %f10)
-	faligndata	%f4, %f6, %f20
-	EX3(ldda [%o1 + 0x030] %asi, %f12)
-	faligndata	%f6, %f8, %f22
-	EX3(ldda [%o1 + 0x038] %asi, %f14)
-	faligndata	%f8, %f10, %f24
-	cmp		%g1, 0
-	be,pt		%XCC, 1f
-	 add		%o0, 0x40, %o0
-	EX4(ldda [%o1 + 0x040] %asi, %f0)
-1:	faligndata	%f10, %f12, %f26
-	faligndata	%f12, %f14, %f28
-	faligndata	%f14, %f0, %f30
-	stda		%f16, [%o0] ASI_BLK_P
-	add		%o0, 0x40, %o0
-	add		%o1, 0x40, %o1
-
-	membar		#Sync
-
-	/* Now we copy the (len modulo 64) bytes at the end.
-	 * Note how we borrow the %f0 loaded above.
-	 *
-	 * Also notice how this code is careful not to perform a
-	 * load past the end of the src buffer.
-	 */
-	and		%o2, 0x3f, %o2
-	andcc		%o2, 0x38, %g2
-	be,pn		%XCC, 10f
-	 subcc		%g2, 0x8, %g2
-	be,pn		%XCC, 10f
-	 cmp		%g1, 0
-
-	be,a,pt		%XCC, 1f
-	 EX(ldda [%o1 + 0x00] %asi, %f0, add %o2, %g0)
-
-1:	EX(ldda [%o1 + 0x08] %asi, %f2, add %o2, %g0)
-	add		%o1, 0x8, %o1
-	sub		%o2, 0x8, %o2
-	subcc		%g2, 0x8, %g2
-	faligndata	%f0, %f2, %f8
-	std		%f8, [%o0 + 0x00]
-	be,pn		%XCC, 10f
-	 add		%o0, 0x8, %o0
-	EX(ldda [%o1 + 0x08] %asi, %f0, add %o2, %g0)
-	add		%o1, 0x8, %o1
-	sub		%o2, 0x8, %o2
-	subcc		%g2, 0x8, %g2
-	faligndata	%f2, %f0, %f8
-	std		%f8, [%o0 + 0x00]
-	bne,pn		%XCC, 1b
-	 add		%o0, 0x8, %o0
-
-	/* If anything is left, we copy it one byte at a time.
-	 * Note that %g1 is (src & 0x3) saved above before the
-	 * alignaddr was performed.
-	 */
-10:
-	cmp		%o2, 0
-	add		%o1, %g1, %o1
-	VISExitHalf
-	be,pn		%XCC, 85f
-	 sub		%o0, %o1, %o3
-
-	andcc		%g1, 0x7, %g0
-	bne,pn		%icc, 90f
-	 andcc		%o2, 0x8, %g0
-	be,pt		%icc, 1f
-	 nop
-	EXNV(ldxa [%o1] %asi, %o5, add %o2, %g0)
-	stx		%o5, [%o1 + %o3]
-	add		%o1, 0x8, %o1
-
-1:	andcc		%o2, 0x4, %g0
-	be,pt		%icc, 1f
-	 nop
-	EXNV(lduwa [%o1] %asi, %o5, and %o2, 0x7)
-	stw		%o5, [%o1 + %o3]
-	add		%o1, 0x4, %o1
-
-1:	andcc		%o2, 0x2, %g0
-	be,pt		%icc, 1f
-	 nop
-	EXNV(lduha [%o1] %asi, %o5, and %o2, 0x3)
-	sth		%o5, [%o1 + %o3]
-	add		%o1, 0x2, %o1
-
-1:	andcc		%o2, 0x1, %g0
-	be,pt		%icc, 85f
-	 nop
-	EXNV(lduba [%o1] %asi, %o5, and %o2, 0x1)
-	ba,pt		%xcc, 85f
-	 stb		%o5, [%o1 + %o3]
-
-70: /* 16 < len <= 64 */
-	bne,pn		%XCC, 90f
-	 sub		%o0, %o1, %o3
-
-	andn		%o2, 0x7, %o4
-	and		%o2, 0x7, %o2
-1:	subcc		%o4, 0x8, %o4
-	EXNV8(ldxa [%o1] %asi, %o5, add %o2, %o4)
-	stx		%o5, [%o1 + %o3]
-	bgu,pt		%XCC, 1b
-	 add		%o1, 0x8, %o1
-	andcc		%o2, 0x4, %g0
-	be,pt		%XCC, 1f
-	 nop
-	sub		%o2, 0x4, %o2
-	EXNV4(lduwa [%o1] %asi, %o5, add %o2, %g0)
-	stw		%o5, [%o1 + %o3]
-	add		%o1, 0x4, %o1
-1:	cmp		%o2, 0
-	be,pt		%XCC, 85f
-	 nop
-	ba,pt		%xcc, 90f
-	 nop
-
-80: /* 0 < len <= 16 */
-	andcc		%o3, 0x3, %g0
-	bne,pn		%XCC, 90f
-	 sub		%o0, %o1, %o3
-
-1:
-	subcc		%o2, 4, %o2
-	EXNV(lduwa [%o1] %asi, %g1, add %o2, %g0)
-	stw		%g1, [%o1 + %o3]
-	bgu,pt		%XCC, 1b
-	 add		%o1, 4, %o1
-
-85:	retl
-	 clr		%o0
-
-	.align	32
-90:
-	subcc		%o2, 1, %o2
-	EXNV(lduba [%o1] %asi, %g1, add %o2, %g0)
-	stb		%g1, [%o1 + %o3]
-	bgu,pt		%XCC, 90b
-	 add		%o1, 1, %o1
-	retl
-	 clr		%o0
-
-U3cfu_fixup:
-	/* Since this is copy_from_user(), zero out the rest of the
-	 * kernel buffer.
-	 */
-	cmp		%o1, 0
-	ble,pn		%icc, 2f
-	 mov		%o1, %g2

-1:	subcc		%g2, 1, %g2
-	stb		%g0, [%o0]
-	bne,pt		%icc, 1b
-	 add		%o0, 1, %o0
+#define FUNC_NAME		U3copy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
+#define EX_RETVAL(x)		0

-2:	retl
-	 mov		%o1, %o0
+#include "U3memcpy.S"
--- a/arch/sparc64/lib/U3copy_to_user.S
+++ b/arch/sparc64/lib/U3copy_to_user.S
-/* U3copy_to_user.S: UltraSparc-III optimized memcpy.
+/* U3copy_to_user.S: UltraSparc-III optimized copy to userspace.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

-#include <asm/visasm.h>
-#include <asm/asi.h>
-#include <asm/dcu.h>
-#include <asm/spitfire.h>
-
-#define XCC xcc
-
-#define EXNV(x,y,a,b)	\
-98:	x,y;				\
+#define EX_ST(x)		\
+98:	x;			\
 	.section .fixup;	\
 	.align 4;		\
 99:	retl;			\
-	 a, b, %o0;			\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV2(x,y,a,b)	\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	a, b, %o0;			\
-	retl;				\
-	 add %o0, 1, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV3(x,y,a,b)	\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	a, b, %o0;			\
-	retl;				\
-	 add %o0, 4, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV4(x,y,a,b)	\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	a, b, %o0;			\
-	retl;				\
-	 add %o0, 8, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EX(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	retl;				\
-	 a, b, %o0;			\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXBLK1(x,y)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	add %o4, 0x1c0, %o1;		\
-	and %o2, (0x40 - 1), %o2;	\
-	retl;				\
-	 add %o1, %o2, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXBLK2(x,y)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	sll %o3, 6, %o3;		\
-	and %o2, (0x40 - 1), %o2;	\
-	add %o3, 0x80, %o1;		\
-	retl;				\
-	 add %o1, %o2, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXBLK3(x,y)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	and %o2, (0x40 - 1), %o2;	\
-	retl;				\
-	 add %o2, 0x80, %o0;		\
+	 mov	1, %o0;		\
 	.section __ex_table;	\
 	.align 4;		\
 	.word 98b, 99b;		\
 	.text;			\
 	.align 4;
-#define EXBLK4(x,y)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	VISExitHalf;			\
-	and %o2, (0x40 - 1), %o2;	\
-	retl;				\
-	 add %o2, 0x40, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-
-	.register	%g2,#scratch
-	.register	%g3,#scratch
-
-	/* Special/non-trivial issues of this code:
-	 *
-	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
-	 * 2) Only low 32 FPU registers are used so that only the
-	 *    lower half of the FPU register set is dirtied by this
-	 *    code.  This is especially important in the kernel.
-	 * 3) This code never prefetches cachelines past the end
-	 *    of the source buffer.
-	 */

-	.text
-	.align	32
+#define FUNC_NAME		U3copy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] ASI_AIUS
+#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_AIUS
+#define EX_RETVAL(x)		0

-	/* The cheetah's flexible spine, oversized liver, enlarged heart,
-	 * slender muscular body, and claws make it the swiftest hunter
-	 * in Africa and the fastest animal on land.  Can reach speeds
-	 * of up to 2.4GB per second.
-	 */
-
-	.globl	U3copy_to_user
-U3copy_to_user:	/* %o0=dst, %o1=src, %o2=len */
 	/* Writing to %asi is _expensive_ so we hardcode it.
 	 * Reading %asi to check for KERNEL_DS is comparatively
 	 * cheap.
 	 */
-	rd		%asi, %g1
-	cmp		%g1, ASI_AIUS
-	bne,pn		%icc, U3memcpy_user_stub
-	 nop
-
-	cmp		%o2, 0
-	be,pn		%XCC, 85f
-	 or		%o0, %o1, %o3
-	cmp		%o2, 16
-	bleu,a,pn	%XCC, 80f
-	 or		%o3, %o2, %o3
-
-	cmp		%o2, 256
-	blu,pt		%XCC, 70f
-	 andcc		%o3, 0x7, %g0
-
-	ba,pt		%xcc, 1f
-	 andcc		%o0, 0x3f, %g2
-
-	/* Here len >= 256 and condition codes reflect execution
-	 * of "andcc %o0, 0x7, %g2", done by caller.
-	 */
-	.align		64
-1:
-	/* Is 'dst' already aligned on an 64-byte boundary? */
-	be,pt		%XCC, 2f
-
-	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
-	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
-	 * subtract this from 'len'.
-	 */
-	 sub		%g2, 0x40, %g2
-	sub		%g0, %g2, %g2
-	sub		%o2, %g2, %o2
-
-	/* Copy %g2 bytes from src to dst, one byte at a time. */
-1:	ldub		[%o1 + 0x00], %o3
-	add		%o1, 0x1, %o1
-	add		%o0, 0x1, %o0
-	subcc		%g2, 0x1, %g2
-
-	bg,pt		%XCC, 1b
-	 EXNV2(stba %o3, [%o0 + -1] %asi, add %o2, %g2)
-
-2:	VISEntryHalf
-	and		%o1, 0x7, %g1
-	ba,pt		%xcc, 1f
-	 alignaddr	%o1, %g0, %o1
-
-	.align		64
-1:
-	membar		#StoreLoad | #StoreStore | #LoadStore
-	prefetch	[%o1 + 0x000], #one_read
-	prefetch	[%o1 + 0x040], #one_read
-	andn		%o2, (0x40 - 1), %o4
-	prefetch	[%o1 + 0x080], #one_read
-	prefetch	[%o1 + 0x0c0], #one_read
-	ldd		[%o1 + 0x000], %f0
-	prefetch	[%o1 + 0x100], #one_read
-	ldd		[%o1 + 0x008], %f2
-	prefetch	[%o1 + 0x140], #one_read
-	ldd		[%o1 + 0x010], %f4
-	prefetch	[%o1 + 0x180], #one_read
-	faligndata	%f0, %f2, %f16
-	ldd		[%o1 + 0x018], %f6
-	faligndata	%f2, %f4, %f18
-	ldd		[%o1 + 0x020], %f8
-	faligndata	%f4, %f6, %f20
-	ldd		[%o1 + 0x028], %f10
-	faligndata	%f6, %f8, %f22
-
-	ldd		[%o1 + 0x030], %f12
-	faligndata	%f8, %f10, %f24
-	ldd		[%o1 + 0x038], %f14
-	faligndata	%f10, %f12, %f26
-	ldd		[%o1 + 0x040], %f0
-
-	sub		%o4, 0x80, %o4
-	add		%o1, 0x40, %o1
-	ba,pt		%xcc, 1f
-	 srl		%o4, 6, %o3
-
-	.align		64
-1:
-	ldd		[%o1 + 0x008], %f2
-	faligndata	%f12, %f14, %f28
-	ldd		[%o1 + 0x010], %f4
-	faligndata	%f14, %f0, %f30
-	EXBLK2(stda %f16, [%o0] ASI_BLK_AIUS)
-	ldd		[%o1 + 0x018], %f6
-	faligndata	%f0, %f2, %f16
-
-	ldd		[%o1 + 0x020], %f8
-	faligndata	%f2, %f4, %f18
-	ldd		[%o1 + 0x028], %f10
-	faligndata	%f4, %f6, %f20
-	ldd		[%o1 + 0x030], %f12
-	faligndata	%f6, %f8, %f22
-	ldd		[%o1 + 0x038], %f14
-	faligndata	%f8, %f10, %f24
-
-	ldd		[%o1 + 0x040], %f0
-	prefetch	[%o1 + 0x180], #one_read
-	faligndata	%f10, %f12, %f26
-	subcc		%o3, 0x01, %o3
-	add		%o1, 0x40, %o1
-	bg,pt		%XCC, 1b
-	 add		%o0, 0x40, %o0
-
-	/* Finally we copy the last full 64-byte block. */
-	ldd		[%o1 + 0x008], %f2
-	faligndata	%f12, %f14, %f28
-	ldd		[%o1 + 0x010], %f4
-	faligndata	%f14, %f0, %f30
-	EXBLK3(stda %f16, [%o0] ASI_BLK_AIUS)
-	ldd		[%o1 + 0x018], %f6
-	faligndata	%f0, %f2, %f16
-	ldd		[%o1 + 0x020], %f8
-	faligndata	%f2, %f4, %f18
-	ldd		[%o1 + 0x028], %f10
-	faligndata	%f4, %f6, %f20
-	ldd		[%o1 + 0x030], %f12
-	faligndata	%f6, %f8, %f22
-	ldd		[%o1 + 0x038], %f14
-	faligndata	%f8, %f10, %f24
-	cmp		%g1, 0
-	be,pt		%XCC, 1f
-	 add		%o0, 0x40, %o0
-	ldd		[%o1 + 0x040], %f0
-1:	faligndata	%f10, %f12, %f26
-	faligndata	%f12, %f14, %f28
-	faligndata	%f14, %f0, %f30
-	EXBLK4(stda %f16, [%o0] ASI_BLK_AIUS)
-	add		%o0, 0x40, %o0
-	add		%o1, 0x40, %o1
-
-	membar		#Sync
-
-	/* Now we copy the (len modulo 64) bytes at the end.
-	 * Note how we borrow the %f0 loaded above.
-	 *
-	 * Also notice how this code is careful not to perform a
-	 * load past the end of the src buffer.
-	 */
-	and		%o2, 0x3f, %o2
-	andcc		%o2, 0x38, %g2
-	be,pn		%XCC, 2f
-	 subcc		%g2, 0x8, %g2
-	be,pn		%XCC, 2f
-	 cmp		%g1, 0
-
-	be,a,pt		%XCC, 1f
-	 ldd		[%o1 + 0x00], %f0
-
-1:	ldd		[%o1 + 0x08], %f2
-	add		%o1, 0x8, %o1
-	sub		%o2, 0x8, %o2
-	subcc		%g2, 0x8, %g2
-	faligndata	%f0, %f2, %f8
-	EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)
-	be,pn		%XCC, 2f
-	 add		%o0, 0x8, %o0
-	ldd		[%o1 + 0x08], %f0
-	add		%o1, 0x8, %o1
-	sub		%o2, 0x8, %o2
-	subcc		%g2, 0x8, %g2
-	faligndata	%f2, %f0, %f8
-	EX(stda %f8, [%o0 + 0x00] %asi, add %o2, 0x8)
-	bne,pn		%XCC, 1b
-	 add		%o0, 0x8, %o0
-
-	/* If anything is left, we copy it one byte at a time.
-	 * Note that %g1 is (src & 0x3) saved above before the
-	 * alignaddr was performed.
-	 */
-2:
-	cmp		%o2, 0
-	add		%o1, %g1, %o1
-	VISExitHalf
-	be,pn		%XCC, 85f
-	 sub		%o0, %o1, %o3
-
-	andcc		%g1, 0x7, %g0
-	bne,pn		%icc, 90f
-	 andcc		%o2, 0x8, %g0
-	be,pt		%icc, 1f
-	 nop
-	ldx		[%o1], %o5
-	EXNV(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
-	add		%o1, 0x8, %o1
-
-1:	andcc		%o2, 0x4, %g0
-	be,pt		%icc, 1f
-	 nop
-	lduw		[%o1], %o5
-	EXNV(stwa %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x7)
-	add		%o1, 0x4, %o1
-
-1:	andcc		%o2, 0x2, %g0
-	be,pt		%icc, 1f
-	 nop
-	lduh		[%o1], %o5
-	EXNV(stha %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x3)
-	add		%o1, 0x2, %o1
-
-1:	andcc		%o2, 0x1, %g0
-	be,pt		%icc, 85f
-	 nop
-	ldub		[%o1], %o5
-	ba,pt		%xcc, 85f
-	 EXNV(stba %o5, [%o1 + %o3] ASI_AIUS, and %o2, 0x1)
-
-70: /* 16 < len <= 64 */
-	bne,pn		%XCC, 90f
-	 sub		%o0, %o1, %o3
-
-	andn		%o2, 0x7, %o4
-	and		%o2, 0x7, %o2
-1:	subcc		%o4, 0x8, %o4
-	ldx		[%o1], %o5
-	EXNV4(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %o4)
-	bgu,pt		%XCC, 1b
-	 add		%o1, 0x8, %o1
-	andcc		%o2, 0x4, %g0
-	be,pt		%XCC, 1f
-	 nop
-	sub		%o2, 0x4, %o2
-	lduw		[%o1], %o5
-	EXNV3(stwa %o5, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
-	add		%o1, 0x4, %o1
-1:	cmp		%o2, 0
-	be,pt		%XCC, 85f
-	 nop
-	ba,pt		%xcc, 90f
-	 nop
-
-80: /* 0 < len <= 16 */
-	andcc		%o3, 0x3, %g0
-	bne,pn		%XCC, 90f
-	 sub		%o0, %o1, %o3
-
-1:
-	subcc		%o2, 4, %o2
-	lduw		[%o1], %g1
-	EXNV3(stwa %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
-	bgu,pt		%XCC, 1b
-	 add		%o1, 4, %o1
-
-85:	retl
-	 clr		%o0
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop;						\

-	.align	32
-90:
-	subcc		%o2, 1, %o2
-	ldub		[%o1], %g1
-	EXNV2(stba %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
-	bgu,pt		%XCC, 90b
-	 add		%o1, 1, %o1
-	retl
-	 clr		%o0
+#include "U3memcpy.S"
--- a/arch/sparc64/lib/U3memcpy.S
+++ b/arch/sparc64/lib/U3memcpy.S
--- a/arch/sparc64/lib/U3patch.S
+++ b/arch/sparc64/lib/U3patch.S
+/* U3patch.S: Patch Ultra-I routines with Ultra-III variant.
+ *
+ * Copyright (C) 2004 David S. Miller <davem@redhat.com>
+ */
+
+/* Instruction words used for run-time patching.
+ *
+ * BRANCH_ALWAYS - per the SPARC V9 BPcc format this is "ba,pt %xcc"
+ *                 with a zero displacement field.
+ * NOP           - the canonical nop ("sethi 0, %g0").
+ *
+ * ULTRA3_DO_PATCH(OLD, NEW) overwrites the first two instructions at
+ * OLD so that entering OLD branches straight to NEW:
+ *
+ *   1. %g1 = address of NEW, %g2 = address of OLD.
+ *   2. %g1 = (NEW - OLD) >> 2, the displacement in instruction words.
+ *   3. %g3 = BRANCH_ALWAYS | displacement, stored at [OLD].
+ *   4. NOP is stored at [OLD + 4], the slot following the branch.
+ *   5. "flush %g2" is issued on the patched address.
+ *
+ * Clobbers %g1, %g2 and %g3.
+ *
+ * NOTE(review): the displacement is OR-ed in without masking to the
+ * branch's displacement field, so this looks like it assumes NEW is
+ * at a higher address than OLD and within branch range -- confirm
+ * against the link order.
+ */
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define ULTRA3_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	srl	%g1, 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	/* cheetah_patch_copyops: redirect the generic copy entry points
+	 * to their Ultra-III variants.
+	 *
+	 * Patches memcpy, ___copy_from_user and ___copy_to_user so that
+	 * each begins with a branch to U3memcpy, U3copy_from_user and
+	 * U3copy_to_user respectively.  Takes no arguments, is a leaf
+	 * routine (returns with retl) and clobbers %g1-%g3 through
+	 * ULTRA3_DO_PATCH.
+	 *
+	 * NOTE(review): presumably called once during boot when an
+	 * Ultra-III (Cheetah) cpu is detected -- the caller is not
+	 * visible here.
+	 */
+	.globl	cheetah_patch_copyops
+cheetah_patch_copyops:
+	ULTRA3_DO_PATCH(memcpy, U3memcpy)
+	ULTRA3_DO_PATCH(___copy_from_user, U3copy_from_user)
+	ULTRA3_DO_PATCH(___copy_to_user, U3copy_to_user)
+	retl
+	 nop
--- a/arch/sparc64/lib/VIScopy.S
+++ b/arch/sparc64/lib/VIScopy.S
--- a/arch/sparc64/lib/U3copy_in_user.S
+++ b/arch/sparc64/lib/U3copy_in_user.S
-/* U3copy_in_user.S: UltraSparc-III optimized memcpy.
+/* copy_in_user.S: Copy from userspace to userspace.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

-#include <asm/visasm.h>
 #include <asm/asi.h>
-#include <asm/dcu.h>
-#include <asm/spitfire.h>

 #define XCC xcc

-#define EXNV(x,y,a,b)	\
+#define EX(x,y)			\
 98:	x,y;			\
 	.section .fixup;	\
 	.align 4;		\
 99:	retl;			\
-	 a, b, %o0;			\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV1(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	a, b, %o0;			\
-	retl;				\
-	 add %o0, 1, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV4(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	a, b, %o0;			\
-	retl;				\
-	 add %o0, 4, %o0;		\
-	.section __ex_table;		\
-	.align 4;			\
-	.word 98b, 99b;			\
-	.text;				\
-	.align 4;
-#define EXNV8(x,y,a,b)			\
-98:	x,y;				\
-	.section .fixup;		\
-	.align 4;			\
-99:	a, b, %o0;			\
-	retl;				\
-	 add %o0, 8, %o0;		\
+	 mov 1, %o0;		\
 	.section __ex_table;	\
 	.align 4;		\
 	.word 98b, 99b;		\
@@ -70,71 +31,84 @@
 	 * to copy register windows around during thread cloning.
 	 */

-	.globl	U3copy_in_user
-U3copy_in_user:	/* %o0=dst, %o1=src, %o2=len */
+	.globl	___copy_in_user
+___copy_in_user:	/* %o0=dst, %o1=src, %o2=len */
 	/* Writing to %asi is _expensive_ so we hardcode it.
 	 * Reading %asi to check for KERNEL_DS is comparatively
 	 * cheap.
 	 */
 	rd		%asi, %g1
 	cmp		%g1, ASI_AIUS
-	bne,pn		%icc, U3memcpy_user_stub
+	bne,pn		%icc, memcpy_user_stub
 	 nop

 	cmp		%o2, 0
-	be,pn		%XCC, out
+	be,pn		%XCC, 85f
 	 or		%o0, %o1, %o3
 	cmp		%o2, 16
-	bleu,a,pn	%XCC, small_copy
+	bleu,a,pn	%XCC, 80f
 	 or		%o3, %o2, %o3

-medium_copy: /* 16 < len <= 64 */
+	/* 16 < len <= 64 */
 	andcc		%o3, 0x7, %g0
-	bne,pn		%XCC, small_copy_unaligned
+	bne,pn		%XCC, 90f
 	 sub		%o0, %o1, %o3

-medium_copy_aligned:
 	andn		%o2, 0x7, %o4
 	and		%o2, 0x7, %o2
 1:	subcc		%o4, 0x8, %o4
-	EXNV8(ldxa [%o1] %asi, %o5, add %o4, %o2)
-	EXNV8(stxa %o5, [%o1 + %o3] ASI_AIUS, add %o4, %o2)
+	EX(ldxa [%o1] %asi, %o5)
+	EX(stxa %o5, [%o1 + %o3] ASI_AIUS)
 	bgu,pt		%XCC, 1b
 	 add		%o1, 0x8, %o1
 	andcc		%o2, 0x4, %g0
 	be,pt		%XCC, 1f
 	 nop
 	sub		%o2, 0x4, %o2
-	EXNV4(lduwa [%o1] %asi, %o5, add %o4, %o2)
-	EXNV4(stwa %o5, [%o1 + %o3] ASI_AIUS, add %o4, %o2)
+	EX(lduwa [%o1] %asi, %o5)
+	EX(stwa %o5, [%o1 + %o3] ASI_AIUS)
 	add		%o1, 0x4, %o1
 1:	cmp		%o2, 0
-	be,pt		%XCC, out
+	be,pt		%XCC, 85f
 	 nop
-	ba,pt		%xcc, small_copy_unaligned
+	ba,pt		%xcc, 90f
 	 nop

-small_copy: /* 0 < len <= 16 */
+80:	/* 0 < len <= 16 */
 	andcc		%o3, 0x3, %g0
-	bne,pn		%XCC, small_copy_unaligned
+	bne,pn		%XCC, 90f
 	 sub		%o0, %o1, %o3

-small_copy_aligned:
+82:
 	subcc		%o2, 4, %o2
-	EXNV4(lduwa [%o1] %asi, %g1, add %o2, %g0)
-	EXNV4(stwa %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
-	bgu,pt		%XCC, small_copy_aligned
+	EX(lduwa [%o1] %asi, %g1)
+	EX(stwa %g1, [%o1 + %o3] ASI_AIUS)
+	bgu,pt		%XCC, 82b
 	 add		%o1, 4, %o1

-out:	retl
+85:	retl
 	 clr		%o0

 	.align	32
-small_copy_unaligned:
+90:
 	subcc		%o2, 1, %o2
-	EXNV1(lduba [%o1] %asi, %g1, add %o2, %g0)
-	EXNV1(stba %g1, [%o1 + %o3] ASI_AIUS, add %o2, %g0)
-	bgu,pt		%XCC, small_copy_unaligned
+	EX(lduba [%o1] %asi, %g1)
+	EX(stba %g1, [%o1 + %o3] ASI_AIUS)
+	bgu,pt		%XCC, 90b
 	 add		%o1, 1, %o1
 	retl
 	 clr		%o0
+
+	/* memcpy_user_stub: %o0=dst, %o1=src, %o2=len
+	 *
+	 * Act like copy_{to,in}_user(), ie. return zero instead
+	 * of original destination pointer.  This is invoked when
+	 * copy_{to,in}_user() finds that %asi is kernel space.
+	 *
+	 * The copy itself is done by an ordinary call to memcpy;
+	 * whatever memcpy returns is discarded.
+	 */
+	.globl		memcpy_user_stub
+memcpy_user_stub:
+	/* Open a new register window with a 192-byte stack frame;
+	 * the caller's %o0-%o2 are now visible as %i0-%i2.
+	 */
+	save		%sp, -192, %sp
+	mov		%i0, %o0
+	mov		%i1, %o1
+	/* The third argument is set up in the call's delay slot. */
+	call		memcpy
+	 mov		%i2, %o2
+	/* restore computes %g0 + %g0 and writes it to the caller's
+	 * %o0, so the caller always sees a return value of 0.
+	 */
+	ret
+	 restore	%g0, %g0, %o0
--- a/arch/sparc64/lib/memmove.S
+++ b/arch/sparc64/lib/memmove.S
+/* memmove.S: Simple memmove implementation.
+ *
+ * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ */
+
+/* memmove: %o0=dst, %o1=src, %o2=len -- overlap-tolerant copy.
+ *
+ * Two cases are handed to memcpy by branching to it directly:
+ *   - dst < src (unsigned compare), and
+ *   - src + len <= dst, i.e. the source ends at or before dst.
+ * Otherwise dst lies inside the source range and the bytes are copied
+ * one at a time from the last byte down to the first.  That path
+ * returns the original dst, which is saved in %g1 on entry.
+ *
+ * Uses %g1, %g3, %g5 and %o5 as scratch.
+ *
+ * NOTE(review): the dst < src case goes to memcpy even when the
+ * regions overlap, so it presumably relies on memcpy being safe for
+ * forward-overlapping copies -- memcpy is not visible here, confirm.
+ * NOTE(review): the loop branch tests %icc (32-bit condition codes)
+ * while the pointer compares use %xcc; verify this is intended for
+ * lengths of 4GB and above.
+ */
+	.text
+	.align	32
+	.globl	memmove
+memmove:
+	/* Keep the original dst for the return value. */
+	mov		%o0, %g1
+	cmp		%o0, %o1
+	/* dst < src: let memcpy do the work.  The delay-slot result in
+	 * %g5 is overwritten below before it is read on the
+	 * fall-through path.
+	 */
+	blu,pt		%xcc, memcpy
+	 sub		%o0, %o1, %g5
+	/* %g3 = one past the end of the source. */
+	add		%o1, %o2, %g3
+	cmp		%g3, %o0
+	/* Source ends at or before dst: no overlap, use memcpy.  The
+	 * delay slot leaves src + len in %g5 for the backward copy.
+	 */
+	bleu,pt		%xcc, memcpy
+	 add		%o1, %o2, %g5
+	/* %o5 = one past the end of the destination. */
+	add		%o0, %o2, %o5
+
+	/* Point %o1/%o0 at the last byte of src/dst. */
+	sub		%g5, 1, %o1
+	sub		%o5, 1, %o0
+	/* Copy backwards one byte per iteration until len reaches 0;
+	 * the dst pointer is stepped in the branch delay slot.
+	 */
+1:	ldub		[%o1], %g5
+	subcc		%o2, 1, %o2
+	sub		%o1, 1, %o1
+	stb		%g5, [%o0]
+	bne,pt		%icc, 1b
+	 sub		%o0, 1, %o0
+
+	/* Return the dst pointer saved on entry. */
+	retl
+	 mov		%g1, %o0
--- a/arch/sparc64/lib/user_fixup.c
+++ b/arch/sparc64/lib/user_fixup.c
+/* user_fixup.c: Fix up user copy faults.
+ *
+ * Copyright (C) 2004 David S. Miller <davem@redhat.com>
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+
+/* Calculating the exact fault address when using
+ * block loads and stores can be very complicated.
+ * Instead of trying to be clever and handling all
+ * of the cases, just fix things up simply here.
+ */
+
+/* copy_from_user_fixup - slow-path retry after ___copy_from_user() faulted.
+ *
+ * @to:   kernel destination buffer
+ * @from: user source buffer
+ * @size: number of bytes the caller asked to copy
+ *
+ * Redoes the copy one byte at a time with __get_user() until a byte
+ * cannot be read, then zeroes the part of @to that was not filled.
+ *
+ * Returns the number of bytes that could NOT be copied (0 if the
+ * byte-wise retry managed to copy everything).
+ */
+unsigned long copy_from_user_fixup(void *to, const void __user *from, unsigned long size)
+{
+	char *dst = to;
+	const char __user *src = from;
+
+	/* Count a byte off only after it has really been copied, so
+	 * that 'size' is always the exact number of bytes outstanding:
+	 * it is still correct when we break out on a fault, and it
+	 * ends at 0 (not wrapped around) when every byte succeeds.
+	 */
+	while (size) {
+		if (__get_user(*dst, src))
+			break;
+		dst++;
+		src++;
+		size--;
+	}
+
+	/* Never hand uninitialised kernel memory back to the caller. */
+	if (size)
+		memset(dst, 0, size);
+
+	return size;
+}
+
+/* copy_to_user_fixup - slow-path retry after ___copy_to_user() faulted.
+ *
+ * @to:   user destination buffer
+ * @from: kernel source buffer
+ * @size: number of bytes the caller asked to copy
+ *
+ * Redoes the copy one byte at a time with __put_user() until a byte
+ * cannot be written.
+ *
+ * Returns the number of bytes that could NOT be copied (0 if the
+ * byte-wise retry managed to copy everything).
+ */
+unsigned long copy_to_user_fixup(void __user *to, const void *from, unsigned long size)
+{
+	char __user *dst = to;
+	const char *src = from;
+
+	/* Count a byte off only after it has really been stored, so
+	 * that 'size' is exact both when we break out on a fault and
+	 * when the loop runs to completion (it ends at 0 instead of
+	 * wrapping around).
+	 */
+	while (size) {
+		if (__put_user(*src, dst))
+			break;
+		dst++;
+		src++;
+		size--;
+	}
+
+	return size;
+}
+
+/* copy_in_user_fixup - slow-path retry after ___copy_in_user() faulted.
+ *
+ * @to:   user destination buffer
+ * @from: user source buffer
+ * @size: number of bytes the caller asked to copy
+ *
+ * Redoes the copy one byte at a time, reading with __get_user() and
+ * writing with __put_user(), until either side cannot be accessed.
+ *
+ * Returns the number of bytes that could NOT be copied (0 if the
+ * byte-wise retry managed to copy everything).
+ */
+unsigned long copy_in_user_fixup(void __user *to, void __user *from, unsigned long size)
+{
+	char __user *dst = to;
+	char __user *src = from;
+
+	/* Count a byte off only after it has been both read and
+	 * written, so that 'size' is exact when we break out on a
+	 * fault and ends at 0 (not wrapped around) on full success.
+	 */
+	while (size) {
+		char tmp;
+
+		if (__get_user(tmp, src))
+			break;
+		if (__put_user(tmp, dst))
+			break;
+		dst++;
+		src++;
+		size--;
+	}
+
+	return size;
+}
--- a/include/asm-sparc64/string.h
+++ b/include/asm-sparc64/string.h
@@ -15,35 +15,25 @@

 #include <asm/asi.h>

-extern void __memmove(void *,const void *,__kernel_size_t);
 extern void *__memset(void *,int,__kernel_size_t);
-extern void *__builtin_memset(void *,int,__kernel_size_t);

 #ifndef EXPORT_SYMTAB_STROPS

 /* First the mem*() things. */
-#define __HAVE_ARCH_BCOPY
 #define __HAVE_ARCH_MEMMOVE
-
-#undef memmove
-#define memmove(_to, _from, _n) \
-({ \
-	void *_t = (_to); \
-	__memmove(_t, (_from), (_n)); \
-	_t; \
-})
+extern void *memmove(void *, const void *, __kernel_size_t);

 #define __HAVE_ARCH_MEMCPY
-
-extern void * memcpy(void *,const void *,__kernel_size_t);
+extern void *memcpy(void *, const void *, __kernel_size_t);

 #define __HAVE_ARCH_MEMSET
+extern void *__builtin_memset(void *,int,__kernel_size_t);

 static inline void *__constant_memset(void *s, int c, __kernel_size_t count)
 {
 	extern __kernel_size_t __bzero(void *, __kernel_size_t);

-	if(!c) {
+	if (!c) {
 		__bzero(s, count);
 		return s;
 	} else

--- a/include/asm-sparc64/uaccess.h
+++ b/include/asm-sparc64/uaccess.h
@@ -252,18 +252,50 @@ __asm__ __volatile__(							\

 extern int __get_user_bad(void);

-extern unsigned long __copy_from_user(void *to, const void __user *from,
+extern unsigned long ___copy_from_user(void *to, const void __user *from,
 				       unsigned long size);
+extern unsigned long copy_from_user_fixup(void *to, const void __user *from,
+					  unsigned long size);
+static inline unsigned long copy_from_user(void *to, const void __user *from,
+					   unsigned long size)
+{
+	unsigned long ret = ___copy_from_user(to, from, size);
+
+	if (ret)
+		ret = copy_from_user_fixup(to, from, size);
+	return ret;
+}
+#define __copy_from_user copy_from_user

-extern unsigned long __copy_to_user(void __user *to, const void *from,
+extern unsigned long ___copy_to_user(void __user *to, const void *from,
+				     unsigned long size);
+extern unsigned long copy_to_user_fixup(void __user *to, const void *from,
 					unsigned long size);
+static inline unsigned long copy_to_user(void __user *to, const void *from,
+					 unsigned long size)
+{
+	unsigned long ret = ___copy_to_user(to, from, size);

-extern unsigned long __copy_in_user(void __user *to, const void __user *from,
+	if (ret)
+		ret = copy_to_user_fixup(to, from, size);
+	return ret;
+}
+#define __copy_to_user copy_to_user
+
+extern unsigned long ___copy_in_user(void __user *to, const void __user *from,
+				     unsigned long size);
+extern unsigned long copy_in_user_fixup(void __user *to, void __user *from,
 					unsigned long size);
+static inline unsigned long copy_in_user(void __user *to, void __user *from,
+					 unsigned long size)
+{
+	unsigned long ret = ___copy_in_user(to, from, size);

-#define copy_from_user __copy_from_user
-#define copy_to_user __copy_to_user
-#define copy_in_user __copy_in_user
+	if (ret)
+		ret = copy_in_user_fixup(to, from, size);
+	return ret;
+}
+#define __copy_in_user copy_in_user

 extern unsigned long __bzero_noasi(void __user *, unsigned long);