Commit 24ff1e9d authored by Ard Biesheuvel, committed by Herbert Xu

crypto: x86/camellia - Use RIP-relative addressing

Prefer RIP-relative addressing where possible, which removes the need
for boot time relocation fixups.
Co-developed-by: Thomas Garnier <thgarnie@chromium.org>
Signed-off-by: Thomas Garnier <thgarnie@chromium.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 52fc482a
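
Background sketch (not part of the patch): the two addressing forms this commit converts between. The label .Lexample_const below is hypothetical; any of the constant-pool labels touched in the diff (e.g. .Lpack_bswap) behaves the same way.

	vmovdqa	.Lexample_const, %xmm0		/* absolute: the displacement holds the
						 * symbol's linked address, so it needs a
						 * boot-time relocation fixup if the
						 * kernel is moved */
	vmovdqa	.Lexample_const(%rip), %xmm0	/* RIP-relative: the displacement is
						 * measured from the instruction itself,
						 * so no fixup is needed */

Both forms read the same 16 bytes; only the address encoding changes, which is why most hunks below are a mechanical "(%rip)" addition.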
@@ -52,10 +52,10 @@
 	/* \
 	 * S-function with AES subbytes \
 	 */ \
-	vmovdqa .Linv_shift_row, t4; \
-	vbroadcastss .L0f0f0f0f, t7; \
-	vmovdqa .Lpre_tf_lo_s1, t0; \
-	vmovdqa .Lpre_tf_hi_s1, t1; \
+	vmovdqa .Linv_shift_row(%rip), t4; \
+	vbroadcastss .L0f0f0f0f(%rip), t7; \
+	vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
+	vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
@@ -68,8 +68,8 @@
 	vpshufb t4, x6, x6; \
 	\
 	/* prefilter sboxes 1, 2 and 3 */ \
-	vmovdqa .Lpre_tf_lo_s4, t2; \
-	vmovdqa .Lpre_tf_hi_s4, t3; \
+	vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
+	vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
 	filter_8bit(x0, t0, t1, t7, t6); \
 	filter_8bit(x7, t0, t1, t7, t6); \
 	filter_8bit(x1, t0, t1, t7, t6); \
@@ -83,8 +83,8 @@
 	filter_8bit(x6, t2, t3, t7, t6); \
 	\
 	/* AES subbytes + AES shift rows */ \
-	vmovdqa .Lpost_tf_lo_s1, t0; \
-	vmovdqa .Lpost_tf_hi_s1, t1; \
+	vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
+	vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
 	vaesenclast t4, x0, x0; \
 	vaesenclast t4, x7, x7; \
 	vaesenclast t4, x1, x1; \
@@ -95,16 +95,16 @@
 	vaesenclast t4, x6, x6; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
-	vmovdqa .Lpost_tf_lo_s3, t2; \
-	vmovdqa .Lpost_tf_hi_s3, t3; \
+	vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
+	vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
 	filter_8bit(x0, t0, t1, t7, t6); \
 	filter_8bit(x7, t0, t1, t7, t6); \
 	filter_8bit(x3, t0, t1, t7, t6); \
 	filter_8bit(x6, t0, t1, t7, t6); \
 	\
 	/* postfilter sbox 3 */ \
-	vmovdqa .Lpost_tf_lo_s2, t4; \
-	vmovdqa .Lpost_tf_hi_s2, t5; \
+	vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
+	vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
 	filter_8bit(x2, t2, t3, t7, t6); \
 	filter_8bit(x5, t2, t3, t7, t6); \
 	\
@@ -443,7 +443,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	transpose_4x4(c0, c1, c2, c3, a0, a1); \
 	transpose_4x4(d0, d1, d2, d3, a0, a1); \
 	\
-	vmovdqu .Lshufb_16x16b, a0; \
+	vmovdqu .Lshufb_16x16b(%rip), a0; \
 	vmovdqu st1, a1; \
 	vpshufb a0, a2, a2; \
 	vpshufb a0, a3, a3; \
@@ -482,7 +482,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 		     y6, y7, rio, key) \
 	vmovq key, x0; \
-	vpshufb .Lpack_bswap, x0, x0; \
+	vpshufb .Lpack_bswap(%rip), x0, x0; \
 	\
 	vpxor 0 * 16(rio), x0, y7; \
 	vpxor 1 * 16(rio), x0, y6; \
@@ -533,7 +533,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vmovdqu x0, stack_tmp0; \
 	\
 	vmovq key, x0; \
-	vpshufb .Lpack_bswap, x0, x0; \
+	vpshufb .Lpack_bswap(%rip), x0, x0; \
 	\
 	vpxor x0, y7, y7; \
 	vpxor x0, y6, y6; \
...
@@ -64,12 +64,12 @@
 	/* \
 	 * S-function with AES subbytes \
 	 */ \
-	vbroadcasti128 .Linv_shift_row, t4; \
-	vpbroadcastd .L0f0f0f0f, t7; \
-	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
-	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
-	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
-	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+	vbroadcasti128 .Linv_shift_row(%rip), t4; \
+	vpbroadcastd .L0f0f0f0f(%rip), t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
+	vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
+	vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
 	\
 	/* AES inverse shift rows */ \
 	vpshufb t4, x0, x0; \
@@ -115,8 +115,8 @@
 	vinserti128 $1, t2##_x, x6, x6; \
 	vextracti128 $1, x1, t3##_x; \
 	vextracti128 $1, x4, t2##_x; \
-	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
-	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+	vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
+	vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
 	vaesenclast t4##_x, x2##_x, x2##_x; \
 	vaesenclast t4##_x, t6##_x, t6##_x; \
 	vinserti128 $1, t6##_x, x2, x2; \
@@ -131,16 +131,16 @@
 	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
-	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
-	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+	vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
+	vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
 	filter_8bit(x0, t0, t1, t7, t6); \
 	filter_8bit(x7, t0, t1, t7, t6); \
 	filter_8bit(x3, t0, t1, t7, t6); \
 	filter_8bit(x6, t0, t1, t7, t6); \
 	\
 	/* postfilter sbox 3 */ \
-	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
-	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+	vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
+	vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
 	filter_8bit(x2, t2, t3, t7, t6); \
 	filter_8bit(x5, t2, t3, t7, t6); \
 	\
@@ -475,7 +475,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	transpose_4x4(c0, c1, c2, c3, a0, a1); \
 	transpose_4x4(d0, d1, d2, d3, a0, a1); \
 	\
-	vbroadcasti128 .Lshufb_16x16b, a0; \
+	vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
 	vmovdqu st1, a1; \
 	vpshufb a0, a2, a2; \
 	vpshufb a0, a3, a3; \
@@ -514,7 +514,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 		     y6, y7, rio, key) \
 	vpbroadcastq key, x0; \
-	vpshufb .Lpack_bswap, x0, x0; \
+	vpshufb .Lpack_bswap(%rip), x0, x0; \
 	\
 	vpxor 0 * 32(rio), x0, y7; \
 	vpxor 1 * 32(rio), x0, y6; \
@@ -565,7 +565,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vmovdqu x0, stack_tmp0; \
 	\
 	vpbroadcastq key, x0; \
-	vpshufb .Lpack_bswap, x0, x0; \
+	vpshufb .Lpack_bswap(%rip), x0, x0; \
 	\
 	vpxor x0, y7, y7; \
 	vpxor x0, y6, y6; \
...
@@ -77,11 +77,13 @@
 #define RXORbl %r9b
 
 #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+	leaq T0(%rip), tmp1; \
 	movzbl ab ## bl, tmp2 ## d; \
+	xorq (tmp1, tmp2, 8), dst; \
+	leaq T1(%rip), tmp2; \
 	movzbl ab ## bh, tmp1 ## d; \
 	rorq $16, ab; \
-	xorq T0(, tmp2, 8), dst; \
-	xorq T1(, tmp1, 8), dst;
+	xorq (tmp2, tmp1, 8), dst;
 
 /**********************************************************************
   1-way camellia
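
The x86_64 (non-AVX) hunk above is the one place where simply appending "(%rip)" is not enough: xor2ror16 indexes into its lookup tables with a scaled-index operand, T0(, tmp2, 8), and x86-64 has no encoding that combines a RIP-relative base with an index register. The patch therefore materializes the table address with leaq T0(%rip) first and indexes off that register, interleaving the leaq instructions so that tmp1 and tmp2 can double as scratch registers. A minimal sketch with a hypothetical table .Lexample_tab and concrete registers (not from the patch):

	/* before: absolute table base combined with a scaled index,
	 * needs a relocation for .Lexample_tab */
	xorq	.Lexample_tab(, %rcx, 8), %rax

	/* after: load the table address RIP-relatively, then index
	 * off the register; no relocation required */
	leaq	.Lexample_tab(%rip), %rdx
	xorq	(%rdx, %rcx, 8), %rax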