Blame view
arch/x86/crypto/twofish-avx-x86_64-asm_64.S
10.5 KB
1a59d1b8e
|
1 |
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "twofish-avx-x86_64-asm_64.S"

/*
 * Byte-reversal shuffle mask used by load_ctr_8way() in the CTR path.
 * Placed in a mergeable ("aM") 16-byte-entity rodata section so identical
 * constants from other objects can be deduplicated by the linker.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* Constant consumed by load_xts_8way() for the XTS tweak computation. */
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
107778b59
|
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
.text

/* structure of crypto context */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define	k	4128

/**********************************************************************
  8-way AVX twofish
 **********************************************************************/
#define CTX %rdi

/*
 * Two sets of four xmm registers holding the block words for the two
 * 4-block groups processed in parallel (8 blocks total).
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* Round-function intermediate results. */
#define RX0 %xmm8
#define RY0 %xmm9
#define RX1 %xmm10
#define RY1 %xmm11

/* Broadcast round-key registers. */
#define RK1 %xmm12
#define RK2 %xmm13

/* Vector scratch registers. */
#define RT %xmm14
#define RR %xmm15

/*
 * Table-lookup index registers used by lookup_32bit().
 * %r13 is callee-saved and is pushed/popped by the users below.
 */
#define RID1  %r13
#define RID1d %r13d
#define RID2  %rsi
#define RID2d %esi

/*
 * 64-bit "gather" registers chosen for their byte-addressable low/high
 * 8-bit sub-registers (bl/bh style), needed for per-byte table indexing.
 */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

/* 32-bit lookup-result accumulators (packed to 64 bits in the G macro). */
#define RGS1  %r8
#define RGS1d %r8d
#define RGS2  %r9
#define RGS2d %r9d
#define RGS3  %r10
#define RGS3d %r10d
f94a73f8d
|
83 84 85 |
/*
 * Look up each of the four bytes of the 32-bit word in the low half of
 * 'src' in the CTX-relative tables t0..t3 and XOR the four results into
 * 'dst'(32-bit view).  'src' is shifted right by 32 in the process;
 * interleave_op(il_reg) lets the caller interleave an extra shift for
 * the following word.  Clobbers RID1d/RID2d.
 */
#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
	movzbl		src ## bl,        RID1d;     \
	movzbl		src ## bh,        RID2d;     \
	shrq $16,	src;                         \
	movl		t0(CTX, RID1, 4), dst ## d;  \
	movl		t1(CTX, RID2, 4), RID2d;     \
	movzbl		src ## bl,        RID1d;     \
	xorl		RID2d,            dst ## d;  \
	movzbl		src ## bh,        RID2d;     \
	interleave_op(il_reg);                       \
	xorl		t2(CTX, RID1, 4), dst ## d;  \
	xorl		t3(CTX, RID2, 4), dst ## d;

/* No-op interleave operation for lookup_32bit(). */
#define dummy(d) /* do nothing */

/* Interleave operation: advance 'reg' to its next 16-bit chunk. */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * Run the table lookups for both 32-bit words of gi1 and of gi2.
 * On exit: RGS2 = packed 64-bit result pair for gi1 (high word lookup in
 * bits 63..32, low word lookup in bits 31..0), RGS3 = same for gi2.
 * RGS1 is scratch; gi1/gi2 are consumed.
 */
#define G(gi1, gi2, x, t0, t1, t2, t3) \
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
	\
	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
	shlq $32,	RGS2;                                        \
	orq		RGS1, RGS2;                                  \
	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
	shlq $32,	RGS1;                                        \
	orq		RGS1, RGS3;

/*
 * Scalar head of a double round: run G over the a/b words of both block
 * groups, producing x1/y1/x2/y2.  The vmovq/vpextrq extractions of the
 * next inputs are interleaved with the G computations — presumably to
 * hide insert/extract latency; keep the instruction order as-is.
 */
#define round_head_2(a, b, x1, y1, x2, y2) \
	vmovq		b ## 1, RGI3;           \
	vpextrq $1,	b ## 1, RGI4;           \
	\
	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
	vmovq		a ## 2, RGI1;           \
	vpextrq $1,	a ## 2, RGI2;           \
	vmovq		RGS2, x1;               \
	vpinsrq $1,	RGS3, x1, x1;           \
	\
	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
	vmovq		b ## 2, RGI3;           \
	vpextrq $1,	b ## 2, RGI4;           \
	vmovq		RGS2, y1;               \
	vpinsrq $1,	RGS3, y1, y1;           \
	\
	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
	vmovq		RGS2, x2;               \
	vpinsrq $1,	RGS3, x2, x2;           \
	\
	G(RGI3, RGI4, y2, s1, s2, s3, s0);      \
	vmovq		RGS2, y2;               \
	vpinsrq $1,	RGS3, y2, y2;
f94a73f8d
|
135 |
/*
 * Vector tail of one encryption round: x/y hold the packed lookup
 * results.  Computes x = x + y and y = x + 2*y (pseudo-Hadamard
 * transform), adds round keys RK1/RK2, then c = ror1(c ^ (x + RK1))
 * and d ^= (y + RK2).  prerotate(b) pre-rotates b left by one for the
 * next round (dummy on the last round).
 */
#define encround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd			x, y,   x; \
	vpaddd			x, RK1, RT;\
	prerotate(b);			   \
	vpxor			RT, c,  c; \
	vpaddd			y, x,   y; \
	vpaddd			y, RK2, y; \
	vpsrld $1,		c, RT;     \
	vpslld $(32 - 1),	c, c;      \
	vpor			c, RT,  c; \
	vpxor			d, y,   d; \

/*
 * Vector tail of one decryption round: same PHT/key addition as above,
 * but here c ^= (x + RK1) without rotation and d = ror1(d ^ (y + RK2));
 * prerotate(a) undoes the encrypt-side rotation.
 */
#define decround_tail(a, b, c, d, x, y, prerotate) \
	vpaddd			x, y,   x; \
	vpaddd			x, RK1, RT;\
	prerotate(a);			   \
	vpxor			RT, c,  c; \
	vpaddd			y, x,   y; \
	vpaddd			y, RK2, y; \
	vpxor			d, y,   d; \
	vpsrld $1,		d, y;      \
	vpslld $(32 - 1),	d, d;      \
	vpor			d, y,   d; \

/* Rotate each 32-bit lane of x left by one bit (RR is scratch). */
#define rotate_1l(x) \
	vpslld $1,		x, RR; \
	vpsrld $(32 - 1),	x, x;  \
	vpor			x, RR, x;

/* Extract the two 64-bit halves of c into RGI1/RGI2 for the next head. */
#define preload_rgi(c) \
	vmovq			c, RGI1; \
	vpextrq $1,		c, RGI2;

/*
 * Full encryption round n for both block groups: broadcast the two
 * 32-bit subkeys k[2n] and k[2n+1], run the lookup head, then the
 * vector tails.  preload/prerotate are dummy on the final round.
 */
#define encrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX),		   RK1;                  \
	vbroadcastss (k+4*(2*(n)+1))(CTX),		   RK2;                  \
	round_head_2(a, b, RX0, RY0, RX1, RY1);                                  \
	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate);      \
	preload(c ## 1);                                                         \
	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/* Full decryption round n; mirrors encrypt_round with decround_tail. */
#define decrypt_round(n, a, b, c, d, preload, prerotate) \
	vbroadcastss (k+4*(2*(n)))(CTX),		   RK1;                  \
	vbroadcastss (k+4*(2*(n)+1))(CTX),		   RK2;                  \
	round_head_2(a, b, RX0, RY0, RX1, RY1);                                  \
	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0, prerotate);      \
	preload(c ## 1);                                                         \
	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1, prerotate);

/* One cycle = two rounds, with the word roles swapped for the second. */
#define encrypt_cycle(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l);

/* Final cycle: the very last round skips preload/prerotate (dummy). */
#define encrypt_cycle_last(n) \
	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l); \
	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy, dummy);

/* Decryption runs the same two rounds in reverse order. */
#define decrypt_cycle(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi, rotate_1l);

/* Final decryption cycle: last round skips preload/prerotate. */
#define decrypt_cycle_last(n) \
	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi, rotate_1l); \
	decrypt_round((2*n), RA, RB, RC, RD, dummy, dummy);
107778b59
|
199 200 201 202 203 204 205 206 207 208 209 |
/*
 * Transpose the 4x4 matrix of 32-bit words held across x0..x3, i.e.
 * convert four consecutive blocks between per-block and word-sliced
 * layout.  t0..t2 are scratch.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* XOR the whitening key into all four words, then transpose to sliced form. */
#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	vpxor		x0, wkey, x0; \
	vpxor		x1, wkey, x1; \
	vpxor		x2, wkey, x2; \
	vpxor		x3, wkey, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* Inverse of inpack_blocks: transpose back, then XOR the whitening key. */
#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpxor		x0, wkey, x0; \
	vpxor		x1, wkey, x1; \
	vpxor		x2, wkey, x2; \
	vpxor		x3, wkey, x3;
107778b59
|
224 225 |
.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 */
	/* Input whitening key material at context offset 'w'. */
	vmovdqu w(CTX), RK1;

	/*
	 * The scalar lookup machinery uses %r13 (RID1), %rbx (RGI4) and
	 * %rcx (RGI2); preserve them across this function.
	 */
	pushq %r13;
	pushq %rbx;
	pushq %rcx;

	inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	/* Prime RGI1/RGI2 for the first round head and pre-rotate D words. */
	preload_rgi(RA1);
	rotate_1l(RD1);
	inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
	rotate_1l(RD2);

	/* 8 cycles = 16 rounds; last cycle omits the dangling preload. */
	encrypt_cycle(0);
	encrypt_cycle(1);
	encrypt_cycle(2);
	encrypt_cycle(3);
	encrypt_cycle(4);
	encrypt_cycle(5);
	encrypt_cycle(6);
	encrypt_cycle_last(7);

	/* Output whitening key material (second 16 bytes at offset 'w'). */
	vmovdqu (w+4*4)(CTX), RK1;

	popq %rcx;
	popq %rbx;
	popq %r13;

	outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);

	/* NOTE(review): newer kernels use the RET macro (SLS mitigation)
	 * instead of bare 'ret' — confirm the tree's convention. */
	ret;
SYM_FUNC_END(__twofish_enc_blk8)
107778b59
|
263 264 |
.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */
	/* Decryption whitens in with the encrypt-side *output* key. */
	vmovdqu (w+4*4)(CTX), RK1;

	/* Preserve the callee-saved scratch used by the lookup macros. */
	pushq %r13;
	pushq %rbx;

	inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
	/* Prime RGI1/RGI2 for the first round head and pre-rotate A words. */
	preload_rgi(RC1);
	rotate_1l(RA1);
	inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
	rotate_1l(RA2);

	/* Run the 8 cycles in reverse order of encryption. */
	decrypt_cycle(7);
	decrypt_cycle(6);
	decrypt_cycle(5);
	decrypt_cycle(4);
	decrypt_cycle(3);
	decrypt_cycle(2);
	decrypt_cycle(1);
	decrypt_cycle_last(0);

	/* Whiten out with the encrypt-side input key at offset 'w'. */
	vmovdqu (w)(CTX), RK1;

	popq %rbx;
	popq %r13;

	outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
	outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);

	ret;
SYM_FUNC_END(__twofish_dec_blk8)
8435a3c30
|
300 |
|
6dcc5627f
|
301 |
SYM_FUNC_START(twofish_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* __twofish_enc_blk8 clobbers %rsi (it is RID2); keep dst in %r11. */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __twofish_enc_blk8;

	/* Output word order matches __twofish_enc_blk8's output layout. */
	store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	ret;
SYM_FUNC_END(twofish_ecb_enc_8way)
8435a3c30
|
319 |
|
6dcc5627f
|
320 |
SYM_FUNC_START(twofish_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* __twofish_dec_blk8 clobbers %rsi (it is RID2); keep dst in %r11. */
	movq %rsi, %r11;

	/* Load in the word order __twofish_dec_blk8 expects on input. */
	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
SYM_FUNC_END(twofish_ecb_dec_8way)
8435a3c30
|
338 |
|
6dcc5627f
|
339 |
SYM_FUNC_START(twofish_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	/* %r12 is callee-saved; use it to keep src across the call. */
	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	call __twofish_dec_blk8;

	/* CBC: XOR each decrypted block with the previous ciphertext. */
	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r12;

	FRAME_END
	ret;
SYM_FUNC_END(twofish_cbc_dec_8way)
8435a3c30
|
362 |
|
6dcc5627f
|
363 |
SYM_FUNC_START(twofish_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	/* %r12 is callee-saved; use it to keep src across the call. */
	pushq %r12;

	movq %rsi, %r11;
	movq %rdx, %r12;

	/* Expand the counter into 8 blocks (bswap mask for BE counter math). */
	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX0, RX1, RY0);

	call __twofish_enc_blk8;

	/* Keystream XOR: dst = src ^ encrypted counters. */
	store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	popq %r12;

	FRAME_END
	ret;
SYM_FUNC_END(twofish_ctr_8way)
18be45270
|
389 |
|
6dcc5627f
|
390 |
SYM_FUNC_START(twofish_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* dst must survive the call; %rsi is clobbered by the blk8 helper. */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);

	FRAME_END
	ret;
SYM_FUNC_END(twofish_xts_enc_8way)
18be45270
|
412 |
|
6dcc5627f
|
413 |
SYM_FUNC_START(twofish_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* dst must survive the call; %rsi is clobbered by the blk8 helper. */
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);

	call __twofish_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	ret;
SYM_FUNC_END(twofish_xts_dec_8way)