Commit 6574e6c64e971c9adb629e81e497afdb52b1c9df

Authored by Jussi Kivilinna
Committed by Herbert Xu
1 parent 87131507e1

crypto: des_3des - add x86-64 assembly implementation

This patch adds an x86_64 assembly implementation of the Triple DES EDE cipher
algorithm. Two assembly implementations are provided. The first is a regular
'one block at a time' encrypt/decrypt function. The second is a 'three blocks
at a time' function that gains a performance increase on out-of-order CPUs.

tcrypt test results:

Intel Core i5-4570:

des3_ede-asm vs des3_ede-generic:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.21x   1.22x   1.27x   1.36x   1.25x   1.25x
64B     1.98x   1.96x   1.23x   2.04x   2.01x   2.00x
256B    2.34x   2.37x   1.21x   2.40x   2.38x   2.39x
1024B   2.50x   2.47x   1.22x   2.51x   2.52x   2.51x
8192B   2.51x   2.53x   1.21x   2.56x   2.54x   2.55x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Showing 6 changed files with 1349 additions and 5 deletions Side-by-side Diff

arch/x86/crypto/Makefile
... ... @@ -14,6 +14,7 @@
14 14 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
15 15  
16 16 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
  17 +obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
17 18 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
18 19 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
19 20 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
... ... @@ -52,6 +53,7 @@
52 53 serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
53 54  
54 55 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
  56 +des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
55 57 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
56 58 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
57 59 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
arch/x86/crypto/des3_ede-asm_64.S
  1 +/*
  2 + * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
  3 + *
  4 + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  5 + *
  6 + * This program is free software; you can redistribute it and/or modify
  7 + * it under the terms of the GNU General Public License as published by
  8 + * the Free Software Foundation; either version 2 of the License, or
  9 + * (at your option) any later version.
  10 + *
  11 + * This program is distributed in the hope that it will be useful,
  12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14 + * GNU General Public License for more details.
  15 + */
  16 +
  17 +#include <linux/linkage.h>
  18 +
  19 +.file "des3_ede-asm_64.S"
  20 +.text
  21 +
  22 +#define s1 .L_s1 /* base of the first lookup table; s2..s8 each lie 64 entries of 8 bytes past the previous one */
  23 +#define s2 ((s1) + (64*8))
  24 +#define s3 ((s2) + (64*8))
  25 +#define s4 ((s3) + (64*8))
  26 +#define s5 ((s4) + (64*8))
  27 +#define s6 ((s5) + (64*8))
  28 +#define s7 ((s6) + (64*8))
  29 +#define s8 ((s7) + (64*8))
  30 +
  31 +/* register macros: the d, bl and bh suffixed names are the 32-bit, low-byte and second-byte views of the same register */
  32 +#define CTX %rdi /* pointer to the round keys (first argument of both entry points) */
  33 +
  34 +#define RL0 %r8 /* RLn: left half of block n */
  35 +#define RL1 %r9
  36 +#define RL2 %r10
  37 +
  38 +#define RL0d %r8d
  39 +#define RL1d %r9d
  40 +#define RL2d %r10d
  41 +
  42 +#define RR0 %r11 /* RRn: right half of block n */
  43 +#define RR1 %r12
  44 +#define RR2 %r13
  45 +
  46 +#define RR0d %r11d
  47 +#define RR1d %r12d
  48 +#define RR2d %r13d
  49 +
  50 +#define RW0 %rax /* RWn: work register of block n; holds the round key, then key XOR round input */
  51 +#define RW1 %rbx
  52 +#define RW2 %rcx
  53 +
  54 +#define RW0d %eax
  55 +#define RW1d %ebx
  56 +#define RW2d %ecx
  57 +
  58 +#define RW0bl %al /* bits 0..7 of RWn */
  59 +#define RW1bl %bl
  60 +#define RW2bl %cl
  61 +
  62 +#define RW0bh %ah /* bits 8..15 of RWn */
  63 +#define RW1bh %bh
  64 +#define RW2bh %ch
  65 +
  66 +#define RT0 %r15 /* RTn: scratch registers */
  67 +#define RT1 %rbp
  68 +#define RT2 %r14
  69 +#define RT3 %rdx /* same register as the src argument, so src must be read before RT3 is written */
  70 +
  71 +#define RT0d %r15d
  72 +#define RT1d %ebp
  73 +#define RT2d %r14d
  74 +#define RT3d %edx
  75 +
  76 +/***********************************************************************
  77 + * 1-way 3DES
  78 + ***********************************************************************/
  79 +#define do_permutation(a, b, offset, mask) /* exchange the bits of b selected by mask with the bits of a selected by (mask << offset); clobbers RT0d */ \
  80 + movl a, RT0d; \
  81 + shrl $(offset), RT0d; \
  82 + xorl b, RT0d; /* RT0d = (a >> offset) ^ b */ \
  83 + andl $(mask), RT0d; /* keep only the differences under mask */ \
  84 + xorl RT0d, b; \
  85 + shll $(offset), RT0d; \
  86 + xorl RT0d, a;
  87 +
  88 +#define expand_to_64bits(val, mask) /* val = (val | (ror32(val##d, 4) << 32)) & mask; clobbers RT0 */ \
  89 + movl val##d, RT0d; \
  90 + rorl $4, RT0d; \
  91 + shlq $32, RT0; /* the rotated copy goes into bits 32..63 */ \
  92 + orq RT0, val; /* bits 32..63 of val are OR-ed in as they are on entry */ \
  93 + andq mask, val;
  94 +
  95 +#define compress_to_64bits(val) /* folds the upper half back into the lower: val##d |= rol32(val >> 32, 4); despite the name the result is the 32-bit val##d; clobbers RT0 */ \
  96 + movq val, RT0; \
  97 + shrq $32, RT0; \
  98 + roll $4, RT0d; /* undo the 4-bit right rotation applied by expand_to_64bits */ \
  99 + orl RT0d, val##d;
  100 +
  101 +#define initial_permutation(left, right) /* permutes the 32-bit halves left##d/right##d, then widens both to the 64-bit form consumed by round1; clobbers RT0, RT3 and RW0d */ \
  102 + do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
  103 + do_permutation(left##d, right##d, 16, 0x0000ffff); \
  104 + do_permutation(right##d, left##d, 2, 0x33333333); \
  105 + do_permutation(right##d, left##d, 8, 0x00ff00ff); \
  106 + movabs $0x3f3f3f3f3f3f3f3f, RT3; /* mask keeping the low 6 bits of every byte */ \
  107 + movl left##d, RW0d; \
  108 + roll $1, right##d; \
  109 + xorl right##d, RW0d; \
  110 + andl $0xaaaaaaaa, RW0d; /* odd bit positions where left and the rotated right differ */ \
  111 + xorl RW0d, left##d; \
  112 + xorl RW0d, right##d; /* the two XORs exchange those bits between the halves */ \
  113 + roll $1, left##d; \
  114 + expand_to_64bits(right, RT3); \
  115 + expand_to_64bits(left, RT3);
  116 +
  117 +#define final_permutation(left, right) /* mirror image of initial_permutation: folds both halves back to 32 bits, then applies the same steps in reverse order; clobbers RT0 and RW0d */ \
  118 + compress_to_64bits(right); \
  119 + compress_to_64bits(left); \
  120 + movl right##d, RW0d; \
  121 + rorl $1, left##d; \
  122 + xorl left##d, RW0d; \
  123 + andl $0xaaaaaaaa, RW0d; /* odd bit positions where right and the rotated left differ */ \
  124 + xorl RW0d, right##d; \
  125 + xorl RW0d, left##d; /* the two XORs exchange those bits between the halves */ \
  126 + rorl $1, right##d; \
  127 + do_permutation(right##d, left##d, 8, 0x00ff00ff); \
  128 + do_permutation(right##d, left##d, 2, 0x33333333); \
  129 + do_permutation(left##d, right##d, 16, 0x0000ffff); \
  130 + do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
  131 +
  132 +#define round1(n, from, to, load_next_key) /* one round for a single block: to ^= the eight table entries indexed by the bytes of (RW0 ^ from); RW0 holds round key n on entry and is reloaded through load_next_key; clobbers RT0-RT3 and RL1 */ \
  133 + xorq from, RW0; \
  134 + \
  135 + movzbl RW0bl, RT0d; \
  136 + movzbl RW0bh, RT1d; \
  137 + shrq $16, RW0; \
  138 + movzbl RW0bl, RT2d; \
  139 + movzbl RW0bh, RT3d; \
  140 + shrq $16, RW0; \
  141 + movq s8(, RT0, 8), RT0; /* RT0 starts a second accumulator that is merged into to at the end */ \
  142 + xorq s6(, RT1, 8), to; \
  143 + movzbl RW0bl, RL1d; /* RL1 is borrowed as an extra scratch register */ \
  144 + movzbl RW0bh, RT1d; \
  145 + shrl $16, RW0d; \
  146 + xorq s4(, RT2, 8), RT0; \
  147 + xorq s2(, RT3, 8), to; \
  148 + movzbl RW0bl, RT2d; \
  149 + movzbl RW0bh, RT3d; \
  150 + xorq s7(, RL1, 8), RT0; \
  151 + xorq s5(, RT1, 8), to; \
  152 + xorq s3(, RT2, 8), RT0; \
  153 + load_next_key(n, RW0); /* all bytes of RW0 have been extracted, so it can be reloaded here */ \
  154 + xorq RT0, to; \
  155 + xorq s1(, RT3, 8), to; /* NOTE(review): the continuation below makes the following blank line part of this macro */ \
  156 +
  157 +#define load_next_key(n, RWx) /* RWx = 64-bit round key number n + 1, read from the array at CTX */ \
  158 + movq (((n) + 1) * 8)(CTX), RWx;
  159 +
  160 +#define dummy2(a, b) /*_*/ /* expands to nothing; passed instead of load_next_key or __movq on the last round */
  161 +
  162 +#define read_block(io, left, right) /* load the 8 bytes at (io) as two big-endian 32-bit words into left##d and right##d */ \
  163 + movl (io), left##d; \
  164 + movl 4(io), right##d; \
  165 + bswapl left##d; \
  166 + bswapl right##d;
  167 +
  168 +#define write_block(io, left, right) /* store left##d and right##d as big-endian 32-bit words at (io); both registers are left byte-swapped */ \
  169 + bswapl left##d; \
  170 + bswapl right##d; \
  171 + movl left##d, (io); \
  172 + movl right##d, 4(io);
  173 +
  174 +ENTRY(des3_ede_x86_64_crypt_blk)
  175 + /* Runs one 8-byte block through 48 rounds. input:
  176 + * %rdi: round keys, CTX (48 consecutive 64-bit keys are read)
  177 + * %rsi: dst (8 bytes written)
  178 + * %rdx: src (8 bytes read) -- NOTE(review): presumably the key order decides encrypt vs decrypt; confirm in the glue code
  179 + */
  180 + pushq %rbp;
  181 + pushq %rbx; /* NOTE(review): RW1 (%rbx), RR1 (%r12) and RR2 (%r13) are not referenced by the 1-way macros; they are saved anyway */
  182 + pushq %r12;
  183 + pushq %r13;
  184 + pushq %r14;
  185 + pushq %r15;
  186 +
  187 + read_block(%rdx, RL0, RR0); /* must precede initial_permutation, which overwrites RT3 (%rdx) */
  188 + initial_permutation(RL0, RR0);
  189 +
  190 + movq (CTX), RW0; /* round key 0; every later key is fetched inside round1 */
  191 +
  192 + round1(0, RR0, RL0, load_next_key);
  193 + round1(1, RL0, RR0, load_next_key);
  194 + round1(2, RR0, RL0, load_next_key);
  195 + round1(3, RL0, RR0, load_next_key);
  196 + round1(4, RR0, RL0, load_next_key);
  197 + round1(5, RL0, RR0, load_next_key);
  198 + round1(6, RR0, RL0, load_next_key);
  199 + round1(7, RL0, RR0, load_next_key);
  200 + round1(8, RR0, RL0, load_next_key);
  201 + round1(9, RL0, RR0, load_next_key);
  202 + round1(10, RR0, RL0, load_next_key);
  203 + round1(11, RL0, RR0, load_next_key);
  204 + round1(12, RR0, RL0, load_next_key);
  205 + round1(13, RL0, RR0, load_next_key);
  206 + round1(14, RR0, RL0, load_next_key);
  207 + round1(15, RL0, RR0, load_next_key);
  208 +
  209 + round1(16+0, RL0, RR0, load_next_key); /* same from/to as round 15: the halves are not alternated across the 16-round boundary */
  210 + round1(16+1, RR0, RL0, load_next_key);
  211 + round1(16+2, RL0, RR0, load_next_key);
  212 + round1(16+3, RR0, RL0, load_next_key);
  213 + round1(16+4, RL0, RR0, load_next_key);
  214 + round1(16+5, RR0, RL0, load_next_key);
  215 + round1(16+6, RL0, RR0, load_next_key);
  216 + round1(16+7, RR0, RL0, load_next_key);
  217 + round1(16+8, RL0, RR0, load_next_key);
  218 + round1(16+9, RR0, RL0, load_next_key);
  219 + round1(16+10, RL0, RR0, load_next_key);
  220 + round1(16+11, RR0, RL0, load_next_key);
  221 + round1(16+12, RL0, RR0, load_next_key);
  222 + round1(16+13, RR0, RL0, load_next_key);
  223 + round1(16+14, RL0, RR0, load_next_key);
  224 + round1(16+15, RR0, RL0, load_next_key);
  225 +
  226 + round1(32+0, RR0, RL0, load_next_key); /* same from/to as round 31, as at the previous boundary */
  227 + round1(32+1, RL0, RR0, load_next_key);
  228 + round1(32+2, RR0, RL0, load_next_key);
  229 + round1(32+3, RL0, RR0, load_next_key);
  230 + round1(32+4, RR0, RL0, load_next_key);
  231 + round1(32+5, RL0, RR0, load_next_key);
  232 + round1(32+6, RR0, RL0, load_next_key);
  233 + round1(32+7, RL0, RR0, load_next_key);
  234 + round1(32+8, RR0, RL0, load_next_key);
  235 + round1(32+9, RL0, RR0, load_next_key);
  236 + round1(32+10, RR0, RL0, load_next_key);
  237 + round1(32+11, RL0, RR0, load_next_key);
  238 + round1(32+12, RR0, RL0, load_next_key);
  239 + round1(32+13, RL0, RR0, load_next_key);
  240 + round1(32+14, RR0, RL0, load_next_key);
  241 + round1(32+15, RL0, RR0, dummy2); /* last round: dummy2 suppresses the load of a 49th key */
  242 +
  243 + final_permutation(RR0, RL0); /* the halves are passed in swapped order relative to initial_permutation */
  244 + write_block(%rsi, RR0, RL0);
  245 +
  246 + popq %r15;
  247 + popq %r14;
  248 + popq %r13;
  249 + popq %r12;
  250 + popq %rbx;
  251 + popq %rbp;
  252 +
  253 + ret;
  254 +ENDPROC(des3_ede_x86_64_crypt_blk)
  255 +
  256 +/***********************************************************************
  257 + * 3-way 3DES
  258 + ***********************************************************************/
  259 +#define expand_to_64bits(val, mask) /* token-for-token identical to the definition in the 1-way section above: val = (val | (ror32(val##d, 4) << 32)) & mask; clobbers RT0 */ \
  260 + movl val##d, RT0d; \
  261 + rorl $4, RT0d; \
  262 + shlq $32, RT0; \
  263 + orq RT0, val; \
  264 + andq mask, val;
  265 +
  266 +#define compress_to_64bits(val) /* token-for-token identical to the definition in the 1-way section above: val##d |= rol32(val >> 32, 4); clobbers RT0 */ \
  267 + movq val, RT0; \
  268 + shrq $32, RT0; \
  269 + roll $4, RT0d; \
  270 + orl RT0d, val##d;
  271 +
  272 +#define initial_permutation3(left, right) /* three-block form of initial_permutation; left and right are register-name prefixes completed with 0, 1 and 2; clobbers RT0, RT3 and RW0d-RW2d */ \
  273 + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
  274 + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
  275 + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
  276 + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
  277 + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
  278 + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
  279 + \
  280 + do_permutation(right##0d, left##0d, 2, 0x33333333); \
  281 + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
  282 + do_permutation(right##1d, left##1d, 2, 0x33333333); \
  283 + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
  284 + do_permutation(right##2d, left##2d, 2, 0x33333333); \
  285 + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
  286 + \
  287 + movabs $0x3f3f3f3f3f3f3f3f, RT3; /* mask keeping the low 6 bits of every byte, shared by all three blocks */ \
  288 + \
  289 + movl left##0d, RW0d; /* block 0 */ \
  290 + roll $1, right##0d; \
  291 + xorl right##0d, RW0d; \
  292 + andl $0xaaaaaaaa, RW0d; \
  293 + xorl RW0d, left##0d; \
  294 + xorl RW0d, right##0d; \
  295 + roll $1, left##0d; \
  296 + expand_to_64bits(right##0, RT3); \
  297 + expand_to_64bits(left##0, RT3); \
  298 + movl left##1d, RW1d; /* block 1 */ \
  299 + roll $1, right##1d; \
  300 + xorl right##1d, RW1d; \
  301 + andl $0xaaaaaaaa, RW1d; \
  302 + xorl RW1d, left##1d; \
  303 + xorl RW1d, right##1d; \
  304 + roll $1, left##1d; \
  305 + expand_to_64bits(right##1, RT3); \
  306 + expand_to_64bits(left##1, RT3); \
  307 + movl left##2d, RW2d; /* block 2 */ \
  308 + roll $1, right##2d; \
  309 + xorl right##2d, RW2d; \
  310 + andl $0xaaaaaaaa, RW2d; \
  311 + xorl RW2d, left##2d; \
  312 + xorl RW2d, right##2d; \
  313 + roll $1, left##2d; \
  314 + expand_to_64bits(right##2, RT3); \
  315 + expand_to_64bits(left##2, RT3);
  316 +
  317 +#define final_permutation3(left, right) /* three-block form of final_permutation; left and right are register-name prefixes completed with 0, 1 and 2; clobbers RT0 and RW0d-RW2d */ \
  318 + compress_to_64bits(right##0); /* block 0 */ \
  319 + compress_to_64bits(left##0); \
  320 + movl right##0d, RW0d; \
  321 + rorl $1, left##0d; \
  322 + xorl left##0d, RW0d; \
  323 + andl $0xaaaaaaaa, RW0d; \
  324 + xorl RW0d, right##0d; \
  325 + xorl RW0d, left##0d; \
  326 + rorl $1, right##0d; \
  327 + compress_to_64bits(right##1); /* block 1 */ \
  328 + compress_to_64bits(left##1); \
  329 + movl right##1d, RW1d; \
  330 + rorl $1, left##1d; \
  331 + xorl left##1d, RW1d; \
  332 + andl $0xaaaaaaaa, RW1d; \
  333 + xorl RW1d, right##1d; \
  334 + xorl RW1d, left##1d; \
  335 + rorl $1, right##1d; \
  336 + compress_to_64bits(right##2); /* block 2 */ \
  337 + compress_to_64bits(left##2); \
  338 + movl right##2d, RW2d; \
  339 + rorl $1, left##2d; \
  340 + xorl left##2d, RW2d; \
  341 + andl $0xaaaaaaaa, RW2d; \
  342 + xorl RW2d, right##2d; \
  343 + xorl RW2d, left##2d; \
  344 + rorl $1, right##2d; \
  345 + \
  346 + do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
  347 + do_permutation(right##0d, left##0d, 2, 0x33333333); \
  348 + do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
  349 + do_permutation(right##1d, left##1d, 2, 0x33333333); \
  350 + do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
  351 + do_permutation(right##2d, left##2d, 2, 0x33333333); \
  352 + \
  353 + do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
  354 + do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
  355 + do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
  356 + do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
  357 + do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
  358 + do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
  359 +
  360 +#define round3(n, from, to, load_next_key, do_movq) /* one round for three blocks: for each block i, to##i ^= the eight table entries indexed by the bytes of (RWi ^ from##i); RW0-RW2 hold round key n on entry and the next key on exit; clobbers RT1 and RT3 */ \
  361 + xorq from##0, RW0; /* block 0 */ \
  362 + movzbl RW0bl, RT3d; \
  363 + movzbl RW0bh, RT1d; \
  364 + shrq $16, RW0; \
  365 + xorq s8(, RT3, 8), to##0; \
  366 + xorq s6(, RT1, 8), to##0; \
  367 + movzbl RW0bl, RT3d; \
  368 + movzbl RW0bh, RT1d; \
  369 + shrq $16, RW0; \
  370 + xorq s4(, RT3, 8), to##0; \
  371 + xorq s2(, RT1, 8), to##0; \
  372 + movzbl RW0bl, RT3d; \
  373 + movzbl RW0bh, RT1d; \
  374 + shrl $16, RW0d; \
  375 + xorq s7(, RT3, 8), to##0; \
  376 + xorq s5(, RT1, 8), to##0; \
  377 + movzbl RW0bl, RT3d; \
  378 + movzbl RW0bh, RT1d; \
  379 + load_next_key(n, RW0); /* RW0 is fully consumed; it now carries the next key, which is copied to RW1 and RW2 below */ \
  380 + xorq s3(, RT3, 8), to##0; \
  381 + xorq s1(, RT1, 8), to##0; \
  382 + xorq from##1, RW1; /* block 1 */ \
  383 + movzbl RW1bl, RT3d; \
  384 + movzbl RW1bh, RT1d; \
  385 + shrq $16, RW1; \
  386 + xorq s8(, RT3, 8), to##1; \
  387 + xorq s6(, RT1, 8), to##1; \
  388 + movzbl RW1bl, RT3d; \
  389 + movzbl RW1bh, RT1d; \
  390 + shrq $16, RW1; \
  391 + xorq s4(, RT3, 8), to##1; \
  392 + xorq s2(, RT1, 8), to##1; \
  393 + movzbl RW1bl, RT3d; \
  394 + movzbl RW1bh, RT1d; \
  395 + shrl $16, RW1d; \
  396 + xorq s7(, RT3, 8), to##1; \
  397 + xorq s5(, RT1, 8), to##1; \
  398 + movzbl RW1bl, RT3d; \
  399 + movzbl RW1bh, RT1d; \
  400 + do_movq(RW0, RW1); /* next key for block 1 */ \
  401 + xorq s3(, RT3, 8), to##1; \
  402 + xorq s1(, RT1, 8), to##1; \
  403 + xorq from##2, RW2; /* block 2 */ \
  404 + movzbl RW2bl, RT3d; \
  405 + movzbl RW2bh, RT1d; \
  406 + shrq $16, RW2; \
  407 + xorq s8(, RT3, 8), to##2; \
  408 + xorq s6(, RT1, 8), to##2; \
  409 + movzbl RW2bl, RT3d; \
  410 + movzbl RW2bh, RT1d; \
  411 + shrq $16, RW2; \
  412 + xorq s4(, RT3, 8), to##2; \
  413 + xorq s2(, RT1, 8), to##2; \
  414 + movzbl RW2bl, RT3d; \
  415 + movzbl RW2bh, RT1d; \
  416 + shrl $16, RW2d; \
  417 + xorq s7(, RT3, 8), to##2; \
  418 + xorq s5(, RT1, 8), to##2; \
  419 + movzbl RW2bl, RT3d; \
  420 + movzbl RW2bh, RT1d; \
  421 + do_movq(RW0, RW2); /* next key for block 2 */ \
  422 + xorq s3(, RT3, 8), to##2; \
  423 + xorq s1(, RT1, 8), to##2;
  424 +
  425 +#define __movq(src, dst) /* plain 64-bit register move; passed to round3 as its do_movq argument */ \
  426 + movq src, dst;
  427 +
  428 +ENTRY(des3_ede_x86_64_crypt_blk_3way)
  429 + /* Runs three consecutive 8-byte blocks through 48 rounds with the same round keys. input:
  430 + * %rdi: ctx, round keys (48 consecutive 64-bit keys are read)
  431 + * %rsi: dst (3 blocks, 24 bytes written)
  432 + * %rdx: src (3 blocks, 24 bytes read)
  433 + */
  434 +
  435 + pushq %rbp;
  436 + pushq %rbx;
  437 + pushq %r12;
  438 + pushq %r13;
  439 + pushq %r14;
  440 + pushq %r15;
  441 +
  442 + /* load input; this must finish before initial_permutation3, which overwrites RT3 (%rdx) */
  443 + movl 0 * 4(%rdx), RL0d;
  444 + movl 1 * 4(%rdx), RR0d;
  445 + movl 2 * 4(%rdx), RL1d;
  446 + movl 3 * 4(%rdx), RR1d;
  447 + movl 4 * 4(%rdx), RL2d;
  448 + movl 5 * 4(%rdx), RR2d;
  449 +
  450 + bswapl RL0d; /* the input words are treated as big-endian */
  451 + bswapl RR0d;
  452 + bswapl RL1d;
  453 + bswapl RR1d;
  454 + bswapl RL2d;
  455 + bswapl RR2d;
  456 +
  457 + initial_permutation3(RL, RR);
  458 +
  459 + movq 0(CTX), RW0; /* round key 0, replicated into the work register of each block */
  460 + movq RW0, RW1;
  461 + movq RW0, RW2;
  462 +
  463 + round3(0, RR, RL, load_next_key, __movq);
  464 + round3(1, RL, RR, load_next_key, __movq);
  465 + round3(2, RR, RL, load_next_key, __movq);
  466 + round3(3, RL, RR, load_next_key, __movq);
  467 + round3(4, RR, RL, load_next_key, __movq);
  468 + round3(5, RL, RR, load_next_key, __movq);
  469 + round3(6, RR, RL, load_next_key, __movq);
  470 + round3(7, RL, RR, load_next_key, __movq);
  471 + round3(8, RR, RL, load_next_key, __movq);
  472 + round3(9, RL, RR, load_next_key, __movq);
  473 + round3(10, RR, RL, load_next_key, __movq);
  474 + round3(11, RL, RR, load_next_key, __movq);
  475 + round3(12, RR, RL, load_next_key, __movq);
  476 + round3(13, RL, RR, load_next_key, __movq);
  477 + round3(14, RR, RL, load_next_key, __movq);
  478 + round3(15, RL, RR, load_next_key, __movq);
  479 +
  480 + round3(16+0, RL, RR, load_next_key, __movq); /* same from/to as round 15: the halves are not alternated across the 16-round boundary */
  481 + round3(16+1, RR, RL, load_next_key, __movq);
  482 + round3(16+2, RL, RR, load_next_key, __movq);
  483 + round3(16+3, RR, RL, load_next_key, __movq);
  484 + round3(16+4, RL, RR, load_next_key, __movq);
  485 + round3(16+5, RR, RL, load_next_key, __movq);
  486 + round3(16+6, RL, RR, load_next_key, __movq);
  487 + round3(16+7, RR, RL, load_next_key, __movq);
  488 + round3(16+8, RL, RR, load_next_key, __movq);
  489 + round3(16+9, RR, RL, load_next_key, __movq);
  490 + round3(16+10, RL, RR, load_next_key, __movq);
  491 + round3(16+11, RR, RL, load_next_key, __movq);
  492 + round3(16+12, RL, RR, load_next_key, __movq);
  493 + round3(16+13, RR, RL, load_next_key, __movq);
  494 + round3(16+14, RL, RR, load_next_key, __movq);
  495 + round3(16+15, RR, RL, load_next_key, __movq);
  496 +
  497 + round3(32+0, RR, RL, load_next_key, __movq); /* same from/to as round 31, as at the previous boundary */
  498 + round3(32+1, RL, RR, load_next_key, __movq);
  499 + round3(32+2, RR, RL, load_next_key, __movq);
  500 + round3(32+3, RL, RR, load_next_key, __movq);
  501 + round3(32+4, RR, RL, load_next_key, __movq);
  502 + round3(32+5, RL, RR, load_next_key, __movq);
  503 + round3(32+6, RR, RL, load_next_key, __movq);
  504 + round3(32+7, RL, RR, load_next_key, __movq);
  505 + round3(32+8, RR, RL, load_next_key, __movq);
  506 + round3(32+9, RL, RR, load_next_key, __movq);
  507 + round3(32+10, RR, RL, load_next_key, __movq);
  508 + round3(32+11, RL, RR, load_next_key, __movq);
  509 + round3(32+12, RR, RL, load_next_key, __movq);
  510 + round3(32+13, RL, RR, load_next_key, __movq);
  511 + round3(32+14, RR, RL, load_next_key, __movq);
  512 + round3(32+15, RL, RR, dummy2, dummy2); /* last round: no further key is loaded or copied */
  513 +
  514 + final_permutation3(RR, RL); /* the halves are passed in swapped order relative to initial_permutation3 */
  515 +
  516 + bswapl RR0d;
  517 + bswapl RL0d;
  518 + bswapl RR1d;
  519 + bswapl RL1d;
  520 + bswapl RR2d;
  521 + bswapl RL2d;
  522 +
  523 + movl RR0d, 0 * 4(%rsi); /* each block is stored with its RR word first */
  524 + movl RL0d, 1 * 4(%rsi);
  525 + movl RR1d, 2 * 4(%rsi);
  526 + movl RL1d, 3 * 4(%rsi);
  527 + movl RR2d, 4 * 4(%rsi);
  528 + movl RL2d, 5 * 4(%rsi);
  529 +
  530 + popq %r15;
  531 + popq %r14;
  532 + popq %r13;
  533 + popq %r12;
  534 + popq %rbx;
  535 + popq %rbp;
  536 +
  537 + ret;
  538 +ENDPROC(des3_ede_x86_64_crypt_blk_3way)
  539 +
  540 +.data
  541 +.align 16
  542 +.L_s1:
  543 + .quad 0x0010100001010400, 0x0000000000000000
  544 + .quad 0x0000100000010000, 0x0010100001010404
  545 + .quad 0x0010100001010004, 0x0000100000010404
  546 + .quad 0x0000000000000004, 0x0000100000010000
  547 + .quad 0x0000000000000400, 0x0010100001010400
  548 + .quad 0x0010100001010404, 0x0000000000000400
  549 + .quad 0x0010000001000404, 0x0010100001010004
  550 + .quad 0x0010000001000000, 0x0000000000000004
  551 + .quad 0x0000000000000404, 0x0010000001000400
  552 + .quad 0x0010000001000400, 0x0000100000010400
  553 + .quad 0x0000100000010400, 0x0010100001010000
  554 + .quad 0x0010100001010000, 0x0010000001000404
  555 + .quad 0x0000100000010004, 0x0010000001000004
  556 + .quad 0x0010000001000004, 0x0000100000010004
  557 + .quad 0x0000000000000000, 0x0000000000000404
  558 + .quad 0x0000100000010404, 0x0010000001000000
  559 + .quad 0x0000100000010000, 0x0010100001010404
  560 + .quad 0x0000000000000004, 0x0010100001010000
  561 + .quad 0x0010100001010400, 0x0010000001000000
  562 + .quad 0x0010000001000000, 0x0000000000000400
  563 + .quad 0x0010100001010004, 0x0000100000010000
  564 + .quad 0x0000100000010400, 0x0010000001000004
  565 + .quad 0x0000000000000400, 0x0000000000000004
  566 + .quad 0x0010000001000404, 0x0000100000010404
  567 + .quad 0x0010100001010404, 0x0000100000010004
  568 + .quad 0x0010100001010000, 0x0010000001000404
  569 + .quad 0x0010000001000004, 0x0000000000000404
  570 + .quad 0x0000100000010404, 0x0010100001010400
  571 + .quad 0x0000000000000404, 0x0010000001000400
  572 + .quad 0x0010000001000400, 0x0000000000000000
  573 + .quad 0x0000100000010004, 0x0000100000010400
  574 + .quad 0x0000000000000000, 0x0010100001010004
  575 +.L_s2:
  576 + .quad 0x0801080200100020, 0x0800080000000000
  577 + .quad 0x0000080000000000, 0x0001080200100020
  578 + .quad 0x0001000000100000, 0x0000000200000020
  579 + .quad 0x0801000200100020, 0x0800080200000020
  580 + .quad 0x0800000200000020, 0x0801080200100020
  581 + .quad 0x0801080000100000, 0x0800000000000000
  582 + .quad 0x0800080000000000, 0x0001000000100000
  583 + .quad 0x0000000200000020, 0x0801000200100020
  584 + .quad 0x0001080000100000, 0x0001000200100020
  585 + .quad 0x0800080200000020, 0x0000000000000000
  586 + .quad 0x0800000000000000, 0x0000080000000000
  587 + .quad 0x0001080200100020, 0x0801000000100000
  588 + .quad 0x0001000200100020, 0x0800000200000020
  589 + .quad 0x0000000000000000, 0x0001080000100000
  590 + .quad 0x0000080200000020, 0x0801080000100000
  591 + .quad 0x0801000000100000, 0x0000080200000020
  592 + .quad 0x0000000000000000, 0x0001080200100020
  593 + .quad 0x0801000200100020, 0x0001000000100000
  594 + .quad 0x0800080200000020, 0x0801000000100000
  595 + .quad 0x0801080000100000, 0x0000080000000000
  596 + .quad 0x0801000000100000, 0x0800080000000000
  597 + .quad 0x0000000200000020, 0x0801080200100020
  598 + .quad 0x0001080200100020, 0x0000000200000020
  599 + .quad 0x0000080000000000, 0x0800000000000000
  600 + .quad 0x0000080200000020, 0x0801080000100000
  601 + .quad 0x0001000000100000, 0x0800000200000020
  602 + .quad 0x0001000200100020, 0x0800080200000020
  603 + .quad 0x0800000200000020, 0x0001000200100020
  604 + .quad 0x0001080000100000, 0x0000000000000000
  605 + .quad 0x0800080000000000, 0x0000080200000020
  606 + .quad 0x0800000000000000, 0x0801000200100020
  607 + .quad 0x0801080200100020, 0x0001080000100000
  608 +.L_s3:
  609 + .quad 0x0000002000000208, 0x0000202008020200
  610 + .quad 0x0000000000000000, 0x0000200008020008
  611 + .quad 0x0000002008000200, 0x0000000000000000
  612 + .quad 0x0000202000020208, 0x0000002008000200
  613 + .quad 0x0000200000020008, 0x0000000008000008
  614 + .quad 0x0000000008000008, 0x0000200000020000
  615 + .quad 0x0000202008020208, 0x0000200000020008
  616 + .quad 0x0000200008020000, 0x0000002000000208
  617 + .quad 0x0000000008000000, 0x0000000000000008
  618 + .quad 0x0000202008020200, 0x0000002000000200
  619 + .quad 0x0000202000020200, 0x0000200008020000
  620 + .quad 0x0000200008020008, 0x0000202000020208
  621 + .quad 0x0000002008000208, 0x0000202000020200
  622 + .quad 0x0000200000020000, 0x0000002008000208
  623 + .quad 0x0000000000000008, 0x0000202008020208
  624 + .quad 0x0000002000000200, 0x0000000008000000
  625 + .quad 0x0000202008020200, 0x0000000008000000
  626 + .quad 0x0000200000020008, 0x0000002000000208
  627 + .quad 0x0000200000020000, 0x0000202008020200
  628 + .quad 0x0000002008000200, 0x0000000000000000
  629 + .quad 0x0000002000000200, 0x0000200000020008
  630 + .quad 0x0000202008020208, 0x0000002008000200
  631 + .quad 0x0000000008000008, 0x0000002000000200
  632 + .quad 0x0000000000000000, 0x0000200008020008
  633 + .quad 0x0000002008000208, 0x0000200000020000
  634 + .quad 0x0000000008000000, 0x0000202008020208
  635 + .quad 0x0000000000000008, 0x0000202000020208
  636 + .quad 0x0000202000020200, 0x0000000008000008
  637 + .quad 0x0000200008020000, 0x0000002008000208
  638 + .quad 0x0000002000000208, 0x0000200008020000
  639 + .quad 0x0000202000020208, 0x0000000000000008
  640 + .quad 0x0000200008020008, 0x0000202000020200
  641 +.L_s4:
  642 + .quad 0x1008020000002001, 0x1000020800002001
  643 + .quad 0x1000020800002001, 0x0000000800000000
  644 + .quad 0x0008020800002000, 0x1008000800000001
  645 + .quad 0x1008000000000001, 0x1000020000002001
  646 + .quad 0x0000000000000000, 0x0008020000002000
  647 + .quad 0x0008020000002000, 0x1008020800002001
  648 + .quad 0x1000000800000001, 0x0000000000000000
  649 + .quad 0x0008000800000000, 0x1008000000000001
  650 + .quad 0x1000000000000001, 0x0000020000002000
  651 + .quad 0x0008000000000000, 0x1008020000002001
  652 + .quad 0x0000000800000000, 0x0008000000000000
  653 + .quad 0x1000020000002001, 0x0000020800002000
  654 + .quad 0x1008000800000001, 0x1000000000000001
  655 + .quad 0x0000020800002000, 0x0008000800000000
  656 + .quad 0x0000020000002000, 0x0008020800002000
  657 + .quad 0x1008020800002001, 0x1000000800000001
  658 + .quad 0x0008000800000000, 0x1008000000000001
  659 + .quad 0x0008020000002000, 0x1008020800002001
  660 + .quad 0x1000000800000001, 0x0000000000000000
  661 + .quad 0x0000000000000000, 0x0008020000002000
  662 + .quad 0x0000020800002000, 0x0008000800000000
  663 + .quad 0x1008000800000001, 0x1000000000000001
  664 + .quad 0x1008020000002001, 0x1000020800002001
  665 + .quad 0x1000020800002001, 0x0000000800000000
  666 + .quad 0x1008020800002001, 0x1000000800000001
  667 + .quad 0x1000000000000001, 0x0000020000002000
  668 + .quad 0x1008000000000001, 0x1000020000002001
  669 + .quad 0x0008020800002000, 0x1008000800000001
  670 + .quad 0x1000020000002001, 0x0000020800002000
  671 + .quad 0x0008000000000000, 0x1008020000002001
  672 + .quad 0x0000000800000000, 0x0008000000000000
  673 + .quad 0x0000020000002000, 0x0008020800002000
  674 +.L_s5:
  675 + .quad 0x0000001000000100, 0x0020001002080100
  676 + .quad 0x0020000002080000, 0x0420001002000100
  677 + .quad 0x0000000000080000, 0x0000001000000100
  678 + .quad 0x0400000000000000, 0x0020000002080000
  679 + .quad 0x0400001000080100, 0x0000000000080000
  680 + .quad 0x0020001002000100, 0x0400001000080100
  681 + .quad 0x0420001002000100, 0x0420000002080000
  682 + .quad 0x0000001000080100, 0x0400000000000000
  683 + .quad 0x0020000002000000, 0x0400000000080000
  684 + .quad 0x0400000000080000, 0x0000000000000000
  685 + .quad 0x0400001000000100, 0x0420001002080100
  686 + .quad 0x0420001002080100, 0x0020001002000100
  687 + .quad 0x0420000002080000, 0x0400001000000100
  688 + .quad 0x0000000000000000, 0x0420000002000000
  689 + .quad 0x0020001002080100, 0x0020000002000000
  690 + .quad 0x0420000002000000, 0x0000001000080100
  691 + .quad 0x0000000000080000, 0x0420001002000100
  692 + .quad 0x0000001000000100, 0x0020000002000000
  693 + .quad 0x0400000000000000, 0x0020000002080000
  694 + .quad 0x0420001002000100, 0x0400001000080100
  695 + .quad 0x0020001002000100, 0x0400000000000000
  696 + .quad 0x0420000002080000, 0x0020001002080100
  697 + .quad 0x0400001000080100, 0x0000001000000100
  698 + .quad 0x0020000002000000, 0x0420000002080000
  699 + .quad 0x0420001002080100, 0x0000001000080100
  700 + .quad 0x0420000002000000, 0x0420001002080100
  701 + .quad 0x0020000002080000, 0x0000000000000000
  702 + .quad 0x0400000000080000, 0x0420000002000000
  703 + .quad 0x0000001000080100, 0x0020001002000100
  704 + .quad 0x0400001000000100, 0x0000000000080000
  705 + .quad 0x0000000000000000, 0x0400000000080000
  706 + .quad 0x0020001002080100, 0x0400001000000100
  707 +.L_s6:
  708 + .quad 0x0200000120000010, 0x0204000020000000
  709 + .quad 0x0000040000000000, 0x0204040120000010
  710 + .quad 0x0204000020000000, 0x0000000100000010
  711 + .quad 0x0204040120000010, 0x0004000000000000
  712 + .quad 0x0200040020000000, 0x0004040100000010
  713 + .quad 0x0004000000000000, 0x0200000120000010
  714 + .quad 0x0004000100000010, 0x0200040020000000
  715 + .quad 0x0200000020000000, 0x0000040100000010
  716 + .quad 0x0000000000000000, 0x0004000100000010
  717 + .quad 0x0200040120000010, 0x0000040000000000
  718 + .quad 0x0004040000000000, 0x0200040120000010
  719 + .quad 0x0000000100000010, 0x0204000120000010
  720 + .quad 0x0204000120000010, 0x0000000000000000
  721 + .quad 0x0004040100000010, 0x0204040020000000
  722 + .quad 0x0000040100000010, 0x0004040000000000
  723 + .quad 0x0204040020000000, 0x0200000020000000
  724 + .quad 0x0200040020000000, 0x0000000100000010
  725 + .quad 0x0204000120000010, 0x0004040000000000
  726 + .quad 0x0204040120000010, 0x0004000000000000
  727 + .quad 0x0000040100000010, 0x0200000120000010
  728 + .quad 0x0004000000000000, 0x0200040020000000
  729 + .quad 0x0200000020000000, 0x0000040100000010
  730 + .quad 0x0200000120000010, 0x0204040120000010
  731 + .quad 0x0004040000000000, 0x0204000020000000
  732 + .quad 0x0004040100000010, 0x0204040020000000
  733 + .quad 0x0000000000000000, 0x0204000120000010
  734 + .quad 0x0000000100000010, 0x0000040000000000
  735 + .quad 0x0204000020000000, 0x0004040100000010
  736 + .quad 0x0000040000000000, 0x0004000100000010
  737 + .quad 0x0200040120000010, 0x0000000000000000
  738 + .quad 0x0204040020000000, 0x0200000020000000
  739 + .quad 0x0004000100000010, 0x0200040120000010
  740 +.L_s7:
  741 + .quad 0x0002000000200000, 0x2002000004200002
  742 + .quad 0x2000000004000802, 0x0000000000000000
  743 + .quad 0x0000000000000800, 0x2000000004000802
  744 + .quad 0x2002000000200802, 0x0002000004200800
  745 + .quad 0x2002000004200802, 0x0002000000200000
  746 + .quad 0x0000000000000000, 0x2000000004000002
  747 + .quad 0x2000000000000002, 0x0000000004000000
  748 + .quad 0x2002000004200002, 0x2000000000000802
  749 + .quad 0x0000000004000800, 0x2002000000200802
  750 + .quad 0x2002000000200002, 0x0000000004000800
  751 + .quad 0x2000000004000002, 0x0002000004200000
  752 + .quad 0x0002000004200800, 0x2002000000200002
  753 + .quad 0x0002000004200000, 0x0000000000000800
  754 + .quad 0x2000000000000802, 0x2002000004200802
  755 + .quad 0x0002000000200800, 0x2000000000000002
  756 + .quad 0x0000000004000000, 0x0002000000200800
  757 + .quad 0x0000000004000000, 0x0002000000200800
  758 + .quad 0x0002000000200000, 0x2000000004000802
  759 + .quad 0x2000000004000802, 0x2002000004200002
  760 + .quad 0x2002000004200002, 0x2000000000000002
  761 + .quad 0x2002000000200002, 0x0000000004000000
  762 + .quad 0x0000000004000800, 0x0002000000200000
  763 + .quad 0x0002000004200800, 0x2000000000000802
  764 + .quad 0x2002000000200802, 0x0002000004200800
  765 + .quad 0x2000000000000802, 0x2000000004000002
  766 + .quad 0x2002000004200802, 0x0002000004200000
  767 + .quad 0x0002000000200800, 0x0000000000000000
  768 + .quad 0x2000000000000002, 0x2002000004200802
  769 + .quad 0x0000000000000000, 0x2002000000200802
  770 + .quad 0x0002000004200000, 0x0000000000000800
  771 + .quad 0x2000000004000002, 0x0000000004000800
  772 + .quad 0x0000000000000800, 0x2002000000200002
  773 +.L_s8:
  774 + .quad 0x0100010410001000, 0x0000010000001000
  775 + .quad 0x0000000000040000, 0x0100010410041000
  776 + .quad 0x0100000010000000, 0x0100010410001000
  777 + .quad 0x0000000400000000, 0x0100000010000000
  778 + .quad 0x0000000400040000, 0x0100000010040000
  779 + .quad 0x0100010410041000, 0x0000010000041000
  780 + .quad 0x0100010010041000, 0x0000010400041000
  781 + .quad 0x0000010000001000, 0x0000000400000000
  782 + .quad 0x0100000010040000, 0x0100000410000000
  783 + .quad 0x0100010010001000, 0x0000010400001000
  784 + .quad 0x0000010000041000, 0x0000000400040000
  785 + .quad 0x0100000410040000, 0x0100010010041000
  786 + .quad 0x0000010400001000, 0x0000000000000000
  787 + .quad 0x0000000000000000, 0x0100000410040000
  788 + .quad 0x0100000410000000, 0x0100010010001000
  789 + .quad 0x0000010400041000, 0x0000000000040000
  790 + .quad 0x0000010400041000, 0x0000000000040000
  791 + .quad 0x0100010010041000, 0x0000010000001000
  792 + .quad 0x0000000400000000, 0x0100000410040000
  793 + .quad 0x0000010000001000, 0x0000010400041000
  794 + .quad 0x0100010010001000, 0x0000000400000000
  795 + .quad 0x0100000410000000, 0x0100000010040000
  796 + .quad 0x0100000410040000, 0x0100000010000000
  797 + .quad 0x0000000000040000, 0x0100010410001000
  798 + .quad 0x0000000000000000, 0x0100010410041000
  799 + .quad 0x0000000400040000, 0x0100000410000000
  800 + .quad 0x0100000010040000, 0x0100010010001000
  801 + .quad 0x0100010410001000, 0x0000000000000000
  802 + .quad 0x0100010410041000, 0x0000010000041000
  803 + .quad 0x0000010000041000, 0x0000010400001000
  804 + .quad 0x0000010400001000, 0x0000000400040000
  805 + .quad 0x0100000010000000, 0x0100010010041000
arch/x86/crypto/des3_ede_glue.c
  1 +/*
  2 + * Glue Code for assembler optimized version of 3DES
  3 + *
  4 + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5 + *
  6 + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  7 + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  8 + * CTR part based on code (crypto/ctr.c) by:
  9 + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
  10 + *
  11 + * This program is free software; you can redistribute it and/or modify
  12 + * it under the terms of the GNU General Public License as published by
  13 + * the Free Software Foundation; either version 2 of the License, or
  14 + * (at your option) any later version.
  15 + *
  16 + * This program is distributed in the hope that it will be useful,
  17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19 + * GNU General Public License for more details.
  20 + *
  21 + */
  22 +
  23 +#include <asm/processor.h>
  24 +#include <crypto/des.h>
  25 +#include <linux/crypto.h>
  26 +#include <linux/init.h>
  27 +#include <linux/module.h>
  28 +#include <linux/types.h>
  29 +#include <crypto/algapi.h>
  30 +
  31 +struct des3_ede_x86_ctx {
  32 + u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];
  33 + u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];
  34 +};
  35 +
  36 +/* regular block cipher functions */
  37 +asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
  38 + const u8 *src);
  39 +
  40 +/* 3-way parallel cipher functions */
  41 +asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
  42 + const u8 *src);
  43 +
  44 +static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
  45 + const u8 *src)
  46 +{
  47 + u32 *enc_ctx = ctx->enc_expkey;
  48 +
  49 + des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
  50 +}
  51 +
  52 +static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
  53 + const u8 *src)
  54 +{
  55 + u32 *dec_ctx = ctx->dec_expkey;
  56 +
  57 + des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
  58 +}
  59 +
  60 +static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
  61 + const u8 *src)
  62 +{
  63 + u32 *enc_ctx = ctx->enc_expkey;
  64 +
  65 + des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
  66 +}
  67 +
  68 +static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
  69 + const u8 *src)
  70 +{
  71 + u32 *dec_ctx = ctx->dec_expkey;
  72 +
  73 + des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
  74 +}
  75 +
  76 +static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
  77 +{
  78 + des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
  79 +}
  80 +
  81 +static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
  82 +{
  83 + des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
  84 +}
  85 +
  86 +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
  87 + const u32 *expkey)
  88 +{
  89 + unsigned int bsize = DES3_EDE_BLOCK_SIZE;
  90 + unsigned int nbytes;
  91 + int err;
  92 +
  93 + err = blkcipher_walk_virt(desc, walk);
  94 +
  95 + while ((nbytes = walk->nbytes)) {
  96 + u8 *wsrc = walk->src.virt.addr;
  97 + u8 *wdst = walk->dst.virt.addr;
  98 +
  99 + /* Process four block batch */
  100 + if (nbytes >= bsize * 3) {
  101 + do {
  102 + des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
  103 + wsrc);
  104 +
  105 + wsrc += bsize * 3;
  106 + wdst += bsize * 3;
  107 + nbytes -= bsize * 3;
  108 + } while (nbytes >= bsize * 3);
  109 +
  110 + if (nbytes < bsize)
  111 + goto done;
  112 + }
  113 +
  114 + /* Handle leftovers */
  115 + do {
  116 + des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
  117 +
  118 + wsrc += bsize;
  119 + wdst += bsize;
  120 + nbytes -= bsize;
  121 + } while (nbytes >= bsize);
  122 +
  123 +done:
  124 + err = blkcipher_walk_done(desc, walk, nbytes);
  125 + }
  126 +
  127 + return err;
  128 +}
  129 +
  130 +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  131 + struct scatterlist *src, unsigned int nbytes)
  132 +{
  133 + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  134 + struct blkcipher_walk walk;
  135 +
  136 + blkcipher_walk_init(&walk, dst, src, nbytes);
  137 + return ecb_crypt(desc, &walk, ctx->enc_expkey);
  138 +}
  139 +
  140 +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  141 + struct scatterlist *src, unsigned int nbytes)
  142 +{
  143 + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  144 + struct blkcipher_walk walk;
  145 +
  146 + blkcipher_walk_init(&walk, dst, src, nbytes);
  147 + return ecb_crypt(desc, &walk, ctx->dec_expkey);
  148 +}
  149 +
  150 +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
  151 + struct blkcipher_walk *walk)
  152 +{
  153 + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  154 + unsigned int bsize = DES3_EDE_BLOCK_SIZE;
  155 + unsigned int nbytes = walk->nbytes;
  156 + u64 *src = (u64 *)walk->src.virt.addr;
  157 + u64 *dst = (u64 *)walk->dst.virt.addr;
  158 + u64 *iv = (u64 *)walk->iv;
  159 +
  160 + do {
  161 + *dst = *src ^ *iv;
  162 + des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
  163 + iv = dst;
  164 +
  165 + src += 1;
  166 + dst += 1;
  167 + nbytes -= bsize;
  168 + } while (nbytes >= bsize);
  169 +
  170 + *(u64 *)walk->iv = *iv;
  171 + return nbytes;
  172 +}
  173 +
  174 +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  175 + struct scatterlist *src, unsigned int nbytes)
  176 +{
  177 + struct blkcipher_walk walk;
  178 + int err;
  179 +
  180 + blkcipher_walk_init(&walk, dst, src, nbytes);
  181 + err = blkcipher_walk_virt(desc, &walk);
  182 +
  183 + while ((nbytes = walk.nbytes)) {
  184 + nbytes = __cbc_encrypt(desc, &walk);
  185 + err = blkcipher_walk_done(desc, &walk, nbytes);
  186 + }
  187 +
  188 + return err;
  189 +}
  190 +
  191 +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
  192 + struct blkcipher_walk *walk)
  193 +{
  194 + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  195 + unsigned int bsize = DES3_EDE_BLOCK_SIZE;
  196 + unsigned int nbytes = walk->nbytes;
  197 + u64 *src = (u64 *)walk->src.virt.addr;
  198 + u64 *dst = (u64 *)walk->dst.virt.addr;
  199 + u64 ivs[3 - 1];
  200 + u64 last_iv;
  201 +
  202 + /* Start of the last block. */
  203 + src += nbytes / bsize - 1;
  204 + dst += nbytes / bsize - 1;
  205 +
  206 + last_iv = *src;
  207 +
  208 + /* Process four block batch */
  209 + if (nbytes >= bsize * 3) {
  210 + do {
  211 + nbytes -= bsize * 3 - bsize;
  212 + src -= 3 - 1;
  213 + dst -= 3 - 1;
  214 +
  215 + ivs[0] = src[0];
  216 + ivs[1] = src[1];
  217 +
  218 + des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
  219 +
  220 + dst[1] ^= ivs[0];
  221 + dst[2] ^= ivs[1];
  222 +
  223 + nbytes -= bsize;
  224 + if (nbytes < bsize)
  225 + goto done;
  226 +
  227 + *dst ^= *(src - 1);
  228 + src -= 1;
  229 + dst -= 1;
  230 + } while (nbytes >= bsize * 3);
  231 + }
  232 +
  233 + /* Handle leftovers */
  234 + for (;;) {
  235 + des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
  236 +
  237 + nbytes -= bsize;
  238 + if (nbytes < bsize)
  239 + break;
  240 +
  241 + *dst ^= *(src - 1);
  242 + src -= 1;
  243 + dst -= 1;
  244 + }
  245 +
  246 +done:
  247 + *dst ^= *(u64 *)walk->iv;
  248 + *(u64 *)walk->iv = last_iv;
  249 +
  250 + return nbytes;
  251 +}
  252 +
  253 +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  254 + struct scatterlist *src, unsigned int nbytes)
  255 +{
  256 + struct blkcipher_walk walk;
  257 + int err;
  258 +
  259 + blkcipher_walk_init(&walk, dst, src, nbytes);
  260 + err = blkcipher_walk_virt(desc, &walk);
  261 +
  262 + while ((nbytes = walk.nbytes)) {
  263 + nbytes = __cbc_decrypt(desc, &walk);
  264 + err = blkcipher_walk_done(desc, &walk, nbytes);
  265 + }
  266 +
  267 + return err;
  268 +}
  269 +
  270 +static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
  271 + struct blkcipher_walk *walk)
  272 +{
  273 + u8 *ctrblk = walk->iv;
  274 + u8 keystream[DES3_EDE_BLOCK_SIZE];
  275 + u8 *src = walk->src.virt.addr;
  276 + u8 *dst = walk->dst.virt.addr;
  277 + unsigned int nbytes = walk->nbytes;
  278 +
  279 + des3_ede_enc_blk(ctx, keystream, ctrblk);
  280 + crypto_xor(keystream, src, nbytes);
  281 + memcpy(dst, keystream, nbytes);
  282 +
  283 + crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
  284 +}
  285 +
  286 +static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
  287 + struct blkcipher_walk *walk)
  288 +{
  289 + struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  290 + unsigned int bsize = DES3_EDE_BLOCK_SIZE;
  291 + unsigned int nbytes = walk->nbytes;
  292 + u64 *src = (u64 *)walk->src.virt.addr;
  293 + u64 *dst = (u64 *)walk->dst.virt.addr;
  294 + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
  295 + __be64 ctrblocks[3];
  296 +
  297 + /* Process four block batch */
  298 + if (nbytes >= bsize * 3) {
  299 + do {
  300 + /* create ctrblks for parallel encrypt */
  301 + ctrblocks[0] = cpu_to_be64(ctrblk++);
  302 + ctrblocks[1] = cpu_to_be64(ctrblk++);
  303 + ctrblocks[2] = cpu_to_be64(ctrblk++);
  304 +
  305 + des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
  306 + (u8 *)ctrblocks);
  307 +
  308 + dst[0] = src[0] ^ ctrblocks[0];
  309 + dst[1] = src[1] ^ ctrblocks[1];
  310 + dst[2] = src[2] ^ ctrblocks[2];
  311 +
  312 + src += 3;
  313 + dst += 3;
  314 + } while ((nbytes -= bsize * 3) >= bsize * 3);
  315 +
  316 + if (nbytes < bsize)
  317 + goto done;
  318 + }
  319 +
  320 + /* Handle leftovers */
  321 + do {
  322 + ctrblocks[0] = cpu_to_be64(ctrblk++);
  323 +
  324 + des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
  325 +
  326 + dst[0] = src[0] ^ ctrblocks[0];
  327 +
  328 + src += 1;
  329 + dst += 1;
  330 + } while ((nbytes -= bsize) >= bsize);
  331 +
  332 +done:
  333 + *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
  334 + return nbytes;
  335 +}
  336 +
  337 +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  338 + struct scatterlist *src, unsigned int nbytes)
  339 +{
  340 + struct blkcipher_walk walk;
  341 + int err;
  342 +
  343 + blkcipher_walk_init(&walk, dst, src, nbytes);
  344 + err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE);
  345 +
  346 + while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
  347 + nbytes = __ctr_crypt(desc, &walk);
  348 + err = blkcipher_walk_done(desc, &walk, nbytes);
  349 + }
  350 +
  351 + if (walk.nbytes) {
  352 + ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
  353 + err = blkcipher_walk_done(desc, &walk, 0);
  354 + }
  355 +
  356 + return err;
  357 +}
  358 +
  359 +static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
  360 + unsigned int keylen)
  361 +{
  362 + struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
  363 + u32 i, j, tmp;
  364 + int err;
  365 +
  366 + /* Generate encryption context using generic implementation. */
  367 + err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
  368 + if (err < 0)
  369 + return err;
  370 +
  371 + /* Fix encryption context for this implementation and form decryption
  372 + * context. */
  373 + j = DES3_EDE_EXPKEY_WORDS - 2;
  374 + for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
  375 + tmp = ror32(ctx->enc_expkey[i + 1], 4);
  376 + ctx->enc_expkey[i + 1] = tmp;
  377 +
  378 + ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0];
  379 + ctx->dec_expkey[j + 1] = tmp;
  380 + }
  381 +
  382 + return 0;
  383 +}
  384 +
  385 +static struct crypto_alg des3_ede_algs[4] = { {
  386 + .cra_name = "des3_ede",
  387 + .cra_driver_name = "des3_ede-asm",
  388 + .cra_priority = 200,
  389 + .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
  390 + .cra_blocksize = DES3_EDE_BLOCK_SIZE,
  391 + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
  392 + .cra_alignmask = 0,
  393 + .cra_module = THIS_MODULE,
  394 + .cra_u = {
  395 + .cipher = {
  396 + .cia_min_keysize = DES3_EDE_KEY_SIZE,
  397 + .cia_max_keysize = DES3_EDE_KEY_SIZE,
  398 + .cia_setkey = des3_ede_x86_setkey,
  399 + .cia_encrypt = des3_ede_x86_encrypt,
  400 + .cia_decrypt = des3_ede_x86_decrypt,
  401 + }
  402 + }
  403 +}, {
  404 + .cra_name = "ecb(des3_ede)",
  405 + .cra_driver_name = "ecb-des3_ede-asm",
  406 + .cra_priority = 300,
  407 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
  408 + .cra_blocksize = DES3_EDE_BLOCK_SIZE,
  409 + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
  410 + .cra_alignmask = 0,
  411 + .cra_type = &crypto_blkcipher_type,
  412 + .cra_module = THIS_MODULE,
  413 + .cra_u = {
  414 + .blkcipher = {
  415 + .min_keysize = DES3_EDE_KEY_SIZE,
  416 + .max_keysize = DES3_EDE_KEY_SIZE,
  417 + .setkey = des3_ede_x86_setkey,
  418 + .encrypt = ecb_encrypt,
  419 + .decrypt = ecb_decrypt,
  420 + },
  421 + },
  422 +}, {
  423 + .cra_name = "cbc(des3_ede)",
  424 + .cra_driver_name = "cbc-des3_ede-asm",
  425 + .cra_priority = 300,
  426 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
  427 + .cra_blocksize = DES3_EDE_BLOCK_SIZE,
  428 + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
  429 + .cra_alignmask = 0,
  430 + .cra_type = &crypto_blkcipher_type,
  431 + .cra_module = THIS_MODULE,
  432 + .cra_u = {
  433 + .blkcipher = {
  434 + .min_keysize = DES3_EDE_KEY_SIZE,
  435 + .max_keysize = DES3_EDE_KEY_SIZE,
  436 + .ivsize = DES3_EDE_BLOCK_SIZE,
  437 + .setkey = des3_ede_x86_setkey,
  438 + .encrypt = cbc_encrypt,
  439 + .decrypt = cbc_decrypt,
  440 + },
  441 + },
  442 +}, {
  443 + .cra_name = "ctr(des3_ede)",
  444 + .cra_driver_name = "ctr-des3_ede-asm",
  445 + .cra_priority = 300,
  446 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
  447 + .cra_blocksize = 1,
  448 + .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
  449 + .cra_alignmask = 0,
  450 + .cra_type = &crypto_blkcipher_type,
  451 + .cra_module = THIS_MODULE,
  452 + .cra_u = {
  453 + .blkcipher = {
  454 + .min_keysize = DES3_EDE_KEY_SIZE,
  455 + .max_keysize = DES3_EDE_KEY_SIZE,
  456 + .ivsize = DES3_EDE_BLOCK_SIZE,
  457 + .setkey = des3_ede_x86_setkey,
  458 + .encrypt = ctr_crypt,
  459 + .decrypt = ctr_crypt,
  460 + },
  461 + },
  462 +} };
  463 +
  464 +static bool is_blacklisted_cpu(void)
  465 +{
  466 + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
  467 + return false;
  468 +
  469 + if (boot_cpu_data.x86 == 0x0f) {
  470 + /*
  471 + * On Pentium 4, des3_ede-x86_64 is slower than generic C
  472 + * implementation because use of 64bit rotates (which are really
  473 + * slow on P4). Therefore blacklist P4s.
  474 + */
  475 + return true;
  476 + }
  477 +
  478 + return false;
  479 +}
  480 +
  481 +static int force;
  482 +module_param(force, int, 0);
  483 +MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
  484 +
  485 +static int __init des3_ede_x86_init(void)
  486 +{
  487 + if (!force && is_blacklisted_cpu()) {
  488 + pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
  489 + return -ENODEV;
  490 + }
  491 +
  492 + return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
  493 +}
  494 +
  495 +static void __exit des3_ede_x86_fini(void)
  496 +{
  497 + crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
  498 +}
  499 +
  500 +module_init(des3_ede_x86_init);
  501 +module_exit(des3_ede_x86_fini);
  502 +
  503 +MODULE_LICENSE("GPL");
  504 +MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
  505 +MODULE_ALIAS("des3_ede");
  506 +MODULE_ALIAS("des3_ede-asm");
  507 +MODULE_ALIAS("des");
  508 +MODULE_ALIAS("des-asm");
  509 +MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
... ... @@ -1019,6 +1019,19 @@
1019 1019 DES cipher algorithm (FIPS 46-2), and Triple DES EDE (FIPS 46-3),
1020 1020 optimized using SPARC64 crypto opcodes.
1021 1021  
  1022 +config CRYPTO_DES3_EDE_X86_64
  1023 + tristate "Triple DES EDE cipher algorithm (x86-64)"
  1024 + depends on X86 && 64BIT
  1025 + select CRYPTO_ALGAPI
  1026 + select CRYPTO_DES
  1027 + help
  1028 + Triple DES EDE (FIPS 46-3) algorithm.
  1029 +
  1030 + This module provides an implementation of the Triple DES EDE cipher
  1031 + algorithm that is optimized for x86-64 processors. Two versions of the
  1032 + algorithm are provided: a regular one that processes one input block at
  1033 + a time, and one that processes three blocks in parallel.
  1034 +
1022 1035 config CRYPTO_FCRYPT
1023 1036 tristate "FCrypt cipher algorithm"
1024 1037 select CRYPTO_ALGAPI
crypto/des_generic.c
... ... @@ -859,13 +859,10 @@
859 859 * property.
860 860 *
861 861 */
862   -static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key,
863   - unsigned int keylen)
  862 +int __des3_ede_setkey(u32 *expkey, u32 *flags, const u8 *key,
  863 + unsigned int keylen)
864 864 {
865 865 const u32 *K = (const u32 *)key;
866   - struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
867   - u32 *expkey = dctx->expkey;
868   - u32 *flags = &tfm->crt_flags;
869 866  
870 867 if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
871 868 !((K[2] ^ K[4]) | (K[3] ^ K[5]))) &&
872 869  
... ... @@ -880,7 +877,18 @@
880 877  
881 878 return 0;
882 879 }
  880 +EXPORT_SYMBOL_GPL(__des3_ede_setkey);
883 881  
  882 +static int des3_ede_setkey(struct crypto_tfm *tfm, const u8 *key,
  883 + unsigned int keylen)
  884 +{
  885 + struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
  886 + u32 *flags = &tfm->crt_flags;
  887 + u32 *expkey = dctx->expkey;
  888 +
  889 + return __des3_ede_setkey(expkey, flags, key, keylen);
  890 +}
  891 +
884 892 static void des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
885 893 {
886 894 struct des3_ede_ctx *dctx = crypto_tfm_ctx(tfm);
... ... @@ -945,6 +953,8 @@
945 953  
946 954 static struct crypto_alg des_algs[2] = { {
947 955 .cra_name = "des",
  956 + .cra_driver_name = "des-generic",
  957 + .cra_priority = 100,
948 958 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
949 959 .cra_blocksize = DES_BLOCK_SIZE,
950 960 .cra_ctxsize = sizeof(struct des_ctx),
... ... @@ -958,6 +968,8 @@
958 968 .cia_decrypt = des_decrypt } }
959 969 }, {
960 970 .cra_name = "des3_ede",
  971 + .cra_driver_name = "des3_ede-generic",
  972 + .cra_priority = 100,
961 973 .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
962 974 .cra_blocksize = DES3_EDE_BLOCK_SIZE,
963 975 .cra_ctxsize = sizeof(struct des3_ede_ctx),
include/crypto/des.h
... ... @@ -16,5 +16,8 @@
16 16  
17 17 extern unsigned long des_ekey(u32 *pe, const u8 *k);
18 18  
  19 +extern int __des3_ede_setkey(u32 *expkey, u32 *flags, const u8 *key,
  20 + unsigned int keylen);
  21 +
19 22 #endif /* __CRYPTO_DES_H */