Commit 937c30d7f560210b0163035edd42b2aef78fed9e

Authored by Jussi Kivilinna
Committed by Herbert Xu
1 parent d19978f587

crypto: serpent - add 8-way parallel x86_64/SSE2 assembler implementation

Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in eight-block chunks (two 4-block chunk SSE2 operations
in parallel to improve performance on out-of-order CPUs). Glue code is based
on one from AES-NI implementation, so requests from irq context are redirected
to cryptd.

v2:
 - add missing include of linux/module.h
   (apparently crypto.h used to include module.h, which changed for 3.2 by
    commit 7c926402a7e8c9b279968fd94efec8700ba3859e)

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results (serpent-sse2/serpent_generic speed ratios):

AMD Phenom II 1055T (fam:16, model:10):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.03x   1.01x   1.03x   1.05x   1.00x   0.99x
64B     1.00x   1.01x   1.02x   1.04x   1.02x   1.01x
256B    2.34x   2.41x   0.99x   2.43x   2.39x   2.40x
1024B   2.51x   2.57x   1.00x   2.59x   2.56x   2.56x
8192B   2.50x   2.54x   1.00x   2.55x   2.57x   2.57x

Intel Celeron T1600 (fam:6, model:15, step:13):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.97x   0.97x   1.01x   1.01x   1.01x   1.02x
64B     1.00x   1.00x   1.00x   1.02x   1.01x   1.01x
256B    3.41x   3.35x   1.00x   3.39x   3.42x   3.44x
1024B   3.75x   3.72x   0.99x   3.74x   3.75x   3.75x
8192B   3.70x   3.68x   0.99x   3.68x   3.69x   3.69x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txt

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Showing 6 changed files with 1591 additions and 0 deletions Side-by-side Diff

arch/x86/crypto/Makefile
... ... @@ -11,6 +11,7 @@
11 11 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
12 12 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
13 13 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
  14 +obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
14 15 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15 16 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
16 17  
... ... @@ -26,6 +27,7 @@
26 27 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
27 28 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
28 29 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
  30 +serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
29 31  
30 32 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
31 33  
arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
  1 +/*
  2 + * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
  3 + *
  4 + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5 + *
  6 + * Based on crypto/serpent.c by
  7 + * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
  8 + * 2003 Herbert Valerio Riedel <hvr@gnu.org>
  9 + *
  10 + * This program is free software; you can redistribute it and/or modify
  11 + * it under the terms of the GNU General Public License as published by
  12 + * the Free Software Foundation; either version 2 of the License, or
  13 + * (at your option) any later version.
  14 + *
  15 + * This program is distributed in the hope that it will be useful,
  16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18 + * GNU General Public License for more details.
  19 + *
  20 + * You should have received a copy of the GNU General Public License
  21 + * along with this program; if not, write to the Free Software
  22 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  23 + * USA
  24 + *
  25 + */
  26 +
  27 +.file "serpent-sse2-x86_64-asm_64.S"
  28 +.text
  29 +
  30 +#define CTX %rdi
  31 +
  32 +/**********************************************************************
  33 + 8-way SSE2 serpent
  34 + **********************************************************************/
  35 +#define RA1 %xmm0
  36 +#define RB1 %xmm1
  37 +#define RC1 %xmm2
  38 +#define RD1 %xmm3
  39 +#define RE1 %xmm4
  40 +
  41 +#define RA2 %xmm5
  42 +#define RB2 %xmm6
  43 +#define RC2 %xmm7
  44 +#define RD2 %xmm8
  45 +#define RE2 %xmm9
  46 +
  47 +#define RNOT %xmm10
  48 +
  49 +#define RK0 %xmm11
  50 +#define RK1 %xmm12
  51 +#define RK2 %xmm13
  52 +#define RK3 %xmm14
  53 +
  54 +#define S0_1(x0, x1, x2, x3, x4) \
  55 + movdqa x3, x4; \
  56 + por x0, x3; \
  57 + pxor x4, x0; \
  58 + pxor x2, x4; \
  59 + pxor RNOT, x4; \
  60 + pxor x1, x3; \
  61 + pand x0, x1; \
  62 + pxor x4, x1; \
  63 + pxor x0, x2;
  64 +#define S0_2(x0, x1, x2, x3, x4) \
  65 + pxor x3, x0; \
  66 + por x0, x4; \
  67 + pxor x2, x0; \
  68 + pand x1, x2; \
  69 + pxor x2, x3; \
  70 + pxor RNOT, x1; \
  71 + pxor x4, x2; \
  72 + pxor x2, x1;
  73 +
  74 +#define S1_1(x0, x1, x2, x3, x4) \
  75 + movdqa x1, x4; \
  76 + pxor x0, x1; \
  77 + pxor x3, x0; \
  78 + pxor RNOT, x3; \
  79 + pand x1, x4; \
  80 + por x1, x0; \
  81 + pxor x2, x3; \
  82 + pxor x3, x0; \
  83 + pxor x3, x1;
  84 +#define S1_2(x0, x1, x2, x3, x4) \
  85 + pxor x4, x3; \
  86 + por x4, x1; \
  87 + pxor x2, x4; \
  88 + pand x0, x2; \
  89 + pxor x1, x2; \
  90 + por x0, x1; \
  91 + pxor RNOT, x0; \
  92 + pxor x2, x0; \
  93 + pxor x1, x4;
  94 +
  95 +#define S2_1(x0, x1, x2, x3, x4) \
  96 + pxor RNOT, x3; \
  97 + pxor x0, x1; \
  98 + movdqa x0, x4; \
  99 + pand x2, x0; \
  100 + pxor x3, x0; \
  101 + por x4, x3; \
  102 + pxor x1, x2; \
  103 + pxor x1, x3; \
  104 + pand x0, x1;
  105 +#define S2_2(x0, x1, x2, x3, x4) \
  106 + pxor x2, x0; \
  107 + pand x3, x2; \
  108 + por x1, x3; \
  109 + pxor RNOT, x0; \
  110 + pxor x0, x3; \
  111 + pxor x0, x4; \
  112 + pxor x2, x0; \
  113 + por x2, x1;
  114 +
  115 +#define S3_1(x0, x1, x2, x3, x4) \
  116 + movdqa x1, x4; \
  117 + pxor x3, x1; \
  118 + por x0, x3; \
  119 + pand x0, x4; \
  120 + pxor x2, x0; \
  121 + pxor x1, x2; \
  122 + pand x3, x1; \
  123 + pxor x3, x2; \
  124 + por x4, x0; \
  125 + pxor x3, x4;
  126 +#define S3_2(x0, x1, x2, x3, x4) \
  127 + pxor x0, x1; \
  128 + pand x3, x0; \
  129 + pand x4, x3; \
  130 + pxor x2, x3; \
  131 + por x1, x4; \
  132 + pand x1, x2; \
  133 + pxor x3, x4; \
  134 + pxor x3, x0; \
  135 + pxor x2, x3;
  136 +
  137 +#define S4_1(x0, x1, x2, x3, x4) \
  138 + movdqa x3, x4; \
  139 + pand x0, x3; \
  140 + pxor x4, x0; \
  141 + pxor x2, x3; \
  142 + por x4, x2; \
  143 + pxor x1, x0; \
  144 + pxor x3, x4; \
  145 + por x0, x2; \
  146 + pxor x1, x2;
  147 +#define S4_2(x0, x1, x2, x3, x4) \
  148 + pand x0, x1; \
  149 + pxor x4, x1; \
  150 + pand x2, x4; \
  151 + pxor x3, x2; \
  152 + pxor x0, x4; \
  153 + por x1, x3; \
  154 + pxor RNOT, x1; \
  155 + pxor x0, x3;
  156 +
  157 +#define S5_1(x0, x1, x2, x3, x4) \
  158 + movdqa x1, x4; \
  159 + por x0, x1; \
  160 + pxor x1, x2; \
  161 + pxor RNOT, x3; \
  162 + pxor x0, x4; \
  163 + pxor x2, x0; \
  164 + pand x4, x1; \
  165 + por x3, x4; \
  166 + pxor x0, x4;
  167 +#define S5_2(x0, x1, x2, x3, x4) \
  168 + pand x3, x0; \
  169 + pxor x3, x1; \
  170 + pxor x2, x3; \
  171 + pxor x1, x0; \
  172 + pand x4, x2; \
  173 + pxor x2, x1; \
  174 + pand x0, x2; \
  175 + pxor x2, x3;
  176 +
  177 +#define S6_1(x0, x1, x2, x3, x4) \
  178 + movdqa x1, x4; \
  179 + pxor x0, x3; \
  180 + pxor x2, x1; \
  181 + pxor x0, x2; \
  182 + pand x3, x0; \
  183 + por x3, x1; \
  184 + pxor RNOT, x4; \
  185 + pxor x1, x0; \
  186 + pxor x2, x1;
  187 +#define S6_2(x0, x1, x2, x3, x4) \
  188 + pxor x4, x3; \
  189 + pxor x0, x4; \
  190 + pand x0, x2; \
  191 + pxor x1, x4; \
  192 + pxor x3, x2; \
  193 + pand x1, x3; \
  194 + pxor x0, x3; \
  195 + pxor x2, x1;
  196 +
  197 +#define S7_1(x0, x1, x2, x3, x4) \
  198 + pxor RNOT, x1; \
  199 + movdqa x1, x4; \
  200 + pxor RNOT, x0; \
  201 + pand x2, x1; \
  202 + pxor x3, x1; \
  203 + por x4, x3; \
  204 + pxor x2, x4; \
  205 + pxor x3, x2; \
  206 + pxor x0, x3; \
  207 + por x1, x0;
  208 +#define S7_2(x0, x1, x2, x3, x4) \
  209 + pand x0, x2; \
  210 + pxor x4, x0; \
  211 + pxor x3, x4; \
  212 + pand x0, x3; \
  213 + pxor x1, x4; \
  214 + pxor x4, x2; \
  215 + pxor x1, x3; \
  216 + por x0, x4; \
  217 + pxor x1, x4;
  218 +
  219 +#define SI0_1(x0, x1, x2, x3, x4) \
  220 + movdqa x3, x4; \
  221 + pxor x0, x1; \
  222 + por x1, x3; \
  223 + pxor x1, x4; \
  224 + pxor RNOT, x0; \
  225 + pxor x3, x2; \
  226 + pxor x0, x3; \
  227 + pand x1, x0; \
  228 + pxor x2, x0;
  229 +#define SI0_2(x0, x1, x2, x3, x4) \
  230 + pand x3, x2; \
  231 + pxor x4, x3; \
  232 + pxor x3, x2; \
  233 + pxor x3, x1; \
  234 + pand x0, x3; \
  235 + pxor x0, x1; \
  236 + pxor x2, x0; \
  237 + pxor x3, x4;
  238 +
  239 +#define SI1_1(x0, x1, x2, x3, x4) \
  240 + pxor x3, x1; \
  241 + movdqa x0, x4; \
  242 + pxor x2, x0; \
  243 + pxor RNOT, x2; \
  244 + por x1, x4; \
  245 + pxor x3, x4; \
  246 + pand x1, x3; \
  247 + pxor x2, x1; \
  248 + pand x4, x2;
  249 +#define SI1_2(x0, x1, x2, x3, x4) \
  250 + pxor x1, x4; \
  251 + por x3, x1; \
  252 + pxor x0, x3; \
  253 + pxor x0, x2; \
  254 + por x4, x0; \
  255 + pxor x4, x2; \
  256 + pxor x0, x1; \
  257 + pxor x1, x4;
  258 +
  259 +#define SI2_1(x0, x1, x2, x3, x4) \
  260 + pxor x1, x2; \
  261 + movdqa x3, x4; \
  262 + pxor RNOT, x3; \
  263 + por x2, x3; \
  264 + pxor x4, x2; \
  265 + pxor x0, x4; \
  266 + pxor x1, x3; \
  267 + por x2, x1; \
  268 + pxor x0, x2;
  269 +#define SI2_2(x0, x1, x2, x3, x4) \
  270 + pxor x4, x1; \
  271 + por x3, x4; \
  272 + pxor x3, x2; \
  273 + pxor x2, x4; \
  274 + pand x1, x2; \
  275 + pxor x3, x2; \
  276 + pxor x4, x3; \
  277 + pxor x0, x4;
  278 +
  279 +#define SI3_1(x0, x1, x2, x3, x4) \
  280 + pxor x1, x2; \
  281 + movdqa x1, x4; \
  282 + pand x2, x1; \
  283 + pxor x0, x1; \
  284 + por x4, x0; \
  285 + pxor x3, x4; \
  286 + pxor x3, x0; \
  287 + por x1, x3; \
  288 + pxor x2, x1;
  289 +#define SI3_2(x0, x1, x2, x3, x4) \
  290 + pxor x3, x1; \
  291 + pxor x2, x0; \
  292 + pxor x3, x2; \
  293 + pand x1, x3; \
  294 + pxor x0, x1; \
  295 + pand x2, x0; \
  296 + pxor x3, x4; \
  297 + pxor x0, x3; \
  298 + pxor x1, x0;
  299 +
  300 +#define SI4_1(x0, x1, x2, x3, x4) \
  301 + pxor x3, x2; \
  302 + movdqa x0, x4; \
  303 + pand x1, x0; \
  304 + pxor x2, x0; \
  305 + por x3, x2; \
  306 + pxor RNOT, x4; \
  307 + pxor x0, x1; \
  308 + pxor x2, x0; \
  309 + pand x4, x2;
  310 +#define SI4_2(x0, x1, x2, x3, x4) \
  311 + pxor x0, x2; \
  312 + por x4, x0; \
  313 + pxor x3, x0; \
  314 + pand x2, x3; \
  315 + pxor x3, x4; \
  316 + pxor x1, x3; \
  317 + pand x0, x1; \
  318 + pxor x1, x4; \
  319 + pxor x3, x0;
  320 +
  321 +#define SI5_1(x0, x1, x2, x3, x4) \
  322 + movdqa x1, x4; \
  323 + por x2, x1; \
  324 + pxor x4, x2; \
  325 + pxor x3, x1; \
  326 + pand x4, x3; \
  327 + pxor x3, x2; \
  328 + por x0, x3; \
  329 + pxor RNOT, x0; \
  330 + pxor x2, x3; \
  331 + por x0, x2;
  332 +#define SI5_2(x0, x1, x2, x3, x4) \
  333 + pxor x1, x4; \
  334 + pxor x4, x2; \
  335 + pand x0, x4; \
  336 + pxor x1, x0; \
  337 + pxor x3, x1; \
  338 + pand x2, x0; \
  339 + pxor x3, x2; \
  340 + pxor x2, x0; \
  341 + pxor x4, x2; \
  342 + pxor x3, x4;
  343 +
  344 +#define SI6_1(x0, x1, x2, x3, x4) \
  345 + pxor x2, x0; \
  346 + movdqa x0, x4; \
  347 + pand x3, x0; \
  348 + pxor x3, x2; \
  349 + pxor x2, x0; \
  350 + pxor x1, x3; \
  351 + por x4, x2; \
  352 + pxor x3, x2; \
  353 + pand x0, x3;
  354 +#define SI6_2(x0, x1, x2, x3, x4) \
  355 + pxor RNOT, x0; \
  356 + pxor x1, x3; \
  357 + pand x2, x1; \
  358 + pxor x0, x4; \
  359 + pxor x4, x3; \
  360 + pxor x2, x4; \
  361 + pxor x1, x0; \
  362 + pxor x0, x2;
  363 +
  364 +#define SI7_1(x0, x1, x2, x3, x4) \
  365 + movdqa x3, x4; \
  366 + pand x0, x3; \
  367 + pxor x2, x0; \
  368 + por x4, x2; \
  369 + pxor x1, x4; \
  370 + pxor RNOT, x0; \
  371 + por x3, x1; \
  372 + pxor x0, x4; \
  373 + pand x2, x0; \
  374 + pxor x1, x0;
  375 +#define SI7_2(x0, x1, x2, x3, x4) \
  376 + pand x2, x1; \
  377 + pxor x2, x3; \
  378 + pxor x3, x4; \
  379 + pand x3, x2; \
  380 + por x0, x3; \
  381 + pxor x4, x1; \
  382 + pxor x4, x3; \
  383 + pand x0, x4; \
  384 + pxor x2, x4;
  385 +
  386 +#define get_key(i, j, t) \
  387 + movd (4*(i)+(j))*4(CTX), t; \
  388 + pshufd $0, t, t;
  389 +
  390 +#define K2(x0, x1, x2, x3, x4, i) \
  391 + get_key(i, 0, RK0); \
  392 + get_key(i, 1, RK1); \
  393 + get_key(i, 2, RK2); \
  394 + get_key(i, 3, RK3); \
  395 + pxor RK0, x0 ## 1; \
  396 + pxor RK1, x1 ## 1; \
  397 + pxor RK2, x2 ## 1; \
  398 + pxor RK3, x3 ## 1; \
  399 + pxor RK0, x0 ## 2; \
  400 + pxor RK1, x1 ## 2; \
  401 + pxor RK2, x2 ## 2; \
  402 + pxor RK3, x3 ## 2;
  403 +
  404 +#define LK2(x0, x1, x2, x3, x4, i) \
  405 + movdqa x0 ## 1, x4 ## 1; \
  406 + pslld $13, x0 ## 1; \
  407 + psrld $(32 - 13), x4 ## 1; \
  408 + por x4 ## 1, x0 ## 1; \
  409 + pxor x0 ## 1, x1 ## 1; \
  410 + movdqa x2 ## 1, x4 ## 1; \
  411 + pslld $3, x2 ## 1; \
  412 + psrld $(32 - 3), x4 ## 1; \
  413 + por x4 ## 1, x2 ## 1; \
  414 + pxor x2 ## 1, x1 ## 1; \
  415 + movdqa x0 ## 2, x4 ## 2; \
  416 + pslld $13, x0 ## 2; \
  417 + psrld $(32 - 13), x4 ## 2; \
  418 + por x4 ## 2, x0 ## 2; \
  419 + pxor x0 ## 2, x1 ## 2; \
  420 + movdqa x2 ## 2, x4 ## 2; \
  421 + pslld $3, x2 ## 2; \
  422 + psrld $(32 - 3), x4 ## 2; \
  423 + por x4 ## 2, x2 ## 2; \
  424 + pxor x2 ## 2, x1 ## 2; \
  425 + movdqa x1 ## 1, x4 ## 1; \
  426 + pslld $1, x1 ## 1; \
  427 + psrld $(32 - 1), x4 ## 1; \
  428 + por x4 ## 1, x1 ## 1; \
  429 + movdqa x0 ## 1, x4 ## 1; \
  430 + pslld $3, x4 ## 1; \
  431 + pxor x2 ## 1, x3 ## 1; \
  432 + pxor x4 ## 1, x3 ## 1; \
  433 + movdqa x3 ## 1, x4 ## 1; \
  434 + get_key(i, 1, RK1); \
  435 + movdqa x1 ## 2, x4 ## 2; \
  436 + pslld $1, x1 ## 2; \
  437 + psrld $(32 - 1), x4 ## 2; \
  438 + por x4 ## 2, x1 ## 2; \
  439 + movdqa x0 ## 2, x4 ## 2; \
  440 + pslld $3, x4 ## 2; \
  441 + pxor x2 ## 2, x3 ## 2; \
  442 + pxor x4 ## 2, x3 ## 2; \
  443 + movdqa x3 ## 2, x4 ## 2; \
  444 + get_key(i, 3, RK3); \
  445 + pslld $7, x3 ## 1; \
  446 + psrld $(32 - 7), x4 ## 1; \
  447 + por x4 ## 1, x3 ## 1; \
  448 + movdqa x1 ## 1, x4 ## 1; \
  449 + pslld $7, x4 ## 1; \
  450 + pxor x1 ## 1, x0 ## 1; \
  451 + pxor x3 ## 1, x0 ## 1; \
  452 + pxor x3 ## 1, x2 ## 1; \
  453 + pxor x4 ## 1, x2 ## 1; \
  454 + get_key(i, 0, RK0); \
  455 + pslld $7, x3 ## 2; \
  456 + psrld $(32 - 7), x4 ## 2; \
  457 + por x4 ## 2, x3 ## 2; \
  458 + movdqa x1 ## 2, x4 ## 2; \
  459 + pslld $7, x4 ## 2; \
  460 + pxor x1 ## 2, x0 ## 2; \
  461 + pxor x3 ## 2, x0 ## 2; \
  462 + pxor x3 ## 2, x2 ## 2; \
  463 + pxor x4 ## 2, x2 ## 2; \
  464 + get_key(i, 2, RK2); \
  465 + pxor RK1, x1 ## 1; \
  466 + pxor RK3, x3 ## 1; \
  467 + movdqa x0 ## 1, x4 ## 1; \
  468 + pslld $5, x0 ## 1; \
  469 + psrld $(32 - 5), x4 ## 1; \
  470 + por x4 ## 1, x0 ## 1; \
  471 + movdqa x2 ## 1, x4 ## 1; \
  472 + pslld $22, x2 ## 1; \
  473 + psrld $(32 - 22), x4 ## 1; \
  474 + por x4 ## 1, x2 ## 1; \
  475 + pxor RK0, x0 ## 1; \
  476 + pxor RK2, x2 ## 1; \
  477 + pxor RK1, x1 ## 2; \
  478 + pxor RK3, x3 ## 2; \
  479 + movdqa x0 ## 2, x4 ## 2; \
  480 + pslld $5, x0 ## 2; \
  481 + psrld $(32 - 5), x4 ## 2; \
  482 + por x4 ## 2, x0 ## 2; \
  483 + movdqa x2 ## 2, x4 ## 2; \
  484 + pslld $22, x2 ## 2; \
  485 + psrld $(32 - 22), x4 ## 2; \
  486 + por x4 ## 2, x2 ## 2; \
  487 + pxor RK0, x0 ## 2; \
  488 + pxor RK2, x2 ## 2;
  489 +
  490 +#define KL2(x0, x1, x2, x3, x4, i) \
  491 + pxor RK0, x0 ## 1; \
  492 + pxor RK2, x2 ## 1; \
  493 + movdqa x0 ## 1, x4 ## 1; \
  494 + psrld $5, x0 ## 1; \
  495 + pslld $(32 - 5), x4 ## 1; \
  496 + por x4 ## 1, x0 ## 1; \
  497 + pxor RK3, x3 ## 1; \
  498 + pxor RK1, x1 ## 1; \
  499 + movdqa x2 ## 1, x4 ## 1; \
  500 + psrld $22, x2 ## 1; \
  501 + pslld $(32 - 22), x4 ## 1; \
  502 + por x4 ## 1, x2 ## 1; \
  503 + pxor x3 ## 1, x2 ## 1; \
  504 + pxor RK0, x0 ## 2; \
  505 + pxor RK2, x2 ## 2; \
  506 + movdqa x0 ## 2, x4 ## 2; \
  507 + psrld $5, x0 ## 2; \
  508 + pslld $(32 - 5), x4 ## 2; \
  509 + por x4 ## 2, x0 ## 2; \
  510 + pxor RK3, x3 ## 2; \
  511 + pxor RK1, x1 ## 2; \
  512 + movdqa x2 ## 2, x4 ## 2; \
  513 + psrld $22, x2 ## 2; \
  514 + pslld $(32 - 22), x4 ## 2; \
  515 + por x4 ## 2, x2 ## 2; \
  516 + pxor x3 ## 2, x2 ## 2; \
  517 + pxor x3 ## 1, x0 ## 1; \
  518 + movdqa x1 ## 1, x4 ## 1; \
  519 + pslld $7, x4 ## 1; \
  520 + pxor x1 ## 1, x0 ## 1; \
  521 + pxor x4 ## 1, x2 ## 1; \
  522 + movdqa x1 ## 1, x4 ## 1; \
  523 + psrld $1, x1 ## 1; \
  524 + pslld $(32 - 1), x4 ## 1; \
  525 + por x4 ## 1, x1 ## 1; \
  526 + pxor x3 ## 2, x0 ## 2; \
  527 + movdqa x1 ## 2, x4 ## 2; \
  528 + pslld $7, x4 ## 2; \
  529 + pxor x1 ## 2, x0 ## 2; \
  530 + pxor x4 ## 2, x2 ## 2; \
  531 + movdqa x1 ## 2, x4 ## 2; \
  532 + psrld $1, x1 ## 2; \
  533 + pslld $(32 - 1), x4 ## 2; \
  534 + por x4 ## 2, x1 ## 2; \
  535 + movdqa x3 ## 1, x4 ## 1; \
  536 + psrld $7, x3 ## 1; \
  537 + pslld $(32 - 7), x4 ## 1; \
  538 + por x4 ## 1, x3 ## 1; \
  539 + pxor x0 ## 1, x1 ## 1; \
  540 + movdqa x0 ## 1, x4 ## 1; \
  541 + pslld $3, x4 ## 1; \
  542 + pxor x4 ## 1, x3 ## 1; \
  543 + movdqa x0 ## 1, x4 ## 1; \
  544 + movdqa x3 ## 2, x4 ## 2; \
  545 + psrld $7, x3 ## 2; \
  546 + pslld $(32 - 7), x4 ## 2; \
  547 + por x4 ## 2, x3 ## 2; \
  548 + pxor x0 ## 2, x1 ## 2; \
  549 + movdqa x0 ## 2, x4 ## 2; \
  550 + pslld $3, x4 ## 2; \
  551 + pxor x4 ## 2, x3 ## 2; \
  552 + movdqa x0 ## 2, x4 ## 2; \
  553 + psrld $13, x0 ## 1; \
  554 + pslld $(32 - 13), x4 ## 1; \
  555 + por x4 ## 1, x0 ## 1; \
  556 + pxor x2 ## 1, x1 ## 1; \
  557 + pxor x2 ## 1, x3 ## 1; \
  558 + movdqa x2 ## 1, x4 ## 1; \
  559 + psrld $3, x2 ## 1; \
  560 + pslld $(32 - 3), x4 ## 1; \
  561 + por x4 ## 1, x2 ## 1; \
  562 + psrld $13, x0 ## 2; \
  563 + pslld $(32 - 13), x4 ## 2; \
  564 + por x4 ## 2, x0 ## 2; \
  565 + pxor x2 ## 2, x1 ## 2; \
  566 + pxor x2 ## 2, x3 ## 2; \
  567 + movdqa x2 ## 2, x4 ## 2; \
  568 + psrld $3, x2 ## 2; \
  569 + pslld $(32 - 3), x4 ## 2; \
  570 + por x4 ## 2, x2 ## 2;
  571 +
  572 +#define S(SBOX, x0, x1, x2, x3, x4) \
  573 + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
  574 + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
  575 + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
  576 + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
  577 +
  578 +#define SP(SBOX, x0, x1, x2, x3, x4, i) \
  579 + get_key(i, 0, RK0); \
  580 + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
  581 + get_key(i, 2, RK2); \
  582 + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
  583 + get_key(i, 3, RK3); \
  584 + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
  585 + get_key(i, 1, RK1); \
  586 + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
  587 +
  588 +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
  589 + movdqa x2, t3; \
  590 + movdqa x0, t1; \
  591 + unpcklps x3, t3; \
  592 + movdqa x0, t2; \
  593 + unpcklps x1, t1; \
  594 + unpckhps x1, t2; \
  595 + movdqa t3, x1; \
  596 + unpckhps x3, x2; \
  597 + movdqa t1, x0; \
  598 + movhlps t1, x1; \
  599 + movdqa t2, t1; \
  600 + movlhps t3, x0; \
  601 + movlhps x2, t1; \
  602 + movhlps t2, x2; \
  603 + movdqa x2, x3; \
  604 + movdqa t1, x2;
  605 +
  606 +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
  607 + movdqu (0*4*4)(in), x0; \
  608 + movdqu (1*4*4)(in), x1; \
  609 + movdqu (2*4*4)(in), x2; \
  610 + movdqu (3*4*4)(in), x3; \
  611 + \
  612 + transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
  613 +
  614 +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
  615 + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  616 + \
  617 + movdqu x0, (0*4*4)(out); \
  618 + movdqu x1, (1*4*4)(out); \
  619 + movdqu x2, (2*4*4)(out); \
  620 + movdqu x3, (3*4*4)(out);
  621 +
  622 +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
  623 + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
  624 + \
  625 + movdqu (0*4*4)(out), t0; \
  626 + pxor t0, x0; \
  627 + movdqu x0, (0*4*4)(out); \
  628 + movdqu (1*4*4)(out), t0; \
  629 + pxor t0, x1; \
  630 + movdqu x1, (1*4*4)(out); \
  631 + movdqu (2*4*4)(out), t0; \
  632 + pxor t0, x2; \
  633 + movdqu x2, (2*4*4)(out); \
  634 + movdqu (3*4*4)(out), t0; \
  635 + pxor t0, x3; \
  636 + movdqu x3, (3*4*4)(out);
  637 +
  638 +.align 8
  639 +.global __serpent_enc_blk_8way
  640 +.type __serpent_enc_blk_8way,@function;
  641 +
  642 +__serpent_enc_blk_8way:
  643 + /* input:
  644 + * %rdi: ctx, CTX
  645 + * %rsi: dst
  646 + * %rdx: src
  647 + * %rcx: bool, if true: xor output
  648 + */
  649 +
  650 + pcmpeqd RNOT, RNOT;
  651 +
  652 + leaq (4*4*4)(%rdx), %rax;
  653 + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
  654 + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
  655 +
  656 + K2(RA, RB, RC, RD, RE, 0);
  657 + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
  658 + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
  659 + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
  660 + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
  661 + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
  662 + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
  663 + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
  664 + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
  665 + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
  666 + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
  667 + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
  668 + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
  669 + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
  670 + S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
  671 + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
  672 + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
  673 + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
  674 + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
  675 + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
  676 + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
  677 + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
  678 + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
  679 + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
  680 + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
  681 + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
  682 + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
  683 + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
  684 + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
  685 + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
  686 + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
  687 + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
  688 + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
  689 +
  690 + leaq (4*4*4)(%rsi), %rax;
  691 +
  692 + testb %cl, %cl;
  693 + jnz __enc_xor8;
  694 +
  695 + write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
  696 + write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
  697 +
  698 + ret;
  699 +
  700 +__enc_xor8:
  701 + xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
  702 + xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
  703 +
  704 + ret;
  705 +
  706 +.align 8
  707 +.global serpent_dec_blk_8way
  708 +.type serpent_dec_blk_8way,@function;
  709 +
  710 +serpent_dec_blk_8way:
  711 + /* input:
  712 + * %rdi: ctx, CTX
  713 + * %rsi: dst
  714 + * %rdx: src
  715 + */
  716 +
  717 + pcmpeqd RNOT, RNOT;
  718 +
  719 + leaq (4*4*4)(%rdx), %rax;
  720 + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
  721 + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
  722 +
  723 + K2(RA, RB, RC, RD, RE, 32);
  724 + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
  725 + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
  726 + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
  727 + SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
  728 + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
  729 + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
  730 + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
  731 + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
  732 + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
  733 + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
  734 + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
  735 + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
  736 + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
  737 + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
  738 + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
  739 + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
  740 + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
  741 + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
  742 + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
  743 + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
  744 + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
  745 + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
  746 + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
  747 + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
  748 + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
  749 + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
  750 + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
  751 + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
  752 + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
  753 + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
  754 + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
  755 + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
  756 +
  757 + leaq (4*4*4)(%rsi), %rax;
  758 + write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
  759 + write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
  760 +
  761 + ret;
arch/x86/crypto/serpent_sse2_glue.c
  1 +/*
  2 + * Glue Code for SSE2 assembler versions of Serpent Cipher
  3 + *
  4 + * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
  5 + *
  6 + * Glue code based on aesni-intel_glue.c by:
  7 + * Copyright (C) 2008, Intel Corp.
  8 + * Author: Huang Ying <ying.huang@intel.com>
  9 + *
  10 + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  11 + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
  12 + * CTR part based on code (crypto/ctr.c) by:
  13 + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
  14 + *
  15 + * This program is free software; you can redistribute it and/or modify
  16 + * it under the terms of the GNU General Public License as published by
  17 + * the Free Software Foundation; either version 2 of the License, or
  18 + * (at your option) any later version.
  19 + *
  20 + * This program is distributed in the hope that it will be useful,
  21 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  23 + * GNU General Public License for more details.
  24 + *
  25 + * You should have received a copy of the GNU General Public License
  26 + * along with this program; if not, write to the Free Software
  27 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  28 + * USA
  29 + *
  30 + */
  31 +
  32 +#include <linux/module.h>
  33 +#include <linux/hardirq.h>
  34 +#include <linux/types.h>
  35 +#include <linux/crypto.h>
  36 +#include <linux/err.h>
  37 +#include <crypto/algapi.h>
  38 +#include <crypto/serpent.h>
  39 +#include <crypto/cryptd.h>
  40 +#include <crypto/b128ops.h>
  41 +#include <crypto/ctr.h>
  42 +#include <asm/i387.h>
  43 +#include <asm/serpent.h>
  44 +#include <crypto/scatterwalk.h>
  45 +#include <linux/workqueue.h>
  46 +#include <linux/spinlock.h>
  47 +
  48 +struct async_serpent_ctx {
  49 + struct cryptd_ablkcipher *cryptd_tfm;
  50 +};
  51 +
  52 +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
  53 +{
  54 + if (fpu_enabled)
  55 + return true;
  56 +
  57 + /* SSE2 is only used when chunk to be processed is large enough, so
  58 + * do not enable FPU until it is necessary.
  59 + */
  60 + if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
  61 + return false;
  62 +
  63 + kernel_fpu_begin();
  64 + return true;
  65 +}
  66 +
  67 +static inline void serpent_fpu_end(bool fpu_enabled)
  68 +{
  69 + if (fpu_enabled)
  70 + kernel_fpu_end();
  71 +}
  72 +
  73 +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
  74 + bool enc)
  75 +{
  76 + bool fpu_enabled = false;
  77 + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  78 + const unsigned int bsize = SERPENT_BLOCK_SIZE;
  79 + unsigned int nbytes;
  80 + int err;
  81 +
  82 + err = blkcipher_walk_virt(desc, walk);
  83 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  84 +
  85 + while ((nbytes = walk->nbytes)) {
  86 + u8 *wsrc = walk->src.virt.addr;
  87 + u8 *wdst = walk->dst.virt.addr;
  88 +
  89 + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
  90 +
  91 + /* Process multi-block batch */
  92 + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
  93 + do {
  94 + if (enc)
  95 + serpent_enc_blk_xway(ctx, wdst, wsrc);
  96 + else
  97 + serpent_dec_blk_xway(ctx, wdst, wsrc);
  98 +
  99 + wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
  100 + wdst += bsize * SERPENT_PARALLEL_BLOCKS;
  101 + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
  102 + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
  103 +
  104 + if (nbytes < bsize)
  105 + goto done;
  106 + }
  107 +
  108 + /* Handle leftovers */
  109 + do {
  110 + if (enc)
  111 + __serpent_encrypt(ctx, wdst, wsrc);
  112 + else
  113 + __serpent_decrypt(ctx, wdst, wsrc);
  114 +
  115 + wsrc += bsize;
  116 + wdst += bsize;
  117 + nbytes -= bsize;
  118 + } while (nbytes >= bsize);
  119 +
  120 +done:
  121 + err = blkcipher_walk_done(desc, walk, nbytes);
  122 + }
  123 +
  124 + serpent_fpu_end(fpu_enabled);
  125 + return err;
  126 +}
  127 +
  128 +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  129 + struct scatterlist *src, unsigned int nbytes)
  130 +{
  131 + struct blkcipher_walk walk;
  132 +
  133 + blkcipher_walk_init(&walk, dst, src, nbytes);
  134 + return ecb_crypt(desc, &walk, true);
  135 +}
  136 +
  137 +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  138 + struct scatterlist *src, unsigned int nbytes)
  139 +{
  140 + struct blkcipher_walk walk;
  141 +
  142 + blkcipher_walk_init(&walk, dst, src, nbytes);
  143 + return ecb_crypt(desc, &walk, false);
  144 +}
  145 +
/*
 * Internal synchronous ECB blkcipher.  cra_priority 0 keeps it from being
 * chosen by generic algorithm lookup; it is consumed through cryptd by
 * ablk_ecb_init() below (see "__driver-ecb-serpent-sse2").
 */
static struct crypto_alg blk_ecb_alg = {
	.cra_name = "__ecb-serpent-sse2",
	.cra_driver_name = "__driver-ecb-serpent-sse2",
	.cra_priority = 0,
	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize = SERPENT_BLOCK_SIZE,
	.cra_ctxsize = sizeof(struct serpent_ctx),
	.cra_alignmask = 0,
	.cra_type = &crypto_blkcipher_type,
	.cra_module = THIS_MODULE,
	.cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize = SERPENT_MIN_KEY_SIZE,
			.max_keysize = SERPENT_MAX_KEY_SIZE,
			.setkey = serpent_setkey,
			.encrypt = ecb_encrypt,
			.decrypt = ecb_decrypt,
		},
	},
};
  167 +
  168 +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
  169 + struct blkcipher_walk *walk)
  170 +{
  171 + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
  172 + const unsigned int bsize = SERPENT_BLOCK_SIZE;
  173 + unsigned int nbytes = walk->nbytes;
  174 + u128 *src = (u128 *)walk->src.virt.addr;
  175 + u128 *dst = (u128 *)walk->dst.virt.addr;
  176 + u128 *iv = (u128 *)walk->iv;
  177 +
  178 + do {
  179 + u128_xor(dst, src, iv);
  180 + __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
  181 + iv = dst;
  182 +
  183 + src += 1;
  184 + dst += 1;
  185 + nbytes -= bsize;
  186 + } while (nbytes >= bsize);
  187 +
  188 + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
  189 + return nbytes;
  190 +}
  191 +
  192 +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  193 + struct scatterlist *src, unsigned int nbytes)
  194 +{
  195 + struct blkcipher_walk walk;
  196 + int err;
  197 +
  198 + blkcipher_walk_init(&walk, dst, src, nbytes);
  199 + err = blkcipher_walk_virt(desc, &walk);
  200 +
  201 + while ((nbytes = walk.nbytes)) {
  202 + nbytes = __cbc_encrypt(desc, &walk);
  203 + err = blkcipher_walk_done(desc, &walk, nbytes);
  204 + }
  205 +
  206 + return err;
  207 +}
  208 +
/*
 * CBC-decrypt one walk chunk, eight blocks at a time where possible.
 * Blocks are processed from the END of the chunk backwards so that each
 * ciphertext block is still intact when it is needed as the chaining
 * input of the following block — this makes in-place (dst == src)
 * operation safe.  Returns the bytes left unprocessed (< block size).
 */
static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
				  struct blkcipher_walk *walk)
{
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	const unsigned int bsize = SERPENT_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	/* Copies of ciphertext blocks 1..7 of a batch, saved before the
	 * in-place 8-way decrypt overwrites them (when dst == src). */
	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
	u128 last_iv;
	int i;

	/* Start of the last block. */
	src += nbytes / bsize - 1;
	dst += nbytes / bsize - 1;

	/* The chunk's last ciphertext block becomes the next IV. */
	last_iv = *src;

	/* Process multi-block batch */
	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
		do {
			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
			src -= SERPENT_PARALLEL_BLOCKS - 1;
			dst -= SERPENT_PARALLEL_BLOCKS - 1;

			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
				ivs[i] = src[i];

			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);

			/* Chain blocks 1..7 with the saved ciphertexts. */
			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);

			nbytes -= bsize;
			if (nbytes < bsize)
				goto done;

			/* Chain the batch's first block with the block
			 * preceding the batch. */
			u128_xor(dst, dst, src - 1);
			src -= 1;
			dst -= 1;
		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);

		if (nbytes < bsize)
			goto done;
	}

	/* Handle leftovers */
	for (;;) {
		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);

		nbytes -= bsize;
		if (nbytes < bsize)
			break;

		u128_xor(dst, dst, src - 1);
		src -= 1;
		dst -= 1;
	}

done:
	/* First block of the chunk chains with the incoming IV; then
	 * publish the saved last ciphertext block as the next IV. */
	u128_xor(dst, dst, (u128 *)walk->iv);
	*(u128 *)walk->iv = last_iv;

	return nbytes;
}
  274 +
  275 +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  276 + struct scatterlist *src, unsigned int nbytes)
  277 +{
  278 + bool fpu_enabled = false;
  279 + struct blkcipher_walk walk;
  280 + int err;
  281 +
  282 + blkcipher_walk_init(&walk, dst, src, nbytes);
  283 + err = blkcipher_walk_virt(desc, &walk);
  284 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  285 +
  286 + while ((nbytes = walk.nbytes)) {
  287 + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
  288 + nbytes = __cbc_decrypt(desc, &walk);
  289 + err = blkcipher_walk_done(desc, &walk, nbytes);
  290 + }
  291 +
  292 + serpent_fpu_end(fpu_enabled);
  293 + return err;
  294 +}
  295 +
/*
 * Internal synchronous CBC blkcipher (priority 0, not for direct use);
 * backend for the "cbc(serpent)" cryptd wrapper allocated in
 * ablk_cbc_init() as "__driver-cbc-serpent-sse2".
 */
static struct crypto_alg blk_cbc_alg = {
	.cra_name = "__cbc-serpent-sse2",
	.cra_driver_name = "__driver-cbc-serpent-sse2",
	.cra_priority = 0,
	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize = SERPENT_BLOCK_SIZE,
	.cra_ctxsize = sizeof(struct serpent_ctx),
	.cra_alignmask = 0,
	.cra_type = &crypto_blkcipher_type,
	.cra_module = THIS_MODULE,
	.cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize = SERPENT_MIN_KEY_SIZE,
			.max_keysize = SERPENT_MAX_KEY_SIZE,
			.setkey = serpent_setkey,
			.encrypt = cbc_encrypt,
			.decrypt = cbc_decrypt,
		},
	},
};
  317 +
  318 +static inline void u128_to_be128(be128 *dst, const u128 *src)
  319 +{
  320 + dst->a = cpu_to_be64(src->a);
  321 + dst->b = cpu_to_be64(src->b);
  322 +}
  323 +
  324 +static inline void be128_to_u128(u128 *dst, const be128 *src)
  325 +{
  326 + dst->a = be64_to_cpu(src->a);
  327 + dst->b = be64_to_cpu(src->b);
  328 +}
  329 +
  330 +static inline void u128_inc(u128 *i)
  331 +{
  332 + i->b++;
  333 + if (!i->b)
  334 + i->a++;
  335 +}
  336 +
/*
 * Encrypt the final partial CTR block (walk->nbytes < SERPENT_BLOCK_SIZE):
 * generate one keystream block with the generic C cipher and XOR it into
 * the data.  Uses no SSE2 state, so it is called after serpent_fpu_end()
 * in ctr_crypt().
 */
static void ctr_crypt_final(struct blkcipher_desc *desc,
			    struct blkcipher_walk *walk)
{
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	u8 *ctrblk = walk->iv;
	u8 keystream[SERPENT_BLOCK_SIZE];
	u8 *src = walk->src.virt.addr;
	u8 *dst = walk->dst.virt.addr;
	unsigned int nbytes = walk->nbytes;

	__serpent_encrypt(ctx, keystream, ctrblk);
	crypto_xor(keystream, src, nbytes);
	memcpy(dst, keystream, nbytes);
	/* NOTE(review): keystream bytes remain on the stack after return;
	 * consider clearing them — confirm against current kernel practice. */

	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
}
  353 +
/*
 * CTR mode over one walk chunk: encrypt counter blocks and XOR the
 * keystream into the data, eight blocks at a time where possible.
 * Only called with nbytes >= SERPENT_BLOCK_SIZE (see ctr_crypt()); any
 * trailing partial block is left for ctr_crypt_final().  Returns the
 * bytes left unprocessed.
 */
static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
				struct blkcipher_walk *walk)
{
	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	const unsigned int bsize = SERPENT_BLOCK_SIZE;
	unsigned int nbytes = walk->nbytes;
	u128 *src = (u128 *)walk->src.virt.addr;
	u128 *dst = (u128 *)walk->dst.virt.addr;
	/* Counter kept native-endian for cheap 128-bit increments. */
	u128 ctrblk;
	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
	int i;

	be128_to_u128(&ctrblk, (be128 *)walk->iv);

	/* Process multi-block batch */
	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
		do {
			/* create ctrblks for parallel encrypt */
			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
				/* Copy plaintext to dst first; the xway_xor
				 * primitive XORs the keystream in place. */
				if (dst != src)
					dst[i] = src[i];

				u128_to_be128(&ctrblocks[i], &ctrblk);
				u128_inc(&ctrblk);
			}

			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
						 (u8 *)ctrblocks);

			src += SERPENT_PARALLEL_BLOCKS;
			dst += SERPENT_PARALLEL_BLOCKS;
			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);

		if (nbytes < bsize)
			goto done;
	}

	/* Handle leftovers */
	do {
		if (dst != src)
			*dst = *src;

		u128_to_be128(&ctrblocks[0], &ctrblk);
		u128_inc(&ctrblk);

		/* Single-block fallback via the generic C cipher. */
		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
		u128_xor(dst, dst, (u128 *)ctrblocks);

		src += 1;
		dst += 1;
		nbytes -= bsize;
	} while (nbytes >= bsize);

done:
	/* Write the advanced counter back as the IV for the next chunk. */
	u128_to_be128((be128 *)walk->iv, &ctrblk);
	return nbytes;
}
  412 +
  413 +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  414 + struct scatterlist *src, unsigned int nbytes)
  415 +{
  416 + bool fpu_enabled = false;
  417 + struct blkcipher_walk walk;
  418 + int err;
  419 +
  420 + blkcipher_walk_init(&walk, dst, src, nbytes);
  421 + err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
  422 + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
  423 +
  424 + while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
  425 + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
  426 + nbytes = __ctr_crypt(desc, &walk);
  427 + err = blkcipher_walk_done(desc, &walk, nbytes);
  428 + }
  429 +
  430 + serpent_fpu_end(fpu_enabled);
  431 +
  432 + if (walk.nbytes) {
  433 + ctr_crypt_final(desc, &walk);
  434 + err = blkcipher_walk_done(desc, &walk, 0);
  435 + }
  436 +
  437 + return err;
  438 +}
  439 +
/*
 * Internal synchronous CTR blkcipher (priority 0, not for direct use);
 * backend for the "ctr(serpent)" cryptd wrapper.  cra_blocksize is 1
 * because CTR is a stream mode; ctr_crypt serves both directions.
 */
static struct crypto_alg blk_ctr_alg = {
	.cra_name = "__ctr-serpent-sse2",
	.cra_driver_name = "__driver-ctr-serpent-sse2",
	.cra_priority = 0,
	.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize = 1,
	.cra_ctxsize = sizeof(struct serpent_ctx),
	.cra_alignmask = 0,
	.cra_type = &crypto_blkcipher_type,
	.cra_module = THIS_MODULE,
	.cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
	.cra_u = {
		.blkcipher = {
			.min_keysize = SERPENT_MIN_KEY_SIZE,
			.max_keysize = SERPENT_MAX_KEY_SIZE,
			.ivsize = SERPENT_BLOCK_SIZE,
			.setkey = serpent_setkey,
			.encrypt = ctr_crypt,
			.decrypt = ctr_crypt,
		},
	},
};
  462 +
  463 +static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
  464 + unsigned int key_len)
  465 +{
  466 + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
  467 + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
  468 + int err;
  469 +
  470 + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
  471 + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
  472 + & CRYPTO_TFM_REQ_MASK);
  473 + err = crypto_ablkcipher_setkey(child, key, key_len);
  474 + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
  475 + & CRYPTO_TFM_RES_MASK);
  476 + return err;
  477 +}
  478 +
  479 +static int __ablk_encrypt(struct ablkcipher_request *req)
  480 +{
  481 + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
  482 + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
  483 + struct blkcipher_desc desc;
  484 +
  485 + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
  486 + desc.info = req->info;
  487 + desc.flags = 0;
  488 +
  489 + return crypto_blkcipher_crt(desc.tfm)->encrypt(
  490 + &desc, req->dst, req->src, req->nbytes);
  491 +}
  492 +
  493 +static int ablk_encrypt(struct ablkcipher_request *req)
  494 +{
  495 + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
  496 + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
  497 +
  498 + if (!irq_fpu_usable()) {
  499 + struct ablkcipher_request *cryptd_req =
  500 + ablkcipher_request_ctx(req);
  501 +
  502 + memcpy(cryptd_req, req, sizeof(*req));
  503 + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
  504 +
  505 + return crypto_ablkcipher_encrypt(cryptd_req);
  506 + } else {
  507 + return __ablk_encrypt(req);
  508 + }
  509 +}
  510 +
  511 +static int ablk_decrypt(struct ablkcipher_request *req)
  512 +{
  513 + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
  514 + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
  515 +
  516 + if (!irq_fpu_usable()) {
  517 + struct ablkcipher_request *cryptd_req =
  518 + ablkcipher_request_ctx(req);
  519 +
  520 + memcpy(cryptd_req, req, sizeof(*req));
  521 + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
  522 +
  523 + return crypto_ablkcipher_decrypt(cryptd_req);
  524 + } else {
  525 + struct blkcipher_desc desc;
  526 +
  527 + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
  528 + desc.info = req->info;
  529 + desc.flags = 0;
  530 +
  531 + return crypto_blkcipher_crt(desc.tfm)->decrypt(
  532 + &desc, req->dst, req->src, req->nbytes);
  533 + }
  534 +}
  535 +
/* Tfm teardown: release the cryptd handle taken in ablk_*_init(). */
static void ablk_exit(struct crypto_tfm *tfm)
{
	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);

	cryptd_free_ablkcipher(ctx->cryptd_tfm);
}
  542 +
/*
 * Common async tfm setup: stash the cryptd handle and size the request
 * context so a nested cryptd request fits inside ours.
 */
static void ablk_init_common(struct crypto_tfm *tfm,
			     struct cryptd_ablkcipher *cryptd_tfm)
{
	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);

	ctx->cryptd_tfm = cryptd_tfm;
	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
}
  552 +
/* Bind the async ECB tfm to the internal sync driver via cryptd. */
static int ablk_ecb_init(struct crypto_tfm *tfm)
{
	struct cryptd_ablkcipher *child;

	child = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ablk_init_common(tfm, child);
	return 0;
}
  563 +
/*
 * Public async "ecb(serpent)" cipher: wraps the internal sync driver
 * through cryptd (see ablk_ecb_init) so requests from contexts where the
 * FPU is unusable can be deferred to process context.
 */
static struct crypto_alg ablk_ecb_alg = {
	.cra_name = "ecb(serpent)",
	.cra_driver_name = "ecb-serpent-sse2",
	.cra_priority = 400,
	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize = SERPENT_BLOCK_SIZE,
	.cra_ctxsize = sizeof(struct async_serpent_ctx),
	.cra_alignmask = 0,
	.cra_type = &crypto_ablkcipher_type,
	.cra_module = THIS_MODULE,
	.cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
	.cra_init = ablk_ecb_init,
	.cra_exit = ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize = SERPENT_MIN_KEY_SIZE,
			.max_keysize = SERPENT_MAX_KEY_SIZE,
			.setkey = ablk_set_key,
			.encrypt = ablk_encrypt,
			.decrypt = ablk_decrypt,
		},
	},
};
  587 +
/* Bind the async CBC tfm to the internal sync driver via cryptd. */
static int ablk_cbc_init(struct crypto_tfm *tfm)
{
	struct cryptd_ablkcipher *child;

	child = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ablk_init_common(tfm, child);
	return 0;
}
  598 +
/*
 * Public async "cbc(serpent)" cipher.  .encrypt is __ablk_encrypt (the
 * direct synchronous path, no irq_fpu_usable() check): CBC encryption in
 * this driver is serial and uses only the generic C cipher (see
 * __cbc_encrypt), so it never touches FPU state and needs no cryptd
 * deferral.  Decryption is 8-way SSE2 and goes through ablk_decrypt.
 */
static struct crypto_alg ablk_cbc_alg = {
	.cra_name = "cbc(serpent)",
	.cra_driver_name = "cbc-serpent-sse2",
	.cra_priority = 400,
	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize = SERPENT_BLOCK_SIZE,
	.cra_ctxsize = sizeof(struct async_serpent_ctx),
	.cra_alignmask = 0,
	.cra_type = &crypto_ablkcipher_type,
	.cra_module = THIS_MODULE,
	.cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
	.cra_init = ablk_cbc_init,
	.cra_exit = ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize = SERPENT_MIN_KEY_SIZE,
			.max_keysize = SERPENT_MAX_KEY_SIZE,
			.ivsize = SERPENT_BLOCK_SIZE,
			.setkey = ablk_set_key,
			.encrypt = __ablk_encrypt,
			.decrypt = ablk_decrypt,
		},
	},
};
  623 +
/* Bind the async CTR tfm to the internal sync driver via cryptd. */
static int ablk_ctr_init(struct crypto_tfm *tfm)
{
	struct cryptd_ablkcipher *child;

	child = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ablk_init_common(tfm, child);
	return 0;
}
  634 +
/*
 * Public async "ctr(serpent)" cipher.  Both .encrypt and .decrypt map to
 * ablk_encrypt: CTR decryption is the same operation as encryption (the
 * internal blk_ctr_alg likewise uses ctr_crypt for both directions).
 */
static struct crypto_alg ablk_ctr_alg = {
	.cra_name = "ctr(serpent)",
	.cra_driver_name = "ctr-serpent-sse2",
	.cra_priority = 400,
	.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize = 1,
	.cra_ctxsize = sizeof(struct async_serpent_ctx),
	.cra_alignmask = 0,
	.cra_type = &crypto_ablkcipher_type,
	.cra_module = THIS_MODULE,
	.cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
	.cra_init = ablk_ctr_init,
	.cra_exit = ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize = SERPENT_MIN_KEY_SIZE,
			.max_keysize = SERPENT_MAX_KEY_SIZE,
			.ivsize = SERPENT_BLOCK_SIZE,
			.setkey = ablk_set_key,
			.encrypt = ablk_encrypt,
			.decrypt = ablk_encrypt,
			.geniv = "chainiv",
		},
	},
};
  660 +
/*
 * Module init: require SSE2, then register the three internal sync
 * blkciphers followed by their public async wrappers.  On any failure the
 * already-registered algorithms are unregistered in reverse order.
 */
static int __init serpent_sse2_init(void)
{
	int err;

	/* The 8-way assembler implementation needs SSE2 (xmm2). */
	if (!cpu_has_xmm2) {
		printk(KERN_INFO "SSE2 instructions are not detected.\n");
		return -ENODEV;
	}

	err = crypto_register_alg(&blk_ecb_alg);
	if (err)
		goto blk_ecb_err;
	err = crypto_register_alg(&blk_cbc_alg);
	if (err)
		goto blk_cbc_err;
	err = crypto_register_alg(&blk_ctr_alg);
	if (err)
		goto blk_ctr_err;
	err = crypto_register_alg(&ablk_ecb_alg);
	if (err)
		goto ablk_ecb_err;
	err = crypto_register_alg(&ablk_cbc_alg);
	if (err)
		goto ablk_cbc_err;
	err = crypto_register_alg(&ablk_ctr_alg);
	if (err)
		goto ablk_ctr_err;
	return err;

ablk_ctr_err:
	crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
	crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
	crypto_unregister_alg(&blk_ctr_alg);
blk_ctr_err:
	crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
	crypto_unregister_alg(&blk_ecb_alg);
blk_ecb_err:
	return err;
}
  703 +
/* Module exit: unregister all algorithms in reverse registration order. */
static void __exit serpent_sse2_exit(void)
{
	crypto_unregister_alg(&ablk_ctr_alg);
	crypto_unregister_alg(&ablk_cbc_alg);
	crypto_unregister_alg(&ablk_ecb_alg);
	crypto_unregister_alg(&blk_ctr_alg);
	crypto_unregister_alg(&blk_cbc_alg);
	crypto_unregister_alg(&blk_ecb_alg);
}
  713 +
module_init(serpent_sse2_init);
module_exit(serpent_sse2_exit);

MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
MODULE_LICENSE("GPL");
/* Allow autoloading when "serpent" is requested. */
MODULE_ALIAS("serpent");
arch/x86/include/asm/serpent.h
#ifndef ASM_X86_SERPENT_H
#define ASM_X86_SERPENT_H

#include <linux/crypto.h>
#include <crypto/serpent.h>

/* Number of blocks processed per call by the 8-way SSE2 implementation. */
#define SERPENT_PARALLEL_BLOCKS 8

/*
 * 8-way assembler primitives.  For the encrypt entry, 'xor' selects
 * XOR-into-destination mode (dst ^= E(src)), used by the CTR glue code.
 */
asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
				       const u8 *src, bool xor);
asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
				     const u8 *src);

/* Encrypt SERPENT_PARALLEL_BLOCKS blocks: dst = E(src). */
static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
					const u8 *src)
{
	__serpent_enc_blk_8way(ctx, dst, src, false);
}

/* Encrypt and XOR into destination: dst ^= E(src) (CTR keystream path). */
static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
					    const u8 *src)
{
	__serpent_enc_blk_8way(ctx, dst, src, true);
}

/* Decrypt SERPENT_PARALLEL_BLOCKS blocks: dst = D(src). */
static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
					const u8 *src)
{
	serpent_dec_blk_8way(ctx, dst, src);
}

#endif
... ... @@ -766,6 +766,23 @@
766 766 See also:
767 767 <http://www.cl.cam.ac.uk/~rja14/serpent.html>
768 768  
  769 +config CRYPTO_SERPENT_SSE2_X86_64
  770 + tristate "Serpent cipher algorithm (x86_64/SSE2)"
  771 + depends on X86 && 64BIT
  772 + select CRYPTO_ALGAPI
  773 + select CRYPTO_SERPENT
  774 + help
  775 + Serpent cipher algorithm, by Anderson, Biham & Knudsen.
  776 +
  777 + Keys are allowed to be from 0 to 256 bits in length, in steps
  778 + of 8 bits.
  779 +
  780 + This module provides the Serpent cipher algorithm that processes
  781 + eight blocks in parallel using the SSE2 instruction set.
  782 +
  783 + See also:
  784 + <http://www.cl.cam.ac.uk/~rja14/serpent.html>
  785 +
769 786 config CRYPTO_TEA
770 787 tristate "TEA, XTEA and XETA cipher algorithms"
771 788 select CRYPTO_ALGAPI
... ... @@ -1534,6 +1534,21 @@
1534 1534 /* Please keep this list sorted by algorithm name. */
1535 1535 static const struct alg_test_desc alg_test_descs[] = {
1536 1536 {
  1537 + .alg = "__cbc-serpent-sse2",
  1538 + .test = alg_test_null,
  1539 + .suite = {
  1540 + .cipher = {
  1541 + .enc = {
  1542 + .vecs = NULL,
  1543 + .count = 0
  1544 + },
  1545 + .dec = {
  1546 + .vecs = NULL,
  1547 + .count = 0
  1548 + }
  1549 + }
  1550 + }
  1551 + }, {
1537 1552 .alg = "__driver-cbc-aes-aesni",
1538 1553 .test = alg_test_null,
1539 1554 .suite = {
... ... @@ -1549,6 +1564,21 @@
1549 1564 }
1550 1565 }
1551 1566 }, {
  1567 + .alg = "__driver-cbc-serpent-sse2",
  1568 + .test = alg_test_null,
  1569 + .suite = {
  1570 + .cipher = {
  1571 + .enc = {
  1572 + .vecs = NULL,
  1573 + .count = 0
  1574 + },
  1575 + .dec = {
  1576 + .vecs = NULL,
  1577 + .count = 0
  1578 + }
  1579 + }
  1580 + }
  1581 + }, {
1552 1582 .alg = "__driver-ecb-aes-aesni",
1553 1583 .test = alg_test_null,
1554 1584 .suite = {
... ... @@ -1564,6 +1594,21 @@
1564 1594 }
1565 1595 }
1566 1596 }, {
  1597 + .alg = "__driver-ecb-serpent-sse2",
  1598 + .test = alg_test_null,
  1599 + .suite = {
  1600 + .cipher = {
  1601 + .enc = {
  1602 + .vecs = NULL,
  1603 + .count = 0
  1604 + },
  1605 + .dec = {
  1606 + .vecs = NULL,
  1607 + .count = 0
  1608 + }
  1609 + }
  1610 + }
  1611 + }, {
1567 1612 .alg = "__ghash-pclmulqdqni",
1568 1613 .test = alg_test_null,
1569 1614 .suite = {
... ... @@ -1732,6 +1777,21 @@
1732 1777 }
1733 1778 }, {
1734 1779 .alg = "cryptd(__driver-ecb-aes-aesni)",
  1780 + .test = alg_test_null,
  1781 + .suite = {
  1782 + .cipher = {
  1783 + .enc = {
  1784 + .vecs = NULL,
  1785 + .count = 0
  1786 + },
  1787 + .dec = {
  1788 + .vecs = NULL,
  1789 + .count = 0
  1790 + }
  1791 + }
  1792 + }
  1793 + }, {
  1794 + .alg = "cryptd(__driver-ecb-serpent-sse2)",
1735 1795 .test = alg_test_null,
1736 1796 .suite = {
1737 1797 .cipher = {