crypto/aegis128-neon-inner.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 */

#ifdef CONFIG_ARM64
#include <asm/neon-intrinsics.h>

#define AES_ROUND	"aese %0.16b, %1.16b \n aesmc %0.16b, %0.16b"
#else
#include <arm_neon.h>

#define AES_ROUND	"aese.8 %q0, %q1 \n aesmc.8 %q0, %q0"
#endif

#define AEGIS_BLOCK_SIZE	16

#include <stddef.h>
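/*
 * Nonzero when the CPU provides the ARMv8 AES instructions (the flag is an
 * extern set up outside this file); when it is zero on arm64, the
 * table-based NEON fallback in aegis_aes_round() below is used instead.
 */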
extern int aegis128_have_aes_insn; |
void *memcpy(void *dest, const void *src, size_t n);
void *memset(void *s, int c, size_t n);

struct aegis128_state {
	uint8x16_t v[5];
};
extern const uint8_t crypto_aes_sbox[]; |
|
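/*
 * The 80-byte AEGIS-128 state is kept in five 128-bit NEON vectors while a
 * chunk is processed and written back to memory afterwards.
 */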
static struct aegis128_state aegis128_load_state_neon(const void *state)
{
	return (struct aegis128_state){ {
		vld1q_u8(state),
		vld1q_u8(state + 16),
		vld1q_u8(state + 32),
		vld1q_u8(state + 48),
		vld1q_u8(state + 64)
	} };
}

static void aegis128_save_state_neon(struct aegis128_state st, void *state)
{
	vst1q_u8(state, st.v[0]);
	vst1q_u8(state + 16, st.v[1]);
	vst1q_u8(state + 32, st.v[2]);
	vst1q_u8(state + 48, st.v[3]);
	vst1q_u8(state + 64, st.v[4]);
}

static inline __attribute__((always_inline))
uint8x16_t aegis_aes_round(uint8x16_t w)
{
	uint8x16_t z = {};
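	/*
	 * Table-based fallback for a single AES round (ShiftRows, SubBytes,
	 * MixColumns; the round-key XOR is applied by the caller), taken on
	 * arm64 when the CPU lacks the AES instructions.
	 */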
#ifdef CONFIG_ARM64
	if (!__builtin_expect(aegis128_have_aes_insn, 1)) {
		static const uint8_t shift_rows[] = {
			0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
			0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
		};
		static const uint8_t ror32by8[] = {
			0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
			0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
		};
		uint8x16_t v;

		// shift rows
		w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
		// sub bytes
#ifndef CONFIG_CC_IS_GCC
		v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
		v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
#else
		asm("tbl %0.16b, {v16.16b-v19.16b}, %1.16b" : "=w"(v) : "w"(w));
		w -= 0x40;
		asm("tbx %0.16b, {v20.16b-v23.16b}, %1.16b" : "+w"(v) : "w"(w));
		w -= 0x40;
		asm("tbx %0.16b, {v24.16b-v27.16b}, %1.16b" : "+w"(v) : "w"(w));
		w -= 0x40;
		asm("tbx %0.16b, {v28.16b-v31.16b}, %1.16b" : "+w"(v) : "w"(w));
#endif
		// mix columns
		w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
		w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
		w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
		return w;
	}
#endif
	/*
	 * We use inline asm here instead of the vaeseq_u8/vaesmcq_u8 intrinsics
	 * to force the compiler to issue the aese/aesmc instructions in pairs.
	 * This is much faster on many cores, where the instruction pair can
	 * execute in a single cycle.
	 */
	asm(AES_ROUND : "+w"(w) : "w"(z));
	return w;
}

/*
 * One AEGIS-128 state update: each state word absorbs one AES round of the
 * preceding word (cyclically), and the message block m is additionally
 * XORed into state word 0.
 */
static inline __attribute__((always_inline))
struct aegis128_state aegis128_update_neon(struct aegis128_state st,
					   uint8x16_t m)
{
	m       ^= aegis_aes_round(st.v[4]);
	st.v[4] ^= aegis_aes_round(st.v[3]);
	st.v[3] ^= aegis_aes_round(st.v[2]);
	st.v[2] ^= aegis_aes_round(st.v[1]);
	st.v[1] ^= aegis_aes_round(st.v[0]);
	st.v[0] ^= m;

	return st;
}
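/*
 * For the GCC build of the arm64 fallback, the 256-byte AES S-box is
 * preloaded into v16-v31 at the start of each exported helper so that the
 * tbl/tbx sequence in aegis_aes_round() can reference those registers
 * directly.
 */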
static inline __attribute__((always_inline))
void preload_sbox(void)
{
	if (!IS_ENABLED(CONFIG_ARM64) || !IS_ENABLED(CONFIG_CC_IS_GCC) ||
	    __builtin_expect(aegis128_have_aes_insn, 1))
		return;

	asm("ld1	{v16.16b-v19.16b}, [%0], #64	\n"
	    "ld1	{v20.16b-v23.16b}, [%0], #64	\n"
	    "ld1	{v24.16b-v27.16b}, [%0], #64	\n"
	    "ld1	{v28.16b-v31.16b}, [%0]		\n"
	    :: "r"(crypto_aes_sbox));
}
void crypto_aegis128_update_neon(void *state, const void *msg)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	preload_sbox();
	st = aegis128_update_neon(st, vld1q_u8(msg));

	aegis128_save_state_neon(st, state);
}

void crypto_aegis128_encrypt_chunk_neon(void *state, void *dst, const void *src,
					unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	uint8x16_t msg;
	preload_sbox();
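	/*
	 * For each full block: derive the keystream word
	 * z = S1 ^ (S2 & S3) ^ S4, absorb the plaintext block into the
	 * state, and emit ciphertext = plaintext ^ z. A trailing partial
	 * block is handled via a zero-padded bounce buffer.
	 */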
	while (size >= AEGIS_BLOCK_SIZE) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];

		msg = vld1q_u8(src);
		st = aegis128_update_neon(st, msg);
		vst1q_u8(dst, msg ^ s);

		size -= AEGIS_BLOCK_SIZE;
		src += AEGIS_BLOCK_SIZE;
		dst += AEGIS_BLOCK_SIZE;
	}

	if (size > 0) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		uint8_t buf[AEGIS_BLOCK_SIZE] = {};

		memcpy(buf, src, size);
		msg = vld1q_u8(buf);
		st = aegis128_update_neon(st, msg);
		vst1q_u8(buf, msg ^ s);
		memcpy(dst, buf, size);
	}

	aegis128_save_state_neon(st, state);
}

void crypto_aegis128_decrypt_chunk_neon(void *state, void *dst, const void *src,
					unsigned int size)
{
	struct aegis128_state st = aegis128_load_state_neon(state);
	uint8x16_t msg;
	preload_sbox();
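	/*
	 * Decryption mirrors encryption: recover plaintext = ciphertext ^ z
	 * first, then absorb the recovered plaintext into the state. For a
	 * trailing partial block the bounce buffer is prefilled with the
	 * keystream, so the padding bytes absorb as zeroes.
	 */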
	while (size >= AEGIS_BLOCK_SIZE) {
		msg = vld1q_u8(src) ^ st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		st = aegis128_update_neon(st, msg);
		vst1q_u8(dst, msg);

		size -= AEGIS_BLOCK_SIZE;
		src += AEGIS_BLOCK_SIZE;
		dst += AEGIS_BLOCK_SIZE;
	}

	if (size > 0) {
		uint8x16_t s = st.v[1] ^ (st.v[2] & st.v[3]) ^ st.v[4];
		uint8_t buf[AEGIS_BLOCK_SIZE];

		vst1q_u8(buf, s);
		memcpy(buf, src, size);
		msg = vld1q_u8(buf) ^ s;
		vst1q_u8(buf, msg);
		memcpy(dst, buf, size);

		st = aegis128_update_neon(st, msg);
	}

	aegis128_save_state_neon(st, state);
}