// SPDX-License-Identifier: GPL-2.0
/*
 * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text

	// arguments
	ROUND_KEYS	.req	x0	// const {u64,u32} *round_keys
	NROUNDS		.req	w1	// int nrounds
	NROUNDS_X	.req	x1
	DST		.req	x2	// void *dst
	SRC		.req	x3	// const void *src
	NBYTES		.req	w4	// unsigned int nbytes
	TWEAK		.req	x5	// void *tweak

	// registers which hold the data being encrypted/decrypted
	// (underscores avoid a naming collision with ARM64 registers x0-x3)
	X_0		.req	v0
	Y_0		.req	v1
	X_1		.req	v2
	Y_1		.req	v3
	X_2		.req	v4
	Y_2		.req	v5
	X_3		.req	v6
	Y_3		.req	v7

	// the round key, duplicated in all lanes
	ROUND_KEY	.req	v8

	// index vector for tbl-based 8-bit rotates
	ROTATE_TABLE	.req	v9
	ROTATE_TABLE_Q	.req	q9

	// temporary registers
	TMP0		.req	v10
	TMP1		.req	v11
	TMP2		.req	v12
	TMP3		.req	v13

	// multiplication table for updating XTS tweaks
	GFMUL_TABLE	.req	v14
	GFMUL_TABLE_Q	.req	q14

	// next XTS tweak value(s)
	TWEAKV_NEXT	.req	v15

	// XTS tweaks for the blocks currently being encrypted/decrypted
	TWEAKV0		.req	v16
	TWEAKV1		.req	v17
	TWEAKV2		.req	v18
	TWEAKV3		.req	v19
	TWEAKV4		.req	v20
	TWEAKV5		.req	v21
	TWEAKV6		.req	v22
	TWEAKV7		.req	v23

	.align		4
.Lror64_8_table:
	.octa		0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
	.octa		0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
	.octa		0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
	.octa		0x0e0d0c0f0a09080b0605040702010003
.Lgf128mul_table:
	.octa		0x00000000000000870000000000000001
.Lgf64mul_table:
	.octa		0x0000000000000000000000002d361b00
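
/*
 * For reference: the four rotate tables above are tbl index vectors; within
 * each n/8-byte lane, output byte i selects input byte (i+1) mod n/8 for a
 * right-rotate by 8 bits, or (i-1) mod n/8 for a left-rotate by 8 bits.  The
 * GF multiplication tables hold the reduction constants XORed in when high
 * bits are shifted out: 0x87 for p(x) = x^128 + x^7 + x^2 + x + 1, and
 * {0x00, 0x1b, 0x36, 0x2d} (indexed by the two carry bits) for
 * p(x) = x^64 + x^4 + x^3 + x + 1.
 */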

/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X_0-X_3 and Y_0-Y_3, using the round key stored in all
 * lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
 */
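/*
 * For a single (x, y) word pair this is, in C terms (a sketch, with ror() and
 * rol() as n-bit rotates):
 *
 *	x = ror(x, 8);
 *	x += y;
 *	x ^= k;
 *	y = rol(y, 3);
 *	y ^= x;
 */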
.macro _speck_round_128bytes	n, lanes

	// x = ror(x, 8)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b

	// x += y
	add		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	add		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	add		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	add		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// y = rol(y, 3)
	shl		TMP0.\lanes, Y_0.\lanes, #3
	shl		TMP1.\lanes, Y_1.\lanes, #3
	shl		TMP2.\lanes, Y_2.\lanes, #3
	shl		TMP3.\lanes, Y_3.\lanes, #3
	sri		TMP0.\lanes, Y_0.\lanes, #(\n - 3)
	sri		TMP1.\lanes, Y_1.\lanes, #(\n - 3)
	sri		TMP2.\lanes, Y_2.\lanes, #(\n - 3)
	sri		TMP3.\lanes, Y_3.\lanes, #(\n - 3)

	// y ^= x
	eor		Y_0.16b, TMP0.16b, X_0.16b
	eor		Y_1.16b, TMP1.16b, X_1.16b
	eor		Y_2.16b, TMP2.16b, X_2.16b
	eor		Y_3.16b, TMP3.16b, X_3.16b
.endm

/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes().
 */
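/*
 * Per word pair, it undoes the encryption round in reverse (a sketch):
 *
 *	y ^= x;
 *	y = ror(y, 3);
 *	x ^= k;
 *	x -= y;
 *	x = rol(x, 8);
 */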
.macro _speck_unround_128bytes	n, lanes

	// y ^= x
	eor		TMP0.16b, Y_0.16b, X_0.16b
	eor		TMP1.16b, Y_1.16b, X_1.16b
	eor		TMP2.16b, Y_2.16b, X_2.16b
	eor		TMP3.16b, Y_3.16b, X_3.16b

	// y = ror(y, 3)
	ushr		Y_0.\lanes, TMP0.\lanes, #3
	ushr		Y_1.\lanes, TMP1.\lanes, #3
	ushr		Y_2.\lanes, TMP2.\lanes, #3
	ushr		Y_3.\lanes, TMP3.\lanes, #3
	sli		Y_0.\lanes, TMP0.\lanes, #(\n - 3)
	sli		Y_1.\lanes, TMP1.\lanes, #(\n - 3)
	sli		Y_2.\lanes, TMP2.\lanes, #(\n - 3)
	sli		Y_3.\lanes, TMP3.\lanes, #(\n - 3)

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// x -= y
	sub		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	sub		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	sub		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	sub		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x = rol(x, 8)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm

.macro _next_xts_tweak	next, cur, tmp, n
.if \n == 64
	/*
	 * Calculate the next tweak by multiplying the current one by x,
	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
	 */
	sshr		\tmp\().2d, \cur\().2d, #63
	and		\tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
	shl		\next\().2d, \cur\().2d, #1
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\next\().16b, \next\().16b, \tmp\().16b
.else
	/*
	 * Calculate the next two tweaks by multiplying the current ones by x^2,
	 * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
	 */
	ushr		\tmp\().2d, \cur\().2d, #62
	shl		\next\().2d, \cur\().2d, #2
	tbl		\tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
	eor		\next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
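
/*
 * In C terms, the n == 64 case above computes (a sketch; the 128-bit tweak as
 * two u64 halves lo/hi, little-endian):
 *
 *	next_lo = (lo << 1) ^ ((hi >> 63) ? 0x87 : 0);
 *	next_hi = (hi << 1) | (lo >> 63);
 *
 * The n == 32 case doubles twice in one step, using the byte table
 * {0x00, 0x1b, 0x36, 0x2d} to reduce the two bits shifted out of each packed
 * 64-bit tweak.
 */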

/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 * using Speck-XTS, specifically the variant with a block size of 2n bits and round
 * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 * nonzero multiple of 128.
 */
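/*
 * For reference, each 2n-bit block is handled as in standard XTS (a sketch,
 * with E the Speck encryption or decryption function and T the running
 * tweak):
 *
 *	C = T ^ E(T ^ P);
 *	T = T * x;	// in GF(2^2n), for the next block
 *
 * The loop below processes 128 bytes of blocks per iteration, so it keeps
 * eight tweak vectors (TWEAKV0-TWEAKV7) in flight.
 */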
.macro _speck_xts_crypt	n, lanes, decrypting

	/*
	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
	 * round key rather than the first, since for decryption the round keys
	 * are used in reverse order.
	 */
.if \decrypting
	mov		NROUNDS, NROUNDS	/* zero the high 32 bits of NROUNDS_X */
.if \n == 64
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
	sub		ROUND_KEYS, ROUND_KEYS, #8
.else
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
	sub		ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif
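
	/*
	 * E.g. Speck128/256 uses 34 rounds with 8-byte round keys, so for
	 * decryption ROUND_KEYS now points to &round_keys[33], the last one.
	 */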

	// Load the index vector for tbl-based 8-bit rotates
.if \decrypting
	ldr		ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
	ldr		ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif

	// One-time XTS preparation
.if \n == 64
	// Load first tweak
	ld1		{TWEAKV0.16b}, [TWEAK]

	// Load GF(2^128) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf128mul_table
.else
	// Load first tweak
	ld1		{TWEAKV0.8b}, [TWEAK]

	// Load GF(2^64) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf64mul_table

	// Calculate second tweak, packing it together with the first
	ushr		TMP0.2d, TWEAKV0.2d, #63
	shl		TMP1.2d, TWEAKV0.2d, #1
	tbl		TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
	eor		TMP0.8b, TMP0.8b, TMP1.8b
	mov		TWEAKV0.d[1], TMP0.d[0]
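
	// TWEAKV0 now holds two consecutive tweaks: d[0] = T, d[1] = T*x.
	// Each _next_xts_tweak below therefore advances both lanes by two
	// blocks, which is why it multiplies by x^2 rather than x.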
.endif

.Lnext_128bytes_\@:

	// Calculate XTS tweaks for next 128 bytes
	_next_xts_tweak	TWEAKV1, TWEAKV0, TMP0, \n
	_next_xts_tweak	TWEAKV2, TWEAKV1, TMP0, \n
	_next_xts_tweak	TWEAKV3, TWEAKV2, TMP0, \n
	_next_xts_tweak	TWEAKV4, TWEAKV3, TMP0, \n
	_next_xts_tweak	TWEAKV5, TWEAKV4, TMP0, \n
	_next_xts_tweak	TWEAKV6, TWEAKV5, TMP0, \n
	_next_xts_tweak	TWEAKV7, TWEAKV6, TMP0, \n
	_next_xts_tweak	TWEAKV_NEXT, TWEAKV7, TMP0, \n

	// Load the next source blocks into {X,Y}[0-3]
	ld1		{X_0.16b-Y_1.16b}, [SRC], #64
	ld1		{X_2.16b-Y_3.16b}, [SRC], #64

	// XOR the source blocks with their XTS tweaks
	eor		TMP0.16b, X_0.16b, TWEAKV0.16b
	eor		Y_0.16b,  Y_0.16b, TWEAKV1.16b
	eor		TMP1.16b, X_1.16b, TWEAKV2.16b
	eor		Y_1.16b,  Y_1.16b, TWEAKV3.16b
	eor		TMP2.16b, X_2.16b, TWEAKV4.16b
	eor		Y_2.16b,  Y_2.16b, TWEAKV5.16b
	eor		TMP3.16b, X_3.16b, TWEAKV6.16b
	eor		Y_3.16b,  Y_3.16b, TWEAKV7.16b

	/*
	 * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
	 * that the X_0-X_3 registers contain only the second halves of blocks,
	 * and the Y_0-Y_3 registers contain only the first halves of blocks.
	 * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
	 */
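	/*
	 * E.g. with lanes = "2d": TMP0 = (y0, x0) and Y_0 = (y1, x1) become
	 * X_0 = (x0, x1) and Y_0 = (y0, y1), since uzp2 gathers the odd
	 * (second) elements and uzp1 the even (first) ones.
	 */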
	uzp2		X_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp1		Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp2		X_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp1		Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp2		X_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp1		Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp2		X_3.\lanes, TMP3.\lanes, Y_3.\lanes
	uzp1		Y_3.\lanes, TMP3.\lanes, Y_3.\lanes

	// Do the cipher rounds
	mov		x6, ROUND_KEYS		// x6 = current round key pointer
	mov		w7, NROUNDS		// w7 = rounds remaining
.Lnext_round_\@:
.if \decrypting
	ld1r		{ROUND_KEY.\lanes}, [x6]
	sub		x6, x6, #( \n / 8 )
	_speck_unround_128bytes	\n, \lanes
.else
	ld1r		{ROUND_KEY.\lanes}, [x6], #( \n / 8 )
	_speck_round_128bytes	\n, \lanes
.endif
	subs		w7, w7, #1
	bne		.Lnext_round_\@

	// Re-interleave the 'x' and 'y' elements of each block
	zip1		TMP0.\lanes, Y_0.\lanes, X_0.\lanes
	zip2		Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
	zip1		TMP1.\lanes, Y_1.\lanes, X_1.\lanes
	zip2		Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
	zip1		TMP2.\lanes, Y_2.\lanes, X_2.\lanes
	zip2		Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
	zip1		TMP3.\lanes, Y_3.\lanes, X_3.\lanes
	zip2		Y_3.\lanes,  Y_3.\lanes, X_3.\lanes

	// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
	eor		X_0.16b, TMP0.16b, TWEAKV0.16b
	eor		Y_0.16b, Y_0.16b,  TWEAKV1.16b
	eor		X_1.16b, TMP1.16b, TWEAKV2.16b
	eor		Y_1.16b, Y_1.16b,  TWEAKV3.16b
	eor		X_2.16b, TMP2.16b, TWEAKV4.16b
	eor		Y_2.16b, Y_2.16b,  TWEAKV5.16b
	eor		X_3.16b, TMP3.16b, TWEAKV6.16b
	eor		Y_3.16b, Y_3.16b,  TWEAKV7.16b
	mov		TWEAKV0.16b, TWEAKV_NEXT.16b

	// Store the encrypted/decrypted blocks in the destination buffer
	st1		{X_0.16b-Y_1.16b}, [DST], #64
	st1		{X_2.16b-Y_3.16b}, [DST], #64

	// Continue if there are more 128-byte chunks remaining
	subs		NBYTES, NBYTES, #128
	bne		.Lnext_128bytes_\@

	// Store the next tweak and return
.if \n == 64
	st1		{TWEAKV_NEXT.16b}, [TWEAK]
.else
	st1		{TWEAKV_NEXT.8b}, [TWEAK]
.endif
	ret
.endm

ENTRY(speck128_xts_encrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)
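
/*
 * For reference, the C glue code declares these entry points along the lines
 * of (a sketch inferred from the argument registers documented above; the
 * exact prototypes live in the glue code):
 *
 *	asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys,
 *						  int nrounds, void *dst,
 *						  const void *src,
 *						  unsigned int nbytes,
 *						  void *tweak);
 */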