/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
10
11#include <linux/linkage.h>
12#include <asm/assembler.h>
13
14 .text
15 .fpu crypto-neon-fp-armv8
16 .align 3
17
18 .macro enc_round, state, key
19 aese.8 \state, \key
20 aesmc.8 \state, \state
21 .endm
22
23 .macro dec_round, state, key
24 aesd.8 \state, \key
25 aesimc.8 \state, \state
26 .endm
27
28 .macro enc_dround, key1, key2
29 enc_round q0, \key1
30 enc_round q0, \key2
31 .endm
32
33 .macro dec_dround, key1, key2
34 dec_round q0, \key1
35 dec_round q0, \key2
36 .endm
37
	@ Final two encryption rounds for the block in q0: one full round
	@ with \key1, then the last round (no MixColumns, so a bare aese)
	@ with \key2, finished by XORing in the final round key \key3.
	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm
43
	@ Final two decryption rounds for the block in q0: one full round
	@ with \key1, then the last round (no InvMixColumns, so a bare aesd)
	@ with \key2, finished by XORing in the final round key \key3.
	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
49
	@ Double encryption round applied to three blocks (q0-q2) in
	@ parallel; interleaving the aese/aesmc pairs keeps the crypto
	@ unit busy across independent blocks.
	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm
58
	@ Double decryption round applied to three blocks (q0-q2) in
	@ parallel; mirror image of enc_dround_3x.
	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm
67
	@ Final two encryption rounds for three blocks (q0-q2): full round
	@ with \key1, last round (bare aese, no MixColumns) with \key2,
	@ then XOR in the final round key \key3.
	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm
79
	@ Final two decryption rounds for three blocks (q0-q2): full round
	@ with \key1, last round (bare aesd, no InvMixColumns) with \key2,
	@ then XOR in the final round key \key3.
	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm
91
	@ do_block - run the complete round sequence for 10/12/14 rounds.
	@ \dround/\fround expand to the 1-block or 3-block round macros.
	@ On entry: q8/q9 hold round keys 0 and 1, ip points at round key 2,
	@ q14 holds the final round key and r3 the round count.  Further
	@ round keys are streamed two at a time into q10-q13, each load
	@ interleaved with the rounds consuming the previous pair.
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!	@ round keys 2,3
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!	@ round keys 4,5
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!	@ round keys 6,7
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!	@ round keys 8,9
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!	@ round keys 10,11
	beq		1f			@ AES-192: 12 rounds
	\dround		q12, q13		@ AES-256: 14 rounds
	vld1.8		{q12-q13}, [ip]		@ round keys 12,13
	\dround		q10, q11
0:	\fround		q12, q13, q14		@ final rounds + last key (q14)
	bx		lr

1:	\dround		q12, q13		@ AES-192 tail
	\fround		q10, q11, q14
	bx		lr
	.endm
115
	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   ip        : address of 3rd round key
	 *   q14       : final round key
	 *   r3        : number of rounds
	 */
	@ Encrypt the block in q0 with the key schedule at r2 (r3 rounds).
	@ Internal helper; register contract is described in the comment
	@ block above.  .align 6 puts the entry on a cache-line boundary.
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:				@ entry used by XTS init with ip preset
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)
135
	@ Decrypt the block in q0 with the (inverse) key schedule at r2
	@ (r3 rounds).  Internal helper; see register contract above.
	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)
141
	@ Encrypt three blocks (q0-q2) in parallel with the key schedule
	@ at r2 (r3 rounds).  Internal helper; see register contract above.
	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)
147
	@ Decrypt three blocks (q0-q2) in parallel with the (inverse) key
	@ schedule at r2 (r3 rounds).  Internal helper; see contract above.
	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)
153
	@ prepare_key - preload the round keys the aes_* helpers expect in
	@ fixed registers: q8/q9 = first two round keys, q14 = last round
	@ key (located at \rk + \rounds * 16).  Clobbers ip.
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm
159
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg, on stack)
	prepare_key	r2, r3
.Lecbencloop3x:					@ 3 blocks per iteration
	subs		r4, r4, #3
	bmi		.Lecbenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3		@ undo bias: 0..2 blocks remain
	beq		.Lecbencout
.Lecbencloop:					@ one block at a time
	vld1.8		{q0}, [r1, :64]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
191
ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (5th arg, on stack)
	prepare_key	r2, r3
.Lecbdecloop3x:					@ 3 blocks per iteration
	subs		r4, r4, #3
	bmi		.Lecbdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3		@ undo bias: 0..2 blocks remain
	beq		.Lecbdecout
.Lecbdecloop:					@ one block at a time
	vld1.8		{q0}, [r1, :64]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
217
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv pointer
	vld1.8		{q0}, [r5]		@ q0 = IV / previous ciphertext
	prepare_key	r2, r3
.Lcbcencloop:					@ CBC encryption is inherently serial
	vld1.8		{q1}, [r1, :64]!	@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt		@ q0 becomes the next ct block
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ write back final IV
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
239
ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv pointer
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
.Lcbcdecloop3x:					@ 3 blocks per iteration
	subs		r4, r4, #3
	bmi		.Lcbcdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	vmov		q3, q0			@ stash the ciphertext blocks:
	vmov		q4, q1			@ each is the IV for the next
	vmov		q5, q2			@ block's final XOR
	bl		aes_decrypt_3x
	veor		q0, q0, q6		@ XOR with prev IV / prev ct
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5			@ last ct becomes the new IV
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3		@ undo bias: 0..2 blocks remain
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	@ Fold the CBC XOR of the previous ciphertext into the final round
	@ key (the last step of dec_fround is 'veor q0, q0, q14'), saving a
	@ separate XOR per block.
	vld1.8		{q0}, [r1, :64]!	@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0			@ current ct is the next IV
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ write back final IV (kept in q6)
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
277
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr pointer
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	@ s27 is the last 32-bit lane of q6, i.e. the big-endian low word
	@ of the counter; keep a byte-swapped (CPU-order) copy in r6 so it
	@ can be incremented with ordinary arithmetic.
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ slow path handles the carry
.Lctrloop3x:					@ fast path: low word never wraps
	subs		r4, r4, #3
	bmi		.Lctr1x			@ fewer than 3 blocks left
	add		r6, r6, #1
	vmov		q0, q6			@ q0 = ctr
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip			@ q1 = ctr + 1 (patch low lane)
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip			@ q2 = ctr + 2 (patch low lane)
	vld1.8		{q3-q4}, [r1, :64]!
	vld1.8		{q5}, [r1, :64]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ XOR keystream with plaintext
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	vmov		s27, ip			@ q6 = ctr + 3 for next round
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3		@ undo bias: 0..2 blocks remain
	beq		.Lctrout
.Lctrloop:					@ one block at a time, with carry
	vmov		q0, q6
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
	vld1.8		{q3}, [r1, :64]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0, :64]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry		@ low word wrapped: propagate
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]		@ write back updated counter
	pop		{r4-r6, pc}

.Lctrhalfblock:					@ final partial (8-byte) block:
	vld1.8		{d1}, [r1, :64]		@ use only d0/d1 of the keystream
	veor		d0, d0, d1
	vst1.8		{d0}, [r0, :64]
	pop		{r4-r6, pc}

.Lctrcarry:
	@ Ripple the carry into the higher counter words (s26, s25, s24 =
	@ progressively more significant 32-bit lanes of the BE counter),
	@ stopping as soon as an increment does not wrap.
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)
356
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
	 */
363
	@ next_tweak - compute \out = \in * x in GF(2^128), i.e. advance the
	@ XTS tweak by one block.  \const holds {1, 0x87} (.Lxts_mul_x):
	@ vshr.s64 turns each half's sign bit into an all-ones carry mask,
	@ vadd.u64 doubles each 64-bit half, and the swapped (vext #8)
	@ masked constant feeds the cross-half carry / reduction XOR.
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
371
	@ GF(2^128) doubling constant used by next_tweak: low half 1 (the
	@ per-half carry bit), high half 0x87 (the reduction polynomial).
	.align		3
.Lxts_mul_x:
	.quad		1, 0x87

	@ Shared XTS prologue, called after push {r4-r6, lr} (so the stack
	@ args sit 16 bytes higher than at function entry): loads the
	@ doubling constant into q7, r4 = blocks, r5 = iv, r6 = 'first'.
	@ On the first call ('first' == 1) it also encrypts the IV with
	@ key 2 by tail-calling into aes_encrypt (ip preset to key 2's 3rd
	@ round key), leaving the initial tweak in q0.
ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x	@ q7 = {1, 0x87}
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args: blocks, iv
	ldr		r6, [sp, #28]		@ 'first' flag
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
393
ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3			@ switch to key 1 for the data
	vmov		q3, q0			@ q3 = current tweak

	@ r6 is nonzero when the prologue just derived a fresh tweak from
	@ the IV; in that case use it as-is, otherwise advance it first.
	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:					@ 3 blocks per iteration
	subs		r4, r4, #3
	bmi		.Lxtsenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6		@ q3/q4/q5 = consecutive tweaks
	veor		q0, q0, q3		@ pre-whiten with the tweaks
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ post-whiten with the tweaks
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3		@ undo bias: 0..2 blocks remain
	beq		.Lxtsencout
.Lxtsencloop:					@ one block at a time
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3		@ pre-whiten
	bl		aes_encrypt
	veor		q0, q0, q3		@ post-whiten
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]		@ write back tweak for next call
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)
443
444
ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3			@ switch to key 1 for the data
	vmov		q3, q0			@ q3 = current tweak

	@ r6 is nonzero when the prologue just derived a fresh tweak from
	@ the IV; in that case use it as-is, otherwise advance it first.
	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:					@ 3 blocks per iteration
	subs		r4, r4, #3
	bmi		.Lxtsdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6		@ q3/q4/q5 = consecutive tweaks
	veor		q0, q0, q3		@ pre-whiten with the tweaks
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3		@ post-whiten with the tweaks
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5			@ carry last tweak forward
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3		@ undo bias: 0..2 blocks remain
	beq		.Lxtsdecout
.Lxtsdecloop:					@ one block at a time
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3		@ pre-whiten
	add		ip, r2, #32		@ 3rd round key (NB: aes_decrypt
						@ reloads ip itself, so this
						@ looks redundant but harmless)
	bl		aes_decrypt
	veor		q0, q0, q3		@ post-whiten
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]		@ write back tweak for next call
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)
495
	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 */
ENTRY(ce_aes_sub)
	vdup.32		q1, r0			@ broadcast input word to q1
	veor		q0, q0, q0		@ zero state so aese's initial
						@ AddRoundKey XORs in q1 as-is
	aese.8		q0, q1			@ SubBytes each byte (ShiftRows
						@ is a no-op on repeated lanes)
	vmov		r0, s0			@ return the substituted word
	bx		lr
ENDPROC(ce_aes_sub)
508
	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]		@ load round key from *src
	aesimc.8	q0, q0			@ apply Inverse MixColumns
	vst1.8		{q0}, [r0]		@ store result to *dst
	bx		lr
ENDPROC(ce_aes_invert)