//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
// UINT16 crc_t10dif_pmull(
//               UINT16 init_crc,          // initial CRC value, 16 bits
//               const unsigned char *buf, // buffer pointer to calculate CRC on
//               UINT64 len                // buffer length in bytes (64-bit data)
//       );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//

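// An illustrative C-level sketch of the call (hedged: the prototype is
// inferred from the API comment above; real kernel callers reach this
// routine through the crypto API glue code rather than calling it
// directly):
//
//	u16 crc = crc_t10dif_pmull(0, buf, len);
//
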
#include <linux/linkage.h>
#include <asm/assembler.h>

#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)		code
#endif
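
// CPU_LE() emits its argument only on little-endian builds; the byte
// reversals it wraps below are not needed on BE8 kernels.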

	.text
	.fpu		crypto-neon-fp-armv8

	arg1_low32	.req	r0
	arg2		.req	r1
	arg3		.req	r2

	qzr		.req	q13

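	// qNl/qNh alias the low and high 64-bit halves (d registers) of
	// each NEON q register; vmull.p64 takes d registers as operands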
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15

ENTRY(crc_t10dif_pmull)
	vmov.i8		qzr, #0			// init zero register

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256, we can't fold 128B at a time...
	blt		_less_than_128

	// load the initial crc value
	// crc value does not need to be byte-reflected, but it needs
	// to be moved to the high part of the register,
	// because data will be byte-reflected and will align with
	// the initial crc at the correct place
	vmov		s0, arg1_low32		// initial crc
	vext.8		q10, qzr, q0, #4

	// load the initial 128B of data and xor in the initial crc value
	vld1.64		{q0-q1}, [arg2, :128]!
	vld1.64		{q2-q3}, [arg2, :128]!
	vld1.64		{q4-q5}, [arg2, :128]!
	vld1.64		{q6-q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0			)
CPU_LE(	vrev64.8	q1, q1			)
CPU_LE(	vrev64.8	q2, q2			)
CPU_LE(	vrev64.8	q3, q3			)
CPU_LE(	vrev64.8	q4, q4			)
CPU_LE(	vrev64.8	q5, q5			)
CPU_LE(	vrev64.8	q6, q6			)
CPU_LE(	vrev64.8	q7, q7			)

	// together with the vrev64.8 above, these swaps byte-reverse
	// each full 128-bit register
	vswp		d0, d1
	vswp		d2, d3
	vswp		d4, d5
	vswp		d6, d7
	vswp		d8, d9
	vswp		d10, d11
	vswp		d12, d13
	vswp		d14, d15

	// XOR the initial_crc value
	veor.8		q0, q0, q10

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// q10 has rk3 and rk4

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this section of the code, there is 128*x+y (0 <= y < 128) bytes
	// of buffer. The _fold_64_B_loop will fold 128B at a time
	// until we have 128+y bytes of buffer

	// fold 128B at a time. This section of the code folds 8 vector
	// registers in parallel
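	//
	// Each fold step handles 16 bytes per accumulator; in illustrative
	// C-like pseudocode (helper names assumed, not actual kernel code):
	//
	//	acc = clmul(acc_hi, rk4) ^ clmul(acc_lo, rk3) ^ next16(buf);
	//
	// i.e. the running remainder is multiplied through by x^N mod P so
	// that it stays aligned with the data still being read.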
_fold_64_B_loop:

	.macro		fold64, reg1, reg2
	vld1.64		{q11-q12}, [arg2, :128]!

	vmull.p64	q8, \reg1\()h, d21
	vmull.p64	\reg1, \reg1\()l, d20
	vmull.p64	q9, \reg2\()h, d21
	vmull.p64	\reg2, \reg2\()l, d20

CPU_LE(	vrev64.8	q11, q11		)
CPU_LE(	vrev64.8	q12, q12		)
	vswp		d22, d23
	vswp		d24, d25

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm

	fold64		q0, q1
	fold64		q2, q3
	fold64		q4, q5
	fold64		q6, q7

	subs		arg3, arg3, #128

	// check if there is another 128B in the buffer to be able to fold
	bge		_fold_64_B_loop

	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128B of folded data is in 8 of the vector
	// registers: q0-q7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	adr		ip, rk9
	vld1.64		{q10}, [ip, :128]!

	.macro		fold16, reg, rk
	vmull.p64	q8, \reg\()l, d20
	vmull.p64	\reg, \reg\()h, d21
	.ifnb		\rk
	// the \rk argument only documents which constant pair this
	// post-incrementing load leaves in q10 for the next fold16
	vld1.64		{q10}, [ip, :128]!
	.endif
	veor.8		q7, q7, q8
	veor.8		q7, q7, \reg
	.endm

	fold16		q0, rk11
	fold16		q1, rk13
	fold16		q2, rk15
	fold16		q3, rk17
	fold16		q4, rk19
	fold16		q5, rk1
	fold16		q6

	// instead of 128, we add 112 (128-16) to the loop counter to save 1
	// instruction from the loop; instead of a cmp instruction, we use
	// the negative flag with the blt instruction
	adds		arg3, arg3, #(128-16)
	blt		_final_reduction_for_128

	// now we have 16+y bytes left to reduce. 16 bytes are in register q7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16.
	// continue folding 16B at a time

_16B_reduction_loop:
	vmull.p64	q8, d14, d20
	vmull.p64	q7, d15, d21
	veor.8		q7, q7, q8

	vld1.64		{q0}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0		)
	vswp		d0, d1
	veor.8		q7, q7, q0
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// bge instruction; equivalent of: cmp arg3, 16-16
	// check if there is any more 16B in the buffer to be able to fold
	bge		_16B_reduction_loop

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the q7 register

_final_reduction_for_128:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	beq		_128_done

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
	add		arg2, arg2, arg3
	sub		arg2, arg2, #16
	vld1.64		{q1}, [arg2]
CPU_LE(	vrev64.8	q1, q1		)
	vswp		d2, d3

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]

	// shift q7 left by arg3 bytes, into q2
	vtbl.8		d4, {d14-d15}, d0
	vtbl.8		d5, {d14-d15}, d1

	// shift q7 right by 16-arg3 bytes, into q9
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d19, {d14-d15}, d1

	// blend
	vshr.s8		q0, q0, #7		// convert to 8-bit mask
	vbsl.8		q0, q2, q1

	// fold 16 bytes
	vmull.p64	q8, d18, d20
	vmull.p64	q7, d19, d21
	veor.8		q7, q7, q8
	veor.8		q7, q7, q0

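	// Net effect of the tail handling above, as an illustrative
	// pseudocode sketch (byte shifts are done via the vtbl masks):
	//
	//	q2 = q7 << 8*arg3		// state bytes kept unfolded
	//	q9 = q7 >> 8*(16 - arg3)	// state bytes folded out now
	//	q0 = blend(q2, q1)		// splice in the final arg3 bytes
	//	q7 = clmul_fold(q9) ^ q0	// one last 16-byte fold
	//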
_128_done:
	// compute crc of a 128-bit value
	vldr		d20, rk5
	vldr		d21, rk6		// rk5 and rk6 in q10

	// 64b fold
	vext.8		q0, qzr, q7, #8
	vmull.p64	q7, d15, d20
	veor.8		q7, q7, q0

	// 32b fold
	vext.8		q0, q7, qzr, #12
	vmov		s31, s3
	vmull.p64	q0, d0, d21
	veor.8		q7, q0, q7

	// barrett reduction
_barrett:
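	// Barrett reduction computes R mod Q without a division, using
	// mu = floor(x^64 / Q) (rk7) and Q itself (rk8). Loosely, as an
	// illustrative pseudocode sketch (the shifts are performed with
	// vext below):
	//
	//	T1  = clmul(high_part(R), mu)	// estimate the quotient
	//	T2  = clmul(high_part(T1), Q)	// multiply back by the poly
	//	crc = R ^ T2			// remainder lands in the low bits
	//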
	vldr		d20, rk7
	vldr		d21, rk8

	vmull.p64	q0, d15, d20
	vext.8		q0, qzr, q0, #12
	vmull.p64	q0, d1, d21
	vext.8		q0, qzr, q0, #12
	veor.8		q7, q7, q0
	vmov		r0, s29

_cleanup:
	// scale the result back to 16 bits
	lsr		r0, r0, #16
	bx		lr

_less_than_128:
	teq		arg3, #0
	beq		_cleanup

	vmov.i8		q0, #0
	vmov		s3, arg1_low32		// get the initial crc value

	vld1.64		{q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q7, q7		)
	vswp		d14, d15
	veor.8		q7, q7, q0

	cmp		arg3, #16
	beq		_128_done		// exactly 16 left
	blt		_less_than_16_left

	// there is more data: load the constants
	vldr		d20, rk1
	vldr		d21, rk2		// rk1 and rk2 in q10

	// check if there is enough buffer to be able to fold 16B at a time
	subs		arg3, arg3, #32
	addlt		arg3, arg3, #16
	blt		_get_last_two_regs
	b		_16B_reduction_loop

_less_than_16_left:
	// shift the CRC state in q7 right by 16-arg3 bytes, discarding the
	// bytes that were loaded beyond the end of the data, using the same
	// table-driven byte shuffle as in _get_last_two_regs
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d15, {d14-d15}, d1
	vmov		d14, d18
	b		_128_done
ENDPROC(crc_t10dif_pmull)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

rk3:	.quad		0x9d9d000000000000
rk4:	.quad		0x7cf5000000000000
rk5:	.quad		0x2d56000000000000
rk6:	.quad		0x1368000000000000
rk7:	.quad		0x00000001f65a57f8
rk8:	.quad		0x000000018bb70000
rk9:	.quad		0xceae000000000000
rk10:	.quad		0xbfd6000000000000
rk11:	.quad		0x1e16000000000000
rk12:	.quad		0x713c000000000000
rk13:	.quad		0xf7f9000000000000
rk14:	.quad		0x80a6000000000000
rk15:	.quad		0x044c000000000000
rk16:	.quad		0xe658000000000000
rk17:	.quad		0xad18000000000000
rk18:	.quad		0xa497000000000000
rk19:	.quad		0x6ee3000000000000
rk20:	.quad		0xe7b5000000000000
rk1:	.quad		0x2d56000000000000
rk2:	.quad		0x06df000000000000

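// As a cross-check, the folding constants follow the pattern given in
// the comment block above; in illustrative pseudocode over GF(2)
// (helper names assumed, not kernel code):
//
//	rk1 = poly_mod(x^(32*3), Q) << 32	// = 0x2d56000000000000
//	rk7 = poly_div(x^64, Q)			// = 0x00000001f65a57f8
//
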
tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0
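
// Example: a 3-byte tail loads 16 bytes from tbl_shf_table + 16 - 3,
// the "shl 3 (16-13)" pattern above. vtbl.8 writes zero for any index
// with bit 7 set (out of range for the 16-byte table), so one table
// provides both the byte-wise left shift and, after xoring the indices
// with 0x80, the matching right shift.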