//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//     UINT16 crc_t10dif_pmull(
//         UINT16 init_crc,          // initial CRC value, 16 bits
//         const unsigned char *buf, // buffer pointer to calculate CRC on
//         UINT64 len                // buffer length in bytes (64-bit data)
//     );
//
// Reference paper titled "Fast CRC Computation for Generic
//     Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
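//
// For reference, the following plain C sketch computes the same CRC bit by
// bit (polynomial 0x8bb7, no bit reflection). It is only illustrative -- the
// function name and the kernel-style u8/u16/u64 types used here are not part
// of this file's API:
//
//     static u16 crc_t10dif_ref(u16 crc, const u8 *buf, u64 len)
//     {
//         while (len--) {
//             int i;
//
//             crc ^= (u16)*buf++ << 8;    // feed in the next byte, MSB first
//             for (i = 0; i < 8; i++)     // reduce modulo 0x8bb7
//                 crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
//                                      : crc << 1;
//         }
//         return crc;
//     }
//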

#include <linux/linkage.h>
#include <asm/assembler.h>

#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)		code
#endif

	.text
	.fpu		crypto-neon-fp-armv8

	arg1_low32	.req	r0
	arg2		.req	r1
	arg3		.req	r2

	qzr		.req	q13

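	// The q0-q7 registers used below overlap the d0-d15 registers: qN is
	// the pair d(2N) (low 64 bits) and d(2N+1) (high 64 bits). The aliases
	// that follow give those halves readable names, since vmull.p64 takes
	// 64-bit d registers as inputs while the folded CRC state is kept in
	// full q registers.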
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15

ENTRY(crc_t10dif_pmull)
	vmov.i8		qzr, #0			// init zero register

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256 bytes
	cmp		arg3, #256

	// for sizes less than 256, we can't fold 128 bytes at a time...
	blt		_less_than_128

	// load the initial crc value
	// the crc value does not need to be byte-reflected, but it does need
	// to be moved to the high part of the register, because the data will
	// be byte-reflected and will then line up with the initial crc in the
	// right place.
	vmov		s0, arg1_low32		// initial crc
	vext.8		q10, qzr, q0, #4

	// load the initial 128 bytes of data, xor the initial crc value
	vld1.64		{q0-q1}, [arg2, :128]!
	vld1.64		{q2-q3}, [arg2, :128]!
	vld1.64		{q4-q5}, [arg2, :128]!
	vld1.64		{q6-q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0			)
CPU_LE(	vrev64.8	q1, q1			)
CPU_LE(	vrev64.8	q2, q2			)
CPU_LE(	vrev64.8	q3, q3			)
CPU_LE(	vrev64.8	q4, q4			)
CPU_LE(	vrev64.8	q5, q5			)
CPU_LE(	vrev64.8	q6, q6			)
CPU_LE(	vrev64.8	q7, q7			)

	vswp		d0, d1
	vswp		d2, d3
	vswp		d4, d5
	vswp		d6, d7
	vswp		d8, d9
	vswp		d10, d11
	vswp		d12, d13
	vswp		d14, d15

	// XOR the initial_crc value
	veor.8		q0, q0, q10

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// q10 has rk3 and rk4

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this section of the code, there are 128*x+y (0 <= y < 128) bytes
	// of buffer. The _fold_64_B_loop will fold 128 bytes at a time until
	// we have 128+y bytes of buffer left.


	// fold 128 bytes at a time. This section of the code folds 8 vector
	// registers in parallel
_fold_64_B_loop:

	.macro		fold64, reg1, reg2
	vld1.64		{q11-q12}, [arg2, :128]!

	vmull.p64	q8, \reg1\()h, d21
	vmull.p64	\reg1, \reg1\()l, d20
	vmull.p64	q9, \reg2\()h, d21
	vmull.p64	\reg2, \reg2\()l, d20

CPU_LE(	vrev64.8	q11, q11		)
CPU_LE(	vrev64.8	q12, q12		)
	vswp		d22, d23
	vswp		d24, d25

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm
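
	// Each fold64 invocation below performs one folding step per register,
	// as described in the Intel paper: with q10 holding { rk3, rk4 }, the
	// contents of \reg1 are replaced by
	//
	//	\reg1 := (low 64 bits of \reg1  * rk3)
	//	     xor (high 64 bits of \reg1 * rk4)
	//	     xor (next 16 bytes of input)
	//
	// where '*' is the 64x64 -> 128 bit carry-less (GF(2)) multiplication
	// performed by vmull.p64, and likewise for \reg2 with the following 16
	// bytes. rk3 and rk4 are precomputed powers of x modulo the CRC
	// polynomial, chosen so that this multiplication is equivalent to
	// carrying the old state forward across the 128 bytes consumed per
	// loop iteration, leaving the CRC of the overall stream unchanged.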

	fold64		q0, q1
	fold64		q2, q3
	fold64		q4, q5
	fold64		q6, q7

	subs		arg3, arg3, #128

	// check if there is another 128 bytes in the buffer to be able to fold
	bge		_fold_64_B_loop

	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are in 8 of the
	// vector registers: q0-q7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	adr		ip, rk9
	vld1.64		{q10}, [ip, :128]!

	.macro		fold16, reg, rk
	vmull.p64	q8, \reg\()l, d20
	vmull.p64	\reg, \reg\()h, d21
	.ifnb		\rk
	vld1.64		{q10}, [ip, :128]!
	.endif
	veor.8		q7, q7, q8
	veor.8		q7, q7, \reg
	.endm
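
	// fold16 collapses one 16-byte register into q7 using the same folding
	// identity as fold64, but each register needs its own pair of
	// constants because each one sits at a different distance from q7 in
	// the data stream: q0 must be carried forward across 112 bytes, q1
	// across 96, and so on down to q6, whose data precedes q7's by only 16
	// bytes and therefore uses rk1/rk2. The \rk argument merely tells the
	// macro to load the next constant pair from the table starting at rk9.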

	fold16		q0, rk11
	fold16		q1, rk13
	fold16		q2, rk15
	fold16		q3, rk17
	fold16		q4, rk19
	fold16		q5, rk1
	fold16		q6

	// instead of adding 128, we add 112 (128-16) to the loop counter to
	// save 1 instruction from the loop; instead of a cmp instruction, we
	// use the negative flag with the blt instruction
	adds		arg3, arg3, #(128-16)
	blt		_final_reduction_for_128

	// now we have 16+y bytes left to reduce. 16 bytes are in register q7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
	// continue folding 16 bytes at a time

_16B_reduction_loop:
	vmull.p64	q8, d14, d20
	vmull.p64	q7, d15, d21
	veor.8		q7, q7, q8

	vld1.64		{q0}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0		)
	vswp		d0, d1
	veor.8		q7, q7, q0
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// bge instruction; equivalent of: cmp arg3, #16-16
	// check if there is any more 16 bytes in the buffer to be able to fold
	bge		_16B_reduction_loop

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the q7 register

_final_reduction_for_128:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	beq		_128_done

	// here we are left with a tail of fewer than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer back before the current position so that
	// we load exactly 16 bytes; after that the registers need to be
	// adjusted.
_get_last_two_regs:
	add		arg2, arg2, arg3
	sub		arg2, arg2, #16
	vld1.64		{q1}, [arg2]
CPU_LE(	vrev64.8	q1, q1			)
	vswp		d2, d3

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]

	// shift q7 to the left by arg3 bytes (into q2)
	vtbl.8		d4, {d14-d15}, d0
	vtbl.8		d5, {d14-d15}, d1

	// shift q7 to the right by 16-arg3 bytes (into q9)
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d19, {d14-d15}, d1

	// blend
	vshr.s8		q0, q0, #7		// convert to 8-bit mask
	vbsl.8		q0, q2, q1

	// fold 16 bytes
	vmull.p64	q8, d18, d20
	vmull.p64	q7, d19, d21
	veor.8		q7, q7, q8
	veor.8		q7, q7, q0

_128_done:
	// compute crc of a 128-bit value
	vldr		d20, rk5
	vldr		d21, rk6		// rk5 and rk6 in q10

	// 64b fold
	vext.8		q0, qzr, q7, #8
	vmull.p64	q7, d15, d20
	veor.8		q7, q7, q0

	// 32b fold
	vext.8		q0, q7, qzr, #12
	vmov		s31, s3
	vmull.p64	q0, d0, d21
	veor.8		q7, q0, q7

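	// What follows is a Barrett reduction of the remaining value in q7 to
	// a 32-bit (scaled) CRC, as described in the Intel paper: the quotient
	// is estimated with one carry-less multiply by rk7 = floor(2^64/Q),
	// multiplied back by rk8 = Q (the polynomial scaled to 33 bits), and
	// XORed into the remainder. The 32 bits holding the scaled CRC are
	// then moved to r0 and shifted back down to 16 bits at _cleanup.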
	// barrett reduction
_barrett:
	vldr		d20, rk7
	vldr		d21, rk8

	vmull.p64	q0, d15, d20
	vext.8		q0, qzr, q0, #12
	vmull.p64	q0, d1, d21
	vext.8		q0, qzr, q0, #12
	veor.8		q7, q7, q0
	vmov		r0, s29

_cleanup:
	// scale the result back to 16 bits
	lsr		r0, r0, #16
	bx		lr

_less_than_128:
	teq		arg3, #0
	beq		_cleanup

	vmov.i8		q0, #0
	vmov		s3, arg1_low32		// get the initial crc value

	vld1.64		{q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q7, q7		)
	vswp		d14, d15
	veor.8		q7, q7, q0

	cmp		arg3, #16
	beq		_128_done		// exactly 16 left
	blt		_less_than_16_left

	// more than 16 bytes left: load the constants
	vldr		d20, rk1
	vldr		d21, rk2		// rk1 and rk2 in q10

	// check if there is enough buffer to be able to fold 16 bytes at a time
	subs		arg3, arg3, #32
	addlt		arg3, arg3, #16
	blt		_get_last_two_regs
	b		_16B_reduction_loop

_less_than_16_left:
	// shift q7 right by 16-arg3 bytes so that only the arg3 valid bytes remain
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d15, {d14-d15}, d1
	vmov		d14, d18
	b		_128_done
ENDPROC(crc_t10dif_pmull)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*31) mod Q << 32
// rk4 = 2^(32*33) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
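// rk9..rk20 are the analogous folding constants used by the fold16 calls
// above: each consecutive pair covers a folding distance of 112, 96, 80,
// 64, 48 and 32 bytes respectively, following the same pattern as rk1/rk2
// (e.g. rk9 = 2^(32*27) mod Q << 32, rk10 = 2^(32*29) mod Q << 32).
// rk1 and rk2 are placed immediately after rk20 so that the fold16 sequence
// can keep loading constant pairs with a single post-incremented pointer.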

rk3:	.quad		0x9d9d000000000000
rk4:	.quad		0x7cf5000000000000
rk5:	.quad		0x2d56000000000000
rk6:	.quad		0x1368000000000000
rk7:	.quad		0x00000001f65a57f8
rk8:	.quad		0x000000018bb70000
rk9:	.quad		0xceae000000000000
rk10:	.quad		0xbfd6000000000000
rk11:	.quad		0x1e16000000000000
rk12:	.quad		0x713c000000000000
rk13:	.quad		0xf7f9000000000000
rk14:	.quad		0x80a6000000000000
rk15:	.quad		0x044c000000000000
rk16:	.quad		0xe658000000000000
rk17:	.quad		0xad18000000000000
rk18:	.quad		0xa497000000000000
rk19:	.quad		0x6ee3000000000000
rk20:	.quad		0xe7b5000000000000
rk1:	.quad		0x2d56000000000000
rk2:	.quad		0x06df000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

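// only the 32 bytes below are stored; the vtbl.8 lookups above read a
// 16-byte window starting at (tbl_shf_table + 16 - len). Index bytes with
// the top bit set are out of range for the two-register {d14-d15} table and
// therefore yield zero, which masks off the unwanted lanes; XORing the
// indices with 0x80 flips each one between its in-range and out-of-range
// form, selecting the complementary set of bytes.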
	.byte		0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0