//
// Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//     UINT16 crc_t10dif_pmull(
//         UINT16 init_crc,          // initial CRC value, 16 bits
//         const unsigned char *buf, // buffer pointer to calculate CRC on
//         UINT64 len                // buffer length in bytes (64-bit data)
//     );
//
// Reference paper titled "Fast CRC Computation for Generic
//     Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
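//
// For reference, the following plain C sketch computes the same CRC bit by
// bit (polynomial 0x8bb7, no bit reflection). It is only illustrative -- the
// function name and the kernel-style u8/u16/u64 types used here are not part
// of this file's API:
//
//     static u16 crc_t10dif_ref(u16 crc, const u8 *buf, u64 len)
//     {
//         while (len--) {
//             int i;
//
//             crc ^= (u16)*buf++ << 8;    // feed in the next byte, MSB first
//             for (i = 0; i < 8; i++)     // reduce modulo 0x8bb7
//                 crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
//                                      : crc << 1;
//         }
//         return crc;
//     }
//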

#include <linux/linkage.h>
#include <asm/assembler.h>

#ifdef CONFIG_CPU_ENDIAN_BE8
#define CPU_LE(code...)
#else
#define CPU_LE(code...)		code
#endif

	.text
	.fpu		crypto-neon-fp-armv8

	arg1_low32	.req	r0
	arg2		.req	r1
	arg3		.req	r2

	qzr		.req	q13

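	// The q0-q7 registers used below overlap the d0-d15 registers: qN is
	// the pair d(2N) (low 64 bits) and d(2N+1) (high 64 bits). The aliases
	// that follow give those halves readable names, since vmull.p64 takes
	// 64-bit d registers as inputs while the folded CRC state is kept in
	// full q registers.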
	q0l		.req	d0
	q0h		.req	d1
	q1l		.req	d2
	q1h		.req	d3
	q2l		.req	d4
	q2h		.req	d5
	q3l		.req	d6
	q3h		.req	d7
	q4l		.req	d8
	q4h		.req	d9
	q5l		.req	d10
	q5h		.req	d11
	q6l		.req	d12
	q6h		.req	d13
	q7l		.req	d14
	q7h		.req	d15

ENTRY(crc_t10dif_pmull)
	vmov.i8		qzr, #0			// init zero register

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256 bytes
	cmp		arg3, #256

	// for sizes less than 256, we can't fold 128 bytes at a time...
	blt		_less_than_128

	// load the initial crc value
	// the crc value does not need to be byte-reflected, but it does need
	// to be moved to the high part of the register, because the data will
	// be byte-reflected and will then line up with the initial crc in the
	// right place.
	vmov		s0, arg1_low32		// initial crc
	vext.8		q10, qzr, q0, #4

	// load the initial 128 bytes of data, xor the initial crc value
	vld1.64		{q0-q1}, [arg2, :128]!
	vld1.64		{q2-q3}, [arg2, :128]!
	vld1.64		{q4-q5}, [arg2, :128]!
	vld1.64		{q6-q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0			)
CPU_LE(	vrev64.8	q1, q1			)
CPU_LE(	vrev64.8	q2, q2			)
CPU_LE(	vrev64.8	q3, q3			)
CPU_LE(	vrev64.8	q4, q4			)
CPU_LE(	vrev64.8	q5, q5			)
CPU_LE(	vrev64.8	q6, q6			)
CPU_LE(	vrev64.8	q7, q7			)

	vswp		d0, d1
	vswp		d2, d3
	vswp		d4, d5
	vswp		d6, d7
	vswp		d8, d9
	vswp		d10, d11
	vswp		d12, d13
	vswp		d14, d15

	// XOR the initial_crc value
	veor.8		q0, q0, q10

	adr		ip, rk3
	vld1.64		{q10}, [ip, :128]	// q10 has rk3 and rk4

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this section of the code, there are 128*x+y (0 <= y < 128) bytes
	// of buffer. The _fold_64_B_loop will fold 128 bytes at a time until
	// we have 128+y bytes of buffer left.


	// fold 128 bytes at a time. This section of the code folds 8 vector
	// registers in parallel
_fold_64_B_loop:

	.macro		fold64, reg1, reg2
	vld1.64		{q11-q12}, [arg2, :128]!

	vmull.p64	q8, \reg1\()h, d21
	vmull.p64	\reg1, \reg1\()l, d20
	vmull.p64	q9, \reg2\()h, d21
	vmull.p64	\reg2, \reg2\()l, d20

CPU_LE(	vrev64.8	q11, q11		)
CPU_LE(	vrev64.8	q12, q12		)
	vswp		d22, d23
	vswp		d24, d25

	veor.8		\reg1, \reg1, q8
	veor.8		\reg2, \reg2, q9
	veor.8		\reg1, \reg1, q11
	veor.8		\reg2, \reg2, q12
	.endm
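
	// Each fold64 invocation below performs one folding step per register,
	// as described in the Intel paper: with q10 holding { rk3, rk4 }, the
	// contents of \reg1 are replaced by
	//
	//	\reg1 := (low 64 bits of \reg1  * rk3)
	//	     xor (high 64 bits of \reg1 * rk4)
	//	     xor (next 16 bytes of input)
	//
	// where '*' is the 64x64 -> 128 bit carry-less (GF(2)) multiplication
	// performed by vmull.p64, and likewise for \reg2 with the following 16
	// bytes. rk3 and rk4 are precomputed powers of x modulo the CRC
	// polynomial, chosen so that this multiplication is equivalent to
	// carrying the old state forward across the 128 bytes consumed per
	// loop iteration, leaving the CRC of the overall stream unchanged.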

	fold64		q0, q1
	fold64		q2, q3
	fold64		q4, q5
	fold64		q6, q7

	subs		arg3, arg3, #128

	// check if there is another 128 bytes in the buffer to be able to fold
	bge		_fold_64_B_loop

	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are in 8 of the
	// vector registers: q0-q7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	adr		ip, rk9
	vld1.64		{q10}, [ip, :128]!

	.macro		fold16, reg, rk
	vmull.p64	q8, \reg\()l, d20
	vmull.p64	\reg, \reg\()h, d21
	.ifnb		\rk
	vld1.64		{q10}, [ip, :128]!
	.endif
	veor.8		q7, q7, q8
	veor.8		q7, q7, \reg
	.endm
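
	// fold16 collapses one 16-byte register into q7 using the same folding
	// identity as fold64, but each register needs its own pair of
	// constants because each one sits at a different distance from q7 in
	// the data stream: q0 must be carried forward across 112 bytes, q1
	// across 96, and so on down to q6, whose data precedes q7's by only 16
	// bytes and therefore uses rk1/rk2. The \rk argument merely tells the
	// macro to load the next constant pair from the table starting at rk9.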

	fold16		q0, rk11
	fold16		q1, rk13
	fold16		q2, rk15
	fold16		q3, rk17
	fold16		q4, rk19
	fold16		q5, rk1
	fold16		q6

	// instead of adding 128, we add 112 (128-16) to the loop counter to
	// save 1 instruction from the loop; instead of a cmp instruction, we
	// use the negative flag with the blt instruction
	adds		arg3, arg3, #(128-16)
	blt		_final_reduction_for_128

	// now we have 16+y bytes left to reduce. 16 bytes are in register q7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
	// continue folding 16 bytes at a time

_16B_reduction_loop:
	vmull.p64	q8, d14, d20
	vmull.p64	q7, d15, d21
	veor.8		q7, q7, q8

	vld1.64		{q0}, [arg2, :128]!
CPU_LE(	vrev64.8	q0, q0		)
	vswp		d0, d1
	veor.8		q7, q7, q0
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// bge instruction; equivalent of: cmp arg3, #16-16
	// check if there is any more 16 bytes in the buffer to be able to fold
	bge		_16B_reduction_loop

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the q7 register

_final_reduction_for_128:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	beq		_128_done

	// here we are left with a tail of fewer than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer back before the current position so that
	// we load exactly 16 bytes; after that the registers need to be
	// adjusted.
_get_last_two_regs:
	add		arg2, arg2, arg3
	sub		arg2, arg2, #16
	vld1.64		{q1}, [arg2]
CPU_LE(	vrev64.8	q1, q1			)
	vswp		d2, d3

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]

	// shift q7 to the left by arg3 bytes (into q2)
	vtbl.8		d4, {d14-d15}, d0
	vtbl.8		d5, {d14-d15}, d1

	// shift q7 to the right by 16-arg3 bytes (into q9)
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d19, {d14-d15}, d1

	// blend
	vshr.s8		q0, q0, #7		// convert to 8-bit mask
	vbsl.8		q0, q2, q1

	// fold 16 bytes
	vmull.p64	q8, d18, d20
	vmull.p64	q7, d19, d21
	veor.8		q7, q7, q8
	veor.8		q7, q7, q0

_128_done:
	// compute crc of a 128-bit value
	vldr		d20, rk5
	vldr		d21, rk6		// rk5 and rk6 in q10

	// 64b fold
	vext.8		q0, qzr, q7, #8
	vmull.p64	q7, d15, d20
	veor.8		q7, q7, q0

	// 32b fold
	vext.8		q0, q7, qzr, #12
	vmov		s31, s3
	vmull.p64	q0, d0, d21
	veor.8		q7, q0, q7

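	// What follows is a Barrett reduction of the remaining value in q7 to
	// a 32-bit (scaled) CRC, as described in the Intel paper: the quotient
	// is estimated with one carry-less multiply by rk7 = floor(2^64/Q),
	// multiplied back by rk8 = Q (the polynomial scaled to 33 bits), and
	// XORed into the remainder. The 32 bits holding the scaled CRC are
	// then moved to r0 and shifted back down to 16 bits at _cleanup.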
	// barrett reduction
_barrett:
	vldr		d20, rk7
	vldr		d21, rk8

	vmull.p64	q0, d15, d20
	vext.8		q0, qzr, q0, #12
	vmull.p64	q0, d1, d21
	vext.8		q0, qzr, q0, #12
	veor.8		q7, q7, q0
	vmov		r0, s29

_cleanup:
	// scale the result back to 16 bits
	lsr		r0, r0, #16
	bx		lr

_less_than_128:
	teq		arg3, #0
	beq		_cleanup

	vmov.i8		q0, #0
	vmov		s3, arg1_low32		// get the initial crc value

	vld1.64		{q7}, [arg2, :128]!
CPU_LE(	vrev64.8	q7, q7		)
	vswp		d14, d15
	veor.8		q7, q7, q0

	cmp		arg3, #16
	beq		_128_done		// exactly 16 left
	blt		_less_than_16_left

	// more than 16 bytes left: load the constants
	vldr		d20, rk1
	vldr		d21, rk2		// rk1 and rk2 in q10

	// check if there is enough buffer to be able to fold 16 bytes at a time
	subs		arg3, arg3, #32
	addlt		arg3, arg3, #16
	blt		_get_last_two_regs
	b		_16B_reduction_loop

_less_than_16_left:
	// shift q7 right by 16-arg3 bytes so that only the arg3 valid bytes remain
	adr		ip, tbl_shf_table + 16
	sub		ip, ip, arg3
	vld1.8		{q0}, [ip]
	vmov.i8		q9, #0x80
	veor.8		q0, q0, q9
	vtbl.8		d18, {d14-d15}, d0
	vtbl.8		d15, {d14-d15}, d1
	vmov		d14, d18
	b		_128_done
ENDPROC(crc_t10dif_pmull)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*31) mod Q << 32
// rk4 = 2^(32*33) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
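// rk9..rk20 are the analogous folding constants used by the fold16 calls
// above: each consecutive pair covers a folding distance of 112, 96, 80,
// 64, 48 and 32 bytes respectively, following the same pattern as rk1/rk2
// (e.g. rk9 = 2^(32*27) mod Q << 32, rk10 = 2^(32*29) mod Q << 32).
// rk1 and rk2 are placed immediately after rk20 so that the fold16 sequence
// can keep loading constant pairs with a single post-incremented pointer.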

rk3:	.quad		0x9d9d000000000000
rk4:	.quad		0x7cf5000000000000
rk5:	.quad		0x2d56000000000000
rk6:	.quad		0x1368000000000000
rk7:	.quad		0x00000001f65a57f8
rk8:	.quad		0x000000018bb70000
rk9:	.quad		0xceae000000000000
rk10:	.quad		0xbfd6000000000000
rk11:	.quad		0x1e16000000000000
rk12:	.quad		0x713c000000000000
rk13:	.quad		0xf7f9000000000000
rk14:	.quad		0x80a6000000000000
rk15:	.quad		0x044c000000000000
rk16:	.quad		0xe658000000000000
rk17:	.quad		0xad18000000000000
rk18:	.quad		0xa497000000000000
rk19:	.quad		0x6ee3000000000000
rk20:	.quad		0xe7b5000000000000
rk1:	.quad		0x2d56000000000000
rk2:	.quad		0x06df000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

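// only the 32 bytes below are stored; the vtbl.8 lookups above read a
// 16-byte window starting at (tbl_shf_table + 16 - len). Index bytes with
// the top bit set are out of range for the two-register {d14-d15} table and
// therefore yield zero, which masks off the unwanted lanes; XORing the
// indices with 0x80 flips each one between its in-range and out-of-range
// form, selecting the complementary set of bytes.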
	.byte		0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x0