Blame - arch/arm/crypto/crc32-ce-core.S - linux-5.10

blob: 5cbd4a6fedad7cb3c99ed35295b77f554d967434 [file] [log] [blame]

Ard Biesheuvel	d0a3431	2016-12-05 18:42:28 +0000	[diff] [blame]	1	/*
				2	* Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
				3	*
				4	* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
				5	*
				6	* This program is free software; you can redistribute it and/or modify
				7	* it under the terms of the GNU General Public License version 2 as
				8	* published by the Free Software Foundation.
				9	*/
				10
				11	/* GPL HEADER START
				12	*
				13	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				14	*
				15	* This program is free software; you can redistribute it and/or modify
				16	* it under the terms of the GNU General Public License version 2 only,
				17	* as published by the Free Software Foundation.
				18	*
				19	* This program is distributed in the hope that it will be useful, but
				20	* WITHOUT ANY WARRANTY; without even the implied warranty of
				21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				22	* General Public License version 2 for more details (a copy is included
				23	* in the LICENSE file that accompanied this code).
				24	*
				25	* You should have received a copy of the GNU General Public License
				26	* version 2 along with this program; If not, see http://www.gnu.org/licenses
				27	*
				28	* Please visit http://www.xyratex.com/contact if you need additional
				29	* information or have any questions.
				30	*
				31	* GPL HEADER END
				32	*/
				33
				34	/*
				35	* Copyright 2012 Xyratex Technology Limited
				36	*
				37	* Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
				38	* calculation. (This ARM version performs the carry-less multiplies with the NEON vmull.p64 instruction instead.)
				39	* CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
				40	* PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
				41	* at:
				42	* http://www.intel.com/products/processor/manuals/
				43	* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
				44	* Volume 2B: Instruction Set Reference, N-Z
				45	*
				46	* Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
				47	* Alexander Boyko <Alexander_Boyko@xyratex.com>
				48	*/
				49
				50	#include <linux/linkage.h>
				51	#include <asm/assembler.h>
				52
				53	.text
				54	.align 6
				55	.arch armv8-a
				56	.arch_extension crc
				57	.fpu crypto-neon-fp-armv8
				58
				59	.Lcrc32_constants:	/* CRC32 folding constants; eight 64-bit entries, addressed through r3 by the pmull routines */
				60	/*
				61	* [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
				62	* #define CONSTANT_R1 0x154442bd4LL
				63	*
				64	* [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
				65	* #define CONSTANT_R2 0x1c6e41596LL
				66	*/
				67	.quad 0x0000000154442bd4	/* offset 0: CONSTANT_R1, low half of the 128-bit load done before loop_64 */
				68	.quad 0x00000001c6e41596	/* offset 8: CONSTANT_R2, high half of that load */
				69
				70	/*
				71	* [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
				72	* #define CONSTANT_R3 0x1751997d0LL
				73	*
				74	* [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
				75	* #define CONSTANT_R4 0x0ccaa009eLL
				76	*/
				77	.quad 0x00000001751997d0	/* offset 16: CONSTANT_R3, loaded at less_64 */
				78	.quad 0x00000000ccaa009e	/* offset 24: CONSTANT_R4, loaded at less_64 and still live in fold_64 */
				79
				80	/*
				81	* [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
				82	* #define CONSTANT_R5 0x163cd6124LL
				83	*/
				84	.quad 0x0000000163cd6124	/* offset 32: CONSTANT_R5, multiplier of the final 32-bit fold */
				85	.quad 0x00000000FFFFFFFF	/* offset 40: mask loaded into d6 to keep only the low 32 bits */
				86
				87	/*
				88	* #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
				89	*
				90	* Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
				91	* = 0x1F7011641LL
				92	* #define CONSTANT_RU 0x1F7011641LL
				93	*/
				94	.quad 0x00000001DB710641	/* offset 48: CRCPOLY_TRUE_LE_FULL, second multiplier of the Barrett step */
				95	.quad 0x00000001F7011641	/* offset 56: CONSTANT_RU, first multiplier of the Barrett step */
				96
				97	.Lcrc32c_constants:	/* same eight-entry layout as .Lcrc32_constants (the shared code reads both tables at identical offsets); presumably the CRC32C counterparts of each value - TODO confirm how they were derived */
				98	.quad 0x00000000740eef02	/* offset 0 */
				99	.quad 0x000000009e4addf8	/* offset 8 */
				100	.quad 0x00000000f20c0dfe	/* offset 16 */
				101	.quad 0x000000014cd00bd6	/* offset 24 */
				102	.quad 0x00000000dd45aab8	/* offset 32 */
				103	.quad 0x00000000FFFFFFFF	/* offset 40: same 32-bit mask as in the CRC32 table */
				104	.quad 0x0000000105ec76f0	/* offset 48 */
				105	.quad 0x00000000dea713f1	/* offset 56 */
				106
				107	dCONSTANTl .req d0	/* low 64 bits of qCONSTANT (d0 is the low half of q0) */
				108	dCONSTANTh .req d1	/* high 64 bits of qCONSTANT (d1 is the high half of q0) */
				109	qCONSTANT .req q0	/* current pair of folding constants */
				110
				111	BUF .req r0	/* data pointer; post-incremented by every vld1 in the pmull routines */
				112	LEN .req r1	/* number of bytes still to be folded */
				113	CRC .req r2	/* initial CRC value on entry */
				114
				115	qzr .req q9	/* all-zero vector: cleared once at entry and used as the zero operand of vext */
				116
				117	/**
				118	* Calculate crc32 (entry crc32_pmull_le) or crc32c (entry crc32c_pmull_le); the two entry points differ only in the constant table that r3 points to
				119	* BUF - buffer (r0); every load uses the :128 alignment qualifier, so the address has to be 16-byte aligned
				120	* LEN - sizeof buffer (r1) (multiple of 16 bytes; the low four bits are cleared on entry), LEN should be > 63 because the first 64 bytes are loaded unconditionally
				121	* CRC - initial crc32 (r2)
				122	* return r0 crc32
				123	* uint crc32_pmull_le(unsigned char const *buffer,
				124	* size_t len, uint crc32)
				125	*/
				126	ENTRY(crc32_pmull_le)	/* NOTE(review): q0-q9 are overwritten and nothing is saved here; presumably callers bracket the call with kernel_neon_begin()/kernel_neon_end() - confirm */
				127	adr r3, .Lcrc32_constants	/* r3 = CRC32 constant table */
				128	b 0f	/* join the shared body */
				129
				130	ENTRY(crc32c_pmull_le)
				131	adr r3, .Lcrc32c_constants	/* r3 = CRC32C constant table; falls through into the shared body */
				132
				133	0: bic LEN, LEN, #15	/* round LEN down to a multiple of 16 */
				134	vld1.8 {q1-q2}, [BUF, :128]!	/* q1,q2 = bytes 0..31, BUF += 32 */
				135	vld1.8 {q3-q4}, [BUF, :128]!	/* q3,q4 = bytes 32..63, BUF += 32 */
				136	vmov.i8 qzr, #0
				137	vmov.i8 qCONSTANT, #0
Ard Biesheuvel	1fb1683	2017-02-28 14:36:56 +0000	[diff] [blame]	138	vmov.32 dCONSTANTl[0], CRC	/* d0 = initial CRC in the low 32 bits, upper 32 bits still zero */
Ard Biesheuvel	d0a3431	2016-12-05 18:42:28 +0000	[diff] [blame]	139	veor.8 d2, d2, dCONSTANTl	/* XOR the initial CRC into the low 32 bits of q1 */
				140	sub LEN, LEN, #0x40	/* account for the 64 bytes already loaded */
				141	cmp LEN, #0x40
				142	blt less_64	/* fewer than 64 bytes left: skip the 64-byte loop */
				143
				144	vld1.64 {qCONSTANT}, [r3]	/* d0 = table entry at offset 0, d1 = entry at offset 8 */
				145
				146	loop_64: /* 64 bytes Full cache line folding: q1..q4 are each folded forward and combined with the next 64 input bytes */
				147	sub LEN, LEN, #0x40
				148
				149	vmull.p64 q5, d3, dCONSTANTh	/* high halves first, into scratch q5..q8, because q1..q4 are overwritten just below */
				150	vmull.p64 q6, d5, dCONSTANTh
				151	vmull.p64 q7, d7, dCONSTANTh
				152	vmull.p64 q8, d9, dCONSTANTh
				153
				154	vmull.p64 q1, d2, dCONSTANTl	/* low halves; each product replaces its own source register */
				155	vmull.p64 q2, d4, dCONSTANTl
				156	vmull.p64 q3, d6, dCONSTANTl
				157	vmull.p64 q4, d8, dCONSTANTl
				158
				159	veor.8 q1, q1, q5	/* combine the low and high products ... */
				160	vld1.8 {q5}, [BUF, :128]!	/* ... then reuse the scratch register for the next 16 input bytes */
				161	veor.8 q2, q2, q6
				162	vld1.8 {q6}, [BUF, :128]!
				163	veor.8 q3, q3, q7
				164	vld1.8 {q7}, [BUF, :128]!
				165	veor.8 q4, q4, q8
				166	vld1.8 {q8}, [BUF, :128]!
				167
				168	veor.8 q1, q1, q5	/* XOR the new data into the folded state */
				169	veor.8 q2, q2, q6
				170	veor.8 q3, q3, q7
				171	veor.8 q4, q4, q8
				172
				173	cmp LEN, #0x40
				174	bge loop_64	/* repeat while at least 64 more bytes remain */
				175
				176	less_64: /* Folding cache line into 128bit: q1 is folded into q2, then q3, then q4 */
				177	vldr dCONSTANTl, [r3, #16]	/* switch to the constant pair at offsets 16 and 24 */
				178	vldr dCONSTANTh, [r3, #24]
				179
				180	vmull.p64 q5, d3, dCONSTANTh
				181	vmull.p64 q1, d2, dCONSTANTl
				182	veor.8 q1, q1, q5
				183	veor.8 q1, q1, q2	/* q1 = fold(q1) ^ q2 */
				184
				185	vmull.p64 q5, d3, dCONSTANTh
				186	vmull.p64 q1, d2, dCONSTANTl
				187	veor.8 q1, q1, q5
				188	veor.8 q1, q1, q3	/* q1 = fold(q1) ^ q3 */
				189
				190	vmull.p64 q5, d3, dCONSTANTh
				191	vmull.p64 q1, d2, dCONSTANTl
				192	veor.8 q1, q1, q5
				193	veor.8 q1, q1, q4	/* q1 = fold(q1) ^ q4: single 128-bit accumulator from here on */
				194
				195	teq LEN, #0
				196	beq fold_64	/* no further 16-byte blocks to consume */
				197
				198	loop_16: /* Folding rest buffer into 128bit, one 16-byte block per iteration */
				199	subs LEN, LEN, #0x10	/* sets Z on the last block; the NEON instructions below leave the flags alone */
				200
				201	vld1.8 {q2}, [BUF, :128]!
				202	vmull.p64 q5, d3, dCONSTANTh
				203	vmull.p64 q1, d2, dCONSTANTl
				204	veor.8 q1, q1, q5
				205	veor.8 q1, q1, q2
				206
				207	bne loop_16	/* uses the flags from the subs at the top of the loop */
				208
				209	fold_64:
				210	/* perform the last 64 bit fold, also adds 32 zeroes
				211	* to the input stream */
				212	vmull.p64 q2, d2, dCONSTANTh	/* low half of q1 times the constant still held from offset 24 */
				213	vext.8 q1, q1, qzr, #8	/* q1 = old high half in d2, zero in d3 */
				214	veor.8 q1, q1, q2
				215
				216	/* final 32-bit fold */
				217	vldr dCONSTANTl, [r3, #32]	/* multiplier at offset 32 */
				218	vldr d6, [r3, #40]	/* 0x00000000FFFFFFFF mask from offset 40 */
				219	vmov.i8 d7, #0	/* q3 = the mask zero-extended to 128 bits */
				220
				221	vext.8 q2, q1, qzr, #4	/* q2 = q1 shifted right by 32 bits, zero filled */
				222	vand.8 d2, d2, d6	/* keep only the low 32 bits of q1 as the multiplicand */
				223	vmull.p64 q1, d2, dCONSTANTl
				224	veor.8 q1, q1, q2
				225
				226	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
				227	vldr dCONSTANTl, [r3, #48]	/* table entry at offset 48 */
				228	vldr dCONSTANTh, [r3, #56]	/* table entry at offset 56 */
				229
				230	vand.8 q2, q1, q3	/* low 32 bits of q1 */
				231	vext.8 q2, qzr, q2, #8	/* move that value into d5 (d4 becomes zero) */
				232	vmull.p64 q2, d5, dCONSTANTh	/* first multiply, by the entry at offset 56 */
				233	vand.8 q2, q2, q3	/* keep the low 32 bits of the product */
				234	vmull.p64 q2, d4, dCONSTANTl	/* second multiply, by the entry at offset 48 */
				235	veor.8 q1, q1, q2
				236	vmov r0, s5	/* result = bits 63:32 of q1, returned in r0 */
				237
				238	bx lr
				239	ENDPROC(crc32_pmull_le)
				240	ENDPROC(crc32c_pmull_le)
				241
				242	.macro __crc32, c	/* body shared by the two *_armv8_le routines; c is empty or "c" and is pasted into the crc32 mnemonics and the local label names. On entry r0 = running CRC, r1 = data pointer, r2 = byte count; the result is left in r0 */
				243	subs ip, r2, #8	/* ip = count - 8 */
				244	bmi .Ltail\c	/* fewer than 8 bytes in total */
				245
				246	tst r1, #3
				247	bne .Lunaligned\c	/* pointer is not 4-byte aligned */
				248
				249	teq ip, #0	/* Z set when exactly 8 bytes remain; consumed by the bxeq below */
				250	.Laligned8\c:	/* main loop: 8 bytes per iteration; ip = bytes left after this iteration, Z set when that is zero */
				251	ldrd r2, r3, [r1], #8	/* r2 is free for data now that the count is tracked in ip */
				252	ARM_BE8(rev r2, r2 )	/* ARM_BE8() comes from <asm/assembler.h>; presumably it emits its argument only on big-endian (BE8) builds - confirm */
				253	ARM_BE8(rev r3, r3 )
				254	crc32\c\()w r0, r0, r2
				255	crc32\c\()w r0, r0, r3
				256	bxeq lr	/* nothing left: return (flags come from the teq above or the subs below) */
				257	subs ip, ip, #8
				258	bpl .Laligned8\c	/* at least 8 more bytes available */
				259
				260	.Ltail\c:	/* 0..7 bytes left; ip is (bytes left - 8), so bits 2..0 of ip equal bits 2..0 of the remaining count */
				261	tst ip, #4
				262	beq 2f
				263	ldr r3, [r1], #4	/* one 32-bit word */
				264	ARM_BE8(rev r3, r3 )
				265	crc32\c\()w r0, r0, r3
				266
				267	2: tst ip, #2
				268	beq 1f
				269	ldrh r3, [r1], #2	/* one halfword */
				270	ARM_BE8(rev16 r3, r3 )
				271	crc32\c\()h r0, r0, r3
				272
				273	1: tst ip, #1
				274	bxeq lr	/* no odd byte left: done */
				275	ldrb r3, [r1]	/* final single byte */
				276	crc32\c\()b r0, r0, r3
				277	bx lr
				278
				279	.Lunaligned\c:	/* only reached with at least 8 bytes available; consumes 1..3 bytes until r1 is 4-byte aligned */
				280	tst r1, #1
				281	beq 2f	/* even address that is not a multiple of 4: one halfword fixes it */
				282	ldrb r3, [r1], #1
				283	subs r2, r2, #1
				284	crc32\c\()b r0, r0, r3
				285
				286	tst r1, #2	/* tests the pointer after the byte step */
				287	beq 0f	/* already 4-byte aligned */
				288	2: ldrh r3, [r1], #2
				289	subs r2, r2, #2
				290	ARM_BE8(rev16 r3, r3 )
				291	crc32\c\()h r0, r0, r3
				292
				293	0: subs ip, r2, #8	/* recompute ip; also sets Z for the bxeq in the aligned loop */
				294	bpl .Laligned8\c
				295	b .Ltail\c	/* fewer than 8 bytes left after aligning */
				296	.endm
				297
				298	.align 5
				299	ENTRY(crc32_armv8_le)	/* CRC32 using the crc32w/crc32h/crc32b instructions; r0 = running CRC, r1 = data pointer, r2 = byte count, result in r0 */
				300	__crc32	/* empty macro argument selects the plain crc32 mnemonics */
				301	ENDPROC(crc32_armv8_le)
				302
				303	.align 5
				304	ENTRY(crc32c_armv8_le)	/* CRC32C using the crc32cw/crc32ch/crc32cb instructions; r0 = running CRC, r1 = data pointer, r2 = byte count, result in r0 */
				305	__crc32 c	/* argument "c" selects the crc32c mnemonics */
				306	ENDPROC(crc32c_armv8_le)