Commit 4f1aef9b authored by Tianjia Zhang, committed by Herbert Xu
Browse files

crypto: arm64/sm4 - add ARMv8 NEON implementation



This adds ARMv8 NEON implementations of SM4 in ECB, CBC, CFB and CTR
modes. This implementation uses the plain NEON instruction set. All
S-BOX substitutions use the tbl/tbx instructions of ARMv8; combined
with the out-of-order execution in the CPU, this optimization supports
encryption of up to 8 blocks at the same time.

The performance of encrypting one block is not as good as software
implementation, so the encryption operations of CBC and CFB still
use pure software algorithms.

Benchmark on T-Head Yitian-710 2.75 GHz, the data comes from the 218
mode of tcrypt. The abscissas are blocks of different lengths. The
data is tabulated and the unit is Mb/s:

sm4-generic |     16       64      128      256     1024     1420     4096
    ECB enc |  80.05    91.42    93.66    94.77    95.69    95.77    95.86
    ECB dec |  79.98    91.41    93.64    94.76    95.66    95.77    95.85
    CBC enc |  78.55    86.50    88.02    88.77    89.36    89.42    89.48
    CBC dec |  76.82    89.06    91.52    92.77    93.75    93.83    93.96
    CFB enc |  77.64    86.13    87.62    88.42    89.08    88.83    89.18
    CFB dec |  77.57    88.34    90.36    91.45    92.34    92.00    92.44
    CTR enc |  77.80    88.28    90.23    91.22    92.11    91.81    92.25
    CTR dec |  77.83    88.22    90.22    91.22    92.04    91.82    92.28
sm4-neon
    ECB enc |  28.31   112.77   203.03   209.89   215.49   202.11   210.59
    ECB dec |  28.36   113.45   203.23   210.00   215.52   202.13   210.65
    CBC enc |  79.32    87.02    88.51    89.28    89.85    89.89    89.97
    CBC dec |  28.29   112.20   203.30   209.82   214.99   201.51   209.95
    CFB enc |  79.59    87.16    88.54    89.30    89.83    89.62    89.92
    CFB dec |  28.12   111.05   202.47   209.02   214.21   210.90   209.12
    CTR enc |  28.04   108.81   200.62   206.65   211.78   208.78   206.74
    CTR dec |  28.02   108.82   200.45   206.62   211.78   208.74   206.70

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 02436762
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -53,6 +53,12 @@ config CRYPTO_SM4_ARM64_CE
	select CRYPTO_ALGAPI
	select CRYPTO_SM4

# SM4 ECB/CBC/CFB/CTR driver that uses only the plain NEON instruction set.
# Requires kernel-mode NEON and pulls in the skcipher API and the SM4 library.
config CRYPTO_SM4_ARM64_NEON_BLK
	tristate "SM4 in ECB/CBC/CFB/CTR modes using NEON instructions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_SKCIPHER
	select CRYPTO_LIB_SM4

config CRYPTO_GHASH_ARM64_CE
	tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
	depends on KERNEL_MODE_NEON
+3 −0
Original line number Diff line number Diff line
@@ -23,6 +23,9 @@ sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o
sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o

# sm4-neon is linked from the glue object and the NEON core object.
obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o

obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o

+487 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Register macros
 *
 * NEON register assignment used by the macros and functions in this file:
 *   v0-v7    data blocks being processed
 *   v8-v11   RTMP0-RTMP3, scratch for the transposes and the round macros
 *   v12-v13  RX0/RX1, S-box lookup indices and linear-transform temporaries
 *   v14      RKEY, the four round keys loaded for the current loop pass
 *   v15      RIV, chaining value carried between passes by the CBC routine
 *   v16-v31  the S-box table loaded by PREPARE
 */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

/*
 * PREPARE: load 256 bytes starting at crypto_sm4_sbox into v16-v31, 64 bytes
 * per ld1, so the round macros can look bytes up with tbl/tbx.
 * Clobbers x5, which is used as the table pointer.
 */
#define PREPARE                                                 \
	adr_l		x5, crypto_sm4_sbox;                    \
	ld1		{v16.16b-v19.16b}, [x5], #64;           \
	ld1		{v20.16b-v23.16b}, [x5], #64;           \
	ld1		{v24.16b-v27.16b}, [x5], #64;           \
	ld1		{v28.16b-v31.16b}, [x5];

/*
 * transpose_4x4: treat s0..s3 as the rows of a 4x4 matrix of 32-bit words and
 * transpose it in place, so that afterwards register sN holds word N of each
 * of the four input rows.
 * Clobbers RTMP0-RTMP3.
 */
#define transpose_4x4(s0, s1, s2, s3)                           \
	zip1		RTMP0.4s, s0.4s, s1.4s;                 \
	zip1		RTMP1.4s, s2.4s, s3.4s;                 \
	zip2		RTMP2.4s, s0.4s, s1.4s;                 \
	zip2		RTMP3.4s, s2.4s, s3.4s;                 \
	zip1		s0.2d, RTMP0.2d, RTMP1.2d;              \
	zip2		s1.2d, RTMP0.2d, RTMP1.2d;              \
	zip1		s2.2d, RTMP2.2d, RTMP3.2d;              \
	zip2		s3.2d, RTMP2.2d, RTMP3.2d;

/*
 * rotate_clockwise_90: like transpose_4x4, but the words of each resulting
 * row come out in reverse order: with input rows a, b, c, d (s0..s3), output
 * register sN becomes { d[N], c[N], b[N], a[N] }.
 * Clobbers RTMP0-RTMP3.
 */
#define rotate_clockwise_90(s0, s1, s2, s3)                     \
	zip1		RTMP0.4s, s1.4s, s0.4s;                 \
	zip2		RTMP1.4s, s1.4s, s0.4s;                 \
	zip1		RTMP2.4s, s3.4s, s2.4s;                 \
	zip2		RTMP3.4s, s3.4s, s2.4s;                 \
	zip1		s0.2d, RTMP2.2d, RTMP0.2d;              \
	zip2		s1.2d, RTMP2.2d, RTMP0.2d;              \
	zip1		s2.2d, RTMP3.2d, RTMP1.2d;              \
	zip2		s3.2d, RTMP3.2d, RTMP1.2d;

/*
 * ROUND4: one round applied to four 32-bit lanes at once:
 *
 *   x   = sbox(RKEY.s[round] ^ s1 ^ s2 ^ s3)      (byte-wise lookup)
 *   s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *
 * The S-box lives in v16-v31 as four 64-byte quarters. tbl handles indices
 * 0..63 and yields 0 for anything larger; each following tbx leaves bytes
 * with an out-of-range index untouched, so subtracting 64 from the index
 * (with byte wrap-around) before every tbx makes exactly one of the four
 * lookups hit for each byte.
 *
 * Clobbers RX0 and RTMP0-RTMP3.
 */
#define ROUND4(round, s0, s1, s2, s3)                           \
	dup		RX0.4s, RKEY.s[round];                  \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	eor		RTMP1.16b, s2.16b, s3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RTMP1.4s, RTMP0.4s, #8;                 \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP0.4s, #24;                \
	sri		RTMP1.4s, RTMP0.4s, #(32-8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32-16);           \
	sri		RTMP3.4s, RTMP0.4s, #(32-24);           \
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
	eor		RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
	eor		RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
	shl		RTMP2.4s, RTMP1.4s, 2;                  \
	sri		RTMP2.4s, RTMP1.4s, #(32-2);            \
	eor		RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
	/* s0 ^= RTMP3 */                                       \
	eor		s0.16b, s0.16b, RTMP3.16b;

/*
 * SM4_CRYPT_BLK4: run the full cipher over the four blocks in b0..b3, in
 * place.
 *
 *   1. rev32 swaps the bytes of every 32-bit word, then transpose_4x4
 *      regroups the data so each register holds the same word position of
 *      all four blocks.
 *   2. Eight loop passes, each loading four round keys from [x0] into RKEY
 *      and running four ROUND4 steps (32 rounds in total). The register
 *      roles rotate from one ROUND4 to the next.
 *   3. rotate_clockwise_90 regroups the words per block in reversed word
 *      order, and rev32 swaps the bytes back.
 *
 * x0 is advanced by 128 bytes while the keys are read and rewound at the
 * end, so it is unchanged on exit. Uses numeric local label 4.
 * Clobbers x6 (loop counter), the condition flags, RKEY, RX0 and
 * RTMP0-RTMP3.
 */
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	transpose_4x4(b0, b1, b2, b3);                          \
                                                                \
	mov		x6, 8;                                  \
4:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND4(0, b0, b1, b2, b3);                              \
	ROUND4(1, b1, b2, b3, b0);                              \
	ROUND4(2, b2, b3, b0, b1);                              \
	ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
	bne		4b;                                     \
                                                                \
	rotate_clockwise_90(b0, b1, b2, b3);                    \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;

/*
 * ROUND8: the same round as ROUND4, applied to two independent register
 * groups (s0..s3 and t0..t3) with the two instruction streams interleaved.
 * Both groups use the same round key, RKEY.s[round]:
 *
 *   x   = sbox(rk ^ s1 ^ s2 ^ s3)
 *   s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24)
 *   (and likewise for t0..t3)
 *
 * The S-box lookup uses the same tbl/tbx chain as ROUND4: the index is
 * reduced by 64 before each tbx so that only one 64-byte quarter of the
 * table in v16-v31 matches any given byte.
 *
 * Clobbers RX0, RX1 and RTMP0-RTMP3.
 */
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
	/* rk ^ s1 ^ s2 ^ s3 */                                 \
	dup		RX0.4s, RKEY.s[round];                  \
	eor		RTMP0.16b, s2.16b, s3.16b;              \
	mov		RX1.16b, RX0.16b;                       \
	eor		RTMP1.16b, t2.16b, t3.16b;              \
	eor		RX0.16b, RX0.16b, s1.16b;               \
	eor		RX1.16b, RX1.16b, t1.16b;               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
	/* sbox, non-linear part */                             \
	movi		RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
	tbl		RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
	tbl		RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
	sub		RX0.16b, RX0.16b, RTMP3.16b;            \
	sub		RX1.16b, RX1.16b, RTMP3.16b;            \
	tbx		RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
	tbx		RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
	/* linear part */                                       \
	shl		RX0.4s, RTMP0.4s, #8;                   \
	shl		RX1.4s, RTMP1.4s, #8;                   \
	shl		RTMP2.4s, RTMP0.4s, #16;                \
	shl		RTMP3.4s, RTMP1.4s, #16;                \
	sri		RX0.4s, RTMP0.4s, #(32 - 8);            \
	sri		RX1.4s, RTMP1.4s, #(32 - 8);            \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 16);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 16);         \
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
	eor		RX0.16b, RX0.16b, RTMP0.16b;            \
	eor		RX1.16b, RX1.16b, RTMP1.16b;            \
	eor		RX0.16b, RX0.16b, RTMP2.16b;            \
	eor		RX1.16b, RX1.16b, RTMP3.16b;            \
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
	shl		RTMP2.4s, RTMP0.4s, #24;                \
	shl		RTMP3.4s, RTMP1.4s, #24;                \
	sri		RTMP2.4s, RTMP0.4s, #(32 - 24);         \
	sri		RTMP3.4s, RTMP1.4s, #(32 - 24);         \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	shl		RTMP2.4s, RX0.4s, #2;                   \
	shl		RTMP3.4s, RX1.4s, #2;                   \
	sri		RTMP2.4s, RX0.4s, #(32 - 2);            \
	sri		RTMP3.4s, RX1.4s, #(32 - 2);            \
	eor		RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
	eor		RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
	/* s0/t0 ^= RTMP0/1 */                                  \
	eor		s0.16b, s0.16b, RTMP0.16b;              \
	eor		t0.16b, t0.16b, RTMP1.16b;

/*
 * SM4_CRYPT_BLK8: run the full cipher over the eight blocks in b0..b7, in
 * place, as two groups of four (b0..b3 and b4..b7) that share each ROUND8.
 *
 *   1. rev32 swaps the bytes of every 32-bit word, then each group is
 *      regrouped with transpose_4x4.
 *   2. Eight loop passes, each loading four round keys from [x0] into RKEY
 *      and running four ROUND8 steps (32 rounds in total).
 *   3. rotate_clockwise_90 regroups each group per block in reversed word
 *      order, and rev32 swaps the bytes back.
 *
 * x0 is advanced by 128 bytes while the keys are read and rewound at the
 * end, so it is unchanged on exit. Uses numeric local label 8.
 * Clobbers x6 (loop counter), the condition flags, RKEY, RX0, RX1 and
 * RTMP0-RTMP3.
 */
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)          \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	transpose_4x4(b0, b1, b2, b3);                          \
	transpose_4x4(b4, b5, b6, b7);                          \
                                                                \
	mov		x6, 8;                                  \
8:                                                              \
	ld1		{RKEY.4s}, [x0], #16;                   \
	subs		x6, x6, #1;                             \
                                                                \
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
	bne		8b;                                     \
                                                                \
	rotate_clockwise_90(b0, b1, b2, b3);                    \
	rotate_clockwise_90(b4, b5, b6, b7);                    \
	rev32		b0.16b, b0.16b;                         \
	rev32		b1.16b, b1.16b;                         \
	rev32		b2.16b, b2.16b;                         \
	rev32		b3.16b, b3.16b;                         \
	rev32		b4.16b, b4.16b;                         \
	rev32		b5.16b, b5.16b;                         \
	rev32		b6.16b, b6.16b;                         \
	rev32		b7.16b, b7.16b;                         \
                                                                \
	/* repoint to rkey */                                   \
	sub		x0, x0, #128;


/*
 * __sm4_neon_crypt_blk1_4 - run SM4 over one to four 16-byte blocks
 *
 * Arguments:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   w3: num blocks (1..4)
 *
 * All four vector lanes always go through SM4_CRYPT_BLK4. Lanes without a
 * source block carry a copy of block 0 and are not written back.
 */
.align 3
SYM_FUNC_START_LOCAL(__sm4_neon_crypt_blk1_4)
	PREPARE

	/* Seed v0-v3 with block 0, then replace v1-v3 with real input. */
	ld1		{v0.16b}, [x2], #16
	mov		v1.16b, v0.16b
	mov		v2.16b, v0.16b
	mov		v3.16b, v0.16b
	cmp		w3, #2
	b.lt		.Lblk4_in_ready		/* one block */
	ld1		{v1.16b}, [x2], #16
	b.eq		.Lblk4_in_ready		/* two blocks */
	ld1		{v2.16b}, [x2], #16
	cmp		w3, #3
	b.eq		.Lblk4_in_ready		/* three blocks */
	ld1		{v3.16b}, [x2]

.Lblk4_in_ready:
	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	/* Write back only as many results as blocks were supplied. */
	st1		{v0.16b}, [x1], #16
	cmp		w3, #2
	b.lt		.Lblk4_out_done
	st1		{v1.16b}, [x1], #16
	b.eq		.Lblk4_out_done
	st1		{v2.16b}, [x1], #16
	cmp		w3, #3
	b.eq		.Lblk4_out_done
	st1		{v3.16b}, [x1]

.Lblk4_out_done:
	ret
SYM_FUNC_END(__sm4_neon_crypt_blk1_4)

/*
 * sm4_neon_crypt_blk1_8 - run SM4 over one to eight 16-byte blocks
 *
 * Arguments:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   w3: num blocks (1..8)
 *
 * Requests for fewer than five blocks are handed to
 * __sm4_neon_crypt_blk1_4. Otherwise all eight lanes go through
 * SM4_CRYPT_BLK8; lanes without a source block carry a copy of block 4 and
 * are not written back.
 */
.align 3
SYM_FUNC_START(sm4_neon_crypt_blk1_8)
	cmp		w3, #5
	b.lt		__sm4_neon_crypt_blk1_4	/* tail call, same arguments */

	PREPARE

	/* Blocks 0-4 exist on this path; v5-v7 start as copies of block 4. */
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b}, [x2], #16
	mov		v5.16b, v4.16b
	mov		v6.16b, v4.16b
	mov		v7.16b, v4.16b
	/* Still testing the "cmp w3, #5" from the function entry. */
	b.eq		.Lblk8_in_ready		/* five blocks */
	ld1		{v5.16b}, [x2], #16
	cmp		w3, #7
	b.lt		.Lblk8_in_ready		/* six blocks */
	ld1		{v6.16b}, [x2], #16
	b.eq		.Lblk8_in_ready		/* seven blocks */
	ld1		{v7.16b}, [x2]

.Lblk8_in_ready:
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Results 0-4 are always stored; the rest depend on the block count. */
	cmp		w3, #6
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b}, [x1], #16
	b.lt		.Lblk8_out_done
	st1		{v5.16b}, [x1], #16
	b.eq		.Lblk8_out_done
	st1		{v6.16b}, [x1], #16
	cmp		w3, #7
	b.eq		.Lblk8_out_done
	st1		{v7.16b}, [x1]

.Lblk8_out_done:
	ret
SYM_FUNC_END(sm4_neon_crypt_blk1_8)

/*
 * sm4_neon_crypt_blk8 - run SM4 over groups of eight 16-byte blocks
 *
 * Arguments:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   w3: nblocks (multiples of 8)
 */
.align 3
SYM_FUNC_START(sm4_neon_crypt_blk8)
	PREPARE

	/* Leave immediately when fewer than eight blocks are requested. */
	subs		w3, w3, #8
	b.mi		.Lcrypt_done

.Lcrypt_next8:
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	/* Go round again while another full group of eight remains. */
	subs		w3, w3, #8
	b.pl		.Lcrypt_next8

.Lcrypt_done:
	ret
SYM_FUNC_END(sm4_neon_crypt_blk8)

/*
 * sm4_neon_cbc_dec_blk8 - CBC decryption, eight blocks per pass
 *
 * Arguments:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: iv (big endian, 128 bit)
 *   w4: nblocks (multiples of 8)
 *
 * The last ciphertext block processed is written back to [x3] as the IV
 * for the next call.
 */
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec_blk8)
	PREPARE

	ld1		{RIV.16b}, [x3]

	subs		w4, w4, #8
	b.mi		.Lcbc_done

.Lcbc_next8:
	/* The second load leaves x2 in the middle of the group. */
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2]

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/*
	 * v0-v7 were overwritten by the cipher output, so step x2 back to the
	 * start of the group and fetch the ciphertext again: block N is XORed
	 * with ciphertext N-1, and block 0 with the incoming IV.
	 */
	sub		x2, x2, #64
	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	eor		v4.16b, v4.16b, RTMP3.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v5.16b, v5.16b, RTMP0.16b
	eor		v6.16b, v6.16b, RTMP1.16b
	eor		v7.16b, v7.16b, RTMP2.16b

	/* Ciphertext block 7 chains into the next group. */
	mov		RIV.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	subs		w4, w4, #8
	b.pl		.Lcbc_next8

.Lcbc_done:
	/* hand the updated IV back to the caller */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec_blk8)

/*
 * sm4_neon_cfb_dec_blk8 - CFB decryption, eight blocks per pass
 *
 * Arguments:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: iv (big endian, 128 bit)
 *   w4: nblocks (multiples of 8)
 *
 * The last ciphertext block processed is written back to [x3] as the IV
 * for the next call.
 */
.align 3
SYM_FUNC_START(sm4_neon_cfb_dec_blk8)
	PREPARE

	ld1		{v0.16b}, [x3]

	subs		w4, w4, #8
	b.mi		.Lcfb_done

.Lcfb_next8:
	/*
	 * Cipher input is the IV followed by ciphertext blocks 0-6. The
	 * second load leaves x2 48 bytes into the group.
	 */
	ld1		{v1.16b-v3.16b}, [x2], #48
	ld1		{v4.16b-v7.16b}, [x2]

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Step back to ciphertext block 0 and XOR it over the keystream. */
	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v4.16b, v4.16b, RTMP0.16b
	eor		v5.16b, v5.16b, RTMP1.16b
	eor		v6.16b, v6.16b, RTMP2.16b
	eor		v7.16b, v7.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	/* Ciphertext block 7 is the first cipher input of the next group. */
	mov		v0.16b, RTMP3.16b

	subs		w4, w4, #8
	b.pl		.Lcfb_next8

.Lcfb_done:
	/* hand the updated IV back to the caller */
	st1		{v0.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cfb_dec_blk8)

/*
 * inc_le128(vctr): place the current 128-bit counter into vctr as a
 * big-endian block, then advance the counter by one.
 *
 * x7 and x8 hold the high and low 64 bits of the counter as native
 * integers. rev64 reverses the bytes of each 64-bit half so that byte lane 0
 * of vctr ends up holding the most significant counter byte, matching the
 * layout of a block loaded with "ld1 {v.16b}".
 * Clobbers the condition flags (adds/adc).
 */
#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;

.align 3
SYM_FUNC_START(sm4_neon_ctr_enc_blk8)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks (multiples of 8)
	 *
	 * Eight consecutive counter blocks are encrypted per pass and XORed
	 * over the source data. The advanced counter is written back to [x3].
	 */
	PREPARE;

	/*
	 * Fetch the counter as two native 64-bit integers (x7 = high half,
	 * x8 = low half). The in-memory value is big endian, so the byte swap
	 * is needed on little-endian kernels only; on a big-endian kernel ldp
	 * already yields the right values and swapping would corrupt them.
	 */
	ldp		x7, x8, [x3];
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)

.Lctr_loop_blk:
	subs		w4, w4, #8;
	bmi		.Lctr_end;

	/* construct CTRs */
	inc_le128(v0);			/* +0 */
	inc_le128(v1);			/* +1 */
	inc_le128(v2);			/* +2 */
	inc_le128(v3);			/* +3 */
	inc_le128(v4);			/* +4 */
	inc_le128(v5);			/* +5 */
	inc_le128(v6);			/* +6 */
	inc_le128(v7);			/* +7 */

	/* v0-v7 become the keystream for this group */
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v0.16b, v0.16b, RTMP0.16b;
	eor		v1.16b, v1.16b, RTMP1.16b;
	eor		v2.16b, v2.16b, RTMP2.16b;
	eor		v3.16b, v3.16b, RTMP3.16b;
	st1		{v0.16b-v3.16b}, [x1], #64;

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64;
	eor		v4.16b, v4.16b, RTMP0.16b;
	eor		v5.16b, v5.16b, RTMP1.16b;
	eor		v6.16b, v6.16b, RTMP2.16b;
	eor		v7.16b, v7.16b, RTMP3.16b;
	st1		{v4.16b-v7.16b}, [x1], #64;

	b		.Lctr_loop_blk;

.Lctr_end:
	/* store new CTR, converting back to big endian where necessary */
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	stp		x7, x8, [x3];

	ret;
SYM_FUNC_END(sm4_neon_ctr_enc_blk8)
+442 −0

File added.

Preview size limit exceeded, changes collapsed.