Commit 5b33e0ec authored by Tianjia Zhang, committed by Herbert Xu
Browse files

crypto: arm64/sm4 - add ARMv8 Crypto Extensions implementation



This adds ARMv8 implementations of SM4 in ECB, CBC, CFB and CTR
modes using the Crypto Extensions. It also includes the key expansion
operation, because the Crypto Extensions instructions are much faster
than the software implementation.

The Crypto Extensions for SM4 can only run on ARMv8 implementations
that have support for these optional extensions.

Benchmark on T-Head Yitian-710 at 2.75 GHz; the data comes from mode 218
of tcrypt. The column headings are the different block lengths. The
data is tabulated and the unit is Mb/s:

sm4-generic |     16       64      128      256     1024     1420     4096
    ECB enc |  80.05    91.42    93.66    94.77    95.69    95.77    95.86
    ECB dec |  79.98    91.41    93.64    94.76    95.66    95.77    95.85
    CBC enc |  78.55    86.50    88.02    88.77    89.36    89.42    89.48
    CBC dec |  76.82    89.06    91.52    92.77    93.75    93.83    93.96
    CFB enc |  77.64    86.13    87.62    88.42    89.08    88.83    89.18
    CFB dec |  77.57    88.34    90.36    91.45    92.34    92.00    92.44
    CTR enc |  77.80    88.28    90.23    91.22    92.11    91.81    92.25
    CTR dec |  77.83    88.22    90.22    91.22    92.04    91.82    92.28
sm4-neon
    ECB enc |  28.31   112.77   203.03   209.89   215.49   202.11   210.59
    ECB dec |  28.36   113.45   203.23   210.00   215.52   202.13   210.65
    CBC enc |  79.32    87.02    88.51    89.28    89.85    89.89    89.97
    CBC dec |  28.29   112.20   203.30   209.82   214.99   201.51   209.95
    CFB enc |  79.59    87.16    88.54    89.30    89.83    89.62    89.92
    CFB dec |  28.12   111.05   202.47   209.02   214.21   210.90   209.12
    CTR enc |  28.04   108.81   200.62   206.65   211.78   208.78   206.74
    CTR dec |  28.02   108.82   200.45   206.62   211.78   208.74   206.70
sm4-ce-cipher
    ECB enc | 336.79   587.13   682.70   747.37   803.75   811.52   818.06
    ECB dec | 339.18   584.52   679.72   743.68   798.82   803.83   811.54
    CBC enc | 316.63   521.47   597.00   647.14   690.82   695.21   700.55
    CBC dec | 291.80   503.79   585.66   640.82   689.86   695.16   701.72
    CFB enc | 294.79   482.31   552.13   594.71   631.60   628.91   638.92
    CFB dec | 293.09   466.44   526.56   563.17   594.41   592.26   601.97
    CTR enc | 309.61   506.13   576.86   620.47   656.38   654.51   665.10
    CTR dec | 306.69   505.57   576.84   620.18   657.09   654.52   665.32
sm4-ce
    ECB enc | 366.96  1329.81  2024.29  2755.50  3790.07  3861.91  4051.40
    ECB dec | 367.30  1323.93  2018.72  2747.43  3787.39  3862.55  4052.62
    CBC enc | 358.09   682.68   807.24   885.35   958.29   963.60   973.73
    CBC dec | 366.51  1303.63  1978.64  2667.93  3624.53  3683.41  3856.08
    CFB enc | 351.51   681.26   807.81   893.10   968.54   969.17   985.83
    CFB dec | 354.98  1266.61  1929.63  2634.81  3614.23  3611.59  3841.68
    CTR enc | 324.23  1121.25  1689.44  2256.70  2981.90  3007.79  3060.74
    CTR dec | 324.18  1120.44  1694.31  2258.32  2982.01  3010.09  3060.99

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 4f1aef9b
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -53,6 +53,12 @@ config CRYPTO_SM4_ARM64_CE
	select CRYPTO_ALGAPI
	select CRYPTO_SM4

# SM4 block-cipher modes (ECB/CBC/CFB/CTR) implemented with the ARMv8
# Crypto Extensions.  Needs kernel-mode NEON; pulls in the skcipher API
# and the SM4 library code.
config CRYPTO_SM4_ARM64_CE_BLK
	tristate "SM4 in ECB/CBC/CFB/CTR modes using ARMv8 Crypto Extensions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_SKCIPHER
	select CRYPTO_LIB_SM4

config CRYPTO_SM4_ARM64_NEON_BLK
	tristate "SM4 in ECB/CBC/CFB/CTR modes using NEON instructions"
	depends on KERNEL_MODE_NEON
+3 −0
Original line number Diff line number Diff line
@@ -23,6 +23,9 @@ sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce-cipher.o
sm4-ce-cipher-y := sm4-ce-cipher-glue.o sm4-ce-cipher-core.o

# sm4-ce: built when CONFIG_CRYPTO_SM4_ARM64_CE_BLK is set, from the two
# objects listed below.
obj-$(CONFIG_CRYPTO_SM4_ARM64_CE_BLK) += sm4-ce.o
sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o

obj-$(CONFIG_CRYPTO_SM4_ARM64_NEON_BLK) += sm4-neon.o
sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o

+660 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.arch	armv8-a+crypto

/*
 * Hand-encoded SM4 instructions.
 *
 * The .irp loop defines one assembler symbol per usable vector register,
 * named ".Lv<N>.4s" and equal to the register number N.  The sm4e and
 * sm4ekey macros below prepend ".L" to their operands (written "vN.4s")
 * to look that number up, then OR it into the register fields of a fixed
 * opcode emitted with .inst.  Presumably this keeps the file buildable
 * with assemblers that do not know the SM4 mnemonics - TODO confirm.
 *
 * Only registers listed in the .irp line have a matching symbol, so only
 * those can be passed to sm4e / sm4ekey.
 */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/* sm4e vd, vn: register number of vn goes in bits [9:5], vd in bits [4:0] */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* sm4ekey vd, vn, vm: vm goes in bits [20:16], vn in [9:5], vd in [4:0] */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/*
 * Scratch vectors.  The chaining-mode routines in this file load blocks
 * from src into these before XORing them into the cipher output.
 */
#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

/*
 * Chaining value (IV, then the previous ciphertext block) used by the
 * CBC routines and by CFB encryption.
 */
#define RIV	v20

/* Helper macros. */

/*
 * PREPARE: load the 128 bytes at x0 (eight 16-byte vectors) into v24-v31,
 * which is where the SM4_CRYPT_BLK* macros read their round keys from.
 * The first load post-increments x0 by 64, so x0 is left pointing 64 bytes
 * past its value on entry.
 */
#define PREPARE                                       \
	ld1		{v24.16b-v27.16b}, [x0], #64; \
	ld1		{v28.16b-v31.16b}, [x0];

/*
 * SM4_CRYPT_BLK(b0): transform the 16-byte block held in vector register
 * b0 in place, using the round keys in v24-v31 (loaded by PREPARE).
 *
 *   rev32        byte-swap every 32-bit word of the block
 *   8 x sm4e     one step per round-key vector, v24 first and v31 last
 *   rev64 + ext  reverse the order of the four 32-bit words
 *   rev32        byte-swap every word back
 *
 * b0 must be a register the sm4e macro can encode.
 */
#define SM4_CRYPT_BLK(b0)                           \
	rev32		b0.16b, b0.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	rev32		b0.16b, b0.16b;

/*
 * SM4_CRYPT_BLK4(b0, b1, b2, b3): four-block version of SM4_CRYPT_BLK.
 *
 * Each of the four vector registers is transformed in place with the same
 * sequence (rev32, eight sm4e steps with v24..v31, word-order reversal,
 * rev32).  Every step is issued for all four blocks before moving on to
 * the next step, so the four independent dependency chains are interleaved
 * (presumably to hide sm4e latency - TODO confirm).
 *
 * All arguments must be registers the sm4e macro can encode.
 */
#define SM4_CRYPT_BLK4(b0, b1, b2, b3)              \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;

/*
 * SM4_CRYPT_BLK8(b0, ..., b7): eight-block version of SM4_CRYPT_BLK.
 *
 * Each of the eight vector registers is transformed in place with the same
 * sequence (rev32, eight sm4e steps with v24..v31, word-order reversal,
 * rev32).  Every step is issued for all eight blocks before the next step
 * starts, so the eight independent dependency chains are interleaved.
 *
 * All arguments must be registers the sm4e macro can encode.
 */
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;             \
	sm4e		b0.4s, v24.4s;              \
	sm4e		b1.4s, v24.4s;              \
	sm4e		b2.4s, v24.4s;              \
	sm4e		b3.4s, v24.4s;              \
	sm4e		b4.4s, v24.4s;              \
	sm4e		b5.4s, v24.4s;              \
	sm4e		b6.4s, v24.4s;              \
	sm4e		b7.4s, v24.4s;              \
	sm4e		b0.4s, v25.4s;              \
	sm4e		b1.4s, v25.4s;              \
	sm4e		b2.4s, v25.4s;              \
	sm4e		b3.4s, v25.4s;              \
	sm4e		b4.4s, v25.4s;              \
	sm4e		b5.4s, v25.4s;              \
	sm4e		b6.4s, v25.4s;              \
	sm4e		b7.4s, v25.4s;              \
	sm4e		b0.4s, v26.4s;              \
	sm4e		b1.4s, v26.4s;              \
	sm4e		b2.4s, v26.4s;              \
	sm4e		b3.4s, v26.4s;              \
	sm4e		b4.4s, v26.4s;              \
	sm4e		b5.4s, v26.4s;              \
	sm4e		b6.4s, v26.4s;              \
	sm4e		b7.4s, v26.4s;              \
	sm4e		b0.4s, v27.4s;              \
	sm4e		b1.4s, v27.4s;              \
	sm4e		b2.4s, v27.4s;              \
	sm4e		b3.4s, v27.4s;              \
	sm4e		b4.4s, v27.4s;              \
	sm4e		b5.4s, v27.4s;              \
	sm4e		b6.4s, v27.4s;              \
	sm4e		b7.4s, v27.4s;              \
	sm4e		b0.4s, v28.4s;              \
	sm4e		b1.4s, v28.4s;              \
	sm4e		b2.4s, v28.4s;              \
	sm4e		b3.4s, v28.4s;              \
	sm4e		b4.4s, v28.4s;              \
	sm4e		b5.4s, v28.4s;              \
	sm4e		b6.4s, v28.4s;              \
	sm4e		b7.4s, v28.4s;              \
	sm4e		b0.4s, v29.4s;              \
	sm4e		b1.4s, v29.4s;              \
	sm4e		b2.4s, v29.4s;              \
	sm4e		b3.4s, v29.4s;              \
	sm4e		b4.4s, v29.4s;              \
	sm4e		b5.4s, v29.4s;              \
	sm4e		b6.4s, v29.4s;              \
	sm4e		b7.4s, v29.4s;              \
	sm4e		b0.4s, v30.4s;              \
	sm4e		b1.4s, v30.4s;              \
	sm4e		b2.4s, v30.4s;              \
	sm4e		b3.4s, v30.4s;              \
	sm4e		b4.4s, v30.4s;              \
	sm4e		b5.4s, v30.4s;              \
	sm4e		b6.4s, v30.4s;              \
	sm4e		b7.4s, v30.4s;              \
	sm4e		b0.4s, v31.4s;              \
	sm4e		b1.4s, v31.4s;              \
	sm4e		b2.4s, v31.4s;              \
	sm4e		b3.4s, v31.4s;              \
	sm4e		b4.4s, v31.4s;              \
	sm4e		b5.4s, v31.4s;              \
	sm4e		b6.4s, v31.4s;              \
	sm4e		b7.4s, v31.4s;              \
	rev64		b0.4s, b0.4s;               \
	rev64		b1.4s, b1.4s;               \
	rev64		b2.4s, b2.4s;               \
	rev64		b3.4s, b3.4s;               \
	rev64		b4.4s, b4.4s;               \
	rev64		b5.4s, b5.4s;               \
	rev64		b6.4s, b6.4s;               \
	rev64		b7.4s, b7.4s;               \
	ext		b0.16b, b0.16b, b0.16b, #8; \
	ext		b1.16b, b1.16b, b1.16b, #8; \
	ext		b2.16b, b2.16b, b2.16b, #8; \
	ext		b3.16b, b3.16b, b3.16b, #8; \
	ext		b4.16b, b4.16b, b4.16b, #8; \
	ext		b5.16b, b5.16b, b5.16b, #8; \
	ext		b6.16b, b6.16b, b6.16b, #8; \
	ext		b7.16b, b7.16b, b7.16b, #8; \
	rev32		b0.16b, b0.16b;             \
	rev32		b1.16b, b1.16b;             \
	rev32		b2.16b, b2.16b;             \
	rev32		b3.16b, b3.16b;             \
	rev32		b4.16b, b4.16b;             \
	rev32		b5.16b, b5.16b;             \
	rev32		b6.16b, b6.16b;             \
	rev32		b7.16b, b7.16b;


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/*
	 * Expand a 128-bit SM4 key into the encryption and decryption
	 * round-key schedules.
	 *
	 * input:
	 *   x0: 128-bit key
	 *   x1: rkey_enc (out, 8 vectors = 128 bytes)
	 *   x2: rkey_dec (out, 8 vectors = 128 bytes)
	 *   x3: fk array
	 *   x4: ck array
	 *
	 * x1, x2 and x4 are advanced; v0-v7 and v24-v31 are overwritten.
	 */

	/* ck constants: one vector per sm4ekey step */
	ld1		{v24.16b-v27.16b}, [x4], #64
	ld1		{v28.16b-v31.16b}, [x4]

	/* v0 = (key with each 32-bit word byte-swapped) ^ fk */
	ld1		{v1.16b}, [x3]
	ld1		{v0.16b}, [x0]
	rev32		v0.16b, v0.16b
	eor		v0.16b, v0.16b, v1.16b

	/* every step consumes the previous vector and the next ck vector */
	sm4ekey		v0.4s, v0.4s, v24.4s
	sm4ekey		v1.4s, v0.4s, v25.4s
	sm4ekey		v2.4s, v1.4s, v26.4s
	sm4ekey		v3.4s, v2.4s, v27.4s
	sm4ekey		v4.4s, v3.4s, v28.4s
	sm4ekey		v5.4s, v4.4s, v29.4s
	sm4ekey		v6.4s, v5.4s, v30.4s
	sm4ekey		v7.4s, v6.4s, v31.4s

	/* encryption schedule: v0..v7 in generation order */
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1]

	/*
	 * Decryption schedule: the same words in the opposite order.  The
	 * vectors are written v7 first, and rev64 + ext reverses the four
	 * 32-bit words inside each vector before it is stored.
	 */
	rev64		v7.4s, v7.4s
	ext		v7.16b, v7.16b, v7.16b, #8
	st1		{v7.16b}, [x2], #16

	rev64		v6.4s, v6.4s
	ext		v6.16b, v6.16b, v6.16b, #8
	st1		{v6.16b}, [x2], #16

	rev64		v5.4s, v5.4s
	ext		v5.16b, v5.16b, v5.16b, #8
	st1		{v5.16b}, [x2], #16

	rev64		v4.4s, v4.4s
	ext		v4.16b, v4.16b, v4.16b, #8
	st1		{v4.16b}, [x2], #16

	rev64		v3.4s, v3.4s
	ext		v3.16b, v3.16b, v3.16b, #8
	st1		{v3.16b}, [x2], #16

	rev64		v2.4s, v2.4s
	ext		v2.16b, v2.16b, v2.16b, #8
	st1		{v2.16b}, [x2], #16

	rev64		v1.4s, v1.4s
	ext		v1.16b, v1.16b, v1.16b, #8
	st1		{v1.16b}, [x2], #16

	rev64		v0.4s, v0.4s
	ext		v0.16b, v0.16b, v0.16b, #8
	st1		{v0.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/*
	 * Transform a single 16-byte block with the supplied round keys.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *
	 * x0 is advanced by PREPARE; v0 and v24-v31 are overwritten.
	 */
	ld1		{v0.16b}, [x2]
	PREPARE

	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/*
	 * Transform nblocks independent 16-byte blocks from src to dst,
	 * eight at a time, then four, then one by one.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 *
	 * nblocks must be at least 1: the single-block loop decrements the
	 * count before testing it, so a count of 0 would wrap around.
	 */
	PREPARE

.Lcrypt_8x:
	/* bit 31 set after the subtraction: fewer than 8 blocks were left */
	sub		w3, w3, #8
	tbnz		w3, #31, .Lcrypt_lt8

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbnz		w3, .Lcrypt_8x
	b		.Lcrypt_done

.Lcrypt_lt8:
	/* put the 8 back: w3 is again the number of blocks still to do */
	add		w3, w3, #8
	cmp		w3, #4
	blt		.Lcrypt_1x

	/* one batch of four */
	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w3, .Lcrypt_done

.Lcrypt_1x:
	/* remaining blocks, one per iteration */
	ld1		{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	st1		{v0.16b}, [x1], #16

	sub		w3, w3, #1
	cbnz		w3, .Lcrypt_1x

.Lcrypt_done:
	ret
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/*
	 * CBC encryption: every block is XORed with the previous output
	 * block (the IV for the first one) before it is run through the
	 * cipher, so blocks are handled strictly one after another.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * nblocks must be at least 1 (the count is only tested after a
	 * block has been processed).  The last output block is written
	 * back to the iv buffer.
	 */
	ld1		{RIV.16b}, [x3]
	PREPARE

.Lcbc_enc_next:
	/* RIV = E(RIV ^ src block) */
	ld1		{RTMP0.16b}, [x2], #16
	eor		RIV.16b, RIV.16b, RTMP0.16b

	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	sub		w4, w4, #1
	cbnz		w4, .Lcbc_enc_next

	/* hand the chaining value back to the caller */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/*
	 * CBC decryption: dst[i] = D(src[i]) ^ src[i-1], with the IV standing
	 * in for src[-1].  Blocks are independent, so they are run through
	 * the cipher eight, four or one at a time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * nblocks must be at least 1 (the single-block loop decrements the
	 * count before testing it).  The last source block is written back
	 * to the iv buffer.
	 *
	 * Each batch reads its source blocks a second time, after the cipher
	 * has run and before the matching dst blocks are stored; the order of
	 * the loads and stores below is therefore kept as is.
	 */
	PREPARE

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_8x:
	/* bit 31 set after the subtraction: fewer than 8 blocks were left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_lt8

	/* x2 only moves past the first four blocks here */
	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2]

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* rewind to the start of the batch and fetch the blocks again */
	sub		x2, x2, #64
	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	/* RTMP3 still holds source block 3, the chaining value for block 4 */
	eor		v4.16b, v4.16b, RTMP3.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v5.16b, v5.16b, RTMP0.16b
	eor		v6.16b, v6.16b, RTMP1.16b
	eor		v7.16b, v7.16b, RTMP2.16b

	/* source block 7 chains into the next batch */
	mov		RIV.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	cbnz		w4, .Lcbc_dec_8x
	b		.Lcbc_dec_done

.Lcbc_dec_lt8:
	/* put the 8 back: w4 is again the number of blocks still to do */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2]

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v1.16b, v1.16b, RTMP0.16b
	eor		v2.16b, v2.16b, RTMP1.16b
	eor		v3.16b, v3.16b, RTMP2.16b

	mov		RIV.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lcbc_dec_done

.Lcbc_dec_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2]

	SM4_CRYPT_BLK(v0)

	/* the source block is read again as the next chaining value */
	eor		v0.16b, v0.16b, RIV.16b
	ld1		{RIV.16b}, [x2], #16
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lcbc_dec_1x

.Lcbc_dec_done:
	/* hand the chaining value back to the caller */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cfb_enc)
	/*
	 * CFB encryption: the previous output block (the IV for the first
	 * one) is run through the cipher and XORed with the source block.
	 * Each block depends on the one before it, so they are handled one
	 * at a time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * nblocks must be at least 1 (the count is only tested after a
	 * block has been processed).  The last output block is written
	 * back to the iv buffer.
	 */
	ld1		{RIV.16b}, [x3]
	PREPARE

.Lcfb_enc_next:
	/* RIV = E(RIV) ^ src block */
	SM4_CRYPT_BLK(RIV)

	ld1		{RTMP0.16b}, [x2], #16
	eor		RIV.16b, RIV.16b, RTMP0.16b
	st1		{RIV.16b}, [x1], #16

	sub		w4, w4, #1
	cbnz		w4, .Lcfb_enc_next

	/* hand the chaining value back to the caller */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_enc)

.align 3
SYM_FUNC_START(sm4_ce_cfb_dec)
	/*
	 * CFB decryption: dst[i] = E(src[i-1]) ^ src[i], with the IV standing
	 * in for src[-1].  All cipher inputs are known up front, so blocks
	 * are run through the cipher eight, four or one at a time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * v0 carries the chaining value between batches.  nblocks must be at
	 * least 1 (the single-block loop decrements the count before testing
	 * it).  The last source block is written back to the iv buffer.
	 *
	 * Each batch reads its source blocks a second time, after the cipher
	 * has run and before the matching dst blocks are stored; the order of
	 * the loads and stores below is therefore kept as is.
	 */
	PREPARE

	ld1		{v0.16b}, [x3]

.Lcfb_dec_8x:
	/* bit 31 set after the subtraction: fewer than 8 blocks were left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcfb_dec_lt8

	/* cipher inputs: v0 = chaining value, v1-v7 = source blocks 0-6 */
	ld1		{v1.16b, v2.16b, v3.16b}, [x2], #48
	ld1		{v4.16b-v7.16b}, [x2]

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* rewind to the start of the batch and XOR in source blocks 0-7 */
	sub		x2, x2, #48
	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v4.16b, v4.16b, RTMP0.16b
	eor		v5.16b, v5.16b, RTMP1.16b
	eor		v6.16b, v6.16b, RTMP2.16b
	eor		v7.16b, v7.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	/* source block 7 chains into the next batch */
	mov		v0.16b, RTMP3.16b

	cbnz		w4, .Lcfb_dec_8x
	b		.Lcfb_dec_done

.Lcfb_dec_lt8:
	/* put the 8 back: w4 is again the number of blocks still to do */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcfb_dec_1x

	sub		w4, w4, #4

	/* cipher inputs: v0 = chaining value, v1-v3 = source blocks 0-2 */
	ld1		{v1.16b, v2.16b, v3.16b}, [x2]

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	mov		v0.16b, RTMP3.16b

	cbz		w4, .Lcfb_dec_done

.Lcfb_dec_1x:
	sub		w4, w4, #1

	SM4_CRYPT_BLK(v0)

	ld1		{RTMP0.16b}, [x2], #16
	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	/* the source block just consumed is the next chaining value */
	mov		v0.16b, RTMP0.16b

	cbnz		w4, .Lcfb_dec_1x

.Lcfb_dec_done:
	/* hand the chaining value back to the caller */
	st1		{v0.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cfb_dec)

/*
 * inc_le128(vctr): write the current counter into vctr as a 16-byte
 * big-endian block, then increment the counter by one.
 *
 *   x7: upper 64 bits of the counter, as a native integer
 *   x8: lower 64 bits of the counter, as a native integer
 *
 * x7 goes to lane d[0] and x8 to lane d[1]; rev64 then reverses the bytes
 * inside each lane so the most significant byte of each half comes first.
 * adds/adc carry from x8 into x7.  The condition flags are overwritten.
 */
#define inc_le128(vctr)                     \
	mov		vctr.d[1], x8;      \
	mov		vctr.d[0], x7;      \
	adds		x8, x8, #1;         \
	adc		x7, x7, xzr;        \
	rev64		vctr.16b, vctr.16b;

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/*
	 * CTR mode: dst[i] = E(ctr + i) ^ src[i].  Counter blocks are built
	 * in registers and run through the cipher eight, four or one at a
	 * time.
	 *
	 * input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 *
	 * nblocks must be at least 1 (the single-block loop decrements the
	 * count before testing it).  The counter following the last one
	 * used is written back to the ctr buffer.
	 */
	PREPARE

	/*
	 * The counter is stored big endian.  ldp uses the CPU byte order, so
	 * a little-endian kernel has to byte-swap both halves to get native
	 * integers.  On a big-endian kernel ldp already yields them, and
	 * swapping there would corrupt the counter, hence CPU_LE().
	 */
	ldp		x7, x8, [x3]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)

.Lctr_loop_blk:
	/* bit 31 set after the subtraction: fewer than 8 blocks were left */
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_tail8

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v4.16b, v4.16b, RTMP0.16b
	eor		v5.16b, v5.16b, RTMP1.16b
	eor		v6.16b, v6.16b, RTMP2.16b
	eor		v7.16b, v7.16b, RTMP3.16b
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_blk

.Lctr_tail8:
	/* put the 8 back: w4 is again the number of blocks still to do */
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_tail4

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	ld1		{RTMP0.16b-RTMP3.16b}, [x2], #64
	eor		v0.16b, v0.16b, RTMP0.16b
	eor		v1.16b, v1.16b, RTMP1.16b
	eor		v2.16b, v2.16b, RTMP2.16b
	eor		v3.16b, v3.16b, RTMP3.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_tail4:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	SM4_CRYPT_BLK(v0)

	ld1		{RTMP0.16b}, [x2], #16
	eor		v0.16b, v0.16b, RTMP0.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_tail4

.Lctr_end:
	/*
	 * store new CTR: back to big-endian byte order, which again needs
	 * the swap only when stp writes little endian
	 */
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)
+372 −0

File added.

Preview size limit exceeded, changes collapsed.