Commit f0d43b3a authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull s390 updates from Heiko Carstens:
 "Besides all the small improvements and cleanups the most notable part
  is the fast vector/SIMD implementation of the ChaCha20 stream cipher,
  which is an adaptation of Andy Polyakov's code for the kernel.

  Summary:

   - add fast vector/SIMD implementation of the ChaCha20 stream cipher,
     which mainly adapts Andy Polyakov's code for the kernel

   - add status attribute to AP queue device so users can easily figure
     out its status

   - fix race in page table release code, and add lots of documentation

   - remove uevent suppress from cio device driver, since it turned out
     that it caused more problems than it solved

   - quite a lot of virtual vs physical address confusion fixes

   - various other small improvements and cleanups all over the place"

* tag 's390-5.17-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (39 commits)
  s390/dasd: use default_groups in kobj_type
  s390/sclp_sd: use default_groups in kobj_type
  s390/pci: simplify __pciwb_mio() inline asm
  s390: remove unused TASK_SIZE_OF
  s390/crash_dump: fix virtual vs physical address handling
  s390/crypto: fix compile error for ChaCha20 module
  s390/mm: check 2KB-fragment page on release
  s390/mm: better annotate 2KB pagetable fragments handling
  s390/mm: fix 2KB pgtable release race
  s390/sclp: release SCLP early buffer after kernel initialization
  s390/nmi: disable interrupts on extended save area update
  s390/zcrypt: CCA control CPRB sending
  s390/disassembler: update opcode table
  s390/uv: fix memblock virtual vs physical address confusion
  s390/smp: fix memblock_phys_free() vs memblock_free() confusion
  s390/sclp: fix memblock_phys_free() vs memblock_free() confusion
  s390/exit: remove dead reference to do_exit from copy_thread
  s390/ap: add missing virt_to_phys address conversion
  s390/pgalloc: use pointers instead of unsigned long values
  s390/pgalloc: add virt/phys address handling to base asce functions
  ...
parents 9b9e2113 0704a858
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -770,6 +770,7 @@ CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
CONFIG_CRYPTO_DES_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_CHACHA_S390=m
CONFIG_CRYPTO_GHASH_S390=m
CONFIG_CRYPTO_CRC32_S390=y
CONFIG_CRYPTO_DEV_VIRTIO=m
+1 −0
Original line number Diff line number Diff line
@@ -757,6 +757,7 @@ CONFIG_CRYPTO_SHA3_256_S390=m
CONFIG_CRYPTO_SHA3_512_S390=m
CONFIG_CRYPTO_DES_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_CHACHA_S390=m
CONFIG_CRYPTO_GHASH_S390=m
CONFIG_CRYPTO_CRC32_S390=y
CONFIG_CRYPTO_DEV_VIRTIO=m
+2 −0
Original line number Diff line number Diff line
@@ -11,9 +11,11 @@ obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o
obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o
obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
obj-$(CONFIG_ARCH_RANDOM) += arch_random.o

crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
chacha_s390-y := chacha-glue.o chacha-s390.o
+100 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/*
 * s390 ChaCha stream cipher.
 *
 * Copyright IBM Corp. 2021
 */

#define KMSG_COMPONENT "chacha_s390"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <crypto/algapi.h>
#include <linux/cpufeature.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/fpu/api.h>
#include "chacha-s390.h"

/*
 * Process @nbytes bytes from @src into @dst with the vector routine
 * chacha20_vx(), then advance the block counter.
 *
 * @state:   not referenced by this helper
 * @dst:     output buffer
 * @src:     input buffer
 * @nbytes:  number of bytes to process
 * @key:     key words handed through to chacha20_vx()
 * @counter: counter words handed through to chacha20_vx(); the first word
 *           is incremented by the number of blocks consumed, with a partial
 *           trailing block counting as a whole block
 *
 * The call into the assembly is bracketed by kernel_fpu_begin() and
 * kernel_fpu_end() for the vector registers (KERNEL_VXR).
 */
static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
				unsigned int nbytes, const u32 *key,
				u32 *counter)
{
	struct kernel_fpu vxstate;
	unsigned int nblocks;

	/* Blocks consumed by this call, rounding a partial block up. */
	nblocks = round_up(nbytes, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;

	kernel_fpu_begin(&vxstate, KERNEL_VXR);
	chacha20_vx(dst, src, nbytes, key, counter);
	kernel_fpu_end(&vxstate, KERNEL_VXR);

	*counter += nblocks;
}

/*
 * skcipher handler registered for both .encrypt and .decrypt.
 *
 * Builds the ChaCha state from the transform key and the request IV, then
 * walks the request. Steps of at most CHACHA_BLOCK_SIZE bytes are handled by
 * chacha_crypt_generic(); larger steps go to the vector implementation.
 *
 * Returns the result of skcipher_walk_virt() if the walk yields no data,
 * otherwise the result of the last skcipher_walk_done() call.
 */
static int chacha20_s390(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	u32 state[CHACHA_STATE_WORDS] __aligned(16);
	struct skcipher_walk walk;
	unsigned int len;
	int err;

	err = skcipher_walk_virt(&walk, req, false);
	chacha_init_generic(state, ctx->key, req->iv);

	while (walk.nbytes != 0) {
		/* Every step except the final one is trimmed to the stride. */
		len = walk.nbytes;
		if (walk.total > len)
			len = round_down(len, walk.stride);

		if (len > CHACHA_BLOCK_SIZE) {
			/* Words 4.. are passed as key, words 12.. as counter. */
			chacha20_crypt_s390(state, walk.dst.virt.addr,
					    walk.src.virt.addr, len,
					    &state[4], &state[12]);
		} else {
			chacha_crypt_generic(state, walk.dst.virt.addr,
					     walk.src.virt.addr, len,
					     ctx->nrounds);
		}

		/* Hand back whatever this step left unprocessed. */
		err = skcipher_walk_done(&walk, walk.nbytes - len);
	}

	return err;
}

/*
 * Algorithms provided by this module: a single "chacha20" skcipher whose
 * encrypt and decrypt operations share one handler.
 */
static struct skcipher_alg chacha_algs[] = {
	{
		.base = {
			.cra_name	 = "chacha20",
			.cra_driver_name = "chacha20-s390",
			.cra_priority	 = 900,
			.cra_blocksize	 = 1,
			.cra_ctxsize	 = sizeof(struct chacha_ctx),
			.cra_module	 = THIS_MODULE,
		},

		.min_keysize	= CHACHA_KEY_SIZE,
		.max_keysize	= CHACHA_KEY_SIZE,
		.ivsize		= CHACHA_IV_SIZE,
		.chunksize	= CHACHA_BLOCK_SIZE,
		.setkey		= chacha20_setkey,
		.encrypt	= chacha20_s390,
		.decrypt	= chacha20_s390,
	}
};

/*
 * Module init: register every entry of chacha_algs with the crypto API and
 * return the registration result.
 */
static int __init chacha_mod_init(void)
{
	const int nalgs = ARRAY_SIZE(chacha_algs);

	return crypto_register_skciphers(chacha_algs, nalgs);
}

/* Module exit: unregister every entry of chacha_algs from the crypto API. */
static void __exit chacha_mod_fini(void)
{
	const int nalgs = ARRAY_SIZE(chacha_algs);

	crypto_unregister_skciphers(chacha_algs, nalgs);
}

/*
 * Module entry and exit hooks.
 * NOTE(review): module_cpu_feature_match() presumably runs chacha_mod_init()
 * only on CPUs that report the VXRS feature - confirm against
 * <linux/cpufeature.h>.
 */
module_cpu_feature_match(VXRS, chacha_mod_init);
module_exit(chacha_mod_fini);

MODULE_DESCRIPTION("ChaCha20 stream cipher");
MODULE_LICENSE("GPL v2");

/*
 * NOTE(review): presumably lets the module be loaded on demand when the
 * "chacha20" algorithm name is requested - confirm.
 */
MODULE_ALIAS_CRYPTO("chacha20");
+907 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Original implementation written by Andy Polyakov, @dot-asm.
 * This is an adaptation of the original code for kernel use.
 *
 * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/nospec-insn.h>
#include <asm/vx-insn.h>

#define SP	%r15
#define FRAME	(16 * 8 + 4 * 8)

.data
.align	32

/*
 * Constant table used by both routines. Offsets relative to .Lsigma, as
 * used by the loads in the code below:
 *   0x00  sigma row (ASCII "expand 32-byte k" as little-endian words)
 *   0x10  counter increment rows 1, 2 and 3 (first word only)
 *   0x40  VPERM mask, loaded into BEPERM (byte swap within each word)
 *   0x50  per-lane counter offsets 0..3, loaded into CTR by the 4x routine
 *   0x60  each sigma word replicated into all four words of a vector
 */
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
.long	1,0,0,0
.long	2,0,0,0
.long	3,0,0,0
.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap

.long	0,1,2,3
.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574

.previous

	GEN_BR_THUNK %r14

.text

#############################################################################
# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
#		      const u32 *key, const u32 *counter)

/* Argument registers, in the order of the prototype above. */
#define	OUT		%r2
#define	INP		%r3
#define	LEN		%r4
#define	KEY		%r5
#define	COUNTER		%r6

/* BEPERM: VPERM byte-swap mask; CTR: per-lane counter offsets 0..3. */
#define BEPERM		%v31
#define CTR		%v26

/* Input rows: K0 = sigma, K1/K2 = the two key rows, K3 = counter row. */
#define K0		%v16
#define K1		%v17
#define K2		%v18
#define K3		%v19

/*
 * Working state. During the rounds each register holds one state word
 * replicated per lane: XA0..XA3 come from the sigma row, XB0..XB3 from K1,
 * XC0..XC3 from K2 and XD0..XD3 from K3. After the transpose, register
 * index n holds the corresponding row of block n.
 */
#define XA0		%v0
#define XA1		%v1
#define XA2		%v2
#define XA3		%v3

#define XB0		%v4
#define XB1		%v5
#define XB2		%v6
#define XB3		%v7

#define XC0		%v8
#define XC1		%v9
#define XC2		%v10
#define XC3		%v11

#define XD0		%v12
#define XD1		%v13
#define XD2		%v14
#define XD3		%v15

/* Scratch registers: transpose temporaries and input data. */
#define XT0		%v27
#define XT1		%v28
#define XT2		%v29
#define XT3		%v30

/*
 * chacha20_vx_4x - XOR up to four 64-byte blocks of keystream into the data.
 *
 * The keystream for four consecutive blocks is computed in parallel, with
 * each vector register holding the same state word of all four blocks.
 * chacha20_vx branches here when LEN <= 256.
 *
 * The first 64 bytes are processed unconditionally and LEN is not tested
 * for zero afterwards, so LEN must be greater than 64 on entry. A trailing
 * partial block (LEN < 0x40) is handled byte by byte via a stack buffer.
 *
 * %r6 and %r7 are saved at 6*8(SP) on entry and restored before returning;
 * %r0, %r1 and %r5 are used as scratch.
 */
ENTRY(chacha20_vx_4x)
	stmg	%r6,%r7,6*8(SP)

	/* %r7 = constant table, %r0 = loop count for .Loop_4x */
	larl	%r7,.Lsigma
	lhi	%r0,10
	lhi	%r1,0

	VL	K0,0,,%r7		# load sigma
	VL	K1,0,,KEY		# load key
	VL	K2,16,,KEY
	VL	K3,0,,COUNTER		# load counter

	VL	BEPERM,0x40,,%r7
	VL	CTR,0x50,,%r7

	VLM	XA0,XA3,0x60,%r7,4	# load [smashed] sigma

	VREPF	XB0,K1,0		# smash the key
	VREPF	XB1,K1,1
	VREPF	XB2,K1,2
	VREPF	XB3,K1,3

	/* Replicate the counter row; lane n of XD0 gets counter word 0 + n. */
	VREPF	XD0,K3,0
	VREPF	XD1,K3,1
	VREPF	XD2,K3,2
	VREPF	XD3,K3,3
	VAF	XD0,XD0,CTR

	VREPF	XC0,K2,0
	VREPF	XC1,K2,1
	VREPF	XC2,K2,2
	VREPF	XC3,K2,3

	/*
	 * Each pass of the loop is one double round (add, xor, rotate by
	 * 16/12/8/7): first on the columns (XAn, XBn, XCn, XDn), then on the
	 * diagonals. %r0 counts ten passes.
	 */
.Loop_4x:
	/* column round */
	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,16

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,16

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,16

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,16

	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,12

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,12

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,12

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,12

	VAF	XA0,XA0,XB0
	VX	XD0,XD0,XA0
	VERLLF	XD0,XD0,8

	VAF	XA1,XA1,XB1
	VX	XD1,XD1,XA1
	VERLLF	XD1,XD1,8

	VAF	XA2,XA2,XB2
	VX	XD2,XD2,XA2
	VERLLF	XD2,XD2,8

	VAF	XA3,XA3,XB3
	VX	XD3,XD3,XA3
	VERLLF	XD3,XD3,8

	VAF	XC0,XC0,XD0
	VX	XB0,XB0,XC0
	VERLLF	XB0,XB0,7

	VAF	XC1,XC1,XD1
	VX	XB1,XB1,XC1
	VERLLF	XB1,XB1,7

	VAF	XC2,XC2,XD2
	VX	XB2,XB2,XC2
	VERLLF	XB2,XB2,7

	VAF	XC3,XC3,XD3
	VX	XB3,XB3,XC3
	VERLLF	XB3,XB3,7

	/* diagonal round: (XA0,XB1,XC2,XD3), (XA1,XB2,XC3,XD0), ... */
	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,16

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,16

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,16

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,16

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,12

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,12

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,12

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,12

	VAF	XA0,XA0,XB1
	VX	XD3,XD3,XA0
	VERLLF	XD3,XD3,8

	VAF	XA1,XA1,XB2
	VX	XD0,XD0,XA1
	VERLLF	XD0,XD0,8

	VAF	XA2,XA2,XB3
	VX	XD1,XD1,XA2
	VERLLF	XD1,XD1,8

	VAF	XA3,XA3,XB0
	VX	XD2,XD2,XA3
	VERLLF	XD2,XD2,8

	VAF	XC2,XC2,XD3
	VX	XB1,XB1,XC2
	VERLLF	XB1,XB1,7

	VAF	XC3,XC3,XD0
	VX	XB2,XB2,XC3
	VERLLF	XB2,XB2,7

	VAF	XC0,XC0,XD1
	VX	XB3,XB3,XC0
	VERLLF	XB3,XB3,7

	VAF	XC1,XC1,XD2
	VX	XB0,XB0,XC1
	VERLLF	XB0,XB0,7
	brct	%r0,.Loop_4x

	/*
	 * Add the per-lane offsets 0..3 to the counter word here; the counter
	 * row itself (K3) is added per block after the transpose.
	 */
	VAF	XD0,XD0,CTR

	/* After the transpose, index n of each group is the row of block n. */
	VMRHF	XT0,XA0,XA1		# transpose data
	VMRHF	XT1,XA2,XA3
	VMRLF	XT2,XA0,XA1
	VMRLF	XT3,XA2,XA3
	VPDI	XA0,XT0,XT1,0b0000
	VPDI	XA1,XT0,XT1,0b0101
	VPDI	XA2,XT2,XT3,0b0000
	VPDI	XA3,XT2,XT3,0b0101

	VMRHF	XT0,XB0,XB1
	VMRHF	XT1,XB2,XB3
	VMRLF	XT2,XB0,XB1
	VMRLF	XT3,XB2,XB3
	VPDI	XB0,XT0,XT1,0b0000
	VPDI	XB1,XT0,XT1,0b0101
	VPDI	XB2,XT2,XT3,0b0000
	VPDI	XB3,XT2,XT3,0b0101

	VMRHF	XT0,XC0,XC1
	VMRHF	XT1,XC2,XC3
	VMRLF	XT2,XC0,XC1
	VMRLF	XT3,XC2,XC3
	VPDI	XC0,XT0,XT1,0b0000
	VPDI	XC1,XT0,XT1,0b0101
	VPDI	XC2,XT2,XT3,0b0000
	VPDI	XC3,XT2,XT3,0b0101

	VMRHF	XT0,XD0,XD1
	VMRHF	XT1,XD2,XD3
	VMRLF	XT2,XD0,XD1
	VMRLF	XT3,XD2,XD3
	VPDI	XD0,XT0,XT1,0b0000
	VPDI	XD1,XT0,XT1,0b0101
	VPDI	XD2,XT2,XT3,0b0000
	VPDI	XD3,XT2,XT3,0b0101

	/* Block 0: add the input rows, byte-swap each word, XOR 64 bytes. */
	VAF	XA0,XA0,K0
	VAF	XB0,XB0,K1
	VAF	XC0,XC0,K2
	VAF	XD0,XD0,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	/* No zero test here: more data is expected to follow block 0. */
	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40

	/* Block 1: keystream goes to XA0..XD0 so .Ltail_4x can pick it up. */
	VAF	XA0,XA1,K0
	VAF	XB0,XB1,K1
	VAF	XC0,XC1,K2
	VAF	XD0,XD1,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	/* Block 2 */
	VAF	XA0,XA2,K0
	VAF	XB0,XB2,K1
	VAF	XC0,XC2,K2
	VAF	XD0,XD2,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_4x

	/* Block 3 */
	VAF	XA0,XA3,K0
	VAF	XB0,XB3,K1
	VAF	XC0,XC3,K2
	VAF	XD0,XD3,K3

	VPERM	XA0,XA0,XA0,BEPERM
	VPERM	XB0,XB0,XB0,BEPERM
	VPERM	XC0,XC0,XC0,BEPERM
	VPERM	XD0,XD0,XD0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_4x

	VLM	XT0,XT3,0,INP,0

	VX	XT0,XT0,XA0
	VX	XT1,XT1,XB0
	VX	XT2,XT2,XC0
	VX	XT3,XT3,XD0

	VSTM	XT0,XT3,0,OUT,0

.Ldone_4x:
	lmg	%r6,%r7,6*8(SP)
	BR_EX	%r14

	/*
	 * Partial block: spill the current keystream block (XA0, XB0, XC0,
	 * XD0) to 8*8(SP) and XOR the remaining LEN bytes one at a time.
	 * LEN is below 0x40 here and must be non-zero.
	 */
.Ltail_4x:
	VLR	XT0,XC0
	VLR	XT1,XD0

	VST	XA0,8*8+0x00,,SP
	VST	XB0,8*8+0x10,,SP
	VST	XT0,8*8+0x20,,SP
	VST	XT1,8*8+0x30,,SP

	lghi	%r1,0

	/* %r1 = byte index, %r5 = input byte, %r6 = keystream byte */
.Loop_tail_4x:
	llgc	%r5,0(%r1,INP)
	llgc	%r6,8*8(%r1,SP)
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_4x

	lmg	%r6,%r7,6*8(SP)
	BR_EX	%r14
ENDPROC(chacha20_vx_4x)

#undef	OUT
#undef	INP
#undef	LEN
#undef	KEY
#undef	COUNTER

#undef BEPERM

#undef K0
#undef K1
#undef K2
#undef K3


#############################################################################
# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
#		   const u32 *key, const u32 *counter)

/* Argument registers, in the order of the prototype above. */
#define	OUT		%r2
#define	INP		%r3
#define	LEN		%r4
#define	KEY		%r5
#define	COUNTER		%r6

/* VPERM byte-swap mask. */
#define BEPERM		%v31

/*
 * Input rows: K0 = sigma, K1/K2 = the two key rows, K3 = counter row.
 * K0 shares %v27 with T0 below.
 */
#define K0		%v27
#define K1		%v24
#define K2		%v25
#define K3		%v26

/* An..Dn: the four 16-byte state rows of block n, for six blocks. */
#define A0		%v0
#define B0		%v1
#define C0		%v2
#define D0		%v3

#define A1		%v4
#define B1		%v5
#define C1		%v6
#define D1		%v7

#define A2		%v8
#define B2		%v9
#define C2		%v10
#define D2		%v11

#define A3		%v12
#define B3		%v13
#define C3		%v14
#define D3		%v15

#define A4		%v16
#define B4		%v17
#define C4		%v18
#define D4		%v19

#define A5		%v20
#define B5		%v21
#define C5		%v22
#define D5		%v23

/*
 * T0..T3 are loaded together with K0 and BEPERM from .Lsigma (T1..T3 get
 * the increment rows 1, 2 and 3) and are also reused as scratch.
 */
#define T0		%v27
#define T1		%v28
#define T2		%v29
#define T3		%v30

/*
 * chacha20_vx - XOR LEN bytes of keystream into the data, six blocks at a
 * time.
 *
 * Requests of at most 256 bytes are passed on to chacha20_vx_4x with a plain
 * branch, so that routine returns directly to our caller. Longer requests
 * run the outer loop below: each pass computes six 64-byte blocks with the
 * counters K3+0 .. K3+5 and consumes up to 0x180 bytes. A trailing partial
 * block (LEN < 0x40) is handled byte by byte via a stack buffer.
 *
 * %r6 and %r7 are saved in the caller frame at 6*8(SP) and a new frame of
 * FRAME bytes is allocated; both are undone before returning. %r0, %r1 and
 * %r5 are used as scratch.
 */
ENTRY(chacha20_vx)
	.insn	rilu,0xc20e00000000,LEN,256	# clgfi LEN,256
	jle	chacha20_vx_4x
	stmg	%r6,%r7,6*8(SP)

	/* Allocate FRAME bytes on the stack and store the old SP at 0(SP). */
	lghi	%r1,-FRAME
	lgr	%r0,SP
	la	SP,0(%r1,SP)
	stg	%r0,0(SP)		# back-chain

	/* %r7 = constant table, %r0 = loop count for .Loop_vx */
	larl	%r7,.Lsigma
	lhi	%r0,10

	VLM	K1,K2,0,KEY,0		# load key
	VL	K3,0,,COUNTER		# load counter

	VLM	K0,BEPERM,0,%r7,4	# load sigma, increments, ...

	/* Set up six copies of the state that differ only in the counter. */
.Loop_outer_vx:
	VLR	A0,K0
	VLR	B0,K1
	VLR	A1,K0
	VLR	B1,K1
	VLR	A2,K0
	VLR	B2,K1
	VLR	A3,K0
	VLR	B3,K1
	VLR	A4,K0
	VLR	B4,K1
	VLR	A5,K0
	VLR	B5,K1

	VLR	D0,K3
	VAF	D1,K3,T1		# K[3]+1
	VAF	D2,K3,T2		# K[3]+2
	VAF	D3,K3,T3		# K[3]+3
	VAF	D4,D2,T2		# K[3]+4
	VAF	D5,D2,T3		# K[3]+5

	VLR	C0,K2
	VLR	C1,K2
	VLR	C2,K2
	VLR	C3,K2
	VLR	C4,K2
	VLR	C5,K2

	/*
	 * Keep the initial counter rows of blocks 1..3 in T1..T3; they are
	 * added back to D1..D3 after the rounds.
	 */
	VLR	T1,D1
	VLR	T2,D2
	VLR	T3,D3

	/*
	 * Each pass of the loop is one double round on all six blocks:
	 * quarter-round steps on the rows, VSLDB rotation of the B, C and D
	 * rows by 4, 8 and 12 bytes, the same steps again, then the inverse
	 * rotation (12, 8 and 4 bytes). %r0 counts ten passes.
	 */
.Loop_vx:
	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7

	/* Rotate rows B, C and D by 4, 8 and 12 bytes. */
	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,4
	VSLDB	B1,B1,B1,4
	VSLDB	B2,B2,B2,4
	VSLDB	B3,B3,B3,4
	VSLDB	B4,B4,B4,4
	VSLDB	B5,B5,B5,4
	VSLDB	D0,D0,D0,12
	VSLDB	D1,D1,D1,12
	VSLDB	D2,D2,D2,12
	VSLDB	D3,D3,D3,12
	VSLDB	D4,D4,D4,12
	VSLDB	D5,D5,D5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,16
	VERLLF	D1,D1,16
	VERLLF	D2,D2,16
	VERLLF	D3,D3,16
	VERLLF	D4,D4,16
	VERLLF	D5,D5,16

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,12
	VERLLF	B1,B1,12
	VERLLF	B2,B2,12
	VERLLF	B3,B3,12
	VERLLF	B4,B4,12
	VERLLF	B5,B5,12

	VAF	A0,A0,B0
	VAF	A1,A1,B1
	VAF	A2,A2,B2
	VAF	A3,A3,B3
	VAF	A4,A4,B4
	VAF	A5,A5,B5
	VX	D0,D0,A0
	VX	D1,D1,A1
	VX	D2,D2,A2
	VX	D3,D3,A3
	VX	D4,D4,A4
	VX	D5,D5,A5
	VERLLF	D0,D0,8
	VERLLF	D1,D1,8
	VERLLF	D2,D2,8
	VERLLF	D3,D3,8
	VERLLF	D4,D4,8
	VERLLF	D5,D5,8

	VAF	C0,C0,D0
	VAF	C1,C1,D1
	VAF	C2,C2,D2
	VAF	C3,C3,D3
	VAF	C4,C4,D4
	VAF	C5,C5,D5
	VX	B0,B0,C0
	VX	B1,B1,C1
	VX	B2,B2,C2
	VX	B3,B3,C3
	VX	B4,B4,C4
	VX	B5,B5,C5
	VERLLF	B0,B0,7
	VERLLF	B1,B1,7
	VERLLF	B2,B2,7
	VERLLF	B3,B3,7
	VERLLF	B4,B4,7
	VERLLF	B5,B5,7

	/* Undo the row rotation: B by 12, C by 8 and D by 4 bytes. */
	VSLDB	C0,C0,C0,8
	VSLDB	C1,C1,C1,8
	VSLDB	C2,C2,C2,8
	VSLDB	C3,C3,C3,8
	VSLDB	C4,C4,C4,8
	VSLDB	C5,C5,C5,8
	VSLDB	B0,B0,B0,12
	VSLDB	B1,B1,B1,12
	VSLDB	B2,B2,B2,12
	VSLDB	B3,B3,B3,12
	VSLDB	B4,B4,B4,12
	VSLDB	B5,B5,B5,12
	VSLDB	D0,D0,D0,4
	VSLDB	D1,D1,D1,4
	VSLDB	D2,D2,D2,4
	VSLDB	D3,D3,D3,4
	VSLDB	D4,D4,D4,4
	VSLDB	D5,D5,D5,4
	brct	%r0,.Loop_vx

	/*
	 * Output phase. For each block in turn: add the initial state rows,
	 * byte-swap each word into A0..D0, then either XOR a full 64 bytes
	 * or branch to .Ltail_vx with the keystream block in A0..D0.
	 */
	VAF	A0,A0,K0
	VAF	B0,B0,K1
	VAF	C0,C0,K2
	VAF	D0,D0,K3
	VAF	A1,A1,K0
	VAF	D1,D1,T1		# +K[3]+1

	VPERM	A0,A0,A0,BEPERM
	VPERM	B0,B0,B0,BEPERM
	VPERM	C0,C0,C0,BEPERM
	VPERM	D0,D0,D0,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	/* T2 and T3 must be consumed before T0..T3 take the input data. */
	VAF	D2,D2,T2		# +K[3]+2
	VAF	D3,D3,T3		# +K[3]+3
	VLM	T0,T3,0,INP,0

	VX	A0,A0,T0
	VX	B0,B0,T1
	VX	C0,C0,T2
	VX	D0,D0,T3

	VLM	K0,T3,0,%r7,4		# re-load sigma and increments

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	/* Block 1 (A1 and D1 were already finished above) */
	VAF	B1,B1,K1
	VAF	C1,C1,K2

	VPERM	A0,A1,A1,BEPERM
	VPERM	B0,B1,B1,BEPERM
	VPERM	C0,C1,C1,BEPERM
	VPERM	D0,D1,D1,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	/* From here on A1..D1 serve as the input data buffer. */
	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	/* Block 2 (D2 was already finished above) */
	VAF	A2,A2,K0
	VAF	B2,B2,K1
	VAF	C2,C2,K2

	VPERM	A0,A2,A2,BEPERM
	VPERM	B0,B2,B2,BEPERM
	VPERM	C0,C2,C2,BEPERM
	VPERM	D0,D2,D2,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	/* Block 3; D2 is free now and is reused to build the counter values */
	VAF	A3,A3,K0
	VAF	B3,B3,K1
	VAF	C3,C3,K2
	VAF	D2,K3,T3		# K[3]+3

	VPERM	A0,A3,A3,BEPERM
	VPERM	B0,B3,B3,BEPERM
	VPERM	C0,C3,C3,BEPERM
	VPERM	D0,D3,D3,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	/* D3 has been copied to D0, so it can hold the next counter value. */
	VAF	D3,D2,T1		# K[3]+4
	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	/* Block 4; K3 is also advanced by 6 for the next outer pass */
	VAF	A4,A4,K0
	VAF	B4,B4,K1
	VAF	C4,C4,K2
	VAF	D4,D4,D3		# +K[3]+4
	VAF	D3,D3,T1		# K[3]+5
	VAF	K3,D2,T3		# K[3]+=6

	VPERM	A0,A4,A4,BEPERM
	VPERM	B0,B4,B4,BEPERM
	VPERM	C0,C4,C4,BEPERM
	VPERM	D0,D4,D4,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	aghi	LEN,-0x40
	je	.Ldone_vx

	/* Block 5 */
	VAF	A5,A5,K0
	VAF	B5,B5,K1
	VAF	C5,C5,K2
	VAF	D5,D5,D3		# +K[3]+5

	VPERM	A0,A5,A5,BEPERM
	VPERM	B0,B5,B5,BEPERM
	VPERM	C0,C5,C5,BEPERM
	VPERM	D0,D5,D5,BEPERM

	.insn	rilu,0xc20e00000000,LEN,0x40	# clgfi LEN,0x40
	jl	.Ltail_vx

	VLM	A1,D1,0,INP,0

	VX	A0,A0,A1
	VX	B0,B0,B1
	VX	C0,C0,C1
	VX	D0,D0,D1

	VSTM	A0,D0,0,OUT,0

	/* Reset the loop count and start another pass if data remains. */
	la	INP,0x40(INP)
	la	OUT,0x40(OUT)
	lhi	%r0,10
	aghi	LEN,-0x40
	jne	.Loop_outer_vx

	/* Restore %r6/%r7 from the caller frame and release our frame. */
.Ldone_vx:
	lmg	%r6,%r7,FRAME+6*8(SP)
	la	SP,FRAME(SP)
	BR_EX	%r14

	/*
	 * Partial block: spill the keystream block in A0..D0 to 8*8(SP) in
	 * our frame and XOR the remaining LEN bytes one at a time.
	 * LEN is below 0x40 here and must be non-zero.
	 */
.Ltail_vx:
	VSTM	A0,D0,8*8,SP,3
	lghi	%r1,0

	/* %r1 = byte index, %r5 = input byte, %r6 = keystream byte */
.Loop_tail_vx:
	llgc	%r5,0(%r1,INP)
	llgc	%r6,8*8(%r1,SP)
	xr	%r6,%r5
	stc	%r6,0(%r1,OUT)
	la	%r1,1(%r1)
	brct	LEN,.Loop_tail_vx

	lmg	%r6,%r7,FRAME+6*8(SP)
	la	SP,FRAME(SP)
	BR_EX	%r14
ENDPROC(chacha20_vx)

.previous
Loading