@@ -445,6 +445,12 @@ M: Declan Doherty <declan.doherty@intel.com>
F: drivers/crypto/openssl/
F: doc/guides/cryptodevs/openssl.rst
+ARMv8 Crypto PMD
+M: Zbigniew Bodek <zbigniew.bodek@caviumnetworks.com>
+M: Jerin Jacob <jerin.jacob@caviumnetworks.com>
+F: drivers/crypto/armv8/
+F: doc/guides/cryptodevs/armv8.rst
+
Null Crypto PMD
M: Declan Doherty <declan.doherty@intel.com>
F: drivers/crypto/null/
@@ -406,6 +406,12 @@ CONFIG_RTE_LIBRTE_PMD_ZUC=n
CONFIG_RTE_LIBRTE_PMD_ZUC_DEBUG=n
#
+# Compile PMD for ARMv8 Crypto device
+#
+CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO=n
+CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO_DEBUG=n
+
+#
# Compile PMD for NULL Crypto device
#
CONFIG_RTE_LIBRTE_PMD_NULL_CRYPTO=y
@@ -47,3 +47,5 @@ CONFIG_RTE_EAL_IGB_UIO=n
CONFIG_RTE_LIBRTE_FM10K_PMD=n
CONFIG_RTE_SCHED_VECTOR=n
+
+CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO=y
new file mode 100644
@@ -0,0 +1,82 @@
+.. BSD LICENSE
+ Copyright (C) Cavium networks Ltd. 2016.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Cavium networks nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ARMv8 Crypto Poll Mode Driver
+================================
+
+This code provides the initial implementation of the ARMv8 crypto PMD.
+The driver uses the ARMv8 cryptographic extensions to process chained crypto
+operations (cipher plus hash) in an optimized way. The core functionality is
+provided by low-level assembly routines, one for each supported cipher and
+hash combination.
+
+Features
+--------
+
+ARMv8 Crypto PMD has support for the following algorithm pairs:
+
+Supported cipher algorithms:
+
+* ``RTE_CRYPTO_CIPHER_AES_CBC``
+
+Supported authentication algorithms:
+
+* ``RTE_CRYPTO_AUTH_SHA1``
+* ``RTE_CRYPTO_AUTH_SHA256``
+* ``RTE_CRYPTO_AUTH_SHA1_HMAC``
+* ``RTE_CRYPTO_AUTH_SHA256_HMAC``
+
+Installation
+------------
+
+To compile the ARMv8 Crypto PMD, CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO has to be
+enabled in the config/common_base file. No additional external packages need
+to be installed.
+The corresponding device can be created only if the following features
+are supported by the CPU (a minimal runtime check is sketched after the list):
+
+* ``RTE_CPUFLAG_AES``
+* ``RTE_CPUFLAG_SHA1``
+* ``RTE_CPUFLAG_SHA2``
+* ``RTE_CPUFLAG_NEON``
+
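+A minimal sketch of the equivalent runtime check, using the standard
+``rte_cpu_get_flag_enabled()`` API (the helper name below is hypothetical and
+not part of the PMD), could look as follows::
+
+    #include <rte_cpuflags.h>
+
+    /* Return non-zero when all CPU features required by the PMD are present. */
+    static int
+    armv8_crypto_cpu_supported(void)
+    {
+        return rte_cpu_get_flag_enabled(RTE_CPUFLAG_AES) &&
+            rte_cpu_get_flag_enabled(RTE_CPUFLAG_SHA1) &&
+            rte_cpu_get_flag_enabled(RTE_CPUFLAG_SHA2) &&
+            rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON);
+    }
+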
+Initialization
+--------------
+
+The app/test application can be used to check how to use this PMD and to
+verify crypto processing.
+
+The functional test name is cryptodev_sw_armv8_autotest.
+For performance testing, cryptodev_sw_armv8_perftest can be used.
+
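+A chained (cipher + authentication) session for this PMD is configured through
+the generic cryptodev symmetric transform structures. Below is a minimal,
+hedged sketch of such a chain; the exact field layout may differ between DPDK
+releases, and the key buffers and sizes are only illustrative::
+
+    #include <stdint.h>
+    #include <rte_crypto.h>
+
+    static uint8_t cipher_key[16];      /* AES-128 key (illustrative) */
+    static uint8_t auth_key[64];        /* HMAC key (illustrative) */
+
+    static struct rte_crypto_sym_xform auth_xform = {
+        .next = NULL,
+        .type = RTE_CRYPTO_SYM_XFORM_AUTH,
+        .auth = {
+            .op = RTE_CRYPTO_AUTH_OP_GENERATE,
+            .algo = RTE_CRYPTO_AUTH_SHA1_HMAC,
+            .key = { .data = auth_key, .length = sizeof(auth_key) },
+            .digest_length = 20,
+        },
+    };
+
+    static struct rte_crypto_sym_xform cipher_xform = {
+        .next = &auth_xform,            /* cipher first, then auth */
+        .type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+        .cipher = {
+            .op = RTE_CRYPTO_CIPHER_OP_ENCRYPT,
+            .algo = RTE_CRYPTO_CIPHER_AES_CBC,
+            .key = { .data = cipher_key, .length = sizeof(cipher_key) },
+        },
+    };
+
+Listing the cipher transform first and chaining the authentication transform
+after it matches the encrypt-then-generate direction of the chained operations
+this PMD accelerates.
+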
+Limitations
+-----------
+
+* Maximum number of sessions is 2048.
+* Only chained operations are supported.
+* AES-128-CBC is the only supported cipher variant.
+* Input data has to be a multiple of 16 bytes.
@@ -38,6 +38,7 @@ Crypto Device Drivers
overview
aesni_mb
aesni_gcm
+ armv8
kasumi
openssl
null
@@ -38,6 +38,11 @@ New Features
Also, make sure to start the actual text at the margin.
=========================================================
+* **Added ARMv8 crypto PMD.**
+
+  A new crypto PMD has been added, which provides combined mode cryptographic
+  operations optimized for ARMv8 processors. The driver can be used to enhance
+  performance in processing chained operations such as cipher + HMAC.
Resolved Issues
---------------
@@ -33,6 +33,9 @@ include $(RTE_SDK)/mk/rte.vars.mk
DIRS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_GCM) += aesni_gcm
DIRS-$(CONFIG_RTE_LIBRTE_PMD_AESNI_MB) += aesni_mb
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += armv8
+endif
DIRS-$(CONFIG_RTE_LIBRTE_PMD_OPENSSL) += openssl
DIRS-$(CONFIG_RTE_LIBRTE_PMD_QAT) += qat
DIRS-$(CONFIG_RTE_LIBRTE_PMD_SNOW3G) += snow3g
new file mode 100644
@@ -0,0 +1,84 @@
+#
+# BSD LICENSE
+#
+# Copyright (C) Cavium networks Ltd. 2016.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Cavium networks nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_pmd_armv8.a
+
+# build flags
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -I$(SRCDIR)/asm/include
+
+# library version
+LIBABIVER := 1
+
+# versioning export map
+EXPORT_MAP := rte_armv8_pmd_version.map
+
+VPATH += $(SRCDIR)/asm
+
+# library source files
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += rte_armv8_pmd.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += rte_armv8_pmd_ops.c
+# library asm files
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += aes_core.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += sha1_core.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += sha256_core.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += aes128cbc_sha1_hmac.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += aes128cbc_sha256.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += aes128cbc_sha256_hmac.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += sha1_hmac_aes128cbc_dec.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += sha256_aes128cbc_dec.S
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += sha256_hmac_aes128cbc_dec.S
+
+# library dependencies
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_eal
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_mbuf
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_mempool
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_ring
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += lib/librte_cryptodev
+
+# runtime generated assembly symbols
+all: clean assym.s
+
+assym.s: genassym.c
+ @$(CC) $(CFLAGS) -O0 -S $< -o - | \
+ awk '($$1 == "<genassym>") { print "#define " $$2 "\t" $$3 }' > \
+ $(SRCDIR)/asm/$@
+
+.PHONY: clean
+clean:
+ @rm -f $(SRCDIR)/asm/assym.s
+
+include $(RTE_SDK)/mk/rte.lib.mk
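
The assym.s rule above relies on genassym.c emitting marker lines into the
compiler's -S output: every line whose first field is the literal token
"<genassym>" is turned into a "#define <name> <value>" by the awk filter and
written to asm/assym.s. A hedged sketch of such a generator follows; the
GEN_SYM macro, the structure layout and the "%c" operand modifier are
assumptions for illustration only, the real definitions live in the driver
sources:

    /* genassym.c sketch: emit structure offsets consumed by the .S files. */
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for the driver's crypto_arg_t layout. */
    typedef struct {
        struct { uint8_t *key; uint8_t *iv; } cipher;
        struct {
            struct { uint8_t *i_key_pad; uint8_t *o_key_pad; } hmac;
        } digest;
    } crypto_arg_t;

    /* "%c0" prints the constant operand without the immediate '#' prefix. */
    #define GEN_SYM(name, value) \
        __asm__ __volatile__("\n<genassym> " #name " %c0" : : "i" (value))

    void gen_offsets(void);

    void
    gen_offsets(void)
    {
        GEN_SYM(CIPHER_KEY, offsetof(crypto_arg_t, cipher.key));
        GEN_SYM(CIPHER_IV, offsetof(crypto_arg_t, cipher.iv));
        GEN_SYM(HMAC_IKEYPAD, offsetof(crypto_arg_t, digest.hmac.i_key_pad));
        GEN_SYM(HMAC_OKEYPAD, offsetof(crypto_arg_t, digest.hmac.o_key_pad));
    }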
new file mode 100644
@@ -0,0 +1,1678 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Enc/Auth Primitive = aes128cbc/sha1_hmac
+ *
+ * Operations:
+ *
+ * out = encrypt-AES128CBC(in)
+ * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | out))
+ *
+ * Prototype:
+ * void aes128cbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * aes128cbc_sha1_hmac(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 -- temp register for SHA1
+ * v20 -- ABCD copy (q20)
+ * v21 -- sha working state (q21)
+ * v22 -- sha working state (q22)
+ * v23 -- temp register for SHA1
+ * v24 -- sha state ABCD
+ * v25 -- sha state E
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16; otherwise the results are
+ * undefined. For partial AES blocks the caller is required to pad the input
+ * so that its length is a multiple of 16.
+ *
+ * Short lengths (fewer than 12 AES blocks) take a separate, less optimized path.
+ */
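+
+/*
+ * Informal C-style reference of the computation above (a hedged sketch; the
+ * helper names are illustrative only, they are not part of the driver). The
+ * i_key_pad/o_key_pad pointers hold SHA1 states taken after hashing one
+ * 64-byte block of (key ^ ipad) and (key ^ opad) respectively, which is why
+ * an extra 64-byte block is added to the hashed length in the padding code:
+ *
+ *	aes128cbc_encrypt(csrc, cdst, len, arg->cipher.key, arg->cipher.iv);
+ *	st = *arg->digest.hmac.i_key_pad;       // state after (key ^ ipad) block
+ *	sha1_resume(&st, cdst, len, 64 + len);  // data, data len, total hashed len
+ *	store_digest(inner, &st);               // 20-byte inner digest
+ *	st = *arg->digest.hmac.o_key_pad;       // state after (key ^ opad) block
+ *	sha1_resume(&st, inner, 20, 64 + 20);
+ *	store_digest(ddst, &st);                // HMAC-SHA1 tag written to ddst
+ */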
+
+ .file "aes128cbc_sha1_hmac.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global aes128cbc_sha1_hmac
+ .type aes128cbc_sha1_hmac,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+aes128cbc_sha1_hmac:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ ld1 {v24.4s, v25.4s},[x6] /* init ABCD, EFGH. (2 cycs) */
+ ldr x6, [x5, #HMAC_OKEYPAD] /* save pointer to o_key_pad partial hash */
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,12 /* no main loop if <12 */
+ b.lt .Lshort_cases /* branch if < 12 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+/* proceed */
+ ld1 {v3.16b},[x5] /* get 1st ivec */
+ ld1 {v0.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+ mov x11,x4 /* len -> x11 needed at end */
+ lsr x12,x11,6 /* total_blocks */
+
+/*
+ * now we can do the loop prolog, 1st aes sequence of 4 blocks
+ */
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ aese v0.16b,v9.16b
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ adr x8,.Lrcon /* base address for sha round consts */
+ aesmc v0.16b,v0.16b
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ aese v0.16b,v10.16b
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ aese v0.16b,v16.16b
+ mov x4,x1 /* sha_ptr_in = aes_ptr_out */
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ prfm PLDL1KEEP,[x8,0*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ prfm PLDL1KEEP,[x8,2*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ prfm PLDL1KEEP,[x8,4*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ prfm PLDL1KEEP,[x8,6*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ prfm PLDL1KEEP,[x8,8*64] /* rcon */
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+
+ eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ mov x2,x0 /* lead_ptr = aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ prfm PLDL1KEEP,[x8,10*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ prfm PLDL1KEEP,[x8,12*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ prfm PLDL1KEEP,[x8,14*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+
+ eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ sub x7,x12,1 /* main_blocks = total_blocks - 1 */
+ and x13,x10,3 /* aes_blocks_left */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+
+/* Note, aes_blocks_left := number after the main (sha) block is done. Can be 0 */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+/*
+ * main combined loop CBC
+ */
+.Lmain_loop:
+/*
+ * Because mov, rev32 and eor each have a busy cycle, this takes longer than
+ * it looks. That's OK since there are 6 cycles before we can use the load
+ * anyway; so this goes as fast as it can without SW pipelining (too
+ * complicated given the code size).
+ */
+ rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */
+ ld1 {v0.16b},[x0],16 /* next aes block, update aes_ptr_in */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ aese v0.16b,v8.16b
+ rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */
+ aesmc v0.16b,v0.16b
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aese v0.16b,v9.16b
+ add v19.4s,v4.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v0.16b,v10.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ add v23.4s,v4.4s,v27.4s
+/* no place to get rid of this stall */
+ rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */
+ aesmc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aese v0.16b,v12.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aese v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v0.16b,v15.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aese v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ sha1h s21,s24
+ aese v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/ prev value */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aese v1.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v10.16b
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ sha1h s22,s24
+ aese v1.16b,v12.16b
+ sha1p q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aese v1.16b,v13.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v14.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v1.16b,v15.16b
+ sha1h s22,s24
+ add v23.4s,v5.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v16.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v1.16b,v17.16b
+ sha1h s21,s24
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ sha1p q24,s22,v23.4s
+ add v23.4s,v6.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+/* mode op 2 */
+ eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+ aese v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v2.16b,v2.16b
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v2.16b,v9.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ aesmc v2.16b,v2.16b
+
+ aese v2.16b,v11.16b
+ add v19.4s,v6.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aese v2.16b,v12.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aese v2.16b,v13.16b
+ sha1su1 v29.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aese v2.16b,v14.16b
+ add v23.4s,v6.4s,v27.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v2.16b,v15.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v16.16b
+ add v19.4s,v6.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ sha1su1 v26.4s,v29.4s
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+
+/* mode op 3 */
+ eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
+
+/* aes xform 3, sha quad 3 */
+ aese v3.16b,v8.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aese v3.16b,v9.16b
+ sha1h s21,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aese v3.16b,v10.16b
+ sha1su1 v29.4s,v28.4s
+ aesmc v3.16b,v3.16b
+ add v19.4s,v7.4s,v26.4s
+ aese v3.16b,v11.16b
+ sha1h s22,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aese v3.16b,v13.16b
+ sha1h s21,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aese v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesmc v3.16b,v3.16b
+ add v19.4s,v7.4s,v28.4s
+ aese v3.16b,v15.16b
+ sha1h s22,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ add v23.4s,v7.4s,v29.4s
+ aese v3.16b,v17.16b
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbnz x7,.Lmain_loop /* loop if more to do */
+
+
+/*
+ * epilog, process remaining aes blocks and b-2 sha block
+ * do this inline (no loop) to overlap with the sha part
+ * note there are 0-3 aes blocks left.
+ */
+
+ rev32 v26.16b,v0.16b /* fix endian w0 */
+ rev32 v27.16b,v1.16b /* fix endian w1 */
+ rev32 v28.16b,v2.16b /* fix endian w2 */
+ rev32 v29.16b,v3.16b /* fix endian w3 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ cbz x13, .Lbm2fromQ0 /* skip if none left */
+ subs x14,x13,1 /* local copy of aes_blocks_left */
+
+/* mode op 0 */
+ ld1 {v0.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ add v19.4s,v4.4s,v26.4s
+ aese v0.16b,v8.16b
+ add v23.4s,v4.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v0.16b,v9.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aese v0.16b,v10.16b
+ sha1su1 v26.4s,v29.4s
+ add v19.4s,v4.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ sha1h s21,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aese v0.16b,v12.16b
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v4.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aese v0.16b,v14.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ sha1h s21,s24
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ sha1c q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ beq .Lbm2fromQ1 /* if aes_blocks_left_count == 0 */
+
+/* mode op 1 */
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1, sha quad 1 */
+ add v23.4s,v5.4s,v27.4s
+ aese v1.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v1.16b,v9.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v10.16b
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v5.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ subs x14,x14,1 /* dec counter */
+ aese v1.16b,v11.16b
+ sha1h s22,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v12.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v5.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v14.16b
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ sha1h s22,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v16.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ beq .Lbm2fromQ2 /* if aes_blocks_left_count == 0 */
+
+/* mode op 2 */
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+ add v19.4s,v6.4s,v28.4s
+ aese v2.16b,v8.16b
+ add v23.4s,v6.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v2.16b,v9.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aese v2.16b,v12.16b
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v14.16b
+ sha1su1 v26.4s,v29.4s
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ sha1m q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ b .Lbm2fromQ3 /* join common code at Quad 3 */
+
+/*
+ * now there is the b-2 sha block before the final one. Execution takes over
+ * in the appropriate part of this depending on how many aes blocks were left.
+ * If there were none, the whole thing is executed.
+ */
+.Lbm2fromQ0:
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+.Lbm2fromQ1:
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+.Lbm2fromQ2:
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+.Lbm2fromQ3:
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+/*
+ * now we can do the final block, either all padding or 1-3 aes blocks
+ * len in x11, aes_blocks_left in x13. should move the aes data setup of this
+ * to the last aes bit.
+ */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ mov w15,0x80 /* that's the 1 of the pad */
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x9,x11,0xffffffff /* len_lo */
+ mov v26.b[0],w15 /* assume block 0 is dst */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x9,x9,3 /* len_lo in bits */
+ eor v29.16b,v29.16b,v29.16b /* zero reg */
+/*
+ * places the 0x80 in the correct block, copies the appropriate data
+ */
+ cbz x13,.Lpad100 /* no data to get */
+ mov v26.16b,v0.16b
+ sub x14,x13,1 /* dec amount left */
+ mov v27.b[0],w15 /* assume block 1 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v27.16b,v1.16b
+ sub x14,x14,1 /* dec amount left */
+ mov v28.b[0],w15 /* assume block 2 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v28.16b,v2.16b
+ mov v29.b[3],w15 /* block 3, doesn't get rev'd */
+/*
+ * get the len_hi, len_lo in bits according to
+ * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12)
+ * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9)
+ * this is done before the if/else above
+ */
+.Lpad100:
+ mov v29.s[3],w9 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+/*
+ * note that q29 is already built in the correct format, so no swap required
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+/*
+ * do last sha of pad block
+ */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v26.4s,v24.4s,v20.4s
+ add v27.4s,v25.4s,v21.4s
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ ld1 {v24.16b,v25.16b}, [x6] /* load o_key_pad partial hash */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ mov v29.s[3], w11 /* move length to the end of the block */
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v3.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ mov w15,0x80 /* sha padding word */
+
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+ mov x9,x8 /* top of rcon */
+
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+/*
+ * the idea in the short loop (at least 1 aes block) is to break out with the
+ * padding already in place, except for the final word.
+ */
+.Lshort_loop:
+ ld1 {v0.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v9.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v10.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+
+ mov v27.b[3],w15 /* assume this was final block */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ rev32 v26.16b,v0.16b /* load res to sha 0, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ mov v28.b[3],w15 /* assume this was final block */
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+	rev32 v27.16b,v1.16b		/* load res to sha 1, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+
+ mov v29.b[3],w15 /* assume this was final block */
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+	rev32 v28.16b,v2.16b		/* load res to sha 2, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+
+	rev32 v29.16b,v3.16b		/* load res to sha 3, endian swap */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * now we have the sha1 to do for these 4 aes blocks
+ */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+ mov v26.b[3],w15 /* assume this was final block */
+
+ sub x10,x10,1 /* dec num_blocks */
+ cbnz x10,.Lshort_loop /* keep looping if more */
+/*
+ * there are between 0 and 3 aes blocks in the final sha1 blocks
+ */
+.Lpost_short_loop:
+	/* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x13,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x13,x13,3 /* len_lo in bits */
+
+ mov v29.s[3],w13 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+/* do final block */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v26.4s,v24.4s,v20.4s
+ add v27.4s,v25.4s,v21.4s
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ ld1 {v24.16b,v25.16b}, [x6] /* load o_key_pad partial hash */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ mov v29.s[3], w11 /* move length to the end of the block */
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+ .size aes128cbc_sha1_hmac, .-aes128cbc_sha1_hmac
new file mode 100644
@@ -0,0 +1,1518 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Enc/Auth Primitive = aes128cbc/sha256
+ *
+ * Operations:
+ *
+ * out = encrypt-AES128CBC(in)
+ * return_hash_ptr = SHA256(out)
+ *
+ * Prototype:
+ * void aes128cbc_sha256(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * aes128cbc_sha256(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 - v20 -- round keys
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- regShaStateABCD
+ * v25 -- regShaStateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16; otherwise the results are
+ * undefined. For partial AES blocks the caller is required to pad the input
+ * so that its length is a multiple of 16.
+ *
+ * Short lengths (fewer than 12 AES blocks) take a separate, less optimized path.
+ */
+
+ .file "aes128cbc_sha256.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global aes128cbc_sha256
+ .type aes128cbc_sha256,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+.Linit_sha_state:
+ .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+aes128cbc_sha256:
+/* fetch args */
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
+ adr x12,.Linit_sha_state /* address of sha init state consts */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,12 /* no main loop if <12 */
+ ld1 {v24.4s, v25.4s},[x12] /* init ABCD, EFGH. (2 cycs) */
+ b.lt .Lshort_cases /* branch if < 12 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+/* proceed */
+ ld1 {v3.16b},[x5] /* get 1st ivec */
+ ld1 {v0.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+ mov x11,x4 /* len -> x11 needed at end */
+ lsr x12,x11,6 /* total_blocks */
+
+/*
+ * now we can do the loop prolog, 1st aes sequence of 4 blocks
+ */
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ aese v0.16b,v9.16b
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ adr x8,.Lrcon /* base address for sha round consts */
+ aesmc v0.16b,v0.16b
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ aese v0.16b,v10.16b
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ aese v0.16b,v16.16b
+ mov x4,x1 /* sha_ptr_in = aes_ptr_out */
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ prfm PLDL1KEEP,[x8,0*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ prfm PLDL1KEEP,[x8,2*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ prfm PLDL1KEEP,[x8,4*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ prfm PLDL1KEEP,[x8,6*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ prfm PLDL1KEEP,[x8,8*64] /* rcon */
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+
+ eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ mov x2,x0 /* lead_ptr = aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ prfm PLDL1KEEP,[x8,10*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ prfm PLDL1KEEP,[x8,12*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ prfm PLDL1KEEP,[x8,14*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+
+ eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ sub x7,x12,1 /* main_blocks = total_blocks - 1 */
+ and x13,x10,3 /* aes_blocks_left */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+
+/* Note: aes_blocks_left := number of aes blocks left after the main (sha) loop is done; can be 0 */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * main combined loop CBC
+ */
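+/*
+ * High-level view of one .Lmain_loop iteration, as an illustrative C sketch
+ * (not the exact code; aes_in/aes_out and the helpers sha256_block() and
+ * aes128cbc_enc4() are hypothetical names). After the 4-block prolog above,
+ * each pass hashes the 64 bytes of ciphertext produced by the previous pass
+ * (one sha256 block) while encrypting the next four 16-byte cbc blocks, so
+ * the sha and aes instructions can issue in parallel:
+ *
+ *	for (i = 1; i < total_blocks; i++) {
+ *		sha256_block(state, aes_out + (i - 1) * 64); // prev 4 aes results
+ *		aes128cbc_enc4(aes_in + i * 64, aes_out + i * 64, rk, &iv);
+ *	}
+ */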
+.Lmain_loop:
+
+/*
+ * because mov, rev32 and eor each have a busy cycle, this takes longer than it looks.
+ * That's OK since there are 6 cycles before we can use the load anyway; so this goes
+ * as fast as it can without SW pipelining (too complicated given the code size).
+ */
+ rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */
+ ld1 {v0.16b},[x0],16 /* next aes block, update aes_ptr_in */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+ ld1 {v5.16b},[x9],16 /* key1 */
+
+/*
+ * aes xform 0, sha quad 0
+ */
+ aese v0.16b,v8.16b
+ ld1 {v6.16b},[x9],16 /* key2 */
+ rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ aesmc v0.16b,v0.16b
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aese v0.16b,v9.16b
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aesmc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aese v0.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+/* no place to get rid of this stall */
+ rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aese v0.16b,v12.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesmc v0.16b,v0.16b
+ sha256su0 v27.4s,v28.4s
+ aese v0.16b,v13.16b
+ sha256h q22, q23, v5.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aese v0.16b,v14.16b
+ ld1 {v5.16b},[x9],16 /* key5 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aese v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aese v0.16b,v16.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd (1 cyc stall on v22) */
+ sha256su0 v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aese v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+
+/* aes xform 1, sha quad 1 */
+ sha256su0 v26.4s,v27.4s
+ eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/ prev value */
+ ld1 {v7.16b},[x9],16 /* key7 */
+ mov v21.16b, v22.16b /* copy abcd */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aese v1.16b,v8.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ aesmc v1.16b,v1.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aese v1.16b,v9.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aese v1.16b,v10.16b
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aese v1.16b,v11.16b
+ ld1 {v5.16b},[x9],16 /* key5 (extra stall from mov) */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ sha256h q22, q23, v6.4s
+ aese v1.16b,v12.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha256su0 v29.4s,v26.4s
+ aese v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aese v1.16b,v14.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aese v1.16b,v15.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+
+
+/* mode op 2 */
+	eor v2.16b,v2.16b,v1.16b	/* mode op 2 xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+
+ sha256su0 v26.4s,v27.4s
+ aese v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aese v2.16b,v9.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesmc v2.16b,v2.16b
+ sha256su0 v27.4s,v28.4s
+ aese v2.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aese v2.16b,v11.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aese v2.16b,v13.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aese v2.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v7.4s
+ aese v2.16b,v15.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ aesmc v2.16b,v2.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+/* mode op 3 */
+ eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
+
+/* aes xform 3, sha quad 3 (hash only) */
+
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aese v3.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v4.4s
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v5.4s
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v6.4s
+ aese v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v7.4s
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbnz x7,.Lmain_loop /* loop if more to do */
+
+/*
+ * epilog, process remaining aes blocks and b-2 sha block
+ * do this inline (no loop) to overlap with the sha part
+ * note there are 0-3 aes blocks left.
+ */
+
+ rev32 v26.16b,v0.16b /* fix endian w0 */
+ rev32 v27.16b,v1.16b /* fix endian w1 */
+ rev32 v28.16b,v2.16b /* fix endian w2 */
+ rev32 v29.16b,v3.16b /* fix endian w3 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ cbz x13, .Lbm2fromQ0 /* skip if none left */
+ subs x14,x13,1 /* local copy of aes_blocks_left */
+
+/* mode op 0 */
+ ld1 {v0.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aese v0.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ aesmc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aese v0.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aese v0.16b,v10.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256su0 v27.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aese v0.16b,v12.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aese v0.16b,v14.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ beq .Lbm2fromQ1 /* if aes_blocks_left_count == 0 */
+
+/* mode op 1 */
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1, sha quad 1 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ aese v1.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesmc v1.16b,v1.16b
+ sha256su0 v26.4s,v27.4s
+ aese v1.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v4.4s
+ aese v1.16b,v10.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ subs x14,x14,1 /* dec counter */
+ aese v1.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aese v1.16b,v12.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v6.4s
+ aese v1.16b,v14.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aese v1.16b,v16.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ beq .Lbm2fromQ2 /* if aes_blocks_left_count == 0 */
+
+/* mode op 2 */
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aese v2.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ aesmc v2.16b,v2.16b
+ sha256su0 v26.4s,v27.4s
+ aese v2.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aese v2.16b,v10.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256su0 v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aese v2.16b,v12.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aese v2.16b,v14.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ b .Lbm2fromQ3 /* join common code at Quad 3 */
+
+/*
+ * now there is the second-to-last (b-2) sha block before the final one.
+ * Execution joins at the appropriate quad below, depending on how many aes
+ * blocks were left; if there were none, all four quads are executed.
+ */
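+/*
+ * Dispatch sketch in illustrative C (not the exact control flow, which falls
+ * through via the branches above): each leftover aes block was interleaved
+ * with one quad of this second-to-last sha block, so execution joins at the
+ * first quad that has not been done yet:
+ *
+ *	switch (aes_blocks_left) {
+ *	case 0: goto bm2fromQ0;	// all four quads still to do
+ *	case 1: goto bm2fromQ1;	// quad 0 done alongside aes block 0
+ *	case 2: goto bm2fromQ2;	// quads 0-1 done alongside aes blocks 0-1
+ *	case 3: goto bm2fromQ3;	// quads 0-2 done alongside aes blocks 0-2
+ *	}
+ */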
+/* quad 0 */
+.Lbm2fromQ0:
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+.Lbm2fromQ1:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+.Lbm2fromQ2:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+.Lbm2fromQ3:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+/*
+ * now we can do the final block, either all padding or 1-3 aes blocks
+ * len is in x11, aes_blocks_left in x13. (The aes data setup here could be
+ * moved into the last aes section.)
+ */
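+/*
+ * Byte layout of this final (padding) sha256 block, as an illustrative C
+ * sketch (last[] and leftover_ct are hypothetical names; the code below
+ * builds the same bytes directly in v26-v29). The 0-3 leftover ciphertext
+ * blocks are followed by the 0x80 pad byte, zeros, and the big-endian bit
+ * length, which always fits since at most 48 data bytes are present:
+ *
+ *	uint8_t last[64] = {0};
+ *	memcpy(last, leftover_ct, 16 * aes_blocks_left);	// 0..48 bytes
+ *	last[16 * aes_blocks_left] = 0x80;			// leading 1 bit
+ *	uint64_t bits = (uint64_t)len * 8;			// hashed bits
+ *	for (i = 0; i < 8; i++)
+ *		last[56 + i] = (uint8_t)(bits >> (56 - 8 * i));	// big-endian
+ */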
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov w15,0x80 /* that's the 1 of the pad */
+ lsr x12,x11,32 /* len_hi */
+ and x9,x11,0xffffffff /* len_lo */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov v26.b[0],w15 /* assume block 0 is dst */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x9,x9,3 /* len_lo in bits */
+ eor v29.16b,v29.16b,v29.16b /* zero reg */
+/*
+ * places the 0x80 in the correct block, copies the appropriate data
+ */
+ cbz x13,.Lpad100 /* no data to get */
+ mov v26.16b,v0.16b
+ sub x14,x13,1 /* dec amount left */
+ mov v27.b[0],w15 /* assume block 1 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v27.16b,v1.16b
+ sub x14,x14,1 /* dec amount left */
+ mov v28.b[0],w15 /* assume block 2 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v28.16b,v2.16b
+ mov v29.b[3],w15 /* block 3, doesn't get rev'd */
+/*
+ * get the len_hi, len_lo in bits according to
+ * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12)
+ * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9)
+ * this computation is done before the block-copy chain above
+ */
+.Lpad100:
+ mov v29.s[3],w9 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+/*
+ * note that q29 is already built in the correct format, so no swap required
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+/*
+ * do last sha of pad block
+ */
+
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ mov x9,sp
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add sp,sp,8*16
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+/*
+ * now we just have to put this into big endian and store!
+ */
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ rev32 v24.16b,v24.16b /* big endian ABCD */
+ ld1 {v12.16b - v15.16b},[x9]
+ rev32 v25.16b,v25.16b /* big endian EFGH */
+
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+ ret
+
+/*
+ * These are the short cases (less efficient), used here for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v3.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ mov w15,0x80 /* sha padding word */
+
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+/*
+ * the idea in the short loop (at least 1 aes block) is to break out with the
+ * padding already in place, except for the final length words.
+ */
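+/*
+ * Rough shape of the short path in C-like pseudocode (hypothetical helper
+ * names, not the exact code). After each aes block the 0x80 pad byte is
+ * pre-placed in the following sha word, so whenever the loop breaks out the
+ * padding block is already set up except for the trailing length:
+ *
+ *	while (aes_blocks) {
+ *		encrypt and store up to 4 cbc blocks, copying each into w0..w3;
+ *		if (fewer than 4 remained)
+ *			break;			// pad byte already in place
+ *		sha256_block(state, w);		// hash the full 64 bytes
+ *	}
+ *	place bit length; sha256_block(state, w);	// final padded block
+ */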
+.Lshort_loop:
+
+ ld1 {v0.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v9.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v10.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+
+ mov v27.b[3],w15 /* assume this was final block */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ rev32 v26.16b,v0.16b /* load res to sha 0, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ mov v28.b[3],w15 /* assume this was final block */
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+	rev32 v27.16b,v1.16b	/* load res to sha 1, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+
+ mov v29.b[3],w15 /* assume this was final block */
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+	rev32 v28.16b,v2.16b	/* load res to sha 2, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+
+	rev32 v29.16b,v3.16b	/* load res to sha 3, endian swap */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * now we have the sha256 to do for these 4 aes blocks
+ */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+ mov v26.b[3],w15 /* assume this was final block */
+
+ sub x10,x10,1 /* dec num_blocks */
+ cbnz x10,.Lshort_loop /* keep looping if more */
+/*
+ * there are between 0 and 3 aes blocks in the final sha256 block
+ */
+.Lpost_short_loop:
+ lsr x12,x11,32 /* len_hi */
+ and x13,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x13,x13,3 /* len_lo in bits */
+
+ mov v29.s[3],w13 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+/* do final block */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ mov x9,sp
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add sp,sp,8*16
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ rev32 v24.16b,v24.16b /* big endian ABCD */
+ ld1 {v12.16b - v15.16b},[x9]
+ rev32 v25.16b,v25.16b /* big endian EFGH */
+
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+ ret
+
+ .size aes128cbc_sha256, .-aes128cbc_sha256
new file mode 100644
@@ -0,0 +1,1854 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Enc/Auth Primitive = aes128cbc/sha256_hmac
+ *
+ * Operations:
+ *
+ * out = encrypt-AES128CBC(in)
+ * return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | out))
+ *
+ * Prototype:
+ * void aes128cbc_sha256_hmac(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * aes128cbc_sha256_hmac(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 - v20 -- round keys
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- sha state ABCD
+ * v25 -- sha state EFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16; otherwise the results are undefined.
+ * For partial AES blocks the caller is required to pad the input so that
+ * len % 16 == 0.
+ *
+ * Short lengths (< 12 AES blocks) are not optimized.
+ */
+
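+/*
+ * Background on the i_key_pad/o_key_pad arguments, as an illustrative C
+ * sketch following the usual HMAC definition (sha256_init()/sha256_block()
+ * are hypothetical helpers, not part of this patch). The caller pre-hashes
+ * the two 64-byte key pads once per session and passes the resulting 8-word
+ * intermediate states; this routine then seeds the inner hash from
+ * i_key_pad instead of the standard sha256 initial state:
+ *
+ *	uint8_t ipad[64], opad[64];
+ *	for (i = 0; i < 64; i++) {
+ *		ipad[i] = key[i] ^ 0x36;	// key zero-padded to 64 bytes
+ *		opad[i] = key[i] ^ 0x5c;
+ *	}
+ *	sha256_init(i_key_pad); sha256_block(i_key_pad, ipad);
+ *	sha256_init(o_key_pad); sha256_block(o_key_pad, opad);
+ */
+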
+ .file "aes128cbc_sha256_hmac.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global aes128cbc_sha256_hmac
+ .type aes128cbc_sha256_hmac,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+.Linit_sha_state:
+ .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+aes128cbc_sha256_hmac:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ ld1 {v24.4s, v25.4s},[x6] /* init ABCD, EFGH. (2 cycs) */
+ ldr x6, [x5, #HMAC_OKEYPAD] /* save pointer to o_key_pad partial hash */
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
+ adr x12,.Linit_sha_state /* address of sha init state consts */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,12 /* no main loop if <12 */
+ b.lt .Lshort_cases /* branch if < 12 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+/* proceed */
+ ld1 {v3.16b},[x5] /* get 1st ivec */
+ ld1 {v0.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+ mov x11,x4 /* len -> x11 needed at end */
+ lsr x12,x11,6 /* total_blocks */
+
+/*
+ * now we can do the loop prolog, 1st aes sequence of 4 blocks
+ */
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ aese v0.16b,v9.16b
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ adr x8,.Lrcon /* base address for sha round consts */
+ aesmc v0.16b,v0.16b
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ aese v0.16b,v10.16b
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ aese v0.16b,v16.16b
+ mov x4,x1 /* sha_ptr_in = aes_ptr_out */
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ prfm PLDL1KEEP,[x8,0*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ prfm PLDL1KEEP,[x8,2*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ prfm PLDL1KEEP,[x8,4*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ prfm PLDL1KEEP,[x8,6*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ prfm PLDL1KEEP,[x8,8*64] /* rcon */
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+
+ eor v2.16b,v2.16b,v1.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ mov x2,x0 /* lead_ptr = aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ prfm PLDL1KEEP,[x8,10*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ prfm PLDL1KEEP,[x8,12*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ prfm PLDL1KEEP,[x8,14*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+
+ eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ sub x7,x12,1 /* main_blocks = total_blocks - 1 */
+ and x13,x10,3 /* aes_blocks_left */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+
+/* Note: aes_blocks_left := number of aes blocks left after the main (sha) loop is done; can be 0 */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/*
+ * main combined loop CBC
+ */
+.Lmain_loop:
+
+/*
+ * because mov, rev32 and eor each have a busy cycle, this takes longer than it looks.
+ * That's OK since there are 6 cycles before we can use the load anyway; so this goes
+ * as fast as it can without SW pipelining (too complicated given the code size).
+ */
+ rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */
+ ld1 {v0.16b},[x0],16 /* next aes block, update aes_ptr_in */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+ ld1 {v5.16b},[x9],16 /* key1 */
+
+/*
+ * aes xform 0, sha quad 0
+ */
+ aese v0.16b,v8.16b
+ ld1 {v6.16b},[x9],16 /* key2 */
+ rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ aesmc v0.16b,v0.16b
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aese v0.16b,v9.16b
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aesmc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aese v0.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+/* no place to get rid of this stall */
+ rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aese v0.16b,v12.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesmc v0.16b,v0.16b
+ sha256su0 v27.4s,v28.4s
+ aese v0.16b,v13.16b
+ sha256h q22, q23, v5.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aese v0.16b,v14.16b
+ ld1 {v5.16b},[x9],16 /* key5 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aese v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aese v0.16b,v16.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd (1 cyc stall on v22) */
+ sha256su0 v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aese v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+
+/* aes xform 1, sha quad 1 */
+ sha256su0 v26.4s,v27.4s
+ eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/ prev value */
+ ld1 {v7.16b},[x9],16 /* key7 */
+ mov v21.16b, v22.16b /* copy abcd */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aese v1.16b,v8.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ aesmc v1.16b,v1.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aese v1.16b,v9.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aese v1.16b,v10.16b
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aese v1.16b,v11.16b
+ ld1 {v5.16b},[x9],16 /* key5 (extra stall from mov) */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ sha256h q22, q23, v6.4s
+ aese v1.16b,v12.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha256su0 v29.4s,v26.4s
+ aese v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aese v1.16b,v14.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aese v1.16b,v15.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+
+
+/* mode op 2 */
+	eor v2.16b,v2.16b,v1.16b	/* mode op 2 xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+
+ sha256su0 v26.4s,v27.4s
+ aese v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aese v2.16b,v9.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesmc v2.16b,v2.16b
+ sha256su0 v27.4s,v28.4s
+ aese v2.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aese v2.16b,v11.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aese v2.16b,v13.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ aese v2.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v7.4s
+ aese v2.16b,v15.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ aesmc v2.16b,v2.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+/* mode op 3 */
+ eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
+
+/* aes xform 3, sha quad 3 (hash only) */
+
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aese v3.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v4.4s
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v5.4s
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v6.4s
+ aese v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v3.16b,v3.16b
+ sha256h2 q23, q21, v7.4s
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbnz x7,.Lmain_loop /* loop if more to do */
+
+/*
+ * epilog, process remaining aes blocks and b-2 sha block
+ * do this inline (no loop) to overlap with the sha part
+ * note there are 0-3 aes blocks left.
+ */
+
+ rev32 v26.16b,v0.16b /* fix endian w0 */
+ rev32 v27.16b,v1.16b /* fix endian w1 */
+ rev32 v28.16b,v2.16b /* fix endian w2 */
+ rev32 v29.16b,v3.16b /* fix endian w3 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ cbz x13, .Lbm2fromQ0 /* skip if none left */
+ subs x14,x13,1 /* local copy of aes_blocks_left */
+
+/* mode op 0 */
+ ld1 {v0.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aese v0.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ aesmc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aese v0.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aese v0.16b,v10.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256su0 v27.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aese v0.16b,v12.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aese v0.16b,v14.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ beq .Lbm2fromQ1 /* if aes_blocks_left_count == 0 */
+
+/* mode op 1 */
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1, sha quad 1 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ aese v1.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesmc v1.16b,v1.16b
+ sha256su0 v26.4s,v27.4s
+ aese v1.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v4.4s
+ aese v1.16b,v10.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ subs x14,x14,1 /* dec counter */
+ aese v1.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aese v1.16b,v12.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v6.4s
+ aese v1.16b,v14.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aese v1.16b,v16.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ beq .Lbm2fromQ2 /* if aes_blocks_left_count == 0 */
+
+/* mode op 2 */
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+	ld1 {v4.16b},[x9],16	/* key8 */
+	ld1 {v5.16b},[x9],16	/* key9 */
+	ld1 {v6.16b},[x9],16	/* key10 */
+	ld1 {v7.16b},[x9],16	/* key11 */
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+	add v4.4s,v4.4s,v26.4s	/* wk = key8+w0 */
+ aese v2.16b,v8.16b
+	add v5.4s,v5.4s,v27.4s	/* wk = key9+w1 */
+ aesmc v2.16b,v2.16b
+ sha256su0 v26.4s,v27.4s
+ aese v2.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aese v2.16b,v10.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+	add v6.4s,v6.4s,v28.4s	/* wk = key10+w2 */
+ sha256su0 v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aese v2.16b,v12.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+	add v7.4s,v7.4s,v29.4s	/* wk = key11+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesmc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aese v2.16b,v14.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ sha256su0 v29.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ b .Lbm2fromQ3 /* join common code at Quad 3 */
+
+/*
+ * Now process the second-to-last (b-2) sha block before the final one.
+ * Execution joins at the appropriate quad below depending on how many aes
+ * blocks were left; if there were none, the whole block is executed here.
+ */
+/* quad 0 */
+.Lbm2fromQ0:
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+.Lbm2fromQ1:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+.Lbm2fromQ2:
+	ld1 {v4.16b},[x9],16	/* key8 */
+	ld1 {v5.16b},[x9],16	/* key9 */
+	ld1 {v6.16b},[x9],16	/* key10 */
+	ld1 {v7.16b},[x9],16	/* key11 */
+
+	add v4.4s,v4.4s,v26.4s	/* wk = key8+w0 */
+	add v5.4s,v5.4s,v27.4s	/* wk = key9+w1 */
+	add v6.4s,v6.4s,v28.4s	/* wk = key10+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+	add v7.4s,v7.4s,v29.4s	/* wk = key11+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+.Lbm2fromQ3:
+	ld1 {v4.16b},[x9],16	/* key12 */
+	ld1 {v5.16b},[x9],16	/* key13 */
+	ld1 {v6.16b},[x9],16	/* key14 */
+	ld1 {v7.16b},[x9],16	/* key15 */
+
+	add v4.4s,v4.4s,v26.4s	/* wk = key12+w0 */
+	add v5.4s,v5.4s,v27.4s	/* wk = key13+w1 */
+	add v6.4s,v6.4s,v28.4s	/* wk = key14+w2 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+	add v7.4s,v7.4s,v29.4s	/* wk = key15+w3 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+/*
+ * Now do the final block, which is either all padding or 1-3 aes blocks.
+ * len is in x11 and aes_blocks_left in x13. (The aes data setup done here
+ * could be moved into the last aes section.)
+ */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov w15,0x80 /* that's the 1 of the pad */
+ /* Add one SHA-2 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x9,x11,0xffffffff /* len_lo */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov v26.b[0],w15 /* assume block 0 is dst */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x9,x9,3 /* len_lo in bits */
+ eor v29.16b,v29.16b,v29.16b /* zero reg */
+/*
+ * places the 0x80 in the correct block, copies the appropriate data
+ */
+ cbz x13,.Lpad100 /* no data to get */
+ mov v26.16b,v0.16b
+ sub x14,x13,1 /* dec amount left */
+ mov v27.b[0],w15 /* assume block 1 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v27.16b,v1.16b
+ sub x14,x14,1 /* dec amount left */
+ mov v28.b[0],w15 /* assume block 2 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v28.16b,v2.16b
+ mov v29.b[3],w15 /* block 3, doesn't get rev'd */
+/*
+ * len_hi and len_lo were already computed in bits above, before the copies:
+ * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12)
+ * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9)
+ */
+.Lpad100:
+ mov v29.s[3],w9 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+/*
+ * note that q29 is already built in the correct format, so no swap required
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+/*
+ * do last sha of pad block
+ */
+
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v26.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v27.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x8,.Lrcon /* base address for sha round consts */
+
+ ld1 {v24.16b,v25.16b}, [x6] /* load o_key_pad partial hash */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v28.b[3], w11
+
+ mov x11, #64+32 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ mov v29.s[3], w11 /* move length to the end of the block */
+
+ ld1 {v4.16b},[x8],16 /* key0 */
+ ld1 {v5.16b},[x8],16 /* key1 */
+ ld1 {v6.16b},[x8],16 /* key2 */
+ ld1 {v7.16b},[x8],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key4 */
+ ld1 {v5.16b},[x8],16 /* key5 */
+ ld1 {v6.16b},[x8],16 /* key6 */
+ ld1 {v7.16b},[x8],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key8 */
+ ld1 {v5.16b},[x8],16 /* key9 */
+ ld1 {v6.16b},[x8],16 /* key10 */
+ ld1 {v7.16b},[x8],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key12 */
+ ld1 {v5.16b},[x8],16 /* key13 */
+ ld1 {v6.16b},[x8],16 /* key14 */
+ ld1 {v7.16b},[x8],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v3.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ mov w15,0x80 /* sha padding word */
+
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+/*
+ * the idea in the short loop (which runs at least once) is to break out with
+ * the padding already in place, except for the final length word.
+ */
+.Lshort_loop:
+ ld1 {v0.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v9.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v10.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+
+ mov v27.b[3],w15 /* assume this was final block */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ rev32 v26.16b,v0.16b /* load res to sha 0, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v1.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ mov v28.b[3],w15 /* assume this was final block */
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ rev32 v27.16b,v1.16b /* load res to sha 0, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v2.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+
+ mov v29.b[3],w15 /* assume this was final block */
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ rev32 v28.16b,v2.16b /* load res to sha 0, endian swap */
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+
+ ld1 {v3.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ eor v3.16b,v3.16b,v2.16b /* xor w/ prev value */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+
+ rev32 v29.16b,v3.16b /* load res to sha 0, endian swap */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * now we have the sha256 to do for these 4 aes blocks
+ */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+	ld1 {v4.16b},[x9],16	/* key8 */
+	ld1 {v5.16b},[x9],16	/* key9 */
+	ld1 {v6.16b},[x9],16	/* key10 */
+	ld1 {v7.16b},[x9],16	/* key11 */
+
+	add v4.4s,v4.4s,v26.4s	/* wk = key8+w0 */
+	add v5.4s,v5.4s,v27.4s	/* wk = key9+w1 */
+	add v6.4s,v6.4s,v28.4s	/* wk = key10+w2 */
+	add v7.4s,v7.4s,v29.4s	/* wk = key11+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+	ld1 {v4.16b},[x9],16	/* key12 */
+	ld1 {v5.16b},[x9],16	/* key13 */
+	ld1 {v6.16b},[x9],16	/* key14 */
+	ld1 {v7.16b},[x9],16	/* key15 */
+
+	add v4.4s,v4.4s,v26.4s	/* wk = key12+w0 */
+	add v5.4s,v5.4s,v27.4s	/* wk = key13+w1 */
+	add v6.4s,v6.4s,v28.4s	/* wk = key14+w2 */
+	add v7.4s,v7.4s,v29.4s	/* wk = key15+w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+ mov v26.b[3],w15 /* assume this was final block */
+
+ sub x10,x10,1 /* dec num_blocks */
+ cbnz x10,.Lshort_loop /* keep looping if more */
+/*
+ * there are between 0 and 3 aes blocks in the final sha256 block
+ */
+.Lpost_short_loop:
+ /* Add one SHA-2 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x13,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x13,x13,3 /* len_lo in bits */
+
+ mov v29.s[3],w13 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+/* do final block */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+	ld1 {v4.16b},[x9],16	/* key8 */
+	ld1 {v5.16b},[x9],16	/* key9 */
+	ld1 {v6.16b},[x9],16	/* key10 */
+	ld1 {v7.16b},[x9],16	/* key11 */
+
+	add v4.4s,v4.4s,v26.4s	/* wk = key8+w0 */
+	add v5.4s,v5.4s,v27.4s	/* wk = key9+w1 */
+	add v6.4s,v6.4s,v28.4s	/* wk = key10+w2 */
+	add v7.4s,v7.4s,v29.4s	/* wk = key11+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+	ld1 {v4.16b},[x9],16	/* key12 */
+	ld1 {v5.16b},[x9],16	/* key13 */
+	ld1 {v6.16b},[x9],16	/* key14 */
+	ld1 {v7.16b},[x9],16	/* key15 */
+
+	add v4.4s,v4.4s,v26.4s	/* wk = key12+w0 */
+	add v5.4s,v5.4s,v27.4s	/* wk = key13+w1 */
+	add v6.4s,v6.4s,v28.4s	/* wk = key14+w2 */
+	add v7.4s,v7.4s,v29.4s	/* wk = key15+w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v26.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v27.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x8,.Lrcon /* base address for sha round consts */
+
+ ld1 {v24.16b,v25.16b}, [x6] /* load o_key_pad partial hash */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v28.b[3], w11
+
+ mov x11, #64+32 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ mov v29.s[3], w11 /* move length to the end of the block */
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ ld1 {v4.16b},[x8],16 /* key0 */
+ ld1 {v5.16b},[x8],16 /* key1 */
+ ld1 {v6.16b},[x8],16 /* key2 */
+ ld1 {v7.16b},[x8],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key4 */
+ ld1 {v5.16b},[x8],16 /* key5 */
+ ld1 {v6.16b},[x8],16 /* key6 */
+ ld1 {v7.16b},[x8],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key8 */
+ ld1 {v5.16b},[x8],16 /* key9 */
+ ld1 {v6.16b},[x8],16 /* key10 */
+ ld1 {v7.16b},[x8],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key12 */
+ ld1 {v5.16b},[x8],16 /* key13 */
+ ld1 {v6.16b},[x8],16 /* key14 */
+ ld1 {v7.16b},[x8],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+ .size aes128cbc_sha256_hmac, .-aes128cbc_sha256_hmac
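For illustration only (not part of the patch): a minimal C sketch of the length
and padding accounting the HMAC tail above performs. The inner hash length is
the data length plus 64 bytes because the already-hashed i_key_pad block is
counted, the bit length is split into len_hi/len_lo words for the SHA-256 pad
block, and the outer block always covers the 64-byte o_key_pad block plus the
32-byte inner digest. All names below are illustrative.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t data_len = 4096;            /* bytes hashed after i_key_pad */
        uint64_t total = data_len + 64;      /* i_key_pad counts as one block */
        uint32_t len_hi = (uint32_t)(((total >> 32) & 0xffffffff) << 3);
        uint32_t len_lo = (uint32_t)((total & 0xffffffff) << 3);
        uint32_t outer_bits = (64 + 32) * 8; /* o_key_pad block + inner digest */

        printf("inner length: hi=0x%08x lo=0x%08x (bits)\n", len_hi, len_lo);
        printf("outer length: %u bits\n", outer_bits);
        return 0;
    }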
new file mode 100644
@@ -0,0 +1,151 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ .file "aes_core.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .align 4
+ .global aes128_key_sched_enc
+ .type aes128_key_sched_enc, %function
+ .global aes128_key_sched_dec
+ .type aes128_key_sched_dec, %function
+
+ /*
+	 * AES key expansion algorithm for a single round.
+ */
+ .macro key_expand res, key, shuffle_mask, rcon, tq0, tq1, td
+ /* temp = rotword(key[3]) */
+ tbl \td\().8b,{\key\().16b},\shuffle_mask\().8b
+ dup \tq0\().2d,\td\().d[0]
+ /* temp = subbytes(temp) */
+ aese \tq0\().16b,v19\().16b /* q19 := 0 */
+ /* temp = temp + rcon */
+ mov w11,\rcon
+ dup \tq1\().4s,w11
+ eor \tq0\().16b,\tq0\().16b,\tq1\().16b
+ /* tq1 = [0, a, b, c] */
+ ext \tq1\().16b,v19\().16b,\key\().16b,12 /* q19 := 0 */
+ eor \res\().16b,\key\().16b,\tq1\().16b
+ /* tq1 = [0, 0, a, b] */
+ ext \tq1\().16b,v19\().16b,\tq1\().16b,12 /* q19 := 0 */
+ eor \res\().16b,\res\().16b,\tq1\().16b
+ /* tq1 = [0, 0, 0, a] */
+ ext \tq1\().16b,v19\().16b,\tq1\().16b,12 /* q19 := 0 */
+ eor \res\().16b,\res\().16b,\tq1\().16b
+ /* + temp */
+ eor \res\().16b,\res\().16b,\tq0\().16b
+ .endm
+/*
+ * *expanded_key, *user_key
+ */
+ .align 4
+aes128_key_sched_enc:
+ sub sp,sp,4*16
+ st1 {v8.16b - v11.16b},[sp]
+ ld1 {v0.16b},[x1] /* user_key */
+ mov w10,0x0e0d /* form shuffle_word */
+ mov w11,0x0c0f
+ orr w10,w10,w11,lsl 16
+ dup v20.4s,w10 /* shuffle_mask */
+ eor v19.16b,v19.16b,v19.16b /* zero */
+ /* Expand key */
+ key_expand v1,v0,v20,0x1,v21,v16,v17
+ key_expand v2,v1,v20,0x2,v21,v16,v17
+ key_expand v3,v2,v20,0x4,v21,v16,v17
+ key_expand v4,v3,v20,0x8,v21,v16,v17
+ key_expand v5,v4,v20,0x10,v21,v16,v17
+ key_expand v6,v5,v20,0x20,v21,v16,v17
+ key_expand v7,v6,v20,0x40,v21,v16,v17
+ key_expand v8,v7,v20,0x80,v21,v16,v17
+ key_expand v9,v8,v20,0x1b,v21,v16,v17
+ key_expand v10,v9,v20,0x36,v21,v16,v17
+ /* Store round keys in the correct order */
+ st1 {v0.16b - v3.16b},[x0],64
+ st1 {v4.16b - v7.16b},[x0],64
+ st1 {v8.16b - v10.16b},[x0],48
+
+ ld1 {v8.16b - v11.16b},[sp]
+ add sp,sp,4*16
+ ret
+
+ .size aes128_key_sched_enc, .-aes128_key_sched_enc
+
+/*
+ * *expanded_key, *user_key
+ */
+ .align 4
+aes128_key_sched_dec:
+ sub sp,sp,4*16
+ st1 {v8.16b-v11.16b},[sp]
+ ld1 {v0.16b},[x1] /* user_key */
+ mov w10,0x0e0d /* form shuffle_word */
+ mov w11,0x0c0f
+ orr w10,w10,w11,lsl 16
+ dup v20.4s,w10 /* shuffle_mask */
+ eor v19.16b,v19.16b,v19.16b /* zero */
+	/*
+	 * Expand key.
+	 * The register order is intentionally reversed so that a single
+	 * multi-register store can be used later (stores must use ascending
+	 * register order).
+	 */
+ key_expand v10,v0,v20,0x1,v21,v16,v17
+ key_expand v9,v10,v20,0x2,v21,v16,v17
+ key_expand v8,v9,v20,0x4,v21,v16,v17
+ key_expand v7,v8,v20,0x8,v21,v16,v17
+ key_expand v6,v7,v20,0x10,v21,v16,v17
+ key_expand v5,v6,v20,0x20,v21,v16,v17
+ key_expand v4,v5,v20,0x40,v21,v16,v17
+ key_expand v3,v4,v20,0x80,v21,v16,v17
+ key_expand v2,v3,v20,0x1b,v21,v16,v17
+ key_expand v1,v2,v20,0x36,v21,v16,v17
+ /* Inverse mixcolumns for keys 1-9 (registers v10-v2) */
+ aesimc v10.16b, v10.16b
+ aesimc v9.16b, v9.16b
+ aesimc v8.16b, v8.16b
+ aesimc v7.16b, v7.16b
+ aesimc v6.16b, v6.16b
+ aesimc v5.16b, v5.16b
+ aesimc v4.16b, v4.16b
+ aesimc v3.16b, v3.16b
+ aesimc v2.16b, v2.16b
+ /* Store round keys in the correct order */
+ st1 {v1.16b - v4.16b},[x0],64
+ st1 {v5.16b - v8.16b},[x0],64
+ st1 {v9.16b, v10.16b},[x0],32
+ st1 {v0.16b},[x0],16
+
+ ld1 {v8.16b - v11.16b},[sp]
+ add sp,sp,4*16
+ ret
+
+ .size aes128_key_sched_dec, .-aes128_key_sched_dec
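A caller-side sketch, assuming the routines above are linked in; the prototypes
match the header added later in this patch. AES-128 expands to 11 round keys of
16 bytes each, so the caller supplies a 176-byte buffer; the key material here
is sample data only.

    #include <stdint.h>
    #include <string.h>

    /* Prototypes as declared by the header added later in this patch. */
    void aes128_key_sched_enc(uint8_t *expanded_key, const uint8_t *user_key);
    void aes128_key_sched_dec(uint8_t *expanded_key, const uint8_t *user_key);

    int main(void)
    {
        uint8_t user_key[16];
        uint8_t enc_rk[11 * 16];   /* AES-128: initial key + 10 round keys */
        uint8_t dec_rk[11 * 16];

        memset(user_key, 0xab, sizeof(user_key));  /* sample key material */
        aes128_key_sched_enc(enc_rk, user_key);    /* encryption-order schedule */
        aes128_key_sched_dec(dec_rk, user_key);    /* decryption-order schedule */
        return 0;
    }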
new file mode 100644
@@ -0,0 +1,78 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ARMV8_DEFS_H_
+#define _RTE_ARMV8_DEFS_H_
+
+struct crypto_arg {
+ struct {
+ uint8_t *key;
+ uint8_t *iv;
+ } cipher;
+ struct {
+ struct {
+ uint8_t *key;
+ uint8_t *i_key_pad;
+ uint8_t *o_key_pad;
+ } hmac;
+ } digest;
+};
+
+typedef struct crypto_arg crypto_arg_t;
+
+void aes128_key_sched_enc(uint8_t *expanded_key, const uint8_t *user_key);
+void aes128_key_sched_dec(uint8_t *expanded_key, const uint8_t *user_key);
+
+void aes128cbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void aes128cbc_sha256(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void aes128cbc_sha256_hmac(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void aes128cbc_dec_sha256(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void sha1_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void sha256_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void sha256_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+void sha256_aes128cbc(uint8_t *csrc, uint8_t *cdst, uint8_t *dsrc, uint8_t *ddst,
+ uint64_t len, crypto_arg_t *arg);
+
+int sha1_block_partial(uint8_t *init, const uint8_t *src, uint8_t *dst, uint64_t len);
+int sha1_block(uint8_t *init, const uint8_t *src, uint8_t *dst, uint64_t len);
+
+int sha256_block_partial(uint8_t *init, const uint8_t *src, uint8_t *dst, uint64_t len);
+int sha256_block(uint8_t *init, const uint8_t *src, uint8_t *dst, uint64_t len);
+
+#endif /* _RTE_ARMV8_DEFS_H_ */
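A hedged usage sketch (not part of the patch) for the combined enc/auth entry
point declared above. The field semantics are inferred from the register-usage
comments in the assembly: cipher.key points at the expanded round keys,
cipher.iv at the 16-byte IV, and the hmac pads at 32-byte partially hashed
i_key_pad/o_key_pad states. The header filename, the NULL dsrc argument and the
buffer sizes are assumptions made for illustration only.

    #include <stdint.h>
    #include <string.h>

    #include "rte_armv8_defs.h"   /* assumed filename of the header above */

    int main(void)
    {
        static uint8_t src[4096], dst[4096];  /* len must be a multiple of 16 */
        uint8_t digest[32];                   /* SHA-256 sized output */
        uint8_t round_keys[11 * 16];
        uint8_t iv[16] = { 0 };
        uint8_t i_key_pad[32], o_key_pad[32]; /* partially hashed pads */
        uint8_t user_key[16] = { 0 };
        crypto_arg_t arg;

        aes128_key_sched_enc(round_keys, user_key);
        /* i_key_pad/o_key_pad would come from sha256_block_partial() over the
         * ipad/opad-xored HMAC key; omitted here for brevity. */

        memset(&arg, 0, sizeof(arg));
        arg.cipher.key = round_keys;
        arg.cipher.iv = iv;
        arg.digest.hmac.i_key_pad = i_key_pad;
        arg.digest.hmac.o_key_pad = o_key_pad;

        /* AES-128-CBC encrypt src into dst and write the HMAC-SHA-256 digest */
        aes128cbc_sha256_hmac(src, dst, NULL, digest, sizeof(src), &arg);
        return 0;
    }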
new file mode 100644
@@ -0,0 +1,515 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-1 Primitives
+ *
+ * Operations:
+ * sha1_block_partial:
+ * out = partial_sha1(init, in, len) <- no final block
+ *
+ * sha1_block:
+ * out = sha1(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha1_block_partial(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha1_block(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha1_block_partial(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * sha1_block(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v22 -- sha working state ABCD (q22)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * For sha1_block_partial, "len" must be a multiple of 64. For sha1_block,
+ * "len" must be a multiple of 16 (plus an extra 20 bytes for the HMAC case);
+ * otherwise an error code is returned.
+ *
+ */
+ .file "sha1_core.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .align 4
+ .global sha1_block_partial
+ .type sha1_block_partial,%function
+ .global sha1_block
+ .type sha1_block,%function
+
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+ .align 4
+.Linit_sha_state:
+ .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+ .word 0xc3d2e1f0, 0x00000000, 0x00000000, 0x00000000
+
+ .align 4
+
+sha1_block_partial:
+ mov x6, #1 /* indicate partial hash */
+	ands x5, x3, #0x3f	/* check length is a multiple of the 64B SHA block */
+ b.ne .Lsha1_error
+ cbnz x0, 1f
+ adr x0,.Linit_sha_state /* address of sha init state consts */
+1:
+ ld1 {v24.4s},[x0],16 /* init ABCD */
+ ld1 {v25.4s},[x0] /* and E */
+
+ /* Load SHA-1 constants */
+ adr x4,.Lrcon
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ lsr x5, x3, 2 /* number of 4B blocks */
+ b .Lsha1_loop
+
+sha1_block:
+ mov x6, xzr /* indicate full hash */
+ and x5, x3, #0xf /* check size mod 16B block */
+ cmp x5, #4 /* additional word is accepted */
+ b.eq 1f
+ cbnz x5, .Lsha1_error
+1:
+ cbnz x0, 2f
+ adr x0,.Linit_sha_state /* address of sha init state consts */
+2:
+ ld1 {v24.4s},[x0],16 /* init ABCD */
+ ld1 {v25.4s},[x0] /* and E */
+
+ /* Load SHA-1 constants */
+ adr x4,.Lrcon
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ lsr x5, x3, 2 /* number of 4B blocks */
+ cmp x5, #16 /* at least 16 4B blocks give 1 SHA block */
+ b.lo .Lsha1_last
+
+ .align 4
+
+.Lsha1_loop:
+	sub x5, x5, #16		/* subtract 1 SHA block */
+
+ ld1 {v26.16b},[x1],16 /* dsrc[0] */
+ ld1 {v27.16b},[x1],16 /* dsrc[1] */
+ ld1 {v28.16b},[x1],16 /* dsrc[2] */
+ ld1 {v29.16b},[x1],16 /* dsrc[3] */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+/* quad 0 */
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s25,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v4.4s,v27.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v4.4s,v28.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v4.4s,v29.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v5.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v5.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v5.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v6.4s,v29.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v6.4s,v26.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v6.4s,v27.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v7.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v16.4s,v7.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v24.4s,v24.4s,v22.4s
+ add v25.4s,v25.4s,v18.4s
+
+ cmp x5, #16
+ b.hs .Lsha1_loop
+
+ /* Store partial hash and return or complete hash */
+ cbz x6, .Lsha1_last
+
+ st1 {v24.16b},[x2],16
+ st1 {v25.16b},[x2]
+
+ mov x0, xzr
+ ret
+
+ /*
+ * Last block with padding. v24-v25[0] contain hash state.
+ */
+.Lsha1_last:
+
+ eor v26.16b, v26.16b, v26.16b
+ eor v27.16b, v27.16b, v27.16b
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x4,.Lrcon
+ /* Number of bits in message */
+ lsl x3, x3, 3
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Fill out the first vector register and the end of the block */
+ mov v29.s[3], w3 /* move length to the end of the block */
+ lsr x3, x3, 32
+ mov v29.s[2], w3 /* and the higher part */
+
+ /* The remaining part is up to 3 16B blocks and up to 1 4B block */
+ mov w6, #0x80 /* that's the 1 of the pad */
+ mov v26.b[3], w6
+ cbz x5,.Lsha1_final
+ /* Are there 3 16B blocks? */
+ cmp x5, #12
+ b.lo 1f
+ ld1 {v26.16b},[x1],16
+ ld1 {v27.16b},[x1],16
+ ld1 {v28.16b},[x1],16
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+ rev32 v28.16b, v28.16b
+ sub x5,x5,#12
+ mov v29.b[7], w6
+ cbz x5,.Lsha1_final
+ mov v29.b[7], wzr
+ ld1 {v29.s}[0],[x1],4
+ rev32 v29.16b,v29.16b
+ mov v29.b[7], w6
+ b .Lsha1_final
+1:
+ /* Are there 2 16B blocks? */
+ cmp x5, #8
+ b.lo 2f
+ ld1 {v26.16b},[x1],16
+ ld1 {v27.16b},[x1],16
+ rev32 v26.16b,v26.16b
+ rev32 v27.16b,v27.16b
+ sub x5,x5,#8
+ mov v28.b[7], w6
+ cbz x5,.Lsha1_final
+ mov v28.b[7], wzr
+ ld1 {v28.s}[0],[x1],4
+ rev32 v28.16b,v28.16b
+ mov v28.b[7], w6
+ b .Lsha1_final
+2:
+ /* Is there 1 16B block? */
+ cmp x5, #4
+ b.lo 3f
+ ld1 {v26.16b},[x1],16
+ rev32 v26.16b,v26.16b
+ sub x5,x5,#4
+ mov v27.b[7], w6
+ cbz x5,.Lsha1_final
+ mov v27.b[7], wzr
+ ld1 {v27.s}[0],[x1],4
+ rev32 v27.16b,v27.16b
+ mov v27.b[7], w6
+ b .Lsha1_final
+3:
+ ld1 {v26.s}[0],[x1],4
+ rev32 v26.16b,v26.16b
+ mov v26.b[7], w6
+
+.Lsha1_final:
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+/* quad 0 */
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s25,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v4.4s,v27.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v4.4s,v28.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v4.4s,v29.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v5.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v5.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v5.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v6.4s,v29.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v6.4s,v26.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v6.4s,v27.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v7.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v16.4s,v7.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v25.4s,v25.4s,v18.4s
+ add v24.4s,v24.4s,v22.4s
+
+ rev32 v24.16b,v24.16b
+ rev32 v25.16b,v25.16b
+
+ st1 {v24.16b}, [x2],16
+ st1 {v25.s}[0], [x2]
+
+ mov x0, xzr
+ ret
+
+.Lsha1_error:
+ mov x0, #-1
+ ret
+
+ .size sha1_block_partial, .-sha1_block_partial
+ .size sha1_block, .-sha1_block
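A short caller sketch (not part of the patch) for the two entry points above,
following the prototypes declared in the header added earlier in this patch: a
NULL init pointer selects the default initial state, sha1_block_partial expects
a length that is a multiple of 64 and stores a 32-byte intermediate state, and
sha1_block writes the final 20-byte digest. Buffer contents are sample data.

    #include <stdint.h>
    #include <string.h>

    /* Prototypes as declared by the header added earlier in this patch. */
    int sha1_block_partial(uint8_t *init, const uint8_t *src, uint8_t *dst, uint64_t len);
    int sha1_block(uint8_t *init, const uint8_t *src, uint8_t *dst, uint64_t len);

    int main(void)
    {
        static uint8_t msg[128];  /* multiple of 64 for the partial variant */
        uint8_t state[32];        /* intermediate state: ABCD + E, padded */
        uint8_t digest[20];       /* final SHA-1 digest */

        memset(msg, 0x5a, sizeof(msg));

        /* partial hash: no final padding block; state can be fed back as init */
        if (sha1_block_partial(NULL, msg, state, sizeof(msg)) != 0)
            return -1;

        /* complete hash of the same buffer with the default initial state */
        if (sha1_block(NULL, msg, digest, sizeof(msg)) != 0)
            return -1;

        return 0;
    }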
new file mode 100644
@@ -0,0 +1,1598 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Auth/Dec Primitive = sha1_hmac/aes128cbc
+ *
+ * Operations:
+ *
+ * out = decrypt-AES128CBC(in)
+ * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | in))
+ *
+ * Prototype:
+ *
+ * void sha1_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * sha1_hmac_aes128cbc_dec(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 -- temp register for SHA1
+ * v20 -- ABCD copy (q20)
+ * v21 -- sha working state (q21)
+ * v22 -- sha working state (q22)
+ * v23 -- temp register for SHA1
+ * v24 -- sha state ABCD
+ * v25 -- sha state E
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16, otherwise the results are
+ * undefined. For partial AES blocks the caller must pad the input so that
+ * len is a multiple of 16.
+ *
+ * Short lengths (< 16 AES blocks) take a less optimized path, although it is
+ * still somewhat optimized, and more so than in the enc/auth versions.
+ */
+ .file "sha1_hmac_aes128cbc_dec.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global sha1_hmac_aes128cbc_dec
+ .type sha1_hmac_aes128cbc_dec,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+sha1_hmac_aes128cbc_dec:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+	ld1 {v24.4s, v25.4s},[x6]	/* init ABCD, E (2 cycs) */
+ ldr x6, [x5, #HMAC_OKEYPAD] /* save pointer to o_key_pad partial hash */
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next *in */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,16 /* no main loop if <16 */
+	blt	.Lshort_cases		/* branch if < 16 */
+
+/* protect registers */
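+/* v8-v15 must be preserved across calls (AAPCS64 callee-saved), hence the spill below */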
+ sub sp,sp,8*16
+ mov x11,x4 /* len -> x11 needed at end */
+ mov x7,sp /* copy for address mode */
+ ld1 {v30.16b},[x5] /* get 1st ivec */
+ lsr x12,x11,6 /* total_blocks (sha) */
+ mov x4,x0 /* sha_ptr_in = *in */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+/*
+ * now we can do the loop prolog, 1st sha1 block
+ */
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+
+ adr x8,.Lrcon /* base address for sha round consts */
+/*
+ * do the first sha1 block on the input data
+ */
+ mov v20.16b,v24.16b /* init working ABCD */
+ st1 {v8.16b},[x7],16
+ st1 {v9.16b},[x7],16
+ rev32 v26.16b,v26.16b /* endian swap w0 */
+ st1 {v10.16b},[x7],16
+ rev32 v27.16b,v27.16b /* endian swap w1 */
+ st1 {v11.16b},[x7],16
+ rev32 v28.16b,v28.16b /* endian swap w2 */
+ st1 {v12.16b},[x7],16
+ rev32 v29.16b,v29.16b /* endian swap w3 */
+ st1 {v13.16b},[x7],16
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ add v19.4s,v4.4s,v26.4s
+ st1 {v14.16b},[x7],16
+ add v23.4s,v4.4s,v27.4s
+ st1 {v15.16b},[x7],16
+/* quad 0 */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ sha1c q24,s25,v19.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+ sha1c q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v4.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ sha1c q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ sha1c q24,s21,v19.4s
+ add v19.4s,v5.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+/* quad 1 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ sha1p q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v5.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ sha1p q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ sha1p q24,s21,v19.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ sha1p q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v23.4s,v6.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ add v19.4s,v6.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ sha1p q24,s21,v19.4s
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ sha1p q24,s22,v23.4s
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ sha1p q24,s21,v19.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ sha1p q24,s22,v23.4s
+
+/*
+ * aes_blocks_left := the number of aes blocks left after the main (sha) loop is done.
+ * It can be 0; note that we account for the extra unwind in main_blocks.
+ */
+	sub	x7,x12,2		/* main_blocks = total_blocks - 2 */
+ add v24.4s,v24.4s,v20.4s
+ and x13,x10,3 /* aes_blocks_left */
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ add v25.4s,v25.4s,v21.4s
+ add x2,x0,128 /* lead_ptr = *in */
+ ld1 {v31.16b},[x0],16 /* next aes block, update aes_ptr_in */
+
+/*
+ * main combined loop CBC, can be used by auth/enc version
+ */
+.Lmain_loop:
+/*
+ * Because mov, rev32 and eor each have a busy cycle, this takes longer than it looks.
+ * The v0 loads have been hoisted, but there is still no way to hide the required
+ * latency of the sha-associated instructions. It is a perfect example of why putting
+ * too much time into an NP-hard scheduling problem can be a mistake, even if it looks
+ * like a reasonable thing on the surface.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+/* aes xform 0, sha quad 0 */
+ aesd v0.16b,v8.16b
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ add v19.4s,v4.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v0.16b,v10.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ add v23.4s,v4.4s,v27.4s
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesimc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aesd v0.16b,v12.16b
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v15.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aesd v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* get next aes block, with update */
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v1.16b,v8.16b
+ sha1h s21,s24
+ add v19.4s,v5.4s,v28.4s
+ sha1p q24,s22,v23.4s
+ aesimc v1.16b,v1.16b
+ sha1su1 v27.4s,v26.4s
+ aesd v1.16b,v9.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v10.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v28.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1h s21,s24
+ aesd v1.16b,v12.16b
+ sha1p q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ sha1h s22,s24
+ add v19.4s,v5.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v14.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v16.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v6.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v23.4s,v6.4s,v29.4s
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+/* aes xform 2, sha quad 2 */
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v9.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v2.16b,v10.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v11.16b
+ sha1su1 v29.4s,v28.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v13.16b
+ sha1su1 v26.4s,v29.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesd v2.16b,v14.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v15.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v6.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ sha1h s22,s24
+ aesd v2.16b,v16.16b
+ sha1m q24,s21,v19.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v2.16b,v17.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v7.4s,v29.4s
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v19.4s,v7.4s,v26.4s
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+/* aes xform 3, sha quad 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v3.16b,v9.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v10.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v3.16b,v3.16b
+ sha1su1 v29.4s,v28.4s
+ aesd v3.16b,v11.16b
+ sha1h s22,s24
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aesd v3.16b,v13.16b
+ sha1h s21,s24
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesimc v3.16b,v3.16b
+ add v19.4s,v7.4s,v28.4s
+ aesd v3.16b,v15.16b
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ sha1h s22,s24
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v29.4s
+ aesd v3.16b,v17.16b
+ sha1h s21,s24
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ sha1p q24,s22,v23.4s
+ add v24.4s,v24.4s,v20.4s
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* next aes block, update aes_ptr_in */
+ add v25.4s,v25.4s,v21.4s
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbnz x7,.Lmain_loop /* loop if more to do */
+/*
+ * Now the loop epilog. Since the reads for sha have already been done in advance, we
+ * need an extra unwind; this is why the test for the short cases is 16 and not 12.
+ *
+ * The unwind is just the main loop body without the tests or final reads.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+/* aes xform 0, sha quad 0 */
+ aesd v0.16b,v8.16b
+ add v19.4s,v4.4s,v26.4s
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesd v0.16b,v9.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ add v23.4s,v4.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aesd v0.16b,v11.16b
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ aesimc v0.16b,v0.16b
+ sha1su1 v26.4s,v29.4s
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v15.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aesd v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ add v23.4s,v5.4s,v27.4s
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v1.16b,v8.16b
+ sha1h s21,s24
+ add v19.4s,v5.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesd v1.16b,v10.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ aesimc v1.16b,v1.16b
+ sha1h s22,s24
+ aesd v1.16b,v11.16b
+ sha1p q24,s21,v19.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v1.16b,v13.16b
+ sha1h s21,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v14.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v23.4s,v5.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v1.16b,v16.16b
+ sha1h s22,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v17.16b
+ add v19.4s,v6.4s,v28.4s
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ sha1su1 v26.4s,v29.4s
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/ prev value */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ add v23.4s,v6.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* mode op 2 */
+/* aes xform 2, sha quad 2 */
+ aesd v2.16b,v8.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ add v19.4s,v6.4s,v26.4s
+ aesd v2.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v13.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesd v2.16b,v14.16b
+ add v23.4s,v6.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v2.16b,v15.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v16.16b
+ add v19.4s,v6.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ sha1su1 v26.4s,v29.4s
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+/* mode op 3 */
+/* aes xform 3, sha quad 3 */
+ aesd v3.16b,v8.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v3.16b,v9.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v10.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v3.16b,v3.16b
+ add v19.4s,v7.4s,v26.4s
+ aesd v3.16b,v11.16b
+ sha1h s22,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v12.16b
+ ld1 {v0.16b},[x0] /* read first aes block, no bump */
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aesd v3.16b,v13.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ add v19.4s,v7.4s,v28.4s
+ aesd v3.16b,v14.16b
+ sha1h s22,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v15.16b
+ add v23.4s,v7.4s,v29.4s
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+/*
+ * now we have to do the 4 aes blocks (b-2) that catch up to where sha is
+ */
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+ eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * Now handle the final (b-1) padded sha1 block, which contains between 0 and 3 aes blocks.
+ * We take some pains to avoid read spill by only reading the blocks that are actually defined.
+ * This is also the final sha block code for the short cases.
+ */
+.Ljoin_common:
+ mov w15,0x80 /* that's the 1 of the pad */
+ cbnz x13,.Lpad100 /* branch if there is some real data */
+ eor v26.16b,v26.16b,v26.16b /* zero the rest */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v26.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad100:
+ sub x14,x13,1 /* dec amount left */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ cbnz x14,.Lpad200 /* branch if there is some real data */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v27.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad200:
+ sub x14,x14,1 /* dec amount left */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ cbnz x14,.Lpad300 /* branch if there is some real data */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v28.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad300:
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v29.b[3],w15 /* all data is bogus */
+
+.Lpad_done:
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x14,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x14,x14,3 /* len_lo in bits */
+
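+	/*
+	 * Per the SHA1 padding rule, the 64-bit message bit-length occupies
+	 * the last two words of the final block (the top of v29 here).
+	 */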
+ mov v29.s[3],w14 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+/*
+ * final sha block
+ * the strategy is to combine the 0-3 aes blocks, which is faster but
+ * a little gourmand on code space.
+ */
+ cbz x13,.Lzero_aes_blocks_left /* none to do */
+ ld1 {v0.16b},[x0] /* read first aes block, bump aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ add v19.4s,v4.4s,v26.4s
+ aesd v0.16b,v10.16b
+ add v23.4s,v4.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v12.16b
+ sha1c q24,s25,v19.4s
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v4.4s,v28.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v15.16b
+ sha1c q24,s21,v19.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v4.4s,v29.4s
+ aesd v0.16b,v16.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v17.16b
+ sha1su1 v29.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ sub x13,x13,1 /* dec counter */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbz x13,.Lfrmquad1
+
+/* aes xform 1 */
+ ld1 {v0.16b},[x0] /* read first aes block, bump aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ add v23.4s,v5.4s,v27.4s
+ aesd v0.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s22,v23.4s
+ aesd v0.16b,v11.16b
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v12.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s21,v19.4s
+ aesd v0.16b,v13.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v5.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v0.16b,v14.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s22,v23.4s
+ aesd v0.16b,v15.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v0.16b,v16.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s21,v19.4s
+ aesd v0.16b,v17.16b
+ sha1su1 v26.4s,v29.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ sub x13,x13,1 /* dec counter */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbz x13,.Lfrmquad2
+
+/* aes xform 2 */
+ ld1 {v0.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+ add v19.4s,v6.4s,v28.4s
+ aesd v0.16b,v8.16b
+ add v23.4s,v6.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v10.16b
+ sha1m q24,s21,v19.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v28.4s,v27.4s
+ aesd v0.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v12.16b
+ sha1m q24,s22,v23.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v29.4s,v28.4s
+ aesd v0.16b,v13.16b
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1m q24,s21,v19.4s
+ aesd v0.16b,v15.16b
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ add v23.4s,v6.4s,v27.4s
+ aesd v0.16b,v16.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1m q24,s22,v23.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ sha1su1 v27.4s,v26.4s
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ b .Lfrmquad3
+/*
+ * the final sha block with no aes component, i.e. from here on there are zero aes blocks left
+ */
+
+.Lzero_aes_blocks_left:
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+/* quad 1 */
+.Lfrmquad1:
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+/* quad 2 */
+.Lfrmquad2:
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+/* quad 3 */
+.Lfrmquad3:
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+	add	v26.4s,v24.4s,v20.4s	/* inner hash ABCD -> w0 of the outer block */
+	add	v27.4s,v25.4s,v21.4s	/* inner hash E -> w1 of the outer block */
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ ld1 {v24.16b,v25.16b}, [x6] /* load o_key_pad partial hash */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+	/* Set the padding 1 (0x80) just after the 20-byte inner digest */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
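+	/*
+	 * The outer hash covers the 64-byte o_key_pad block (already absorbed into the
+	 * partial hash loaded above) plus the 20-byte inner digest, so the length
+	 * field is (64 + 20) * 8 bits.
+	 */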
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ mov v29.s[3], w11 /* move length to the end of the block */
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
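+	/* store the 20-byte HMAC-SHA1 result: 16 bytes of ABCD then 4 bytes of E */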
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v30.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+ mov x4,x0 /* sha_ptr_in = in */
+
+ mov x9,x8 /* top of rcon */
+
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+/*
+ * This loop does 4 at a time, so that at the end there is a final sha block and 0-3 aes blocks.
+ * Note that everything is done serially to avoid complication.
+ */
+.Lshort_loop:
+ cmp x10,4 /* check if 4 or more */
+ blt .Llast_sha_block /* if less, bail to last block */
+
+ ld1 {v31.16b},[x4] /* next w no update */
+ ld1 {v0.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v26.16b,v0.16b /* endian swap for sha */
+ add x0,x0,64
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ ld1 {v1.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v27.16b,v1.16b /* endian swap for sha */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */
+
+ ld1 {v31.16b},[x4] /* read no update */
+ ld1 {v2.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v28.16b,v2.16b /* endian swap for sha */
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ ld1 {v3.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v29.16b,v3.16b /* endian swap for sha */
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+/*
+ * now we have the sha1 to do for these 4 aes blocks
+ */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/* quad 0 */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+/* quad 3 */
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ sub x10,x10,4 /* 4 less */
+ b .Lshort_loop /* keep looping */
+/*
+ * this is arranged so that we can join the common unwind code that does the last
+ * sha block and the final 0-3 aes blocks
+ */
+.Llast_sha_block:
+ mov x13,x10 /* copy aes blocks for common */
+ b .Ljoin_common /* join common code */
+
+ .size sha1_hmac_aes128cbc_dec, .-sha1_hmac_aes128cbc_dec
new file mode 100644
@@ -0,0 +1,1619 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Auth/Dec Primitive = sha256/aes128cbc
+ *
+ * Operations:
+ *
+ * out = decrypt-AES128CBC(in)
+ * return_hash_ptr = SHA256(in)
+ *
+ * Prototype:
+ *
+ * void sha256_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * sha256_aes128cbc_dec(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 - v20 -- round keys
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- regShaStateABCD
+ * v25 -- regShaStateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16, otherwise results are not defined.
+ * For AES partial blocks the user is required to pad the input to modulus 16 = 0.
+ *
+ * Short lengths are less optimized at < 16 AES blocks, however they are somewhat optimized,
+ * and more so than the enc/auth versions.
+ */
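+
+/*
+ * Illustrative C-level sketch of the operation above (not built; the helper
+ * aes128cbc_decrypt() is hypothetical, used only to summarize the data flow):
+ *
+ *    aes128cbc_decrypt(arg->cipher.key, arg->cipher.iv, csrc, cdst, len);
+ *    memcpy(ddst, SHA256(csrc, len), 32);    // plain 32-byte digest, no HMAC
+ */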
+ .file "sha256_aes128cbc_dec.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global sha256_aes128cbc_dec
+ .type sha256_aes128cbc_dec,%function
+
+
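+/* the 64 SHA256 round constants K[0..63] */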
+ .align 4
+.Lrcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
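+/* standard SHA256 initial hash values H0..H7 */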
+.Linit_sha_state:
+ .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+sha256_aes128cbc_dec:
+/* fetch args */
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next *in */
+ adr x12,.Linit_sha_state /* address of sha init state consts */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,16 /* no main loop if <16 */
+ ld1 {v24.4s, v25.4s},[x12] /* init ABCD, EFGH. (2 cycs) */
+	blt	.Lshort_cases		/* branch if < 16 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x11,x4 /* len -> x11 needed at end */
+ mov x7,sp /* copy for address mode */
+ ld1 {v30.16b},[x5] /* get 1st ivec */
+ lsr x12,x11,6 /* total_blocks (sha) */
+ mov x4,x0 /* sha_ptr_in = *in */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+/*
+ * now we can do the loop prolog, 1st sha256 block
+ */
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+
+ adr x8,.Lrcon /* base address for sha round consts */
+/*
+ * do the first sha256 block on the input data
+ */
+ mov v22.16b,v24.16b /* init working ABCD */
+ st1 {v8.16b},[x7],16
+ mov v23.16b,v25.16b /* init working EFGH */
+ st1 {v9.16b},[x7],16
+
+ rev32 v26.16b,v26.16b /* endian swap w0 */
+ st1 {v10.16b},[x7],16
+ rev32 v27.16b,v27.16b /* endian swap w1 */
+ st1 {v11.16b},[x7],16
+ rev32 v28.16b,v28.16b /* endian swap w2 */
+ st1 {v12.16b},[x7],16
+ rev32 v29.16b,v29.16b /* endian swap w3 */
+ st1 {v13.16b},[x7],16
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ st1 {v14.16b},[x7],16
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ st1 {v15.16b},[x7],16
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+/*
+ * aes_blocks_left := the number of aes blocks left after the main (sha) loop is done.
+ * It can be 0; note that we account for the extra unwind in main_blocks.
+ */
+	sub	x7,x12,2		/* main_blocks = total_blocks - 2 */
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ and x13,x10,3 /* aes_blocks_left */
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+ add x2,x0,128 /* lead_ptr = *in */
+ ld1 {v31.16b},[x0],16 /* next aes block, update aes_ptr_in */
+
+/*
+ * main combined loop CBC, can be used by auth/enc version
+ */
+.Lmain_loop:
+
+/*
+ * Because mov, rev32 and eor each have a busy cycle, this takes longer than it looks.
+ * The v0 loads have been hoisted, but there is still no way to hide the required
+ * latency of the sha-associated instructions. It is a perfect example of why putting
+ * too much time into an NP-hard scheduling problem can be a mistake, even if it looks
+ * like a reasonable thing on the surface.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov x9,x8 /* top of rcon */
+
+/*
+ * aes xform 0, sha quad 0
+ */
+ aesd v0.16b,v8.16b
+ ld1 {v4.16b},[x9],16 /* key0 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aesd v0.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ ld1 {v6.16b},[x9],16 /* key2 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v0.16b,v12.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v13.16b
+ sha256h q22, q23, v5.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v14.16b
+ ld1 {v5.16b},[x9],16 /* key5 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aesd v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v16.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd (1 cyc stall on v22) */
+ sha256su0 v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aesd v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* get next aes block, with update */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+
+/* aes xform 1, sha quad 1 */
+ sha256su0 v26.4s,v27.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ mov v21.16b, v22.16b /* copy abcd */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v1.16b,v8.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ aesimc v1.16b,v1.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesd v1.16b,v9.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v1.16b,v10.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesd v1.16b,v11.16b
+ ld1 {v5.16b},[x9],16 /* key5 (extra stall from mov) */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ sha256h q22, q23, v6.4s
+ aesd v1.16b,v12.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha256su0 v29.4s,v26.4s
+ aesd v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v1.16b,v14.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* aes xform 2, sha quad 2 */
+
+ sha256su0 v26.4s,v27.4s
+ aesd v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v2.16b,v9.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v2.16b,v2.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v2.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v2.16b,v11.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v2.16b,v13.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ aesimc v2.16b,v2.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesd v2.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v2.16b,v15.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ aesimc v2.16b,v2.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* aes xform 3, sha quad 3 (hash only) */
+
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v3.16b,v9.16b
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v3.16b,v10.16b
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* next aes block, update aes_ptr_in */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbnz x7,.Lmain_loop /* loop if more to do */
+/*
+ * Now the loop epilog. Since the reads for sha have already been done in advance, we
+ * need an extra unwind; this is why the test for the short cases is 16 and not 12.
+ *
+ * The unwind is just the main loop body without the tests or final reads.
+ */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+
+/*
+ * aes xform 0, sha quad 0
+ */
+ aesd v0.16b,v8.16b
+ ld1 {v6.16b},[x9],16 /* key2 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ aesimc v0.16b,v0.16b
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesd v0.16b,v9.16b
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aesd v0.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v0.16b,v12.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v13.16b
+ sha256h q22, q23, v5.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v14.16b
+ ld1 {v5.16b},[x9],16 /* key5 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aesd v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v16.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd (1 cyc stall on v22) */
+ sha256su0 v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aesd v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+
+/* aes xform 1, sha quad 1 */
+ sha256su0 v26.4s,v27.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ mov v21.16b, v22.16b /* copy abcd */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v1.16b,v8.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ aesimc v1.16b,v1.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesd v1.16b,v9.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v1.16b,v10.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesd v1.16b,v11.16b
+ ld1 {v5.16b},[x9],16 /* key5 (extra stall from mov) */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ sha256h q22, q23, v6.4s
+ aesd v1.16b,v12.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha256su0 v29.4s,v26.4s
+ aesd v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v1.16b,v14.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* mode op 2 */
+
+/* aes xform 2, sha quad 2 */
+
+ sha256su0 v26.4s,v27.4s
+ aesd v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v2.16b,v9.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v2.16b,v2.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v2.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v2.16b,v11.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v2.16b,v13.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ aesimc v2.16b,v2.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesd v2.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v2.16b,v15.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ aesimc v2.16b,v2.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* mode op 3 */
+
+/* aes xform 3, sha quad 3 (hash only) */
+
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v3.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v3.16b,v12.16b
+ ld1 {v0.16b},[x0] /* read first aes block, no bump */
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+
+
+/*
+ * now we have to do the 4 aes blocks (b-2) that catch up to where sha is
+ */
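+
+/*
+ * Informational note (not extra code): the sha side is one 64B block ahead
+ * of the aes side at this point because the w0-w3 reads were done in
+ * advance, so these four transforms have no sha work left to interleave.
+ */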
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+ eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * Now there is the final b-1 sha256 padded block, which contains between
+ * 0 and 3 aes blocks. We take some pains to avoid a read spill by only
+ * reading the blocks that are actually defined. This is also the final
+ * sha block code for the short cases.
+ */
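+
+/*
+ * For reference, the sha256 padding produced below (a sketch only): the
+ * 0x80 pad byte follows the last data byte, the rest of the 64B block is
+ * zero, and the total message length in bits fills the final 8 bytes:
+ *
+ *	| data (x13 * 16 bytes) | 0x80 | 00 ... 00 | len_hi | len_lo |
+ *
+ * x13 holds the number of remaining 16B words that contain real data, so
+ * only those words are read from memory; the rest are zeroed in registers.
+ */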
+.Ljoin_common:
+ mov w15,0x80 /* that's the 1 of the pad */
+ cbnz x13,.Lpad100 /* branch if there is some real data */
+ eor v26.16b,v26.16b,v26.16b /* zero the rest */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v26.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad100:
+ sub x14,x13,1 /* dec amount left */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ cbnz x14,.Lpad200 /* branch if there is some real data */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v27.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad200:
+ sub x14,x14,1 /* dec amount left */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ cbnz x14,.Lpad300 /* branch if there is some real data */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v28.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad300:
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v29.b[3],w15 /* all data is bogus */
+
+.Lpad_done:
+ lsr x12,x11,32 /* len_hi */
+ and x14,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x14,x14,3 /* len_lo in bits */
+
+ mov v29.s[3],w14 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+/*
+ * final sha block
+ * the strategy is to combine the 0-3 aes blocks, which is faster but
+ * a little gourmand on code space.
+ */
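+
+/*
+ * Informational note: the entry point into the hash-only quads depends on
+ * x13. Each of the 0-3 remaining aes blocks is fused with one sha quad
+ * below, and .Lfrmquad1/2/3 pick up whatever quads are left as hash only.
+ */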
+ cbz x13,.Lzero_aes_blocks_left /* none to do */
+ ld1 {v0.16b},[x0] /* read first aes block, bump aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ aesd v0.16b,v8.16b
+ ld1 {v7.16b},[x9],16 /* key3 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aesd v0.16b,v10.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ aesimc v0.16b,v0.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ aesd v0.16b,v11.16b
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v4.4s
+ aesd v0.16b,v12.16b
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v14.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v6.4s
+ aesd v0.16b,v15.16b
+ sha256h2 q23, q21, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ aesd v0.16b,v16.16b
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v0.16b,v17.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+
+ sub x13,x13,1 /* dec counter */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbz x13,.Lfrmquad1
+
+/* aes xform 1 */
+
+ ld1 {v0.16b},[x0] /* read first aes block, bump aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ aesd v0.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v0.16b,v0.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ aesd v0.16b,v9.16b
+ sha256su0 v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ aesd v0.16b,v10.16b
+ sha256h q22, q23, v4.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v0.16b,v11.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v13.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aesd v0.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v15.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+
+ sha256su0 v29.4s,v26.4s
+ aesd v0.16b,v16.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v0.16b,v17.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+
+ sub x13,x13,1 /* dec counter */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbz x13,.Lfrmquad2
+
+/* aes xform 2 */
+
+ ld1 {v0.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ aesd v0.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v0.16b,v0.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ aesd v0.16b,v9.16b
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v4.4s
+ aesd v0.16b,v10.16b
+ sha256h2 q23, q21, v4.4s
+ aesimc v0.16b,v0.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesd v0.16b,v11.16b
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v5.4s
+ aesd v0.16b,v12.16b
+ sha256h2 q23, q21, v5.4s
+ aesimc v0.16b,v0.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesd v0.16b,v13.16b
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ aesd v0.16b,v14.16b
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v15.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+
+ aesd v0.16b,v16.16b
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aesd v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ b .Lfrmquad3
+/*
+ * The final block with no aes component, i.e. from here on there are zero
+ * aes blocks left.
+ */
+
+.Lzero_aes_blocks_left:
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+.Lfrmquad1:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+.Lfrmquad2:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+.Lfrmquad3:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+/*
+ * now we just have to put this into big endian and store! and clean up stack...
+ */
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ rev32 v24.16b,v24.16b /* big endian ABCD */
+ ld1 {v12.16b - v15.16b},[x9]
+ rev32 v25.16b,v25.16b /* big endian EFGH */
+
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+ ret
+
+/*
+ * These are the short cases (less efficient), used when there are fewer
+ * than 16 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v30.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+ mov x4,x0 /* sha_ptr_in = in */
+
+/*
+ * This loop does 4 at a time, so that at the end there is a final sha block and 0-3 aes blocks
+ * Note that everything is done serially to avoid complication.
+ */
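+
+/*
+ * Rough equivalent of this path (illustrative pseudocode only):
+ *
+ *	while (aes_blocks >= 4) {
+ *		read 4 ciphertext blocks, keep endian-swapped copies for sha;
+ *		aes-cbc decrypt each block serially (xform 0..3);
+ *		run one sha256 block over the 4 input blocks;
+ *		aes_blocks -= 4;
+ *	}
+ *	fall into the common tail for the final padded sha block
+ *	and the remaining 0-3 aes blocks
+ */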
+.Lshort_loop:
+ cmp x10,4 /* check if 4 or more */
+ blt .Llast_sha_block /* if less, bail to last block */
+
+ ld1 {v31.16b},[x4] /* next w no update */
+ ld1 {v0.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v26.16b,v0.16b /* endian swap for sha */
+	add	x0,x0,64		/* bump aes_ptr_in to track the 4 blocks read via x4 */
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ ld1 {v1.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v27.16b,v1.16b /* endian swap for sha */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */
+
+ ld1 {v31.16b},[x4] /* read no update */
+ ld1 {v2.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v28.16b,v2.16b /* endian swap for sha */
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ ld1 {v3.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v29.16b,v3.16b /* endian swap for sha */
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+
+/*
+ * now we have the sha256 to do for these 4 aes blocks
+ */
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* quad 0 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ sub x10,x10,4 /* 4 less */
+ b .Lshort_loop /* keep looping */
+/*
+ * this is arranged so that we can join the common unwind code that does the last
+ * sha block and the final 0-3 aes blocks
+ */
+.Llast_sha_block:
+ mov x13,x10 /* copy aes blocks for common */
+ b .Ljoin_common /* join common code */
+
+ .size sha256_aes128cbc_dec, .-sha256_aes128cbc_dec
new file mode 100644
@@ -0,0 +1,519 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-2 Primitives
+ *
+ * Operations:
+ * sha256_block_partial:
+ * out = partial_sha256(init, in, len) <- no final block
+ *
+ * sha256_block:
+ * out = sha256(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha256_block_partial(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha256_block(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha256_block_partial(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * sha256_block(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16 (a multiple of 64 for
+ * sha256_block_partial), otherwise an error code is returned.
+ *
+ */
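+
+/*
+ * Illustrative call sequence (a sketch only; the buffer names are
+ * examples, not part of this file). The partial variant consumes whole
+ * 64B blocks and stores the raw intermediate state to ddst, which can
+ * then be passed back as "init" to either routine:
+ *
+ *	uint8_t state[32], digest[32];
+ *
+ *	sha256_block_partial(NULL, key_pad, state, 64);
+ *	sha256_block(state, msg, digest, msg_len);   - msg_len % 16 == 0
+ */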
+ .file "sha256_core.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .align 4
+ .global sha256_block_partial
+ .type sha256_block_partial,%function
+ .global sha256_block
+ .type sha256_block,%function
+
+ .align 4
+.Lrcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+ .align 4
+.Linit_sha_state:
+ .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+ .align 4
+
+sha256_block_partial:
+ mov x6, #1 /* indicate partial hash */
+	ands	x5, x3, #0x3f		/* check len is a multiple of one 64B SHA block */
+ b.ne .Lsha256_error
+ cbnz x0, 1f
+ adr x0,.Linit_sha_state /* address of sha init state consts */
+1:
+ ld1 {v24.4s, v25.4s},[x0] /* init ABCD, EFGH. (2 cycs) */
+ lsr x5, x3, 4 /* number of 16B blocks (will be at least 4) */
+ b .Lsha256_loop
+
+sha256_block:
+ mov x6, xzr /* indicate full hash */
+ ands x5, x3, #0xf /* Check size mod 16B block */
+ b.ne .Lsha256_error
+ cbnz x0, 1f
+ adr x0,.Linit_sha_state /* address of sha init state consts */
+1:
+ ld1 {v24.4s, v25.4s},[x0] /* init ABCD, EFGH. (2 cycs) */
+ lsr x5, x3, 4 /* number of 16B blocks */
+ cmp x5, #4 /* at least 4 16B blocks give 1 SHA block */
+ b.lo .Lsha256_last
+
+ .align 4
+.Lsha256_loop:
+	sub	x5, x5, #4		/* subtract 1 SHA block */
+ adr x4,.Lrcon
+
+ ld1 {v26.16b},[x1],16 /* dsrc[0] */
+ ld1 {v27.16b},[x1],16 /* dsrc[1] */
+ ld1 {v28.16b},[x1],16 /* dsrc[2] */
+ ld1 {v29.16b},[x1],16 /* dsrc[3] */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key4 */
+ ld1 {v5.16b},[x4],16 /* key5 */
+ ld1 {v6.16b},[x4],16 /* key6 */
+ ld1 {v7.16b},[x4],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key8 */
+ ld1 {v5.16b},[x4],16 /* key9 */
+ ld1 {v6.16b},[x4],16 /* key10 */
+ ld1 {v7.16b},[x4],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key12 */
+ ld1 {v5.16b},[x4],16 /* key13 */
+ ld1 {v6.16b},[x4],16 /* key14 */
+ ld1 {v7.16b},[x4],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ cmp x5, #4
+ b.hs .Lsha256_loop
+
+ /* Store partial hash and return or complete hash */
+ cbz x6, .Lsha256_last
+
+ st1 {v24.16b, v25.16b}, [x2]
+
+ mov x0, xzr
+ ret
+
+ /*
+ * Last block with padding. v24-v25 contain hash state.
+ */
+.Lsha256_last:
+ eor v26.16b, v26.16b, v26.16b
+ eor v27.16b, v27.16b, v27.16b
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x4,.Lrcon
+ lsl x3, x3, 3
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+ /* Fill out the first vector register and the end of the block */
+ mov v29.s[3], w3 /* move length to the end of the block */
+ lsr x3, x3, 32
+ mov v29.s[2], w3 /* and the higher part */
+ /* Set padding 1 to the first reg */
+ mov w6, #0x80 /* that's the 1 of the pad */
+ mov v26.b[3], w6
+ cbz x5,.Lsha256_final
+
+ sub x5, x5, #1
+ mov v27.16b, v26.16b
+ ld1 {v26.16b},[x1],16
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ cbz x5,.Lsha256_final
+
+ sub x5, x5, #1
+ mov v28.16b, v27.16b
+ ld1 {v27.16b},[x1],16
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ cbz x5,.Lsha256_final
+
+ mov v29.b[0], w6
+ ld1 {v28.16b},[x1],16
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+.Lsha256_final:
+
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key4 */
+ ld1 {v5.16b},[x4],16 /* key5 */
+ ld1 {v6.16b},[x4],16 /* key6 */
+ ld1 {v7.16b},[x4],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key8 */
+ ld1 {v5.16b},[x4],16 /* key9 */
+ ld1 {v6.16b},[x4],16 /* key10 */
+ ld1 {v7.16b},[x4],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key12 */
+ ld1 {v5.16b},[x4],16 /* key13 */
+ ld1 {v6.16b},[x4],16 /* key14 */
+ ld1 {v7.16b},[x4],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ st1 {v24.4s,v25.4s},[x2] /* save them both */
+
+ mov x0, xzr
+ ret
+
+.Lsha256_error:
+ mov x0, #-1
+ ret
+
+ .size sha256_block_partial, .-sha256_block_partial
new file mode 100644
@@ -0,0 +1,1791 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Auth/Dec Primitive = sha256_hmac/aes128cbc
+ *
+ * Operations:
+ *
+ * out = decrypt-AES128CBC(in)
+ * return_hash_ptr = SHA256(o_key_pad | SHA256(i_key_pad | in))
+ *
+ * Prototype:
+ *
+ * void sha256_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * sha256_hmac_aes128cbc_dec(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 - v20 -- round keys
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- sha state ABCD
+ * v25 -- sha state EFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16, otherwise the results are
+ * undefined. For partial AES blocks the user is required to pad the input
+ * so that len modulo 16 is 0.
+ *
+ * Short lengths (< 16 AES blocks) take a less optimized path, though they
+ * are still somewhat optimized, and more so than in the enc/auth version.
+ */
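+
+/*
+ * Conceptually the digest side is plain HMAC-SHA256, with the one-block
+ * hashes of i_key_pad and o_key_pad precomputed once per session
+ * (illustrative pseudocode only; not part of this file):
+ *
+ *	inner_state = sha256_partial(i_key_pad)      - done at session setup
+ *	outer_state = sha256_partial(o_key_pad)      - done at session setup
+ *	inner_hash  = sha256_finish(inner_state, in)
+ *	digest      = sha256_finish(outer_state, inner_hash)
+ *
+ * so only the two partial states need to be passed in via "arg".
+ */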
+ .file "sha256_hmac_aes128cbc_dec.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global sha256_hmac_aes128cbc_dec
+ .type sha256_hmac_aes128cbc_dec,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+.Linit_sha_state:
+ .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+sha256_hmac_aes128cbc_dec:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ ld1 {v24.4s, v25.4s},[x6] /* init ABCD, EFGH. (2 cycs) */
+ ldr x6, [x5, #HMAC_OKEYPAD] /* save pointer to o_key_pad partial hash */
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next *in */
+ adr x12,.Linit_sha_state /* address of sha init state consts */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,16 /* no main loop if <16 */
+	blt	.Lshort_cases		/* branch if < 16 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x11,x4 /* len -> x11 needed at end */
+ mov x7,sp /* copy for address mode */
+ ld1 {v30.16b},[x5] /* get 1st ivec */
+ lsr x12,x11,6 /* total_blocks (sha) */
+ mov x4,x0 /* sha_ptr_in = *in */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+/*
+ * now we can do the loop prolog, 1st sha256 block
+ */
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+
+ adr x8,.Lrcon /* base address for sha round consts */
+/*
+ * do the first sha256 block on the plaintext
+ */
+
+ mov v22.16b,v24.16b /* init working ABCD */
+ st1 {v8.16b},[x7],16
+ mov v23.16b,v25.16b /* init working EFGH */
+ st1 {v9.16b},[x7],16
+
+ rev32 v26.16b,v26.16b /* endian swap w0 */
+ st1 {v10.16b},[x7],16
+ rev32 v27.16b,v27.16b /* endian swap w1 */
+ st1 {v11.16b},[x7],16
+ rev32 v28.16b,v28.16b /* endian swap w2 */
+ st1 {v12.16b},[x7],16
+ rev32 v29.16b,v29.16b /* endian swap w3 */
+ st1 {v13.16b},[x7],16
+/* quad 0 */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ st1 {v14.16b},[x7],16
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ st1 {v15.16b},[x7],16
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h2 q23, q21, v7.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h2 q23, q21, v4.4s
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v5.4s
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+/*
+ * aes_blocks_left := number of aes blocks left after the main (sha) blocks
+ * are done; can be 0. Note that we account for the extra unwind in
+ * main_blocks.
+ */
+	sub	x7,x12,2		/* main_blocks = total_blocks - 2 */
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ and x13,x10,3 /* aes_blocks_left */
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+ add x2,x0,128 /* lead_ptr = *in */
+ ld1 {v31.16b},[x0],16 /* next aes block, update aes_ptr_in */
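+
+/*
+ * Illustrative accounting for the split above (a sketch, not extra code):
+ * for a 288B input (18 aes blocks) total_blocks = 4, so main_blocks = 2
+ * and aes_blocks_left = 2. The prolog has hashed sha block 0, the main
+ * loop runs 2 iterations (8 aes blocks, 2 sha blocks), the unwind adds
+ * 4 aes blocks and 1 sha block, the catch-up section decrypts 4 more aes
+ * blocks, and the final padded sha block covers the last 2 aes blocks:
+ * 8+4+4+2 = 18 aes blocks and 4+1 sha blocks over the data plus padding.
+ */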
+
+/*
+ * main combined loop CBC, can be used by auth/enc version
+ */
+.Lmain_loop:
+
+/*
+ * Because mov, rev32 and eor each have a busy cycle, this takes longer than
+ * it looks. This has been rewritten to hoist the v0 loads, but there is
+ * still no way to hide the required latency of the sha-associated
+ * instructions. It is a perfect example of why putting too much time into
+ * an NP-complete and NP-hard problem can be a mistake, even if it looks
+ * like a reasonable thing on the surface.
+ */
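+
+/*
+ * Shape of one iteration (informational): four aes-cbc decrypt transforms
+ * are interleaved with the four sha256 quads of one 64B block, so the
+ * aesd/aesimc and sha256h/sha256h2 chains hide each other's latency; the
+ * last quad runs hash only and pre-reads the next block's w0-w3 so the
+ * loop can test and branch early.
+ */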
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov x9,x8 /* top of rcon */
+
+/*
+ * aes xform 0, sha quad 0
+ */
+ aesd v0.16b,v8.16b
+ ld1 {v4.16b},[x9],16 /* key0 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aesd v0.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ ld1 {v6.16b},[x9],16 /* key2 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v0.16b,v12.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v13.16b
+ sha256h q22, q23, v5.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v14.16b
+ ld1 {v5.16b},[x9],16 /* key5 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aesd v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v16.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd (1 cyc stall on v22) */
+ sha256su0 v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aesd v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* get next aes block, with update */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+
+/* aes xform 1, sha quad 1 */
+ sha256su0 v26.4s,v27.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ mov v21.16b, v22.16b /* copy abcd */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v1.16b,v8.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ aesimc v1.16b,v1.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesd v1.16b,v9.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v1.16b,v10.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesd v1.16b,v11.16b
+ ld1 {v5.16b},[x9],16 /* key5 (extra stall from mov) */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ sha256h q22, q23, v6.4s
+ aesd v1.16b,v12.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha256su0 v29.4s,v26.4s
+ aesd v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v1.16b,v14.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* aes xform 2, sha quad 2 */
+
+ sha256su0 v26.4s,v27.4s
+ aesd v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v2.16b,v9.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v2.16b,v2.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v2.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v2.16b,v11.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v2.16b,v13.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ aesimc v2.16b,v2.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesd v2.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v2.16b,v15.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ aesimc v2.16b,v2.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* aes xform 3, sha quad 3 (hash only) */
+
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v3.16b,v9.16b
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v3.16b,v10.16b
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* next aes block, update aes_ptr_in */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbnz x7,.Lmain_loop /* loop if more to do */
+/*
+ * Now the loop epilog. Since the reads for sha have already been done in
+ * advance, we need an extra unwind; this is why the test for the short cases
+ * is 16 and not 12.
+ *
+ * The unwind below is just the main loop body without the tests or the final
+ * reads.
+ */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out, streaming */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+
+/*
+ * aes xform 0, sha quad 0
+ */
+ aesd v0.16b,v8.16b
+ ld1 {v6.16b},[x9],16 /* key2 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ aesimc v0.16b,v0.16b
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesd v0.16b,v9.16b
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v26.4s,v27.4s
+ aesd v0.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v0.16b,v12.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v13.16b
+ sha256h q22, q23, v5.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v14.16b
+ ld1 {v5.16b},[x9],16 /* key5 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aesd v0.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v16.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd (1 cyc stall on v22) */
+ sha256su0 v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aesd v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+
+/* aes xform 1, sha quad 1 */
+ sha256su0 v26.4s,v27.4s
+ ld1 {v7.16b},[x9],16 /* key7 */
+ mov v21.16b, v22.16b /* copy abcd */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v1.16b,v8.16b
+ sha256h q22, q23, v4.4s
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h2 q23, q21, v4.4s
+ aesimc v1.16b,v1.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesd v1.16b,v9.16b
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v1.16b,v10.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesd v1.16b,v11.16b
+ ld1 {v5.16b},[x9],16 /* key5 (extra stall from mov) */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256su0 v28.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ sha256h q22, q23, v6.4s
+ aesd v1.16b,v12.16b
+ sha256h2 q23, q21, v6.4s
+ ld1 {v6.16b},[x9],16 /* key6 */
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha256su0 v29.4s,v26.4s
+ aesd v1.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v1.16b,v1.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v1.16b,v14.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* mode op 2 */
+
+/* aes xform 2, sha quad 2 */
+
+ sha256su0 v26.4s,v27.4s
+ aesd v2.16b,v8.16b
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v2.16b,v9.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ ld1 {v4.16b},[x9],16 /* key4 */
+ aesimc v2.16b,v2.16b
+ sha256su0 v27.4s,v28.4s
+ aesd v2.16b,v10.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v2.16b,v11.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ ld1 {v5.16b},[x9],16 /* key5 */
+ sha256su0 v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v2.16b,v13.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256su0 v29.4s,v26.4s
+ aesimc v2.16b,v2.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesd v2.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v2.16b,v2.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v2.16b,v15.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ aesimc v2.16b,v2.16b
+ ld1 {v7.16b},[x9],16 /* key7 */
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/ prev value */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+/* mode op 3 */
+
+/* aes xform 3, sha quad 3 (hash only) */
+
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesd v3.16b,v9.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v3.16b,v12.16b
+ ld1 {v0.16b},[x0] /* read first aes block, no bump */
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v3.16b,v3.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ ld1 {v31.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+
+
+/*
+ * now we have to do the 4 aes blocks (b-2) that catch up to where sha is
+ */
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ ld1 {v1.16b},[x0] /* read next aes block, no update */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ ld1 {v2.16b},[x0] /* read next aes block, no update */
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+ eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */
+ ld1 {v31.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ ld1 {v3.16b},[x0] /* read next aes block, no update */
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */
+ ld1 {v30.16b},[x0],16 /* read next aes block, update aes_ptr_in */
+
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+/*
+ * Now there is the final b-1 sha256 padded block, which contains between 0
+ * and 3 aes blocks. We take some pains to avoid reading past the end of the
+ * input by only loading the blocks that are actually defined. This is also
+ * the final sha block code for the short cases.
+ */
+.Ljoin_common:
+ mov w15,0x80 /* that's the 1 of the pad */
+ cbnz x13,.Lpad100 /* branch if there is some real data */
+ eor v26.16b,v26.16b,v26.16b /* zero the rest */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v26.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad100:
+ sub x14,x13,1 /* dec amount left */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ cbnz x14,.Lpad200 /* branch if there is some real data */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v27.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad200:
+ sub x14,x14,1 /* dec amount left */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ cbnz x14,.Lpad300 /* branch if there is some real data */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v28.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad300:
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v29.b[3],w15 /* all data is bogus */
+
+.Lpad_done:
+ /* Add one SHA-2 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x14,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x14,x14,3 /* len_lo in bits */
+
+ mov v29.s[3],w14 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
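+	/*
+	 * Illustrative example: if the whole payload were two 16B aes
+	 * blocks, x11 = 32 + 64 (i_key_pad block) = 96 bytes, giving
+	 * len_hi = 0 and len_lo = 768 bits; SHA-256 terminates the final
+	 * block with this 64-bit bit count, which the two moves above
+	 * place in the last two words of w3 (v29).
+	 */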
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+/*
+ * Final sha block.
+ * The strategy is to combine the 0-3 remaining aes blocks with it, which is
+ * faster but somewhat greedy on code space.
+ */
+ cbz x13,.Lzero_aes_blocks_left /* none to do */
+ ld1 {v0.16b},[x0] /* read first aes block, bump aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ aesd v0.16b,v8.16b
+ ld1 {v7.16b},[x9],16 /* key3 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ aesd v0.16b,v10.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ aesimc v0.16b,v0.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ aesd v0.16b,v11.16b
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v4.4s
+ aesd v0.16b,v12.16b
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v13.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v14.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v6.4s
+ aesd v0.16b,v15.16b
+ sha256h2 q23, q21, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ aesd v0.16b,v16.16b
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v0.16b,v17.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+
+ sub x13,x13,1 /* dec counter */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbz x13,.Lfrmquad1
+
+/* aes xform 1 */
+
+ ld1 {v0.16b},[x0] /* read first aes block, bump aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ aesd v0.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v0.16b,v0.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ aesd v0.16b,v9.16b
+ sha256su0 v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ aesd v0.16b,v10.16b
+ sha256h q22, q23, v4.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v4.4s
+ aesd v0.16b,v11.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+
+ sha256su0 v27.4s,v28.4s
+ aesd v0.16b,v12.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v5.4s
+ aesd v0.16b,v13.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ aesimc v0.16b,v0.16b
+ sha256su0 v28.4s,v29.4s
+ aesd v0.16b,v14.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v15.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+
+ sha256su0 v29.4s,v26.4s
+ aesd v0.16b,v16.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v7.4s
+ aesd v0.16b,v17.16b
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+
+ sub x13,x13,1 /* dec counter */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ cbz x13,.Lfrmquad2
+
+/* aes xform 2 */
+
+ ld1 {v0.16b},[x0],16 /* read first aes block, bump aes_ptr_in */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ aesd v0.16b,v8.16b
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ aesimc v0.16b,v0.16b
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ aesd v0.16b,v9.16b
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v4.4s
+ aesd v0.16b,v10.16b
+ sha256h2 q23, q21, v4.4s
+ aesimc v0.16b,v0.16b
+ sha256su1 v26.4s,v28.4s,v29.4s
+ aesd v0.16b,v11.16b
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v5.4s
+ aesd v0.16b,v12.16b
+ sha256h2 q23, q21, v5.4s
+ aesimc v0.16b,v0.16b
+ sha256su1 v27.4s,v29.4s,v26.4s
+ aesd v0.16b,v13.16b
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ mov v21.16b, v22.16b /* copy abcd */
+ aesd v0.16b,v14.16b
+ sha256h q22, q23, v6.4s
+ aesimc v0.16b,v0.16b
+ sha256h2 q23, q21, v6.4s
+ aesd v0.16b,v15.16b
+ sha256su1 v28.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+
+ aesd v0.16b,v16.16b
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ aesimc v0.16b,v0.16b
+ sha256h q22, q23, v7.4s
+ aesd v0.16b,v17.16b
+ sha256h2 q23, q21, v7.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ sha256su1 v29.4s,v27.4s,v28.4s
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+
+
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+ b .Lfrmquad3
+/*
+ * The final block with no aes component, i.e. from here on there are zero
+ * aes blocks left.
+ */
+
+.Lzero_aes_blocks_left:
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+.Lfrmquad1:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+.Lfrmquad2:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+.Lfrmquad3:
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha256h2 q23, q21, v7.4s
+
+ add v26.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v27.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x8,.Lrcon /* base address for sha round consts */
+
+ ld1 {v24.16b,v25.16b}, [x6] /* load o_key_pad partial hash */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v28.b[3], w11
+
+ mov x11, #64+32 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ mov v29.s[3], w11 /* move length to the end of the block */
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
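+	/*
+	 * 96 bytes total (64B o_key_pad block + 32B inner digest) gives
+	 * a bit length of 768, split into the low and high words above.
+	 */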
+
+ ld1 {v4.16b},[x8],16 /* key0 */
+ ld1 {v5.16b},[x8],16 /* key1 */
+ ld1 {v6.16b},[x8],16 /* key2 */
+ ld1 {v7.16b},[x8],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key4 */
+ ld1 {v5.16b},[x8],16 /* key5 */
+ ld1 {v6.16b},[x8],16 /* key6 */
+ ld1 {v7.16b},[x8],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key8 */
+ ld1 {v5.16b},[x8],16 /* key9 */
+ ld1 {v6.16b},[x8],16 /* key10 */
+ ld1 {v7.16b},[x8],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x8],16 /* key12 */
+ ld1 {v5.16b},[x8],16 /* key13 */
+ ld1 {v6.16b},[x8],16 /* key14 */
+ ld1 {v7.16b},[x8],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ st1 {v24.4s,v25.4s},[x3] /* save them both */
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v30.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+ mov x4,x0 /* sha_ptr_in = in */
+
+/*
+ * This loop does 4 at a time, so that at the end there is a final sha block
+ * and 0-3 aes blocks. Note that everything is done serially here to avoid
+ * complication.
+ */
+.Lshort_loop:
+ cmp x10,4 /* check if 4 or more */
+ blt .Llast_sha_block /* if less, bail to last block */
+
+ ld1 {v31.16b},[x4] /* next w no update */
+ ld1 {v0.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v26.16b,v0.16b /* endian swap for sha */
+ add x0,x0,64
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ ld1 {v1.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v27.16b,v1.16b /* endian swap for sha */
+ st1 {v0.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */
+
+ ld1 {v31.16b},[x4] /* read no update */
+ ld1 {v2.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v28.16b,v2.16b /* endian swap for sha */
+ st1 {v1.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ ld1 {v3.16b},[x4],16 /* read next aes block, update aes_ptr_in */
+ rev32 v29.16b,v3.16b /* endian swap for sha */
+ st1 {v2.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+
+/*
+ * Now we have the sha256 to do for these 4 aes blocks.
+ */
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+ st1 {v3.16b},[x1],16 /* save aes res, bump aes_out_ptr */
+
+/* quad 0 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 1 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 2 */
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+
+ sha256su0 v26.4s,v27.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ sha256su0 v27.4s,v28.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ sha256su0 v28.4s,v29.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ sha256su0 v29.4s,v26.4s
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+/* quad 3 */
+
+ ld1 {v4.16b},[x9],16 /* key4 */
+ ld1 {v5.16b},[x9],16 /* key5 */
+ ld1 {v6.16b},[x9],16 /* key6 */
+ ld1 {v7.16b},[x9],16 /* key7 */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ sub x10,x10,4 /* 4 less */
+ b .Lshort_loop /* keep looping */
+/*
+ * This is arranged so that we can join the common unwind code that does the
+ * last sha block and the final 0-3 aes blocks.
+ */
+.Llast_sha_block:
+ mov x13,x10 /* copy aes blocks for common */
+ b .Ljoin_common /* join common code */
+
+ .size sha256_hmac_aes128cbc_dec, .-sha256_hmac_aes128cbc_dec
new file mode 100644
@@ -0,0 +1,55 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stddef.h>
+
+#include <rte_common.h>
+
+#include "rte_armv8_defs.h"
+
+#define ASSYM(name, offset) \
+do { \
+ asm volatile("----------\n"); \
+ /* Place pattern, name + value in the assembly code */ \
+ asm volatile("\n<genassym> " #name " %0\n" :: "i" (offset)); \
+} while (0)
+
+
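+/*
+ * Note: this translation unit only exists to generate assembler-visible
+ * symbol definitions. A build step (assumed here, not shown in this patch)
+ * presumably compiles it to assembly and extracts the lines tagged with the
+ * "<genassym>" marker to produce #define constants (CIPHER_KEY, HMAC_IKEYPAD,
+ * etc.) that the hand-written .S files can use, keeping them in sync with
+ * the struct crypto_arg layout.
+ */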
+static void __rte_unused
+generate_as_symbols(void)
+{
+
+ ASSYM(CIPHER_KEY, offsetof(struct crypto_arg, cipher.key));
+ ASSYM(CIPHER_IV, offsetof(struct crypto_arg, cipher.iv));
+
+ ASSYM(HMAC_KEY, offsetof(struct crypto_arg, digest.hmac.key));
+ ASSYM(HMAC_IKEYPAD, offsetof(struct crypto_arg, digest.hmac.i_key_pad));
+ ASSYM(HMAC_OKEYPAD, offsetof(struct crypto_arg, digest.hmac.o_key_pad));
+}
new file mode 100644
@@ -0,0 +1,905 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_hexdump.h>
+#include <rte_cryptodev.h>
+#include <rte_cryptodev_pmd.h>
+#include <rte_vdev.h>
+#include <rte_malloc.h>
+#include <rte_cpuflags.h>
+
+#include "rte_armv8_defs.h"
+#include "rte_armv8_pmd_private.h"
+
+static int cryptodev_armv8_crypto_uninit(const char *name);
+
+/**
+ * Pointers to the supported combined mode crypto functions are stored
+ * in the static tables. Each combined (chained) cryptographic operation
+ * can be described by a set of numbers:
+ * - order: order of operations (cipher, auth) or (auth, cipher)
+ * - direction: encryption or decryption
+ * - calg: cipher algorithm such as AES_CBC, AES_CTR, etc.
+ * - aalg: authentication algorithm such as SHA1, SHA256, etc.
+ * - keyl: cipher key length, for example 128, 192, 256 bits
+ *
+ * In order to quickly acquire each function pointer based on those numbers,
+ * a hierarchy of arrays is maintained. The final level, a 3D array, is
+ * indexed by the combined mode function parameters only (cipher algorithm,
+ * authentication algorithm and key length).
+ *
+ * This gives 3 memory accesses to obtain a function pointer, instead of
+ * traversing the arrays manually and comparing function parameters in each
+ * iteration.
+ *
+ * +--+CRYPTO_FUNC
+ * +--+ENC|
+ * +--+CA|
+ * | +--+DEC
+ * ORDER|
+ * | +--+ENC
+ * +--+AC|
+ * +--+DEC
+ *
+ */
+
+/**
+ * 3D array type for ARM Combined Mode crypto function pointers.
+ * CRYPTO_CIPHER_MAX: max cipher ID number
+ * CRYPTO_AUTH_MAX: max auth ID number
+ * CRYPTO_CIPHER_KEYLEN_MAX: max key length ID number
+ */
+typedef const crypto_func_t
+crypto_func_tbl_t[CRYPTO_CIPHER_MAX][CRYPTO_AUTH_MAX][CRYPTO_CIPHER_KEYLEN_MAX];
+
+#define CRYPTO_KEY(keyl) (ARMV8_CRYPTO_CIPHER_KEYLEN_ ## keyl)
+
+/**
+ * Arrays containing pointers to particular combined mode
+ * cryptographic functions.
+ * crypto_op_ca_encrypt: cipher (encrypt), authenticate
+ * crypto_op_ca_decrypt: cipher (decrypt), authenticate
+ * crypto_op_ac_encrypt: authenticate, cipher (encrypt)
+ * crypto_op_ac_decrypt: authenticate, cipher (decrypt)
+ */
+static const crypto_func_tbl_t
+crypto_op_ca_encrypt = {
+ /* [cipher alg][auth alg][key length] = crypto_function, */
+	[RTE_CRYPTO_CIPHER_AES_CBC][RTE_CRYPTO_AUTH_SHA1_HMAC][CRYPTO_KEY(128)] =
+		aes128cbc_sha1_hmac,
+	[RTE_CRYPTO_CIPHER_AES_CBC][RTE_CRYPTO_AUTH_SHA256][CRYPTO_KEY(128)] =
+		aes128cbc_sha256,
+	[RTE_CRYPTO_CIPHER_AES_CBC][RTE_CRYPTO_AUTH_SHA256_HMAC][CRYPTO_KEY(128)] =
+		aes128cbc_sha256_hmac,
+};
+
+static const crypto_func_tbl_t
+crypto_op_ca_decrypt = {
+ NULL
+};
+
+static const crypto_func_tbl_t
+crypto_op_ac_encrypt = {
+ NULL
+};
+
+static const crypto_func_tbl_t
+crypto_op_ac_decrypt = {
+ /* [cipher alg][auth alg][key length] = crypto_function, */
+	[RTE_CRYPTO_CIPHER_AES_CBC][RTE_CRYPTO_AUTH_SHA1_HMAC][CRYPTO_KEY(128)] =
+		sha1_hmac_aes128cbc_dec,
+	[RTE_CRYPTO_CIPHER_AES_CBC][RTE_CRYPTO_AUTH_SHA256][CRYPTO_KEY(128)] =
+		sha256_aes128cbc_dec,
+	[RTE_CRYPTO_CIPHER_AES_CBC][RTE_CRYPTO_AUTH_SHA256_HMAC][CRYPTO_KEY(128)] =
+		sha256_hmac_aes128cbc_dec,
+};
+
+/**
+ * Arrays containing pointers to particular cryptographic function sets,
+ * covering given cipher operation directions (encrypt, decrypt)
+ * for each order of cipher and authentication pairs.
+ */
+static const crypto_func_tbl_t *
+crypto_cipher_auth[] = {
+ &crypto_op_ca_encrypt,
+ &crypto_op_ca_decrypt,
+ NULL
+};
+
+static const crypto_func_tbl_t *
+crypto_auth_cipher[] = {
+ &crypto_op_ac_encrypt,
+ &crypto_op_ac_decrypt,
+ NULL
+};
+
+/**
+ * Top level array containing pointers to particular cryptographic
+ * function sets, covering given order of chained operations.
+ * crypto_cipher_auth: cipher first, authenticate after
+ * crypto_auth_cipher: authenticate first, cipher after
+ */
+static const crypto_func_tbl_t **
+crypto_chain_order[] = {
+ crypto_cipher_auth,
+ crypto_auth_cipher,
+ NULL
+};
+
+/**
+ * Extract particular combined mode crypto function from the 3D array.
+ */
+#define CRYPTO_GET_ALGO(order, cop, calg, aalg, keyl) \
+({ \
+ crypto_func_tbl_t *func_tbl = \
+ (crypto_chain_order[(order)])[(cop)]; \
+ \
+ ((*func_tbl)[(calg)][(aalg)][CRYPTO_KEY(keyl)]); \
+})
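+
+/*
+ * Illustrative use (assuming ARMV8_CRYPTO_CHAIN_CIPHER_AUTH selects the
+ * cipher-then-auth table and RTE_CRYPTO_CIPHER_OP_ENCRYPT indexes the
+ * encrypt direction):
+ *
+ *	CRYPTO_GET_ALGO(ARMV8_CRYPTO_CHAIN_CIPHER_AUTH,
+ *			RTE_CRYPTO_CIPHER_OP_ENCRYPT,
+ *			RTE_CRYPTO_CIPHER_AES_CBC,
+ *			RTE_CRYPTO_AUTH_SHA256_HMAC, 128)
+ *
+ * would resolve to aes128cbc_sha256_hmac from crypto_op_ca_encrypt above.
+ */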
+
+/*----------------------------------------------------------------------------*/
+
+/**
+ * 2D array type for ARM key schedule function pointers.
+ * CRYPTO_CIPHER_MAX: max cipher ID number
+ * CRYPTO_CIPHER_KEYLEN_MAX: max key length ID number
+ */
+typedef const crypto_key_sched_t
+crypto_key_sched_tbl_t[CRYPTO_CIPHER_MAX][CRYPTO_CIPHER_KEYLEN_MAX];
+
+static const crypto_key_sched_tbl_t
+crypto_key_sched_encrypt = {
+ /* [cipher alg][key length] = key_expand_func, */
+ [RTE_CRYPTO_CIPHER_AES_CBC][CRYPTO_KEY(128)] = aes128_key_sched_enc,
+};
+
+static const crypto_key_sched_tbl_t
+crypto_key_sched_decrypt = {
+ /* [cipher alg][key length] = key_expand_func, */
+ [RTE_CRYPTO_CIPHER_AES_CBC][CRYPTO_KEY(128)] = aes128_key_sched_dec,
+};
+
+/**
+ * Top level array containing pointers to particular key generation
+ * function sets, covering given operation direction.
+ * crypto_key_sched_encrypt: keys for encryption
+ * crypto_key_sched_decrypt: keys for decryption
+ */
+static const crypto_key_sched_tbl_t *
+crypto_key_sched_dir[] = {
+ &crypto_key_sched_encrypt,
+ &crypto_key_sched_decrypt,
+ NULL
+};
+
+/**
+ * Extract particular key scheduling function from the 2D array.
+ */
+#define CRYPTO_GET_KEY_SCHED(cop, calg, keyl) \
+({ \
+ crypto_key_sched_tbl_t *ks_tbl = crypto_key_sched_dir[(cop)]; \
+ \
+ ((*ks_tbl)[(calg)][CRYPTO_KEY(keyl)]); \
+})
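+
+/*
+ * Illustrative use: CRYPTO_GET_KEY_SCHED(RTE_CRYPTO_CIPHER_OP_DECRYPT,
+ * RTE_CRYPTO_CIPHER_AES_CBC, 128) would, assuming the decrypt operation
+ * indexes the second entry of crypto_key_sched_dir, resolve to
+ * aes128_key_sched_dec.
+ */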
+
+/*----------------------------------------------------------------------------*/
+
+/**
+ * Global static parameter used to create a unique name for each
+ * ARMV8 crypto device.
+ */
+static unsigned int unique_name_id;
+
+static inline int
+create_unique_device_name(char *name, size_t size)
+{
+ int ret;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ ret = snprintf(name, size, "%s_%u", RTE_STR(CRYPTODEV_NAME_ARMV8_PMD),
+ unique_name_id++);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/*
+ *------------------------------------------------------------------------------
+ * Session Prepare
+ *------------------------------------------------------------------------------
+ */
+
+/** Get xform chain order */
+static enum armv8_crypto_chain_order
+armv8_crypto_get_chain_order(const struct rte_crypto_sym_xform *xform)
+{
+
+ /*
+ * This driver currently covers only chained operations.
+	 * Cipher-only or authentication-only operations, as well as
+	 * chains longer than 2 xform structures, are not supported.
+ */
+ if (xform->next == NULL || xform->next->next != NULL)
+ return ARMV8_CRYPTO_CHAIN_NOT_SUPPORTED;
+
+ if (xform->type == RTE_CRYPTO_SYM_XFORM_AUTH) {
+ if (xform->next->type == RTE_CRYPTO_SYM_XFORM_CIPHER)
+ return ARMV8_CRYPTO_CHAIN_AUTH_CIPHER;
+ }
+
+ if (xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER) {
+ if (xform->next->type == RTE_CRYPTO_SYM_XFORM_AUTH)
+ return ARMV8_CRYPTO_CHAIN_CIPHER_AUTH;
+ }
+
+ return ARMV8_CRYPTO_CHAIN_NOT_SUPPORTED;
+}
+
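+/*
+ * Standard HMAC construction (RFC 2104):
+ *	HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m))
+ * The pads prepared here are later run through sha*_block_partial() so that
+ * their intermediate digests can seed the per-operation hash computations.
+ */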
+static inline void
+auth_hmac_pad_prepare(struct armv8_crypto_session *sess,
+ const struct rte_crypto_sym_xform *xform)
+{
+ size_t i;
+
+ /* Generate i_key_pad and o_key_pad */
+ memset(sess->auth.hmac.i_key_pad, 0, sizeof(sess->auth.hmac.i_key_pad));
+ rte_memcpy(sess->auth.hmac.i_key_pad, sess->auth.hmac.key,
+ xform->auth.key.length);
+ memset(sess->auth.hmac.o_key_pad, 0, sizeof(sess->auth.hmac.o_key_pad));
+ rte_memcpy(sess->auth.hmac.o_key_pad, sess->auth.hmac.key,
+ xform->auth.key.length);
+ /*
+ * XOR key with IPAD/OPAD values to obtain i_key_pad
+ * and o_key_pad.
+	 * A byte-by-byte operation may seem less efficient here,
+	 * but in fact it's the opposite: the resulting assembly code
+	 * is likely to operate on NEON registers (load the auth key
+	 * to Qx, load IPAD/OPAD into multiple elements of Qy and
+	 * eor 128 bits at once).
+ */
+ for (i = 0; i < SHA_BLOCK_MAX; i++) {
+ sess->auth.hmac.i_key_pad[i] ^= HMAC_IPAD_VALUE;
+ sess->auth.hmac.o_key_pad[i] ^= HMAC_OPAD_VALUE;
+ }
+}
+
+static inline int
+auth_set_prerequisites(struct armv8_crypto_session *sess,
+ const struct rte_crypto_sym_xform *xform)
+{
+ uint8_t partial[64] = { 0 };
+ int error;
+
+ switch (xform->auth.algo) {
+ case RTE_CRYPTO_AUTH_SHA1_HMAC:
+ /*
+ * Generate authentication key, i_key_pad and o_key_pad.
+ */
+ /* Zero memory under key */
+ memset(sess->auth.hmac.key, 0, SHA1_AUTH_KEY_LENGTH);
+
+ if (xform->auth.key.length > SHA1_AUTH_KEY_LENGTH) {
+ /*
+ * In case the key is longer than 160 bits
+ * the algorithm will use SHA1(key) instead.
+ */
+ error = sha1_block(NULL, xform->auth.key.data,
+ sess->auth.hmac.key, xform->auth.key.length);
+ if (error != 0)
+ return -1;
+ } else {
+ /*
+			 * Now copy the given authentication key to the session
+			 * key. Since the session key is zeroed, no additional
+			 * zero padding is needed if the key is shorter than
+			 * SHA1_AUTH_KEY_LENGTH.
+ */
+ rte_memcpy(sess->auth.hmac.key, xform->auth.key.data,
+ xform->auth.key.length);
+ }
+
+ /* Prepare HMAC padding: key|pattern */
+ auth_hmac_pad_prepare(sess, xform);
+ /*
+ * Calculate partial hash values for i_key_pad and o_key_pad.
+ * Will be used as initialization state for final HMAC.
+ */
+ error = sha1_block_partial(NULL, sess->auth.hmac.i_key_pad,
+ partial, SHA1_BLOCK_SIZE);
+ if (error != 0)
+ return -1;
+ memcpy(sess->auth.hmac.i_key_pad, partial, SHA1_BLOCK_SIZE);
+
+ error = sha1_block_partial(NULL, sess->auth.hmac.o_key_pad,
+ partial, SHA1_BLOCK_SIZE);
+ if (error != 0)
+ return -1;
+ memcpy(sess->auth.hmac.o_key_pad, partial, SHA1_BLOCK_SIZE);
+
+ break;
+ case RTE_CRYPTO_AUTH_SHA256_HMAC:
+ /*
+ * Generate authentication key, i_key_pad and o_key_pad.
+ */
+ /* Zero memory under key */
+ memset(sess->auth.hmac.key, 0, SHA256_AUTH_KEY_LENGTH);
+
+ if (xform->auth.key.length > SHA256_AUTH_KEY_LENGTH) {
+ /*
+ * In case the key is longer than 256 bits
+ * the algorithm will use SHA256(key) instead.
+ */
+ error = sha256_block(NULL, xform->auth.key.data,
+ sess->auth.hmac.key, xform->auth.key.length);
+ if (error != 0)
+ return -1;
+ } else {
+ /*
+			 * Now copy the given authentication key to the session
+			 * key. Since the session key is zeroed, no additional
+			 * zero padding is needed if the key is shorter than
+			 * SHA256_AUTH_KEY_LENGTH.
+ */
+ rte_memcpy(sess->auth.hmac.key, xform->auth.key.data,
+ xform->auth.key.length);
+ }
+
+ /* Prepare HMAC padding: key|pattern */
+ auth_hmac_pad_prepare(sess, xform);
+ /*
+ * Calculate partial hash values for i_key_pad and o_key_pad.
+ * Will be used as initialization state for final HMAC.
+ */
+ error = sha256_block_partial(NULL, sess->auth.hmac.i_key_pad,
+ partial, SHA256_BLOCK_SIZE);
+ if (error != 0)
+ return -1;
+ memcpy(sess->auth.hmac.i_key_pad, partial, SHA256_BLOCK_SIZE);
+
+ error = sha256_block_partial(NULL, sess->auth.hmac.o_key_pad,
+ partial, SHA256_BLOCK_SIZE);
+ if (error != 0)
+ return -1;
+ memcpy(sess->auth.hmac.o_key_pad, partial, SHA256_BLOCK_SIZE);
+
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static inline int
+cipher_set_prerequisites(struct armv8_crypto_session *sess,
+ const struct rte_crypto_sym_xform *xform)
+{
+ crypto_key_sched_t cipher_key_sched;
+
+ cipher_key_sched = sess->cipher.key_sched;
+ if (likely(cipher_key_sched != NULL)) {
+ /* Set up cipher session key */
+ cipher_key_sched(sess->cipher.key.data, xform->cipher.key.data);
+ }
+
+ return 0;
+}
+
+static int
+armv8_crypto_set_session_chained_parameters(struct armv8_crypto_session *sess,
+ const struct rte_crypto_sym_xform *cipher_xform,
+ const struct rte_crypto_sym_xform *auth_xform)
+{
+ enum armv8_crypto_chain_order order;
+ enum armv8_crypto_cipher_operation cop;
+ enum rte_crypto_cipher_algorithm calg;
+ enum rte_crypto_auth_algorithm aalg;
+
+	/* Validate the chain order and take a local scratch copy of it */
+ switch (sess->chain_order) {
+ case ARMV8_CRYPTO_CHAIN_CIPHER_AUTH:
+ case ARMV8_CRYPTO_CHAIN_AUTH_CIPHER:
+ order = sess->chain_order;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* Select cipher direction */
+ sess->cipher.direction = cipher_xform->cipher.op;
+ /* Select cipher key */
+ sess->cipher.key.length = cipher_xform->cipher.key.length;
+ /* Set cipher direction */
+ cop = sess->cipher.direction;
+ /* Set cipher algorithm */
+ calg = cipher_xform->cipher.algo;
+
+ /* Select cipher algo */
+ switch (calg) {
+ /* Cover supported cipher algorithms */
+ case RTE_CRYPTO_CIPHER_AES_CBC:
+ sess->cipher.algo = calg;
+ /* IV len is always 16 bytes (block size) for AES CBC */
+ sess->cipher.iv_len = 16;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* Select auth generate/verify */
+ sess->auth.operation = auth_xform->auth.op;
+
+ /* Select auth algo */
+ switch (auth_xform->auth.algo) {
+ /* Cover supported hash algorithms */
+ case RTE_CRYPTO_AUTH_SHA256:
+ aalg = auth_xform->auth.algo;
+ sess->auth.mode = ARMV8_CRYPTO_AUTH_AS_AUTH;
+ break;
+ case RTE_CRYPTO_AUTH_SHA1_HMAC:
+ case RTE_CRYPTO_AUTH_SHA256_HMAC: /* Fall through */
+ aalg = auth_xform->auth.algo;
+ sess->auth.mode = ARMV8_CRYPTO_AUTH_AS_HMAC;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Verify supported key lengths and extract proper algorithm */
+ switch (cipher_xform->cipher.key.length << 3) {
+ case 128:
+ sess->crypto_func =
+ CRYPTO_GET_ALGO(order, cop, calg, aalg, 128);
+ sess->cipher.key_sched =
+ CRYPTO_GET_KEY_SCHED(cop, calg, 128);
+ break;
+ case 192:
+ sess->crypto_func =
+ CRYPTO_GET_ALGO(order, cop, calg, aalg, 192);
+ sess->cipher.key_sched =
+ CRYPTO_GET_KEY_SCHED(cop, calg, 192);
+ break;
+ case 256:
+ sess->crypto_func =
+ CRYPTO_GET_ALGO(order, cop, calg, aalg, 256);
+ sess->cipher.key_sched =
+ CRYPTO_GET_KEY_SCHED(cop, calg, 256);
+ break;
+ default:
+ sess->crypto_func = NULL;
+ sess->cipher.key_sched = NULL;
+ return -EINVAL;
+ }
+
+ if (unlikely(sess->crypto_func == NULL)) {
+ /*
+		 * If we got here, there must be a bug in the algorithm
+		 * selection above. Keep this check anyway to catch the bug
+		 * immediately and avoid a NULL pointer dereference during
+		 * op processing.
+ */
+ ARMV8_CRYPTO_LOG_ERR(
+ "No appropriate crypto function for given parameters");
+ return -EINVAL;
+ }
+
+ /* Set up cipher session prerequisites */
+ if (cipher_set_prerequisites(sess, cipher_xform) != 0)
+ return -EINVAL;
+
+ /* Set up authentication session prerequisites */
+ if (auth_set_prerequisites(sess, auth_xform) != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+/** Parse crypto xform chain and set private session parameters */
+int
+armv8_crypto_set_session_parameters(struct armv8_crypto_session *sess,
+ const struct rte_crypto_sym_xform *xform)
+{
+ const struct rte_crypto_sym_xform *cipher_xform = NULL;
+ const struct rte_crypto_sym_xform *auth_xform = NULL;
+ bool is_chained_op;
+ int ret;
+
+ /* Filter out spurious/broken requests */
+ if (xform == NULL)
+ return -EINVAL;
+
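+	/* Determine the chain order (cipher then auth, or auth then cipher) */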
+ sess->chain_order = armv8_crypto_get_chain_order(xform);
+ switch (sess->chain_order) {
+ case ARMV8_CRYPTO_CHAIN_CIPHER_AUTH:
+ cipher_xform = xform;
+ auth_xform = xform->next;
+ is_chained_op = true;
+ break;
+ case ARMV8_CRYPTO_CHAIN_AUTH_CIPHER:
+ auth_xform = xform;
+ cipher_xform = xform->next;
+ is_chained_op = true;
+ break;
+ default:
+ is_chained_op = false;
+ return -EINVAL;
+ }
+
+ if (is_chained_op) {
+ ret = armv8_crypto_set_session_chained_parameters(sess,
+ cipher_xform, auth_xform);
+ if (unlikely(ret != 0)) {
+ ARMV8_CRYPTO_LOG_ERR(
+ "Invalid/unsupported chained (cipher/auth) parameters");
+ return -EINVAL;
+ }
+ } else {
+ ARMV8_CRYPTO_LOG_ERR("Invalid/unsupported operation");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/** Provide session for operation */
+static struct armv8_crypto_session *
+get_session(struct armv8_crypto_qp *qp, struct rte_crypto_op *op)
+{
+ struct armv8_crypto_session *sess = NULL;
+
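+	/*
+	 * Ops that carry a session reuse the private data attached to it;
+	 * session-less ops get a temporary session from the queue pair
+	 * mempool which is returned again in process_op().
+	 */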
+ if (op->sym->sess_type == RTE_CRYPTO_SYM_OP_WITH_SESSION) {
+ /* get existing session */
+ if (likely(op->sym->session != NULL &&
+ op->sym->session->dev_type ==
+ RTE_CRYPTODEV_ARMV8_PMD)) {
+ sess = (struct armv8_crypto_session *)
+ op->sym->session->_private;
+ }
+ } else {
+ /* provide internal session */
+ void *_sess = NULL;
+
+ if (!rte_mempool_get(qp->sess_mp, (void **)&_sess)) {
+ sess = (struct armv8_crypto_session *)
+ ((struct rte_cryptodev_sym_session *)_sess)
+ ->_private;
+
+ if (unlikely(armv8_crypto_set_session_parameters(
+ sess, op->sym->xform) != 0)) {
+ rte_mempool_put(qp->sess_mp, _sess);
+ sess = NULL;
+			} else {
+				op->sym->session = _sess;
+			}
+ }
+ }
+
+ if (sess == NULL)
+ op->status = RTE_CRYPTO_OP_STATUS_INVALID_SESSION;
+
+ return sess;
+}
+
+/*
+ *------------------------------------------------------------------------------
+ * Process Operations
+ *------------------------------------------------------------------------------
+ */
+
+/*----------------------------------------------------------------------------*/
+
+/** Process chained (cipher and authentication) operation */
+static void
+process_armv8_chained_op(struct rte_crypto_op *op,
+		struct armv8_crypto_session *sess,
+		struct rte_mbuf *mbuf_src, struct rte_mbuf *mbuf_dst)
+{
+ crypto_func_t crypto_func;
+ crypto_arg_t arg;
+ uint8_t *src, *dst;
+ uint8_t *adst, *asrc;
+ uint64_t srclen;
+
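+	/*
+	 * The combined-mode routines operate on a single contiguous region,
+	 * so the cipher and auth regions must be of equal length.
+	 */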
+ srclen = op->sym->cipher.data.length;
+ ARMV8_CRYPTO_ASSERT(
+ op->sym->cipher.data.length == op->sym->auth.data.length);
+
+ src = rte_pktmbuf_mtod_offset(mbuf_src, uint8_t *,
+ op->sym->cipher.data.offset);
+ dst = rte_pktmbuf_mtod_offset(mbuf_dst, uint8_t *,
+ op->sym->cipher.data.offset);
+
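+	/*
+	 * Pick the authentication source: for cipher-then-auth the digest is
+	 * computed over the ciphertext (dst), for auth-then-cipher over the
+	 * plaintext (src).
+	 */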
+ switch (sess->chain_order) {
+ case ARMV8_CRYPTO_CHAIN_CIPHER_AUTH:
+ asrc = dst;
+ break;
+ case ARMV8_CRYPTO_CHAIN_AUTH_CIPHER:
+ asrc = src;
+ break;
+ default:
+ op->status = RTE_CRYPTO_OP_STATUS_INVALID_ARGS;
+ return;
+ }
+
+ switch (sess->auth.mode) {
+ case ARMV8_CRYPTO_AUTH_AS_AUTH:
+ /* Nothing to do here, just verify correct option */
+ break;
+ case ARMV8_CRYPTO_AUTH_AS_HMAC:
+ arg.digest.hmac.key = sess->auth.hmac.key;
+ arg.digest.hmac.i_key_pad = sess->auth.hmac.i_key_pad;
+ arg.digest.hmac.o_key_pad = sess->auth.hmac.o_key_pad;
+ break;
+ default:
+ op->status = RTE_CRYPTO_OP_STATUS_INVALID_ARGS;
+ return;
+ }
+
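+	/*
+	 * Pick the digest destination: on generate, write to the caller's
+	 * digest buffer (or right after the auth region in the destination
+	 * mbuf); on verify, compute into scratch space appended to the source
+	 * mbuf and compare it against the expected digest below.
+	 */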
+ if (sess->auth.operation == RTE_CRYPTO_AUTH_OP_GENERATE) {
+ adst = op->sym->auth.digest.data;
+ if (adst == NULL) {
+ adst = rte_pktmbuf_mtod_offset(mbuf_dst,
+ uint8_t *,
+ op->sym->auth.data.offset +
+ op->sym->auth.data.length);
+ }
+ } else {
+ adst = (uint8_t *)rte_pktmbuf_append(mbuf_src,
+ op->sym->auth.digest.length);
+ }
+
+ if (unlikely(op->sym->cipher.iv.length != sess->cipher.iv_len)) {
+ op->status = RTE_CRYPTO_OP_STATUS_INVALID_ARGS;
+ return;
+ }
+
+ arg.cipher.iv = op->sym->cipher.iv.data;
+ arg.cipher.key = sess->cipher.key.data;
+ /* Acquire combined mode function */
+ crypto_func = sess->crypto_func;
+ ARMV8_CRYPTO_ASSERT(crypto_func != NULL);
+ crypto_func(src, dst, asrc, adst, srclen, &arg);
+
+ op->status = RTE_CRYPTO_OP_STATUS_SUCCESS;
+ if (sess->auth.operation == RTE_CRYPTO_AUTH_OP_VERIFY) {
+ if (memcmp(adst, op->sym->auth.digest.data,
+ op->sym->auth.digest.length) != 0) {
+ op->status = RTE_CRYPTO_OP_STATUS_AUTH_FAILED;
+ }
+ }
+}
+
+/** Process crypto operation for mbuf */
+static int
+process_op(const struct armv8_crypto_qp *qp, struct rte_crypto_op *op,
+ struct armv8_crypto_session *sess)
+{
+ struct rte_mbuf *msrc, *mdst;
+ int retval;
+
+ msrc = op->sym->m_src;
+ mdst = op->sym->m_dst ? op->sym->m_dst : op->sym->m_src;
+
+ op->status = RTE_CRYPTO_OP_STATUS_NOT_PROCESSED;
+
+ switch (sess->chain_order) {
+ case ARMV8_CRYPTO_CHAIN_CIPHER_AUTH:
+ case ARMV8_CRYPTO_CHAIN_AUTH_CIPHER: /* Fall through */
+ process_armv8_chained_op(op, sess, msrc, mdst);
+ break;
+ default:
+ op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+ break;
+ }
+
+ /* Free session if a session-less crypto op */
+ if (op->sym->sess_type == RTE_CRYPTO_SYM_OP_SESSIONLESS) {
+ memset(sess, 0, sizeof(struct armv8_crypto_session));
+ rte_mempool_put(qp->sess_mp, op->sym->session);
+ op->sym->session = NULL;
+ }
+
+ if (op->status == RTE_CRYPTO_OP_STATUS_NOT_PROCESSED)
+ op->status = RTE_CRYPTO_OP_STATUS_SUCCESS;
+
+ if (op->status != RTE_CRYPTO_OP_STATUS_ERROR)
+ retval = rte_ring_enqueue(qp->processed_ops, (void *)op);
+ else
+ retval = -1;
+
+ return retval;
+}
+
+/*
+ *------------------------------------------------------------------------------
+ * PMD Framework
+ *------------------------------------------------------------------------------
+ */
+
+/** Enqueue burst */
+static uint16_t
+armv8_crypto_pmd_enqueue_burst(void *queue_pair, struct rte_crypto_op **ops,
+ uint16_t nb_ops)
+{
+ struct armv8_crypto_session *sess;
+ struct armv8_crypto_qp *qp = queue_pair;
+ int i, retval;
+
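+	/*
+	 * Operations are processed synchronously on enqueue; completed ops
+	 * are placed on the processed_ops ring and handed back to the
+	 * application by the dequeue burst function.
+	 */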
+ for (i = 0; i < nb_ops; i++) {
+ sess = get_session(qp, ops[i]);
+ if (unlikely(sess == NULL))
+ goto enqueue_err;
+
+ retval = process_op(qp, ops[i], sess);
+ if (unlikely(retval < 0))
+ goto enqueue_err;
+ }
+
+ qp->stats.enqueued_count += i;
+ return i;
+
+enqueue_err:
+ if (ops[i] != NULL)
+ ops[i]->status = RTE_CRYPTO_OP_STATUS_INVALID_ARGS;
+
+ qp->stats.enqueue_err_count++;
+ return i;
+}
+
+/** Dequeue burst */
+static uint16_t
+armv8_crypto_pmd_dequeue_burst(void *queue_pair, struct rte_crypto_op **ops,
+ uint16_t nb_ops)
+{
+ struct armv8_crypto_qp *qp = queue_pair;
+
+ unsigned int nb_dequeued = 0;
+
+ nb_dequeued = rte_ring_dequeue_burst(qp->processed_ops,
+ (void **)ops, nb_ops);
+ qp->stats.dequeued_count += nb_dequeued;
+
+ return nb_dequeued;
+}
+
+/** Create ARMv8 crypto device */
+static int
+cryptodev_armv8_crypto_create(const char *name,
+ struct rte_crypto_vdev_init_params *init_params)
+{
+ struct rte_cryptodev *dev;
+ char crypto_dev_name[RTE_CRYPTODEV_NAME_MAX_LEN];
+ struct armv8_crypto_private *internals;
+
+ /* Check CPU for support for AES instruction set */
+ if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_AES)) {
+ ARMV8_CRYPTO_LOG_ERR(
+ "AES instructions not supported by CPU");
+ return -EFAULT;
+ }
+
+ /* Check CPU for support for SHA instruction set */
+ if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_SHA1) ||
+ !rte_cpu_get_flag_enabled(RTE_CPUFLAG_SHA2)) {
+ ARMV8_CRYPTO_LOG_ERR(
+ "SHA1/SHA2 instructions not supported by CPU");
+ return -EFAULT;
+ }
+
+	/* Check CPU for support for Advanced SIMD instruction set */
+ if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON)) {
+ ARMV8_CRYPTO_LOG_ERR(
+ "Advanced SIMD instructions not supported by CPU");
+ return -EFAULT;
+ }
+
+ /* create a unique device name */
+ if (create_unique_device_name(crypto_dev_name,
+ RTE_CRYPTODEV_NAME_MAX_LEN) != 0) {
+ ARMV8_CRYPTO_LOG_ERR("failed to create unique cryptodev name");
+ return -EINVAL;
+ }
+
+ dev = rte_cryptodev_pmd_virtual_dev_init(crypto_dev_name,
+ sizeof(struct armv8_crypto_private),
+ init_params->socket_id);
+ if (dev == NULL) {
+ ARMV8_CRYPTO_LOG_ERR("failed to create cryptodev vdev");
+ goto init_error;
+ }
+
+ dev->dev_type = RTE_CRYPTODEV_ARMV8_PMD;
+ dev->dev_ops = rte_armv8_crypto_pmd_ops;
+
+ /* register rx/tx burst functions for data path */
+ dev->dequeue_burst = armv8_crypto_pmd_dequeue_burst;
+ dev->enqueue_burst = armv8_crypto_pmd_enqueue_burst;
+
+ dev->feature_flags = RTE_CRYPTODEV_FF_SYMMETRIC_CRYPTO |
+ RTE_CRYPTODEV_FF_SYM_OPERATION_CHAINING;
+
+	/* Store the configured limits in the device private data */
+ internals = dev->data->dev_private;
+
+ internals->max_nb_qpairs = init_params->max_nb_queue_pairs;
+ internals->max_nb_sessions = init_params->max_nb_sessions;
+
+ return 0;
+
+init_error:
+ ARMV8_CRYPTO_LOG_ERR(
+ "driver %s: cryptodev_armv8_crypto_create failed", name);
+
+ cryptodev_armv8_crypto_uninit(crypto_dev_name);
+ return -EFAULT;
+}
+
+/** Initialise ARMv8 crypto device */
+static int
+cryptodev_armv8_crypto_init(const char *name,
+ const char *input_args)
+{
+ struct rte_crypto_vdev_init_params init_params = {
+ RTE_CRYPTODEV_VDEV_DEFAULT_MAX_NB_QUEUE_PAIRS,
+ RTE_CRYPTODEV_VDEV_DEFAULT_MAX_NB_SESSIONS,
+ rte_socket_id()
+ };
+
+ rte_cryptodev_parse_vdev_init_params(&init_params, input_args);
+
+ RTE_LOG(INFO, PMD, "Initialising %s on NUMA node %d\n", name,
+ init_params.socket_id);
+ RTE_LOG(INFO, PMD, " Max number of queue pairs = %d\n",
+ init_params.max_nb_queue_pairs);
+ RTE_LOG(INFO, PMD, " Max number of sessions = %d\n",
+ init_params.max_nb_sessions);
+
+ return cryptodev_armv8_crypto_create(name, &init_params);
+}
+
+/** Uninitialise ARMv8 crypto device */
+static int
+cryptodev_armv8_crypto_uninit(const char *name)
+{
+ if (name == NULL)
+ return -EINVAL;
+
+ RTE_LOG(INFO, PMD,
+		"Closing ARMv8 crypto device %s on NUMA socket %u\n",
+ name, rte_socket_id());
+
+ return 0;
+}
+
+static struct rte_vdev_driver armv8_crypto_drv = {
+ .probe = cryptodev_armv8_crypto_init,
+ .remove = cryptodev_armv8_crypto_uninit
+};
+
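+/*
+ * Register the PMD as a virtual device. As an illustration (not part of this
+ * patch), the device could be instantiated from the EAL command line with
+ * e.g. --vdev "crypto_armv8,max_nb_queue_pairs=4,max_nb_sessions=1024".
+ */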
+RTE_PMD_REGISTER_VDEV(CRYPTODEV_NAME_ARMV8_PMD, armv8_crypto_drv);
+RTE_PMD_REGISTER_PARAM_STRING(CRYPTODEV_NAME_ARMV8_PMD,
+ "max_nb_queue_pairs=<int> "
+ "max_nb_sessions=<int> "
+ "socket_id=<int>");
new file mode 100644
@@ -0,0 +1,390 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_malloc.h>
+#include <rte_cryptodev_pmd.h>
+
+#include "rte_armv8_defs.h"
+#include "rte_armv8_pmd_private.h"
+
+
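+/**
+ * Capabilities advertised to the cryptodev framework: plain SHA256 auth,
+ * SHA1/SHA256 HMAC and AES-CBC with 128/192/256-bit keys.
+ */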
+static const struct rte_cryptodev_capabilities
+ armv8_crypto_pmd_capabilities[] = {
+ { /* SHA256 */
+ .op = RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ {.sym = {
+ .xform_type = RTE_CRYPTO_SYM_XFORM_AUTH,
+ {.auth = {
+ .algo = RTE_CRYPTO_AUTH_SHA256,
+ .block_size = 64,
+ .key_size = {
+ .min = 0,
+ .max = 0,
+ .increment = 0
+ },
+ .digest_size = {
+ .min = 32,
+ .max = 32,
+ .increment = 0
+ },
+ .aad_size = { 0 }
+ }, }
+ }, }
+ },
+ { /* SHA1 HMAC */
+ .op = RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ {.sym = {
+ .xform_type = RTE_CRYPTO_SYM_XFORM_AUTH,
+ {.auth = {
+ .algo = RTE_CRYPTO_AUTH_SHA1_HMAC,
+ .block_size = 64,
+ .key_size = {
+ .min = 16,
+ .max = 128,
+ .increment = 0
+ },
+ .digest_size = {
+ .min = 20,
+ .max = 20,
+ .increment = 0
+ },
+ .aad_size = { 0 }
+ }, }
+ }, }
+ },
+ { /* SHA256 HMAC */
+ .op = RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ {.sym = {
+ .xform_type = RTE_CRYPTO_SYM_XFORM_AUTH,
+ {.auth = {
+ .algo = RTE_CRYPTO_AUTH_SHA256_HMAC,
+ .block_size = 64,
+ .key_size = {
+ .min = 16,
+ .max = 128,
+ .increment = 0
+ },
+ .digest_size = {
+ .min = 32,
+ .max = 32,
+ .increment = 0
+ },
+ .aad_size = { 0 }
+ }, }
+ }, }
+ },
+ { /* AES CBC */
+ .op = RTE_CRYPTO_OP_TYPE_SYMMETRIC,
+ {.sym = {
+ .xform_type = RTE_CRYPTO_SYM_XFORM_CIPHER,
+ {.cipher = {
+ .algo = RTE_CRYPTO_CIPHER_AES_CBC,
+ .block_size = 16,
+ .key_size = {
+ .min = 16,
+ .max = 32,
+ .increment = 8
+ },
+ .iv_size = {
+ .min = 16,
+ .max = 16,
+ .increment = 0
+ }
+ }, }
+ }, }
+ },
+
+ RTE_CRYPTODEV_END_OF_CAPABILITIES_LIST()
+};
+
+
+/** Configure device */
+static int
+armv8_crypto_pmd_config(__rte_unused struct rte_cryptodev *dev)
+{
+ return 0;
+}
+
+/** Start device */
+static int
+armv8_crypto_pmd_start(__rte_unused struct rte_cryptodev *dev)
+{
+ return 0;
+}
+
+/** Stop device */
+static void
+armv8_crypto_pmd_stop(__rte_unused struct rte_cryptodev *dev)
+{
+}
+
+/** Close device */
+static int
+armv8_crypto_pmd_close(__rte_unused struct rte_cryptodev *dev)
+{
+ return 0;
+}
+
+
+/** Get device statistics */
+static void
+armv8_crypto_pmd_stats_get(struct rte_cryptodev *dev,
+ struct rte_cryptodev_stats *stats)
+{
+ int qp_id;
+
+ for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++) {
+ struct armv8_crypto_qp *qp = dev->data->queue_pairs[qp_id];
+
+ stats->enqueued_count += qp->stats.enqueued_count;
+ stats->dequeued_count += qp->stats.dequeued_count;
+
+ stats->enqueue_err_count += qp->stats.enqueue_err_count;
+ stats->dequeue_err_count += qp->stats.dequeue_err_count;
+ }
+}
+
+/** Reset device statistics */
+static void
+armv8_crypto_pmd_stats_reset(struct rte_cryptodev *dev)
+{
+ int qp_id;
+
+ for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++) {
+ struct armv8_crypto_qp *qp = dev->data->queue_pairs[qp_id];
+
+ memset(&qp->stats, 0, sizeof(qp->stats));
+ }
+}
+
+
+/** Get device info */
+static void
+armv8_crypto_pmd_info_get(struct rte_cryptodev *dev,
+ struct rte_cryptodev_info *dev_info)
+{
+ struct armv8_crypto_private *internals = dev->data->dev_private;
+
+ if (dev_info != NULL) {
+ dev_info->dev_type = dev->dev_type;
+ dev_info->feature_flags = dev->feature_flags;
+ dev_info->capabilities = armv8_crypto_pmd_capabilities;
+ dev_info->max_nb_queue_pairs = internals->max_nb_qpairs;
+ dev_info->sym.max_nb_sessions = internals->max_nb_sessions;
+ }
+}
+
+/** Release queue pair */
+static int
+armv8_crypto_pmd_qp_release(struct rte_cryptodev *dev, uint16_t qp_id)
+{
+
+ if (dev->data->queue_pairs[qp_id] != NULL) {
+ rte_free(dev->data->queue_pairs[qp_id]);
+ dev->data->queue_pairs[qp_id] = NULL;
+ }
+
+ return 0;
+}
+
+/** Set a unique name for the queue pair based on the dev_id and qp_id */
+static int
+armv8_crypto_pmd_qp_set_unique_name(struct rte_cryptodev *dev,
+ struct armv8_crypto_qp *qp)
+{
+ unsigned int n;
+
+ n = snprintf(qp->name, sizeof(qp->name), "armv8_crypto_pmd_%u_qp_%u",
+ dev->data->dev_id, qp->id);
+
+	if (n >= sizeof(qp->name))
+ return -1;
+
+ return 0;
+}
+
+
+/** Create a ring to place processed operations on */
+static struct rte_ring *
+armv8_crypto_pmd_qp_create_processed_ops_ring(struct armv8_crypto_qp *qp,
+ unsigned int ring_size, int socket_id)
+{
+ struct rte_ring *r;
+
+ r = rte_ring_lookup(qp->name);
+ if (r) {
+ if (r->prod.size >= ring_size) {
+ ARMV8_CRYPTO_LOG_INFO(
+ "Reusing existing ring %s for processed ops",
+ qp->name);
+ return r;
+ }
+
+ ARMV8_CRYPTO_LOG_ERR(
+ "Unable to reuse existing ring %s for processed ops",
+ qp->name);
+ return NULL;
+ }
+
+ return rte_ring_create(qp->name, ring_size, socket_id,
+ RING_F_SP_ENQ | RING_F_SC_DEQ);
+}
+
+
+/** Setup a queue pair */
+static int
+armv8_crypto_pmd_qp_setup(struct rte_cryptodev *dev, uint16_t qp_id,
+ const struct rte_cryptodev_qp_conf *qp_conf,
+ int socket_id)
+{
+ struct armv8_crypto_qp *qp = NULL;
+
+ /* Free memory prior to re-allocation if needed. */
+ if (dev->data->queue_pairs[qp_id] != NULL)
+ armv8_crypto_pmd_qp_release(dev, qp_id);
+
+ /* Allocate the queue pair data structure. */
+ qp = rte_zmalloc_socket("ARMv8 PMD Queue Pair", sizeof(*qp),
+ RTE_CACHE_LINE_SIZE, socket_id);
+ if (qp == NULL)
+ return -ENOMEM;
+
+ qp->id = qp_id;
+ dev->data->queue_pairs[qp_id] = qp;
+
+ if (armv8_crypto_pmd_qp_set_unique_name(dev, qp) != 0)
+ goto qp_setup_cleanup;
+
+ qp->processed_ops = armv8_crypto_pmd_qp_create_processed_ops_ring(qp,
+ qp_conf->nb_descriptors, socket_id);
+ if (qp->processed_ops == NULL)
+ goto qp_setup_cleanup;
+
+ qp->sess_mp = dev->data->session_pool;
+
+ memset(&qp->stats, 0, sizeof(qp->stats));
+
+ return 0;
+
+qp_setup_cleanup:
+ if (qp)
+ rte_free(qp);
+
+ return -1;
+}
+
+/** Start queue pair */
+static int
+armv8_crypto_pmd_qp_start(__rte_unused struct rte_cryptodev *dev,
+ __rte_unused uint16_t queue_pair_id)
+{
+ return -ENOTSUP;
+}
+
+/** Stop queue pair */
+static int
+armv8_crypto_pmd_qp_stop(__rte_unused struct rte_cryptodev *dev,
+ __rte_unused uint16_t queue_pair_id)
+{
+ return -ENOTSUP;
+}
+
+/** Return the number of allocated queue pairs */
+static uint32_t
+armv8_crypto_pmd_qp_count(struct rte_cryptodev *dev)
+{
+ return dev->data->nb_queue_pairs;
+}
+
+/** Returns the size of the session structure */
+static unsigned
+armv8_crypto_pmd_session_get_size(struct rte_cryptodev *dev __rte_unused)
+{
+ return sizeof(struct armv8_crypto_session);
+}
+
+/** Configure the session from a crypto xform chain */
+static void *
+armv8_crypto_pmd_session_configure(struct rte_cryptodev *dev __rte_unused,
+ struct rte_crypto_sym_xform *xform, void *sess)
+{
+ if (unlikely(sess == NULL)) {
+ ARMV8_CRYPTO_LOG_ERR("invalid session struct");
+ return NULL;
+ }
+
+ if (armv8_crypto_set_session_parameters(
+ sess, xform) != 0) {
+		ARMV8_CRYPTO_LOG_ERR("failed to configure session parameters");
+ return NULL;
+ }
+
+ return sess;
+}
+
+/** Clear the memory of session so it doesn't leave key material behind */
+static void
+armv8_crypto_pmd_session_clear(struct rte_cryptodev *dev __rte_unused,
+ void *sess)
+{
+
+ /* Zero out the whole structure */
+ if (sess)
+ memset(sess, 0, sizeof(struct armv8_crypto_session));
+}
+
+struct rte_cryptodev_ops armv8_crypto_pmd_ops = {
+ .dev_configure = armv8_crypto_pmd_config,
+ .dev_start = armv8_crypto_pmd_start,
+ .dev_stop = armv8_crypto_pmd_stop,
+ .dev_close = armv8_crypto_pmd_close,
+
+ .stats_get = armv8_crypto_pmd_stats_get,
+ .stats_reset = armv8_crypto_pmd_stats_reset,
+
+ .dev_infos_get = armv8_crypto_pmd_info_get,
+
+ .queue_pair_setup = armv8_crypto_pmd_qp_setup,
+ .queue_pair_release = armv8_crypto_pmd_qp_release,
+ .queue_pair_start = armv8_crypto_pmd_qp_start,
+ .queue_pair_stop = armv8_crypto_pmd_qp_stop,
+ .queue_pair_count = armv8_crypto_pmd_qp_count,
+
+ .session_get_size = armv8_crypto_pmd_session_get_size,
+ .session_configure = armv8_crypto_pmd_session_configure,
+ .session_clear = armv8_crypto_pmd_session_clear
+};
+
+struct rte_cryptodev_ops *rte_armv8_crypto_pmd_ops = &armv8_crypto_pmd_ops;
new file mode 100644
@@ -0,0 +1,210 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ARMV8_PMD_PRIVATE_H_
+#define _RTE_ARMV8_PMD_PRIVATE_H_
+
+#define ARMV8_CRYPTO_LOG_ERR(fmt, args...) \
+ RTE_LOG(ERR, CRYPTODEV, "[%s] %s() line %u: " fmt "\n", \
+	RTE_STR(CRYPTODEV_NAME_ARMV8_PMD), \
+ __func__, __LINE__, ## args)
+
+#ifdef RTE_LIBRTE_PMD_ARMV8_CRYPTO_DEBUG
+#define ARMV8_CRYPTO_LOG_INFO(fmt, args...) \
+ RTE_LOG(INFO, CRYPTODEV, "[%s] %s() line %u: " fmt "\n", \
+	RTE_STR(CRYPTODEV_NAME_ARMV8_PMD), \
+ __func__, __LINE__, ## args)
+
+#define ARMV8_CRYPTO_LOG_DBG(fmt, args...) \
+ RTE_LOG(DEBUG, CRYPTODEV, "[%s] %s() line %u: " fmt "\n", \
+	RTE_STR(CRYPTODEV_NAME_ARMV8_PMD), \
+ __func__, __LINE__, ## args)
+
+#define ARMV8_CRYPTO_ASSERT(con) \
+do { \
+ if (!(con)) { \
+		rte_panic("%s(): " #con " condition failed, line %u", \
+			__func__, __LINE__); \
+ } \
+} while (0)
+
+#else
+#define ARMV8_CRYPTO_LOG_INFO(fmt, args...)
+#define ARMV8_CRYPTO_LOG_DBG(fmt, args...)
+#define ARMV8_CRYPTO_ASSERT(con)
+#endif
+
+#define NBBY 8 /* Number of bits in a byte */
+#define BYTE_LENGTH(x)	((x) / NBBY)	/* Number of bytes in x (round down) */
+
+/** ARMv8 operation order mode enumerator */
+enum armv8_crypto_chain_order {
+ ARMV8_CRYPTO_CHAIN_CIPHER_AUTH,
+ ARMV8_CRYPTO_CHAIN_AUTH_CIPHER,
+ ARMV8_CRYPTO_CHAIN_NOT_SUPPORTED,
+ ARMV8_CRYPTO_CHAIN_LIST_END = ARMV8_CRYPTO_CHAIN_NOT_SUPPORTED
+};
+
+/** ARMv8 cipher operation enumerator */
+enum armv8_crypto_cipher_operation {
+ ARMV8_CRYPTO_CIPHER_OP_ENCRYPT = RTE_CRYPTO_CIPHER_OP_ENCRYPT,
+ ARMV8_CRYPTO_CIPHER_OP_DECRYPT = RTE_CRYPTO_CIPHER_OP_DECRYPT,
+ ARMV8_CRYPTO_CIPHER_OP_NOT_SUPPORTED,
+ ARMV8_CRYPTO_CIPHER_OP_LIST_END = ARMV8_CRYPTO_CIPHER_OP_NOT_SUPPORTED
+};
+
+enum armv8_crypto_cipher_keylen {
+ ARMV8_CRYPTO_CIPHER_KEYLEN_128,
+ ARMV8_CRYPTO_CIPHER_KEYLEN_192,
+ ARMV8_CRYPTO_CIPHER_KEYLEN_256,
+ ARMV8_CRYPTO_CIPHER_KEYLEN_NOT_SUPPORTED,
+ ARMV8_CRYPTO_CIPHER_KEYLEN_LIST_END =
+ ARMV8_CRYPTO_CIPHER_KEYLEN_NOT_SUPPORTED
+};
+
+/** ARMv8 auth mode enumerator */
+enum armv8_crypto_auth_mode {
+ ARMV8_CRYPTO_AUTH_AS_AUTH,
+ ARMV8_CRYPTO_AUTH_AS_HMAC,
+ ARMV8_CRYPTO_AUTH_AS_CIPHER,
+ ARMV8_CRYPTO_AUTH_NOT_SUPPORTED,
+ ARMV8_CRYPTO_AUTH_LIST_END = ARMV8_CRYPTO_AUTH_NOT_SUPPORTED
+};
+
+#define CRYPTO_ORDER_MAX ARMV8_CRYPTO_CHAIN_LIST_END
+#define CRYPTO_CIPHER_OP_MAX ARMV8_CRYPTO_CIPHER_OP_LIST_END
+#define CRYPTO_CIPHER_KEYLEN_MAX ARMV8_CRYPTO_CIPHER_KEYLEN_LIST_END
+#define CRYPTO_CIPHER_MAX RTE_CRYPTO_CIPHER_LIST_END
+#define CRYPTO_AUTH_MAX RTE_CRYPTO_AUTH_LIST_END
+
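+/* Standard HMAC inner/outer pad bytes (RFC 2104) */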
+#define HMAC_IPAD_VALUE (0x36)
+#define HMAC_OPAD_VALUE (0x5C)
+
+#define SHA256_AUTH_KEY_LENGTH (BYTE_LENGTH(256))
+#define SHA256_BLOCK_SIZE (BYTE_LENGTH(512))
+
+#define SHA1_AUTH_KEY_LENGTH (BYTE_LENGTH(160))
+#define SHA1_BLOCK_SIZE (BYTE_LENGTH(512))
+
+#define SHA_AUTH_KEY_MAX SHA256_AUTH_KEY_LENGTH
+#define SHA_BLOCK_MAX SHA256_BLOCK_SIZE
+
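+/*
+ * Combined-mode function: (cipher src, cipher dst, auth src, digest dst,
+ * data length, pointer to the argument structure with keys/IV/HMAC pads).
+ */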
+typedef void (*crypto_func_t)(uint8_t *, uint8_t *, uint8_t *, uint8_t *,
+ uint64_t, crypto_arg_t *);
+
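+/* Key schedule function: (session key schedule out, user cipher key in) */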
+typedef void (*crypto_key_sched_t)(uint8_t *, const uint8_t *);
+
+/** private data structure for each ARMv8 crypto device */
+struct armv8_crypto_private {
+ unsigned int max_nb_qpairs;
+ /**< Max number of queue pairs */
+ unsigned int max_nb_sessions;
+ /**< Max number of sessions */
+};
+
+/** ARMv8 crypto queue pair */
+struct armv8_crypto_qp {
+ uint16_t id;
+ /**< Queue Pair Identifier */
+ char name[RTE_CRYPTODEV_NAME_LEN];
+ /**< Unique Queue Pair Name */
+ struct rte_ring *processed_ops;
+	/**< Ring for placing processed operations */
+ struct rte_mempool *sess_mp;
+ /**< Session Mempool */
+ struct rte_cryptodev_stats stats;
+ /**< Queue pair statistics */
+} __rte_cache_aligned;
+
+/** ARMv8 crypto private session structure */
+struct armv8_crypto_session {
+ enum armv8_crypto_chain_order chain_order;
+ /**< chain order mode */
+ crypto_func_t crypto_func;
+ /**< cryptographic function to use for this session */
+
+ /** Cipher Parameters */
+ struct {
+ enum rte_crypto_cipher_operation direction;
+ /**< cipher operation direction */
+ enum rte_crypto_cipher_algorithm algo;
+ /**< cipher algorithm */
+ int iv_len;
+ /**< IV length */
+
+ struct {
+ uint8_t data[256];
+ /**< key data */
+ size_t length;
+ /**< key length in bytes */
+ } key;
+
+ crypto_key_sched_t key_sched;
+ /**< Key schedule function */
+ } cipher;
+
+ /** Authentication Parameters */
+ struct {
+ enum rte_crypto_auth_operation operation;
+ /**< auth operation generate or verify */
+ enum armv8_crypto_auth_mode mode;
+ /**< auth operation mode */
+
+ union {
+ struct {
+ /* Add data if needed */
+ } auth;
+
+ struct {
+ uint8_t i_key_pad[SHA_BLOCK_MAX]
+ __rte_cache_aligned;
+ /**< inner pad (max supported block length) */
+ uint8_t o_key_pad[SHA_BLOCK_MAX]
+ __rte_cache_aligned;
+ /**< outer pad (max supported block length) */
+ uint8_t key[SHA_AUTH_KEY_MAX];
+ /**< HMAC key (max supported length)*/
+ } hmac;
+ };
+ } auth;
+
+} __rte_cache_aligned;
+
+/** Set and validate ARMv8 crypto session parameters */
+extern int armv8_crypto_set_session_parameters(
+ struct armv8_crypto_session *sess,
+ const struct rte_crypto_sym_xform *xform);
+
+/** device specific operations function pointer structure */
+extern struct rte_cryptodev_ops *rte_armv8_crypto_pmd_ops;
+
+#endif /* _RTE_ARMV8_PMD_PRIVATE_H_ */
new file mode 100644
@@ -0,0 +1,3 @@
+DPDK_17.02 {
+ local: *;
+};
@@ -66,6 +66,8 @@
/**< KASUMI PMD device name */
#define CRYPTODEV_NAME_ZUC_PMD crypto_zuc
/**< KASUMI PMD device name */
+#define CRYPTODEV_NAME_ARMV8_PMD crypto_armv8
+/**< ARMv8 crypto PMD device name */
/** Crypto device type */
enum rte_cryptodev_type {
@@ -77,6 +79,7 @@ enum rte_cryptodev_type {
RTE_CRYPTODEV_KASUMI_PMD, /**< KASUMI PMD */
RTE_CRYPTODEV_ZUC_PMD, /**< ZUC PMD */
RTE_CRYPTODEV_OPENSSL_PMD, /**< OpenSSL PMD */
+ RTE_CRYPTODEV_ARMV8_PMD, /**< ARMv8 crypto PMD */
};
extern const char **rte_cyptodev_names;
@@ -145,6 +145,9 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KASUMI) += -lrte_pmd_kasumi
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_KASUMI) += -L$(LIBSSO_KASUMI_PATH)/build -lsso_kasumi
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ZUC) += -lrte_pmd_zuc
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ZUC) += -L$(LIBSSO_ZUC_PATH)/build -lsso_zuc
+ifeq ($(CONFIG_RTE_ARCH_ARM64),y)
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_ARMV8_CRYPTO) += -lrte_pmd_armv8
+endif
endif # CONFIG_RTE_LIBRTE_CRYPTODEV
endif # !CONFIG_RTE_BUILD_SHARED_LIBS