new file mode 100644
@@ -0,0 +1,1719 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Enc/Auth Primitive = aes128cbc/sha1_hmac
+ *
+ * Operations:
+ *
+ * out = encrypt-AES128CBC(in)
+ * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | out))
+ *
+ * Prototype:
+ * void aes128cbc_sha1_hmac(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * aes128cbc_sha1_hmac(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 -- temp register for SHA1
+ * v20 -- ABCD copy (q20)
+ * v21 -- sha working state (q21)
+ * v22 -- sha working state (q22)
+ * v23 -- temp register for SHA1
+ * v24 -- sha state ABCD
+ * v25 -- sha state E
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16, otherwise the results
+ * are undefined. For partial AES blocks the user is required to pad the
+ * input out to a multiple of 16 bytes.
+ *
+ * Short lengths (< 12 AES blocks) take a separate, less optimized path.
+ */
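+
+/*
+ * Reference model (informative only, not assembled): a rough C sketch of
+ * what this routine computes, assuming hypothetical helpers
+ * aes128cbc_encrypt() and sha1_update()/sha1_final() with the obvious
+ * semantics. The i_key_pad/o_key_pad fields hold the SHA1 state after
+ * hashing one 64-byte key pad block each.
+ *
+ *   aes128cbc_encrypt(csrc, cdst, len, arg->cipher.key, arg->cipher.iv);
+ *   state = arg->digest.hmac.i_key_pad;   // SHA1 state of i_key_pad
+ *   sha1_update(&state, cdst, len);       // inner hash over ciphertext
+ *   sha1_final(&state, inner);            // 20-byte inner digest
+ *   state = arg->digest.hmac.o_key_pad;   // SHA1 state of o_key_pad
+ *   sha1_update(&state, inner, 20);
+ *   sha1_final(&state, ddst);             // final HMAC written to ddst
+ */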
+
+ .file "aes128cbc_sha1_hmac.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global aes128cbc_sha1_hmac
+ .type aes128cbc_sha1_hmac,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+aes128cbc_sha1_hmac:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ /* init ABCD, E */
+ ld1 {v24.4s, v25.4s},[x6]
+ /* save pointer to o_key_pad partial hash */
+ ldr x6, [x5, #HMAC_OKEYPAD]
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,12 /* no main loop if <12 */
+ b.lt .Lshort_cases /* branch if < 12 */
+
+ /* protect registers */
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ /* proceed */
+ ld1 {v3.16b},[x5] /* get 1st ivec */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ mov x11,x4 /* len -> x11 needed at end */
+ lsr x12,x11,6 /* total_blocks */
+/*
+ * now we can do the loop prolog, 1st aes sequence of 4 blocks
+ */
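+/*
+ * CBC chaining recap: with c[-1] = IV, each ciphertext block is
+ *   c[i] = AES128-Encrypt(key, p[i] ^ c[i-1])
+ * so the four xforms below are strictly serial; together they produce the
+ * first 64 bytes of ciphertext that the sha rounds in the main loop will
+ * consume.
+ */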
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ eor v0.16b,v0.16b,v3.16b /* xor w/ ivec (modeop) */
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ aesmc v0.16b,v0.16b
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ aese v0.16b,v9.16b
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ /* base address for sha round consts */
+ adr x8,.Lrcon
+ aesmc v0.16b,v0.16b
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ aese v0.16b,v10.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x0],16
+ aesmc v0.16b,v0.16b
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ aese v0.16b,v16.16b
+ mov x4,x1 /* sha_ptr_in = aes_ptr_out */
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ prfm PLDL1KEEP,[x8,0*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ prfm PLDL1KEEP,[x8,2*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ prfm PLDL1KEEP,[x8,4*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ prfm PLDL1KEEP,[x8,6*64] /* rcon */
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ prfm PLDL1KEEP,[x8,8*64] /* rcon */
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+
+ eor v2.16b,v2.16b,v1.16b /* xor w/ivec (modeop) */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ mov x2,x0 /* lead_ptr = aes_ptr_in */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ prfm PLDL1KEEP,[x8,10*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ prfm PLDL1KEEP,[x8,12*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ prfm PLDL1KEEP,[x8,14*64] /* rcon */
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+
+ eor v3.16b,v3.16b,v2.16b /* xor w/ ivec (modeop) */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ /* main_blocks = total_blocks - 1 */
+ sub x7,x12,1
+ and x13,x10,3 /* aes_blocks_left */
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+
+/*
+ * Note, aes_blocks_left := number after
+ * the main (sha) block is done. Can be 0
+ */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+/*
+ * main combined loop CBC
+ */
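+/*
+ * Each iteration encrypts four more CBC blocks (v0-v3) while running the
+ * four sha1 quads (80 rounds) over the 64 bytes of ciphertext produced by
+ * the previous iteration, interleaving aese/aesmc with sha1c/sha1p/sha1m
+ * so that both crypto pipelines stay busy.
+ */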
+.Lmain_loop:
+/*
+ * Because mov, rev32 and eor each have a busy cycle, this takes longer
+ * than it looks. That's OK since there are 6 cycles before we can use
+ * the load anyway; this goes as fast as it can without SW pipelining
+ * (too complicated given the code size).
+ */
+ rev32 v26.16b,v0.16b /* fix endian w0, aes res 0 */
+ /* next aes block, update aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v1.16b /* fix endian w1, aes res 1 */
+ /* pref next aes_ptr_out, streaming */
+ prfm PLDL1KEEP,[x1,64]
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ aese v0.16b,v8.16b
+ rev32 v28.16b,v2.16b /* fix endian w2, aes res 2 */
+ aesmc v0.16b,v0.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x0],16
+ aese v0.16b,v9.16b
+ add v19.4s,v4.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v0.16b,v10.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ add v23.4s,v4.4s,v27.4s
+ /* no place to get rid of this stall */
+ rev32 v29.16b,v3.16b /* fix endian w3, aes res 3 */
+ aesmc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aese v0.16b,v12.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aese v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v0.16b,v15.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aese v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ sha1h s21,s24
+ aese v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ eor v1.16b,v1.16b,v0.16b /* mode op 1 xor w/prev value */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ aese v1.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v10.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ sha1h s22,s24
+ aese v1.16b,v12.16b
+ sha1p q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aese v1.16b,v13.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v14.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v1.16b,v15.16b
+ sha1h s22,s24
+ add v23.4s,v5.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v16.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v1.16b,v17.16b
+ sha1h s21,s24
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ sha1p q24,s22,v23.4s
+ add v23.4s,v6.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+/* mode op 2 */
+ eor v2.16b,v2.16b,v1.16b /* mode of 2 xor w/prev value */
+
+/* aes xform 2, sha quad 2 */
+ aese v2.16b,v8.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ aesmc v2.16b,v2.16b
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v2.16b,v9.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ aesmc v2.16b,v2.16b
+
+ aese v2.16b,v11.16b
+ add v19.4s,v6.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aese v2.16b,v12.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aese v2.16b,v13.16b
+ sha1su1 v29.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ aese v2.16b,v14.16b
+ add v23.4s,v6.4s,v27.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v2.16b,v15.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v16.16b
+ add v19.4s,v6.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ sha1su1 v26.4s,v29.4s
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+
+/* mode op 3 */
+ eor v3.16b,v3.16b,v2.16b /* xor w/prev value */
+
+/* aes xform 3, sha quad 3 */
+ aese v3.16b,v8.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aese v3.16b,v9.16b
+ sha1h s21,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aese v3.16b,v10.16b
+ sha1su1 v29.4s,v28.4s
+ aesmc v3.16b,v3.16b
+ add v19.4s,v7.4s,v26.4s
+ aese v3.16b,v11.16b
+ sha1h s22,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aese v3.16b,v13.16b
+ sha1h s21,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aese v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesmc v3.16b,v3.16b
+ add v19.4s,v7.4s,v28.4s
+ aese v3.16b,v15.16b
+ sha1h s22,s24
+ aesmc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ add v23.4s,v7.4s,v29.4s
+ aese v3.16b,v17.16b
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ cbnz x7,.Lmain_loop /* loop if more to do */
+
+
+/*
+ * epilog, process remaining aes blocks and b-2 sha block
+ * do this inline (no loop) to overlap with the sha part
+ * note there are 0-3 aes blocks left.
+ */
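+/*
+ * Each leftover AES block is overlapped with one quad of this sha block;
+ * the .Lbm2fromQ0-3 labels let execution fall into the remaining quads
+ * once the AES work runs out.
+ */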
+ rev32 v26.16b,v0.16b /* fix endian w0 */
+ rev32 v27.16b,v1.16b /* fix endian w1 */
+ rev32 v28.16b,v2.16b /* fix endian w2 */
+ rev32 v29.16b,v3.16b /* fix endian w3 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ cbz x13, .Lbm2fromQ0 /* skip if none left */
+ /* local copy of aes_blocks_left */
+ subs x14,x13,1
+
+/*
+ * mode op 0
+ * read next aes block, update aes_ptr_in
+ */
+ ld1 {v0.16b},[x0],16
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0, sha quad 0 */
+ add v19.4s,v4.4s,v26.4s
+ aese v0.16b,v8.16b
+ add v23.4s,v4.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aese v0.16b,v9.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aese v0.16b,v10.16b
+ sha1su1 v26.4s,v29.4s
+ add v19.4s,v4.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ sha1h s21,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aese v0.16b,v12.16b
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v4.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ sha1h s22,s24
+ aesmc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aese v0.16b,v14.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ sha1h s21,s24
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ sha1c q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ /* if aes_blocks_left_count == 0 */
+ beq .Lbm2fromQ1
+/*
+ * mode op 1
+ * read next aes block, update aes_ptr_in
+ */
+ ld1 {v1.16b},[x0],16
+
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1, sha quad 1 */
+ add v23.4s,v5.4s,v27.4s
+ aese v1.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aese v1.16b,v9.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v10.16b
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v5.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesmc v1.16b,v1.16b
+ subs x14,x14,1 /* dec counter */
+ aese v1.16b,v11.16b
+ sha1h s22,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v12.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v5.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ sha1h s21,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aese v1.16b,v14.16b
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ sha1h s22,s24
+ aesmc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aese v1.16b,v16.16b
+ sha1su1 v26.4s,v29.4s
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ /* if aes_blocks_left_count == 0 */
+ beq .Lbm2fromQ2
+
+/*
+ * mode op 2
+ * read next aes block, update aes_ptr_in
+ */
+ ld1 {v2.16b},[x0],16
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2, sha quad 2 */
+ add v19.4s,v6.4s,v28.4s
+ aese v2.16b,v8.16b
+ add v23.4s,v6.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aese v2.16b,v9.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aese v2.16b,v12.16b
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ sha1h s22,s24
+ aesmc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aese v2.16b,v14.16b
+ sha1su1 v26.4s,v29.4s
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ sha1h s21,s24
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ sha1m q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ /* join common code at Quad 3 */
+ b .Lbm2fromQ3
+
+/*
+ * Now comes the second-to-last (b-2) sha block before the final one.
+ * Execution falls into the appropriate quad below, depending on how many
+ * aes blocks were left; if there were none, the whole block is executed.
+ */
+.Lbm2fromQ0:
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+.Lbm2fromQ1:
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+.Lbm2fromQ2:
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+.Lbm2fromQ3:
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ eor v26.16b,v26.16b,v26.16b /* zero reg */
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ eor v27.16b,v27.16b,v27.16b /* zero reg */
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ eor v28.16b,v28.16b,v28.16b /* zero reg */
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+/*
+ * Now we can do the final block, either all padding or 1-3 aes blocks.
+ * len is in x11, aes_blocks_left in x13. (The aes data setup done here
+ * could be moved into the last aes section.)
+ */
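+/*
+ * SHA1 padding layout of this last 64-byte block (sketch):
+ *   0-48 bytes of data | 0x80 | zero fill | len_hi (32) | len_lo (32)
+ * where the length is the total number of message bits, big-endian, and
+ * includes the one 64-byte i_key_pad block hashed before entry (hence the
+ * "+ 64" below).
+ */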
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ mov w15,0x80 /* that's the 1 of the pad */
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x9,x11,0xffffffff /* len_lo */
+ mov v26.b[0],w15 /* assume block 0 is dst */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x9,x9,3 /* len_lo in bits */
+ eor v29.16b,v29.16b,v29.16b /* zero reg */
+/*
+ * places the 0x80 in the correct block, copies the appropriate data
+ */
+ cbz x13,.Lpad100 /* no data to get */
+ mov v26.16b,v0.16b
+ sub x14,x13,1 /* dec amount left */
+ mov v27.b[0],w15 /* assume block 1 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v27.16b,v1.16b
+ sub x14,x14,1 /* dec amount left */
+ mov v28.b[0],w15 /* assume block 2 is dst */
+ cbz x14,.Lpad100 /* branch if done */
+ mov v28.16b,v2.16b
+ mov v29.b[3],w15 /* block 3, doesn't get rev'd */
+/*
+ * get len_hi and len_lo in bits according to
+ * len_hi = (uint32_t)(((len>>32) & 0xffffffff)<<3); (x12)
+ * len_lo = (uint32_t)((len & 0xffffffff)<<3); (x9)
+ * this is done before the if/else above
+ */
+.Lpad100:
+ mov v29.s[3],w9 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+/*
+ * note that q29 is already built in the correct format, so no swap required
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+/*
+ * do last sha of pad block
+ */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v26.4s,v24.4s,v20.4s
+ add v27.4s,v25.4s,v21.4s
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+ /* load o_key_pad partial hash */
+ ld1 {v24.16b,v25.16b}, [x6]
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ /* move length to the end of the block */
+ mov v29.s[3], w11
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
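+/*
+ * Outer hash: resume from the precomputed SHA1(o_key_pad) state and hash
+ * one final block holding the 20-byte inner digest (words 0-4), the 0x80
+ * pad byte, zeros, and the total length (64 + 20) * 8 = 672 bits.
+ */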
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+/*
+ * These are the short cases (less efficient), here used for 1-11 aes blocks.
+ * x10 = aes_blocks
+ */
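+/*
+ * The short path encrypts up to four CBC blocks per pass of .Lshort_loop,
+ * gathers them into one sha1 message block (v26-v29), then pads and
+ * finishes the HMAC the same way the main path does.
+ */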
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
+
+ ld1 {v3.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ mov w15,0x80 /* sha padding word */
+
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+
+ mov x9,x8 /* top of rcon */
+
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+/*
+ * the idea in the short loop (at least 1 block) is to break out with the
+ * padding already in place, except for the final length words.
+ */
+.Lshort_loop:
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ eor v0.16b,v0.16b,v3.16b /* xor w/ prev value */
+
+/* aes xform 0 */
+ aese v0.16b,v8.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v9.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v10.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v11.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v12.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v13.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v14.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v15.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ /* assume this was final block */
+ mov v27.b[3],w15
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ /* load res to sha 0, endian swap */
+ rev32 v26.16b,v0.16b
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x0],16
+ eor v1.16b,v1.16b,v0.16b /* xor w/ prev value */
+
+/* aes xform 1 */
+ aese v1.16b,v8.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v9.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v10.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v11.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v12.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v13.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v14.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v15.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ /* assume this was final block */
+ mov v28.b[3],w15
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ /* load res to sha 0, endian swap */
+ rev32 v27.16b,v1.16b
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x0],16
+ eor v2.16b,v2.16b,v1.16b /* xor w/ prev value */
+
+/* aes xform 2 */
+ aese v2.16b,v8.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v9.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v10.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v11.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v12.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v13.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v14.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v15.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v16.16b
+ aesmc v2.16b,v2.16b
+ aese v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ /* assume this was final block */
+ mov v29.b[3],w15
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ /* load res to sha 0, endian swap */
+ rev32 v28.16b,v2.16b
+ sub x10,x10,1 /* dec num_blocks */
+ cbz x10,.Lpost_short_loop /* break if no more */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x0],16
+ eor v3.16b,v3.16b,v2.16b /* xor w/prev value */
+
+/* aes xform 3 */
+ aese v3.16b,v8.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v9.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v10.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v11.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v12.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v13.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v14.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v15.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v16.16b
+ aesmc v3.16b,v3.16b
+ aese v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ /* load res to sha 0, endian swap */
+ rev32 v29.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+/*
+ * now we have the sha1 to do for these 4 aes blocks
+ */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ eor v26.16b,v26.16b,v26.16b /* zero sha src 0 */
+ eor v27.16b,v27.16b,v27.16b /* zero sha src 1 */
+ eor v28.16b,v28.16b,v28.16b /* zero sha src 2 */
+ eor v29.16b,v29.16b,v29.16b /* zero sha src 3 */
+ /* assume this was final block */
+ mov v26.b[3],w15
+
+ sub x10,x10,1 /* dec num_blocks */
+ cbnz x10,.Lshort_loop /* keep looping if more */
+/*
+ * there are between 0 and 3 aes blocks in the final sha1 blocks
+ */
+.Lpost_short_loop:
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x13,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x13,x13,3 /* len_lo in bits */
+
+ mov v29.s[3],w13 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
+
+ /* do final block */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v26.4s,v24.4s,v20.4s
+ add v27.4s,v25.4s,v21.4s
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+ /* load o_key_pad partial hash */
+ ld1 {v24.16b,v25.16b}, [x6]
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+
+ /* Set padding 1 to the first reg */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+
+ mov x11, #64+20 /* size of o_key_pad + inner hash */
+ lsl x11, x11, 3
+ /* move length to the end of the block */
+ mov v29.s[3], w11
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+
+ st1 {v24.16b}, [x3],16
+ st1 {v25.s}[0], [x3]
+
+ mov x9,sp
+ add sp,sp,8*16
+ ld1 {v8.16b - v11.16b},[x9],4*16
+ ld1 {v12.16b - v15.16b},[x9]
+
+ ret
+
+ .size aes128cbc_sha1_hmac, .-aes128cbc_sha1_hmac
new file mode 100644
@@ -0,0 +1,1650 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Combined Auth/Dec Primitive = sha1_hmac/aes128cbc
+ *
+ * Operations:
+ *
+ * out = decrypt-AES128CBC(in)
+ * return_hash_ptr = SHA1(o_key_pad | SHA1(i_key_pad | in))
+ *
+ * Prototype:
+ *
+ * void sha1_hmac_aes128cbc_dec(uint8_t *csrc, uint8_t *cdst,
+ * uint8_t *dsrc, uint8_t *ddst,
+ * uint64_t len, crypto_arg_t *arg)
+ *
+ * Registers used:
+ *
+ * sha1_hmac_aes128cbc_dec(
+ * csrc, x0 (cipher src address)
+ * cdst, x1 (cipher dst address)
+ * dsrc, x2 (digest src address - ignored)
+ * ddst, x3 (digest dst address)
+ * len, x4 (length)
+ * arg x5 :
+ * arg->cipher.key (round keys)
+ * arg->cipher.iv (initialization vector)
+ * arg->digest.hmac.i_key_pad (partially hashed i_key_pad)
+ * arg->digest.hmac.o_key_pad (partially hashed o_key_pad)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v0 - v3 -- aes results
+ * v4 - v7 -- round consts for sha
+ * v8 - v18 -- round keys
+ * v19 -- temp register for SHA1
+ * v20 -- ABCD copy (q20)
+ * v21 -- sha working state (q21)
+ * v22 -- sha working state (q22)
+ * v23 -- temp register for SHA1
+ * v24 -- sha state ABCD
+ * v25 -- sha state E
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16, otherwise the results
+ * are undefined. For partial AES blocks the user is required to pad the
+ * input out to a multiple of 16 bytes.
+ *
+ * Short lengths (< 16 AES blocks) take a less optimized path, though it
+ * is still somewhat optimized, and more so than the enc/auth version.
+ */
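+
+/*
+ * Reference model (informative only, not assembled): a rough C sketch of
+ * what this routine computes, assuming hypothetical helpers
+ * aes128cbc_decrypt() and sha1_update()/sha1_final() with the obvious
+ * semantics. Note the digest is taken over the ciphertext input, not over
+ * the decrypted output.
+ *
+ *   state = arg->digest.hmac.i_key_pad;   // SHA1 state of i_key_pad
+ *   sha1_update(&state, csrc, len);       // inner hash over ciphertext
+ *   sha1_final(&state, inner);            // 20-byte inner digest
+ *   state = arg->digest.hmac.o_key_pad;   // SHA1 state of o_key_pad
+ *   sha1_update(&state, inner, 20);
+ *   sha1_final(&state, ddst);             // final HMAC written to ddst
+ *   aes128cbc_decrypt(csrc, cdst, len, arg->cipher.key, arg->cipher.iv);
+ */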
+ .file "sha1_hmac_aes128cbc_dec.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .global sha1_hmac_aes128cbc_dec
+ .type sha1_hmac_aes128cbc_dec,%function
+
+
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+sha1_hmac_aes128cbc_dec:
+/* fetch args */
+ ldr x6, [x5, #HMAC_IKEYPAD]
+ /* init ABCD, E */
+ ld1 {v24.4s, v25.4s},[x6]
+ /* save pointer to o_key_pad partial hash */
+ ldr x6, [x5, #HMAC_OKEYPAD]
+
+ ldr x2, [x5, #CIPHER_KEY]
+ ldr x5, [x5, #CIPHER_IV]
+/*
+ * init sha state, prefetch, check for small cases.
+ * Note that the output is prefetched as a load, for the in-place case
+ */
+ prfm PLDL1KEEP,[x0,0] /* pref next *in */
+ prfm PLDL1KEEP,[x1,0] /* pref next aes_ptr_out */
+ lsr x10,x4,4 /* aes_blocks = len/16 */
+ cmp x10,16 /* no main loop if <16 */
+ blt .Lshort_cases /* branch if < 16 */
+
+/* protect registers */
+ sub sp,sp,8*16
+ mov x11,x4 /* len -> x11 needed at end */
+ mov x7,sp /* copy for address mode */
+ ld1 {v30.16b},[x5] /* get 1st ivec */
+ lsr x12,x11,6 /* total_blocks (sha) */
+ mov x4,x0 /* sha_ptr_in = *in */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ ld1 {v29.16b},[x4],16 /* next w3 */
+
+/*
+ * now we can do the loop prolog, 1st sha1 block
+ */
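+/*
+ * In the decrypt direction the digest is computed over the ciphertext, so
+ * the sha schedule words (w0-w3 above) are read straight from the input
+ * stream and run one 64-byte block ahead of the AES decryption; the prolog
+ * below hashes that first block while the round keys are still loading.
+ */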
+ prfm PLDL1KEEP,[x0,64] /* pref next aes_ptr_in */
+ prfm PLDL1KEEP,[x1,64] /* pref next aes_ptr_out */
+ /* base address for sha round consts */
+ adr x8,.Lrcon
+/*
+ * do the first sha1 block on the input (the digest covers the ciphertext)
+ */
+ mov v20.16b,v24.16b /* init working ABCD */
+ st1 {v8.16b},[x7],16
+ st1 {v9.16b},[x7],16
+ rev32 v26.16b,v26.16b /* endian swap w0 */
+ st1 {v10.16b},[x7],16
+ rev32 v27.16b,v27.16b /* endian swap w1 */
+ st1 {v11.16b},[x7],16
+ rev32 v28.16b,v28.16b /* endian swap w2 */
+ st1 {v12.16b},[x7],16
+ rev32 v29.16b,v29.16b /* endian swap w3 */
+ st1 {v13.16b},[x7],16
+ mov x9,x8 /* top of rcon */
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+ add v19.4s,v4.4s,v26.4s
+ st1 {v14.16b},[x7],16
+ add v23.4s,v4.4s,v27.4s
+ st1 {v15.16b},[x7],16
+/* quad 0 */
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v8.16b},[x2],16 /* rk[0] */
+ sha1c q24,s25,v19.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v9.16b},[x2],16 /* rk[1] */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ ld1 {v10.16b},[x2],16 /* rk[2] */
+ sha1c q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v4.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v11.16b},[x2],16 /* rk[3] */
+ sha1c q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v12.16b},[x2],16 /* rk[4] */
+ sha1c q24,s21,v19.4s
+ add v19.4s,v5.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v13.16b},[x2],16 /* rk[5] */
+/* quad 1 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v14.16b},[x2],16 /* rk[6] */
+ sha1p q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+ add v23.4s,v5.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v15.16b},[x2],16 /* rk[7] */
+ sha1p q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v16.16b},[x2],16 /* rk[8] */
+ sha1p q24,s21,v19.4s
+ sha1su1 v26.4s,v29.4s
+ ld1 {v17.16b},[x2],16 /* rk[9] */
+ add v19.4s,v6.4s,v28.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v18.16b},[x2],16 /* rk[10] */
+ sha1p q24,s22,v23.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v23.4s,v6.4s,v29.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ add v19.4s,v6.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ sha1p q24,s21,v19.4s
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ sha1p q24,s22,v23.4s
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ sha1p q24,s21,v19.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ sha1p q24,s22,v23.4s
+
+/*
+ * aes_blocks_left := number remaining after the main (sha) block is done.
+ * Can be 0; note we account for the extra unwind in main_blocks.
+ */
+ sub x7,x12,2 /* main_blocks = total_blocks - 2 */
+ add v24.4s,v24.4s,v20.4s
+ and x13,x10,3 /* aes_blocks_left */
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ add v25.4s,v25.4s,v21.4s
+ add x2,x0,128 /* lead_ptr = *in */
+ /* next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+/*
+ * main combined loop CBC, can be used by auth/enc version
+ */
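+/*
+ * CBC decrypt chaining (sketch): with c[-1] = IV,
+ *   p[i] = AES128-Decrypt(key, c[i]) ^ c[i-1]
+ * Unlike encryption, block i does not depend on the result of block i-1
+ * (only on its ciphertext, kept in v30/v31), so the four aesd xforms can
+ * be freely interleaved with the sha1 quads over the next sha block.
+ */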
+.Lmain_loop:
+/*
+ * Because mov, rev32 and eor each have a busy cycle,
+ * this takes longer than it looks.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ /* pref next aes_ptr_out, streaming */
+ prfm PLDL1KEEP,[x1,64]
+/* aes xform 0, sha quad 0 */
+ aesd v0.16b,v8.16b
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ add v19.4s,v4.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v0.16b,v10.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ add v23.4s,v4.4s,v27.4s
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ /* read next aes block, no update */
+ ld1 {v1.16b},[x0]
+ aesimc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aesd v0.16b,v12.16b
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v15.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aesd v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ /* get next aes block, with update */
+ ld1 {v30.16b},[x0],16
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ add v23.4s,v5.4s,v27.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ aesd v1.16b,v8.16b
+ sha1h s21,s24
+ add v19.4s,v5.4s,v28.4s
+ sha1p q24,s22,v23.4s
+ aesimc v1.16b,v1.16b
+ sha1su1 v27.4s,v26.4s
+ aesd v1.16b,v9.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v10.16b
+ /* read next aes block, no update */
+ ld1 {v2.16b},[x0]
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v28.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1h s21,s24
+ aesd v1.16b,v12.16b
+ sha1p q24,s22,v23.4s
+ sha1su1 v29.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ sha1h s22,s24
+ add v19.4s,v5.4s,v26.4s
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v14.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v23.4s,v5.4s,v27.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v16.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v6.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ add v23.4s,v6.4s,v29.4s
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+/* aes xform 2, sha quad 2 */
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v2.16b,v8.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v9.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v2.16b,v10.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v11.16b
+ sha1su1 v29.4s,v28.4s
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v13.16b
+ sha1su1 v26.4s,v29.4s
+ add v23.4s,v6.4s,v27.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v2.16b,v2.16b
+ /* read next aes block, no update */
+ ld1 {v3.16b},[x0]
+ aesd v2.16b,v14.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v15.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v6.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ sha1h s22,s24
+ aesd v2.16b,v16.16b
+ sha1m q24,s21,v19.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v2.16b,v17.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v7.4s,v29.4s
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ add v19.4s,v7.4s,v26.4s
+ eor v2.16b,v2.16b,v30.16b /* mode of 2 xor w/prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+/* aes xform 3, sha quad 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aesd v3.16b,v9.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v10.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v3.16b,v3.16b
+ sha1su1 v29.4s,v28.4s
+ aesd v3.16b,v11.16b
+ sha1h s22,s24
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aesd v3.16b,v13.16b
+ sha1h s21,s24
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v14.16b
+ sub x7,x7,1 /* dec block count */
+ aesimc v3.16b,v3.16b
+ add v19.4s,v7.4s,v28.4s
+ aesd v3.16b,v15.16b
+ ld1 {v0.16b},[x0] /* next aes block, no update */
+ sha1h s22,s24
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v29.4s
+ aesd v3.16b,v17.16b
+ sha1h s21,s24
+ ld1 {v29.16b},[x4],16 /* next w3 */
+ sha1p q24,s22,v23.4s
+ add v24.4s,v24.4s,v20.4s
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ /* next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+ add v25.4s,v25.4s,v21.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ /* loop if more to do */
+ cbnz x7,.Lmain_loop
+/*
+ * Now the loop epilog. Since the reads for sha have already been done in
+ * advance, we need one extra unwind; this is why the test for the short
+ * cases is 16 and not 12.
+ *
+ * The unwind is just the main loop body without the tests or final reads.
+ */
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ prfm PLDL1KEEP,[x2,64] /* pref next lead_ptr */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ /* pref next aes_ptr_out, streaming */
+ prfm PLDL1KEEP,[x1,64]
+/* aes xform 0, sha quad 0 */
+ aesd v0.16b,v8.16b
+ add v19.4s,v4.4s,v26.4s
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ aesimc v0.16b,v0.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ /* read next aes block, no update */
+ ld1 {v1.16b},[x0]
+ aesd v0.16b,v9.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ add v23.4s,v4.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s25,v19.4s
+ aesd v0.16b,v11.16b
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+ aesimc v0.16b,v0.16b
+ sha1su1 v26.4s,v29.4s
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ add v19.4s,v4.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ add v23.4s,v4.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v15.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s21,v19.4s
+ aesd v0.16b,v16.16b
+ sha1su1 v28.4s,v27.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1c q24,s22,v23.4s
+ add v19.4s,v4.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ eor v0.16b,v0.16b,v18.16b /* final res 0 */
+ add v23.4s,v5.4s,v27.4s
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su1 v26.4s,v29.4s
+/* aes xform 1, sha quad 1 */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v1.16b,v8.16b
+ sha1h s21,s24
+ add v19.4s,v5.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ add v23.4s,v5.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ aesd v1.16b,v10.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ /* read next aes block, no update */
+ ld1 {v2.16b},[x0]
+ aesimc v1.16b,v1.16b
+ sha1h s22,s24
+ aesd v1.16b,v11.16b
+ sha1p q24,s21,v19.4s
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v1.16b,v13.16b
+ sha1h s21,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s22,v23.4s
+ aesd v1.16b,v14.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su1 v29.4s,v28.4s
+ aesimc v1.16b,v1.16b
+ add x2,x2,64 /* bump lead_ptr */
+ aesd v1.16b,v15.16b
+ add v23.4s,v5.4s,v27.4s
+ aesimc v1.16b,v1.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v1.16b,v16.16b
+ sha1h s22,s24
+ aesimc v1.16b,v1.16b
+ sha1p q24,s21,v19.4s
+ aesd v1.16b,v17.16b
+ add v19.4s,v6.4s,v28.4s
+ eor v1.16b,v1.16b,v18.16b /* res xf 1 */
+ sha1su1 v26.4s,v29.4s
+ eor v1.16b,v1.16b,v31.16b /* mode op 1 xor w/prev value */
+ sha1su0 v27.4s,v28.4s,v29.4s
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ add v23.4s,v6.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* mode op 2 */
+/* aes xform 2, sha quad 2 */
+ aesd v2.16b,v8.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v10.16b
+ sha1su1 v28.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ add v19.4s,v6.4s,v26.4s
+ aesd v2.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ sha1h s21,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s22,v23.4s
+ aesd v2.16b,v13.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ /* read next aes block, no update */
+ ld1 {v3.16b},[x0]
+ aesd v2.16b,v14.16b
+ add v23.4s,v6.4s,v27.4s
+ aesimc v2.16b,v2.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v2.16b,v15.16b
+ sha1h s22,s24
+ aesimc v2.16b,v2.16b
+ sha1m q24,s21,v19.4s
+ aesd v2.16b,v16.16b
+ add v19.4s,v6.4s,v28.4s
+ aesimc v2.16b,v2.16b
+ sha1su1 v26.4s,v29.4s
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+	eor	v2.16b,v2.16b,v30.16b	/* mode op 2: xor w/ prev value */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ add v23.4s,v7.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su1 v28.4s,v27.4s
+/* mode op 3 */
+/* aes xform 3, sha quad 3 */
+ aesd v3.16b,v8.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v3.16b,v3.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+ aesd v3.16b,v9.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v10.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v3.16b,v3.16b
+ add v19.4s,v7.4s,v26.4s
+ aesd v3.16b,v11.16b
+ sha1h s22,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v12.16b
+ /* read first aes block, no bump */
+ ld1 {v0.16b},[x0]
+ aesimc v3.16b,v3.16b
+ add v23.4s,v7.4s,v27.4s
+ aesd v3.16b,v13.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ add v19.4s,v7.4s,v28.4s
+ aesd v3.16b,v14.16b
+ sha1h s22,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s21,v19.4s
+ aesd v3.16b,v15.16b
+ add v23.4s,v7.4s,v29.4s
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ sha1h s21,s24
+ aesimc v3.16b,v3.16b
+ sha1p q24,s22,v23.4s
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* aes res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+ /* read first aes block, bump aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+/*
+ * Now do the 4 aes blocks (b-2) that catch up to where the sha calculation is.
+ */
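+/*
+ * CBC-decrypt relation applied in each xform below (informal sketch):
+ *   P[i] = InvAES(C[i]) ^ C[i-1], with C[-1] = IV.
+ * v30/v31 alternate as the carried C[i-1]. The digest is taken over the
+ * ciphertext, and those words were already absorbed by the sha quads
+ * above, so these four blocks are aes-only.
+ */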
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ /* read next aes block, no update */
+ ld1 {v1.16b},[x0]
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b /* res 0 */
+ eor v0.16b,v0.16b,v30.16b /* xor w/ ivec (modeop) */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ /* read next aes block, no update */
+ ld1 {v2.16b},[x0]
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b /* res 1 */
+ eor v1.16b,v1.16b,v31.16b /* xor w/ ivec (modeop) */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v31.16b},[x0],16
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ /* read next aes block, no update */
+ ld1 {v3.16b},[x0]
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b /* res 2 */
+ eor v2.16b,v2.16b,v30.16b /* xor w/ ivec (modeop) */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v30.16b},[x0],16
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b /* res 3 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+/*
+ * Now handle the final (b-1) sha1 padded block, which contains 0 to 3
+ * aes blocks. Some care is taken to avoid reading past the end of the
+ * input by only loading the blocks that are actually defined.
+ * This is also the final sha block code for the short cases
+ * (see the padding sketch below).
+ */
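+/*
+ * SHA-1 padding refresher (informal): the message is followed by a single
+ * 0x80 byte, then zeros, then the 64-bit big-endian bit length, so that
+ * the last block is a full 64 bytes. Here x13 (0-3) is the number of real
+ * 16-byte words left for this block; the .LpadN00 cases below load only
+ * those words into v26-v29, zero the rest and drop the 0x80 into the
+ * first free byte. Note v26-v28 are byte-swapped later (hence b[0]),
+ * while v29 is built directly in word order (hence b[3] and the length
+ * words).
+ */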
+.Ljoin_common:
+ mov w15,0x80 /* that's the 1 of the pad */
+ cbnz x13,.Lpad100 /* branch if there is some real data */
+ eor v26.16b,v26.16b,v26.16b /* zero the rest */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+ mov v26.b[0],w15 /* all data is bogus */
+ b .Lpad_done /* go do rest */
+
+.Lpad100:
+ sub x14,x13,1 /* dec amount left */
+ ld1 {v26.16b},[x4],16 /* next w0 */
+ cbnz x14,.Lpad200 /* branch if there is some real data */
+ eor v27.16b,v27.16b,v27.16b /* zero the rest */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+	mov	v27.b[0],w15		/* padding starts here */
+ b .Lpad_done /* go do rest */
+
+.Lpad200:
+ sub x14,x14,1 /* dec amount left */
+ ld1 {v27.16b},[x4],16 /* next w1 */
+ cbnz x14,.Lpad300 /* branch if there is some real data */
+ eor v28.16b,v28.16b,v28.16b /* zero the rest */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+	mov	v28.b[0],w15		/* padding starts here */
+ b .Lpad_done /* go do rest */
+
+.Lpad300:
+ ld1 {v28.16b},[x4],16 /* next w2 */
+ eor v29.16b,v29.16b,v29.16b /* zero the rest */
+	mov	v29.b[3],w15		/* padding starts here */
+
+.Lpad_done:
+ /* Add one SHA-1 block since hash is calculated including i_key_pad */
+ add x11, x11, #64
+ lsr x12,x11,32 /* len_hi */
+ and x14,x11,0xffffffff /* len_lo */
+ lsl x12,x12,3 /* len_hi in bits */
+ lsl x14,x14,3 /* len_lo in bits */
+
+ mov v29.s[3],w14 /* len_lo */
+ mov v29.s[2],w12 /* len_hi */
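+/*
+ * Worked example of the length field (informal): with len bytes of
+ * ciphertext hashed, the total message is 64 (the already-hashed
+ * i_key_pad block) + len bytes, so the trailing 64-bit bit count is
+ * (64 + len) * 8, split into high/low words in v29.s[2]/v29.s[3].
+ * E.g. len = 256 gives (64 + 256) * 8 = 2560 bits, with len_hi = 0.
+ */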
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+/*
+ * The final sha block: the strategy is to interleave the remaining 0-3
+ * aes blocks with the sha quads, which is faster but a little gourmand
+ * on code space.
+ */
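+/*
+ * Dispatch sketch (informal): x13 holds the 0-3 aes blocks that share
+ * this last sha block. Each remaining block gets a full aes-cbc decrypt
+ * interleaved with one sha quad, then control drops into .Lfrmquad1/2/3
+ * for the rest of the 80 rounds; x13 == 0 skips straight to
+ * .Lzero_aes_blocks_left.
+ */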
+ cbz x13,.Lzero_aes_blocks_left /* none to do */
+	/* read first aes block into v0, keep a copy in v31 (cbc chain), bump aes_ptr_in */
+	ld1	{v0.16b},[x0]
+	ld1	{v31.16b},[x0],16
+
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ add v19.4s,v4.4s,v26.4s
+ aesd v0.16b,v10.16b
+ add v23.4s,v4.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v12.16b
+ sha1c q24,s25,v19.4s
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesd v0.16b,v13.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v14.16b
+ sha1su1 v27.4s,v26.4s
+ add v19.4s,v4.4s,v28.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v15.16b
+ sha1c q24,s21,v19.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v4.4s,v29.4s
+ aesd v0.16b,v16.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1c q24,s22,v23.4s
+ aesd v0.16b,v17.16b
+ sha1su1 v29.4s,v28.4s
+ eor v3.16b,v0.16b,v18.16b /* res 0 */
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+ /* dec counter */
+ sub x13,x13,1
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ cbz x13,.Lfrmquad1
+
+/* aes xform 1 */
+	/* read next aes block into v0, keep a copy in v30 (cbc chain), bump aes_ptr_in */
+ ld1 {v0.16b},[x0]
+ ld1 {v30.16b},[x0],16
+ add v23.4s,v5.4s,v27.4s
+ aesd v0.16b,v8.16b
+ add v19.4s,v5.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s22,v23.4s
+ aesd v0.16b,v11.16b
+ sha1su1 v27.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesd v0.16b,v12.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s21,v19.4s
+ aesd v0.16b,v13.16b
+ sha1su1 v28.4s,v27.4s
+ add v23.4s,v5.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesd v0.16b,v14.16b
+ sha1h s21,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s22,v23.4s
+ aesd v0.16b,v15.16b
+ sha1su1 v29.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ add v19.4s,v5.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesd v0.16b,v16.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1p q24,s21,v19.4s
+ aesd v0.16b,v17.16b
+ sha1su1 v26.4s,v29.4s
+	eor	v3.16b,v0.16b,v18.16b	/* res 1 */
+ eor v3.16b,v3.16b,v31.16b /* xor w/ ivec (modeop) */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ sub x13,x13,1 /* dec counter */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ cbz x13,.Lfrmquad2
+
+/* aes xform 2 */
+	/* read next aes block, bump aes_ptr_in */
+ ld1 {v0.16b},[x0],16
+ add v19.4s,v6.4s,v28.4s
+ aesd v0.16b,v8.16b
+ add v23.4s,v6.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ sha1su0 v28.4s,v29.4s,v26.4s
+ aesimc v0.16b,v0.16b
+ sha1h s22,s24
+ aesd v0.16b,v10.16b
+ sha1m q24,s21,v19.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v28.4s,v27.4s
+ aesd v0.16b,v11.16b
+ sha1su0 v29.4s,v26.4s,v27.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v12.16b
+ sha1m q24,s22,v23.4s
+ aesimc v0.16b,v0.16b
+ sha1su1 v29.4s,v28.4s
+ aesd v0.16b,v13.16b
+ add v19.4s,v6.4s,v26.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ sha1h s22,s24
+ aesimc v0.16b,v0.16b
+ sha1m q24,s21,v19.4s
+ aesd v0.16b,v15.16b
+ sha1su1 v26.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ add v23.4s,v6.4s,v27.4s
+ aesd v0.16b,v16.16b
+ sha1su0 v27.4s,v28.4s,v29.4s
+ aesimc v0.16b,v0.16b
+ sha1h s21,s24
+ aesd v0.16b,v17.16b
+ sha1m q24,s22,v23.4s
+	eor	v3.16b,v0.16b,v18.16b	/* res 2 */
+ sha1su1 v27.4s,v26.4s
+ eor v3.16b,v3.16b,v30.16b /* xor w/ ivec (modeop) */
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+ b .Lfrmquad3
+/*
+ * The final sha block with no aes component, i.e. there are zero aes
+ * blocks left at this point.
+ */
+
+.Lzero_aes_blocks_left:
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+/* quad 1 */
+.Lfrmquad1:
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+/* quad 2 */
+.Lfrmquad2:
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+/* quad 3 */
+.Lfrmquad3:
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+	add	v26.4s,v24.4s,v20.4s	/* final ABCD of inner hash -> w0 of outer block */
+	add	v27.4s,v25.4s,v21.4s	/* final E of inner hash (lane 0) -> start of w1 */
+
+ /* Calculate final HMAC */
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+ /* load o_key_pad partial hash */
+ ld1 {v24.16b,v25.16b}, [x6]
+ /* working ABCD <- ABCD */
+ mov v20.16b,v24.16b
+
+	/* set the 0x80 pad byte just past the 20-byte inner hash */
+ mov w11, #0x80 /* that's the 1 of the pad */
+ mov v27.b[7], w11
+ /* size of o_key_pad + inner hash */
+ mov x11, #64+20
+ /* move length to the end of the block */
+ lsl x11, x11, 3
+ mov v29.s[3], w11
+ lsr x11, x11, 32
+ mov v29.s[2], w11 /* and the higher part */
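+/*
+ * Outer HMAC block layout (informal sketch): the 20-byte inner digest
+ * (v26 = ABCD, v27.s[0] = E) is followed by the 0x80 pad byte, zeros,
+ * and the bit length of o_key_pad + inner digest, (64 + 20) * 8 = 672,
+ * in the last two words of the block (v29.s[2] high, v29.s[3] low).
+ * One more sha1 compression over this block, started from the o_key_pad
+ * partial hash, yields the HMAC.
+ */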
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+	rev32	v24.16b, v24.16b	/* byte-swap ABCD back to byte order */
+	rev32	v25.16b, v25.16b	/* byte-swap E */
+
+	/* store the 20-byte hmac-sha1 digest at ddst */
+	st1	{v24.16b}, [x3],16	/* digest bytes 0-15 (ABCD) */
+	st1	{v25.s}[0], [x3]	/* digest bytes 16-19 (E) */
+
+	mov	x9,sp			/* sp points at the saved v8-v15 */
+	add	sp,sp,8*16		/* pop the save area */
+	ld1	{v8.16b - v11.16b},[x9],4*16	/* restore callee-saved v8-v11 */
+	ld1	{v12.16b - v15.16b},[x9]	/* restore callee-saved v12-v15 */
+
+ ret
+
+/*
+ * These are the short cases (less efficient), used here when there are
+ * fewer than 16 aes blocks (see the epilog note above).
+ * x10 = aes_blocks
+ */
+.Lshort_cases:
+ sub sp,sp,8*16
+ mov x9,sp /* copy for address mode */
+ st1 {v8.16b - v11.16b},[x9],4*16
+ st1 {v12.16b - v15.16b},[x9]
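+/*
+ * v8-v15 are callee-saved (lower 64 bits) under AAPCS64; they are spilled
+ * here because the aes round keys occupy v8-v18, of which v8-v15 must be
+ * preserved, and they are restored from this area just before the ret
+ * above.
+ */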
+
+ ld1 {v30.16b},[x5] /* get ivec */
+ ld1 {v8.16b-v11.16b},[x2],64 /* rk[0-3] */
+ ld1 {v12.16b-v15.16b},[x2],64 /* rk[4-7] */
+ ld1 {v16.16b-v18.16b},[x2] /* rk[8-10] */
+ adr x8,.Lrcon /* rcon */
+ lsl x11,x10,4 /* len = aes_blocks*16 */
+ mov x4,x0 /* sha_ptr_in = in */
+
+ mov x9,x8 /* top of rcon */
+
+ ld1 {v4.16b},[x9],16 /* key0 */
+ ld1 {v5.16b},[x9],16 /* key1 */
+ ld1 {v6.16b},[x9],16 /* key2 */
+ ld1 {v7.16b},[x9],16 /* key3 */
+
+/*
+ * This loop does 4 aes blocks at a time, so that at the end there is a
+ * final sha block and 0-3 aes blocks left over. Note that everything is
+ * done serially to avoid complication (see the sketch below).
+ */
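+/*
+ * Informal per-iteration sketch of the short loop (pseudocode, not
+ * authoritative):
+ *   while (aes_blocks >= 4) {
+ *       load C0..C3; w0..w3 = byteswap(C0..C3);
+ *       P[i] = InvAES(C[i]) ^ C[i-1];    serially, i = 0..3
+ *       one sha1 compression over w0..w3;
+ *       aes_blocks -= 4;
+ *   }
+ *   the final sha block and 0-3 leftover aes blocks join .Ljoin_common.
+ */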
+.Lshort_loop:
+ cmp x10,4 /* check if 4 or more */
+ /* if less, bail to last block */
+ blt .Llast_sha_block
+
+	ld1	{v31.16b},[x4]		/* keep ciphertext copy for cbc chain, no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v0.16b},[x4],16
+ rev32 v26.16b,v0.16b /* endian swap for sha */
+	add	x0,x0,64		/* keep aes_ptr_in (x0) in step; blocks are read via x4 here */
+
+/* aes xform 0 */
+ aesd v0.16b,v8.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v9.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v10.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v11.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v12.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v13.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v14.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v15.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v0.16b,v17.16b
+ eor v0.16b,v0.16b,v18.16b
+ eor v0.16b,v0.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v1.16b},[x4],16
+ rev32 v27.16b,v1.16b /* endian swap for sha */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v0.16b},[x1],16
+
+/* aes xform 1 */
+ aesd v1.16b,v8.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v9.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v10.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v11.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v12.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v13.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v14.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v15.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ eor v1.16b,v1.16b,v31.16b /* xor w/ prev value */
+
+ ld1 {v31.16b},[x4] /* read no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v2.16b},[x4],16
+ rev32 v28.16b,v2.16b /* endian swap for sha */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v1.16b},[x1],16
+
+/* aes xform 2 */
+ aesd v2.16b,v8.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v9.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v10.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v11.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v12.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v13.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v14.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v15.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v16.16b
+ aesimc v2.16b,v2.16b
+ aesd v2.16b,v17.16b
+ eor v2.16b,v2.16b,v18.16b
+ eor v2.16b,v2.16b,v30.16b /* xor w/ prev value */
+
+ ld1 {v30.16b},[x4] /* read no update */
+ /* read next aes block, update aes_ptr_in */
+ ld1 {v3.16b},[x4],16
+ rev32 v29.16b,v3.16b /* endian swap for sha */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v2.16b},[x1],16
+
+/* aes xform 3 */
+ aesd v3.16b,v8.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v9.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v10.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v11.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v12.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v13.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v14.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v15.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v16.16b
+ aesimc v3.16b,v3.16b
+ aesd v3.16b,v17.16b
+ eor v3.16b,v3.16b,v18.16b
+ eor v3.16b,v3.16b,v31.16b /* xor w/ prev value */
+/*
+ * now do the sha1 block for these 4 aes blocks (the ciphertext words
+ * captured in v26-v29 above)
+ */
+
+ mov v20.16b,v24.16b /* working ABCD <- ABCD */
+ /* save aes res, bump aes_out_ptr */
+ st1 {v3.16b},[x1],16
+/* quad 0 */
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s25,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v4.4s,v27.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v4.4s,v28.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v4.4s,v29.4s
+ sha1h s21,s24
+ sha1c q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v4.4s,v26.4s
+ sha1h s22,s24
+ sha1c q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v5.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v5.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v5.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v5.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v6.4s,v29.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v19.4s,v6.4s,v26.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v23.4s,v6.4s,v27.4s
+ sha1h s21,s24
+ sha1m q24,s22,v23.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v19.4s,v6.4s,v28.4s
+ sha1h s22,s24
+ sha1m q24,s21,v19.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+/* quad 3 */
+ add v19.4s,v7.4s,v26.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v27.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v19.4s,v7.4s,v28.4s
+ sha1h s22,s24
+ sha1p q24,s21,v19.4s
+
+ add v23.4s,v7.4s,v29.4s
+ sha1h s21,s24
+ sha1p q24,s22,v23.4s
+
+ add v25.4s,v25.4s,v21.4s
+ add v24.4s,v24.4s,v20.4s
+
+ sub x10,x10,4 /* 4 less */
+ b .Lshort_loop /* keep looping */
+/*
+ * This is arranged so that we can join the common unwind code that does
+ * the last sha block and the final 0-3 aes blocks.
+ */
+.Llast_sha_block:
+ mov x13,x10 /* copy aes blocks for common */
+ b .Ljoin_common /* join common code */
+
+ .size sha1_hmac_aes128cbc_dec, .-sha1_hmac_aes128cbc_dec