new file mode 100644
@@ -0,0 +1,151 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ .file "aes_core.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .align 4
+ .global aes128_key_sched_enc
+ .type aes128_key_sched_enc, %function
+ .global aes128_key_sched_dec
+ .type aes128_key_sched_dec, %function
+
+ /*
+	 * AES key expansion for a single round.
+ */
+ .macro key_expand res, key, shuffle_mask, rcon, tq0, tq1, td
+ /* temp = rotword(key[3]) */
+ tbl \td\().8b,{\key\().16b},\shuffle_mask\().8b
+ dup \tq0\().2d,\td\().d[0]
+	/*
+	 * temp = subbytes(temp):
+	 * aese with an all-zero key applies only SubBytes and ShiftRows,
+	 * and since the word is duplicated across all lanes ShiftRows has
+	 * no effect on the values, so the net result is subword().
+	 */
+	aese	\tq0\().16b,v19\().16b /* q19 := 0 */
+ /* temp = temp + rcon */
+ mov w11,\rcon
+ dup \tq1\().4s,w11
+ eor \tq0\().16b,\tq0\().16b,\tq1\().16b
+ /* tq1 = [0, a, b, c] */
+ ext \tq1\().16b,v19\().16b,\key\().16b,12 /* q19 := 0 */
+ eor \res\().16b,\key\().16b,\tq1\().16b
+ /* tq1 = [0, 0, a, b] */
+ ext \tq1\().16b,v19\().16b,\tq1\().16b,12 /* q19 := 0 */
+ eor \res\().16b,\res\().16b,\tq1\().16b
+ /* tq1 = [0, 0, 0, a] */
+ ext \tq1\().16b,v19\().16b,\tq1\().16b,12 /* q19 := 0 */
+ eor \res\().16b,\res\().16b,\tq1\().16b
+ /* + temp */
+ eor \res\().16b,\res\().16b,\tq0\().16b
+ .endm
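+
+/*
+ * For reference, one round of the FIPS-197 AES-128 key schedule, which is
+ * what the macro above computes with NEON (a rough C sketch only, not part
+ * of the build; sub_word() and rot_word() are the usual helpers on 32-bit
+ * words):
+ *
+ *	uint32_t t = sub_word(rot_word(key[3])) ^ rcon;
+ *
+ *	res[0] = key[0] ^ t;
+ *	res[1] = key[1] ^ res[0];
+ *	res[2] = key[2] ^ res[1];
+ *	res[3] = key[3] ^ res[2];
+ */
+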
+/*
+ * aes128_key_sched_enc(*expanded_key, *user_key)
+ *	expanded_key: x0, receives the 11 16B round keys (176B)
+ *	user_key:     x1, the 16B AES-128 cipher key
+ */
+ .align 4
+aes128_key_sched_enc:
+ sub sp,sp,4*16
+ st1 {v8.16b - v11.16b},[sp]
+ ld1 {v0.16b},[x1] /* user_key */
+ mov w10,0x0e0d /* form shuffle_word */
+ mov w11,0x0c0f
+ orr w10,w10,w11,lsl 16
+ dup v20.4s,w10 /* shuffle_mask */
+ eor v19.16b,v19.16b,v19.16b /* zero */
+ /* Expand key */
+ key_expand v1,v0,v20,0x1,v21,v16,v17
+ key_expand v2,v1,v20,0x2,v21,v16,v17
+ key_expand v3,v2,v20,0x4,v21,v16,v17
+ key_expand v4,v3,v20,0x8,v21,v16,v17
+ key_expand v5,v4,v20,0x10,v21,v16,v17
+ key_expand v6,v5,v20,0x20,v21,v16,v17
+ key_expand v7,v6,v20,0x40,v21,v16,v17
+ key_expand v8,v7,v20,0x80,v21,v16,v17
+ key_expand v9,v8,v20,0x1b,v21,v16,v17
+ key_expand v10,v9,v20,0x36,v21,v16,v17
+ /* Store round keys in the correct order */
+ st1 {v0.16b - v3.16b},[x0],64
+ st1 {v4.16b - v7.16b},[x0],64
+ st1 {v8.16b - v10.16b},[x0],48
+
+ ld1 {v8.16b - v11.16b},[sp]
+ add sp,sp,4*16
+ ret
+
+ .size aes128_key_sched_enc, .-aes128_key_sched_enc
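+
+/*
+ * Illustrative call from C (a sketch only; AES-128 expands to 11 round
+ * keys of 16B each, so the output buffer must hold 176B):
+ *
+ *	uint8_t rk[11 * 16];
+ *	uint8_t key[16];
+ *
+ *	aes128_key_sched_enc(rk, key);
+ */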
+
+/*
+ * aes128_key_sched_dec(*expanded_key, *user_key)
+ *	expanded_key: x0, receives the 11 16B decryption round keys (176B)
+ *	user_key:     x1, the 16B AES-128 cipher key
+ */
+ .align 4
+aes128_key_sched_dec:
+ sub sp,sp,4*16
+ st1 {v8.16b-v11.16b},[sp]
+ ld1 {v0.16b},[x1] /* user_key */
+ mov w10,0x0e0d /* form shuffle_word */
+ mov w11,0x0c0f
+ orr w10,w10,w11,lsl 16
+ dup v20.4s,w10 /* shuffle_mask */
+ eor v19.16b,v19.16b,v19.16b /* zero */
+	/*
+	 * Expand the key.
+	 * The register order is intentionally reversed so that the round
+	 * keys can be written out with the multi-register stores below
+	 * (stores must use ascending register order).
+	 */
+ key_expand v10,v0,v20,0x1,v21,v16,v17
+ key_expand v9,v10,v20,0x2,v21,v16,v17
+ key_expand v8,v9,v20,0x4,v21,v16,v17
+ key_expand v7,v8,v20,0x8,v21,v16,v17
+ key_expand v6,v7,v20,0x10,v21,v16,v17
+ key_expand v5,v6,v20,0x20,v21,v16,v17
+ key_expand v4,v5,v20,0x40,v21,v16,v17
+ key_expand v3,v4,v20,0x80,v21,v16,v17
+ key_expand v2,v3,v20,0x1b,v21,v16,v17
+ key_expand v1,v2,v20,0x36,v21,v16,v17
+ /* Inverse mixcolumns for keys 1-9 (registers v10-v2) */
+ aesimc v10.16b, v10.16b
+ aesimc v9.16b, v9.16b
+ aesimc v8.16b, v8.16b
+ aesimc v7.16b, v7.16b
+ aesimc v6.16b, v6.16b
+ aesimc v5.16b, v5.16b
+ aesimc v4.16b, v4.16b
+ aesimc v3.16b, v3.16b
+ aesimc v2.16b, v2.16b
+ /* Store round keys in the correct order */
+ st1 {v1.16b - v4.16b},[x0],64
+ st1 {v5.16b - v8.16b},[x0],64
+ st1 {v9.16b, v10.16b},[x0],32
+ st1 {v0.16b},[x0],16
+
+ ld1 {v8.16b - v11.16b},[sp]
+ add sp,sp,4*16
+ ret
+
+ .size aes128_key_sched_dec, .-aes128_key_sched_dec
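+
+/*
+ * Note: round keys 1-9 are passed through aesimc and the key order is
+ * reversed, so the stored schedule is the "equivalent inverse cipher"
+ * form used by aesd/aesimc decryption rounds. Illustrative call from C
+ * (a sketch only; the output is 176B as for the encryption schedule):
+ *
+ *	uint8_t rk_dec[11 * 16];
+ *	uint8_t key[16];
+ *
+ *	aes128_key_sched_dec(rk_dec, key);
+ */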
new file mode 100644
@@ -0,0 +1,518 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-1 Primitives
+ *
+ * Operations:
+ * sha1_block_partial:
+ * out = partial_sha1(init, in, len) <- no final block
+ *
+ * sha1_block:
+ * out = sha1(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha1_block_partial(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha1_block(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha1_block_partial(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * sha1_block(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v22 -- sha working state ABCD (q22)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16 (+20 for the HMAC);
+ * for sha1_block_partial it must be a multiple of the 64B SHA block.
+ * Otherwise an error code is returned.
+ *
+ */
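+
+/*
+ * Illustrative one-shot call from C (a sketch; "buf" and "buf_len" are
+ * assumed caller data, buf_len a multiple of 16B; ddst receives the 20B
+ * digest, while sha1_block_partial writes the full 32B running state):
+ *
+ *	uint8_t digest[20];
+ *
+ *	if (sha1_block(NULL, buf, digest, buf_len) != 0)
+ *		return -1;
+ */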
+ .file "sha1_core.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .align 4
+ .global sha1_block_partial
+ .type sha1_block_partial,%function
+ .global sha1_block
+ .type sha1_block,%function
+
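+/* SHA-1 round constants K, one per group of 20 rounds, replicated per lane */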
+ .align 4
+.Lrcon:
+ .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+ .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+ .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+ .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
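+/* SHA-1 initial hash values H0..H4 (FIPS 180), zero-padded to 32B */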
+ .align 4
+.Linit_sha_state:
+ .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+ .word 0xc3d2e1f0, 0x00000000, 0x00000000, 0x00000000
+
+ .align 4
+
+sha1_block_partial:
+ mov x6, #1 /* indicate partial hash */
+	ands	x5, x3, #0x3f		/* check size is a multiple of one 64B SHA block */
+ b.ne .Lsha1_error
+ cbnz x0, 1f
+ /* address of sha init state consts */
+ adr x0,.Linit_sha_state
+1:
+ ld1 {v24.4s},[x0],16 /* init ABCD */
+ ld1 {v25.4s},[x0] /* and E */
+
+ /* Load SHA-1 constants */
+ adr x4,.Lrcon
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ lsr x5, x3, 2 /* number of 4B blocks */
+ b .Lsha1_loop
+
+sha1_block:
+ mov x6, xzr /* indicate full hash */
+ and x5, x3, #0xf /* check size mod 16B block */
+ cmp x5, #4 /* additional word is accepted */
+ b.eq 1f
+ cbnz x5, .Lsha1_error
+1:
+ cbnz x0, 2f
+ /* address of sha init state consts */
+ adr x0,.Linit_sha_state
+2:
+ ld1 {v24.4s},[x0],16 /* init ABCD */
+ ld1 {v25.4s},[x0] /* and E */
+
+ /* Load SHA-1 constants */
+ adr x4,.Lrcon
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ lsr x5, x3, 2 /* number of 4B blocks */
+ /* at least 16 4B blocks give 1 SHA block */
+ cmp x5, #16
+ b.lo .Lsha1_last
+
+ .align 4
+
+.Lsha1_loop:
+	sub	x5, x5, #16		/* subtract 1 SHA block */
+
+ ld1 {v26.16b},[x1],16 /* dsrc[0] */
+ ld1 {v27.16b},[x1],16 /* dsrc[1] */
+ ld1 {v28.16b},[x1],16 /* dsrc[2] */
+ ld1 {v29.16b},[x1],16 /* dsrc[3] */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+/* quad 0 */
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s25,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v4.4s,v27.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v4.4s,v28.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v4.4s,v29.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v5.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v5.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v5.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v6.4s,v29.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v6.4s,v26.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v6.4s,v27.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v7.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v16.4s,v7.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v24.4s,v24.4s,v22.4s
+ add v25.4s,v25.4s,v18.4s
+
+ cmp x5, #16
+ b.hs .Lsha1_loop
+
+ /* Store partial hash and return or complete hash */
+ cbz x6, .Lsha1_last
+
+ st1 {v24.16b},[x2],16
+ st1 {v25.16b},[x2]
+
+ mov x0, xzr
+ ret
+
+ /*
+	 * Last block with padding: append the 0x80 pad byte and the 64-bit
+	 * message bit length, then run one final SHA block.
+	 * v24 and v25[0] contain the current hash state.
+ */
+.Lsha1_last:
+
+ eor v26.16b, v26.16b, v26.16b
+ eor v27.16b, v27.16b, v27.16b
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x4,.Lrcon
+ /* Number of bits in message */
+ lsl x3, x3, 3
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ /* move length to the end of the block */
+ mov v29.s[3], w3
+ lsr x3, x3, 32
+ /* and the higher part */
+ mov v29.s[2], w3
+
+ /* The remaining part is up to 3 16B blocks and up to 1 4B block */
+ mov w6, #0x80 /* that's the 1 of the pad */
+ mov v26.b[3], w6
+ cbz x5,.Lsha1_final
+ /* Are there 3 16B blocks? */
+ cmp x5, #12
+ b.lo 1f
+ ld1 {v26.16b},[x1],16
+ ld1 {v27.16b},[x1],16
+ ld1 {v28.16b},[x1],16
+ rev32 v26.16b, v26.16b
+ rev32 v27.16b, v27.16b
+ rev32 v28.16b, v28.16b
+ sub x5,x5,#12
+ mov v29.b[7], w6
+ cbz x5,.Lsha1_final
+ mov v29.b[7], wzr
+ ld1 {v29.s}[0],[x1],4
+ rev32 v29.16b,v29.16b
+ mov v29.b[7], w6
+ b .Lsha1_final
+1:
+ /* Are there 2 16B blocks? */
+ cmp x5, #8
+ b.lo 2f
+ ld1 {v26.16b},[x1],16
+ ld1 {v27.16b},[x1],16
+ rev32 v26.16b,v26.16b
+ rev32 v27.16b,v27.16b
+ sub x5,x5,#8
+ mov v28.b[7], w6
+ cbz x5,.Lsha1_final
+ mov v28.b[7], wzr
+ ld1 {v28.s}[0],[x1],4
+ rev32 v28.16b,v28.16b
+ mov v28.b[7], w6
+ b .Lsha1_final
+2:
+ /* Is there 1 16B block? */
+ cmp x5, #4
+ b.lo 3f
+ ld1 {v26.16b},[x1],16
+ rev32 v26.16b,v26.16b
+ sub x5,x5,#4
+ mov v27.b[7], w6
+ cbz x5,.Lsha1_final
+ mov v27.b[7], wzr
+ ld1 {v27.s}[0],[x1],4
+ rev32 v27.16b,v27.16b
+ mov v27.b[7], w6
+ b .Lsha1_final
+3:
+ ld1 {v26.s}[0],[x1],4
+ rev32 v26.16b,v26.16b
+ mov v26.b[7], w6
+
+.Lsha1_final:
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+/* quad 0 */
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s25,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v4.4s,v27.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v4.4s,v28.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v4.4s,v29.4s
+ sha1h s18,s24
+ sha1c q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v4.4s,v26.4s
+ sha1h s19,s24
+ sha1c q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+/* quad 1 */
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v5.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v5.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v5.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v5.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+/* quad 2 */
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+
+ add v17.4s,v6.4s,v29.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v6.4s,v26.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v26.4s,v27.4s,v28.4s
+ sha1su1 v26.4s,v29.4s
+
+ add v17.4s,v6.4s,v27.4s
+ sha1h s18,s24
+ sha1m q24,s19,v17.4s
+ sha1su0 v27.4s,v28.4s,v29.4s
+ sha1su1 v27.4s,v26.4s
+
+ add v16.4s,v6.4s,v28.4s
+ sha1h s19,s24
+ sha1m q24,s18,v16.4s
+ sha1su0 v28.4s,v29.4s,v26.4s
+ sha1su1 v28.4s,v27.4s
+/* quad 3 */
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+ sha1su0 v29.4s,v26.4s,v27.4s
+ sha1su1 v29.4s,v28.4s
+
+ add v16.4s,v7.4s,v26.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v27.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v16.4s,v7.4s,v28.4s
+ sha1h s19,s24
+ sha1p q24,s18,v16.4s
+
+ add v17.4s,v7.4s,v29.4s
+ sha1h s18,s24
+ sha1p q24,s19,v17.4s
+
+ add v25.4s,v25.4s,v18.4s
+ add v24.4s,v24.4s,v22.4s
+
+ rev32 v24.16b,v24.16b
+ rev32 v25.16b,v25.16b
+
+ st1 {v24.16b}, [x2],16
+ st1 {v25.s}[0], [x2]
+
+ mov x0, xzr
+ ret
+
+.Lsha1_error:
+ mov x0, #-1
+ ret
+
+ .size sha1_block_partial, .-sha1_block_partial
+ .size sha1_block, .-sha1_block
new file mode 100644
@@ -0,0 +1,525 @@
+/*
+ * BSD LICENSE
+ *
+ * Copyright (C) Cavium networks Ltd. 2016.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Cavium networks nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "assym.s"
+
+/*
+ * Description:
+ *
+ * Core SHA-2 Primitives
+ *
+ * Operations:
+ * sha256_block_partial:
+ * out = partial_sha256(init, in, len) <- no final block
+ *
+ * sha256_block:
+ * out = sha256(init, in, len)
+ *
+ * Prototype:
+ *
+ * int sha256_block_partial(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * int sha256_block(uint8_t *init,
+ * uint8_t *dsrc, uint8_t *ddst, uint64_t len)
+ *
+ * returns: 0 (success), -1 (failure)
+ *
+ * Registers used:
+ *
+ * sha256_block_partial(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * sha256_block(
+ * init, x0 (hash init state - NULL for default)
+ * dsrc, x1 (digest src address)
+ * ddst, x2 (digest dst address)
+ * len, x3 (length)
+ * )
+ *
+ * Routine register definitions:
+ *
+ * v4 - v7 -- round consts for sha
+ * v21 -- ABCD tmp
+ * v22 -- sha working state ABCD (q22)
+ * v23 -- sha working state EFGH (q23)
+ * v24 -- reg_sha_stateABCD
+ * v25 -- reg_sha_stateEFGH
+ * v26 -- sha block 0
+ * v27 -- sha block 1
+ * v28 -- sha block 2
+ * v29 -- sha block 3
+ * v30 -- reserved
+ * v31 -- reserved
+ *
+ * Constraints:
+ *
+ * The variable "len" must be a multiple of 16;
+ * for sha256_block_partial it must be a multiple of the 64B SHA block.
+ * Otherwise an error code is returned.
+ *
+ */
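+
+/*
+ * Illustrative one-shot call from C (a sketch; "buf" and "buf_len" are
+ * assumed caller data, buf_len a multiple of 16B; ddst receives the 32B
+ * digest or, for sha256_block_partial, the 32B running state):
+ *
+ *	uint8_t digest[32];
+ *
+ *	if (sha256_block(NULL, buf, digest, buf_len) != 0)
+ *		return -1;
+ */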
+ .file "sha256_core.S"
+ .text
+ .cpu generic+fp+simd+crypto+crc
+ .align 4
+ .global sha256_block_partial
+ .type sha256_block_partial,%function
+ .global sha256_block
+ .type sha256_block,%function
+
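+/* SHA-256 round constants K[0..63] (FIPS 180) */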
+ .align 4
+.Lrcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
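+/* SHA-256 initial hash values H0..H7 (FIPS 180) */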
+ .align 4
+.Linit_sha_state:
+ .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a
+ .word 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+ .align 4
+
+sha256_block_partial:
+ mov x6, #1 /* indicate partial hash */
+	ands	x5, x3, #0x3f		/* check size is a multiple of one 64B SHA block */
+ b.ne .Lsha256_error
+ cbnz x0, 1f
+ /* address of sha init state consts */
+ adr x0,.Linit_sha_state
+1:
+ ld1 {v24.4s, v25.4s},[x0] /* init ABCD, EFGH */
+ /* number of 16B blocks (will be at least 4) */
+ lsr x5, x3, 4
+ b .Lsha256_loop
+
+sha256_block:
+ mov x6, xzr /* indicate full hash */
+ ands x5, x3, #0xf /* check size mod 16B block */
+ b.ne .Lsha256_error
+ cbnz x0, 1f
+ /* address of sha init state consts */
+ adr x0,.Linit_sha_state
+1:
+ ld1 {v24.4s, v25.4s},[x0] /* init ABCD, EFGH. (2 cycs) */
+ lsr x5, x3, 4 /* number of 16B blocks */
+ cmp x5, #4 /* at least 4 16B blocks give 1 SHA block */
+ b.lo .Lsha256_last
+
+ .align 4
+.Lsha256_loop:
+	sub	x5, x5, #4		/* subtract 1 SHA block */
+ adr x4,.Lrcon
+
+ ld1 {v26.16b},[x1],16 /* dsrc[0] */
+ ld1 {v27.16b},[x1],16 /* dsrc[1] */
+ ld1 {v28.16b},[x1],16 /* dsrc[2] */
+ ld1 {v29.16b},[x1],16 /* dsrc[3] */
+
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+ rev32 v29.16b,v29.16b /* fix endian w3 */
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key4 */
+ ld1 {v5.16b},[x4],16 /* key5 */
+ ld1 {v6.16b},[x4],16 /* key6 */
+ ld1 {v7.16b},[x4],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key8 */
+ ld1 {v5.16b},[x4],16 /* key9 */
+ ld1 {v6.16b},[x4],16 /* key10 */
+ ld1 {v7.16b},[x4],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key12 */
+ ld1 {v5.16b},[x4],16 /* key13 */
+ ld1 {v6.16b},[x4],16 /* key14 */
+ ld1 {v7.16b},[x4],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ cmp x5, #4
+ b.hs .Lsha256_loop
+
+ /* Store partial hash and return or complete hash */
+ cbz x6, .Lsha256_last
+
+ st1 {v24.16b, v25.16b}, [x2]
+
+ mov x0, xzr
+ ret
+
+ /*
+	 * Last block with padding: append the 0x80 pad byte and the 64-bit
+	 * message bit length, then run one final SHA block.
+	 * v24-v25 contain the current hash state.
+ */
+.Lsha256_last:
+ eor v26.16b, v26.16b, v26.16b
+ eor v27.16b, v27.16b, v27.16b
+ eor v28.16b, v28.16b, v28.16b
+ eor v29.16b, v29.16b, v29.16b
+
+ adr x4,.Lrcon
+ lsl x3, x3, 3
+
+ mov v22.16b,v24.16b /* working ABCD <- ABCD */
+ mov v23.16b,v25.16b /* working EFGH <- EFGH */
+
+	/* Place the 64-bit length at the end of the block and the 0x80 pad byte after the data */
+
+ /* move length to the end of the block */
+ mov v29.s[3], w3
+ lsr x3, x3, 32
+ mov v29.s[2], w3 /* and the higher part */
+ /* set padding 1 to the first reg */
+ mov w6, #0x80 /* that's the 1 of the pad */
+ mov v26.b[3], w6
+ cbz x5,.Lsha256_final
+
+ sub x5, x5, #1
+ mov v27.16b, v26.16b
+ ld1 {v26.16b},[x1],16
+ rev32 v26.16b,v26.16b /* fix endian w0 */
+ cbz x5,.Lsha256_final
+
+ sub x5, x5, #1
+ mov v28.16b, v27.16b
+ ld1 {v27.16b},[x1],16
+ rev32 v27.16b,v27.16b /* fix endian w1 */
+ cbz x5,.Lsha256_final
+
+ mov v29.b[0], w6
+ ld1 {v28.16b},[x1],16
+ rev32 v28.16b,v28.16b /* fix endian w2 */
+
+.Lsha256_final:
+
+ ld1 {v4.16b},[x4],16 /* key0 */
+ ld1 {v5.16b},[x4],16 /* key1 */
+ ld1 {v6.16b},[x4],16 /* key2 */
+ ld1 {v7.16b},[x4],16 /* key3 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key0+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key1+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key2+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key3+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key4 */
+ ld1 {v5.16b},[x4],16 /* key5 */
+ ld1 {v6.16b},[x4],16 /* key6 */
+ ld1 {v7.16b},[x4],16 /* key7 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key4+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key5+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key6+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key7+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key8 */
+ ld1 {v5.16b},[x4],16 /* key9 */
+ ld1 {v6.16b},[x4],16 /* key10 */
+ ld1 {v7.16b},[x4],16 /* key11 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key8+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+ sha256su0 v26.4s,v27.4s
+ sha256su1 v26.4s,v28.4s,v29.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key9+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+ sha256su0 v27.4s,v28.4s
+ sha256su1 v27.4s,v29.4s,v26.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key10+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+ sha256su0 v28.4s,v29.4s
+ sha256su1 v28.4s,v26.4s,v27.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key11+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+ sha256su0 v29.4s,v26.4s
+ sha256su1 v29.4s,v27.4s,v28.4s
+
+ ld1 {v4.16b},[x4],16 /* key12 */
+ ld1 {v5.16b},[x4],16 /* key13 */
+ ld1 {v6.16b},[x4],16 /* key14 */
+ ld1 {v7.16b},[x4],16 /* key15 */
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v4.4s,v4.4s,v26.4s /* wk = key12+w0 */
+ sha256h q22, q23, v4.4s
+ sha256h2 q23, q21, v4.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v5.4s,v5.4s,v27.4s /* wk = key13+w1 */
+ sha256h q22, q23, v5.4s
+ sha256h2 q23, q21, v5.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v6.4s,v6.4s,v28.4s /* wk = key14+w2 */
+ sha256h q22, q23, v6.4s
+ sha256h2 q23, q21, v6.4s
+
+ mov v21.16b, v22.16b /* copy abcd */
+
+ add v7.4s,v7.4s,v29.4s /* wk = key15+w3 */
+ sha256h q22, q23, v7.4s
+ sha256h2 q23, q21, v7.4s
+
+ add v24.4s,v24.4s,v22.4s /* ABCD += working copy */
+ add v25.4s,v25.4s,v23.4s /* EFGH += working copy */
+
+ rev32 v24.16b, v24.16b
+ rev32 v25.16b, v25.16b
+ st1 {v24.4s,v25.4s},[x2] /* save them both */
+
+ mov x0, xzr
+ ret
+
+.Lsha256_error:
+ mov x0, #-1
+ ret
+
+ .size sha256_block_partial, .-sha256_block_partial