[gnutls] Added fix for #973210
Nikos Mavrogiannopoulos
nmav at fedoraproject.org
Wed Dec 4 17:26:39 UTC 2013
commit 302b2107bf1fa1afead856823da70ac65ff8d362
Author: Nikos Mavrogiannopoulos <nmav at redhat.com>
Date: Wed Dec 4 18:19:03 2013 +0100
Added fix for #973210
gnutls-3.2.7-asm.patch |11608 ++++++++++++++++++++++++++++++++++++++++++++++++
gnutls.spec | 7 +-
2 files changed, 11613 insertions(+), 2 deletions(-)
---
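
The patch below refreshes the auto-generated AES-NI/CLMUL assembly under lib/accelerated/x86 (ELF, COFF and Mach-O variants); per its own description, the rebuild addresses a valgrind complaint seen when AES-NI is in use (#973210). For context, here is a minimal sketch of the kind of operation that exercises those code paths and can be run under valgrind; the key, IV and buffer sizes are illustrative only, not taken from the bug report.

    /* Hypothetical reproducer sketch: one AES-128-GCM encryption through the
     * GnuTLS cipher API, which takes the accelerated AES-NI/CLMUL path on
     * capable CPUs; build against gnutls and run the binary under valgrind. */
    #include <stdio.h>
    #include <gnutls/gnutls.h>
    #include <gnutls/crypto.h>

    int main(void)
    {
        unsigned char key_data[16] = {0}, iv_data[12] = {0};
        unsigned char plaintext[64] = {0}, ciphertext[64];
        gnutls_datum_t key = { key_data, sizeof(key_data) };
        gnutls_datum_t iv  = { iv_data, sizeof(iv_data) };
        gnutls_cipher_hd_t handle;
        int ret;

        gnutls_global_init();

        ret = gnutls_cipher_init(&handle, GNUTLS_CIPHER_AES_128_GCM, &key, &iv);
        if (ret < 0) {
            fprintf(stderr, "cipher_init: %s\n", gnutls_strerror(ret));
            return 1;
        }

        ret = gnutls_cipher_encrypt2(handle, plaintext, sizeof(plaintext),
                                     ciphertext, sizeof(ciphertext));
        if (ret < 0) {
            fprintf(stderr, "encrypt: %s\n", gnutls_strerror(ret));
            return 1;
        }

        gnutls_cipher_deinit(handle);
        gnutls_global_deinit();
        return 0;
    }

Running such a program under valgrind on an AES-NI capable machine, before and after applying the patch, is one way to confirm the complaint is gone.
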
diff --git a/gnutls-3.2.7-asm.patch b/gnutls-3.2.7-asm.patch
new file mode 100644
index 0000000..511773e
--- /dev/null
+++ b/gnutls-3.2.7-asm.patch
@@ -0,0 +1,11608 @@
+From 8a7565113ab937cc99f8f4c929bde2ee08fc498c Mon Sep 17 00:00:00 2001
+From: Nikos Mavrogiannopoulos <nmav at gnutls.org>
+Date: Tue, 26 Nov 2013 23:19:45 +0100
+Subject: [PATCH 1/2] updated auto-generated asm files. This fixes a valgrind
+ complaint when AES-NI is in use.
+
+---
+ .../x86/coff/appro-aes-gcm-x86-64-coff.s | 574 ++++--
+ lib/accelerated/x86/coff/appro-aes-x86-64-coff.s | 1826 ++++++++++++--------
+ lib/accelerated/x86/coff/padlock-x86-64-coff.s | 495 ++++++
+ lib/accelerated/x86/coff/padlock-x86-coff.s | 352 +++-
+ lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s | 515 ++++--
+ lib/accelerated/x86/elf/appro-aes-x86-64.s | 1609 ++++++++++-------
+ lib/accelerated/x86/elf/padlock-x86-64.s | 462 +++++
+ lib/accelerated/x86/elf/padlock-x86.s | 575 +++++-
+ .../x86/macosx/appro-aes-gcm-x86-64-macosx.s | 515 ++++--
+ .../x86/macosx/appro-aes-x86-64-macosx.s | 1609 ++++++++++-------
+ lib/accelerated/x86/macosx/padlock-x86-64-macosx.s | 462 +++++
+ lib/accelerated/x86/macosx/padlock-x86-macosx.s | 349 +++-
+ 12 files changed, 6978 insertions(+), 2365 deletions(-)
+
+diff --git a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
+index fa449d6..ceb9108 100644
+--- a/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
++++ b/lib/accelerated/x86/coff/appro-aes-gcm-x86-64-coff.s
+@@ -717,6 +717,11 @@ gcm_ghash_4bit:
+ .def gcm_init_clmul; .scl 2; .type 32; .endef
+ .p2align 4
+ gcm_init_clmul:
++.L_init_clmul:
++.LSEH_begin_gcm_init_clmul:
++
++.byte 0x48,0x83,0xec,0x18
++.byte 0x0f,0x29,0x34,0x24
+ movdqu (%rdx),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+@@ -735,15 +740,15 @@ gcm_init_clmul:
+ pxor %xmm5,%xmm2
+
+
++ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
++ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+-.byte 102,15,58,68,220,0
++.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+@@ -753,44 +758,137 @@ gcm_init_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++ pxor %xmm4,%xmm0
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ pshufd $78,%xmm2,%xmm3
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm2,%xmm3
++ movdqu %xmm2,0(%rcx)
++ pxor %xmm0,%xmm4
++ movdqu %xmm0,16(%rcx)
++.byte 102,15,58,15,227,8
++ movdqu %xmm4,32(%rcx)
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,15,58,68,194,0
++.byte 102,15,58,68,202,17
++.byte 102,15,58,68,222,0
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++
++ movdqa %xmm3,%xmm4
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
++ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
++
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ movdqa %xmm0,%xmm5
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,15,58,68,194,0
++.byte 102,15,58,68,202,17
++.byte 102,15,58,68,222,0
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++
++ movdqa %xmm3,%xmm4
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
++ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
++
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- movdqu %xmm2,(%rcx)
+- movdqu %xmm0,16(%rcx)
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ pshufd $78,%xmm5,%xmm3
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm5,%xmm3
++ movdqu %xmm5,48(%rcx)
++ pxor %xmm0,%xmm4
++ movdqu %xmm0,64(%rcx)
++.byte 102,15,58,15,227,8
++ movdqu %xmm4,80(%rcx)
++ movaps (%rsp),%xmm6
++ leaq 24(%rsp),%rsp
++.LSEH_end_gcm_init_clmul:
+ .byte 0xf3,0xc3
+
+ .globl gcm_gmult_clmul
+ .def gcm_gmult_clmul; .scl 2; .type 32; .endef
+ .p2align 4
+ gcm_gmult_clmul:
++.L_gmult_clmul:
+ movdqu (%rcx),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rdx),%xmm2
++ movdqu 32(%rdx),%xmm4
+ .byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+ .byte 102,15,58,68,220,0
+@@ -803,194 +901,372 @@ gcm_gmult_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ .byte 102,15,56,0,197
+ movdqu %xmm0,(%rcx)
+ .byte 0xf3,0xc3
+
+ .globl gcm_ghash_clmul
+ .def gcm_ghash_clmul; .scl 2; .type 32; .endef
+-.p2align 4
++.p2align 5
+ gcm_ghash_clmul:
++.L_ghash_clmul:
++ leaq -136(%rsp),%rax
+ .LSEH_begin_gcm_ghash_clmul:
+
+-.byte 0x48,0x83,0xec,0x58
+-.byte 0x0f,0x29,0x34,0x24
+-.byte 0x0f,0x29,0x7c,0x24,0x10
+-.byte 0x44,0x0f,0x29,0x44,0x24,0x20
+-.byte 0x44,0x0f,0x29,0x4c,0x24,0x30
+-.byte 0x44,0x0f,0x29,0x54,0x24,0x40
++.byte 0x48,0x8d,0x60,0xe0
++.byte 0x0f,0x29,0x70,0xe0
++.byte 0x0f,0x29,0x78,0xf0
++.byte 0x44,0x0f,0x29,0x00
++.byte 0x44,0x0f,0x29,0x48,0x10
++.byte 0x44,0x0f,0x29,0x50,0x20
++.byte 0x44,0x0f,0x29,0x58,0x30
++.byte 0x44,0x0f,0x29,0x60,0x40
++.byte 0x44,0x0f,0x29,0x68,0x50
++.byte 0x44,0x0f,0x29,0x70,0x60
++.byte 0x44,0x0f,0x29,0x78,0x70
+ movdqa .Lbswap_mask(%rip),%xmm5
++ movq $11547335547999543296,%rax
+
+ movdqu (%rcx),%xmm0
+ movdqu (%rdx),%xmm2
++ movdqu 32(%rdx),%xmm10
+ .byte 102,15,56,0,197
+
+ subq $16,%r9
+ jz .Lodd_tail
+
+- movdqu 16(%rdx),%xmm8
++ movdqu 16(%rdx),%xmm9
++ cmpq $48,%r9
++ jb .Lskip4x
+
++ subq $48,%r9
++ movdqu 48(%rdx),%xmm14
++ movdqu 64(%rdx),%xmm15
+
+
+
+
+- movdqu (%r8),%xmm3
+- movdqu 16(%r8),%xmm6
+-.byte 102,15,56,0,221
++ movdqu 48(%r8),%xmm6
++ movdqu 32(%r8),%xmm11
+ .byte 102,15,56,0,245
+- pxor %xmm3,%xmm0
+- movdqa %xmm6,%xmm7
+- pshufd $78,%xmm6,%xmm3
+- pshufd $78,%xmm2,%xmm4
+- pxor %xmm6,%xmm3
+- pxor %xmm2,%xmm4
++.byte 102,68,15,56,0,221
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm7
++ pxor %xmm6,%xmm7
+ .byte 102,15,58,68,242,0
+-.byte 102,15,58,68,250,17
+-.byte 102,15,58,68,220,0
+- pxor %xmm6,%xmm3
+- pxor %xmm7,%xmm3
++.byte 102,68,15,58,68,194,17
++.byte 102,65,15,58,68,250,0
++
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,217,0
++.byte 102,69,15,58,68,233,17
++ xorps %xmm11,%xmm6
++.byte 102,69,15,58,68,226,16
++ xorps %xmm13,%xmm8
++ movups 80(%rdx),%xmm10
++ xorps %xmm12,%xmm7
++
++ movdqu 16(%r8),%xmm11
++ movdqu 0(%r8),%xmm3
++.byte 102,68,15,56,0,221
++.byte 102,15,56,0,221
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm3,%xmm0
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,222,0
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,69,15,58,68,238,17
++ xorps %xmm11,%xmm6
++.byte 102,69,15,58,68,226,0
++ xorps %xmm13,%xmm8
++
++ leaq 64(%r8),%r8
++ subq $64,%r9
++ jc .Ltail4x
++
++ jmp .Lmod4_loop
++.p2align 5
++.Lmod4_loop:
++.byte 102,65,15,58,68,199,0
++ xorps %xmm12,%xmm7
++ movdqu 48(%r8),%xmm11
++.byte 102,68,15,56,0,221
++.byte 102,65,15,58,68,207,17
++ xorps %xmm6,%xmm0
++ movdqu 32(%r8),%xmm6
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++.byte 102,65,15,58,68,218,16
++ xorps %xmm8,%xmm1
++ pxor %xmm11,%xmm12
++.byte 102,15,56,0,245
++ movups 32(%rdx),%xmm10
++.byte 102,68,15,58,68,218,0
++ xorps %xmm7,%xmm3
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm7
+
++ pxor %xmm0,%xmm3
++ pxor %xmm6,%xmm7
++ pxor %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+- psrldq $8,%xmm3
++ pslldq $8,%xmm3
++.byte 102,68,15,58,68,234,17
++ psrldq $8,%xmm4
++ pxor %xmm3,%xmm0
++ movdqa .L7_mask(%rip),%xmm3
++ pxor %xmm4,%xmm1
++.byte 102,72,15,110,224
++
++ pand %xmm0,%xmm3
++.byte 102,15,56,0,227
++.byte 102,69,15,58,68,226,0
++ pxor %xmm0,%xmm4
++ psllq $57,%xmm4
++ movdqa %xmm4,%xmm3
+ pslldq $8,%xmm4
+- pxor %xmm3,%xmm7
+- pxor %xmm4,%xmm6
++.byte 102,65,15,58,68,241,0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ movdqu 0(%r8),%xmm3
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++.byte 102,69,15,58,68,193,17
++ xorps %xmm11,%xmm6
++ movdqu 16(%r8),%xmm11
++.byte 102,68,15,56,0,221
++.byte 102,65,15,58,68,250,16
++ xorps %xmm13,%xmm8
++ movups 80(%rdx),%xmm10
++.byte 102,15,56,0,221
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++
++ movdqa %xmm11,%xmm13
++ pxor %xmm12,%xmm7
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,222,0
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ psrlq $1,%xmm0
++.byte 102,69,15,58,68,238,17
++ xorps %xmm11,%xmm6
++ pxor %xmm1,%xmm0
++
++.byte 102,69,15,58,68,226,0
++ xorps %xmm13,%xmm8
++
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm8,%xmm4
+
+- leaq 32(%r8),%r8
+- subq $32,%r9
+- jbe .Leven_tail
++ leaq 64(%r8),%r8
++ subq $64,%r9
++ jnc .Lmod4_loop
++
++.Ltail4x:
++.byte 102,65,15,58,68,199,0
++ xorps %xmm12,%xmm7
++.byte 102,65,15,58,68,207,17
++ xorps %xmm6,%xmm0
++.byte 102,65,15,58,68,218,16
++ xorps %xmm8,%xmm1
++ pxor %xmm0,%xmm1
++ pxor %xmm7,%xmm3
+
+-.Lmod_loop:
+-.byte 102,65,15,58,68,192,0
+-.byte 102,65,15,58,68,200,17
+-.byte 102,15,58,68,220,0
+- pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
++ pxor %xmm0,%xmm1
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- movdqu (%r8),%xmm3
+- pxor %xmm6,%xmm0
+- pxor %xmm7,%xmm1
+
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++ pxor %xmm4,%xmm0
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ addq $64,%r9
++ jz .Ldone
++ movdqu 32(%rdx),%xmm10
++ subq $16,%r9
++ jz .Lodd_tail
++.Lskip4x:
++
++
++
++
++
++ movdqu (%r8),%xmm3
+ movdqu 16(%r8),%xmm6
+ .byte 102,15,56,0,221
+ .byte 102,15,56,0,245
++ pxor %xmm3,%xmm0
++
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm3
++ pxor %xmm6,%xmm3
++.byte 102,15,58,68,242,0
++.byte 102,68,15,58,68,194,17
++.byte 102,65,15,58,68,218,0
++
++ leaq 32(%r8),%r8
++ subq $32,%r9
++ jbe .Leven_tail
++ jmp .Lmod_loop
+
+- movdqa %xmm6,%xmm7
+- pshufd $78,%xmm6,%xmm9
+- pshufd $78,%xmm2,%xmm10
+- pxor %xmm6,%xmm9
+- pxor %xmm2,%xmm10
++.p2align 5
++.Lmod_loop:
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm0,%xmm4
++
++.byte 102,65,15,58,68,193,0
++.byte 102,65,15,58,68,201,17
++.byte 102,65,15,58,68,226,16
++
++ pxor %xmm6,%xmm0
++ pxor %xmm8,%xmm1
++ movdqu (%r8),%xmm8
++.byte 102,68,15,56,0,197
++ movdqu 16(%r8),%xmm6
++
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++ pxor %xmm8,%xmm1
++ pxor %xmm3,%xmm4
++.byte 102,15,56,0,245
++ movdqa %xmm4,%xmm3
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
+
++ movdqa %xmm6,%xmm8
++
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+- pxor %xmm3,%xmm0
+ .byte 102,15,58,68,242,0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ pshufd $78,%xmm8,%xmm3
++ pxor %xmm8,%xmm3
+
+-.byte 102,15,58,68,250,17
++.byte 102,68,15,58,68,194,17
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
+-
+-.byte 102,69,15,58,68,202,0
+- movdqa %xmm0,%xmm1
+- pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm8,%xmm4
+- pxor %xmm0,%xmm3
+- pxor %xmm8,%xmm4
+-
+- pxor %xmm6,%xmm9
+- pxor %xmm7,%xmm9
+- movdqa %xmm9,%xmm10
+- psrldq $8,%xmm9
+- pslldq $8,%xmm10
+- pxor %xmm9,%xmm7
+- pxor %xmm10,%xmm6
++.byte 102,65,15,58,68,218,0
++ pxor %xmm1,%xmm0
+
+ leaq 32(%r8),%r8
+ subq $32,%r9
+ ja .Lmod_loop
+
+ .Leven_tail:
+-.byte 102,65,15,58,68,192,0
+-.byte 102,65,15,58,68,200,17
+-.byte 102,15,58,68,220,0
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm0,%xmm4
++
++.byte 102,65,15,58,68,193,0
++.byte 102,65,15,58,68,201,17
++.byte 102,65,15,58,68,226,16
++
++ pxor %xmm6,%xmm0
++ pxor %xmm8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+-
+- movdqa %xmm3,%xmm4
++ pxor %xmm3,%xmm4
++ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- pxor %xmm6,%xmm0
+- pxor %xmm7,%xmm1
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ testq %r9,%r9
+ jnz .Ldone
+
+@@ -1000,12 +1276,10 @@ gcm_ghash_clmul:
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+-.byte 102,15,58,68,220,0
++.byte 102,65,15,58,68,218,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+@@ -1015,27 +1289,28 @@ gcm_ghash_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ .Ldone:
+ .byte 102,15,56,0,197
+ movdqu %xmm0,(%rcx)
+@@ -1044,15 +1319,42 @@ gcm_ghash_clmul:
+ movaps 32(%rsp),%xmm8
+ movaps 48(%rsp),%xmm9
+ movaps 64(%rsp),%xmm10
+- addq $88,%rsp
+- .byte 0xf3,0xc3
++ movaps 80(%rsp),%xmm11
++ movaps 96(%rsp),%xmm12
++ movaps 112(%rsp),%xmm13
++ movaps 128(%rsp),%xmm14
++ movaps 144(%rsp),%xmm15
++ leaq 168(%rsp),%rsp
+ .LSEH_end_gcm_ghash_clmul:
++ .byte 0xf3,0xc3
++
++.globl gcm_init_avx
++.def gcm_init_avx; .scl 2; .type 32; .endef
++.p2align 5
++gcm_init_avx:
++ jmp .L_init_clmul
++
++.globl gcm_gmult_avx
++.def gcm_gmult_avx; .scl 2; .type 32; .endef
++.p2align 5
++gcm_gmult_avx:
++ jmp .L_gmult_clmul
++
++.globl gcm_ghash_avx
++.def gcm_ghash_avx; .scl 2; .type 32; .endef
++.p2align 5
++gcm_ghash_avx:
++ jmp .L_ghash_clmul
+
+ .p2align 6
+ .Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ .L0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
++.L7_mask:
++.long 7,0,7,0
++.L7_mask_poly:
++.long 7,0,450,0
+ .p2align 6
+
+ .Lrem_4bit:
+@@ -1189,10 +1491,13 @@ se_handler:
+ .rva .LSEH_end_gcm_ghash_4bit
+ .rva .LSEH_info_gcm_ghash_4bit
+
++.rva .LSEH_begin_gcm_init_clmul
++.rva .LSEH_end_gcm_init_clmul
++.rva .LSEH_info_gcm_init_clmul
++
+ .rva .LSEH_begin_gcm_ghash_clmul
+ .rva .LSEH_end_gcm_ghash_clmul
+ .rva .LSEH_info_gcm_ghash_clmul
+-
+ .section .xdata
+ .p2align 3
+ .LSEH_info_gcm_gmult_4bit:
+@@ -1203,11 +1508,20 @@ se_handler:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lghash_prologue,.Lghash_epilogue
++.LSEH_info_gcm_init_clmul:
++.byte 0x01,0x08,0x03,0x00
++.byte 0x08,0x68,0x00,0x00
++.byte 0x04,0x22,0x00,0x00
+ .LSEH_info_gcm_ghash_clmul:
+-.byte 0x01,0x1f,0x0b,0x00
+-.byte 0x1f,0xa8,0x04,0x00
+-.byte 0x19,0x98,0x03,0x00
+-.byte 0x13,0x88,0x02,0x00
+-.byte 0x0d,0x78,0x01,0x00
++.byte 0x01,0x33,0x16,0x00
++.byte 0x33,0xf8,0x09,0x00
++.byte 0x2e,0xe8,0x08,0x00
++.byte 0x29,0xd8,0x07,0x00
++.byte 0x24,0xc8,0x06,0x00
++.byte 0x1f,0xb8,0x05,0x00
++.byte 0x1a,0xa8,0x04,0x00
++.byte 0x15,0x98,0x03,0x00
++.byte 0x10,0x88,0x02,0x00
++.byte 0x0c,0x78,0x01,0x00
+ .byte 0x08,0x68,0x00,0x00
+-.byte 0x04,0xa2,0x00,0x00
++.byte 0x04,0x01,0x15,0x00
+diff --git a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
+index 7bd9665..224a226 100644
+--- a/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
++++ b/lib/accelerated/x86/coff/appro-aes-x86-64-coff.s
+@@ -997,211 +997,423 @@ aesni_ctr32_encrypt_blocks:
+ movq %r9,%rcx
+ movq 40(%rsp),%r8
+
+- leaq -200(%rsp),%rsp
+- movaps %xmm6,32(%rsp)
+- movaps %xmm7,48(%rsp)
+- movaps %xmm8,64(%rsp)
+- movaps %xmm9,80(%rsp)
+- movaps %xmm10,96(%rsp)
+- movaps %xmm11,112(%rsp)
+- movaps %xmm12,128(%rsp)
+- movaps %xmm13,144(%rsp)
+- movaps %xmm14,160(%rsp)
+- movaps %xmm15,176(%rsp)
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $288,%rsp
++ andq $-16,%rsp
++ movaps %xmm6,-168(%rax)
++ movaps %xmm7,-152(%rax)
++ movaps %xmm8,-136(%rax)
++ movaps %xmm9,-120(%rax)
++ movaps %xmm10,-104(%rax)
++ movaps %xmm11,-88(%rax)
++ movaps %xmm12,-72(%rax)
++ movaps %xmm13,-56(%rax)
++ movaps %xmm14,-40(%rax)
++ movaps %xmm15,-24(%rax)
+ .Lctr32_body:
++ leaq -8(%rax),%rbp
++
+ cmpq $1,%rdx
+ je .Lctr32_one_shortcut
+
+- movdqu (%r8),%xmm14
+- movdqa .Lbswap_mask(%rip),%xmm15
+- xorl %eax,%eax
+-.byte 102,69,15,58,22,242,3
+-.byte 102,68,15,58,34,240,3
++ movdqu (%r8),%xmm2
++ movdqu (%rcx),%xmm0
++ movl 12(%r8),%r8d
++ pxor %xmm0,%xmm2
++ movl 12(%rcx),%r11d
++ movdqa %xmm2,0(%rsp)
++ bswapl %r8d
++ movdqa %xmm2,%xmm3
++ movdqa %xmm2,%xmm4
++ movdqa %xmm2,%xmm5
++ movdqa %xmm2,64(%rsp)
++ movdqa %xmm2,80(%rsp)
++ movdqa %xmm2,96(%rsp)
++ movdqa %xmm2,112(%rsp)
+
+ movl 240(%rcx),%eax
++
++ leaq 1(%r8),%r9
++ leaq 2(%r8),%r10
++ bswapl %r9d
+ bswapl %r10d
+- pxor %xmm12,%xmm12
+- pxor %xmm13,%xmm13
+-.byte 102,69,15,58,34,226,0
+- leaq 3(%r10),%r11
+-.byte 102,69,15,58,34,235,0
+- incl %r10d
+-.byte 102,69,15,58,34,226,1
+- incq %r11
+-.byte 102,69,15,58,34,235,1
+- incl %r10d
+-.byte 102,69,15,58,34,226,2
+- incq %r11
+-.byte 102,69,15,58,34,235,2
+- movdqa %xmm12,0(%rsp)
+-.byte 102,69,15,56,0,231
+- movdqa %xmm13,16(%rsp)
+-.byte 102,69,15,56,0,239
+-
+- pshufd $192,%xmm12,%xmm2
+- pshufd $128,%xmm12,%xmm3
+- pshufd $64,%xmm12,%xmm4
+- cmpq $6,%rdx
+- jb .Lctr32_tail
+- shrl $1,%eax
+- movq %rcx,%r11
+- movl %eax,%r10d
+- subq $6,%rdx
+- jmp .Lctr32_loop6
++ xorl %r11d,%r9d
++ xorl %r11d,%r10d
++.byte 102,65,15,58,34,217,3
++ leaq 3(%r8),%r9
++ movdqa %xmm3,16(%rsp)
++.byte 102,65,15,58,34,226,3
++ bswapl %r9d
++ leaq 4(%r8),%r10
++ movdqa %xmm4,32(%rsp)
++ xorl %r11d,%r9d
++ bswapl %r10d
++.byte 102,65,15,58,34,233,3
++ xorl %r11d,%r10d
++ movdqa %xmm5,48(%rsp)
++ leaq 5(%r8),%r9
++ movl %r10d,64+12(%rsp)
++ bswapl %r9d
++ leaq 6(%r8),%r10
++ xorl %r11d,%r9d
++ bswapl %r10d
++ movl %r9d,80+12(%rsp)
++ xorl %r11d,%r10d
++ leaq 7(%r8),%r9
++ movl %r10d,96+12(%rsp)
++ bswapl %r9d
++ xorl %r11d,%r9d
++ movl %r9d,112+12(%rsp)
+
+-.p2align 4
+-.Lctr32_loop6:
+- pshufd $192,%xmm13,%xmm5
+- por %xmm14,%xmm2
+- movups (%r11),%xmm0
+- pshufd $128,%xmm13,%xmm6
+- por %xmm14,%xmm3
+- movups 16(%r11),%xmm1
+- pshufd $64,%xmm13,%xmm7
+- por %xmm14,%xmm4
+- por %xmm14,%xmm5
+- xorps %xmm0,%xmm2
+- por %xmm14,%xmm6
+- por %xmm14,%xmm7
++ movups 16(%rcx),%xmm1
+
++ movdqa 64(%rsp),%xmm6
++ movdqa 80(%rsp),%xmm7
+
++ cmpq $8,%rdx
++ jb .Lctr32_tail
+
++ leaq 128(%rcx),%rcx
++ subq $8,%rdx
++ jmp .Lctr32_loop8
+
+- pxor %xmm0,%xmm3
++.p2align 5
++.Lctr32_loop8:
++ addl $8,%r8d
++ movdqa 96(%rsp),%xmm8
+ .byte 102,15,56,220,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++ movl %r8d,%r9d
++ movdqa 112(%rsp),%xmm9
+ .byte 102,15,56,220,217
+- movdqa .Lincrement32(%rip),%xmm13
+- pxor %xmm0,%xmm5
++ bswapl %r9d
++ movups 32-128(%rcx),%xmm0
+ .byte 102,15,56,220,225
+- movdqa 0(%rsp),%xmm12
+- pxor %xmm0,%xmm6
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++ movl %r9d,0+12(%rsp)
++ leaq 1(%r8),%r9
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+- jmp .Lctr32_enc_loop6_enter
+-.p2align 4
+-.Lctr32_enc_loop6:
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 48-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,16+12(%rsp)
++ leaq 2(%r8),%r9
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 64-128(%rcx),%xmm0
+ .byte 102,15,56,220,209
+ .byte 102,15,56,220,217
+- decl %eax
++ bswapl %r9d
+ .byte 102,15,56,220,225
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,233
++ movl %r9d,32+12(%rsp)
++ leaq 3(%r8),%r9
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+-.Lctr32_enc_loop6_enter:
+- movups 16(%rcx),%xmm1
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 80-128(%rcx),%xmm1
+ .byte 102,15,56,220,208
+ .byte 102,15,56,220,216
+- leaq 32(%rcx),%rcx
++ bswapl %r9d
+ .byte 102,15,56,220,224
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,232
++ movl %r9d,48+12(%rsp)
++ leaq 4(%r8),%r9
+ .byte 102,15,56,220,240
+ .byte 102,15,56,220,248
+- movups (%rcx),%xmm0
+- jnz .Lctr32_enc_loop6
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 96-128(%rcx),%xmm0
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++ bswapl %r9d
++.byte 102,15,56,220,225
++ xorl %r11d,%r9d
++.byte 102,15,56,220,233
++ movl %r9d,64+12(%rsp)
++ leaq 5(%r8),%r9
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 112-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,80+12(%rsp)
++ leaq 6(%r8),%r9
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 128-128(%rcx),%xmm0
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++ bswapl %r9d
++.byte 102,15,56,220,225
++ xorl %r11d,%r9d
++.byte 102,15,56,220,233
++ movl %r9d,96+12(%rsp)
++ leaq 7(%r8),%r9
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 144-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,112+12(%rsp)
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++ movdqu 0(%rdi),%xmm10
++.byte 102,68,15,56,220,200
++ movups 160-128(%rcx),%xmm0
++
++ cmpl $11,%eax
++ jb .Lctr32_enc_done
+
+ .byte 102,15,56,220,209
+- paddd %xmm13,%xmm12
+ .byte 102,15,56,220,217
+- paddd 16(%rsp),%xmm13
+ .byte 102,15,56,220,225
+- movdqa %xmm12,0(%rsp)
+ .byte 102,15,56,220,233
+- movdqa %xmm13,16(%rsp)
+ .byte 102,15,56,220,241
+-.byte 102,69,15,56,0,231
+ .byte 102,15,56,220,249
+-.byte 102,69,15,56,0,239
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 176-128(%rcx),%xmm1
+
+-.byte 102,15,56,221,208
+- movups (%rdi),%xmm8
+-.byte 102,15,56,221,216
+- movups 16(%rdi),%xmm9
+-.byte 102,15,56,221,224
+- movups 32(%rdi),%xmm10
+-.byte 102,15,56,221,232
+- movups 48(%rdi),%xmm11
+-.byte 102,15,56,221,240
+- movups 64(%rdi),%xmm1
+-.byte 102,15,56,221,248
+- movups 80(%rdi),%xmm0
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 192-128(%rcx),%xmm0
++ je .Lctr32_enc_done
+
+- xorps %xmm2,%xmm8
+- pshufd $192,%xmm12,%xmm2
+- xorps %xmm3,%xmm9
+- pshufd $128,%xmm12,%xmm3
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- pshufd $64,%xmm12,%xmm4
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- xorps %xmm6,%xmm1
+- movups %xmm11,48(%rsi)
+- xorps %xmm7,%xmm0
+- movups %xmm1,64(%rsi)
+- movups %xmm0,80(%rsi)
+- leaq 96(%rsi),%rsi
+- movl %r10d,%eax
+- subq $6,%rdx
+- jnc .Lctr32_loop6
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 208-128(%rcx),%xmm1
+
+- addq $6,%rdx
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 224-128(%rcx),%xmm0
++
++.Lctr32_enc_done:
++ movdqu 16(%rdi),%xmm11
++ pxor %xmm0,%xmm10
++ movdqu 32(%rdi),%xmm12
++ pxor %xmm0,%xmm11
++ movdqu 48(%rdi),%xmm13
++ pxor %xmm0,%xmm12
++ movdqu 64(%rdi),%xmm14
++ pxor %xmm0,%xmm13
++ movdqu 80(%rdi),%xmm15
++ pxor %xmm0,%xmm14
++.byte 102,15,56,220,209
++ pxor %xmm0,%xmm15
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movdqu 96(%rdi),%xmm1
++
++.byte 102,65,15,56,221,210
++ pxor %xmm0,%xmm1
++ movdqu 112(%rdi),%xmm10
++ leaq 128(%rdi),%rdi
++.byte 102,65,15,56,221,219
++ pxor %xmm0,%xmm10
++ movdqa 0(%rsp),%xmm11
++.byte 102,65,15,56,221,228
++ movdqa 16(%rsp),%xmm12
++.byte 102,65,15,56,221,237
++ movdqa 32(%rsp),%xmm13
++.byte 102,65,15,56,221,246
++ movdqa 48(%rsp),%xmm14
++.byte 102,65,15,56,221,255
++ movdqa 64(%rsp),%xmm15
++.byte 102,68,15,56,221,193
++ movdqa 80(%rsp),%xmm0
++.byte 102,69,15,56,221,202
++ movups 16-128(%rcx),%xmm1
++
++ movups %xmm2,(%rsi)
++ movdqa %xmm11,%xmm2
++ movups %xmm3,16(%rsi)
++ movdqa %xmm12,%xmm3
++ movups %xmm4,32(%rsi)
++ movdqa %xmm13,%xmm4
++ movups %xmm5,48(%rsi)
++ movdqa %xmm14,%xmm5
++ movups %xmm6,64(%rsi)
++ movdqa %xmm15,%xmm6
++ movups %xmm7,80(%rsi)
++ movdqa %xmm0,%xmm7
++ movups %xmm8,96(%rsi)
++ movups %xmm9,112(%rsi)
++ leaq 128(%rsi),%rsi
++
++ subq $8,%rdx
++ jnc .Lctr32_loop8
++
++ addq $8,%rdx
+ jz .Lctr32_done
+- movq %r11,%rcx
+- leal 1(%rax,%rax,1),%eax
++ leaq -128(%rcx),%rcx
+
+ .Lctr32_tail:
+- por %xmm14,%xmm2
+- movups (%rdi),%xmm8
+- cmpq $2,%rdx
+- jb .Lctr32_one
++ leaq 16(%rcx),%rcx
++ cmpq $4,%rdx
++ jb .Lctr32_loop3
++ je .Lctr32_loop4
+
+- por %xmm14,%xmm3
+- movups 16(%rdi),%xmm9
+- je .Lctr32_two
++ movdqa 96(%rsp),%xmm8
++ pxor %xmm9,%xmm9
+
+- pshufd $192,%xmm13,%xmm5
+- por %xmm14,%xmm4
+- movups 32(%rdi),%xmm10
+- cmpq $4,%rdx
+- jb .Lctr32_three
++ movups 16(%rcx),%xmm0
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++ shrl $1,%eax
++.byte 102,15,56,220,225
++ decl %eax
++.byte 102,15,56,220,233
++ movups (%rdi),%xmm10
++.byte 102,15,56,220,241
++ movups 16(%rdi),%xmm11
++.byte 102,15,56,220,249
++ movups 32(%rdi),%xmm12
++.byte 102,68,15,56,220,193
++ movups 16(%rcx),%xmm1
+
+- pshufd $128,%xmm13,%xmm6
+- por %xmm14,%xmm5
+- movups 48(%rdi),%xmm11
+- je .Lctr32_four
++ call .Lenc_loop8_enter
+
+- por %xmm14,%xmm6
+- xorps %xmm7,%xmm7
++ movdqu 48(%rdi),%xmm13
++ pxor %xmm10,%xmm2
++ movdqu 64(%rdi),%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm10,%xmm6
++ movdqu %xmm5,48(%rsi)
++ movdqu %xmm6,64(%rsi)
++ cmpq $6,%rdx
++ jb .Lctr32_done
+
+- call _aesni_encrypt6
++ movups 80(%rdi),%xmm11
++ xorps %xmm11,%xmm7
++ movups %xmm7,80(%rsi)
++ je .Lctr32_done
+
+- movups 64(%rdi),%xmm1
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- xorps %xmm6,%xmm1
+- movups %xmm11,48(%rsi)
+- movups %xmm1,64(%rsi)
++ movups 96(%rdi),%xmm12
++ xorps %xmm12,%xmm8
++ movups %xmm8,96(%rsi)
++ jmp .Lctr32_done
++
++.p2align 5
++.Lctr32_loop4:
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++ movups (%rcx),%xmm1
++ decl %eax
++ jnz .Lctr32_loop4
++.byte 102,15,56,221,209
++ movups (%rdi),%xmm10
++.byte 102,15,56,221,217
++ movups 16(%rdi),%xmm11
++.byte 102,15,56,221,225
++ movups 32(%rdi),%xmm12
++.byte 102,15,56,221,233
++ movups 48(%rdi),%xmm13
++
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
++ xorps %xmm11,%xmm3
++ movups %xmm3,16(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm5,48(%rsi)
++ jmp .Lctr32_done
++
++.p2align 5
++.Lctr32_loop3:
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++ movups (%rcx),%xmm1
++ decl %eax
++ jnz .Lctr32_loop3
++.byte 102,15,56,221,209
++.byte 102,15,56,221,217
++.byte 102,15,56,221,225
++
++ movups (%rdi),%xmm10
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
++ cmpq $2,%rdx
++ jb .Lctr32_done
++
++ movups 16(%rdi),%xmm11
++ xorps %xmm11,%xmm3
++ movups %xmm3,16(%rsi)
++ je .Lctr32_done
++
++ movups 32(%rdi),%xmm12
++ xorps %xmm12,%xmm4
++ movups %xmm4,32(%rsi)
+ jmp .Lctr32_done
+
+ .p2align 4
+ .Lctr32_one_shortcut:
+ movups (%r8),%xmm2
+- movups (%rdi),%xmm8
++ movups (%rdi),%xmm10
+ movl 240(%rcx),%eax
+-.Lctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+@@ -1213,56 +1425,25 @@ aesni_ctr32_encrypt_blocks:
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+ .byte 102,15,56,221,209
+- xorps %xmm2,%xmm8
+- movups %xmm8,(%rsi)
+- jmp .Lctr32_done
+-
+-.p2align 4
+-.Lctr32_two:
+- xorps %xmm4,%xmm4
+- call _aesni_encrypt3
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- movups %xmm9,16(%rsi)
+- jmp .Lctr32_done
+-
+-.p2align 4
+-.Lctr32_three:
+- call _aesni_encrypt3
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- movups %xmm10,32(%rsi)
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
+ jmp .Lctr32_done
+
+ .p2align 4
+-.Lctr32_four:
+- call _aesni_encrypt4
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- movups %xmm11,48(%rsi)
+-
+ .Lctr32_done:
+- movaps 32(%rsp),%xmm6
+- movaps 48(%rsp),%xmm7
+- movaps 64(%rsp),%xmm8
+- movaps 80(%rsp),%xmm9
+- movaps 96(%rsp),%xmm10
+- movaps 112(%rsp),%xmm11
+- movaps 128(%rsp),%xmm12
+- movaps 144(%rsp),%xmm13
+- movaps 160(%rsp),%xmm14
+- movaps 176(%rsp),%xmm15
+- leaq 200(%rsp),%rsp
+-.Lctr32_ret:
++ movaps -160(%rbp),%xmm6
++ movaps -144(%rbp),%xmm7
++ movaps -128(%rbp),%xmm8
++ movaps -112(%rbp),%xmm9
++ movaps -96(%rbp),%xmm10
++ movaps -80(%rbp),%xmm11
++ movaps -64(%rbp),%xmm12
++ movaps -48(%rbp),%xmm13
++ movaps -32(%rbp),%xmm14
++ movaps -16(%rbp),%xmm15
++ leaq (%rbp),%rsp
++ popq %rbp
++.Lctr32_epilogue:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+@@ -1282,18 +1463,22 @@ aesni_xts_encrypt:
+ movq 40(%rsp),%r8
+ movq 48(%rsp),%r9
+
+- leaq -264(%rsp),%rsp
+- movaps %xmm6,96(%rsp)
+- movaps %xmm7,112(%rsp)
+- movaps %xmm8,128(%rsp)
+- movaps %xmm9,144(%rsp)
+- movaps %xmm10,160(%rsp)
+- movaps %xmm11,176(%rsp)
+- movaps %xmm12,192(%rsp)
+- movaps %xmm13,208(%rsp)
+- movaps %xmm14,224(%rsp)
+- movaps %xmm15,240(%rsp)
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $272,%rsp
++ andq $-16,%rsp
++ movaps %xmm6,-168(%rax)
++ movaps %xmm7,-152(%rax)
++ movaps %xmm8,-136(%rax)
++ movaps %xmm9,-120(%rax)
++ movaps %xmm10,-104(%rax)
++ movaps %xmm11,-88(%rax)
++ movaps %xmm12,-72(%rax)
++ movaps %xmm13,-56(%rax)
++ movaps %xmm14,-40(%rax)
++ movaps %xmm15,-24(%rax)
+ .Lxts_enc_body:
++ leaq -8(%rax),%rbp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+@@ -1308,228 +1493,266 @@ aesni_xts_encrypt:
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_8
+ .byte 102,68,15,56,221,249
++ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
++ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
++ movups 16(%rcx,%r10,1),%xmm1
++ movl %eax,%r10d
++
+ movdqa .Lxts_magic(%rip),%xmm8
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pshufd $95,%xmm15,%xmm9
++ pxor %xmm0,%xmm1
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm10
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm11
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm12
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
++ psrad $31,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm13
++ pxor %xmm14,%xmm15
++ movdqa %xmm15,%xmm14
++ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
++ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
++ movaps %xmm1,96(%rsp)
++
+ subq $96,%rdx
+ jc .Lxts_enc_short
+
+ shrl $1,%eax
+- subl $1,%eax
++ subl $3,%eax
++ movups 16(%r11),%xmm1
+ movl %eax,%r10d
++ leaq .Lxts_magic(%rip),%r8
+ jmp .Lxts_enc_grandloop
+
+-.p2align 4
++.p2align 5
+ .Lxts_enc_grandloop:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+- pand %xmm8,%xmm9
++ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+- movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+- movdqu 48(%rdi),%xmm5
++ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+- movdqu 64(%rdi),%xmm6
++.byte 102,15,56,220,209
++ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+- movdqu 80(%rdi),%xmm7
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,220,217
++ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+- movups (%r11),%xmm0
++.byte 102,15,56,220,225
++ movdqu 80(%rdi),%xmm7
++ pxor %xmm15,%xmm8
++ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+- pxor %xmm15,%xmm7
+-
+-
++.byte 102,15,56,220,233
++ movups 32(%r11),%xmm0
++ leaq 96(%rdi),%rdi
++ pxor %xmm8,%xmm7
+
+- movups 16(%r11),%xmm1
+- pxor %xmm0,%xmm2
+- pxor %xmm0,%xmm3
++ pxor %xmm9,%xmm10
++.byte 102,15,56,220,241
++ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+-.byte 102,15,56,220,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++.byte 102,15,56,220,249
++ movups 48(%r11),%xmm1
++
++.byte 102,15,56,220,208
++ pxor %xmm9,%xmm12
+ movdqa %xmm11,16(%rsp)
+-.byte 102,15,56,220,217
+- pxor %xmm0,%xmm5
++.byte 102,15,56,220,216
++ pxor %xmm9,%xmm13
+ movdqa %xmm12,32(%rsp)
+-.byte 102,15,56,220,225
+- pxor %xmm0,%xmm6
+- movdqa %xmm13,48(%rsp)
+-.byte 102,15,56,220,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++.byte 102,15,56,220,224
++ pxor %xmm9,%xmm14
++.byte 102,15,56,220,232
++ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+-.byte 102,15,56,220,241
+- movdqa %xmm15,80(%rsp)
+-.byte 102,15,56,220,249
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- jmp .Lxts_enc_loop6_enter
+-
+-.p2align 4
++.byte 102,15,56,220,240
++ movdqa %xmm8,80(%rsp)
++.byte 102,15,56,220,248
++ movups 64(%r11),%xmm0
++ leaq 64(%r11),%rcx
++ pshufd $95,%xmm15,%xmm9
++ jmp .Lxts_enc_loop6
++.p2align 5
+ .Lxts_enc_loop6:
+ .byte 102,15,56,220,209
+ .byte 102,15,56,220,217
+- decl %eax
+ .byte 102,15,56,220,225
+ .byte 102,15,56,220,233
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+-.Lxts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
++ leaq 32(%rcx),%rcx
++
+ .byte 102,15,56,220,208
+ .byte 102,15,56,220,216
+- leaq 32(%rcx),%rcx
+ .byte 102,15,56,220,224
+ .byte 102,15,56,220,232
+ .byte 102,15,56,220,240
+ .byte 102,15,56,220,248
+ movups (%rcx),%xmm0
++ decl %eax
+ jnz .Lxts_enc_loop6
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- paddq %xmm15,%xmm15
++ movdqa (%r8),%xmm8
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,209
+- pand %xmm8,%xmm9
++ paddq %xmm15,%xmm15
++ psrad $31,%xmm14
+ .byte 102,15,56,220,217
+- pcmpgtd %xmm15,%xmm14
++ pand %xmm8,%xmm14
++ movups (%r11),%xmm10
+ .byte 102,15,56,220,225
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,220,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,241
++ movaps %xmm10,%xmm11
+ .byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm10
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,208
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm10
++ psrad $31,%xmm14
+ .byte 102,15,56,220,216
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,220,224
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,220,232
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,240
++ movaps %xmm11,%xmm12
+ .byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm11
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,209
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm11
++ psrad $31,%xmm14
+ .byte 102,15,56,220,217
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,220,225
+- pxor %xmm9,%xmm15
++ movdqa %xmm13,48(%rsp)
+ .byte 102,15,56,220,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,241
++ movaps %xmm12,%xmm13
+ .byte 102,15,56,220,249
++ movups 48(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm12
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
++.byte 102,15,56,220,208
++ pxor %xmm15,%xmm12
++ psrad $31,%xmm14
++.byte 102,15,56,220,216
+ paddq %xmm15,%xmm15
+-.byte 102,15,56,221,208
+- pand %xmm8,%xmm9
+-.byte 102,15,56,221,216
+- pcmpgtd %xmm15,%xmm14
+-.byte 102,15,56,221,224
+- pxor %xmm9,%xmm15
+-.byte 102,15,56,221,232
+-.byte 102,15,56,221,240
+-.byte 102,15,56,221,248
++ pand %xmm8,%xmm14
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++ pxor %xmm14,%xmm15
++.byte 102,15,56,220,240
++ movaps %xmm13,%xmm14
++.byte 102,15,56,220,248
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm13
++ movdqa %xmm9,%xmm0
++ paddd %xmm9,%xmm9
++.byte 102,15,56,220,209
++ pxor %xmm15,%xmm13
++ psrad $31,%xmm0
++.byte 102,15,56,220,217
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm0
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++ pxor %xmm0,%xmm15
++ movups (%r11),%xmm0
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++ movups 16(%r11),%xmm1
++
++ pxor %xmm15,%xmm14
++ psrad $31,%xmm9
++.byte 102,15,56,221,84,36,0
+ paddq %xmm15,%xmm15
+- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+- xorps 16(%rsp),%xmm3
+- pcmpgtd %xmm15,%xmm14
++.byte 102,15,56,221,92,36,16
++.byte 102,15,56,221,100,36,32
+ pxor %xmm9,%xmm15
+-
+- xorps 32(%rsp),%xmm4
+- movups %xmm2,0(%rsi)
+- xorps 48(%rsp),%xmm5
+- movups %xmm3,16(%rsi)
+- xorps 64(%rsp),%xmm6
+- movups %xmm4,32(%rsi)
+- xorps 80(%rsp),%xmm7
+- movups %xmm5,48(%rsi)
++.byte 102,15,56,221,108,36,48
++.byte 102,15,56,221,116,36,64
++.byte 102,15,56,221,124,36,80
+ movl %r10d,%eax
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
++
+ leaq 96(%rsi),%rsi
++ movups %xmm2,-96(%rsi)
++ movups %xmm3,-80(%rsi)
++ movups %xmm4,-64(%rsi)
++ movups %xmm5,-48(%rsi)
++ movups %xmm6,-32(%rsi)
++ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc .Lxts_enc_grandloop
+
+- leal 3(%rax,%rax,1),%eax
++ leal 7(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+ .Lxts_enc_short:
++ pxor %xmm0,%xmm10
+ addq $96,%rdx
+ jz .Lxts_enc_done
+
++ pxor %xmm0,%xmm11
+ cmpq $32,%rdx
+ jb .Lxts_enc_one
++ pxor %xmm0,%xmm12
+ je .Lxts_enc_two
+
++ pxor %xmm0,%xmm13
+ cmpq $64,%rdx
+ jb .Lxts_enc_three
++ pxor %xmm0,%xmm14
+ je .Lxts_enc_four
+
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+@@ -1632,15 +1855,15 @@ aesni_xts_encrypt:
+
+ call _aesni_encrypt4
+
+- xorps %xmm10,%xmm2
+- movdqa %xmm15,%xmm10
+- xorps %xmm11,%xmm3
+- xorps %xmm12,%xmm4
+- movups %xmm2,(%rsi)
+- xorps %xmm13,%xmm5
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
++ pxor %xmm10,%xmm2
++ movdqa %xmm14,%xmm10
++ pxor %xmm11,%xmm3
++ pxor %xmm12,%xmm4
++ movdqu %xmm2,(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm3,16(%rsi)
++ movdqu %xmm4,32(%rsi)
++ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+@@ -1681,17 +1904,18 @@ aesni_xts_encrypt:
+ movups %xmm2,-16(%rsi)
+
+ .Lxts_enc_ret:
+- movaps 96(%rsp),%xmm6
+- movaps 112(%rsp),%xmm7
+- movaps 128(%rsp),%xmm8
+- movaps 144(%rsp),%xmm9
+- movaps 160(%rsp),%xmm10
+- movaps 176(%rsp),%xmm11
+- movaps 192(%rsp),%xmm12
+- movaps 208(%rsp),%xmm13
+- movaps 224(%rsp),%xmm14
+- movaps 240(%rsp),%xmm15
+- leaq 264(%rsp),%rsp
++ movaps -160(%rbp),%xmm6
++ movaps -144(%rbp),%xmm7
++ movaps -128(%rbp),%xmm8
++ movaps -112(%rbp),%xmm9
++ movaps -96(%rbp),%xmm10
++ movaps -80(%rbp),%xmm11
++ movaps -64(%rbp),%xmm12
++ movaps -48(%rbp),%xmm13
++ movaps -32(%rbp),%xmm14
++ movaps -16(%rbp),%xmm15
++ leaq (%rbp),%rsp
++ popq %rbp
+ .Lxts_enc_epilogue:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+@@ -1712,18 +1936,22 @@ aesni_xts_decrypt:
+ movq 40(%rsp),%r8
+ movq 48(%rsp),%r9
+
+- leaq -264(%rsp),%rsp
+- movaps %xmm6,96(%rsp)
+- movaps %xmm7,112(%rsp)
+- movaps %xmm8,128(%rsp)
+- movaps %xmm9,144(%rsp)
+- movaps %xmm10,160(%rsp)
+- movaps %xmm11,176(%rsp)
+- movaps %xmm12,192(%rsp)
+- movaps %xmm13,208(%rsp)
+- movaps %xmm14,224(%rsp)
+- movaps %xmm15,240(%rsp)
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $272,%rsp
++ andq $-16,%rsp
++ movaps %xmm6,-168(%rax)
++ movaps %xmm7,-152(%rax)
++ movaps %xmm8,-136(%rax)
++ movaps %xmm9,-120(%rax)
++ movaps %xmm10,-104(%rax)
++ movaps %xmm11,-88(%rax)
++ movaps %xmm12,-72(%rax)
++ movaps %xmm13,-56(%rax)
++ movaps %xmm14,-40(%rax)
++ movaps %xmm15,-24(%rax)
+ .Lxts_dec_body:
++ leaq -8(%rax),%rbp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+@@ -1744,228 +1972,266 @@ aesni_xts_decrypt:
+ shlq $4,%rax
+ subq %rax,%rdx
+
++ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
++ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
++ movups 16(%rcx,%r10,1),%xmm1
++ movl %eax,%r10d
++
+ movdqa .Lxts_magic(%rip),%xmm8
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pshufd $95,%xmm15,%xmm9
++ pxor %xmm0,%xmm1
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm10
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm11
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm12
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
++ psrad $31,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm13
++ pxor %xmm14,%xmm15
++ movdqa %xmm15,%xmm14
++ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
++ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
++ movaps %xmm1,96(%rsp)
++
+ subq $96,%rdx
+ jc .Lxts_dec_short
+
+ shrl $1,%eax
+- subl $1,%eax
++ subl $3,%eax
++ movups 16(%r11),%xmm1
+ movl %eax,%r10d
++ leaq .Lxts_magic(%rip),%r8
+ jmp .Lxts_dec_grandloop
+
+-.p2align 4
++.p2align 5
+ .Lxts_dec_grandloop:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+- pand %xmm8,%xmm9
++ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+- movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+- movdqu 48(%rdi),%xmm5
++ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+- movdqu 64(%rdi),%xmm6
++.byte 102,15,56,222,209
++ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+- movdqu 80(%rdi),%xmm7
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,222,217
++ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+- movups (%r11),%xmm0
++.byte 102,15,56,222,225
++ movdqu 80(%rdi),%xmm7
++ pxor %xmm15,%xmm8
++ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+- pxor %xmm15,%xmm7
+-
+-
++.byte 102,15,56,222,233
++ movups 32(%r11),%xmm0
++ leaq 96(%rdi),%rdi
++ pxor %xmm8,%xmm7
+
+- movups 16(%r11),%xmm1
+- pxor %xmm0,%xmm2
+- pxor %xmm0,%xmm3
++ pxor %xmm9,%xmm10
++.byte 102,15,56,222,241
++ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+-.byte 102,15,56,222,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++.byte 102,15,56,222,249
++ movups 48(%r11),%xmm1
++
++.byte 102,15,56,222,208
++ pxor %xmm9,%xmm12
+ movdqa %xmm11,16(%rsp)
+-.byte 102,15,56,222,217
+- pxor %xmm0,%xmm5
++.byte 102,15,56,222,216
++ pxor %xmm9,%xmm13
+ movdqa %xmm12,32(%rsp)
+-.byte 102,15,56,222,225
+- pxor %xmm0,%xmm6
+- movdqa %xmm13,48(%rsp)
+-.byte 102,15,56,222,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++.byte 102,15,56,222,224
++ pxor %xmm9,%xmm14
++.byte 102,15,56,222,232
++ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+-.byte 102,15,56,222,241
+- movdqa %xmm15,80(%rsp)
+-.byte 102,15,56,222,249
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- jmp .Lxts_dec_loop6_enter
+-
+-.p2align 4
++.byte 102,15,56,222,240
++ movdqa %xmm8,80(%rsp)
++.byte 102,15,56,222,248
++ movups 64(%r11),%xmm0
++ leaq 64(%r11),%rcx
++ pshufd $95,%xmm15,%xmm9
++ jmp .Lxts_dec_loop6
++.p2align 5
+ .Lxts_dec_loop6:
+ .byte 102,15,56,222,209
+ .byte 102,15,56,222,217
+- decl %eax
+ .byte 102,15,56,222,225
+ .byte 102,15,56,222,233
+ .byte 102,15,56,222,241
+ .byte 102,15,56,222,249
+-.Lxts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
++ leaq 32(%rcx),%rcx
++
+ .byte 102,15,56,222,208
+ .byte 102,15,56,222,216
+- leaq 32(%rcx),%rcx
+ .byte 102,15,56,222,224
+ .byte 102,15,56,222,232
+ .byte 102,15,56,222,240
+ .byte 102,15,56,222,248
+ movups (%rcx),%xmm0
++ decl %eax
+ jnz .Lxts_dec_loop6
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- paddq %xmm15,%xmm15
++ movdqa (%r8),%xmm8
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,209
+- pand %xmm8,%xmm9
++ paddq %xmm15,%xmm15
++ psrad $31,%xmm14
+ .byte 102,15,56,222,217
+- pcmpgtd %xmm15,%xmm14
++ pand %xmm8,%xmm14
++ movups (%r11),%xmm10
+ .byte 102,15,56,222,225
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,222,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,241
++ movaps %xmm10,%xmm11
+ .byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm10
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,208
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm10
++ psrad $31,%xmm14
+ .byte 102,15,56,222,216
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,222,224
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,222,232
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,240
++ movaps %xmm11,%xmm12
+ .byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm11
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,209
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm11
++ psrad $31,%xmm14
+ .byte 102,15,56,222,217
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,222,225
+- pxor %xmm9,%xmm15
++ movdqa %xmm13,48(%rsp)
+ .byte 102,15,56,222,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,241
++ movaps %xmm12,%xmm13
+ .byte 102,15,56,222,249
++ movups 48(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm12
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
++.byte 102,15,56,222,208
++ pxor %xmm15,%xmm12
++ psrad $31,%xmm14
++.byte 102,15,56,222,216
+ paddq %xmm15,%xmm15
+-.byte 102,15,56,223,208
+- pand %xmm8,%xmm9
+-.byte 102,15,56,223,216
+- pcmpgtd %xmm15,%xmm14
+-.byte 102,15,56,223,224
+- pxor %xmm9,%xmm15
+-.byte 102,15,56,223,232
+-.byte 102,15,56,223,240
+-.byte 102,15,56,223,248
++ pand %xmm8,%xmm14
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++ pxor %xmm14,%xmm15
++.byte 102,15,56,222,240
++ movaps %xmm13,%xmm14
++.byte 102,15,56,222,248
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm13
++ movdqa %xmm9,%xmm0
++ paddd %xmm9,%xmm9
++.byte 102,15,56,222,209
++ pxor %xmm15,%xmm13
++ psrad $31,%xmm0
++.byte 102,15,56,222,217
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm0
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++ pxor %xmm0,%xmm15
++ movups (%r11),%xmm0
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++ movups 16(%r11),%xmm1
++
++ pxor %xmm15,%xmm14
++ psrad $31,%xmm9
++.byte 102,15,56,223,84,36,0
+ paddq %xmm15,%xmm15
+- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+- xorps 16(%rsp),%xmm3
+- pcmpgtd %xmm15,%xmm14
++.byte 102,15,56,223,92,36,16
++.byte 102,15,56,223,100,36,32
+ pxor %xmm9,%xmm15
+-
+- xorps 32(%rsp),%xmm4
+- movups %xmm2,0(%rsi)
+- xorps 48(%rsp),%xmm5
+- movups %xmm3,16(%rsi)
+- xorps 64(%rsp),%xmm6
+- movups %xmm4,32(%rsi)
+- xorps 80(%rsp),%xmm7
+- movups %xmm5,48(%rsi)
++.byte 102,15,56,223,108,36,48
++.byte 102,15,56,223,116,36,64
++.byte 102,15,56,223,124,36,80
+ movl %r10d,%eax
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
++
+ leaq 96(%rsi),%rsi
++ movups %xmm2,-96(%rsi)
++ movups %xmm3,-80(%rsi)
++ movups %xmm4,-64(%rsi)
++ movups %xmm5,-48(%rsi)
++ movups %xmm6,-32(%rsi)
++ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc .Lxts_dec_grandloop
+
+- leal 3(%rax,%rax,1),%eax
++ leal 7(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+ .Lxts_dec_short:
++ pxor %xmm0,%xmm10
++ pxor %xmm0,%xmm11
+ addq $96,%rdx
+ jz .Lxts_dec_done
+
++ pxor %xmm0,%xmm12
+ cmpq $32,%rdx
+ jb .Lxts_dec_one
++ pxor %xmm0,%xmm13
+ je .Lxts_dec_two
+
++ pxor %xmm0,%xmm14
+ cmpq $64,%rdx
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+@@ -2058,7 +2324,7 @@ aesni_xts_decrypt:
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+- movdqa %xmm15,%xmm11
++ movdqa %xmm14,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+@@ -2068,14 +2334,8 @@ aesni_xts_decrypt:
+
+ .p2align 4
+ .Lxts_dec_four:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+@@ -2086,16 +2346,16 @@ aesni_xts_decrypt:
+
+ call _aesni_decrypt4
+
+- xorps %xmm10,%xmm2
++ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+- xorps %xmm11,%xmm3
++ pxor %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+- xorps %xmm12,%xmm4
+- movups %xmm2,(%rsi)
+- xorps %xmm13,%xmm5
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm2,(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm3,16(%rsi)
++ movdqu %xmm4,32(%rsi)
++ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+@@ -2155,17 +2415,18 @@ aesni_xts_decrypt:
+ movups %xmm2,(%rsi)
+
+ .Lxts_dec_ret:
+- movaps 96(%rsp),%xmm6
+- movaps 112(%rsp),%xmm7
+- movaps 128(%rsp),%xmm8
+- movaps 144(%rsp),%xmm9
+- movaps 160(%rsp),%xmm10
+- movaps 176(%rsp),%xmm11
+- movaps 192(%rsp),%xmm12
+- movaps 208(%rsp),%xmm13
+- movaps 224(%rsp),%xmm14
+- movaps 240(%rsp),%xmm15
+- leaq 264(%rsp),%rsp
++ movaps -160(%rbp),%xmm6
++ movaps -144(%rbp),%xmm7
++ movaps -128(%rbp),%xmm8
++ movaps -112(%rbp),%xmm9
++ movaps -96(%rbp),%xmm10
++ movaps -80(%rbp),%xmm11
++ movaps -64(%rbp),%xmm12
++ movaps -48(%rbp),%xmm13
++ movaps -32(%rbp),%xmm14
++ movaps -16(%rbp),%xmm15
++ leaq (%rbp),%rsp
++ popq %rbp
+ .Lxts_dec_epilogue:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+@@ -2245,155 +2506,335 @@ aesni_cbc_encrypt:
+
+ .p2align 4
+ .Lcbc_decrypt:
+- leaq -88(%rsp),%rsp
+- movaps %xmm6,(%rsp)
+- movaps %xmm7,16(%rsp)
+- movaps %xmm8,32(%rsp)
+- movaps %xmm9,48(%rsp)
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $176,%rsp
++ andq $-16,%rsp
++ movaps %xmm6,16(%rsp)
++ movaps %xmm7,32(%rsp)
++ movaps %xmm8,48(%rsp)
++ movaps %xmm9,64(%rsp)
++ movaps %xmm10,80(%rsp)
++ movaps %xmm11,96(%rsp)
++ movaps %xmm12,112(%rsp)
++ movaps %xmm13,128(%rsp)
++ movaps %xmm14,144(%rsp)
++ movaps %xmm15,160(%rsp)
+ .Lcbc_decrypt_body:
+- movups (%r8),%xmm9
++ leaq -8(%rax),%rbp
++ movups (%r8),%xmm10
+ movl %r10d,%eax
+- cmpq $112,%rdx
++ cmpq $80,%rdx
+ jbe .Lcbc_dec_tail
+- shrl $1,%r10d
++
++ movups (%rcx),%xmm0
++ movdqu 0(%rdi),%xmm2
++ movdqu 16(%rdi),%xmm3
++ movdqa %xmm2,%xmm11
++ movdqu 32(%rdi),%xmm4
++ movdqa %xmm3,%xmm12
++ movdqu 48(%rdi),%xmm5
++ movdqa %xmm4,%xmm13
++ movdqu 64(%rdi),%xmm6
++ movdqa %xmm5,%xmm14
++ movdqu 80(%rdi),%xmm7
++ movdqa %xmm6,%xmm15
++ cmpq $112,%rdx
++ jbe .Lcbc_dec_six_or_seven
++
+ subq $112,%rdx
+- movl %r10d,%eax
+- movaps %xmm9,64(%rsp)
++ leaq 112(%rcx),%rcx
+ jmp .Lcbc_dec_loop8_enter
+ .p2align 4
+ .Lcbc_dec_loop8:
+- movaps %xmm0,64(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ .Lcbc_dec_loop8_enter:
+- movups (%rcx),%xmm0
+- movups (%rdi),%xmm2
+- movups 16(%rdi),%xmm3
+- movups 16(%rcx),%xmm1
++ movdqu 96(%rdi),%xmm8
++ pxor %xmm0,%xmm2
++ movdqu 112(%rdi),%xmm9
++ pxor %xmm0,%xmm3
++ movups 16-112(%rcx),%xmm1
++ pxor %xmm0,%xmm4
++ xorq %r11,%r11
++ cmpq $112,%rdx
++ pxor %xmm0,%xmm5
++ pxor %xmm0,%xmm6
++ pxor %xmm0,%xmm7
++ pxor %xmm0,%xmm8
+
+- leaq 32(%rcx),%rcx
+- movdqu 32(%rdi),%xmm4
+- xorps %xmm0,%xmm2
+- movdqu 48(%rdi),%xmm5
+- xorps %xmm0,%xmm3
+- movdqu 64(%rdi),%xmm6
+ .byte 102,15,56,222,209
+- pxor %xmm0,%xmm4
+- movdqu 80(%rdi),%xmm7
++ pxor %xmm0,%xmm9
++ movups 32-112(%rcx),%xmm0
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++ setnc %r11b
++.byte 102,68,15,56,222,193
++ shlq $7,%r11
++.byte 102,68,15,56,222,201
++ addq %rdi,%r11
++ movups 48-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 64-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 80-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 96-112(%rcx),%xmm0
++.byte 102,15,56,222,209
+ .byte 102,15,56,222,217
+- pxor %xmm0,%xmm5
+- movdqu 96(%rdi),%xmm8
+ .byte 102,15,56,222,225
+- pxor %xmm0,%xmm6
+- movdqu 112(%rdi),%xmm9
+ .byte 102,15,56,222,233
+- pxor %xmm0,%xmm7
+- decl %eax
+ .byte 102,15,56,222,241
+- pxor %xmm0,%xmm8
+ .byte 102,15,56,222,249
+- pxor %xmm0,%xmm9
+- movups (%rcx),%xmm0
+ .byte 102,68,15,56,222,193
+ .byte 102,68,15,56,222,201
+- movups 16(%rcx),%xmm1
+-
+- call .Ldec_loop8_enter
++ movups 112-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 128-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 144-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 160-112(%rcx),%xmm0
++ cmpl $11,%eax
++ jb .Lcbc_dec_done
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 176-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 192-112(%rcx),%xmm0
++ je .Lcbc_dec_done
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 208-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 224-112(%rcx),%xmm0
++.Lcbc_dec_done:
++.byte 102,15,56,222,209
++ pxor %xmm0,%xmm10
++.byte 102,15,56,222,217
++ pxor %xmm0,%xmm11
++.byte 102,15,56,222,225
++ pxor %xmm0,%xmm12
++.byte 102,15,56,222,233
++ pxor %xmm0,%xmm13
++.byte 102,15,56,222,241
++ pxor %xmm0,%xmm14
++.byte 102,15,56,222,249
++ pxor %xmm0,%xmm15
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movdqu 80(%rdi),%xmm1
++
++.byte 102,65,15,56,223,210
++ movdqu 96(%rdi),%xmm10
++ pxor %xmm0,%xmm1
++.byte 102,65,15,56,223,219
++ pxor %xmm0,%xmm10
++ movdqu 112(%rdi),%xmm0
++ leaq 128(%rdi),%rdi
++.byte 102,65,15,56,223,228
++ movdqu 0(%r11),%xmm11
++.byte 102,65,15,56,223,237
++ movdqu 16(%r11),%xmm12
++.byte 102,65,15,56,223,246
++ movdqu 32(%r11),%xmm13
++.byte 102,65,15,56,223,255
++ movdqu 48(%r11),%xmm14
++.byte 102,68,15,56,223,193
++ movdqu 64(%r11),%xmm15
++.byte 102,69,15,56,223,202
++ movdqa %xmm0,%xmm10
++ movdqu 80(%r11),%xmm1
++ movups -112(%rcx),%xmm0
+
+- movups (%rdi),%xmm1
+- movups 16(%rdi),%xmm0
+- xorps 64(%rsp),%xmm2
+- xorps %xmm1,%xmm3
+- movups 32(%rdi),%xmm1
+- xorps %xmm0,%xmm4
+- movups 48(%rdi),%xmm0
+- xorps %xmm1,%xmm5
+- movups 64(%rdi),%xmm1
+- xorps %xmm0,%xmm6
+- movups 80(%rdi),%xmm0
+- xorps %xmm1,%xmm7
+- movups 96(%rdi),%xmm1
+- xorps %xmm0,%xmm8
+- movups 112(%rdi),%xmm0
+- xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
++ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
++ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
++ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+- movl %r10d,%eax
++ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+- movq %r11,%rcx
++ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+- leaq 128(%rdi),%rdi
++ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
++
+ subq $128,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+- movaps %xmm0,%xmm9
++ leaq -112(%rcx),%rcx
+ addq $112,%rdx
+ jle .Lcbc_dec_tail_collected
+- movups %xmm2,(%rsi)
+- leal 1(%r10,%r10,1),%eax
++ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
++ cmpq $80,%rdx
++ jbe .Lcbc_dec_tail
++
++ movaps %xmm11,%xmm2
++.Lcbc_dec_six_or_seven:
++ cmpq $96,%rdx
++ ja .Lcbc_dec_seven
++
++ movaps %xmm7,%xmm8
++ call _aesni_decrypt6
++ pxor %xmm10,%xmm2
++ movaps %xmm8,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ pxor %xmm15,%xmm7
++ movdqu %xmm6,64(%rsi)
++ leaq 80(%rsi),%rsi
++ movdqa %xmm7,%xmm2
++ jmp .Lcbc_dec_tail_collected
++
++.p2align 4
++.Lcbc_dec_seven:
++ movups 96(%rdi),%xmm8
++ xorps %xmm9,%xmm9
++ call _aesni_decrypt8
++ movups 80(%rdi),%xmm9
++ pxor %xmm10,%xmm2
++ movups 96(%rdi),%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ pxor %xmm15,%xmm7
++ movdqu %xmm6,64(%rsi)
++ pxor %xmm9,%xmm8
++ movdqu %xmm7,80(%rsi)
++ leaq 96(%rsi),%rsi
++ movdqa %xmm8,%xmm2
++ jmp .Lcbc_dec_tail_collected
++
+ .Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+- movaps %xmm2,%xmm8
+- cmpq $16,%rdx
++ subq $16,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+- movaps %xmm3,%xmm7
+- cmpq $32,%rdx
++ movaps %xmm2,%xmm11
++ subq $16,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+- movaps %xmm4,%xmm6
+- cmpq $48,%rdx
++ movaps %xmm3,%xmm12
++ subq $16,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+- cmpq $64,%rdx
++ movaps %xmm4,%xmm13
++ subq $16,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+- cmpq $80,%rdx
+- jbe .Lcbc_dec_five
+-
+- movups 80(%rdi),%xmm7
+- cmpq $96,%rdx
+- jbe .Lcbc_dec_six
+-
+- movups 96(%rdi),%xmm8
+- movaps %xmm9,64(%rsp)
+- call _aesni_decrypt8
+- movups (%rdi),%xmm1
+- movups 16(%rdi),%xmm0
+- xorps 64(%rsp),%xmm2
+- xorps %xmm1,%xmm3
+- movups 32(%rdi),%xmm1
+- xorps %xmm0,%xmm4
+- movups 48(%rdi),%xmm0
+- xorps %xmm1,%xmm5
+- movups 64(%rdi),%xmm1
+- xorps %xmm0,%xmm6
+- movups 80(%rdi),%xmm0
+- xorps %xmm1,%xmm7
+- movups 96(%rdi),%xmm9
+- xorps %xmm0,%xmm8
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
+- leaq 96(%rsi),%rsi
+- movaps %xmm8,%xmm2
+- subq $112,%rdx
++ movaps %xmm5,%xmm14
++ movaps %xmm6,%xmm15
++ xorps %xmm7,%xmm7
++ call _aesni_decrypt6
++ pxor %xmm10,%xmm2
++ movaps %xmm15,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ leaq 64(%rsi),%rsi
++ movdqa %xmm6,%xmm2
++ subq $16,%rdx
+ jmp .Lcbc_dec_tail_collected
++
+ .p2align 4
+ .Lcbc_dec_one:
++ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+@@ -2405,116 +2846,79 @@ aesni_cbc_encrypt:
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+ .byte 102,15,56,223,209
+- xorps %xmm9,%xmm2
+- movaps %xmm8,%xmm9
+- subq $16,%rdx
++ xorps %xmm10,%xmm2
++ movaps %xmm11,%xmm10
+ jmp .Lcbc_dec_tail_collected
+ .p2align 4
+ .Lcbc_dec_two:
++ movaps %xmm3,%xmm12
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- movaps %xmm7,%xmm9
+- movaps %xmm3,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm12,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ movdqa %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+- subq $32,%rdx
+ jmp .Lcbc_dec_tail_collected
+ .p2align 4
+ .Lcbc_dec_three:
++ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- xorps %xmm7,%xmm4
+- movups %xmm3,16(%rsi)
+- movaps %xmm6,%xmm9
+- movaps %xmm4,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm13,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ movdqa %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+- subq $48,%rdx
+ jmp .Lcbc_dec_tail_collected
+ .p2align 4
+ .Lcbc_dec_four:
++ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+- xorps %xmm9,%xmm2
+- movups 48(%rdi),%xmm9
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- xorps %xmm7,%xmm4
+- movups %xmm3,16(%rsi)
+- xorps %xmm6,%xmm5
+- movups %xmm4,32(%rsi)
+- movaps %xmm5,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm14,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ movdqa %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+- subq $64,%rdx
+- jmp .Lcbc_dec_tail_collected
+-.p2align 4
+-.Lcbc_dec_five:
+- xorps %xmm7,%xmm7
+- call _aesni_decrypt6
+- movups 16(%rdi),%xmm1
+- movups 32(%rdi),%xmm0
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- xorps %xmm1,%xmm4
+- movups 48(%rdi),%xmm1
+- xorps %xmm0,%xmm5
+- movups 64(%rdi),%xmm9
+- xorps %xmm1,%xmm6
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- leaq 64(%rsi),%rsi
+- movaps %xmm6,%xmm2
+- subq $80,%rdx
+- jmp .Lcbc_dec_tail_collected
+-.p2align 4
+-.Lcbc_dec_six:
+- call _aesni_decrypt6
+- movups 16(%rdi),%xmm1
+- movups 32(%rdi),%xmm0
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- xorps %xmm1,%xmm4
+- movups 48(%rdi),%xmm1
+- xorps %xmm0,%xmm5
+- movups 64(%rdi),%xmm0
+- xorps %xmm1,%xmm6
+- movups 80(%rdi),%xmm9
+- xorps %xmm0,%xmm7
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- movups %xmm6,64(%rsi)
+- leaq 80(%rsi),%rsi
+- movaps %xmm7,%xmm2
+- subq $96,%rdx
+ jmp .Lcbc_dec_tail_collected
++
+ .p2align 4
+ .Lcbc_dec_tail_collected:
++ movups %xmm10,(%r8)
+ andq $15,%rdx
+- movups %xmm9,(%r8)
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp .Lcbc_dec_ret
+ .p2align 4
+ .Lcbc_dec_tail_partial:
+- movaps %xmm2,64(%rsp)
++ movaps %xmm2,(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+- leaq 64(%rsp),%rsi
++ leaq (%rsp),%rsi
+ .long 0x9066A4F3
+
+ .Lcbc_dec_ret:
+- movaps (%rsp),%xmm6
+- movaps 16(%rsp),%xmm7
+- movaps 32(%rsp),%xmm8
+- movaps 48(%rsp),%xmm9
+- leaq 88(%rsp),%rsp
++ movaps 16(%rsp),%xmm6
++ movaps 32(%rsp),%xmm7
++ movaps 48(%rsp),%xmm8
++ movaps 64(%rsp),%xmm9
++ movaps 80(%rsp),%xmm10
++ movaps 96(%rsp),%xmm11
++ movaps 112(%rsp),%xmm12
++ movaps 128(%rsp),%xmm13
++ movaps 144(%rsp),%xmm14
++ movaps 160(%rsp),%xmm15
++ leaq (%rbp),%rsp
++ popq %rbp
+ .Lcbc_ret:
+ movq 8(%rsp),%rdi
+ movq 16(%rsp),%rsi
+@@ -2759,6 +3163,8 @@ __aesni_set_encrypt_key:
+ .long 1,0,0,0
+ .Lxts_magic:
+ .long 0x87,0,1,0
++.Lincrement1:
++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+
+ .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ .p2align 6
+@@ -2823,45 +3229,9 @@ ccm64_se_handler:
+ jmp .Lcommon_seh_tail
+
+
+-.def ctr32_se_handler; .scl 3; .type 32; .endef
+-.p2align 4
+-ctr32_se_handler:
+- pushq %rsi
+- pushq %rdi
+- pushq %rbx
+- pushq %rbp
+- pushq %r12
+- pushq %r13
+- pushq %r14
+- pushq %r15
+- pushfq
+- subq $64,%rsp
+-
+- movq 120(%r8),%rax
+- movq 248(%r8),%rbx
+-
+- leaq .Lctr32_body(%rip),%r10
+- cmpq %r10,%rbx
+- jb .Lcommon_seh_tail
+-
+- movq 152(%r8),%rax
+-
+- leaq .Lctr32_ret(%rip),%r10
+- cmpq %r10,%rbx
+- jae .Lcommon_seh_tail
+-
+- leaq 32(%rax),%rsi
+- leaq 512(%r8),%rdi
+- movl $20,%ecx
+-.long 0xa548f3fc
+- leaq 200(%rax),%rax
+-
+- jmp .Lcommon_seh_tail
+-
+-
+-.def xts_se_handler; .scl 3; .type 32; .endef
++.def ctr_xts_se_handler; .scl 3; .type 32; .endef
+ .p2align 4
+-xts_se_handler:
++ctr_xts_se_handler:
+ pushq %rsi
+ pushq %rdi
+ pushq %rbx
+@@ -2891,13 +3261,13 @@ xts_se_handler:
+ cmpq %r10,%rbx
+ jae .Lcommon_seh_tail
+
+- leaq 96(%rax),%rsi
++ movq 160(%r8),%rax
++ leaq -160(%rax),%rsi
+ leaq 512(%r8),%rdi
+ movl $20,%ecx
+ .long 0xa548f3fc
+- leaq 104+160(%rax),%rax
+
+- jmp .Lcommon_seh_tail
++ jmp .Lcommon_rbp_tail
+
+ .def cbc_se_handler; .scl 3; .type 32; .endef
+ .p2align 4
+@@ -2928,11 +3298,16 @@ cbc_se_handler:
+ cmpq %r10,%rbx
+ jae .Lcommon_seh_tail
+
+- leaq 0(%rax),%rsi
++ leaq 16(%rax),%rsi
+ leaq 512(%r8),%rdi
+- movl $8,%ecx
++ movl $20,%ecx
+ .long 0xa548f3fc
+- leaq 88(%rax),%rax
++
++.Lcommon_rbp_tail:
++ movq 160(%r8),%rax
++ movq (%rax),%rbp
++ leaq 8(%rax),%rax
++ movq %rbp,160(%r8)
+ jmp .Lcommon_seh_tail
+
+ .Lrestore_cbc_rax:
+@@ -3029,14 +3404,15 @@ cbc_se_handler:
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret
+ .LSEH_info_ctr32:
+ .byte 9,0,0,0
+-.rva ctr32_se_handler
++.rva ctr_xts_se_handler
++.rva .Lctr32_body,.Lctr32_epilogue
+ .LSEH_info_xts_enc:
+ .byte 9,0,0,0
+-.rva xts_se_handler
++.rva ctr_xts_se_handler
+ .rva .Lxts_enc_body,.Lxts_enc_epilogue
+ .LSEH_info_xts_dec:
+ .byte 9,0,0,0
+-.rva xts_se_handler
++.rva ctr_xts_se_handler
+ .rva .Lxts_dec_body,.Lxts_dec_epilogue
+ .LSEH_info_cbc:
+ .byte 9,0,0,0
+diff --git a/lib/accelerated/x86/coff/padlock-x86-64-coff.s b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+index 9f658ee..a3a0e30 100644
+--- a/lib/accelerated/x86/coff/padlock-x86-64-coff.s
++++ b/lib/accelerated/x86/coff/padlock-x86-64-coff.s
+@@ -686,6 +686,501 @@ padlock_cbc_encrypt:
+ movq 16(%rsp),%rsi
+ .byte 0xf3,0xc3
+ .LSEH_end_padlock_cbc_encrypt:
++.globl padlock_cfb_encrypt
++.def padlock_cfb_encrypt; .scl 2; .type 32; .endef
++.p2align 4
++padlock_cfb_encrypt:
++ movq %rdi,8(%rsp)
++ movq %rsi,16(%rsp)
++ movq %rsp,%rax
++.LSEH_begin_padlock_cfb_encrypt:
++ movq %rcx,%rdi
++ movq %rdx,%rsi
++ movq %r8,%rdx
++ movq %r9,%rcx
++
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz .Lcfb_abort
++ testq $15,%rcx
++ jnz .Lcfb_abort
++ leaq .Lpadlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz .Lcfb_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz .Lcfb_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++ jmp .Lcfb_loop
++.p2align 4
++.Lcfb_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz .Lcfb_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++.Lcfb_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,224
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz .Lcfb_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++.Lcfb_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jnz .Lcfb_loop
++ cmpq %rbp,%rsp
++ je .Lcfb_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++.Lcfb_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja .Lcfb_bzero
++
++.Lcfb_done:
++ leaq (%rbp),%rsp
++ jmp .Lcfb_exit
++
++.p2align 4
++.Lcfb_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,224
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++.Lcfb_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++.Lcfb_abort:
++ popq %rbx
++ popq %rbp
++ movq 8(%rsp),%rdi
++ movq 16(%rsp),%rsi
++ .byte 0xf3,0xc3
++.LSEH_end_padlock_cfb_encrypt:
++.globl padlock_ofb_encrypt
++.def padlock_ofb_encrypt; .scl 2; .type 32; .endef
++.p2align 4
++padlock_ofb_encrypt:
++ movq %rdi,8(%rsp)
++ movq %rsi,16(%rsp)
++ movq %rsp,%rax
++.LSEH_begin_padlock_ofb_encrypt:
++ movq %rcx,%rdi
++ movq %rdx,%rsi
++ movq %r8,%rdx
++ movq %r9,%rcx
++
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz .Lofb_abort
++ testq $15,%rcx
++ jnz .Lofb_abort
++ leaq .Lpadlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz .Lofb_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz .Lofb_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++ jmp .Lofb_loop
++.p2align 4
++.Lofb_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz .Lofb_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++.Lofb_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,232
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz .Lofb_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++.Lofb_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jnz .Lofb_loop
++ cmpq %rbp,%rsp
++ je .Lofb_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++.Lofb_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja .Lofb_bzero
++
++.Lofb_done:
++ leaq (%rbp),%rsp
++ jmp .Lofb_exit
++
++.p2align 4
++.Lofb_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,232
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++.Lofb_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++.Lofb_abort:
++ popq %rbx
++ popq %rbp
++ movq 8(%rsp),%rdi
++ movq 16(%rsp),%rsi
++ .byte 0xf3,0xc3
++.LSEH_end_padlock_ofb_encrypt:
++.globl padlock_ctr32_encrypt
++.def padlock_ctr32_encrypt; .scl 2; .type 32; .endef
++.p2align 4
++padlock_ctr32_encrypt:
++ movq %rdi,8(%rsp)
++ movq %rsi,16(%rsp)
++ movq %rsp,%rax
++.LSEH_begin_padlock_ctr32_encrypt:
++ movq %rcx,%rdi
++ movq %rdx,%rsi
++ movq %r8,%rdx
++ movq %r9,%rcx
++
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz .Lctr32_abort
++ testq $15,%rcx
++ jnz .Lctr32_abort
++ leaq .Lpadlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz .Lctr32_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz .Lctr32_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++.Lctr32_reenter:
++ movl -4(%rdx),%eax
++ bswapl %eax
++ negl %eax
++ andl $31,%eax
++ movq $512,%rbx
++ shll $4,%eax
++ cmovzq %rbx,%rax
++ cmpq %rax,%rcx
++ cmovaq %rax,%rbx
++ cmovbeq %rcx,%rbx
++ cmpq %rbx,%rcx
++ ja .Lctr32_loop
++ movq %rsi,%rax
++ cmpq %rsp,%rbp
++ cmoveq %rdi,%rax
++ addq %rcx,%rax
++ negq %rax
++ andq $4095,%rax
++ cmpq $32,%rax
++ movq $-32,%rax
++ cmovaeq %rbx,%rax
++ andq %rax,%rbx
++ jz .Lctr32_unaligned_tail
++ jmp .Lctr32_loop
++.p2align 4
++.Lctr32_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz .Lctr32_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++.Lctr32_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++ movl -4(%rdx),%eax
++ testl $4294901760,%eax
++ jnz .Lctr32_no_carry
++ bswapl %eax
++ addl $65536,%eax
++ bswapl %eax
++ movl %eax,-4(%rdx)
++.Lctr32_no_carry:
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz .Lctr32_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++.Lctr32_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jz .Lctr32_break
++ cmpq %rbx,%rcx
++ jae .Lctr32_loop
++ movq %rcx,%rbx
++ movq %rsi,%rax
++ cmpq %rsp,%rbp
++ cmoveq %rdi,%rax
++ addq %rcx,%rax
++ negq %rax
++ andq $4095,%rax
++ cmpq $32,%rax
++ movq $-32,%rax
++ cmovaeq %rbx,%rax
++ andq %rax,%rbx
++ jnz .Lctr32_loop
++.Lctr32_unaligned_tail:
++ xorl %eax,%eax
++ cmpq %rsp,%rbp
++ cmoveq %rcx,%rax
++ movq %rdi,%r8
++ movq %rcx,%rbx
++ subq %rax,%rsp
++ shrq $3,%rcx
++ leaq (%rsp),%rdi
++.byte 0xf3,0x48,0xa5
++ movq %rsp,%rsi
++ movq %r8,%rdi
++ movq %rbx,%rcx
++ jmp .Lctr32_loop
++.p2align 4
++.Lctr32_break:
++ cmpq %rbp,%rsp
++ je .Lctr32_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++.Lctr32_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja .Lctr32_bzero
++
++.Lctr32_done:
++ leaq (%rbp),%rsp
++ jmp .Lctr32_exit
++
++.p2align 4
++.Lctr32_aligned:
++ movl -4(%rdx),%eax
++ bswapl %eax
++ negl %eax
++ andl $65535,%eax
++ movq $1048576,%rbx
++ shll $4,%eax
++ cmovzq %rbx,%rax
++ cmpq %rax,%rcx
++ cmovaq %rax,%rbx
++ cmovbeq %rcx,%rbx
++ jbe .Lctr32_aligned_skip
++
++.Lctr32_aligned_loop:
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++
++ movl -4(%rdx),%eax
++ bswapl %eax
++ addl $65536,%eax
++ bswapl %eax
++ movl %eax,-4(%rdx)
++
++ movq %r10,%rcx
++ subq %r11,%rcx
++ movq $1048576,%rbx
++ jz .Lctr32_exit
++ cmpq %rbx,%rcx
++ jae .Lctr32_aligned_loop
++
++.Lctr32_aligned_skip:
++ leaq (%rsi,%rcx,1),%rbp
++ negq %rbp
++ andq $4095,%rbp
++ xorl %eax,%eax
++ cmpq $32,%rbp
++ movq $32-1,%rbp
++ cmovaeq %rax,%rbp
++ andq %rcx,%rbp
++ subq %rbp,%rcx
++ jz .Lctr32_aligned_tail
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++ testq %rbp,%rbp
++ jz .Lctr32_exit
++
++.Lctr32_aligned_tail:
++ movq %rdi,%r8
++ movq %rbp,%rbx
++ movq %rbp,%rcx
++ leaq (%rsp),%rbp
++ subq %rcx,%rsp
++ shrq $3,%rcx
++ leaq (%rsp),%rdi
++.byte 0xf3,0x48,0xa5
++ leaq (%r8),%rdi
++ leaq (%rsp),%rsi
++ movq %rbx,%rcx
++ jmp .Lctr32_loop
++.Lctr32_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++.Lctr32_abort:
++ popq %rbx
++ popq %rbp
++ movq 8(%rsp),%rdi
++ movq 16(%rsp),%rsi
++ .byte 0xf3,0xc3
++.LSEH_end_padlock_ctr32_encrypt:
+ .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ .p2align 4
+ .data
+diff --git a/lib/accelerated/x86/coff/padlock-x86-coff.s b/lib/accelerated/x86/coff/padlock-x86-coff.s
+index 69eb468..d969f30 100644
+--- a/lib/accelerated/x86/coff/padlock-x86-coff.s
++++ b/lib/accelerated/x86/coff/padlock-x86-coff.s
+@@ -515,6 +515,354 @@ _padlock_cbc_encrypt:
+ popl %ebx
+ popl %ebp
+ ret
++.globl _padlock_cfb_encrypt
++.def _padlock_cfb_encrypt; .scl 2; .type 32; .endef
++.align 16
++_padlock_cfb_encrypt:
++.L_padlock_cfb_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz .L028cfb_abort
++ testl $15,%ecx
++ jnz .L028cfb_abort
++ leal .Lpadlock_saved_context,%eax
++ pushfl
++ cld
++ call __padlock_verify_ctx
++.L029cfb_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%edx)
++ jnz .L030cfb_aligned
++ testl $15,%edi
++ setz %al
++ testl $15,%esi
++ setz %bl
++ testl %ebx,%eax
++ jnz .L030cfb_aligned
++ negl %eax
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp .L031cfb_loop
++.align 16
++.L031cfb_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ testl $15,%edi
++ cmovnzl %esp,%edi
++ testl $15,%esi
++ jz .L032cfb_inp_aligned
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++ movl %ebx,%ecx
++ movl %edi,%esi
++.L032cfb_inp_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,224
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ testl $15,%edi
++ jz .L033cfb_out_aligned
++ movl %ebx,%ecx
++ leal (%esp),%esi
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++.L033cfb_out_aligned:
++ movl 4(%ebp),%esi
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz .L031cfb_loop
++ cmpl %ebp,%esp
++ je .L034cfb_done
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++.L035cfb_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja .L035cfb_bzero
++.L034cfb_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ jmp .L036cfb_exit
++.align 16
++.L030cfb_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,224
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++.L036cfb_exit:
++ movl $1,%eax
++ leal 4(%esp),%esp
++.L028cfb_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.globl _padlock_ofb_encrypt
++.def _padlock_ofb_encrypt; .scl 2; .type 32; .endef
++.align 16
++_padlock_ofb_encrypt:
++.L_padlock_ofb_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz .L037ofb_abort
++ testl $15,%ecx
++ jnz .L037ofb_abort
++ leal .Lpadlock_saved_context,%eax
++ pushfl
++ cld
++ call __padlock_verify_ctx
++.L038ofb_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%edx)
++ jnz .L039ofb_aligned
++ testl $15,%edi
++ setz %al
++ testl $15,%esi
++ setz %bl
++ testl %ebx,%eax
++ jnz .L039ofb_aligned
++ negl %eax
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp .L040ofb_loop
++.align 16
++.L040ofb_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ testl $15,%edi
++ cmovnzl %esp,%edi
++ testl $15,%esi
++ jz .L041ofb_inp_aligned
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++ movl %ebx,%ecx
++ movl %edi,%esi
++.L041ofb_inp_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,232
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ testl $15,%edi
++ jz .L042ofb_out_aligned
++ movl %ebx,%ecx
++ leal (%esp),%esi
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++.L042ofb_out_aligned:
++ movl 4(%ebp),%esi
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz .L040ofb_loop
++ cmpl %ebp,%esp
++ je .L043ofb_done
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++.L044ofb_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja .L044ofb_bzero
++.L043ofb_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ jmp .L045ofb_exit
++.align 16
++.L039ofb_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,232
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++.L045ofb_exit:
++ movl $1,%eax
++ leal 4(%esp),%esp
++.L037ofb_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.globl _padlock_ctr32_encrypt
++.def _padlock_ctr32_encrypt; .scl 2; .type 32; .endef
++.align 16
++_padlock_ctr32_encrypt:
++.L_padlock_ctr32_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz .L046ctr32_abort
++ testl $15,%ecx
++ jnz .L046ctr32_abort
++ leal .Lpadlock_saved_context,%eax
++ pushfl
++ cld
++ call __padlock_verify_ctx
++.L047ctr32_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ movq -16(%edx),%mm0
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp .L048ctr32_loop
++.align 16
++.L048ctr32_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ movl -4(%edx),%ecx
++ xorl %edi,%edi
++ movl -8(%edx),%eax
++.L049ctr32_prepare:
++ movl %ecx,12(%esp,%edi,1)
++ bswap %ecx
++ movq %mm0,(%esp,%edi,1)
++ incl %ecx
++ movl %eax,8(%esp,%edi,1)
++ bswap %ecx
++ leal 16(%edi),%edi
++ cmpl %ebx,%edi
++ jb .L049ctr32_prepare
++ movl %ecx,-4(%edx)
++ leal (%esp),%esi
++ leal (%esp),%edi
++ movl %ebx,%ecx
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,200
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ movl 4(%ebp),%esi
++ xorl %ecx,%ecx
++.L050ctr32_xor:
++ movups (%esi,%ecx,1),%xmm1
++ leal 16(%ecx),%ecx
++ pxor -16(%esp,%ecx,1),%xmm1
++ movups %xmm1,-16(%edi,%ecx,1)
++ cmpl %ebx,%ecx
++ jb .L050ctr32_xor
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz .L048ctr32_loop
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++.L051ctr32_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja .L051ctr32_bzero
++.L052ctr32_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ movl $1,%eax
++ leal 4(%esp),%esp
++ emms
++.L046ctr32_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
+ .globl _padlock_xstore
+ .def _padlock_xstore; .scl 2; .type 32; .endef
+ .align 16
+@@ -533,10 +881,10 @@ __win32_segv_handler:
+ movl 4(%esp),%edx
+ movl 12(%esp),%ecx
+ cmpl $3221225477,(%edx)
+- jne .L028ret
++ jne .L053ret
+ addl $4,184(%ecx)
+ movl $0,%eax
+-.L028ret:
++.L053ret:
+ ret
+ .globl _padlock_sha1_oneshot
+ .def _padlock_sha1_oneshot; .scl 2; .type 32; .endef
+diff --git a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
+index 8f2b96f..9755951 100644
+--- a/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
++++ b/lib/accelerated/x86/elf/appro-aes-gcm-x86-64.s
+@@ -697,6 +697,7 @@ gcm_ghash_4bit:
+ .type gcm_init_clmul, at function
+ .align 16
+ gcm_init_clmul:
++.L_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+@@ -715,15 +716,15 @@ gcm_init_clmul:
+ pxor %xmm5,%xmm2
+
+
++ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
++ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+-.byte 102,15,58,68,220,0
++.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+@@ -733,44 +734,134 @@ gcm_init_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++ pxor %xmm4,%xmm0
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ pshufd $78,%xmm2,%xmm3
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm2,%xmm3
++ movdqu %xmm2,0(%rdi)
++ pxor %xmm0,%xmm4
++ movdqu %xmm0,16(%rdi)
++.byte 102,15,58,15,227,8
++ movdqu %xmm4,32(%rdi)
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,15,58,68,194,0
++.byte 102,15,58,68,202,17
++.byte 102,15,58,68,222,0
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++
++ movdqa %xmm3,%xmm4
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
++ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
++
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ movdqa %xmm0,%xmm5
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,15,58,68,194,0
++.byte 102,15,58,68,202,17
++.byte 102,15,58,68,222,0
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++
++ movdqa %xmm3,%xmm4
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
++ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
++
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- movdqu %xmm2,(%rdi)
+- movdqu %xmm0,16(%rdi)
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ pshufd $78,%xmm5,%xmm3
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm5,%xmm3
++ movdqu %xmm5,48(%rdi)
++ pxor %xmm0,%xmm4
++ movdqu %xmm0,64(%rdi)
++.byte 102,15,58,15,227,8
++ movdqu %xmm4,80(%rdi)
+ .byte 0xf3,0xc3
+ .size gcm_init_clmul,.-gcm_init_clmul
+ .globl gcm_gmult_clmul
+ .type gcm_gmult_clmul, at function
+ .align 16
+ gcm_gmult_clmul:
++.L_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
++ movdqu 32(%rsi),%xmm4
+ .byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+ .byte 102,15,58,68,220,0
+@@ -783,186 +874,358 @@ gcm_gmult_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ .byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+ .size gcm_gmult_clmul,.-gcm_gmult_clmul
+ .globl gcm_ghash_clmul
+ .type gcm_ghash_clmul, at function
+-.align 16
++.align 32
+ gcm_ghash_clmul:
++.L_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm5
++ movq $11547335547999543296,%rax
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
++ movdqu 32(%rsi),%xmm10
+ .byte 102,15,56,0,197
+
+ subq $16,%rcx
+ jz .Lodd_tail
+
+- movdqu 16(%rsi),%xmm8
++ movdqu 16(%rsi),%xmm9
++ cmpq $48,%rcx
++ jb .Lskip4x
+
++ subq $48,%rcx
++ movdqu 48(%rsi),%xmm14
++ movdqu 64(%rsi),%xmm15
+
+
+
+
+- movdqu (%rdx),%xmm3
+- movdqu 16(%rdx),%xmm6
+-.byte 102,15,56,0,221
++ movdqu 48(%rdx),%xmm6
++ movdqu 32(%rdx),%xmm11
+ .byte 102,15,56,0,245
+- pxor %xmm3,%xmm0
+- movdqa %xmm6,%xmm7
+- pshufd $78,%xmm6,%xmm3
+- pshufd $78,%xmm2,%xmm4
+- pxor %xmm6,%xmm3
+- pxor %xmm2,%xmm4
++.byte 102,68,15,56,0,221
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm7
++ pxor %xmm6,%xmm7
+ .byte 102,15,58,68,242,0
+-.byte 102,15,58,68,250,17
+-.byte 102,15,58,68,220,0
+- pxor %xmm6,%xmm3
+- pxor %xmm7,%xmm3
++.byte 102,68,15,58,68,194,17
++.byte 102,65,15,58,68,250,0
+
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,217,0
++.byte 102,69,15,58,68,233,17
++ xorps %xmm11,%xmm6
++.byte 102,69,15,58,68,226,16
++ xorps %xmm13,%xmm8
++ movups 80(%rsi),%xmm10
++ xorps %xmm12,%xmm7
++
++ movdqu 16(%rdx),%xmm11
++ movdqu 0(%rdx),%xmm3
++.byte 102,68,15,56,0,221
++.byte 102,15,56,0,221
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm3,%xmm0
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,222,0
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,69,15,58,68,238,17
++ xorps %xmm11,%xmm6
++.byte 102,69,15,58,68,226,0
++ xorps %xmm13,%xmm8
++
++ leaq 64(%rdx),%rdx
++ subq $64,%rcx
++ jc .Ltail4x
++
++ jmp .Lmod4_loop
++.align 32
++.Lmod4_loop:
++.byte 102,65,15,58,68,199,0
++ xorps %xmm12,%xmm7
++ movdqu 48(%rdx),%xmm11
++.byte 102,68,15,56,0,221
++.byte 102,65,15,58,68,207,17
++ xorps %xmm6,%xmm0
++ movdqu 32(%rdx),%xmm6
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++.byte 102,65,15,58,68,218,16
++ xorps %xmm8,%xmm1
++ pxor %xmm11,%xmm12
++.byte 102,15,56,0,245
++ movups 32(%rsi),%xmm10
++.byte 102,68,15,58,68,218,0
++ xorps %xmm7,%xmm3
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm7
++
++ pxor %xmm0,%xmm3
++ pxor %xmm6,%xmm7
++ pxor %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+- psrldq $8,%xmm3
++ pslldq $8,%xmm3
++.byte 102,68,15,58,68,234,17
++ psrldq $8,%xmm4
++ pxor %xmm3,%xmm0
++ movdqa .L7_mask(%rip),%xmm3
++ pxor %xmm4,%xmm1
++.byte 102,72,15,110,224
++
++ pand %xmm0,%xmm3
++.byte 102,15,56,0,227
++.byte 102,69,15,58,68,226,0
++ pxor %xmm0,%xmm4
++ psllq $57,%xmm4
++ movdqa %xmm4,%xmm3
+ pslldq $8,%xmm4
+- pxor %xmm3,%xmm7
+- pxor %xmm4,%xmm6
++.byte 102,65,15,58,68,241,0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ movdqu 0(%rdx),%xmm3
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++.byte 102,69,15,58,68,193,17
++ xorps %xmm11,%xmm6
++ movdqu 16(%rdx),%xmm11
++.byte 102,68,15,56,0,221
++.byte 102,65,15,58,68,250,16
++ xorps %xmm13,%xmm8
++ movups 80(%rsi),%xmm10
++.byte 102,15,56,0,221
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++
++ movdqa %xmm11,%xmm13
++ pxor %xmm12,%xmm7
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,222,0
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ psrlq $1,%xmm0
++.byte 102,69,15,58,68,238,17
++ xorps %xmm11,%xmm6
++ pxor %xmm1,%xmm0
++
++.byte 102,69,15,58,68,226,0
++ xorps %xmm13,%xmm8
++
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm8,%xmm4
+
+- leaq 32(%rdx),%rdx
+- subq $32,%rcx
+- jbe .Leven_tail
++ leaq 64(%rdx),%rdx
++ subq $64,%rcx
++ jnc .Lmod4_loop
++
++.Ltail4x:
++.byte 102,65,15,58,68,199,0
++ xorps %xmm12,%xmm7
++.byte 102,65,15,58,68,207,17
++ xorps %xmm6,%xmm0
++.byte 102,65,15,58,68,218,16
++ xorps %xmm8,%xmm1
++ pxor %xmm0,%xmm1
++ pxor %xmm7,%xmm3
+
+-.Lmod_loop:
+-.byte 102,65,15,58,68,192,0
+-.byte 102,65,15,58,68,200,17
+-.byte 102,15,58,68,220,0
+- pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
++ pxor %xmm0,%xmm1
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- movdqu (%rdx),%xmm3
+- pxor %xmm6,%xmm0
+- pxor %xmm7,%xmm1
+
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++ pxor %xmm4,%xmm0
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ addq $64,%rcx
++ jz .Ldone
++ movdqu 32(%rsi),%xmm10
++ subq $16,%rcx
++ jz .Lodd_tail
++.Lskip4x:
++
++
++
++
++
++ movdqu (%rdx),%xmm3
+ movdqu 16(%rdx),%xmm6
+ .byte 102,15,56,0,221
+ .byte 102,15,56,0,245
++ pxor %xmm3,%xmm0
++
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm3
++ pxor %xmm6,%xmm3
++.byte 102,15,58,68,242,0
++.byte 102,68,15,58,68,194,17
++.byte 102,65,15,58,68,218,0
++
++ leaq 32(%rdx),%rdx
++ subq $32,%rcx
++ jbe .Leven_tail
++ jmp .Lmod_loop
+
+- movdqa %xmm6,%xmm7
+- pshufd $78,%xmm6,%xmm9
+- pshufd $78,%xmm2,%xmm10
+- pxor %xmm6,%xmm9
+- pxor %xmm2,%xmm10
++.align 32
++.Lmod_loop:
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm0,%xmm4
++
++.byte 102,65,15,58,68,193,0
++.byte 102,65,15,58,68,201,17
++.byte 102,65,15,58,68,226,16
++
++ pxor %xmm6,%xmm0
++ pxor %xmm8,%xmm1
++ movdqu (%rdx),%xmm8
++.byte 102,68,15,56,0,197
++ movdqu 16(%rdx),%xmm6
++
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++ pxor %xmm8,%xmm1
++ pxor %xmm3,%xmm4
++.byte 102,15,56,0,245
++ movdqa %xmm4,%xmm3
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
+
++ movdqa %xmm6,%xmm8
++
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+- pxor %xmm3,%xmm0
+ .byte 102,15,58,68,242,0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ pshufd $78,%xmm8,%xmm3
++ pxor %xmm8,%xmm3
+
+-.byte 102,15,58,68,250,17
++.byte 102,68,15,58,68,194,17
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
+-
+-.byte 102,69,15,58,68,202,0
+- movdqa %xmm0,%xmm1
+- pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm8,%xmm4
+- pxor %xmm0,%xmm3
+- pxor %xmm8,%xmm4
+-
+- pxor %xmm6,%xmm9
+- pxor %xmm7,%xmm9
+- movdqa %xmm9,%xmm10
+- psrldq $8,%xmm9
+- pslldq $8,%xmm10
+- pxor %xmm9,%xmm7
+- pxor %xmm10,%xmm6
++.byte 102,65,15,58,68,218,0
++ pxor %xmm1,%xmm0
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ ja .Lmod_loop
+
+ .Leven_tail:
+-.byte 102,65,15,58,68,192,0
+-.byte 102,65,15,58,68,200,17
+-.byte 102,15,58,68,220,0
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm0,%xmm4
++
++.byte 102,65,15,58,68,193,0
++.byte 102,65,15,58,68,201,17
++.byte 102,65,15,58,68,226,16
++
++ pxor %xmm6,%xmm0
++ pxor %xmm8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+-
+- movdqa %xmm3,%xmm4
++ pxor %xmm3,%xmm4
++ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- pxor %xmm6,%xmm0
+- pxor %xmm7,%xmm1
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ testq %rcx,%rcx
+ jnz .Ldone
+
+@@ -972,12 +1235,10 @@ gcm_ghash_clmul:
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+-.byte 102,15,58,68,220,0
++.byte 102,65,15,58,68,218,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+@@ -987,38 +1248,60 @@ gcm_ghash_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ .Ldone:
+ .byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+-.LSEH_end_gcm_ghash_clmul:
+ .size gcm_ghash_clmul,.-gcm_ghash_clmul
++.globl gcm_init_avx
++.type gcm_init_avx, at function
++.align 32
++gcm_init_avx:
++ jmp .L_init_clmul
++.size gcm_init_avx,.-gcm_init_avx
++.globl gcm_gmult_avx
++.type gcm_gmult_avx, at function
++.align 32
++gcm_gmult_avx:
++ jmp .L_gmult_clmul
++.size gcm_gmult_avx,.-gcm_gmult_avx
++.globl gcm_ghash_avx
++.type gcm_ghash_avx, at function
++.align 32
++gcm_ghash_avx:
++ jmp .L_ghash_clmul
++.size gcm_ghash_avx,.-gcm_ghash_avx
+ .align 64
+ .Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ .L0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
++.L7_mask:
++.long 7,0,7,0
++.L7_mask_poly:
++.long 7,0,450,0
+ .align 64
+ .type .Lrem_4bit, at object
+ .Lrem_4bit:
+diff --git a/lib/accelerated/x86/elf/appro-aes-x86-64.s b/lib/accelerated/x86/elf/appro-aes-x86-64.s
+index f48666f..d3734a6 100644
+--- a/lib/accelerated/x86/elf/appro-aes-x86-64.s
++++ b/lib/accelerated/x86/elf/appro-aes-x86-64.s
+@@ -925,199 +925,412 @@ aesni_ccm64_decrypt_blocks:
+ .type aesni_ctr32_encrypt_blocks, at function
+ .align 16
+ aesni_ctr32_encrypt_blocks:
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $128,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
++
+ cmpq $1,%rdx
+ je .Lctr32_one_shortcut
+
+- movdqu (%r8),%xmm14
+- movdqa .Lbswap_mask(%rip),%xmm15
+- xorl %eax,%eax
+-.byte 102,69,15,58,22,242,3
+-.byte 102,68,15,58,34,240,3
++ movdqu (%r8),%xmm2
++ movdqu (%rcx),%xmm0
++ movl 12(%r8),%r8d
++ pxor %xmm0,%xmm2
++ movl 12(%rcx),%r11d
++ movdqa %xmm2,0(%rsp)
++ bswapl %r8d
++ movdqa %xmm2,%xmm3
++ movdqa %xmm2,%xmm4
++ movdqa %xmm2,%xmm5
++ movdqa %xmm2,64(%rsp)
++ movdqa %xmm2,80(%rsp)
++ movdqa %xmm2,96(%rsp)
++ movdqa %xmm2,112(%rsp)
+
+ movl 240(%rcx),%eax
++
++ leaq 1(%r8),%r9
++ leaq 2(%r8),%r10
++ bswapl %r9d
+ bswapl %r10d
+- pxor %xmm12,%xmm12
+- pxor %xmm13,%xmm13
+-.byte 102,69,15,58,34,226,0
+- leaq 3(%r10),%r11
+-.byte 102,69,15,58,34,235,0
+- incl %r10d
+-.byte 102,69,15,58,34,226,1
+- incq %r11
+-.byte 102,69,15,58,34,235,1
+- incl %r10d
+-.byte 102,69,15,58,34,226,2
+- incq %r11
+-.byte 102,69,15,58,34,235,2
+- movdqa %xmm12,-40(%rsp)
+-.byte 102,69,15,56,0,231
+- movdqa %xmm13,-24(%rsp)
+-.byte 102,69,15,56,0,239
+-
+- pshufd $192,%xmm12,%xmm2
+- pshufd $128,%xmm12,%xmm3
+- pshufd $64,%xmm12,%xmm4
+- cmpq $6,%rdx
+- jb .Lctr32_tail
+- shrl $1,%eax
+- movq %rcx,%r11
+- movl %eax,%r10d
+- subq $6,%rdx
+- jmp .Lctr32_loop6
++ xorl %r11d,%r9d
++ xorl %r11d,%r10d
++.byte 102,65,15,58,34,217,3
++ leaq 3(%r8),%r9
++ movdqa %xmm3,16(%rsp)
++.byte 102,65,15,58,34,226,3
++ bswapl %r9d
++ leaq 4(%r8),%r10
++ movdqa %xmm4,32(%rsp)
++ xorl %r11d,%r9d
++ bswapl %r10d
++.byte 102,65,15,58,34,233,3
++ xorl %r11d,%r10d
++ movdqa %xmm5,48(%rsp)
++ leaq 5(%r8),%r9
++ movl %r10d,64+12(%rsp)
++ bswapl %r9d
++ leaq 6(%r8),%r10
++ xorl %r11d,%r9d
++ bswapl %r10d
++ movl %r9d,80+12(%rsp)
++ xorl %r11d,%r10d
++ leaq 7(%r8),%r9
++ movl %r10d,96+12(%rsp)
++ bswapl %r9d
++ xorl %r11d,%r9d
++ movl %r9d,112+12(%rsp)
+
+-.align 16
+-.Lctr32_loop6:
+- pshufd $192,%xmm13,%xmm5
+- por %xmm14,%xmm2
+- movups (%r11),%xmm0
+- pshufd $128,%xmm13,%xmm6
+- por %xmm14,%xmm3
+- movups 16(%r11),%xmm1
+- pshufd $64,%xmm13,%xmm7
+- por %xmm14,%xmm4
+- por %xmm14,%xmm5
+- xorps %xmm0,%xmm2
+- por %xmm14,%xmm6
+- por %xmm14,%xmm7
++ movups 16(%rcx),%xmm1
+
++ movdqa 64(%rsp),%xmm6
++ movdqa 80(%rsp),%xmm7
+
++ cmpq $8,%rdx
++ jb .Lctr32_tail
+
++ leaq 128(%rcx),%rcx
++ subq $8,%rdx
++ jmp .Lctr32_loop8
+
+- pxor %xmm0,%xmm3
++.align 32
++.Lctr32_loop8:
++ addl $8,%r8d
++ movdqa 96(%rsp),%xmm8
+ .byte 102,15,56,220,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++ movl %r8d,%r9d
++ movdqa 112(%rsp),%xmm9
+ .byte 102,15,56,220,217
+- movdqa .Lincrement32(%rip),%xmm13
+- pxor %xmm0,%xmm5
++ bswapl %r9d
++ movups 32-128(%rcx),%xmm0
+ .byte 102,15,56,220,225
+- movdqa -40(%rsp),%xmm12
+- pxor %xmm0,%xmm6
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++ movl %r9d,0+12(%rsp)
++ leaq 1(%r8),%r9
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+- jmp .Lctr32_enc_loop6_enter
+-.align 16
+-.Lctr32_enc_loop6:
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 48-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,16+12(%rsp)
++ leaq 2(%r8),%r9
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 64-128(%rcx),%xmm0
+ .byte 102,15,56,220,209
+ .byte 102,15,56,220,217
+- decl %eax
++ bswapl %r9d
+ .byte 102,15,56,220,225
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,233
++ movl %r9d,32+12(%rsp)
++ leaq 3(%r8),%r9
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+-.Lctr32_enc_loop6_enter:
+- movups 16(%rcx),%xmm1
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 80-128(%rcx),%xmm1
+ .byte 102,15,56,220,208
+ .byte 102,15,56,220,216
+- leaq 32(%rcx),%rcx
++ bswapl %r9d
+ .byte 102,15,56,220,224
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,232
++ movl %r9d,48+12(%rsp)
++ leaq 4(%r8),%r9
+ .byte 102,15,56,220,240
+ .byte 102,15,56,220,248
+- movups (%rcx),%xmm0
+- jnz .Lctr32_enc_loop6
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 96-128(%rcx),%xmm0
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++ bswapl %r9d
++.byte 102,15,56,220,225
++ xorl %r11d,%r9d
++.byte 102,15,56,220,233
++ movl %r9d,64+12(%rsp)
++ leaq 5(%r8),%r9
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 112-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,80+12(%rsp)
++ leaq 6(%r8),%r9
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 128-128(%rcx),%xmm0
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++ bswapl %r9d
++.byte 102,15,56,220,225
++ xorl %r11d,%r9d
++.byte 102,15,56,220,233
++ movl %r9d,96+12(%rsp)
++ leaq 7(%r8),%r9
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 144-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,112+12(%rsp)
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++ movdqu 0(%rdi),%xmm10
++.byte 102,68,15,56,220,200
++ movups 160-128(%rcx),%xmm0
++
++ cmpl $11,%eax
++ jb .Lctr32_enc_done
+
+ .byte 102,15,56,220,209
+- paddd %xmm13,%xmm12
+ .byte 102,15,56,220,217
+- paddd -24(%rsp),%xmm13
+ .byte 102,15,56,220,225
+- movdqa %xmm12,-40(%rsp)
+ .byte 102,15,56,220,233
+- movdqa %xmm13,-24(%rsp)
+ .byte 102,15,56,220,241
+-.byte 102,69,15,56,0,231
+ .byte 102,15,56,220,249
+-.byte 102,69,15,56,0,239
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 176-128(%rcx),%xmm1
+
+-.byte 102,15,56,221,208
+- movups (%rdi),%xmm8
+-.byte 102,15,56,221,216
+- movups 16(%rdi),%xmm9
+-.byte 102,15,56,221,224
+- movups 32(%rdi),%xmm10
+-.byte 102,15,56,221,232
+- movups 48(%rdi),%xmm11
+-.byte 102,15,56,221,240
+- movups 64(%rdi),%xmm1
+-.byte 102,15,56,221,248
+- movups 80(%rdi),%xmm0
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 192-128(%rcx),%xmm0
++ je .Lctr32_enc_done
+
+- xorps %xmm2,%xmm8
+- pshufd $192,%xmm12,%xmm2
+- xorps %xmm3,%xmm9
+- pshufd $128,%xmm12,%xmm3
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- pshufd $64,%xmm12,%xmm4
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- xorps %xmm6,%xmm1
+- movups %xmm11,48(%rsi)
+- xorps %xmm7,%xmm0
+- movups %xmm1,64(%rsi)
+- movups %xmm0,80(%rsi)
+- leaq 96(%rsi),%rsi
+- movl %r10d,%eax
+- subq $6,%rdx
+- jnc .Lctr32_loop6
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 208-128(%rcx),%xmm1
++
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 224-128(%rcx),%xmm0
++
++.Lctr32_enc_done:
++ movdqu 16(%rdi),%xmm11
++ pxor %xmm0,%xmm10
++ movdqu 32(%rdi),%xmm12
++ pxor %xmm0,%xmm11
++ movdqu 48(%rdi),%xmm13
++ pxor %xmm0,%xmm12
++ movdqu 64(%rdi),%xmm14
++ pxor %xmm0,%xmm13
++ movdqu 80(%rdi),%xmm15
++ pxor %xmm0,%xmm14
++.byte 102,15,56,220,209
++ pxor %xmm0,%xmm15
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movdqu 96(%rdi),%xmm1
++
++.byte 102,65,15,56,221,210
++ pxor %xmm0,%xmm1
++ movdqu 112(%rdi),%xmm10
++ leaq 128(%rdi),%rdi
++.byte 102,65,15,56,221,219
++ pxor %xmm0,%xmm10
++ movdqa 0(%rsp),%xmm11
++.byte 102,65,15,56,221,228
++ movdqa 16(%rsp),%xmm12
++.byte 102,65,15,56,221,237
++ movdqa 32(%rsp),%xmm13
++.byte 102,65,15,56,221,246
++ movdqa 48(%rsp),%xmm14
++.byte 102,65,15,56,221,255
++ movdqa 64(%rsp),%xmm15
++.byte 102,68,15,56,221,193
++ movdqa 80(%rsp),%xmm0
++.byte 102,69,15,56,221,202
++ movups 16-128(%rcx),%xmm1
++
++ movups %xmm2,(%rsi)
++ movdqa %xmm11,%xmm2
++ movups %xmm3,16(%rsi)
++ movdqa %xmm12,%xmm3
++ movups %xmm4,32(%rsi)
++ movdqa %xmm13,%xmm4
++ movups %xmm5,48(%rsi)
++ movdqa %xmm14,%xmm5
++ movups %xmm6,64(%rsi)
++ movdqa %xmm15,%xmm6
++ movups %xmm7,80(%rsi)
++ movdqa %xmm0,%xmm7
++ movups %xmm8,96(%rsi)
++ movups %xmm9,112(%rsi)
++ leaq 128(%rsi),%rsi
++
++ subq $8,%rdx
++ jnc .Lctr32_loop8
+
+- addq $6,%rdx
++ addq $8,%rdx
+ jz .Lctr32_done
+- movq %r11,%rcx
+- leal 1(%rax,%rax,1),%eax
++ leaq -128(%rcx),%rcx
+
+ .Lctr32_tail:
+- por %xmm14,%xmm2
+- movups (%rdi),%xmm8
+- cmpq $2,%rdx
+- jb .Lctr32_one
++ leaq 16(%rcx),%rcx
++ cmpq $4,%rdx
++ jb .Lctr32_loop3
++ je .Lctr32_loop4
+
+- por %xmm14,%xmm3
+- movups 16(%rdi),%xmm9
+- je .Lctr32_two
++ movdqa 96(%rsp),%xmm8
++ pxor %xmm9,%xmm9
+
+- pshufd $192,%xmm13,%xmm5
+- por %xmm14,%xmm4
+- movups 32(%rdi),%xmm10
+- cmpq $4,%rdx
+- jb .Lctr32_three
++ movups 16(%rcx),%xmm0
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++ shrl $1,%eax
++.byte 102,15,56,220,225
++ decl %eax
++.byte 102,15,56,220,233
++ movups (%rdi),%xmm10
++.byte 102,15,56,220,241
++ movups 16(%rdi),%xmm11
++.byte 102,15,56,220,249
++ movups 32(%rdi),%xmm12
++.byte 102,68,15,56,220,193
++ movups 16(%rcx),%xmm1
+
+- pshufd $128,%xmm13,%xmm6
+- por %xmm14,%xmm5
+- movups 48(%rdi),%xmm11
+- je .Lctr32_four
++ call .Lenc_loop8_enter
+
+- por %xmm14,%xmm6
+- xorps %xmm7,%xmm7
++ movdqu 48(%rdi),%xmm13
++ pxor %xmm10,%xmm2
++ movdqu 64(%rdi),%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm10,%xmm6
++ movdqu %xmm5,48(%rsi)
++ movdqu %xmm6,64(%rsi)
++ cmpq $6,%rdx
++ jb .Lctr32_done
+
+- call _aesni_encrypt6
++ movups 80(%rdi),%xmm11
++ xorps %xmm11,%xmm7
++ movups %xmm7,80(%rsi)
++ je .Lctr32_done
+
+- movups 64(%rdi),%xmm1
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- xorps %xmm6,%xmm1
+- movups %xmm11,48(%rsi)
+- movups %xmm1,64(%rsi)
++ movups 96(%rdi),%xmm12
++ xorps %xmm12,%xmm8
++ movups %xmm8,96(%rsi)
++ jmp .Lctr32_done
++
++.align 32
++.Lctr32_loop4:
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++ movups (%rcx),%xmm1
++ decl %eax
++ jnz .Lctr32_loop4
++.byte 102,15,56,221,209
++ movups (%rdi),%xmm10
++.byte 102,15,56,221,217
++ movups 16(%rdi),%xmm11
++.byte 102,15,56,221,225
++ movups 32(%rdi),%xmm12
++.byte 102,15,56,221,233
++ movups 48(%rdi),%xmm13
++
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
++ xorps %xmm11,%xmm3
++ movups %xmm3,16(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm5,48(%rsi)
++ jmp .Lctr32_done
++
++.align 32
++.Lctr32_loop3:
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++ movups (%rcx),%xmm1
++ decl %eax
++ jnz .Lctr32_loop3
++.byte 102,15,56,221,209
++.byte 102,15,56,221,217
++.byte 102,15,56,221,225
++
++ movups (%rdi),%xmm10
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
++ cmpq $2,%rdx
++ jb .Lctr32_done
++
++ movups 16(%rdi),%xmm11
++ xorps %xmm11,%xmm3
++ movups %xmm3,16(%rsi)
++ je .Lctr32_done
++
++ movups 32(%rdi),%xmm12
++ xorps %xmm12,%xmm4
++ movups %xmm4,32(%rsi)
+ jmp .Lctr32_done
+
+ .align 16
+ .Lctr32_one_shortcut:
+ movups (%r8),%xmm2
+- movups (%rdi),%xmm8
++ movups (%rdi),%xmm10
+ movl 240(%rcx),%eax
+-.Lctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+@@ -1129,51 +1342,26 @@ aesni_ctr32_encrypt_blocks:
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_7
+ .byte 102,15,56,221,209
+- xorps %xmm2,%xmm8
+- movups %xmm8,(%rsi)
+- jmp .Lctr32_done
+-
+-.align 16
+-.Lctr32_two:
+- xorps %xmm4,%xmm4
+- call _aesni_encrypt3
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- movups %xmm9,16(%rsi)
+- jmp .Lctr32_done
+-
+-.align 16
+-.Lctr32_three:
+- call _aesni_encrypt3
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- movups %xmm10,32(%rsi)
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
+ jmp .Lctr32_done
+
+ .align 16
+-.Lctr32_four:
+- call _aesni_encrypt4
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- movups %xmm11,48(%rsi)
+-
+ .Lctr32_done:
++ leaq (%rbp),%rsp
++ popq %rbp
++.Lctr32_epilogue:
+ .byte 0xf3,0xc3
+ .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+ .globl aesni_xts_encrypt
+ .type aesni_xts_encrypt,@function
+ .align 16
+ aesni_xts_encrypt:
+- leaq -104(%rsp),%rsp
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $112,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+@@ -1188,228 +1376,266 @@ aesni_xts_encrypt:
+ leaq 16(%r8),%r8
+ jnz .Loop_enc1_8
+ .byte 102,68,15,56,221,249
++ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
++ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
++ movups 16(%rcx,%r10,1),%xmm1
++ movl %eax,%r10d
++
+ movdqa .Lxts_magic(%rip),%xmm8
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pshufd $95,%xmm15,%xmm9
++ pxor %xmm0,%xmm1
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm10
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm11
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm12
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
++ psrad $31,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm13
++ pxor %xmm14,%xmm15
++ movdqa %xmm15,%xmm14
++ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
++ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
++ movaps %xmm1,96(%rsp)
++
+ subq $96,%rdx
+ jc .Lxts_enc_short
+
+ shrl $1,%eax
+- subl $1,%eax
++ subl $3,%eax
++ movups 16(%r11),%xmm1
+ movl %eax,%r10d
++ leaq .Lxts_magic(%rip),%r8
+ jmp .Lxts_enc_grandloop
+
+-.align 16
++.align 32
+ .Lxts_enc_grandloop:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+- pand %xmm8,%xmm9
++ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+- movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+- movdqu 48(%rdi),%xmm5
++ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+- movdqu 64(%rdi),%xmm6
++.byte 102,15,56,220,209
++ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+- movdqu 80(%rdi),%xmm7
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,220,217
++ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+- movups (%r11),%xmm0
++.byte 102,15,56,220,225
++ movdqu 80(%rdi),%xmm7
++ pxor %xmm15,%xmm8
++ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+- pxor %xmm15,%xmm7
+-
+-
++.byte 102,15,56,220,233
++ movups 32(%r11),%xmm0
++ leaq 96(%rdi),%rdi
++ pxor %xmm8,%xmm7
+
+- movups 16(%r11),%xmm1
+- pxor %xmm0,%xmm2
+- pxor %xmm0,%xmm3
++ pxor %xmm9,%xmm10
++.byte 102,15,56,220,241
++ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+-.byte 102,15,56,220,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++.byte 102,15,56,220,249
++ movups 48(%r11),%xmm1
++
++.byte 102,15,56,220,208
++ pxor %xmm9,%xmm12
+ movdqa %xmm11,16(%rsp)
+-.byte 102,15,56,220,217
+- pxor %xmm0,%xmm5
++.byte 102,15,56,220,216
++ pxor %xmm9,%xmm13
+ movdqa %xmm12,32(%rsp)
+-.byte 102,15,56,220,225
+- pxor %xmm0,%xmm6
+- movdqa %xmm13,48(%rsp)
+-.byte 102,15,56,220,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++.byte 102,15,56,220,224
++ pxor %xmm9,%xmm14
++.byte 102,15,56,220,232
++ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+-.byte 102,15,56,220,241
+- movdqa %xmm15,80(%rsp)
+-.byte 102,15,56,220,249
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- jmp .Lxts_enc_loop6_enter
+-
+-.align 16
++.byte 102,15,56,220,240
++ movdqa %xmm8,80(%rsp)
++.byte 102,15,56,220,248
++ movups 64(%r11),%xmm0
++ leaq 64(%r11),%rcx
++ pshufd $95,%xmm15,%xmm9
++ jmp .Lxts_enc_loop6
++.align 32
+ .Lxts_enc_loop6:
+ .byte 102,15,56,220,209
+ .byte 102,15,56,220,217
+- decl %eax
+ .byte 102,15,56,220,225
+ .byte 102,15,56,220,233
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+-.Lxts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
++ leaq 32(%rcx),%rcx
++
+ .byte 102,15,56,220,208
+ .byte 102,15,56,220,216
+- leaq 32(%rcx),%rcx
+ .byte 102,15,56,220,224
+ .byte 102,15,56,220,232
+ .byte 102,15,56,220,240
+ .byte 102,15,56,220,248
+ movups (%rcx),%xmm0
++ decl %eax
+ jnz .Lxts_enc_loop6
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- paddq %xmm15,%xmm15
++ movdqa (%r8),%xmm8
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,209
+- pand %xmm8,%xmm9
++ paddq %xmm15,%xmm15
++ psrad $31,%xmm14
+ .byte 102,15,56,220,217
+- pcmpgtd %xmm15,%xmm14
++ pand %xmm8,%xmm14
++ movups (%r11),%xmm10
+ .byte 102,15,56,220,225
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,220,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,241
++ movaps %xmm10,%xmm11
+ .byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm10
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,208
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm10
++ psrad $31,%xmm14
+ .byte 102,15,56,220,216
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,220,224
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,220,232
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,240
++ movaps %xmm11,%xmm12
+ .byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm11
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,209
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm11
++ psrad $31,%xmm14
+ .byte 102,15,56,220,217
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,220,225
+- pxor %xmm9,%xmm15
++ movdqa %xmm13,48(%rsp)
+ .byte 102,15,56,220,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,241
++ movaps %xmm12,%xmm13
+ .byte 102,15,56,220,249
++ movups 48(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm12
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
++.byte 102,15,56,220,208
++ pxor %xmm15,%xmm12
++ psrad $31,%xmm14
++.byte 102,15,56,220,216
+ paddq %xmm15,%xmm15
+-.byte 102,15,56,221,208
+- pand %xmm8,%xmm9
+-.byte 102,15,56,221,216
+- pcmpgtd %xmm15,%xmm14
+-.byte 102,15,56,221,224
+- pxor %xmm9,%xmm15
+-.byte 102,15,56,221,232
+-.byte 102,15,56,221,240
+-.byte 102,15,56,221,248
++ pand %xmm8,%xmm14
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++ pxor %xmm14,%xmm15
++.byte 102,15,56,220,240
++ movaps %xmm13,%xmm14
++.byte 102,15,56,220,248
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm13
++ movdqa %xmm9,%xmm0
++ paddd %xmm9,%xmm9
++.byte 102,15,56,220,209
++ pxor %xmm15,%xmm13
++ psrad $31,%xmm0
++.byte 102,15,56,220,217
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm0
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++ pxor %xmm0,%xmm15
++ movups (%r11),%xmm0
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++ movups 16(%r11),%xmm1
++
++ pxor %xmm15,%xmm14
++ psrad $31,%xmm9
++.byte 102,15,56,221,84,36,0
+ paddq %xmm15,%xmm15
+- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+- xorps 16(%rsp),%xmm3
+- pcmpgtd %xmm15,%xmm14
++.byte 102,15,56,221,92,36,16
++.byte 102,15,56,221,100,36,32
+ pxor %xmm9,%xmm15
+-
+- xorps 32(%rsp),%xmm4
+- movups %xmm2,0(%rsi)
+- xorps 48(%rsp),%xmm5
+- movups %xmm3,16(%rsi)
+- xorps 64(%rsp),%xmm6
+- movups %xmm4,32(%rsi)
+- xorps 80(%rsp),%xmm7
+- movups %xmm5,48(%rsi)
++.byte 102,15,56,221,108,36,48
++.byte 102,15,56,221,116,36,64
++.byte 102,15,56,221,124,36,80
+ movl %r10d,%eax
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
++
+ leaq 96(%rsi),%rsi
++ movups %xmm2,-96(%rsi)
++ movups %xmm3,-80(%rsi)
++ movups %xmm4,-64(%rsi)
++ movups %xmm5,-48(%rsi)
++ movups %xmm6,-32(%rsi)
++ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc .Lxts_enc_grandloop
+
+- leal 3(%rax,%rax,1),%eax
++ leal 7(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+ .Lxts_enc_short:
++ pxor %xmm0,%xmm10
+ addq $96,%rdx
+ jz .Lxts_enc_done
+
++ pxor %xmm0,%xmm11
+ cmpq $32,%rdx
+ jb .Lxts_enc_one
++ pxor %xmm0,%xmm12
+ je .Lxts_enc_two
+
++ pxor %xmm0,%xmm13
+ cmpq $64,%rdx
+ jb .Lxts_enc_three
++ pxor %xmm0,%xmm14
+ je .Lxts_enc_four
+
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+@@ -1512,15 +1738,15 @@ aesni_xts_encrypt:
+
+ call _aesni_encrypt4
+
+- xorps %xmm10,%xmm2
+- movdqa %xmm15,%xmm10
+- xorps %xmm11,%xmm3
+- xorps %xmm12,%xmm4
+- movups %xmm2,(%rsi)
+- xorps %xmm13,%xmm5
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
++ pxor %xmm10,%xmm2
++ movdqa %xmm14,%xmm10
++ pxor %xmm11,%xmm3
++ pxor %xmm12,%xmm4
++ movdqu %xmm2,(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm3,16(%rsi)
++ movdqu %xmm4,32(%rsi)
++ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_enc_done
+
+@@ -1561,7 +1787,8 @@ aesni_xts_encrypt:
+ movups %xmm2,-16(%rsi)
+
+ .Lxts_enc_ret:
+- leaq 104(%rsp),%rsp
++ leaq (%rbp),%rsp
++ popq %rbp
+ .Lxts_enc_epilogue:
+ .byte 0xf3,0xc3
+ .size aesni_xts_encrypt,.-aesni_xts_encrypt
+@@ -1569,7 +1796,11 @@ aesni_xts_encrypt:
+ .type aesni_xts_decrypt,@function
+ .align 16
+ aesni_xts_decrypt:
+- leaq -104(%rsp),%rsp
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $112,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+@@ -1590,228 +1821,266 @@ aesni_xts_decrypt:
+ shlq $4,%rax
+ subq %rax,%rdx
+
++ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
++ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
++ movups 16(%rcx,%r10,1),%xmm1
++ movl %eax,%r10d
++
+ movdqa .Lxts_magic(%rip),%xmm8
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pshufd $95,%xmm15,%xmm9
++ pxor %xmm0,%xmm1
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm10
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm11
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm12
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
++ psrad $31,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm13
++ pxor %xmm14,%xmm15
++ movdqa %xmm15,%xmm14
++ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
++ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
++ movaps %xmm1,96(%rsp)
++
+ subq $96,%rdx
+ jc .Lxts_dec_short
+
+ shrl $1,%eax
+- subl $1,%eax
++ subl $3,%eax
++ movups 16(%r11),%xmm1
+ movl %eax,%r10d
++ leaq .Lxts_magic(%rip),%r8
+ jmp .Lxts_dec_grandloop
+
+-.align 16
++.align 32
+ .Lxts_dec_grandloop:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+- pand %xmm8,%xmm9
++ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+- movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+- movdqu 48(%rdi),%xmm5
++ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+- movdqu 64(%rdi),%xmm6
++.byte 102,15,56,222,209
++ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+- movdqu 80(%rdi),%xmm7
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,222,217
++ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+- movups (%r11),%xmm0
++.byte 102,15,56,222,225
++ movdqu 80(%rdi),%xmm7
++ pxor %xmm15,%xmm8
++ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+- pxor %xmm15,%xmm7
+-
+-
++.byte 102,15,56,222,233
++ movups 32(%r11),%xmm0
++ leaq 96(%rdi),%rdi
++ pxor %xmm8,%xmm7
+
+- movups 16(%r11),%xmm1
+- pxor %xmm0,%xmm2
+- pxor %xmm0,%xmm3
++ pxor %xmm9,%xmm10
++.byte 102,15,56,222,241
++ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+-.byte 102,15,56,222,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++.byte 102,15,56,222,249
++ movups 48(%r11),%xmm1
++
++.byte 102,15,56,222,208
++ pxor %xmm9,%xmm12
+ movdqa %xmm11,16(%rsp)
+-.byte 102,15,56,222,217
+- pxor %xmm0,%xmm5
++.byte 102,15,56,222,216
++ pxor %xmm9,%xmm13
+ movdqa %xmm12,32(%rsp)
+-.byte 102,15,56,222,225
+- pxor %xmm0,%xmm6
+- movdqa %xmm13,48(%rsp)
+-.byte 102,15,56,222,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++.byte 102,15,56,222,224
++ pxor %xmm9,%xmm14
++.byte 102,15,56,222,232
++ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+-.byte 102,15,56,222,241
+- movdqa %xmm15,80(%rsp)
+-.byte 102,15,56,222,249
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- jmp .Lxts_dec_loop6_enter
+-
+-.align 16
++.byte 102,15,56,222,240
++ movdqa %xmm8,80(%rsp)
++.byte 102,15,56,222,248
++ movups 64(%r11),%xmm0
++ leaq 64(%r11),%rcx
++ pshufd $95,%xmm15,%xmm9
++ jmp .Lxts_dec_loop6
++.align 32
+ .Lxts_dec_loop6:
+ .byte 102,15,56,222,209
+ .byte 102,15,56,222,217
+- decl %eax
+ .byte 102,15,56,222,225
+ .byte 102,15,56,222,233
+ .byte 102,15,56,222,241
+ .byte 102,15,56,222,249
+-.Lxts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
++ leaq 32(%rcx),%rcx
++
+ .byte 102,15,56,222,208
+ .byte 102,15,56,222,216
+- leaq 32(%rcx),%rcx
+ .byte 102,15,56,222,224
+ .byte 102,15,56,222,232
+ .byte 102,15,56,222,240
+ .byte 102,15,56,222,248
+ movups (%rcx),%xmm0
++ decl %eax
+ jnz .Lxts_dec_loop6
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- paddq %xmm15,%xmm15
++ movdqa (%r8),%xmm8
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,209
+- pand %xmm8,%xmm9
++ paddq %xmm15,%xmm15
++ psrad $31,%xmm14
+ .byte 102,15,56,222,217
+- pcmpgtd %xmm15,%xmm14
++ pand %xmm8,%xmm14
++ movups (%r11),%xmm10
+ .byte 102,15,56,222,225
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,222,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,241
++ movaps %xmm10,%xmm11
+ .byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm10
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,208
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm10
++ psrad $31,%xmm14
+ .byte 102,15,56,222,216
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,222,224
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,222,232
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,240
++ movaps %xmm11,%xmm12
+ .byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm11
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,209
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm11
++ psrad $31,%xmm14
+ .byte 102,15,56,222,217
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,222,225
+- pxor %xmm9,%xmm15
++ movdqa %xmm13,48(%rsp)
+ .byte 102,15,56,222,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,241
++ movaps %xmm12,%xmm13
+ .byte 102,15,56,222,249
++ movups 48(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm12
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
++.byte 102,15,56,222,208
++ pxor %xmm15,%xmm12
++ psrad $31,%xmm14
++.byte 102,15,56,222,216
+ paddq %xmm15,%xmm15
+-.byte 102,15,56,223,208
+- pand %xmm8,%xmm9
+-.byte 102,15,56,223,216
+- pcmpgtd %xmm15,%xmm14
+-.byte 102,15,56,223,224
+- pxor %xmm9,%xmm15
+-.byte 102,15,56,223,232
+-.byte 102,15,56,223,240
+-.byte 102,15,56,223,248
++ pand %xmm8,%xmm14
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++ pxor %xmm14,%xmm15
++.byte 102,15,56,222,240
++ movaps %xmm13,%xmm14
++.byte 102,15,56,222,248
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm13
++ movdqa %xmm9,%xmm0
++ paddd %xmm9,%xmm9
++.byte 102,15,56,222,209
++ pxor %xmm15,%xmm13
++ psrad $31,%xmm0
++.byte 102,15,56,222,217
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm0
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++ pxor %xmm0,%xmm15
++ movups (%r11),%xmm0
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++ movups 16(%r11),%xmm1
++
++ pxor %xmm15,%xmm14
++ psrad $31,%xmm9
++.byte 102,15,56,223,84,36,0
+ paddq %xmm15,%xmm15
+- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+- xorps 16(%rsp),%xmm3
+- pcmpgtd %xmm15,%xmm14
++.byte 102,15,56,223,92,36,16
++.byte 102,15,56,223,100,36,32
+ pxor %xmm9,%xmm15
+-
+- xorps 32(%rsp),%xmm4
+- movups %xmm2,0(%rsi)
+- xorps 48(%rsp),%xmm5
+- movups %xmm3,16(%rsi)
+- xorps 64(%rsp),%xmm6
+- movups %xmm4,32(%rsi)
+- xorps 80(%rsp),%xmm7
+- movups %xmm5,48(%rsi)
++.byte 102,15,56,223,108,36,48
++.byte 102,15,56,223,116,36,64
++.byte 102,15,56,223,124,36,80
+ movl %r10d,%eax
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
++
+ leaq 96(%rsi),%rsi
++ movups %xmm2,-96(%rsi)
++ movups %xmm3,-80(%rsi)
++ movups %xmm4,-64(%rsi)
++ movups %xmm5,-48(%rsi)
++ movups %xmm6,-32(%rsi)
++ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc .Lxts_dec_grandloop
+
+- leal 3(%rax,%rax,1),%eax
++ leal 7(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+ .Lxts_dec_short:
++ pxor %xmm0,%xmm10
++ pxor %xmm0,%xmm11
+ addq $96,%rdx
+ jz .Lxts_dec_done
+
++ pxor %xmm0,%xmm12
+ cmpq $32,%rdx
+ jb .Lxts_dec_one
++ pxor %xmm0,%xmm13
+ je .Lxts_dec_two
+
++ pxor %xmm0,%xmm14
+ cmpq $64,%rdx
+ jb .Lxts_dec_three
+ je .Lxts_dec_four
+
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+@@ -1904,7 +2173,7 @@ aesni_xts_decrypt:
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+- movdqa %xmm15,%xmm11
++ movdqa %xmm14,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+@@ -1914,14 +2183,8 @@ aesni_xts_decrypt:
+
+ .align 16
+ .Lxts_dec_four:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+@@ -1932,16 +2195,16 @@ aesni_xts_decrypt:
+
+ call _aesni_decrypt4
+
+- xorps %xmm10,%xmm2
++ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+- xorps %xmm11,%xmm3
++ pxor %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+- xorps %xmm12,%xmm4
+- movups %xmm2,(%rsi)
+- xorps %xmm13,%xmm5
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm2,(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm3,16(%rsi)
++ movdqu %xmm4,32(%rsi)
++ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp .Lxts_dec_done
+
+@@ -2001,7 +2264,8 @@ aesni_xts_decrypt:
+ movups %xmm2,(%rsi)
+
+ .Lxts_dec_ret:
+- leaq 104(%rsp),%rsp
++ leaq (%rbp),%rsp
++ popq %rbp
+ .Lxts_dec_epilogue:
+ .byte 0xf3,0xc3
+ .size aesni_xts_decrypt,.-aesni_xts_decrypt
+@@ -2068,149 +2332,324 @@ aesni_cbc_encrypt:
+
+ .align 16
+ .Lcbc_decrypt:
+- movups (%r8),%xmm9
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $16,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
++ movups (%r8),%xmm10
+ movl %r10d,%eax
+- cmpq $112,%rdx
++ cmpq $80,%rdx
+ jbe .Lcbc_dec_tail
+- shrl $1,%r10d
++
++ movups (%rcx),%xmm0
++ movdqu 0(%rdi),%xmm2
++ movdqu 16(%rdi),%xmm3
++ movdqa %xmm2,%xmm11
++ movdqu 32(%rdi),%xmm4
++ movdqa %xmm3,%xmm12
++ movdqu 48(%rdi),%xmm5
++ movdqa %xmm4,%xmm13
++ movdqu 64(%rdi),%xmm6
++ movdqa %xmm5,%xmm14
++ movdqu 80(%rdi),%xmm7
++ movdqa %xmm6,%xmm15
++ cmpq $112,%rdx
++ jbe .Lcbc_dec_six_or_seven
++
+ subq $112,%rdx
+- movl %r10d,%eax
+- movaps %xmm9,-24(%rsp)
++ leaq 112(%rcx),%rcx
+ jmp .Lcbc_dec_loop8_enter
+ .align 16
+ .Lcbc_dec_loop8:
+- movaps %xmm0,-24(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ .Lcbc_dec_loop8_enter:
+- movups (%rcx),%xmm0
+- movups (%rdi),%xmm2
+- movups 16(%rdi),%xmm3
+- movups 16(%rcx),%xmm1
++ movdqu 96(%rdi),%xmm8
++ pxor %xmm0,%xmm2
++ movdqu 112(%rdi),%xmm9
++ pxor %xmm0,%xmm3
++ movups 16-112(%rcx),%xmm1
++ pxor %xmm0,%xmm4
++ xorq %r11,%r11
++ cmpq $112,%rdx
++ pxor %xmm0,%xmm5
++ pxor %xmm0,%xmm6
++ pxor %xmm0,%xmm7
++ pxor %xmm0,%xmm8
+
+- leaq 32(%rcx),%rcx
+- movdqu 32(%rdi),%xmm4
+- xorps %xmm0,%xmm2
+- movdqu 48(%rdi),%xmm5
+- xorps %xmm0,%xmm3
+- movdqu 64(%rdi),%xmm6
+ .byte 102,15,56,222,209
+- pxor %xmm0,%xmm4
+- movdqu 80(%rdi),%xmm7
++ pxor %xmm0,%xmm9
++ movups 32-112(%rcx),%xmm0
+ .byte 102,15,56,222,217
+- pxor %xmm0,%xmm5
+- movdqu 96(%rdi),%xmm8
+ .byte 102,15,56,222,225
+- pxor %xmm0,%xmm6
+- movdqu 112(%rdi),%xmm9
+ .byte 102,15,56,222,233
+- pxor %xmm0,%xmm7
+- decl %eax
+ .byte 102,15,56,222,241
+- pxor %xmm0,%xmm8
+ .byte 102,15,56,222,249
+- pxor %xmm0,%xmm9
+- movups (%rcx),%xmm0
++ setnc %r11b
+ .byte 102,68,15,56,222,193
++ shlq $7,%r11
+ .byte 102,68,15,56,222,201
+- movups 16(%rcx),%xmm1
+-
+- call .Ldec_loop8_enter
++ addq %rdi,%r11
++ movups 48-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 64-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 80-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 96-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 112-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 128-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 144-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 160-112(%rcx),%xmm0
++ cmpl $11,%eax
++ jb .Lcbc_dec_done
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 176-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 192-112(%rcx),%xmm0
++ je .Lcbc_dec_done
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 208-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 224-112(%rcx),%xmm0
++.Lcbc_dec_done:
++.byte 102,15,56,222,209
++ pxor %xmm0,%xmm10
++.byte 102,15,56,222,217
++ pxor %xmm0,%xmm11
++.byte 102,15,56,222,225
++ pxor %xmm0,%xmm12
++.byte 102,15,56,222,233
++ pxor %xmm0,%xmm13
++.byte 102,15,56,222,241
++ pxor %xmm0,%xmm14
++.byte 102,15,56,222,249
++ pxor %xmm0,%xmm15
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movdqu 80(%rdi),%xmm1
++
++.byte 102,65,15,56,223,210
++ movdqu 96(%rdi),%xmm10
++ pxor %xmm0,%xmm1
++.byte 102,65,15,56,223,219
++ pxor %xmm0,%xmm10
++ movdqu 112(%rdi),%xmm0
++ leaq 128(%rdi),%rdi
++.byte 102,65,15,56,223,228
++ movdqu 0(%r11),%xmm11
++.byte 102,65,15,56,223,237
++ movdqu 16(%r11),%xmm12
++.byte 102,65,15,56,223,246
++ movdqu 32(%r11),%xmm13
++.byte 102,65,15,56,223,255
++ movdqu 48(%r11),%xmm14
++.byte 102,68,15,56,223,193
++ movdqu 64(%r11),%xmm15
++.byte 102,69,15,56,223,202
++ movdqa %xmm0,%xmm10
++ movdqu 80(%r11),%xmm1
++ movups -112(%rcx),%xmm0
+
+- movups (%rdi),%xmm1
+- movups 16(%rdi),%xmm0
+- xorps -24(%rsp),%xmm2
+- xorps %xmm1,%xmm3
+- movups 32(%rdi),%xmm1
+- xorps %xmm0,%xmm4
+- movups 48(%rdi),%xmm0
+- xorps %xmm1,%xmm5
+- movups 64(%rdi),%xmm1
+- xorps %xmm0,%xmm6
+- movups 80(%rdi),%xmm0
+- xorps %xmm1,%xmm7
+- movups 96(%rdi),%xmm1
+- xorps %xmm0,%xmm8
+- movups 112(%rdi),%xmm0
+- xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
++ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
++ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
++ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+- movl %r10d,%eax
++ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+- movq %r11,%rcx
++ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+- leaq 128(%rdi),%rdi
++ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
++
+ subq $128,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+- movaps %xmm0,%xmm9
++ leaq -112(%rcx),%rcx
+ addq $112,%rdx
+ jle .Lcbc_dec_tail_collected
+- movups %xmm2,(%rsi)
+- leal 1(%r10,%r10,1),%eax
++ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
++ cmpq $80,%rdx
++ jbe .Lcbc_dec_tail
++
++ movaps %xmm11,%xmm2
++.Lcbc_dec_six_or_seven:
++ cmpq $96,%rdx
++ ja .Lcbc_dec_seven
++
++ movaps %xmm7,%xmm8
++ call _aesni_decrypt6
++ pxor %xmm10,%xmm2
++ movaps %xmm8,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ pxor %xmm15,%xmm7
++ movdqu %xmm6,64(%rsi)
++ leaq 80(%rsi),%rsi
++ movdqa %xmm7,%xmm2
++ jmp .Lcbc_dec_tail_collected
++
++.align 16
++.Lcbc_dec_seven:
++ movups 96(%rdi),%xmm8
++ xorps %xmm9,%xmm9
++ call _aesni_decrypt8
++ movups 80(%rdi),%xmm9
++ pxor %xmm10,%xmm2
++ movups 96(%rdi),%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ pxor %xmm15,%xmm7
++ movdqu %xmm6,64(%rsi)
++ pxor %xmm9,%xmm8
++ movdqu %xmm7,80(%rsi)
++ leaq 96(%rsi),%rsi
++ movdqa %xmm8,%xmm2
++ jmp .Lcbc_dec_tail_collected
++
+ .Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+- movaps %xmm2,%xmm8
+- cmpq $16,%rdx
++ subq $16,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+- movaps %xmm3,%xmm7
+- cmpq $32,%rdx
++ movaps %xmm2,%xmm11
++ subq $16,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+- movaps %xmm4,%xmm6
+- cmpq $48,%rdx
++ movaps %xmm3,%xmm12
++ subq $16,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+- cmpq $64,%rdx
++ movaps %xmm4,%xmm13
++ subq $16,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+- cmpq $80,%rdx
+- jbe .Lcbc_dec_five
+-
+- movups 80(%rdi),%xmm7
+- cmpq $96,%rdx
+- jbe .Lcbc_dec_six
+-
+- movups 96(%rdi),%xmm8
+- movaps %xmm9,-24(%rsp)
+- call _aesni_decrypt8
+- movups (%rdi),%xmm1
+- movups 16(%rdi),%xmm0
+- xorps -24(%rsp),%xmm2
+- xorps %xmm1,%xmm3
+- movups 32(%rdi),%xmm1
+- xorps %xmm0,%xmm4
+- movups 48(%rdi),%xmm0
+- xorps %xmm1,%xmm5
+- movups 64(%rdi),%xmm1
+- xorps %xmm0,%xmm6
+- movups 80(%rdi),%xmm0
+- xorps %xmm1,%xmm7
+- movups 96(%rdi),%xmm9
+- xorps %xmm0,%xmm8
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
+- leaq 96(%rsi),%rsi
+- movaps %xmm8,%xmm2
+- subq $112,%rdx
++ movaps %xmm5,%xmm14
++ movaps %xmm6,%xmm15
++ xorps %xmm7,%xmm7
++ call _aesni_decrypt6
++ pxor %xmm10,%xmm2
++ movaps %xmm15,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ leaq 64(%rsi),%rsi
++ movdqa %xmm6,%xmm2
++ subq $16,%rdx
+ jmp .Lcbc_dec_tail_collected
++
+ .align 16
+ .Lcbc_dec_one:
++ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+@@ -2222,111 +2661,69 @@ aesni_cbc_encrypt:
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_16
+ .byte 102,15,56,223,209
+- xorps %xmm9,%xmm2
+- movaps %xmm8,%xmm9
+- subq $16,%rdx
++ xorps %xmm10,%xmm2
++ movaps %xmm11,%xmm10
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_two:
++ movaps %xmm3,%xmm12
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- movaps %xmm7,%xmm9
+- movaps %xmm3,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm12,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ movdqa %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+- subq $32,%rdx
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_three:
++ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- xorps %xmm7,%xmm4
+- movups %xmm3,16(%rsi)
+- movaps %xmm6,%xmm9
+- movaps %xmm4,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm13,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ movdqa %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+- subq $48,%rdx
+ jmp .Lcbc_dec_tail_collected
+ .align 16
+ .Lcbc_dec_four:
++ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+- xorps %xmm9,%xmm2
+- movups 48(%rdi),%xmm9
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- xorps %xmm7,%xmm4
+- movups %xmm3,16(%rsi)
+- xorps %xmm6,%xmm5
+- movups %xmm4,32(%rsi)
+- movaps %xmm5,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm14,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ movdqa %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+- subq $64,%rdx
+- jmp .Lcbc_dec_tail_collected
+-.align 16
+-.Lcbc_dec_five:
+- xorps %xmm7,%xmm7
+- call _aesni_decrypt6
+- movups 16(%rdi),%xmm1
+- movups 32(%rdi),%xmm0
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- xorps %xmm1,%xmm4
+- movups 48(%rdi),%xmm1
+- xorps %xmm0,%xmm5
+- movups 64(%rdi),%xmm9
+- xorps %xmm1,%xmm6
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- leaq 64(%rsi),%rsi
+- movaps %xmm6,%xmm2
+- subq $80,%rdx
+- jmp .Lcbc_dec_tail_collected
+-.align 16
+-.Lcbc_dec_six:
+- call _aesni_decrypt6
+- movups 16(%rdi),%xmm1
+- movups 32(%rdi),%xmm0
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- xorps %xmm1,%xmm4
+- movups 48(%rdi),%xmm1
+- xorps %xmm0,%xmm5
+- movups 64(%rdi),%xmm0
+- xorps %xmm1,%xmm6
+- movups 80(%rdi),%xmm9
+- xorps %xmm0,%xmm7
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- movups %xmm6,64(%rsi)
+- leaq 80(%rsi),%rsi
+- movaps %xmm7,%xmm2
+- subq $96,%rdx
+ jmp .Lcbc_dec_tail_collected
++
+ .align 16
+ .Lcbc_dec_tail_collected:
++ movups %xmm10,(%r8)
+ andq $15,%rdx
+- movups %xmm9,(%r8)
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp .Lcbc_dec_ret
+ .align 16
+ .Lcbc_dec_tail_partial:
+- movaps %xmm2,-24(%rsp)
++ movaps %xmm2,(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+- leaq -24(%rsp),%rsi
++ leaq (%rsp),%rsi
+ .long 0x9066A4F3
+
+ .Lcbc_dec_ret:
++ leaq (%rbp),%rsp
++ popq %rbp
+ .Lcbc_ret:
+ .byte 0xf3,0xc3
+ .size aesni_cbc_encrypt,.-aesni_cbc_encrypt
+@@ -2569,6 +2966,8 @@ __aesni_set_encrypt_key:
+ .long 1,0,0,0
+ .Lxts_magic:
+ .long 0x87,0,1,0
++.Lincrement1:
++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+
+ .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ .align 64
+diff --git a/lib/accelerated/x86/elf/padlock-x86-64.s b/lib/accelerated/x86/elf/padlock-x86-64.s
+index 4709ac2..2ac113d 100644
+--- a/lib/accelerated/x86/elf/padlock-x86-64.s
++++ b/lib/accelerated/x86/elf/padlock-x86-64.s
+@@ -595,6 +595,468 @@ padlock_cbc_encrypt:
+ popq %rbp
+ .byte 0xf3,0xc3
+ .size padlock_cbc_encrypt,.-padlock_cbc_encrypt
++.globl padlock_cfb_encrypt
++.type padlock_cfb_encrypt,@function
++.align 16
++padlock_cfb_encrypt:
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz .Lcfb_abort
++ testq $15,%rcx
++ jnz .Lcfb_abort
++ leaq .Lpadlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz .Lcfb_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz .Lcfb_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++ jmp .Lcfb_loop
++.align 16
++.Lcfb_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz .Lcfb_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++.Lcfb_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,224
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz .Lcfb_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++.Lcfb_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jnz .Lcfb_loop
++ cmpq %rbp,%rsp
++ je .Lcfb_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++.Lcfb_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja .Lcfb_bzero
++
++.Lcfb_done:
++ leaq (%rbp),%rsp
++ jmp .Lcfb_exit
++
++.align 16
++.Lcfb_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,224
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++.Lcfb_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++.Lcfb_abort:
++ popq %rbx
++ popq %rbp
++ .byte 0xf3,0xc3
++.size padlock_cfb_encrypt,.-padlock_cfb_encrypt
++.globl padlock_ofb_encrypt
++.type padlock_ofb_encrypt,@function
++.align 16
++padlock_ofb_encrypt:
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz .Lofb_abort
++ testq $15,%rcx
++ jnz .Lofb_abort
++ leaq .Lpadlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz .Lofb_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz .Lofb_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++ jmp .Lofb_loop
++.align 16
++.Lofb_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz .Lofb_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++.Lofb_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,232
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz .Lofb_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++.Lofb_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jnz .Lofb_loop
++ cmpq %rbp,%rsp
++ je .Lofb_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++.Lofb_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja .Lofb_bzero
++
++.Lofb_done:
++ leaq (%rbp),%rsp
++ jmp .Lofb_exit
++
++.align 16
++.Lofb_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,232
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++.Lofb_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++.Lofb_abort:
++ popq %rbx
++ popq %rbp
++ .byte 0xf3,0xc3
++.size padlock_ofb_encrypt,.-padlock_ofb_encrypt
++.globl padlock_ctr32_encrypt
++.type padlock_ctr32_encrypt,@function
++.align 16
++padlock_ctr32_encrypt:
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz .Lctr32_abort
++ testq $15,%rcx
++ jnz .Lctr32_abort
++ leaq .Lpadlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz .Lctr32_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz .Lctr32_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++.Lctr32_reenter:
++ movl -4(%rdx),%eax
++ bswapl %eax
++ negl %eax
++ andl $31,%eax
++ movq $512,%rbx
++ shll $4,%eax
++ cmovzq %rbx,%rax
++ cmpq %rax,%rcx
++ cmovaq %rax,%rbx
++ cmovbeq %rcx,%rbx
++ cmpq %rbx,%rcx
++ ja .Lctr32_loop
++ movq %rsi,%rax
++ cmpq %rsp,%rbp
++ cmoveq %rdi,%rax
++ addq %rcx,%rax
++ negq %rax
++ andq $4095,%rax
++ cmpq $32,%rax
++ movq $-32,%rax
++ cmovaeq %rbx,%rax
++ andq %rax,%rbx
++ jz .Lctr32_unaligned_tail
++ jmp .Lctr32_loop
++.align 16
++.Lctr32_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz .Lctr32_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++.Lctr32_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++ movl -4(%rdx),%eax
++ testl $4294901760,%eax
++ jnz .Lctr32_no_carry
++ bswapl %eax
++ addl $65536,%eax
++ bswapl %eax
++ movl %eax,-4(%rdx)
++.Lctr32_no_carry:
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz .Lctr32_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++.Lctr32_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jz .Lctr32_break
++ cmpq %rbx,%rcx
++ jae .Lctr32_loop
++ movq %rcx,%rbx
++ movq %rsi,%rax
++ cmpq %rsp,%rbp
++ cmoveq %rdi,%rax
++ addq %rcx,%rax
++ negq %rax
++ andq $4095,%rax
++ cmpq $32,%rax
++ movq $-32,%rax
++ cmovaeq %rbx,%rax
++ andq %rax,%rbx
++ jnz .Lctr32_loop
++.Lctr32_unaligned_tail:
++ xorl %eax,%eax
++ cmpq %rsp,%rbp
++ cmoveq %rcx,%rax
++ movq %rdi,%r8
++ movq %rcx,%rbx
++ subq %rax,%rsp
++ shrq $3,%rcx
++ leaq (%rsp),%rdi
++.byte 0xf3,0x48,0xa5
++ movq %rsp,%rsi
++ movq %r8,%rdi
++ movq %rbx,%rcx
++ jmp .Lctr32_loop
++.align 16
++.Lctr32_break:
++ cmpq %rbp,%rsp
++ je .Lctr32_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++.Lctr32_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja .Lctr32_bzero
++
++.Lctr32_done:
++ leaq (%rbp),%rsp
++ jmp .Lctr32_exit
++
++.align 16
++.Lctr32_aligned:
++ movl -4(%rdx),%eax
++ bswapl %eax
++ negl %eax
++ andl $65535,%eax
++ movq $1048576,%rbx
++ shll $4,%eax
++ cmovzq %rbx,%rax
++ cmpq %rax,%rcx
++ cmovaq %rax,%rbx
++ cmovbeq %rcx,%rbx
++ jbe .Lctr32_aligned_skip
++
++.Lctr32_aligned_loop:
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++
++ movl -4(%rdx),%eax
++ bswapl %eax
++ addl $65536,%eax
++ bswapl %eax
++ movl %eax,-4(%rdx)
++
++ movq %r10,%rcx
++ subq %r11,%rcx
++ movq $1048576,%rbx
++ jz .Lctr32_exit
++ cmpq %rbx,%rcx
++ jae .Lctr32_aligned_loop
++
++.Lctr32_aligned_skip:
++ leaq (%rsi,%rcx,1),%rbp
++ negq %rbp
++ andq $4095,%rbp
++ xorl %eax,%eax
++ cmpq $32,%rbp
++ movq $32-1,%rbp
++ cmovaeq %rax,%rbp
++ andq %rcx,%rbp
++ subq %rbp,%rcx
++ jz .Lctr32_aligned_tail
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++ testq %rbp,%rbp
++ jz .Lctr32_exit
++
++.Lctr32_aligned_tail:
++ movq %rdi,%r8
++ movq %rbp,%rbx
++ movq %rbp,%rcx
++ leaq (%rsp),%rbp
++ subq %rcx,%rsp
++ shrq $3,%rcx
++ leaq (%rsp),%rdi
++.byte 0xf3,0x48,0xa5
++ leaq (%r8),%rdi
++ leaq (%rsp),%rsi
++ movq %rbx,%rcx
++ jmp .Lctr32_loop
++.Lctr32_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++.Lctr32_abort:
++ popq %rbx
++ popq %rbp
++ .byte 0xf3,0xc3
++.size padlock_ctr32_encrypt,.-padlock_ctr32_encrypt
+ .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ .align 16
+ .data
+diff --git a/lib/accelerated/x86/elf/padlock-x86.s b/lib/accelerated/x86/elf/padlock-x86.s
+index ea982ec..2199255 100644
+--- a/lib/accelerated/x86/elf/padlock-x86.s
++++ b/lib/accelerated/x86/elf/padlock-x86.s
+@@ -187,16 +187,14 @@ padlock_ecb_encrypt:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+- cmpl $128,%ecx
+- jbe .L006ecb_short
+ testl $32,(%edx)
+- jnz .L007ecb_aligned
++ jnz .L006ecb_aligned
+ testl $15,%edi
+ setz %al
+ testl $15,%esi
+ setz %bl
+ testl %ebx,%eax
+- jnz .L007ecb_aligned
++ jnz .L006ecb_aligned
+ negl %eax
+ movl $512,%ebx
+ notl %eax
+@@ -208,10 +206,28 @@ padlock_ecb_encrypt:
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
+ andl $-16,%esp
+- jmp .L008ecb_loop
++ movl %eax,16(%ebp)
++ cmpl %ebx,%ecx
++ ja .L007ecb_loop
++ movl %esi,%eax
++ cmpl %esp,%ebp
++ cmovel %edi,%eax
++ addl %ecx,%eax
++ negl %eax
++ andl $4095,%eax
++ cmpl $128,%eax
++ movl $-128,%eax
++ cmovael %ebx,%eax
++ andl %eax,%ebx
++ jz .L008ecb_unaligned_tail
++ jmp .L007ecb_loop
+ .align 16
+-.L008ecb_loop:
++.L007ecb_loop:
+ movl %edi,(%ebp)
+ movl %esi,4(%ebp)
+ movl %ecx,8(%ebp)
+@@ -236,8 +252,8 @@ padlock_ecb_encrypt:
+ testl $15,%edi
+ jz .L010ecb_out_aligned
+ movl %ebx,%ecx
+- shrl $2,%ecx
+ leal (%esp),%esi
++ shrl $2,%ecx
+ .byte 243,165
+ subl %ebx,%edi
+ .L010ecb_out_aligned:
+@@ -247,43 +263,75 @@ padlock_ecb_encrypt:
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+- jnz .L008ecb_loop
++ jz .L011ecb_break
++ cmpl %ebx,%ecx
++ jae .L007ecb_loop
++.L008ecb_unaligned_tail:
++ xorl %eax,%eax
+ cmpl %ebp,%esp
+- je .L011ecb_done
++ cmovel %ecx,%eax
++ subl %eax,%esp
++ movl %edi,%eax
++ movl %ecx,%ebx
++ shrl $2,%ecx
++ leal (%esp),%edi
++.byte 243,165
++ movl %esp,%esi
++ movl %eax,%edi
++ movl %ebx,%ecx
++ jmp .L007ecb_loop
++.align 16
++.L011ecb_break:
++ cmpl %ebp,%esp
++ je .L012ecb_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+-.L012ecb_bzero:
++.L013ecb_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+- ja .L012ecb_bzero
+-.L011ecb_done:
++ ja .L013ecb_bzero
++.L012ecb_done:
++ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+- jmp .L013ecb_exit
++ jmp .L014ecb_exit
+ .align 16
+-.L006ecb_short:
++.L006ecb_aligned:
++ leal (%esi,%ecx,1),%ebp
++ negl %ebp
++ andl $4095,%ebp
+ xorl %eax,%eax
+- leal -24(%esp),%ebp
+- subl %ecx,%eax
+- leal (%eax,%ebp,1),%esp
+- andl $-16,%esp
+- xorl %ebx,%ebx
+-.L014ecb_short_copy:
+- movups (%esi,%ebx,1),%xmm0
+- leal 16(%ebx),%ebx
+- cmpl %ebx,%ecx
+- movaps %xmm0,-16(%esp,%ebx,1)
+- ja .L014ecb_short_copy
+- movl %esp,%esi
+- movl %ecx,%ebx
+- jmp .L008ecb_loop
+-.align 16
+-.L007ecb_aligned:
++ cmpl $128,%ebp
++ movl $127,%ebp
++ cmovael %eax,%ebp
++ andl %ecx,%ebp
++ subl %ebp,%ecx
++ jz .L015ecb_aligned_tail
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+ .byte 243,15,167,200
+-.L013ecb_exit:
++ testl %ebp,%ebp
++ jz .L014ecb_exit
++.L015ecb_aligned_tail:
++ movl %ebp,%ecx
++ leal -24(%esp),%ebp
++ movl %ebp,%esp
++ movl %ebp,%eax
++ subl %ecx,%esp
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ movl %edi,%eax
++ movl %ecx,%ebx
++ shrl $2,%ecx
++ leal (%esp),%edi
++.byte 243,165
++ movl %esp,%esi
++ movl %eax,%edi
++ movl %ebx,%ecx
++ jmp .L007ecb_loop
++.L014ecb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+ .L004ecb_abort:
+@@ -307,19 +355,17 @@ padlock_cbc_encrypt:
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ testl $15,%edx
+- jnz .L015cbc_abort
++ jnz .L016cbc_abort
+ testl $15,%ecx
+- jnz .L015cbc_abort
+- leal .Lpadlock_saved_context-.L016cbc_pic_point,%eax
++ jnz .L016cbc_abort
++ leal .Lpadlock_saved_context-.L017cbc_pic_point,%eax
+ pushfl
+ cld
+ call _padlock_verify_ctx
+-.L016cbc_pic_point:
++.L017cbc_pic_point:
+ leal 16(%edx),%edx
+ xorl %eax,%eax
+ xorl %ebx,%ebx
+- cmpl $64,%ecx
+- jbe .L017cbc_short
+ testl $32,(%edx)
+ jnz .L018cbc_aligned
+ testl $15,%edi
+@@ -339,7 +385,25 @@ padlock_cbc_encrypt:
+ negl %eax
+ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
+ andl $-16,%esp
++ movl %eax,16(%ebp)
++ cmpl %ebx,%ecx
++ ja .L019cbc_loop
++ movl %esi,%eax
++ cmpl %esp,%ebp
++ cmovel %edi,%eax
++ addl %ecx,%eax
++ negl %eax
++ andl $4095,%eax
++ cmpl $64,%eax
++ movl $-64,%eax
++ cmovael %ebx,%eax
++ andl %eax,%ebx
++ jz .L020cbc_unaligned_tail
+ jmp .L019cbc_loop
+ .align 16
+ .L019cbc_loop:
+@@ -351,13 +415,13 @@ padlock_cbc_encrypt:
+ testl $15,%edi
+ cmovnzl %esp,%edi
+ testl $15,%esi
+- jz .L020cbc_inp_aligned
++ jz .L021cbc_inp_aligned
+ shrl $2,%ecx
+ .byte 243,165
+ subl %ebx,%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+-.L020cbc_inp_aligned:
++.L021cbc_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+@@ -367,67 +431,450 @@ padlock_cbc_encrypt:
+ movl (%ebp),%edi
+ movl 12(%ebp),%ebx
+ testl $15,%edi
+- jz .L021cbc_out_aligned
++ jz .L022cbc_out_aligned
+ movl %ebx,%ecx
+- shrl $2,%ecx
+ leal (%esp),%esi
++ shrl $2,%ecx
+ .byte 243,165
+ subl %ebx,%edi
+-.L021cbc_out_aligned:
++.L022cbc_out_aligned:
+ movl 4(%ebp),%esi
+ movl 8(%ebp),%ecx
+ addl %ebx,%edi
+ addl %ebx,%esi
+ subl %ebx,%ecx
+ movl $512,%ebx
+- jnz .L019cbc_loop
++ jz .L023cbc_break
++ cmpl %ebx,%ecx
++ jae .L019cbc_loop
++.L020cbc_unaligned_tail:
++ xorl %eax,%eax
+ cmpl %ebp,%esp
+- je .L022cbc_done
++ cmovel %ecx,%eax
++ subl %eax,%esp
++ movl %edi,%eax
++ movl %ecx,%ebx
++ shrl $2,%ecx
++ leal (%esp),%edi
++.byte 243,165
++ movl %esp,%esi
++ movl %eax,%edi
++ movl %ebx,%ecx
++ jmp .L019cbc_loop
++.align 16
++.L023cbc_break:
++ cmpl %ebp,%esp
++ je .L024cbc_done
+ pxor %xmm0,%xmm0
+ leal (%esp),%eax
+-.L023cbc_bzero:
++.L025cbc_bzero:
+ movaps %xmm0,(%eax)
+ leal 16(%eax),%eax
+ cmpl %eax,%ebp
+- ja .L023cbc_bzero
+-.L022cbc_done:
++ ja .L025cbc_bzero
++.L024cbc_done:
++ movl 16(%ebp),%ebp
+ leal 24(%ebp),%esp
+- jmp .L024cbc_exit
++ jmp .L026cbc_exit
+ .align 16
+-.L017cbc_short:
++.L018cbc_aligned:
++ leal (%esi,%ecx,1),%ebp
++ negl %ebp
++ andl $4095,%ebp
+ xorl %eax,%eax
++ cmpl $64,%ebp
++ movl $63,%ebp
++ cmovael %eax,%ebp
++ andl %ecx,%ebp
++ subl %ebp,%ecx
++ jz .L027cbc_aligned_tail
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,208
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++ testl %ebp,%ebp
++ jz .L026cbc_exit
++.L027cbc_aligned_tail:
++ movl %ebp,%ecx
+ leal -24(%esp),%ebp
+- subl %ecx,%eax
++ movl %ebp,%esp
++ movl %ebp,%eax
++ subl %ecx,%esp
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ movl %edi,%eax
++ movl %ecx,%ebx
++ shrl $2,%ecx
++ leal (%esp),%edi
++.byte 243,165
++ movl %esp,%esi
++ movl %eax,%edi
++ movl %ebx,%ecx
++ jmp .L019cbc_loop
++.L026cbc_exit:
++ movl $1,%eax
++ leal 4(%esp),%esp
++.L016cbc_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin
++.globl padlock_cfb_encrypt
++.type padlock_cfb_encrypt,@function
++.align 16
++padlock_cfb_encrypt:
++.L_padlock_cfb_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz .L028cfb_abort
++ testl $15,%ecx
++ jnz .L028cfb_abort
++ leal .Lpadlock_saved_context-.L029cfb_pic_point,%eax
++ pushfl
++ cld
++ call _padlock_verify_ctx
++.L029cfb_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%edx)
++ jnz .L030cfb_aligned
++ testl $15,%edi
++ setz %al
++ testl $15,%esi
++ setz %bl
++ testl %ebx,%eax
++ jnz .L030cfb_aligned
++ negl %eax
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
+ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
+ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp .L031cfb_loop
++.align 16
++.L031cfb_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ testl $15,%edi
++ cmovnzl %esp,%edi
++ testl $15,%esi
++ jz .L032cfb_inp_aligned
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++ movl %ebx,%ecx
++ movl %edi,%esi
++.L032cfb_inp_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,224
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ testl $15,%edi
++ jz .L033cfb_out_aligned
++ movl %ebx,%ecx
++ leal (%esp),%esi
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++.L033cfb_out_aligned:
++ movl 4(%ebp),%esi
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz .L031cfb_loop
++ cmpl %ebp,%esp
++ je .L034cfb_done
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++.L035cfb_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja .L035cfb_bzero
++.L034cfb_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ jmp .L036cfb_exit
++.align 16
++.L030cfb_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,224
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++.L036cfb_exit:
++ movl $1,%eax
++ leal 4(%esp),%esp
++.L028cfb_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.size padlock_cfb_encrypt,.-.L_padlock_cfb_encrypt_begin
++.globl padlock_ofb_encrypt
++.type padlock_ofb_encrypt, at function
++.align 16
++padlock_ofb_encrypt:
++.L_padlock_ofb_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz .L037ofb_abort
++ testl $15,%ecx
++ jnz .L037ofb_abort
++ leal .Lpadlock_saved_context-.L038ofb_pic_point,%eax
++ pushfl
++ cld
++ call _padlock_verify_ctx
++.L038ofb_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
+ xorl %ebx,%ebx
+-.L025cbc_short_copy:
+- movups (%esi,%ebx,1),%xmm0
+- leal 16(%ebx),%ebx
++ testl $32,(%edx)
++ jnz .L039ofb_aligned
++ testl $15,%edi
++ setz %al
++ testl $15,%esi
++ setz %bl
++ testl %ebx,%eax
++ jnz .L039ofb_aligned
++ negl %eax
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
+ cmpl %ebx,%ecx
+- movaps %xmm0,-16(%esp,%ebx,1)
+- ja .L025cbc_short_copy
+- movl %esp,%esi
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
+ movl %ecx,%ebx
+- jmp .L019cbc_loop
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp .L040ofb_loop
+ .align 16
+-.L018cbc_aligned:
++.L040ofb_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ testl $15,%edi
++ cmovnzl %esp,%edi
++ testl $15,%esi
++ jz .L041ofb_inp_aligned
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++ movl %ebx,%ecx
++ movl %edi,%esi
++.L041ofb_inp_aligned:
+ leal -16(%edx),%eax
+ leal 16(%edx),%ebx
+ shrl $4,%ecx
+-.byte 243,15,167,208
++.byte 243,15,167,232
+ movaps (%eax),%xmm0
+ movaps %xmm0,-16(%edx)
+-.L024cbc_exit:
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ testl $15,%edi
++ jz .L042ofb_out_aligned
++ movl %ebx,%ecx
++ leal (%esp),%esi
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++.L042ofb_out_aligned:
++ movl 4(%ebp),%esi
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz .L040ofb_loop
++ cmpl %ebp,%esp
++ je .L043ofb_done
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++.L044ofb_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja .L044ofb_bzero
++.L043ofb_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ jmp .L045ofb_exit
++.align 16
++.L039ofb_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,232
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++.L045ofb_exit:
+ movl $1,%eax
+ leal 4(%esp),%esp
+-.L015cbc_abort:
++.L037ofb_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+-.size padlock_cbc_encrypt,.-.L_padlock_cbc_encrypt_begin
++.size padlock_ofb_encrypt,.-.L_padlock_ofb_encrypt_begin
++.globl padlock_ctr32_encrypt
++.type padlock_ctr32_encrypt,@function
++.align 16
++padlock_ctr32_encrypt:
++.L_padlock_ctr32_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz .L046ctr32_abort
++ testl $15,%ecx
++ jnz .L046ctr32_abort
++ leal .Lpadlock_saved_context-.L047ctr32_pic_point,%eax
++ pushfl
++ cld
++ call _padlock_verify_ctx
++.L047ctr32_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ movq -16(%edx),%mm0
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp .L048ctr32_loop
++.align 16
++.L048ctr32_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ movl -4(%edx),%ecx
++ xorl %edi,%edi
++ movl -8(%edx),%eax
++.L049ctr32_prepare:
++ movl %ecx,12(%esp,%edi,1)
++ bswap %ecx
++ movq %mm0,(%esp,%edi,1)
++ incl %ecx
++ movl %eax,8(%esp,%edi,1)
++ bswap %ecx
++ leal 16(%edi),%edi
++ cmpl %ebx,%edi
++ jb .L049ctr32_prepare
++ movl %ecx,-4(%edx)
++ leal (%esp),%esi
++ leal (%esp),%edi
++ movl %ebx,%ecx
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,200
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ movl 4(%ebp),%esi
++ xorl %ecx,%ecx
++.L050ctr32_xor:
++ movups (%esi,%ecx,1),%xmm1
++ leal 16(%ecx),%ecx
++ pxor -16(%esp,%ecx,1),%xmm1
++ movups %xmm1,-16(%edi,%ecx,1)
++ cmpl %ebx,%ecx
++ jb .L050ctr32_xor
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz .L048ctr32_loop
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++.L051ctr32_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja .L051ctr32_bzero
++.L052ctr32_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ movl $1,%eax
++ leal 4(%esp),%esp
++ emms
++.L046ctr32_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.size padlock_ctr32_encrypt,.-.L_padlock_ctr32_encrypt_begin
+ .globl padlock_xstore
+ .type padlock_xstore, at function
+ .align 16
+@@ -447,10 +894,10 @@ _win32_segv_handler:
+ movl 4(%esp),%edx
+ movl 12(%esp),%ecx
+ cmpl $3221225477,(%edx)
+- jne .L026ret
++ jne .L053ret
+ addl $4,184(%ecx)
+ movl $0,%eax
+-.L026ret:
++.L053ret:
+ ret
+ .size _win32_segv_handler,.-_win32_segv_handler
+ .globl padlock_sha1_oneshot
+diff --git a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
+index cfac705..eac88ae 100644
+--- a/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
++++ b/lib/accelerated/x86/macosx/appro-aes-gcm-x86-64-macosx.s
+@@ -699,6 +699,7 @@ L$ghash_epilogue:
+
+ .p2align 4
+ _gcm_init_clmul:
++L$_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+@@ -717,15 +718,15 @@ _gcm_init_clmul:
+ pxor %xmm5,%xmm2
+
+
++ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
++ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+-.byte 102,15,58,68,220,0
++.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+@@ -735,44 +736,134 @@ _gcm_init_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++ pxor %xmm4,%xmm0
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ pshufd $78,%xmm2,%xmm3
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm2,%xmm3
++ movdqu %xmm2,0(%rdi)
++ pxor %xmm0,%xmm4
++ movdqu %xmm0,16(%rdi)
++.byte 102,15,58,15,227,8
++ movdqu %xmm4,32(%rdi)
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,15,58,68,194,0
++.byte 102,15,58,68,202,17
++.byte 102,15,58,68,222,0
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++
++ movdqa %xmm3,%xmm4
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
++ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
++
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ movdqa %xmm0,%xmm5
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,15,58,68,194,0
++.byte 102,15,58,68,202,17
++.byte 102,15,58,68,222,0
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++
++ movdqa %xmm3,%xmm4
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
++ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
++
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- movdqu %xmm2,(%rdi)
+- movdqu %xmm0,16(%rdi)
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ pshufd $78,%xmm5,%xmm3
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm5,%xmm3
++ movdqu %xmm5,48(%rdi)
++ pxor %xmm0,%xmm4
++ movdqu %xmm0,64(%rdi)
++.byte 102,15,58,15,227,8
++ movdqu %xmm4,80(%rdi)
+ .byte 0xf3,0xc3
+
+ .globl _gcm_gmult_clmul
+
+ .p2align 4
+ _gcm_gmult_clmul:
++L$_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa L$bswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
++ movdqu 32(%rsi),%xmm4
+ .byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+ .byte 102,15,58,68,220,0
+@@ -785,186 +876,358 @@ _gcm_gmult_clmul:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ .byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+
+ .globl _gcm_ghash_clmul
+
+-.p2align 4
++.p2align 5
+ _gcm_ghash_clmul:
++L$_ghash_clmul:
+ movdqa L$bswap_mask(%rip),%xmm5
++ movq $11547335547999543296,%rax
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
++ movdqu 32(%rsi),%xmm10
+ .byte 102,15,56,0,197
+
+ subq $16,%rcx
+ jz L$odd_tail
+
+- movdqu 16(%rsi),%xmm8
++ movdqu 16(%rsi),%xmm9
++ cmpq $48,%rcx
++ jb L$skip4x
+
++ subq $48,%rcx
++ movdqu 48(%rsi),%xmm14
++ movdqu 64(%rsi),%xmm15
+
+
+
+
+- movdqu (%rdx),%xmm3
+- movdqu 16(%rdx),%xmm6
+-.byte 102,15,56,0,221
++ movdqu 48(%rdx),%xmm6
++ movdqu 32(%rdx),%xmm11
+ .byte 102,15,56,0,245
+- pxor %xmm3,%xmm0
+- movdqa %xmm6,%xmm7
+- pshufd $78,%xmm6,%xmm3
+- pshufd $78,%xmm2,%xmm4
+- pxor %xmm6,%xmm3
+- pxor %xmm2,%xmm4
++.byte 102,68,15,56,0,221
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm7
++ pxor %xmm6,%xmm7
+ .byte 102,15,58,68,242,0
+-.byte 102,15,58,68,250,17
+-.byte 102,15,58,68,220,0
+- pxor %xmm6,%xmm3
+- pxor %xmm7,%xmm3
++.byte 102,68,15,58,68,194,17
++.byte 102,65,15,58,68,250,0
++
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,217,0
++.byte 102,69,15,58,68,233,17
++ xorps %xmm11,%xmm6
++.byte 102,69,15,58,68,226,16
++ xorps %xmm13,%xmm8
++ movups 80(%rsi),%xmm10
++ xorps %xmm12,%xmm7
++
++ movdqu 16(%rdx),%xmm11
++ movdqu 0(%rdx),%xmm3
++.byte 102,68,15,56,0,221
++.byte 102,15,56,0,221
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm3,%xmm0
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,222,0
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm3
++ pxor %xmm0,%xmm3
++.byte 102,69,15,58,68,238,17
++ xorps %xmm11,%xmm6
++.byte 102,69,15,58,68,226,0
++ xorps %xmm13,%xmm8
++
++ leaq 64(%rdx),%rdx
++ subq $64,%rcx
++ jc L$tail4x
++
++ jmp L$mod4_loop
++.p2align 5
++L$mod4_loop:
++.byte 102,65,15,58,68,199,0
++ xorps %xmm12,%xmm7
++ movdqu 48(%rdx),%xmm11
++.byte 102,68,15,56,0,221
++.byte 102,65,15,58,68,207,17
++ xorps %xmm6,%xmm0
++ movdqu 32(%rdx),%xmm6
++ movdqa %xmm11,%xmm13
++ pshufd $78,%xmm11,%xmm12
++.byte 102,65,15,58,68,218,16
++ xorps %xmm8,%xmm1
++ pxor %xmm11,%xmm12
++.byte 102,15,56,0,245
++ movups 32(%rsi),%xmm10
++.byte 102,68,15,58,68,218,0
++ xorps %xmm7,%xmm3
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm7
+
++ pxor %xmm0,%xmm3
++ pxor %xmm6,%xmm7
++ pxor %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+- psrldq $8,%xmm3
++ pslldq $8,%xmm3
++.byte 102,68,15,58,68,234,17
++ psrldq $8,%xmm4
++ pxor %xmm3,%xmm0
++ movdqa L$7_mask(%rip),%xmm3
++ pxor %xmm4,%xmm1
++.byte 102,72,15,110,224
++
++ pand %xmm0,%xmm3
++.byte 102,15,56,0,227
++.byte 102,69,15,58,68,226,0
++ pxor %xmm0,%xmm4
++ psllq $57,%xmm4
++ movdqa %xmm4,%xmm3
+ pslldq $8,%xmm4
+- pxor %xmm3,%xmm7
+- pxor %xmm4,%xmm6
++.byte 102,65,15,58,68,241,0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ movdqu 0(%rdx),%xmm3
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++.byte 102,69,15,58,68,193,17
++ xorps %xmm11,%xmm6
++ movdqu 16(%rdx),%xmm11
++.byte 102,68,15,56,0,221
++.byte 102,65,15,58,68,250,16
++ xorps %xmm13,%xmm8
++ movups 80(%rsi),%xmm10
++.byte 102,15,56,0,221
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++
++ movdqa %xmm11,%xmm13
++ pxor %xmm12,%xmm7
++ pshufd $78,%xmm11,%xmm12
++ pxor %xmm11,%xmm12
++.byte 102,69,15,58,68,222,0
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ psrlq $1,%xmm0
++.byte 102,69,15,58,68,238,17
++ xorps %xmm11,%xmm6
++ pxor %xmm1,%xmm0
++
++.byte 102,69,15,58,68,226,0
++ xorps %xmm13,%xmm8
++
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm8,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm8,%xmm4
+
+- leaq 32(%rdx),%rdx
+- subq $32,%rcx
+- jbe L$even_tail
++ leaq 64(%rdx),%rdx
++ subq $64,%rcx
++ jnc L$mod4_loop
++
++L$tail4x:
++.byte 102,65,15,58,68,199,0
++ xorps %xmm12,%xmm7
++.byte 102,65,15,58,68,207,17
++ xorps %xmm6,%xmm0
++.byte 102,65,15,58,68,218,16
++ xorps %xmm8,%xmm1
++ pxor %xmm0,%xmm1
++ pxor %xmm7,%xmm3
+
+-L$mod_loop:
+-.byte 102,65,15,58,68,192,0
+-.byte 102,65,15,58,68,200,17
+-.byte 102,15,58,68,220,0
+- pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
++ pxor %xmm0,%xmm1
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- movdqu (%rdx),%xmm3
+- pxor %xmm6,%xmm0
+- pxor %xmm7,%xmm1
+
++ movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
++ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
++ psllq $57,%xmm0
++ movdqa %xmm0,%xmm3
++ pslldq $8,%xmm0
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++
++
++ movdqa %xmm0,%xmm4
++ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
++ pxor %xmm4,%xmm0
++ psrlq $1,%xmm0
++ pxor %xmm1,%xmm0
++ addq $64,%rcx
++ jz L$done
++ movdqu 32(%rsi),%xmm10
++ subq $16,%rcx
++ jz L$odd_tail
++L$skip4x:
++
++
++
++
++
++ movdqu (%rdx),%xmm3
+ movdqu 16(%rdx),%xmm6
+ .byte 102,15,56,0,221
+ .byte 102,15,56,0,245
++ pxor %xmm3,%xmm0
++
++ movdqa %xmm6,%xmm8
++ pshufd $78,%xmm6,%xmm3
++ pxor %xmm6,%xmm3
++.byte 102,15,58,68,242,0
++.byte 102,68,15,58,68,194,17
++.byte 102,65,15,58,68,218,0
++
++ leaq 32(%rdx),%rdx
++ subq $32,%rcx
++ jbe L$even_tail
++ jmp L$mod_loop
++
++.p2align 5
++L$mod_loop:
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm0,%xmm4
++
++.byte 102,65,15,58,68,193,0
++.byte 102,65,15,58,68,201,17
++.byte 102,65,15,58,68,226,16
++
++ pxor %xmm6,%xmm0
++ pxor %xmm8,%xmm1
++ movdqu (%rdx),%xmm8
++.byte 102,68,15,56,0,197
++ movdqu 16(%rdx),%xmm6
+
+- movdqa %xmm6,%xmm7
+- pshufd $78,%xmm6,%xmm9
+- pshufd $78,%xmm2,%xmm10
+- pxor %xmm6,%xmm9
+- pxor %xmm2,%xmm10
++ pxor %xmm0,%xmm3
++ pxor %xmm1,%xmm3
++ pxor %xmm8,%xmm1
++ pxor %xmm3,%xmm4
++.byte 102,15,56,0,245
++ movdqa %xmm4,%xmm3
++ psrldq $8,%xmm3
++ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
++ pxor %xmm4,%xmm0
+
++ movdqa %xmm6,%xmm8
++
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
+- pxor %xmm3,%xmm0
+ .byte 102,15,58,68,242,0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
++ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
++ pshufd $78,%xmm8,%xmm3
++ pxor %xmm8,%xmm3
+
+-.byte 102,15,58,68,250,17
++.byte 102,68,15,58,68,194,17
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
+-
+-.byte 102,69,15,58,68,202,0
+- movdqa %xmm0,%xmm1
+- pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm8,%xmm4
+- pxor %xmm0,%xmm3
+- pxor %xmm8,%xmm4
+-
+- pxor %xmm6,%xmm9
+- pxor %xmm7,%xmm9
+- movdqa %xmm9,%xmm10
+- psrldq $8,%xmm9
+- pslldq $8,%xmm10
+- pxor %xmm9,%xmm7
+- pxor %xmm10,%xmm6
++.byte 102,65,15,58,68,218,0
++ pxor %xmm1,%xmm0
+
+ leaq 32(%rdx),%rdx
+ subq $32,%rcx
+ ja L$mod_loop
+
+ L$even_tail:
+-.byte 102,65,15,58,68,192,0
+-.byte 102,65,15,58,68,200,17
+-.byte 102,15,58,68,220,0
++ movdqa %xmm0,%xmm1
++ pshufd $78,%xmm0,%xmm4
++ pxor %xmm0,%xmm4
++
++.byte 102,65,15,58,68,193,0
++.byte 102,65,15,58,68,201,17
++.byte 102,65,15,58,68,226,16
++
++ pxor %xmm6,%xmm0
++ pxor %xmm8,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+-
+- movdqa %xmm3,%xmm4
++ pxor %xmm3,%xmm4
++ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+- pxor %xmm6,%xmm0
+- pxor %xmm7,%xmm1
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ testq %rcx,%rcx
+ jnz L$done
+
+@@ -974,12 +1237,10 @@ L$odd_tail:
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+- pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+- pxor %xmm2,%xmm4
+ .byte 102,15,58,68,194,0
+ .byte 102,15,58,68,202,17
+-.byte 102,15,58,68,220,0
++.byte 102,65,15,58,68,218,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+@@ -989,38 +1250,60 @@ L$odd_tail:
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
++ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+- psllq $1,%xmm0
+- pxor %xmm3,%xmm0
+ psllq $5,%xmm0
++ pxor %xmm0,%xmm3
++ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+- movdqa %xmm0,%xmm4
++ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+- psrldq $8,%xmm4
+- pxor %xmm3,%xmm0
+- pxor %xmm4,%xmm1
++ psrldq $8,%xmm3
++ pxor %xmm4,%xmm0
++ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+- psrlq $5,%xmm0
+- pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
++ pxor %xmm4,%xmm1
++ pxor %xmm0,%xmm4
++ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+- pxor %xmm1,%xmm4
+ psrlq $1,%xmm0
+- pxor %xmm4,%xmm0
++ pxor %xmm1,%xmm0
+ L$done:
+ .byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+-L$SEH_end_gcm_ghash_clmul:
++
++.globl _gcm_init_avx
++
++.p2align 5
++_gcm_init_avx:
++ jmp L$_init_clmul
++
++.globl _gcm_gmult_avx
++
++.p2align 5
++_gcm_gmult_avx:
++ jmp L$_gmult_clmul
++
++.globl _gcm_ghash_avx
++
++.p2align 5
++_gcm_ghash_avx:
++ jmp L$_ghash_clmul
+
+ .p2align 6
+ L$bswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ L$0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
++L$7_mask:
++.long 7,0,7,0
++L$7_mask_poly:
++.long 7,0,450,0
+ .p2align 6
+
+ L$rem_4bit:
+diff --git a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
+index a82f0a5..e2cfa17 100644
+--- a/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
++++ b/lib/accelerated/x86/macosx/appro-aes-x86-64-macosx.s
+@@ -927,199 +927,412 @@ L$oop_enc1_6:
+
+ .p2align 4
+ _aesni_ctr32_encrypt_blocks:
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $128,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
++
+ cmpq $1,%rdx
+ je L$ctr32_one_shortcut
+
+- movdqu (%r8),%xmm14
+- movdqa L$bswap_mask(%rip),%xmm15
+- xorl %eax,%eax
+-.byte 102,69,15,58,22,242,3
+-.byte 102,68,15,58,34,240,3
++ movdqu (%r8),%xmm2
++ movdqu (%rcx),%xmm0
++ movl 12(%r8),%r8d
++ pxor %xmm0,%xmm2
++ movl 12(%rcx),%r11d
++ movdqa %xmm2,0(%rsp)
++ bswapl %r8d
++ movdqa %xmm2,%xmm3
++ movdqa %xmm2,%xmm4
++ movdqa %xmm2,%xmm5
++ movdqa %xmm2,64(%rsp)
++ movdqa %xmm2,80(%rsp)
++ movdqa %xmm2,96(%rsp)
++ movdqa %xmm2,112(%rsp)
+
+ movl 240(%rcx),%eax
++
++ leaq 1(%r8),%r9
++ leaq 2(%r8),%r10
++ bswapl %r9d
+ bswapl %r10d
+- pxor %xmm12,%xmm12
+- pxor %xmm13,%xmm13
+-.byte 102,69,15,58,34,226,0
+- leaq 3(%r10),%r11
+-.byte 102,69,15,58,34,235,0
+- incl %r10d
+-.byte 102,69,15,58,34,226,1
+- incq %r11
+-.byte 102,69,15,58,34,235,1
+- incl %r10d
+-.byte 102,69,15,58,34,226,2
+- incq %r11
+-.byte 102,69,15,58,34,235,2
+- movdqa %xmm12,-40(%rsp)
+-.byte 102,69,15,56,0,231
+- movdqa %xmm13,-24(%rsp)
+-.byte 102,69,15,56,0,239
+-
+- pshufd $192,%xmm12,%xmm2
+- pshufd $128,%xmm12,%xmm3
+- pshufd $64,%xmm12,%xmm4
+- cmpq $6,%rdx
+- jb L$ctr32_tail
+- shrl $1,%eax
+- movq %rcx,%r11
+- movl %eax,%r10d
+- subq $6,%rdx
+- jmp L$ctr32_loop6
++ xorl %r11d,%r9d
++ xorl %r11d,%r10d
++.byte 102,65,15,58,34,217,3
++ leaq 3(%r8),%r9
++ movdqa %xmm3,16(%rsp)
++.byte 102,65,15,58,34,226,3
++ bswapl %r9d
++ leaq 4(%r8),%r10
++ movdqa %xmm4,32(%rsp)
++ xorl %r11d,%r9d
++ bswapl %r10d
++.byte 102,65,15,58,34,233,3
++ xorl %r11d,%r10d
++ movdqa %xmm5,48(%rsp)
++ leaq 5(%r8),%r9
++ movl %r10d,64+12(%rsp)
++ bswapl %r9d
++ leaq 6(%r8),%r10
++ xorl %r11d,%r9d
++ bswapl %r10d
++ movl %r9d,80+12(%rsp)
++ xorl %r11d,%r10d
++ leaq 7(%r8),%r9
++ movl %r10d,96+12(%rsp)
++ bswapl %r9d
++ xorl %r11d,%r9d
++ movl %r9d,112+12(%rsp)
+
+-.p2align 4
+-L$ctr32_loop6:
+- pshufd $192,%xmm13,%xmm5
+- por %xmm14,%xmm2
+- movups (%r11),%xmm0
+- pshufd $128,%xmm13,%xmm6
+- por %xmm14,%xmm3
+- movups 16(%r11),%xmm1
+- pshufd $64,%xmm13,%xmm7
+- por %xmm14,%xmm4
+- por %xmm14,%xmm5
+- xorps %xmm0,%xmm2
+- por %xmm14,%xmm6
+- por %xmm14,%xmm7
++ movups 16(%rcx),%xmm1
+
++ movdqa 64(%rsp),%xmm6
++ movdqa 80(%rsp),%xmm7
+
++ cmpq $8,%rdx
++ jb L$ctr32_tail
+
++ leaq 128(%rcx),%rcx
++ subq $8,%rdx
++ jmp L$ctr32_loop8
+
+- pxor %xmm0,%xmm3
++.p2align 5
++L$ctr32_loop8:
++ addl $8,%r8d
++ movdqa 96(%rsp),%xmm8
+ .byte 102,15,56,220,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++ movl %r8d,%r9d
++ movdqa 112(%rsp),%xmm9
+ .byte 102,15,56,220,217
+- movdqa L$increment32(%rip),%xmm13
+- pxor %xmm0,%xmm5
++ bswapl %r9d
++ movups 32-128(%rcx),%xmm0
+ .byte 102,15,56,220,225
+- movdqa -40(%rsp),%xmm12
+- pxor %xmm0,%xmm6
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++ movl %r9d,0+12(%rsp)
++ leaq 1(%r8),%r9
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+- jmp L$ctr32_enc_loop6_enter
+-.p2align 4
+-L$ctr32_enc_loop6:
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 48-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,16+12(%rsp)
++ leaq 2(%r8),%r9
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 64-128(%rcx),%xmm0
+ .byte 102,15,56,220,209
+ .byte 102,15,56,220,217
+- decl %eax
++ bswapl %r9d
+ .byte 102,15,56,220,225
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,233
++ movl %r9d,32+12(%rsp)
++ leaq 3(%r8),%r9
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+-L$ctr32_enc_loop6_enter:
+- movups 16(%rcx),%xmm1
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 80-128(%rcx),%xmm1
+ .byte 102,15,56,220,208
+ .byte 102,15,56,220,216
+- leaq 32(%rcx),%rcx
++ bswapl %r9d
+ .byte 102,15,56,220,224
++ xorl %r11d,%r9d
+ .byte 102,15,56,220,232
++ movl %r9d,48+12(%rsp)
++ leaq 4(%r8),%r9
+ .byte 102,15,56,220,240
+ .byte 102,15,56,220,248
+- movups (%rcx),%xmm0
+- jnz L$ctr32_enc_loop6
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 96-128(%rcx),%xmm0
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++ bswapl %r9d
++.byte 102,15,56,220,225
++ xorl %r11d,%r9d
++.byte 102,15,56,220,233
++ movl %r9d,64+12(%rsp)
++ leaq 5(%r8),%r9
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 112-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,80+12(%rsp)
++ leaq 6(%r8),%r9
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 128-128(%rcx),%xmm0
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++ bswapl %r9d
++.byte 102,15,56,220,225
++ xorl %r11d,%r9d
++.byte 102,15,56,220,233
++ movl %r9d,96+12(%rsp)
++ leaq 7(%r8),%r9
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 144-128(%rcx),%xmm1
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++ bswapl %r9d
++.byte 102,15,56,220,224
++ xorl %r11d,%r9d
++.byte 102,15,56,220,232
++ movl %r9d,112+12(%rsp)
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++ movdqu 0(%rdi),%xmm10
++.byte 102,68,15,56,220,200
++ movups 160-128(%rcx),%xmm0
++
++ cmpl $11,%eax
++ jb L$ctr32_enc_done
+
+ .byte 102,15,56,220,209
+- paddd %xmm13,%xmm12
+ .byte 102,15,56,220,217
+- paddd -24(%rsp),%xmm13
+ .byte 102,15,56,220,225
+- movdqa %xmm12,-40(%rsp)
+ .byte 102,15,56,220,233
+- movdqa %xmm13,-24(%rsp)
+ .byte 102,15,56,220,241
+-.byte 102,69,15,56,0,231
+ .byte 102,15,56,220,249
+-.byte 102,69,15,56,0,239
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 176-128(%rcx),%xmm1
+
+-.byte 102,15,56,221,208
+- movups (%rdi),%xmm8
+-.byte 102,15,56,221,216
+- movups 16(%rdi),%xmm9
+-.byte 102,15,56,221,224
+- movups 32(%rdi),%xmm10
+-.byte 102,15,56,221,232
+- movups 48(%rdi),%xmm11
+-.byte 102,15,56,221,240
+- movups 64(%rdi),%xmm1
+-.byte 102,15,56,221,248
+- movups 80(%rdi),%xmm0
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 192-128(%rcx),%xmm0
++ je L$ctr32_enc_done
+
+- xorps %xmm2,%xmm8
+- pshufd $192,%xmm12,%xmm2
+- xorps %xmm3,%xmm9
+- pshufd $128,%xmm12,%xmm3
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- pshufd $64,%xmm12,%xmm4
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- xorps %xmm6,%xmm1
+- movups %xmm11,48(%rsi)
+- xorps %xmm7,%xmm0
+- movups %xmm1,64(%rsi)
+- movups %xmm0,80(%rsi)
+- leaq 96(%rsi),%rsi
+- movl %r10d,%eax
+- subq $6,%rdx
+- jnc L$ctr32_loop6
++.byte 102,15,56,220,209
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movups 208-128(%rcx),%xmm1
++
++.byte 102,15,56,220,208
++.byte 102,15,56,220,216
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++.byte 102,15,56,220,240
++.byte 102,15,56,220,248
++.byte 102,68,15,56,220,192
++.byte 102,68,15,56,220,200
++ movups 224-128(%rcx),%xmm0
++
++L$ctr32_enc_done:
++ movdqu 16(%rdi),%xmm11
++ pxor %xmm0,%xmm10
++ movdqu 32(%rdi),%xmm12
++ pxor %xmm0,%xmm11
++ movdqu 48(%rdi),%xmm13
++ pxor %xmm0,%xmm12
++ movdqu 64(%rdi),%xmm14
++ pxor %xmm0,%xmm13
++ movdqu 80(%rdi),%xmm15
++ pxor %xmm0,%xmm14
++.byte 102,15,56,220,209
++ pxor %xmm0,%xmm15
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++.byte 102,68,15,56,220,193
++.byte 102,68,15,56,220,201
++ movdqu 96(%rdi),%xmm1
++
++.byte 102,65,15,56,221,210
++ pxor %xmm0,%xmm1
++ movdqu 112(%rdi),%xmm10
++ leaq 128(%rdi),%rdi
++.byte 102,65,15,56,221,219
++ pxor %xmm0,%xmm10
++ movdqa 0(%rsp),%xmm11
++.byte 102,65,15,56,221,228
++ movdqa 16(%rsp),%xmm12
++.byte 102,65,15,56,221,237
++ movdqa 32(%rsp),%xmm13
++.byte 102,65,15,56,221,246
++ movdqa 48(%rsp),%xmm14
++.byte 102,65,15,56,221,255
++ movdqa 64(%rsp),%xmm15
++.byte 102,68,15,56,221,193
++ movdqa 80(%rsp),%xmm0
++.byte 102,69,15,56,221,202
++ movups 16-128(%rcx),%xmm1
++
++ movups %xmm2,(%rsi)
++ movdqa %xmm11,%xmm2
++ movups %xmm3,16(%rsi)
++ movdqa %xmm12,%xmm3
++ movups %xmm4,32(%rsi)
++ movdqa %xmm13,%xmm4
++ movups %xmm5,48(%rsi)
++ movdqa %xmm14,%xmm5
++ movups %xmm6,64(%rsi)
++ movdqa %xmm15,%xmm6
++ movups %xmm7,80(%rsi)
++ movdqa %xmm0,%xmm7
++ movups %xmm8,96(%rsi)
++ movups %xmm9,112(%rsi)
++ leaq 128(%rsi),%rsi
++
++ subq $8,%rdx
++ jnc L$ctr32_loop8
+
+- addq $6,%rdx
++ addq $8,%rdx
+ jz L$ctr32_done
+- movq %r11,%rcx
+- leal 1(%rax,%rax,1),%eax
++ leaq -128(%rcx),%rcx
+
+ L$ctr32_tail:
+- por %xmm14,%xmm2
+- movups (%rdi),%xmm8
+- cmpq $2,%rdx
+- jb L$ctr32_one
++ leaq 16(%rcx),%rcx
++ cmpq $4,%rdx
++ jb L$ctr32_loop3
++ je L$ctr32_loop4
+
+- por %xmm14,%xmm3
+- movups 16(%rdi),%xmm9
+- je L$ctr32_two
++ movdqa 96(%rsp),%xmm8
++ pxor %xmm9,%xmm9
+
+- pshufd $192,%xmm13,%xmm5
+- por %xmm14,%xmm4
+- movups 32(%rdi),%xmm10
+- cmpq $4,%rdx
+- jb L$ctr32_three
++ movups 16(%rcx),%xmm0
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++ shrl $1,%eax
++.byte 102,15,56,220,225
++ decl %eax
++.byte 102,15,56,220,233
++ movups (%rdi),%xmm10
++.byte 102,15,56,220,241
++ movups 16(%rdi),%xmm11
++.byte 102,15,56,220,249
++ movups 32(%rdi),%xmm12
++.byte 102,68,15,56,220,193
++ movups 16(%rcx),%xmm1
+
+- pshufd $128,%xmm13,%xmm6
+- por %xmm14,%xmm5
+- movups 48(%rdi),%xmm11
+- je L$ctr32_four
++ call L$enc_loop8_enter
+
+- por %xmm14,%xmm6
+- xorps %xmm7,%xmm7
++ movdqu 48(%rdi),%xmm13
++ pxor %xmm10,%xmm2
++ movdqu 64(%rdi),%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm10,%xmm6
++ movdqu %xmm5,48(%rsi)
++ movdqu %xmm6,64(%rsi)
++ cmpq $6,%rdx
++ jb L$ctr32_done
+
+- call _aesni_encrypt6
++ movups 80(%rdi),%xmm11
++ xorps %xmm11,%xmm7
++ movups %xmm7,80(%rsi)
++ je L$ctr32_done
+
+- movups 64(%rdi),%xmm1
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- xorps %xmm6,%xmm1
+- movups %xmm11,48(%rsi)
+- movups %xmm1,64(%rsi)
++ movups 96(%rdi),%xmm12
++ xorps %xmm12,%xmm8
++ movups %xmm8,96(%rsi)
++ jmp L$ctr32_done
++
++.p2align 5
++L$ctr32_loop4:
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++ movups (%rcx),%xmm1
++ decl %eax
++ jnz L$ctr32_loop4
++.byte 102,15,56,221,209
++ movups (%rdi),%xmm10
++.byte 102,15,56,221,217
++ movups 16(%rdi),%xmm11
++.byte 102,15,56,221,225
++ movups 32(%rdi),%xmm12
++.byte 102,15,56,221,233
++ movups 48(%rdi),%xmm13
++
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
++ xorps %xmm11,%xmm3
++ movups %xmm3,16(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm5,48(%rsi)
++ jmp L$ctr32_done
++
++.p2align 5
++L$ctr32_loop3:
++.byte 102,15,56,220,209
++ leaq 16(%rcx),%rcx
++.byte 102,15,56,220,217
++.byte 102,15,56,220,225
++ movups (%rcx),%xmm1
++ decl %eax
++ jnz L$ctr32_loop3
++.byte 102,15,56,221,209
++.byte 102,15,56,221,217
++.byte 102,15,56,221,225
++
++ movups (%rdi),%xmm10
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
++ cmpq $2,%rdx
++ jb L$ctr32_done
++
++ movups 16(%rdi),%xmm11
++ xorps %xmm11,%xmm3
++ movups %xmm3,16(%rsi)
++ je L$ctr32_done
++
++ movups 32(%rdi),%xmm12
++ xorps %xmm12,%xmm4
++ movups %xmm4,32(%rsi)
+ jmp L$ctr32_done
+
+ .p2align 4
+ L$ctr32_one_shortcut:
+ movups (%r8),%xmm2
+- movups (%rdi),%xmm8
++ movups (%rdi),%xmm10
+ movl 240(%rcx),%eax
+-L$ctr32_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+@@ -1131,51 +1344,26 @@ L$oop_enc1_7:
+ leaq 16(%rcx),%rcx
+ jnz L$oop_enc1_7
+ .byte 102,15,56,221,209
+- xorps %xmm2,%xmm8
+- movups %xmm8,(%rsi)
+- jmp L$ctr32_done
+-
+-.p2align 4
+-L$ctr32_two:
+- xorps %xmm4,%xmm4
+- call _aesni_encrypt3
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- movups %xmm9,16(%rsi)
+- jmp L$ctr32_done
+-
+-.p2align 4
+-L$ctr32_three:
+- call _aesni_encrypt3
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- movups %xmm10,32(%rsi)
++ xorps %xmm10,%xmm2
++ movups %xmm2,(%rsi)
+ jmp L$ctr32_done
+
+ .p2align 4
+-L$ctr32_four:
+- call _aesni_encrypt4
+- xorps %xmm2,%xmm8
+- xorps %xmm3,%xmm9
+- movups %xmm8,(%rsi)
+- xorps %xmm4,%xmm10
+- movups %xmm9,16(%rsi)
+- xorps %xmm5,%xmm11
+- movups %xmm10,32(%rsi)
+- movups %xmm11,48(%rsi)
+-
+ L$ctr32_done:
++ leaq (%rbp),%rsp
++ popq %rbp
++L$ctr32_epilogue:
+ .byte 0xf3,0xc3
+
+ .globl _aesni_xts_encrypt
+
+ .p2align 4
+ _aesni_xts_encrypt:
+- leaq -104(%rsp),%rsp
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $112,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+@@ -1190,228 +1378,266 @@ L$oop_enc1_8:
+ leaq 16(%r8),%r8
+ jnz L$oop_enc1_8
+ .byte 102,68,15,56,221,249
++ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
++ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
++ movups 16(%rcx,%r10,1),%xmm1
++ movl %eax,%r10d
++
+ movdqa L$xts_magic(%rip),%xmm8
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pshufd $95,%xmm15,%xmm9
++ pxor %xmm0,%xmm1
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm10
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm11
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm12
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
++ psrad $31,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm13
++ pxor %xmm14,%xmm15
++ movdqa %xmm15,%xmm14
++ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
++ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
++ movaps %xmm1,96(%rsp)
++
+ subq $96,%rdx
+ jc L$xts_enc_short
+
+ shrl $1,%eax
+- subl $1,%eax
++ subl $3,%eax
++ movups 16(%r11),%xmm1
+ movl %eax,%r10d
++ leaq L$xts_magic(%rip),%r8
+ jmp L$xts_enc_grandloop
+
+-.p2align 4
++.p2align 5
+ L$xts_enc_grandloop:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+- pand %xmm8,%xmm9
++ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+- movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+- movdqu 48(%rdi),%xmm5
++ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+- movdqu 64(%rdi),%xmm6
++.byte 102,15,56,220,209
++ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+- movdqu 80(%rdi),%xmm7
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,220,217
++ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+- movups (%r11),%xmm0
++.byte 102,15,56,220,225
++ movdqu 80(%rdi),%xmm7
++ pxor %xmm15,%xmm8
++ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+- pxor %xmm15,%xmm7
+-
+-
++.byte 102,15,56,220,233
++ movups 32(%r11),%xmm0
++ leaq 96(%rdi),%rdi
++ pxor %xmm8,%xmm7
+
+- movups 16(%r11),%xmm1
+- pxor %xmm0,%xmm2
+- pxor %xmm0,%xmm3
++ pxor %xmm9,%xmm10
++.byte 102,15,56,220,241
++ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+-.byte 102,15,56,220,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++.byte 102,15,56,220,249
++ movups 48(%r11),%xmm1
++
++.byte 102,15,56,220,208
++ pxor %xmm9,%xmm12
+ movdqa %xmm11,16(%rsp)
+-.byte 102,15,56,220,217
+- pxor %xmm0,%xmm5
++.byte 102,15,56,220,216
++ pxor %xmm9,%xmm13
+ movdqa %xmm12,32(%rsp)
+-.byte 102,15,56,220,225
+- pxor %xmm0,%xmm6
+- movdqa %xmm13,48(%rsp)
+-.byte 102,15,56,220,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++.byte 102,15,56,220,224
++ pxor %xmm9,%xmm14
++.byte 102,15,56,220,232
++ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+-.byte 102,15,56,220,241
+- movdqa %xmm15,80(%rsp)
+-.byte 102,15,56,220,249
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- jmp L$xts_enc_loop6_enter
+-
+-.p2align 4
++.byte 102,15,56,220,240
++ movdqa %xmm8,80(%rsp)
++.byte 102,15,56,220,248
++ movups 64(%r11),%xmm0
++ leaq 64(%r11),%rcx
++ pshufd $95,%xmm15,%xmm9
++ jmp L$xts_enc_loop6
++.p2align 5
+ L$xts_enc_loop6:
+ .byte 102,15,56,220,209
+ .byte 102,15,56,220,217
+- decl %eax
+ .byte 102,15,56,220,225
+ .byte 102,15,56,220,233
+ .byte 102,15,56,220,241
+ .byte 102,15,56,220,249
+-L$xts_enc_loop6_enter:
+ movups 16(%rcx),%xmm1
++ leaq 32(%rcx),%rcx
++
+ .byte 102,15,56,220,208
+ .byte 102,15,56,220,216
+- leaq 32(%rcx),%rcx
+ .byte 102,15,56,220,224
+ .byte 102,15,56,220,232
+ .byte 102,15,56,220,240
+ .byte 102,15,56,220,248
+ movups (%rcx),%xmm0
++ decl %eax
+ jnz L$xts_enc_loop6
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- paddq %xmm15,%xmm15
++ movdqa (%r8),%xmm8
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,209
+- pand %xmm8,%xmm9
++ paddq %xmm15,%xmm15
++ psrad $31,%xmm14
+ .byte 102,15,56,220,217
+- pcmpgtd %xmm15,%xmm14
++ pand %xmm8,%xmm14
++ movups (%r11),%xmm10
+ .byte 102,15,56,220,225
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,220,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,241
++ movaps %xmm10,%xmm11
+ .byte 102,15,56,220,249
+ movups 16(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm10
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,208
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm10
++ psrad $31,%xmm14
+ .byte 102,15,56,220,216
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,220,224
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,220,232
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,240
++ movaps %xmm11,%xmm12
+ .byte 102,15,56,220,248
+ movups 32(%rcx),%xmm0
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm11
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,220,209
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm11
++ psrad $31,%xmm14
+ .byte 102,15,56,220,217
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,220,225
+- pxor %xmm9,%xmm15
++ movdqa %xmm13,48(%rsp)
+ .byte 102,15,56,220,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,220,241
++ movaps %xmm12,%xmm13
+ .byte 102,15,56,220,249
++ movups 48(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm12
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
++.byte 102,15,56,220,208
++ pxor %xmm15,%xmm12
++ psrad $31,%xmm14
++.byte 102,15,56,220,216
+ paddq %xmm15,%xmm15
+-.byte 102,15,56,221,208
+- pand %xmm8,%xmm9
+-.byte 102,15,56,221,216
+- pcmpgtd %xmm15,%xmm14
+-.byte 102,15,56,221,224
+- pxor %xmm9,%xmm15
+-.byte 102,15,56,221,232
+-.byte 102,15,56,221,240
+-.byte 102,15,56,221,248
++ pand %xmm8,%xmm14
++.byte 102,15,56,220,224
++.byte 102,15,56,220,232
++ pxor %xmm14,%xmm15
++.byte 102,15,56,220,240
++ movaps %xmm13,%xmm14
++.byte 102,15,56,220,248
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm13
++ movdqa %xmm9,%xmm0
++ paddd %xmm9,%xmm9
++.byte 102,15,56,220,209
++ pxor %xmm15,%xmm13
++ psrad $31,%xmm0
++.byte 102,15,56,220,217
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm0
++.byte 102,15,56,220,225
++.byte 102,15,56,220,233
++ pxor %xmm0,%xmm15
++ movups (%r11),%xmm0
++.byte 102,15,56,220,241
++.byte 102,15,56,220,249
++ movups 16(%r11),%xmm1
++
++ pxor %xmm15,%xmm14
++ psrad $31,%xmm9
++.byte 102,15,56,221,84,36,0
+ paddq %xmm15,%xmm15
+- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+- xorps 16(%rsp),%xmm3
+- pcmpgtd %xmm15,%xmm14
++.byte 102,15,56,221,92,36,16
++.byte 102,15,56,221,100,36,32
+ pxor %xmm9,%xmm15
+-
+- xorps 32(%rsp),%xmm4
+- movups %xmm2,0(%rsi)
+- xorps 48(%rsp),%xmm5
+- movups %xmm3,16(%rsi)
+- xorps 64(%rsp),%xmm6
+- movups %xmm4,32(%rsi)
+- xorps 80(%rsp),%xmm7
+- movups %xmm5,48(%rsi)
++.byte 102,15,56,221,108,36,48
++.byte 102,15,56,221,116,36,64
++.byte 102,15,56,221,124,36,80
+ movl %r10d,%eax
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
++
+ leaq 96(%rsi),%rsi
++ movups %xmm2,-96(%rsi)
++ movups %xmm3,-80(%rsi)
++ movups %xmm4,-64(%rsi)
++ movups %xmm5,-48(%rsi)
++ movups %xmm6,-32(%rsi)
++ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc L$xts_enc_grandloop
+
+- leal 3(%rax,%rax,1),%eax
++ leal 7(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+ L$xts_enc_short:
++ pxor %xmm0,%xmm10
+ addq $96,%rdx
+ jz L$xts_enc_done
+
++ pxor %xmm0,%xmm11
+ cmpq $32,%rdx
+ jb L$xts_enc_one
++ pxor %xmm0,%xmm12
+ je L$xts_enc_two
+
++ pxor %xmm0,%xmm13
+ cmpq $64,%rdx
+ jb L$xts_enc_three
++ pxor %xmm0,%xmm14
+ je L$xts_enc_four
+
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+@@ -1514,15 +1740,15 @@ L$xts_enc_four:
+
+ call _aesni_encrypt4
+
+- xorps %xmm10,%xmm2
+- movdqa %xmm15,%xmm10
+- xorps %xmm11,%xmm3
+- xorps %xmm12,%xmm4
+- movups %xmm2,(%rsi)
+- xorps %xmm13,%xmm5
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
++ pxor %xmm10,%xmm2
++ movdqa %xmm14,%xmm10
++ pxor %xmm11,%xmm3
++ pxor %xmm12,%xmm4
++ movdqu %xmm2,(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm3,16(%rsi)
++ movdqu %xmm4,32(%rsi)
++ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp L$xts_enc_done
+
+@@ -1563,7 +1789,8 @@ L$oop_enc1_10:
+ movups %xmm2,-16(%rsi)
+
+ L$xts_enc_ret:
+- leaq 104(%rsp),%rsp
++ leaq (%rbp),%rsp
++ popq %rbp
+ L$xts_enc_epilogue:
+ .byte 0xf3,0xc3
+
+@@ -1571,7 +1798,11 @@ L$xts_enc_epilogue:
+
+ .p2align 4
+ _aesni_xts_decrypt:
+- leaq -104(%rsp),%rsp
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $112,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
+ movups (%r9),%xmm15
+ movl 240(%r8),%eax
+ movl 240(%rcx),%r10d
+@@ -1592,228 +1823,266 @@ L$oop_enc1_11:
+ shlq $4,%rax
+ subq %rax,%rdx
+
++ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %r10d,%eax
++ shll $4,%r10d
+ movq %rdx,%r9
+ andq $-16,%rdx
+
++ movups 16(%rcx,%r10,1),%xmm1
++ movl %eax,%r10d
++
+ movdqa L$xts_magic(%rip),%xmm8
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pshufd $95,%xmm15,%xmm9
++ pxor %xmm0,%xmm1
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm10
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm10
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm11
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm11
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm12
++ psrad $31,%xmm14
+ paddq %xmm15,%xmm15
+- pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
+- pxor %xmm9,%xmm15
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm12
++ pxor %xmm14,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ movdqa %xmm15,%xmm13
++ psrad $31,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
++ pxor %xmm0,%xmm13
++ pxor %xmm14,%xmm15
++ movdqa %xmm15,%xmm14
++ psrad $31,%xmm9
+ paddq %xmm15,%xmm15
+ pand %xmm8,%xmm9
+- pcmpgtd %xmm15,%xmm14
++ pxor %xmm0,%xmm14
+ pxor %xmm9,%xmm15
++ movaps %xmm1,96(%rsp)
++
+ subq $96,%rdx
+ jc L$xts_dec_short
+
+ shrl $1,%eax
+- subl $1,%eax
++ subl $3,%eax
++ movups 16(%r11),%xmm1
+ movl %eax,%r10d
++ leaq L$xts_magic(%rip),%r8
+ jmp L$xts_dec_grandloop
+
+-.p2align 4
++.p2align 5
+ L$xts_dec_grandloop:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu 0(%rdi),%xmm2
+- pand %xmm8,%xmm9
++ movdqa %xmm0,%xmm8
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+- movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+- movdqu 48(%rdi),%xmm5
++ movdqu 32(%rdi),%xmm4
+ pxor %xmm11,%xmm3
+- movdqu 64(%rdi),%xmm6
++.byte 102,15,56,222,209
++ movdqu 48(%rdi),%xmm5
+ pxor %xmm12,%xmm4
+- movdqu 80(%rdi),%xmm7
+- leaq 96(%rdi),%rdi
++.byte 102,15,56,222,217
++ movdqu 64(%rdi),%xmm6
+ pxor %xmm13,%xmm5
+- movups (%r11),%xmm0
++.byte 102,15,56,222,225
++ movdqu 80(%rdi),%xmm7
++ pxor %xmm15,%xmm8
++ movdqa 96(%rsp),%xmm9
+ pxor %xmm14,%xmm6
+- pxor %xmm15,%xmm7
+-
+-
++.byte 102,15,56,222,233
++ movups 32(%r11),%xmm0
++ leaq 96(%rdi),%rdi
++ pxor %xmm8,%xmm7
+
+- movups 16(%r11),%xmm1
+- pxor %xmm0,%xmm2
+- pxor %xmm0,%xmm3
++ pxor %xmm9,%xmm10
++.byte 102,15,56,222,241
++ pxor %xmm9,%xmm11
+ movdqa %xmm10,0(%rsp)
+-.byte 102,15,56,222,209
+- leaq 32(%r11),%rcx
+- pxor %xmm0,%xmm4
++.byte 102,15,56,222,249
++ movups 48(%r11),%xmm1
++
++.byte 102,15,56,222,208
++ pxor %xmm9,%xmm12
+ movdqa %xmm11,16(%rsp)
+-.byte 102,15,56,222,217
+- pxor %xmm0,%xmm5
++.byte 102,15,56,222,216
++ pxor %xmm9,%xmm13
+ movdqa %xmm12,32(%rsp)
+-.byte 102,15,56,222,225
+- pxor %xmm0,%xmm6
+- movdqa %xmm13,48(%rsp)
+-.byte 102,15,56,222,233
+- pxor %xmm0,%xmm7
+- movups (%rcx),%xmm0
+- decl %eax
++.byte 102,15,56,222,224
++ pxor %xmm9,%xmm14
++.byte 102,15,56,222,232
++ pxor %xmm9,%xmm8
+ movdqa %xmm14,64(%rsp)
+-.byte 102,15,56,222,241
+- movdqa %xmm15,80(%rsp)
+-.byte 102,15,56,222,249
+- pxor %xmm14,%xmm14
+- pcmpgtd %xmm15,%xmm14
+- jmp L$xts_dec_loop6_enter
+-
+-.p2align 4
++.byte 102,15,56,222,240
++ movdqa %xmm8,80(%rsp)
++.byte 102,15,56,222,248
++ movups 64(%r11),%xmm0
++ leaq 64(%r11),%rcx
++ pshufd $95,%xmm15,%xmm9
++ jmp L$xts_dec_loop6
++.p2align 5
+ L$xts_dec_loop6:
+ .byte 102,15,56,222,209
+ .byte 102,15,56,222,217
+- decl %eax
+ .byte 102,15,56,222,225
+ .byte 102,15,56,222,233
+ .byte 102,15,56,222,241
+ .byte 102,15,56,222,249
+-L$xts_dec_loop6_enter:
+ movups 16(%rcx),%xmm1
++ leaq 32(%rcx),%rcx
++
+ .byte 102,15,56,222,208
+ .byte 102,15,56,222,216
+- leaq 32(%rcx),%rcx
+ .byte 102,15,56,222,224
+ .byte 102,15,56,222,232
+ .byte 102,15,56,222,240
+ .byte 102,15,56,222,248
+ movups (%rcx),%xmm0
++ decl %eax
+ jnz L$xts_dec_loop6
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- paddq %xmm15,%xmm15
++ movdqa (%r8),%xmm8
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,209
+- pand %xmm8,%xmm9
++ paddq %xmm15,%xmm15
++ psrad $31,%xmm14
+ .byte 102,15,56,222,217
+- pcmpgtd %xmm15,%xmm14
++ pand %xmm8,%xmm14
++ movups (%r11),%xmm10
+ .byte 102,15,56,222,225
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,222,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,241
++ movaps %xmm10,%xmm11
+ .byte 102,15,56,222,249
+ movups 16(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm10
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,208
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm10
++ psrad $31,%xmm14
+ .byte 102,15,56,222,216
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,222,224
+- pxor %xmm9,%xmm15
+ .byte 102,15,56,222,232
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,240
++ movaps %xmm11,%xmm12
+ .byte 102,15,56,222,248
+ movups 32(%rcx),%xmm0
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm11
+- paddq %xmm15,%xmm15
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
+ .byte 102,15,56,222,209
+- pand %xmm8,%xmm9
++ pxor %xmm15,%xmm11
++ psrad $31,%xmm14
+ .byte 102,15,56,222,217
+- pcmpgtd %xmm15,%xmm14
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm14
+ .byte 102,15,56,222,225
+- pxor %xmm9,%xmm15
++ movdqa %xmm13,48(%rsp)
+ .byte 102,15,56,222,233
++ pxor %xmm14,%xmm15
+ .byte 102,15,56,222,241
++ movaps %xmm12,%xmm13
+ .byte 102,15,56,222,249
++ movups 48(%rcx),%xmm1
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm12
++ movdqa %xmm9,%xmm14
++ paddd %xmm9,%xmm9
++.byte 102,15,56,222,208
++ pxor %xmm15,%xmm12
++ psrad $31,%xmm14
++.byte 102,15,56,222,216
+ paddq %xmm15,%xmm15
+-.byte 102,15,56,223,208
+- pand %xmm8,%xmm9
+-.byte 102,15,56,223,216
+- pcmpgtd %xmm15,%xmm14
+-.byte 102,15,56,223,224
+- pxor %xmm9,%xmm15
+-.byte 102,15,56,223,232
+-.byte 102,15,56,223,240
+-.byte 102,15,56,223,248
++ pand %xmm8,%xmm14
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++ pxor %xmm14,%xmm15
++.byte 102,15,56,222,240
++ movaps %xmm13,%xmm14
++.byte 102,15,56,222,248
+
+- pshufd $19,%xmm14,%xmm9
+- pxor %xmm14,%xmm14
+- movdqa %xmm15,%xmm13
++ movdqa %xmm9,%xmm0
++ paddd %xmm9,%xmm9
++.byte 102,15,56,222,209
++ pxor %xmm15,%xmm13
++ psrad $31,%xmm0
++.byte 102,15,56,222,217
++ paddq %xmm15,%xmm15
++ pand %xmm8,%xmm0
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++ pxor %xmm0,%xmm15
++ movups (%r11),%xmm0
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++ movups 16(%r11),%xmm1
++
++ pxor %xmm15,%xmm14
++ psrad $31,%xmm9
++.byte 102,15,56,223,84,36,0
+ paddq %xmm15,%xmm15
+- xorps 0(%rsp),%xmm2
+ pand %xmm8,%xmm9
+- xorps 16(%rsp),%xmm3
+- pcmpgtd %xmm15,%xmm14
++.byte 102,15,56,223,92,36,16
++.byte 102,15,56,223,100,36,32
+ pxor %xmm9,%xmm15
+-
+- xorps 32(%rsp),%xmm4
+- movups %xmm2,0(%rsi)
+- xorps 48(%rsp),%xmm5
+- movups %xmm3,16(%rsi)
+- xorps 64(%rsp),%xmm6
+- movups %xmm4,32(%rsi)
+- xorps 80(%rsp),%xmm7
+- movups %xmm5,48(%rsi)
++.byte 102,15,56,223,108,36,48
++.byte 102,15,56,223,116,36,64
++.byte 102,15,56,223,124,36,80
+ movl %r10d,%eax
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
++
+ leaq 96(%rsi),%rsi
++ movups %xmm2,-96(%rsi)
++ movups %xmm3,-80(%rsi)
++ movups %xmm4,-64(%rsi)
++ movups %xmm5,-48(%rsi)
++ movups %xmm6,-32(%rsi)
++ movups %xmm7,-16(%rsi)
+ subq $96,%rdx
+ jnc L$xts_dec_grandloop
+
+- leal 3(%rax,%rax,1),%eax
++ leal 7(%rax,%rax,1),%eax
+ movq %r11,%rcx
+ movl %eax,%r10d
+
+ L$xts_dec_short:
++ pxor %xmm0,%xmm10
++ pxor %xmm0,%xmm11
+ addq $96,%rdx
+ jz L$xts_dec_done
+
++ pxor %xmm0,%xmm12
+ cmpq $32,%rdx
+ jb L$xts_dec_one
++ pxor %xmm0,%xmm13
+ je L$xts_dec_two
+
++ pxor %xmm0,%xmm14
+ cmpq $64,%rdx
+ jb L$xts_dec_three
+ je L$xts_dec_four
+
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movdqu (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movdqu 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movdqu 32(%rdi),%xmm4
+ pxor %xmm10,%xmm2
+ movdqu 48(%rdi),%xmm5
+@@ -1906,7 +2175,7 @@ L$xts_dec_three:
+ xorps %xmm10,%xmm2
+ movdqa %xmm13,%xmm10
+ xorps %xmm11,%xmm3
+- movdqa %xmm15,%xmm11
++ movdqa %xmm14,%xmm11
+ xorps %xmm12,%xmm4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+@@ -1916,14 +2185,8 @@ L$xts_dec_three:
+
+ .p2align 4
+ L$xts_dec_four:
+- pshufd $19,%xmm14,%xmm9
+- movdqa %xmm15,%xmm14
+- paddq %xmm15,%xmm15
+ movups (%rdi),%xmm2
+- pand %xmm8,%xmm9
+ movups 16(%rdi),%xmm3
+- pxor %xmm9,%xmm15
+-
+ movups 32(%rdi),%xmm4
+ xorps %xmm10,%xmm2
+ movups 48(%rdi),%xmm5
+@@ -1934,16 +2197,16 @@ L$xts_dec_four:
+
+ call _aesni_decrypt4
+
+- xorps %xmm10,%xmm2
++ pxor %xmm10,%xmm2
+ movdqa %xmm14,%xmm10
+- xorps %xmm11,%xmm3
++ pxor %xmm11,%xmm3
+ movdqa %xmm15,%xmm11
+- xorps %xmm12,%xmm4
+- movups %xmm2,(%rsi)
+- xorps %xmm13,%xmm5
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm2,(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm3,16(%rsi)
++ movdqu %xmm4,32(%rsi)
++ movdqu %xmm5,48(%rsi)
+ leaq 64(%rsi),%rsi
+ jmp L$xts_dec_done
+
+@@ -2003,7 +2266,8 @@ L$oop_dec1_14:
+ movups %xmm2,(%rsi)
+
+ L$xts_dec_ret:
+- leaq 104(%rsp),%rsp
++ leaq (%rbp),%rsp
++ popq %rbp
+ L$xts_dec_epilogue:
+ .byte 0xf3,0xc3
+
+@@ -2070,149 +2334,324 @@ L$cbc_enc_tail:
+
+ .p2align 4
+ L$cbc_decrypt:
+- movups (%r8),%xmm9
++ leaq (%rsp),%rax
++ pushq %rbp
++ subq $16,%rsp
++ andq $-16,%rsp
++ leaq -8(%rax),%rbp
++ movups (%r8),%xmm10
+ movl %r10d,%eax
+- cmpq $112,%rdx
++ cmpq $80,%rdx
+ jbe L$cbc_dec_tail
+- shrl $1,%r10d
++
++ movups (%rcx),%xmm0
++ movdqu 0(%rdi),%xmm2
++ movdqu 16(%rdi),%xmm3
++ movdqa %xmm2,%xmm11
++ movdqu 32(%rdi),%xmm4
++ movdqa %xmm3,%xmm12
++ movdqu 48(%rdi),%xmm5
++ movdqa %xmm4,%xmm13
++ movdqu 64(%rdi),%xmm6
++ movdqa %xmm5,%xmm14
++ movdqu 80(%rdi),%xmm7
++ movdqa %xmm6,%xmm15
++ cmpq $112,%rdx
++ jbe L$cbc_dec_six_or_seven
++
+ subq $112,%rdx
+- movl %r10d,%eax
+- movaps %xmm9,-24(%rsp)
++ leaq 112(%rcx),%rcx
+ jmp L$cbc_dec_loop8_enter
+ .p2align 4
+ L$cbc_dec_loop8:
+- movaps %xmm0,-24(%rsp)
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ L$cbc_dec_loop8_enter:
+- movups (%rcx),%xmm0
+- movups (%rdi),%xmm2
+- movups 16(%rdi),%xmm3
+- movups 16(%rcx),%xmm1
++ movdqu 96(%rdi),%xmm8
++ pxor %xmm0,%xmm2
++ movdqu 112(%rdi),%xmm9
++ pxor %xmm0,%xmm3
++ movups 16-112(%rcx),%xmm1
++ pxor %xmm0,%xmm4
++ xorq %r11,%r11
++ cmpq $112,%rdx
++ pxor %xmm0,%xmm5
++ pxor %xmm0,%xmm6
++ pxor %xmm0,%xmm7
++ pxor %xmm0,%xmm8
+
+- leaq 32(%rcx),%rcx
+- movdqu 32(%rdi),%xmm4
+- xorps %xmm0,%xmm2
+- movdqu 48(%rdi),%xmm5
+- xorps %xmm0,%xmm3
+- movdqu 64(%rdi),%xmm6
+ .byte 102,15,56,222,209
+- pxor %xmm0,%xmm4
+- movdqu 80(%rdi),%xmm7
++ pxor %xmm0,%xmm9
++ movups 32-112(%rcx),%xmm0
+ .byte 102,15,56,222,217
+- pxor %xmm0,%xmm5
+- movdqu 96(%rdi),%xmm8
+ .byte 102,15,56,222,225
+- pxor %xmm0,%xmm6
+- movdqu 112(%rdi),%xmm9
+ .byte 102,15,56,222,233
+- pxor %xmm0,%xmm7
+- decl %eax
+ .byte 102,15,56,222,241
+- pxor %xmm0,%xmm8
+ .byte 102,15,56,222,249
+- pxor %xmm0,%xmm9
+- movups (%rcx),%xmm0
++ setnc %r11b
+ .byte 102,68,15,56,222,193
++ shlq $7,%r11
+ .byte 102,68,15,56,222,201
+- movups 16(%rcx),%xmm1
+-
+- call L$dec_loop8_enter
++ addq %rdi,%r11
++ movups 48-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 64-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 80-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 96-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 112-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 128-112(%rcx),%xmm0
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 144-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 160-112(%rcx),%xmm0
++ cmpl $11,%eax
++ jb L$cbc_dec_done
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 176-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 192-112(%rcx),%xmm0
++ je L$cbc_dec_done
++.byte 102,15,56,222,209
++.byte 102,15,56,222,217
++.byte 102,15,56,222,225
++.byte 102,15,56,222,233
++.byte 102,15,56,222,241
++.byte 102,15,56,222,249
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movups 208-112(%rcx),%xmm1
++.byte 102,15,56,222,208
++.byte 102,15,56,222,216
++.byte 102,15,56,222,224
++.byte 102,15,56,222,232
++.byte 102,15,56,222,240
++.byte 102,15,56,222,248
++.byte 102,68,15,56,222,192
++.byte 102,68,15,56,222,200
++ movups 224-112(%rcx),%xmm0
++L$cbc_dec_done:
++.byte 102,15,56,222,209
++ pxor %xmm0,%xmm10
++.byte 102,15,56,222,217
++ pxor %xmm0,%xmm11
++.byte 102,15,56,222,225
++ pxor %xmm0,%xmm12
++.byte 102,15,56,222,233
++ pxor %xmm0,%xmm13
++.byte 102,15,56,222,241
++ pxor %xmm0,%xmm14
++.byte 102,15,56,222,249
++ pxor %xmm0,%xmm15
++.byte 102,68,15,56,222,193
++.byte 102,68,15,56,222,201
++ movdqu 80(%rdi),%xmm1
++
++.byte 102,65,15,56,223,210
++ movdqu 96(%rdi),%xmm10
++ pxor %xmm0,%xmm1
++.byte 102,65,15,56,223,219
++ pxor %xmm0,%xmm10
++ movdqu 112(%rdi),%xmm0
++ leaq 128(%rdi),%rdi
++.byte 102,65,15,56,223,228
++ movdqu 0(%r11),%xmm11
++.byte 102,65,15,56,223,237
++ movdqu 16(%r11),%xmm12
++.byte 102,65,15,56,223,246
++ movdqu 32(%r11),%xmm13
++.byte 102,65,15,56,223,255
++ movdqu 48(%r11),%xmm14
++.byte 102,68,15,56,223,193
++ movdqu 64(%r11),%xmm15
++.byte 102,69,15,56,223,202
++ movdqa %xmm0,%xmm10
++ movdqu 80(%r11),%xmm1
++ movups -112(%rcx),%xmm0
+
+- movups (%rdi),%xmm1
+- movups 16(%rdi),%xmm0
+- xorps -24(%rsp),%xmm2
+- xorps %xmm1,%xmm3
+- movups 32(%rdi),%xmm1
+- xorps %xmm0,%xmm4
+- movups 48(%rdi),%xmm0
+- xorps %xmm1,%xmm5
+- movups 64(%rdi),%xmm1
+- xorps %xmm0,%xmm6
+- movups 80(%rdi),%xmm0
+- xorps %xmm1,%xmm7
+- movups 96(%rdi),%xmm1
+- xorps %xmm0,%xmm8
+- movups 112(%rdi),%xmm0
+- xorps %xmm1,%xmm9
+ movups %xmm2,(%rsi)
++ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
++ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
++ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+- movl %r10d,%eax
++ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+- movq %r11,%rcx
++ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+- leaq 128(%rdi),%rdi
++ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
++
+ subq $128,%rdx
+ ja L$cbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+- movaps %xmm0,%xmm9
++ leaq -112(%rcx),%rcx
+ addq $112,%rdx
+ jle L$cbc_dec_tail_collected
+- movups %xmm2,(%rsi)
+- leal 1(%r10,%r10,1),%eax
++ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
++ cmpq $80,%rdx
++ jbe L$cbc_dec_tail
++
++ movaps %xmm11,%xmm2
++L$cbc_dec_six_or_seven:
++ cmpq $96,%rdx
++ ja L$cbc_dec_seven
++
++ movaps %xmm7,%xmm8
++ call _aesni_decrypt6
++ pxor %xmm10,%xmm2
++ movaps %xmm8,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ pxor %xmm15,%xmm7
++ movdqu %xmm6,64(%rsi)
++ leaq 80(%rsi),%rsi
++ movdqa %xmm7,%xmm2
++ jmp L$cbc_dec_tail_collected
++
++.p2align 4
++L$cbc_dec_seven:
++ movups 96(%rdi),%xmm8
++ xorps %xmm9,%xmm9
++ call _aesni_decrypt8
++ movups 80(%rdi),%xmm9
++ pxor %xmm10,%xmm2
++ movups 96(%rdi),%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ pxor %xmm15,%xmm7
++ movdqu %xmm6,64(%rsi)
++ pxor %xmm9,%xmm8
++ movdqu %xmm7,80(%rsi)
++ leaq 96(%rsi),%rsi
++ movdqa %xmm8,%xmm2
++ jmp L$cbc_dec_tail_collected
++
+ L$cbc_dec_tail:
+ movups (%rdi),%xmm2
+- movaps %xmm2,%xmm8
+- cmpq $16,%rdx
++ subq $16,%rdx
+ jbe L$cbc_dec_one
+
+ movups 16(%rdi),%xmm3
+- movaps %xmm3,%xmm7
+- cmpq $32,%rdx
++ movaps %xmm2,%xmm11
++ subq $16,%rdx
+ jbe L$cbc_dec_two
+
+ movups 32(%rdi),%xmm4
+- movaps %xmm4,%xmm6
+- cmpq $48,%rdx
++ movaps %xmm3,%xmm12
++ subq $16,%rdx
+ jbe L$cbc_dec_three
+
+ movups 48(%rdi),%xmm5
+- cmpq $64,%rdx
++ movaps %xmm4,%xmm13
++ subq $16,%rdx
+ jbe L$cbc_dec_four
+
+ movups 64(%rdi),%xmm6
+- cmpq $80,%rdx
+- jbe L$cbc_dec_five
+-
+- movups 80(%rdi),%xmm7
+- cmpq $96,%rdx
+- jbe L$cbc_dec_six
+-
+- movups 96(%rdi),%xmm8
+- movaps %xmm9,-24(%rsp)
+- call _aesni_decrypt8
+- movups (%rdi),%xmm1
+- movups 16(%rdi),%xmm0
+- xorps -24(%rsp),%xmm2
+- xorps %xmm1,%xmm3
+- movups 32(%rdi),%xmm1
+- xorps %xmm0,%xmm4
+- movups 48(%rdi),%xmm0
+- xorps %xmm1,%xmm5
+- movups 64(%rdi),%xmm1
+- xorps %xmm0,%xmm6
+- movups 80(%rdi),%xmm0
+- xorps %xmm1,%xmm7
+- movups 96(%rdi),%xmm9
+- xorps %xmm0,%xmm8
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- movups %xmm6,64(%rsi)
+- movups %xmm7,80(%rsi)
+- leaq 96(%rsi),%rsi
+- movaps %xmm8,%xmm2
+- subq $112,%rdx
++ movaps %xmm5,%xmm14
++ movaps %xmm6,%xmm15
++ xorps %xmm7,%xmm7
++ call _aesni_decrypt6
++ pxor %xmm10,%xmm2
++ movaps %xmm15,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ pxor %xmm14,%xmm6
++ movdqu %xmm5,48(%rsi)
++ leaq 64(%rsi),%rsi
++ movdqa %xmm6,%xmm2
++ subq $16,%rdx
+ jmp L$cbc_dec_tail_collected
++
+ .p2align 4
+ L$cbc_dec_one:
++ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+@@ -2224,111 +2663,69 @@ L$oop_dec1_16:
+ leaq 16(%rcx),%rcx
+ jnz L$oop_dec1_16
+ .byte 102,15,56,223,209
+- xorps %xmm9,%xmm2
+- movaps %xmm8,%xmm9
+- subq $16,%rdx
++ xorps %xmm10,%xmm2
++ movaps %xmm11,%xmm10
+ jmp L$cbc_dec_tail_collected
+ .p2align 4
+ L$cbc_dec_two:
++ movaps %xmm3,%xmm12
+ xorps %xmm4,%xmm4
+ call _aesni_decrypt3
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- movaps %xmm7,%xmm9
+- movaps %xmm3,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm12,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ movdqa %xmm3,%xmm2
+ leaq 16(%rsi),%rsi
+- subq $32,%rdx
+ jmp L$cbc_dec_tail_collected
+ .p2align 4
+ L$cbc_dec_three:
++ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- xorps %xmm7,%xmm4
+- movups %xmm3,16(%rsi)
+- movaps %xmm6,%xmm9
+- movaps %xmm4,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm13,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ movdqa %xmm4,%xmm2
+ leaq 32(%rsi),%rsi
+- subq $48,%rdx
+ jmp L$cbc_dec_tail_collected
+ .p2align 4
+ L$cbc_dec_four:
++ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+- xorps %xmm9,%xmm2
+- movups 48(%rdi),%xmm9
+- xorps %xmm8,%xmm3
+- movups %xmm2,(%rsi)
+- xorps %xmm7,%xmm4
+- movups %xmm3,16(%rsi)
+- xorps %xmm6,%xmm5
+- movups %xmm4,32(%rsi)
+- movaps %xmm5,%xmm2
++ pxor %xmm10,%xmm2
++ movaps %xmm14,%xmm10
++ pxor %xmm11,%xmm3
++ movdqu %xmm2,(%rsi)
++ pxor %xmm12,%xmm4
++ movdqu %xmm3,16(%rsi)
++ pxor %xmm13,%xmm5
++ movdqu %xmm4,32(%rsi)
++ movdqa %xmm5,%xmm2
+ leaq 48(%rsi),%rsi
+- subq $64,%rdx
+- jmp L$cbc_dec_tail_collected
+-.p2align 4
+-L$cbc_dec_five:
+- xorps %xmm7,%xmm7
+- call _aesni_decrypt6
+- movups 16(%rdi),%xmm1
+- movups 32(%rdi),%xmm0
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- xorps %xmm1,%xmm4
+- movups 48(%rdi),%xmm1
+- xorps %xmm0,%xmm5
+- movups 64(%rdi),%xmm9
+- xorps %xmm1,%xmm6
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- leaq 64(%rsi),%rsi
+- movaps %xmm6,%xmm2
+- subq $80,%rdx
+- jmp L$cbc_dec_tail_collected
+-.p2align 4
+-L$cbc_dec_six:
+- call _aesni_decrypt6
+- movups 16(%rdi),%xmm1
+- movups 32(%rdi),%xmm0
+- xorps %xmm9,%xmm2
+- xorps %xmm8,%xmm3
+- xorps %xmm1,%xmm4
+- movups 48(%rdi),%xmm1
+- xorps %xmm0,%xmm5
+- movups 64(%rdi),%xmm0
+- xorps %xmm1,%xmm6
+- movups 80(%rdi),%xmm9
+- xorps %xmm0,%xmm7
+- movups %xmm2,(%rsi)
+- movups %xmm3,16(%rsi)
+- movups %xmm4,32(%rsi)
+- movups %xmm5,48(%rsi)
+- movups %xmm6,64(%rsi)
+- leaq 80(%rsi),%rsi
+- movaps %xmm7,%xmm2
+- subq $96,%rdx
+ jmp L$cbc_dec_tail_collected
++
+ .p2align 4
+ L$cbc_dec_tail_collected:
++ movups %xmm10,(%r8)
+ andq $15,%rdx
+- movups %xmm9,(%r8)
+ jnz L$cbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ jmp L$cbc_dec_ret
+ .p2align 4
+ L$cbc_dec_tail_partial:
+- movaps %xmm2,-24(%rsp)
++ movaps %xmm2,(%rsp)
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+- leaq -24(%rsp),%rsi
++ leaq (%rsp),%rsi
+ .long 0x9066A4F3
+
+ L$cbc_dec_ret:
++ leaq (%rbp),%rsp
++ popq %rbp
+ L$cbc_ret:
+ .byte 0xf3,0xc3
+
+@@ -2571,6 +2968,8 @@ L$increment64:
+ .long 1,0,0,0
+ L$xts_magic:
+ .long 0x87,0,1,0
++L$increment1:
++.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+
+ .byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ .p2align 6
+diff --git a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
+index b9ec30c..1327e82 100644
+--- a/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
++++ b/lib/accelerated/x86/macosx/padlock-x86-64-macosx.s
+@@ -597,6 +597,468 @@ L$cbc_abort:
+ popq %rbp
+ .byte 0xf3,0xc3
+
++.globl _padlock_cfb_encrypt
++
++.p2align 4
++_padlock_cfb_encrypt:
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz L$cfb_abort
++ testq $15,%rcx
++ jnz L$cfb_abort
++ leaq L$padlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz L$cfb_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz L$cfb_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++ jmp L$cfb_loop
++.p2align 4
++L$cfb_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz L$cfb_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++L$cfb_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,224
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz L$cfb_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++L$cfb_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jnz L$cfb_loop
++ cmpq %rbp,%rsp
++ je L$cfb_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++L$cfb_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja L$cfb_bzero
++
++L$cfb_done:
++ leaq (%rbp),%rsp
++ jmp L$cfb_exit
++
++.p2align 4
++L$cfb_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,224
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++L$cfb_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++L$cfb_abort:
++ popq %rbx
++ popq %rbp
++ .byte 0xf3,0xc3
++
++.globl _padlock_ofb_encrypt
++
++.p2align 4
++_padlock_ofb_encrypt:
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz L$ofb_abort
++ testq $15,%rcx
++ jnz L$ofb_abort
++ leaq L$padlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz L$ofb_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz L$ofb_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++ jmp L$ofb_loop
++.p2align 4
++L$ofb_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz L$ofb_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++L$ofb_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,232
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz L$ofb_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++L$ofb_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jnz L$ofb_loop
++ cmpq %rbp,%rsp
++ je L$ofb_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++L$ofb_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja L$ofb_bzero
++
++L$ofb_done:
++ leaq (%rbp),%rsp
++ jmp L$ofb_exit
++
++.p2align 4
++L$ofb_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,232
++ movdqa (%rax),%xmm0
++ movdqa %xmm0,-16(%rdx)
++L$ofb_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++L$ofb_abort:
++ popq %rbx
++ popq %rbp
++ .byte 0xf3,0xc3
++
++.globl _padlock_ctr32_encrypt
++
++.p2align 4
++_padlock_ctr32_encrypt:
++ pushq %rbp
++ pushq %rbx
++
++ xorl %eax,%eax
++ testq $15,%rdx
++ jnz L$ctr32_abort
++ testq $15,%rcx
++ jnz L$ctr32_abort
++ leaq L$padlock_saved_context(%rip),%rax
++ pushf
++ cld
++ call _padlock_verify_ctx
++ leaq 16(%rdx),%rdx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%rdx)
++ jnz L$ctr32_aligned
++ testq $15,%rdi
++ setz %al
++ testq $15,%rsi
++ setz %bl
++ testl %ebx,%eax
++ jnz L$ctr32_aligned
++ negq %rax
++ movq $512,%rbx
++ notq %rax
++ leaq (%rsp),%rbp
++ cmpq %rbx,%rcx
++ cmovcq %rcx,%rbx
++ andq %rbx,%rax
++ movq %rcx,%rbx
++ negq %rax
++ andq $512-1,%rbx
++ leaq (%rax,%rbp,1),%rsp
++ movq $512,%rax
++ cmovzq %rax,%rbx
++L$ctr32_reenter:
++ movl -4(%rdx),%eax
++ bswapl %eax
++ negl %eax
++ andl $31,%eax
++ movq $512,%rbx
++ shll $4,%eax
++ cmovzq %rbx,%rax
++ cmpq %rax,%rcx
++ cmovaq %rax,%rbx
++ cmovbeq %rcx,%rbx
++ cmpq %rbx,%rcx
++ ja L$ctr32_loop
++ movq %rsi,%rax
++ cmpq %rsp,%rbp
++ cmoveq %rdi,%rax
++ addq %rcx,%rax
++ negq %rax
++ andq $4095,%rax
++ cmpq $32,%rax
++ movq $-32,%rax
++ cmovaeq %rbx,%rax
++ andq %rax,%rbx
++ jz L$ctr32_unaligned_tail
++ jmp L$ctr32_loop
++.p2align 4
++L$ctr32_loop:
++ cmpq %rcx,%rbx
++ cmovaq %rcx,%rbx
++ movq %rdi,%r8
++ movq %rsi,%r9
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++ testq $15,%rdi
++ cmovnzq %rsp,%rdi
++ testq $15,%rsi
++ jz L$ctr32_inp_aligned
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++ movq %rbx,%rcx
++ movq %rdi,%rsi
++L$ctr32_inp_aligned:
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++ movl -4(%rdx),%eax
++ testl $4294901760,%eax
++ jnz L$ctr32_no_carry
++ bswapl %eax
++ addl $65536,%eax
++ bswapl %eax
++ movl %eax,-4(%rdx)
++L$ctr32_no_carry:
++ movq %r8,%rdi
++ movq %r11,%rbx
++ testq $15,%rdi
++ jz L$ctr32_out_aligned
++ movq %rbx,%rcx
++ leaq (%rsp),%rsi
++ shrq $3,%rcx
++.byte 0xf3,0x48,0xa5
++ subq %rbx,%rdi
++L$ctr32_out_aligned:
++ movq %r9,%rsi
++ movq %r10,%rcx
++ addq %rbx,%rdi
++ addq %rbx,%rsi
++ subq %rbx,%rcx
++ movq $512,%rbx
++ jz L$ctr32_break
++ cmpq %rbx,%rcx
++ jae L$ctr32_loop
++ movq %rcx,%rbx
++ movq %rsi,%rax
++ cmpq %rsp,%rbp
++ cmoveq %rdi,%rax
++ addq %rcx,%rax
++ negq %rax
++ andq $4095,%rax
++ cmpq $32,%rax
++ movq $-32,%rax
++ cmovaeq %rbx,%rax
++ andq %rax,%rbx
++ jnz L$ctr32_loop
++L$ctr32_unaligned_tail:
++ xorl %eax,%eax
++ cmpq %rsp,%rbp
++ cmoveq %rcx,%rax
++ movq %rdi,%r8
++ movq %rcx,%rbx
++ subq %rax,%rsp
++ shrq $3,%rcx
++ leaq (%rsp),%rdi
++.byte 0xf3,0x48,0xa5
++ movq %rsp,%rsi
++ movq %r8,%rdi
++ movq %rbx,%rcx
++ jmp L$ctr32_loop
++.p2align 4
++L$ctr32_break:
++ cmpq %rbp,%rsp
++ je L$ctr32_done
++
++ pxor %xmm0,%xmm0
++ leaq (%rsp),%rax
++L$ctr32_bzero:
++ movaps %xmm0,(%rax)
++ leaq 16(%rax),%rax
++ cmpq %rax,%rbp
++ ja L$ctr32_bzero
++
++L$ctr32_done:
++ leaq (%rbp),%rsp
++ jmp L$ctr32_exit
++
++.p2align 4
++L$ctr32_aligned:
++ movl -4(%rdx),%eax
++ bswapl %eax
++ negl %eax
++ andl $65535,%eax
++ movq $1048576,%rbx
++ shll $4,%eax
++ cmovzq %rbx,%rax
++ cmpq %rax,%rcx
++ cmovaq %rax,%rbx
++ cmovbeq %rcx,%rbx
++ jbe L$ctr32_aligned_skip
++
++L$ctr32_aligned_loop:
++ movq %rcx,%r10
++ movq %rbx,%rcx
++ movq %rbx,%r11
++
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++
++ movl -4(%rdx),%eax
++ bswapl %eax
++ addl $65536,%eax
++ bswapl %eax
++ movl %eax,-4(%rdx)
++
++ movq %r10,%rcx
++ subq %r11,%rcx
++ movq $1048576,%rbx
++ jz L$ctr32_exit
++ cmpq %rbx,%rcx
++ jae L$ctr32_aligned_loop
++
++L$ctr32_aligned_skip:
++ leaq (%rsi,%rcx,1),%rbp
++ negq %rbp
++ andq $4095,%rbp
++ xorl %eax,%eax
++ cmpq $32,%rbp
++ movq $32-1,%rbp
++ cmovaeq %rax,%rbp
++ andq %rcx,%rbp
++ subq %rbp,%rcx
++ jz L$ctr32_aligned_tail
++ leaq -16(%rdx),%rax
++ leaq 16(%rdx),%rbx
++ shrq $4,%rcx
++.byte 0xf3,0x0f,0xa7,216
++ testq %rbp,%rbp
++ jz L$ctr32_exit
++
++L$ctr32_aligned_tail:
++ movq %rdi,%r8
++ movq %rbp,%rbx
++ movq %rbp,%rcx
++ leaq (%rsp),%rbp
++ subq %rcx,%rsp
++ shrq $3,%rcx
++ leaq (%rsp),%rdi
++.byte 0xf3,0x48,0xa5
++ leaq (%r8),%rdi
++ leaq (%rsp),%rsi
++ movq %rbx,%rcx
++ jmp L$ctr32_loop
++L$ctr32_exit:
++ movl $1,%eax
++ leaq 8(%rsp),%rsp
++L$ctr32_abort:
++ popq %rbx
++ popq %rbp
++ .byte 0xf3,0xc3
++
+ .byte 86,73,65,32,80,97,100,108,111,99,107,32,120,56,54,95,54,52,32,109,111,100,117,108,101,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+ .p2align 4
+ .data
+diff --git a/lib/accelerated/x86/macosx/padlock-x86-macosx.s b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
+index 7a38b7c..1a2fa92 100644
+--- a/lib/accelerated/x86/macosx/padlock-x86-macosx.s
++++ b/lib/accelerated/x86/macosx/padlock-x86-macosx.s
+@@ -510,6 +510,351 @@ L016cbc_abort:
+ popl %ebx
+ popl %ebp
+ ret
++.globl _padlock_cfb_encrypt
++.align 4
++_padlock_cfb_encrypt:
++L_padlock_cfb_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz L028cfb_abort
++ testl $15,%ecx
++ jnz L028cfb_abort
++ leal Lpadlock_saved_context-L029cfb_pic_point,%eax
++ pushfl
++ cld
++ call __padlock_verify_ctx
++L029cfb_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%edx)
++ jnz L030cfb_aligned
++ testl $15,%edi
++ setz %al
++ testl $15,%esi
++ setz %bl
++ testl %ebx,%eax
++ jnz L030cfb_aligned
++ negl %eax
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp L031cfb_loop
++.align 4,0x90
++L031cfb_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ testl $15,%edi
++ cmovnzl %esp,%edi
++ testl $15,%esi
++ jz L032cfb_inp_aligned
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++ movl %ebx,%ecx
++ movl %edi,%esi
++L032cfb_inp_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,224
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ testl $15,%edi
++ jz L033cfb_out_aligned
++ movl %ebx,%ecx
++ leal (%esp),%esi
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++L033cfb_out_aligned:
++ movl 4(%ebp),%esi
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz L031cfb_loop
++ cmpl %ebp,%esp
++ je L034cfb_done
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++L035cfb_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja L035cfb_bzero
++L034cfb_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ jmp L036cfb_exit
++.align 4,0x90
++L030cfb_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,224
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++L036cfb_exit:
++ movl $1,%eax
++ leal 4(%esp),%esp
++L028cfb_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.globl _padlock_ofb_encrypt
++.align 4
++_padlock_ofb_encrypt:
++L_padlock_ofb_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz L037ofb_abort
++ testl $15,%ecx
++ jnz L037ofb_abort
++ leal Lpadlock_saved_context-L038ofb_pic_point,%eax
++ pushfl
++ cld
++ call __padlock_verify_ctx
++L038ofb_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ xorl %ebx,%ebx
++ testl $32,(%edx)
++ jnz L039ofb_aligned
++ testl $15,%edi
++ setz %al
++ testl $15,%esi
++ setz %bl
++ testl %ebx,%eax
++ jnz L039ofb_aligned
++ negl %eax
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp L040ofb_loop
++.align 4,0x90
++L040ofb_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ testl $15,%edi
++ cmovnzl %esp,%edi
++ testl $15,%esi
++ jz L041ofb_inp_aligned
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++ movl %ebx,%ecx
++ movl %edi,%esi
++L041ofb_inp_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,232
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ testl $15,%edi
++ jz L042ofb_out_aligned
++ movl %ebx,%ecx
++ leal (%esp),%esi
++ shrl $2,%ecx
++.byte 243,165
++ subl %ebx,%edi
++L042ofb_out_aligned:
++ movl 4(%ebp),%esi
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz L040ofb_loop
++ cmpl %ebp,%esp
++ je L043ofb_done
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++L044ofb_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja L044ofb_bzero
++L043ofb_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ jmp L045ofb_exit
++.align 4,0x90
++L039ofb_aligned:
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,232
++ movaps (%eax),%xmm0
++ movaps %xmm0,-16(%edx)
++L045ofb_exit:
++ movl $1,%eax
++ leal 4(%esp),%esp
++L037ofb_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
++.globl _padlock_ctr32_encrypt
++.align 4
++_padlock_ctr32_encrypt:
++L_padlock_ctr32_encrypt_begin:
++ pushl %ebp
++ pushl %ebx
++ pushl %esi
++ pushl %edi
++ movl 20(%esp),%edi
++ movl 24(%esp),%esi
++ movl 28(%esp),%edx
++ movl 32(%esp),%ecx
++ testl $15,%edx
++ jnz L046ctr32_abort
++ testl $15,%ecx
++ jnz L046ctr32_abort
++ leal Lpadlock_saved_context-L047ctr32_pic_point,%eax
++ pushfl
++ cld
++ call __padlock_verify_ctx
++L047ctr32_pic_point:
++ leal 16(%edx),%edx
++ xorl %eax,%eax
++ movq -16(%edx),%mm0
++ movl $512,%ebx
++ notl %eax
++ leal -24(%esp),%ebp
++ cmpl %ebx,%ecx
++ cmovcl %ecx,%ebx
++ andl %ebx,%eax
++ movl %ecx,%ebx
++ negl %eax
++ andl $511,%ebx
++ leal (%eax,%ebp,1),%esp
++ movl $512,%eax
++ cmovzl %eax,%ebx
++ movl %ebp,%eax
++ andl $-16,%ebp
++ andl $-16,%esp
++ movl %eax,16(%ebp)
++ jmp L048ctr32_loop
++.align 4,0x90
++L048ctr32_loop:
++ movl %edi,(%ebp)
++ movl %esi,4(%ebp)
++ movl %ecx,8(%ebp)
++ movl %ebx,%ecx
++ movl %ebx,12(%ebp)
++ movl -4(%edx),%ecx
++ xorl %edi,%edi
++ movl -8(%edx),%eax
++L049ctr32_prepare:
++ movl %ecx,12(%esp,%edi,1)
++ bswap %ecx
++ movq %mm0,(%esp,%edi,1)
++ incl %ecx
++ movl %eax,8(%esp,%edi,1)
++ bswap %ecx
++ leal 16(%edi),%edi
++ cmpl %ebx,%edi
++ jb L049ctr32_prepare
++ movl %ecx,-4(%edx)
++ leal (%esp),%esi
++ leal (%esp),%edi
++ movl %ebx,%ecx
++ leal -16(%edx),%eax
++ leal 16(%edx),%ebx
++ shrl $4,%ecx
++.byte 243,15,167,200
++ movl (%ebp),%edi
++ movl 12(%ebp),%ebx
++ movl 4(%ebp),%esi
++ xorl %ecx,%ecx
++L050ctr32_xor:
++ movups (%esi,%ecx,1),%xmm1
++ leal 16(%ecx),%ecx
++ pxor -16(%esp,%ecx,1),%xmm1
++ movups %xmm1,-16(%edi,%ecx,1)
++ cmpl %ebx,%ecx
++ jb L050ctr32_xor
++ movl 8(%ebp),%ecx
++ addl %ebx,%edi
++ addl %ebx,%esi
++ subl %ebx,%ecx
++ movl $512,%ebx
++ jnz L048ctr32_loop
++ pxor %xmm0,%xmm0
++ leal (%esp),%eax
++L051ctr32_bzero:
++ movaps %xmm0,(%eax)
++ leal 16(%eax),%eax
++ cmpl %eax,%ebp
++ ja L051ctr32_bzero
++L052ctr32_done:
++ movl 16(%ebp),%ebp
++ leal 24(%ebp),%esp
++ movl $1,%eax
++ leal 4(%esp),%esp
++ emms
++L046ctr32_abort:
++ popl %edi
++ popl %esi
++ popl %ebx
++ popl %ebp
++ ret
+ .globl _padlock_xstore
+ .align 4
+ _padlock_xstore:
+@@ -526,10 +871,10 @@ __win32_segv_handler:
+ movl 4(%esp),%edx
+ movl 12(%esp),%ecx
+ cmpl $3221225477,(%edx)
+- jne L028ret
++ jne L053ret
+ addl $4,184(%ecx)
+ movl $0,%eax
+-L028ret:
++L053ret:
+ ret
+ .globl _padlock_sha1_oneshot
+ .align 4
+--
+1.8.4.2
+
diff --git a/gnutls.spec b/gnutls.spec
index 16cf652..cc83bef 100644
--- a/gnutls.spec
+++ b/gnutls.spec
@@ -27,6 +27,7 @@ Source0: %{name}-%{version}-hobbled.tar.xz
Source1: libgnutls-config
Source2: hobble-gnutls
Patch1: gnutls-3.2.7-rpath.patch
+Patch2: gnutls-3.2.7-asm.patch
# Use only FIPS approved ciphers in the FIPS mode
Patch7: gnutls-2.12.21-fips-algorithms.patch
Patch8: gnutls-3.1.11-nosrp.patch
@@ -131,6 +132,7 @@ This package contains Guile bindings for the library.
%setup -q
%patch1 -p1 -b .rpath
+%patch2 -p1 -b .asm
# This patch is not applicable as we use nettle now but some parts will be
# later reused.
#%patch7 -p1 -b .fips
@@ -266,8 +268,9 @@ fi
%endif
%changelog
-* Tue Nov 27 2013 Nikos Mavrogiannopoulos <nmav at redhat.com> 3.2.7-2
-- Use the following root key for unbound /var/lib/unbound/root.key (#1012494)
+* Wed Dec 4 2013 Nikos Mavrogiannopoulos <nmav at redhat.com> 3.2.7-2
+- Use the correct root key for unbound /var/lib/unbound/root.key (#1012494)
+- Pull asm fixes from upstream (#973210)
* Mon Nov 25 2013 Nikos Mavrogiannopoulos <nmav at redhat.com> 3.2.7-1
- new upstream release